├── tests ├── __init__.py └── unit │ ├── __init__.py │ └── test_infrastructure_stack.py ├── requirements-dev.txt ├── docker ├── requirements.txt ├── serve ├── server.sh ├── image-push.sh ├── dockerfile ├── dockerfile-amd ├── image-build.sh └── main.py ├── requirements.txt ├── images └── genai-llm-sagemaker.png ├── .github ├── ISSUE_TEMPLATE │ └── issue.md └── pull_request_template.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── config.yaml ├── source.bat ├── multimodel_config.yaml ├── cb_buildspec ├── model_download_buildspec.yaml └── model_build_docker_buildspec.yaml ├── LICENSE ├── lambda ├── configure_endpoint │ └── configure_endpoint.py └── trigger_build │ └── trigger_build.py ├── cdk.json ├── multimodel_cdk.py ├── app.py ├── CONTRIBUTING.md ├── notebooks └── inference.ipynb ├── README.md └── infrastructure └── llama_cpp_stack.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | asgiref 3 | boto3 4 | starlette 5 | uvicorn 6 | requests -------------------------------------------------------------------------------- /docker/serve: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "serve" 3 | uvicorn 'main:asgi_app' --host 0.0.0.0 --port 8080 --workers 8 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.110.1 2 | constructs>=10.0.0,<11.0.0 3 | cdk-nag==2.27.214 4 | PyYAML==6.0.1 -------------------------------------------------------------------------------- /images/genai-llm-sagemaker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/genai-llm-cpu-sagemaker/HEAD/images/genai-llm-sagemaker.png -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue 3 | about: Report an issue 4 | title: '' 5 | labels: kind/issue 6 | assignees: '' 7 | --- -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .venv 6 | *.egg-info 7 | 8 | # CDK asset staging directory 9 | .cdk.staging 10 | cdk.out 11 | .DS_Store 12 | node_modules 13 | .local.* 14 | *.zip 15 | *.tar.gz 16 | *.gguf 17 | *.diff 18 | *.out 19 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | image: 3 | image_tag: arm-latest 4 | platform: ARM 5 | inference: 6 | instance_type: ml.c7g.8xlarge 7 | sagemaker_model_name: llama-2-7b-chat-arm 8 | model: 9 | full_name: llama-2-7b-chat.Q4_K_M.gguf 10 | hf_name: TheBloke/Llama-2-7b-Chat-GGUF 11 | name: llmcpp-llama-2-7b-chat 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | 6 | 7 | ## Checklist 8 | 9 | Please make sure you've completed the relevant tasks for this PR, out of the following list: 10 | 11 | * [ ] [Understand the repository structure](./README.md) 12 | * [ ] [Read our general contribution guidelines](./CONTRIBUTING.md) 13 | * [ ] [Read our code of conduct](./CODE_OF_CONDUCT.md) 14 | -------------------------------------------------------------------------------- /docker/server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "server.sh" 3 | echo "args: $1" 4 | 5 | # Check if NVIDIA GPU is available 6 | if lspci | grep -i nvidia &> /dev/null; then 7 | echo "NVIDIA GPU is available." 8 | NGL=999 9 | CPU_PER_SLOT=1 10 | else 11 | echo "No NVIDIA GPU found." 12 | NGL=0 13 | CPU_PER_SLOT=4 14 | fi 15 | 16 | killall llama-server 17 | /app/llama-server -m "$1" -c 2048 -t $(nproc --all) --host 0.0.0.0 --port 8081 -cb -np $(($(nproc --all) / $CPU_PER_SLOT)) -ngl $NGL & 18 | -------------------------------------------------------------------------------- /source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /docker/image-push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #COMMIT_HASH="latest" 4 | 5 | if [[ $# -ge 3 ]]; then 6 | export CDK_DEPLOY_ACCOUNT=$1 7 | export CDK_DEPLOY_REGION=$2 8 | export REPOSITORY_NAME=$3 9 | export IMAGE_TAG=$4 10 | shift; shift 11 | 12 | echo ==--------ECRPush---------== 13 | docker push "${CDK_DEPLOY_ACCOUNT}.dkr.ecr.${CDK_DEPLOY_REGION}.amazonaws.com/${REPOSITORY_NAME}:${IMAGE_TAG}" 14 | exit $? 15 | else 16 | echo 1>&2 "Provide account and region as first two args..." 17 | echo 1>&2 "followed by repositopry name and image tag." 
18 | exit 1 19 | fi -------------------------------------------------------------------------------- /tests/unit/test_infrastructure_stack.py: -------------------------------------------------------------------------------- 1 | import aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | 4 | from infrastructure.infrastructure_stack import InfrastructureStack 5 | 6 | # example tests. To run these tests, uncomment this file along with the example 7 | # resource in infrastructure/infrastructure_stack.py 8 | def test_sqs_queue_created(): 9 | app = core.App() 10 | stack = InfrastructureStack(app, "infrastructure") 11 | template = assertions.Template.from_stack(stack) 12 | 13 | # template.has_resource_properties("AWS::SQS::Queue", { 14 | # "VisibilityTimeout": 300 15 | # }) 16 | -------------------------------------------------------------------------------- /multimodel_config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | - name: "llmcpp-llama-2-7b-chat" 3 | model: 4 | hf_name: "TheBloke/Llama-2-7b-Chat-GGUF" 5 | full_name: "llama-2-7b-chat.Q4_K_M.gguf" 6 | image: 7 | platform: "ARM" 8 | image_tag: "arm-latest" 9 | inference: 10 | sagemaker_model_name: "llama-2-7b-chat-arm" 11 | instance_type: "ml.c7g.8xlarge" 12 | - name: "mistral-7b" 13 | model: 14 | hf_name: "TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF" 15 | full_name: "capybarahermes-2.5-mistral-7b.Q4_K_M.gguf" 16 | image: 17 | platform: "AMD" 18 | image_tag: "amd-latest" 19 | inference: 20 | sagemaker_model_name: "mistral-7b-g5" 21 | instance_type: "ml.g5.xlarge" -------------------------------------------------------------------------------- /cb_buildspec/model_download_buildspec.yaml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | env: 3 | shell: bash 4 | phases: 5 | install: 6 | commands: 7 | - echo Entered the install phase... 8 | - sudo apt-get update 9 | - sudo apt-get install -y python3-pip 10 | - pip3 install huggingface-hub>=0.17.1 hf_transfer 11 | build: 12 | on-failure: CONTINUE 13 | commands: 14 | - echo Entered the build phase... 15 | - echo Downloading model 16 | - HUGGINGFACE_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download ${MODEL_HUGGING_FACE_NAME} ${MODEL_BUCKET_KEY_FULL_NAME} --local-dir . --local-dir-use-symlinks False 17 | - echo Copying uncompressed file 18 | - aws s3 cp ${MODEL_BUCKET_KEY_FULL_NAME} s3://${MODEL_BUCKET_NAME}/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /cb_buildspec/model_build_docker_buildspec.yaml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | env: 4 | shell: bash 5 | 6 | phases: 7 | install: 8 | commands: 9 | - mkdir -p $HOME/.docker/cli-plugins 10 | - export BUILDX_VERSION=$(curl --silent "https://api.github.com/repos/docker/buildx/releases/latest" |jq -r .tag_name) 11 | - wget -O $HOME/.docker/cli-plugins/docker-buildx https://github.com/docker/buildx/releases/download/$BUILDX_VERSION/buildx-$BUILDX_VERSION.linux-arm64 12 | - chmod a+rx $HOME/.docker/cli-plugins/docker-buildx 13 | - docker run --privileged --rm public.ecr.aws/eks-distro-build-tooling/binfmt-misc:qemu-v7.0.0 --install arm64, amd64 14 | - export DOCKER_BUILDKIT=1 15 | - export DOCKER_CLI_EXPERIMENTAL=enabled 16 | build: 17 | commands: 18 | - echo Entered the build phase... 19 | - bash ./image-build.sh $CDK_DEPLOY_ACCOUNT $CDK_DEPLOY_REGION $REPOSITORY_NAME $IMAGE_TAG $PLATFORM 20 | - echo Entered the post_build phase... 21 | - bash ./image-push.sh $CDK_DEPLOY_ACCOUNT $CDK_DEPLOY_REGION $REPOSITORY_NAME $IMAGE_TAG -------------------------------------------------------------------------------- /docker/dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/ggerganov/llama.cpp:full 2 | 3 | # Sets dumping log messages directly to stream instead of buffering 4 | ENV PYTHONUNBUFFERED=1 5 | # Set MODELPATH environment variable 6 | ENV MODELPATH=/app/llm_model.bin 7 | 8 | ENV PATH=$PATH:/app 9 | 10 | # The working directory in the Docker image 11 | WORKDIR /app 12 | 13 | RUN apt-get update 14 | RUN apt-get upgrade -y 15 | RUN apt-get remove linux-libc-dev -y 16 | 17 | # Install system dependencies 18 | RUN apt-get install -y \ 19 | unzip \ 20 | psmisc \ 21 | pciutils 22 | 23 | # Copy requirements.txt and install Python dependencies 24 | COPY requirements.txt ./requirements.txt 25 | #main application file 26 | COPY main.py /app/ 27 | #sagemaker endpoints expects serve file to run the application 28 | COPY serve /app/ 29 | COPY server.sh /app/ 30 | 31 | RUN chmod u+x serve 32 | RUN chmod u+x server.sh 33 | 34 | RUN pip3 install -r requirements.txt 35 | RUN export PATH=/app:$PATH 36 | 37 | ENTRYPOINT ["/bin/bash"] 38 | 39 | # Expose port for the application to run on, has to be 8080 40 | EXPOSE 8080 41 | -------------------------------------------------------------------------------- /docker/dockerfile-amd: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/ggerganov/llama.cpp:full-cuda 2 | 3 | # Sets dumping log messages directly to stream instead of buffering 4 | ENV PYTHONUNBUFFERED=1 5 | # Set MODELPATH environment variable 6 | ENV MODELPATH=/app/llm_model.bin 7 | 8 | ENV PATH=$PATH:/app 9 | 10 | # The working directory in the Docker image 11 | WORKDIR /app 12 | 13 | # Install system dependencies 14 | RUN apt-get update && apt-get install -y \ 15 | unzip \ 16 | libcurl4-openssl-dev \ 17 | python3 \ 18 | python3-pip \ 19 | python3-dev \ 20 | git \ 21 | psmisc \ 22 | pciutils 23 | 24 | # Copy requirements.txt and install Python dependencies 25 | COPY requirements.txt 
./requirements.txt 26 | #main application file 27 | COPY main.py /app/ 28 | #sagemaker endpoints expects serve file to run the application 29 | COPY serve /app/ 30 | COPY server.sh /app/ 31 | 32 | RUN chmod u+x serve 33 | RUN chmod u+x server.sh 34 | 35 | RUN pip3 install -r requirements.txt 36 | RUN export PATH=/app:$PATH 37 | 38 | ENTRYPOINT ["/bin/bash"] 39 | 40 | # Expose port for the application to run on, has to be 8080 41 | EXPOSE 8080 42 | -------------------------------------------------------------------------------- /lambda/configure_endpoint/configure_endpoint.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from os import environ 3 | import json 4 | 5 | sagemaker_client = boto3.client('sagemaker-runtime') 6 | 7 | def lambda_handler(event, context): 8 | print(f'event : {event}') 9 | event_type = event['RequestType'] 10 | endpoint_name = environ['SAGEMAKER_ENDPOINT_NAME'] 11 | payload = { 12 | "configure": { 13 | "bucket": environ['MODEL_BUCKET_NAME'], 14 | "key": environ['MODEL_BUCKET_KEY_NAME'] 15 | } 16 | } 17 | print(f' payload : {json.dumps(payload, default=str)}') 18 | 19 | if event_type in ['Create']: 20 | response = sagemaker_client.invoke_endpoint( 21 | EndpointName=endpoint_name, 22 | ContentType='application/json', 23 | Body=json.dumps(payload) 24 | ) 25 | print(f"response: {response}") 26 | 27 | return { 28 | 'statusCode': 200, 29 | 'Response': json.dumps(response, default=str) 30 | } 31 | 32 | return{ 33 | 'statusCode': 200, 34 | 'Response': 'Not a create request!' 35 | } -------------------------------------------------------------------------------- /lambda/trigger_build/trigger_build.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from os import environ 3 | import json 4 | 5 | sfn_client = boto3.client('stepfunctions') 6 | 7 | def lambda_handler(event, context): 8 | print(f'event : {event}') 9 | event_type = event['RequestType'] 10 | sm_arn = environ['STATE_MACHINE_ARN'] 11 | 12 | if event_type in ['Create', 'Delete']: 13 | res = sfn_client.list_executions( 14 | stateMachineArn=sm_arn, 15 | maxResults=1 16 | ) 17 | if res['executions']: 18 | print(f'exections exists : {res["executions"]}') 19 | if res['executions'][0]['status'] == 'RUNNING': 20 | print(f'execution still running. IsComplete: False.') 21 | return { 22 | 'statusCode': 200, 23 | 'IsComplete': False 24 | } 25 | else: 26 | print(f'execution not running. IsComplete: True.') 27 | return { 28 | 'statusCode': 200, 29 | 'IsComplete': True 30 | } 31 | else: 32 | print(f'execution doens\'t exist. 
Executing stepfunction statemachine.') 33 | response = sfn_client.start_execution(stateMachineArn=sm_arn) 34 | print(response) 35 | 36 | return { 37 | 'statusCode': 200, 38 | 'Response': json.dumps(response, default=str) 39 | } 40 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ], 39 | "config_file": "config.yaml" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /docker/image-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #COMMIT_HASH="latest" #"v29.10.1" 4 | IMAGE_NAME="llama-cpp-image" 5 | 6 | if [[ $# -ge 4 ]]; then 7 | export CDK_DEPLOY_ACCOUNT=$1 8 | export CDK_DEPLOY_REGION=$2 9 | export REPOSITORY_NAME=$3 10 | export IMAGE_TAG=$4 11 | export PLATFORM=$5 12 | 13 | export DOCKER_BUILDKIT=1 14 | export DOCKER_CLI_EXPERIMENTAL=enabled 15 | 16 | export PLATFORM_PARAMETER_VALUE="linux/arm64" 17 | export DOCKER_FILE_NAME="dockerfile" 18 | 19 | if [[ $PLATFORM == *"arm"* ]] 20 | then 21 | PLATFORM_PARAMETER_VALUE="linux/arm64" 22 | DOCKER_FILE_NAME="dockerfile" 23 | echo "[INFO] Building an image for ARM platform" 24 | elif [[ $PLATFORM == *"amd"* ]] 25 | then 26 | PLATFORM_PARAMETER_VALUE="linux/amd64" 27 | DOCKER_FILE_NAME="dockerfile-amd" 28 | echo "[INFO] Building an image for AMD platform" 29 | else 30 | echo "[ERROR] Platform {$PLATFORM} not supported." 31 | exit 0 32 | fi 33 | 34 | shift; shift 35 | echo ==--------ECRLogin---------== 36 | aws ecr get-login-password --region "${CDK_DEPLOY_REGION}" | docker login --username AWS --password-stdin "${CDK_DEPLOY_ACCOUNT}.dkr.ecr.${CDK_DEPLOY_REGION}.amazonaws.com" 37 | 38 | echo ==--------ECRBuild---------== 39 | docker buildx build --platform "${PLATFORM_PARAMETER_VALUE}" -t "${IMAGE_NAME}:${IMAGE_TAG}" -f "${DOCKER_FILE_NAME}" . 40 | 41 | echo ==--------ECRTag---------== 42 | docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${CDK_DEPLOY_ACCOUNT}.dkr.ecr.${CDK_DEPLOY_REGION}.amazonaws.com/${REPOSITORY_NAME}:${IMAGE_TAG}" 43 | exit $? 
44 | else 45 | echo 1>&2 "Provide account and region as first two args..." 46 | echo 1>&2 "followed by repositopry name, image tag and platform." 47 | exit 1 48 | fi -------------------------------------------------------------------------------- /multimodel_cdk.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from concurrent.futures import ThreadPoolExecutor 3 | import yaml 4 | import tempfile 5 | import os 6 | import argparse 7 | 8 | def execute_command(command): 9 | try: 10 | # Execute the command and capture its output 11 | output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, universal_newlines=True) 12 | return command, output 13 | except subprocess.CalledProcessError as e: 14 | # Capture error output if the command fails 15 | return command, e.output 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="LlamaCpp Multimodel Deploy utility") 19 | parser.add_argument("--deploy", action="store_true", help="Deploy model stacks") 20 | parser.add_argument("--destroy", action="store_true", help="Destroy model stacks") 21 | parser.add_argument("--config",help="Multimodel config file", default="multimodel_config.yaml" ) 22 | parser.add_argument("--output-dir", help="Output directory for model deployment assets", default="./cdk.out/.multimodel_deploy") 23 | args = parser.parse_args() 24 | 25 | # list of cdk stacks 26 | dotfiles_dir = args.output_dir 27 | with open(args.config, 'r') as f: 28 | project_config = yaml.safe_load(f) 29 | 30 | os.makedirs(dotfiles_dir, exist_ok=True) 31 | 32 | dotfiles = [] 33 | for idx, project in enumerate(project_config['project']): 34 | dotfile = tempfile.NamedTemporaryFile(prefix='.', suffix='.yaml', delete=False, dir=dotfiles_dir) 35 | dotfiles.append(dotfile.name) 36 | with open(dotfile.name, 'w') as f: 37 | yaml.dump({'project': project}, f) 38 | 39 | # List of commands to execute 40 | commands = [] 41 | print('Running following in parallel : ') 42 | for idx, config_file in enumerate(dotfiles): 43 | output_dir_name = os.path.splitext(config_file)[0] 44 | if args.deploy: 45 | commands.append(f"cdk deploy --context config_file='{config_file}' --output='{output_dir_name}' --require-approval=never") 46 | elif args.destroy: 47 | commands.append(f"cdk destroy --context config_file='{config_file}' --output='{output_dir_name}' --require-approval=never --force") 48 | else: 49 | parser.print_help() 50 | return 51 | print(commands[idx]) 52 | 53 | 54 | # Maximum number of threads to use 55 | max_threads = 5 56 | 57 | # Execute commands in parallel using a ThreadPoolExecutor 58 | with ThreadPoolExecutor(max_threads) as executor: 59 | # Submit each command to the executor 60 | futures = [executor.submit(execute_command, cmd) for cmd in commands] 61 | 62 | # Wait for all commands to complete and collect results 63 | for future in futures: 64 | command, result = future.result() 65 | print(f"Output of command '{command}':") 66 | print(result) 67 | print("=" * 50) 68 | 69 | if __name__ == "__main__": 70 | main() -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from os import getenv 3 | 4 | import aws_cdk as cdk 5 | from cdk_nag import AwsSolutionsChecks, NagSuppressions, NagPackSuppression 6 | 7 | import yaml 8 | 9 | from infrastructure.llama_cpp_stack import LlamaCppStack 10 | 11 | ### Set environment 12 | 
environment=cdk.Environment( 13 | region=getenv("AWS_REGION", getenv("CDK_DEFAULT_REGION")), 14 | account=getenv("AWS_ACCOUNT_ID", getenv("CDK_DEFAULT_ACCOUNT")), 15 | ) 16 | 17 | # cdk app 18 | app = cdk.App() 19 | 20 | ## Read config 21 | with open(app.node.try_get_context('config_file'), 'r') as f: 22 | config = yaml.safe_load(f)['project'] 23 | 24 | project_name = config['name'] 25 | model_hugging_face_name = config['model']['hf_name'] 26 | model_bucket_key_full_name = config['model']['full_name'] 27 | platform = config['image']['platform'].lower() 28 | image_tag = config['image']['image_tag'] 29 | sagemaker_model_name = config['inference']['sagemaker_model_name'] 30 | sagemaker_instance_type = config['inference']['instance_type'] 31 | 32 | ### Validate input 33 | if platform not in ["arm", "amd"]: 34 | raise ValueError(f"[ERROR] Value {platform} of the \"image.platform\" parameter does not match one of the suported values: ['arm', 'amd']") 35 | if platform not in ["arm"] and "g" in sagemaker_instance_type.split(".")[1] and sagemaker_instance_type.split(".")[1] not in ["g5"]: 36 | print("[WARNING] Platfrom for the image is not set to ARM, however, instance type potentially belongs to the AWS Graviton family.") 37 | 38 | # stack 39 | llamaCppStack = LlamaCppStack(app, 40 | f"{project_name}-LlamaCppStack", 41 | env=environment, 42 | project_name=project_name, 43 | model_bucket_key_full_name=model_bucket_key_full_name, 44 | model_hugging_face_name=model_hugging_face_name, 45 | image_tag=image_tag, 46 | image_platform=platform, 47 | model_name=sagemaker_model_name, 48 | model_instance_type=sagemaker_instance_type 49 | ) 50 | 51 | # tags 52 | tags = { 53 | "SolutionName": "LlamacppSagemakerEndpoint", 54 | "SolutionVersion": "v1.0.0", 55 | "SolutionIaC": "CDK v2" 56 | } 57 | 58 | for key, val in tags.items(): 59 | cdk.Tags.of(app).add(key,val) 60 | 61 | # cdk-nag checks 62 | nag_suppressions = [ 63 | {"id": "AwsSolutions-IAM5", "reason": "CodePipeline policy needs to have full access to assets S3 bucket."}, 64 | {"id": "AwsSolutions-IAM4", "reason": "CustomeResource Lambda function using managed policy, following least previleges."}, 65 | {"id": "AwsSolutions-L1", "reason": "CDK CustomResource limitation."}, 66 | {"id": "AwsSolutions-SF1", "reason": "State machine used for trigger CodeBuild job in sync, thus logging ALL events is not needed."}, 67 | {"id": "AwsSolutions-SF2", "reason": "State machine used for trigger CodeBuild job in sync, thus X-ray is not needed."}, 68 | {"id": "AwsSolutions-CB4", "reason": "CodeBuild does not have to encrypt data for the purpose of this sample code. Adding KMS key would incur additional cost."} 69 | ] 70 | 71 | for supression in nag_suppressions: 72 | NagSuppressions.add_stack_suppressions(llamaCppStack, [NagPackSuppression(id=supression["id"], reason=supression["reason"])]) 73 | 74 | # cdk.Aspects.of(app).add(AwsSolutionsChecks(verbose=True)) 75 | 76 | app.synth() 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 
5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | -------------------------------------------------------------------------------- /notebooks/inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\"\"\"\n", 10 | "Here we define the functionality to interact with endpoint. \n", 11 | "we use different function for handling streaming response as the output format is different.\n", 12 | "define \"endpoint_name\" variable below based on the cloudformation stack output.\n", 13 | "\"\"\"\n", 14 | "\n", 15 | "import boto3\n", 16 | "import json\n", 17 | "\n", 18 | "sagemaker_runtime = boto3.client('sagemaker-runtime', region_name='us-east-1')\n", 19 | "endpoint_name='llmcpp-llama-2-7b-chat-llama-2-7b-chat-arm-Endpoint'\n", 20 | "\n", 21 | "def invoke_sagemaker_endpoint(endpoint_name, llama_args):\n", 22 | " payload = {\n", 23 | " 'inference': True,\n", 24 | " 'configure': False,\n", 25 | " 'args': llama_args\n", 26 | " }\n", 27 | " response = sagemaker_runtime.invoke_endpoint(\n", 28 | " EndpointName=endpoint_name,\n", 29 | " Body=json.dumps(llama_args),\n", 30 | " ContentType='application/json',\n", 31 | " )\n", 32 | " response_body = json.loads(response['Body'].read().decode())\n", 33 | " return response_body\n", 34 | "\n", 35 | "def invoke_sagemaker_streaming_endpoint(endpoint_name, payload):\n", 36 | " response = sagemaker_runtime.invoke_endpoint_with_response_stream(\n", 37 | " EndpointName=endpoint_name,\n", 38 | " Body=json.dumps(payload),\n", 39 | " ContentType='application/json',\n", 40 | " ) \n", 41 | " event_stream = response['Body']\n", 42 | " for line in event_stream:\n", 43 | " itm = line['PayloadPart']['Bytes'][6:]\n", 44 | " try:\n", 45 | " res = json.loads(itm, strict=False )\n", 46 | " print(res[\"choices\"][0][\"text\"], end='')\n", 47 | " except:\n", 48 | " #non-valid json, e.g. empty token \n", 49 | " pass\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 6, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "'\\n everybody has their own unique preferences and interests, but there are some places that consistently top the lists of must-see destinations in Europe. From ancient ruins to modern cities , here is a list 10 most popular destination for travelers: Rome Italy - The Eternal City boasts an incredible history dating back centuries . Visit iconic landmarks like Colosseum and Vatican city, indulge...\\nRomeItaly Europe Travel Destinations Top Lists'" 61 | ] 62 | }, 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "\"\"\"\n", 70 | "Non-streaming inference example. \n", 71 | "\"\"\"\n", 72 | "\n", 73 | "\n", 74 | "llama_args = {\n", 75 | " \"prompt\": \"What are top 10 destinations to visit in Europe?\",\n", 76 | " \"max_tokens\": 128,\n", 77 | " \"temperature\": 0.1,\n", 78 | " \"repeat_penalty\":1.5,\n", 79 | " \"frequency_penalty\":1.1,\n", 80 | " \"top_p\": 0.5\n", 81 | "}\n", 82 | "\n", 83 | "inference = invoke_sagemaker_endpoint(endpoint_name,llama_args)\n", 84 | "inference['choices'][0]['text']" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 7, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "\n", 97 | " obviously there many other destinations to visit in Europe. 
But here is a list of the top 10 most popular and highly recommended places: Paris France Rome Italy Barcelona Spain Amsterdam Netherlands Berlin Germany Prague Czech Republic Athens Greece Each country has its unique culture history architecture food wine beaches scenic views landmarks museums art galleries festivals events parks forests lakes rivers mountains valleys coastlines islands.\n", 98 | "Top 10 Destinations to Visit in Europe: A Comprehensive Guide (2023)" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "\"\"\"\n", 104 | "Streaming inference example\n", 105 | "to enable streaming mode, set stream=True\n", 106 | "\"\"\"\n", 107 | "\n", 108 | "llama_args = {\n", 109 | " \"prompt\": \"What are top 10 destinations to visit in Europe?\",\n", 110 | " \"max_tokens\": 300,\n", 111 | " \"temperature\": 0.1,\n", 112 | " \"repeat_penalty\":1.5,\n", 113 | " \"frequency_penalty\":1.1,\n", 114 | " \"top_p\": 0.5,\n", 115 | " \"stream\": True\n", 116 | "}\n", 117 | "\n", 118 | "invoke_sagemaker_streaming_endpoint(endpoint_name,llama_args)" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.9.6" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Large Language Models (LLMs) on CPU as SageMaker Endpoints 2 | 3 | This code demonstrates how you can run Large Language Models (LLMs) on CPU-only instances, including Graviton. We use the [Llama.cpp project](https://github.com/ggerganov/llama.cpp) and expose a SageMaker endpoint API for inference. Models are downloaded from the [Hugging Face model hub](https://huggingface.co/models). 4 | The project can be deployed to be compatible with both ARM64 and x86 architectures. 5 | 6 | ## Project Overview 7 | 8 | This project is built using the [AWS Cloud Development Kit](https://aws.amazon.com/cdk/) (AWS CDK) with Python. 9 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 10 | 11 | ### Configuration 12 | 13 | AWS CDK app configuration values are in `config.yaml`: 14 | 15 | | Parameter | Description | Example value | 16 | | :--- | :--- | :--- | 17 | | project.name | Used as a prefix for AWS resources created with this app | cpu-llm | 18 | | model.hf_name | [HuggingFace](https://huggingface.co) model name | TheBloke/Llama-2-7b-Chat-GGUF | 19 | | model.full_name | [HuggingFace](https://huggingface.co) model file full name | llama-2-7b-chat.Q4_K_M.gguf | 20 | | image.platform | Platform used to run inference and build the image; values: ["ARM", "AMD"] | ARM | 21 | | image.image_tag | Tag applied to the built image | arm-latest | 22 | | inference.sagemaker_model_name | SageMaker endpoint name for model inference | llama-2-7b-chat | 23 | | inference.instance_type | Instance type used for the SageMaker endpoint | "ml.c7g.8xlarge" for the ARM platform or "ml.g5.xlarge" for the AMD platform | 24 | 25 | At the moment the only supported options are ARM-based inference on Amazon Graviton processors and AMD-based inference for CUDA-based GPUs (G5 instances are highly recommended).
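For example, a `config.yaml` targeting the GPU-backed option could look like the sketch below; the model, tag, and instance values are taken from the second entry of the sample `multimodel_config.yaml` and should be replaced with your own choices:

```yaml
project:
  image:
    image_tag: amd-latest
    platform: AMD
  inference:
    instance_type: ml.g5.xlarge
    sagemaker_model_name: mistral-7b-g5
  model:
    full_name: capybarahermes-2.5-mistral-7b.Q4_K_M.gguf
    hf_name: TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF
  name: mistral-7b
```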
Note that for GPU inference, sharding model weights across multiple GPU cards is not supported. 26 | 27 | ### Architecture 28 | 29 | ![architecture diagram](images/genai-llm-sagemaker.png) 30 | 31 | The stack can be found in the `./infrastructure` directory. 32 | 33 | ## Prerequisites 34 | 35 | Before proceeding any further, you need to identify and designate an AWS account required for the solution to work. 36 | 37 | ### Deploying from your local machine 38 | 39 | You need to create an AWS account profile in ~/.aws/credentials for the designated AWS account, if you don't already have one. The profile needs to have sufficient permissions to run an [AWS Cloud Development Kit](https://aws.amazon.com/cdk/) (AWS CDK) stack. We recommend removing the profile when you're finished testing. For more information about creating an AWS account profile, see [Configuring the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 40 | 41 | Python 3.11.x or later has to be installed on the machine used to run the CDK code. 42 | You will also need to install the AWS CDK CLI as described in the [documentation](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html) and [bootstrap](https://docs.aws.amazon.com/cdk/v2/guide/bootstrapping.html) your environment. 43 | 44 | 45 | 46 | ### Deploying from a Cloud9 instance 47 | 48 | If you don't want to install the necessary software locally, you can spin up a [Cloud9](https://docs.aws.amazon.com/cloud9/latest/user-guide/create-environment-main.html) instance that already has all the necessary software preinstalled. However, if this is your first CDK deployment in the account and/or region, you will need to [bootstrap](https://docs.aws.amazon.com/cdk/v2/guide/bootstrapping.html) your environment. 49 | 50 | ## CDK deployment 51 | ### To Create Resources / Deploy Stack 52 | 53 | Open the terminal and run the following commands: 54 | 55 | ```bash 56 | # uncomment the line below if you need to bootstrap your environment 57 | # replace ACCOUNT_ID and REGION placeholders with your actual 58 | # AWS account id and region where you deploy the application 59 | 60 | # cdk bootstrap aws://ACCOUNT_ID/REGION 61 | 62 | git clone https://github.com/aws-samples/genai-llm-cpu-sagemaker llamacpp 63 | cd llamacpp 64 | python3 -m venv .venv 65 | source .venv/bin/activate 66 | pip3 install -r requirements.txt 67 | cdk deploy 68 | ``` 69 | 70 | ### To Destroy Resources / Clean-up 71 | 72 | Delete the stack from the CloudFormation console. 73 | 74 | ### Model Selection / Change 75 | 76 | Changing only the model does not require rebuilding the image, and takes approximately 30% less time than redeploying the whole application. You can use the following process: 77 | 78 | 1. Navigate to https://huggingface.co/TheBloke and choose a GGUF model of your choice, for example https://huggingface.co/TheBloke/llama-2-7B-Arguments-GGUF, and scroll to the provided files. Usually Q4_K_M is a good enough compromise (based on our testing, but feel free to try other quantizations yourself). 79 | 80 | 2. Update the values of the variables in `config.yaml` to use the new model: 81 | * model.hf_name - set the Hugging Face model name, e.g. "TheBloke/llama-2-7B-Arguments-GGUF" 82 | * model.full_name - set the Hugging Face file full name, e.g. "llama-2-7b-chat.Q4_K_M.gguf" 83 | 84 | 3. Re-deploy the stack by running `cdk deploy` 85 | 86 | ### Platform Selection / Change 87 | 88 | 1. Update the values of the variables in `config.yaml` to use a different platform: 89 | * platform - set the platform (not case sensitive), e.g. "AMD" 90 | * instance_type - set an instance type that matches the platform, e.g. "ml.g5.xlarge" 91 | * image_tag - (optional) update the image tag, e.g. "amd-latest" 92 | 93 | 2. Re-deploy the stack by running `cdk deploy` 94 | 95 | 96 | ## Multi-Model Deployment 97 | 98 | Sometimes you want to try multiple models from Hugging Face to compare response quality or latency. For this you can specify several models in `multimodel_config.yaml` and then use the provided Python script to start multiple model deployments in parallel. 99 | 100 | ```bash 101 | python3 multimodel_cdk.py --deploy 102 | ``` 103 | 104 | ## Inference 105 | 106 | Use `notebooks/inference.ipynb` as an example. The IAM credentials / IAM role that you use to run the notebook have to allow `sagemaker:InvokeEndpoint` API calls. 107 | 108 | If you don't have an existing environment for running Jupyter notebooks, the easiest way to run the notebook is to create a new SageMaker [notebook instance](https://docs.aws.amazon.com/sagemaker/latest/dg/howitworks-create-ws.html) using default settings, letting SageMaker create the necessary IAM role with enough permissions to interact with the provisioned LLM endpoint. 109 | 110 | 111 | ## Limitations 112 | 113 | At the moment there is a [25 GB limit](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-byoi-specs.html) on custom Docker image size. Please make sure the size of the GGUF model file you want to use is below that limit. 114 | 115 | ## Security 116 | 117 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 118 | 119 | ## License 120 | 121 | This library is licensed under the MIT-0 License. See the LICENSE file. 122 | -------------------------------------------------------------------------------- /docker/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from asgiref.wsgi import WsgiToAsgi 4 | from flask import Flask, jsonify, request, Response 5 | import urllib.parse 6 | import requests 7 | import time 8 | import json 9 | import boto3 10 | import os 11 | import subprocess 12 | import traceback 13 | 14 | 15 | app = Flask(__name__) 16 | slot_id = -1 17 | 18 | parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.") 19 | parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant.
The assistant follows the given rules no matter what.\\n') 20 | parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ") 21 | parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ") 22 | parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ") 23 | parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '')", default="") 24 | parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8081)", default='http://127.0.0.1:8081') 25 | parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="") 26 | parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1') 27 | parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8080)", default=8080) 28 | 29 | args, unknown = parser.parse_known_args() 30 | 31 | def is_present(json, key): 32 | try: 33 | buf = json[key] 34 | except KeyError: 35 | return False 36 | if json[key] == None: 37 | return False 38 | return True 39 | 40 | #convert chat to prompt 41 | def convert_chat(messages): 42 | prompt = "" + args.chat_prompt.replace("\\n", "\n") 43 | 44 | system_n = args.system_name.replace("\\n", "\n") 45 | user_n = args.user_name.replace("\\n", "\n") 46 | ai_n = args.ai_name.replace("\\n", "\n") 47 | stop = args.stop.replace("\\n", "\n") 48 | 49 | 50 | for line in messages: 51 | if (line["role"] == "system"): 52 | prompt += f"{system_n}{line['content']}" 53 | if (line["role"] == "user"): 54 | prompt += f"{user_n}{line['content']}" 55 | if (line["role"] == "assistant"): 56 | prompt += f"{ai_n}{line['content']}{stop}" 57 | prompt += ai_n.rstrip() 58 | 59 | return prompt 60 | 61 | def make_postData(body, chat=False, stream=False): 62 | postData = {} 63 | if (chat): 64 | postData["prompt"] = convert_chat(body["messages"]) 65 | else: 66 | postData["prompt"] = body["prompt"] 67 | if(is_present(body, "temperature")): postData["temperature"] = body["temperature"] 68 | if(is_present(body, "top_k")): postData["top_k"] = body["top_k"] 69 | if(is_present(body, "top_p")): postData["top_p"] = body["top_p"] 70 | if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"] 71 | if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"] 72 | if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"] 73 | if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"] 74 | if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"] 75 | if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"] 76 | if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"] 77 | if(is_present(body, "seed")): postData["seed"] = body["seed"] 78 | if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()] 79 | if (args.stop != ""): 80 | postData["stop"] = [args.stop] 81 | else: 82 | postData["stop"] = [] 83 | if(is_present(body, "stop")): postData["stop"] += body["stop"] 84 | postData["n_keep"] = -1 85 | postData["stream"] = stream 86 | 
postData["cache_prompt"] = True 87 | postData["slot_id"] = slot_id 88 | return postData 89 | 90 | def make_resData(data, chat=False, promptToken=[]): 91 | resData = { 92 | "id": "chatcmpl" if (chat) else "cmpl", 93 | "object": "chat.completion" if (chat) else "text_completion", 94 | "created": int(time.time()), 95 | "truncated": data["truncated"], 96 | "model": "LLaMA_CPP", 97 | "usage": { 98 | "prompt_tokens": data["tokens_evaluated"], 99 | "completion_tokens": data["tokens_predicted"], 100 | "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"] 101 | } 102 | } 103 | if (len(promptToken) != 0): 104 | resData["promptToken"] = promptToken 105 | if (chat): 106 | #only one choice is supported 107 | resData["choices"] = [{ 108 | "index": 0, 109 | "message": { 110 | "role": "assistant", 111 | "content": data["content"], 112 | }, 113 | "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 114 | }] 115 | else: 116 | #only one choice is supported 117 | resData["choices"] = [{ 118 | "text": data["content"], 119 | "index": 0, 120 | "logprobs": None, 121 | "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 122 | }] 123 | return resData 124 | 125 | def make_resData_stream(data, chat=False, time_now = 0, start=False): 126 | resData = { 127 | "id": "chatcmpl" if (chat) else "cmpl", 128 | "object": "chat.completion.chunk" if (chat) else "text_completion.chunk", 129 | "created": time_now, 130 | "model": "LLaMA_CPP", 131 | "choices": [ 132 | { 133 | "finish_reason": None, 134 | "index": 0 135 | } 136 | ] 137 | } 138 | if (chat): 139 | if (start): 140 | resData["choices"][0]["delta"] = { 141 | "role": "assistant" 142 | } 143 | else: 144 | resData["choices"][0]["delta"] = { 145 | "content": data["content"] 146 | } 147 | if (data["stop"]): 148 | resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 149 | else: 150 | resData["choices"][0]["text"] = data["content"] 151 | if (data["stop"]): 152 | resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 153 | 154 | return resData 155 | 156 | def update_model(bucket, key): 157 | try: 158 | s3 = boto3.client('s3') 159 | s3.download_file(bucket, key, os.environ.get('MODELPATH')) 160 | subprocess.run(["/app/server.sh", os.environ.get('MODELPATH')]) 161 | return True 162 | except Exception as e: 163 | print(e) 164 | print(str(traceback.format_exc())) 165 | return False 166 | 167 | @app.route('/ping', methods=['GET']) 168 | def ping(): 169 | return Response(status=200) 170 | 171 | @app.route("/invocations", methods=['POST']) 172 | def completion(): 173 | if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): 174 | return Response(status=403) 175 | body = request.get_json() 176 | stream = False 177 | tokenize = False 178 | if (is_present(body, "configure")): 179 | res = update_model(body["configure"]["bucket"], body["configure"]["key"]) 180 | return Response(status=200) if (res) else Response(status=500) 181 | if(is_present(body, "stream")): stream = body["stream"] 182 | if(is_present(body, "tokenize")): tokenize = body["tokenize"] 183 | postData = make_postData(body, chat=False, stream=stream) 184 | 185 | promptToken = [] 186 | if (tokenize): 187 | tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() 188 | promptToken = tokenData["tokens"] 189 | 190 | if 
(not stream): 191 | data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) 192 | print(data.json()) 193 | resData = make_resData(data.json(), chat=False, promptToken=promptToken) 194 | return jsonify(resData) 195 | else: 196 | def generate(): 197 | data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) 198 | time_now = int(time.time()) 199 | for line in data.iter_lines(): 200 | if line: 201 | decoded_line = line.decode('utf-8') 202 | resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now) 203 | yield 'data: {}\n'.format(json.dumps(resData)) 204 | return Response(generate(), mimetype='text/event-stream') 205 | 206 | asgi_app = WsgiToAsgi(app) 207 | 208 | #if __name__ == '__main__': 209 | # app.run(args.host, port=args.port) 210 | -------------------------------------------------------------------------------- /infrastructure/llama_cpp_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | Stack, 3 | RemovalPolicy, 4 | aws_s3 as s3, 5 | aws_codebuild as cb, 6 | aws_iam as iam, 7 | aws_lambda as lambda_, 8 | custom_resources as cr, 9 | aws_stepfunctions as sfn, 10 | aws_stepfunctions_tasks as tasks, 11 | aws_ecr as ecr, 12 | aws_s3_assets as s3_assets, 13 | aws_sagemaker as sagemaker, 14 | CustomResource, Duration, RemovalPolicy, Stack, CfnOutput 15 | ) 16 | 17 | from constructs import Construct 18 | 19 | import json 20 | import os 21 | 22 | class LlamaCppStack(Stack): 23 | def __init__(self, scope: Construct, construct_id: str, 24 | project_name: str, 25 | model_bucket_key_full_name: str, 26 | model_hugging_face_name: str, 27 | image_tag: str, 28 | image_platform: str, 29 | model_name: str, 30 | model_instance_type: str, 31 | **kwargs) -> None: 32 | super().__init__(scope, construct_id, **kwargs) 33 | 34 | #============================ 35 | # model_download 36 | #============================ 37 | bucket = s3.Bucket( 38 | self, 39 | f"{project_name}-bucket", 40 | versioned=True, 41 | removal_policy=RemovalPolicy.DESTROY, 42 | enforce_ssl=True, 43 | encryption=s3.BucketEncryption.S3_MANAGED, 44 | auto_delete_objects=True 45 | ) 46 | 47 | model_download_build_project = cb.Project( 48 | self, 49 | f"{project_name}-model-download", 50 | build_spec=cb.BuildSpec.from_asset(os.path.join(os.path.abspath(os.curdir), "cb_buildspec/model_download_buildspec.yaml")), 51 | environment=cb.BuildEnvironment( 52 | privileged=True, 53 | build_image=cb.LinuxBuildImage.STANDARD_6_0 54 | ), 55 | environment_variables={ 56 | "CDK_DEPLOY_ACCOUNT": cb.BuildEnvironmentVariable(value=self.account), 57 | "CDK_DEPLOY_REGION": cb.BuildEnvironmentVariable(value=self.region), 58 | "MODEL_BUCKET_NAME": cb.BuildEnvironmentVariable(value=bucket.bucket_name), 59 | "MODEL_BUCKET_KEY_FULL_NAME": cb.BuildEnvironmentVariable(value=model_bucket_key_full_name), 60 | "MODEL_HUGGING_FACE_NAME": cb.BuildEnvironmentVariable(value=model_hugging_face_name), 61 | "TAG": cb.BuildEnvironmentVariable(value='cdk') 62 | }, 63 | description='Download Large Language Model files to object store', 64 | timeout=Duration.minutes(60), 65 | ) 66 | 67 | bucket.grant_read_write(model_download_build_project) 68 | 69 | sfn_model_download_task = tasks.CodeBuildStartBuild( 70 | self, 71 | f"{project_name}-start-model-download", 72 | project=model_download_build_project, 73 | integration_pattern=sfn.IntegrationPattern.RUN_JOB 74 | ) 75 
| 76 | 77 | #============================ 78 | # model_build 79 | #============================ 80 | model_image_repo = ecr.Repository( 81 | self, 82 | f"{project_name}-model-image-repo", 83 | removal_policy=RemovalPolicy.DESTROY, 84 | auto_delete_images=True 85 | ) 86 | 87 | model_asset_bucket = s3_assets.Asset( 88 | self, 89 | f"{project_name}-model-build-docker-assets", 90 | path = os.path.join(os.path.abspath(os.curdir), "docker"), 91 | ) 92 | 93 | model_build_cb_project = cb.Project( 94 | self, 95 | f"{project_name}-model-build", 96 | source=cb.Source.s3( 97 | bucket=model_asset_bucket.bucket, 98 | path=model_asset_bucket.s3_object_key 99 | ), 100 | build_spec=cb.BuildSpec.from_asset(os.path.join(os.path.abspath(os.curdir), "cb_buildspec/model_build_docker_buildspec.yaml")), 101 | environment=cb.BuildEnvironment( 102 | privileged=True, 103 | build_image=cb.LinuxBuildImage.STANDARD_6_0, 104 | compute_type=cb.ComputeType.X2_LARGE # to decrease wait time 105 | ), 106 | environment_variables={ 107 | "CDK_DEPLOY_ACCOUNT": cb.BuildEnvironmentVariable(value=self.account), 108 | "CDK_DEPLOY_REGION": cb.BuildEnvironmentVariable(value=self.region), 109 | "REPOSITORY_NAME": cb.BuildEnvironmentVariable(value=model_image_repo.repository_name), 110 | "PLATFORM": cb.BuildEnvironmentVariable(value=image_platform), 111 | "IMAGE_TAG": cb.BuildEnvironmentVariable(value=image_tag), 112 | "ECR": cb.BuildEnvironmentVariable(value=model_image_repo.repository_uri), 113 | "TAG": cb.BuildEnvironmentVariable(value='cdk') 114 | }, 115 | description='Project to build and push images to container registry', 116 | timeout=Duration.minutes(60), 117 | ) 118 | model_image_repo.grant_pull_push(model_build_cb_project) 119 | 120 | sfn_model_build_task = tasks.CodeBuildStartBuild( 121 | self, 122 | f"{project_name}-start-model-docker-build", 123 | project=model_build_cb_project, 124 | integration_pattern=sfn.IntegrationPattern.RUN_JOB 125 | ) 126 | 127 | #========================================== 128 | # model_download_build_deployment 129 | #========================================== 130 | # llama-cpp-sm 131 | chain = sfn_model_download_task.next( 132 | sfn_model_build_task 133 | ) 134 | 135 | state_machine = sfn.StateMachine( 136 | self, 137 | f"{project_name}-llama-cpp-statemachine", 138 | definition_body=sfn.DefinitionBody.from_chainable(chain) 139 | ) 140 | 141 | trigger_lambda = lambda_.Function( 142 | self, 143 | f"{project_name}-trigger-llama-cpp-sm", 144 | runtime=lambda_.Runtime.PYTHON_3_12, 145 | handler="trigger_build.lambda_handler", 146 | code=lambda_.Code.from_asset(os.path.join(os.path.abspath(os.curdir), "lambda/trigger_build")), 147 | environment={ 148 | "STATE_MACHINE_ARN": state_machine.state_machine_arn 149 | } 150 | ) 151 | 152 | trigger_lambda.add_to_role_policy(iam.PolicyStatement( 153 | actions=["states:StartExecution","states:ListExecutions"], 154 | resources=[state_machine.state_machine_arn] 155 | )) 156 | 157 | cr_provider = cr.Provider( 158 | self, 159 | f"{project_name}-trigger-resource-provider", 160 | on_event_handler=trigger_lambda, 161 | is_complete_handler=trigger_lambda, 162 | query_interval=Duration.seconds(30) 163 | ) 164 | 165 | trigger_resource_cr = CustomResource( 166 | self, 167 | f"{project_name}-trigger-resource", 168 | service_token=cr_provider.service_token 169 | ) 170 | 171 | #============================ 172 | # model_serve 173 | #============================ 174 | model_execution_role = iam.Role( 175 | self, 176 | f"{project_name}-model-execution-role", 177 | 
assumed_by=iam.ServicePrincipal("sagemaker.amazonaws.com"), 178 | inline_policies={ 179 | "ResourcePolicy": iam.PolicyDocument(statements=[ 180 | iam.PolicyStatement( 181 | actions=[ 182 | "cloudwatch:PutMetricData", 183 | "logs:CreateLogStream", 184 | "logs:PutLogEvents", 185 | "logs:CreateLogGroup", 186 | "logs:DescribeLogStreams", 187 | "ecr:GetAuthorizationToken" 188 | ], 189 | resources=[ 190 | f"arn:aws:logs:{self.region}:{self.account}:log-group:/aws/sagemaker/Endpoints/{project_name}-{model_name}-Endpoint:*" 191 | ] 192 | ), 193 | iam.PolicyStatement( 194 | actions=[ 195 | "ecr:GetAuthorizationToken" 196 | ], 197 | resources=["*"] 198 | ), 199 | iam.PolicyStatement( 200 | actions=[ 201 | "ecr:ListTagsForResource", 202 | "ecr:ListImages", 203 | "ecr:DescribeRepositories", 204 | "ecr:BatchCheckLayerAvailability", 205 | "ecr:GetLifecyclePolicy", 206 | "ecr:DescribeImageScanFindings", 207 | "ecr:GetLifecyclePolicyPreview", 208 | "ecr:GetDownloadUrlForLayer", 209 | "ecr:BatchGetImage", 210 | "ecr:DescribeImages", 211 | "ecr:GetRepositoryPolicy" 212 | ], 213 | resources=[model_image_repo.repository_arn] 214 | ), 215 | iam.PolicyStatement( 216 | actions=[ 217 | "s3:GetObject", 218 | "s3:HeadObject", 219 | "s3:ListBucket", 220 | "s3:ListBucketVersions", 221 | "s3:GetBucketPolicy", 222 | "s3:GetBucketAcl", 223 | ], 224 | resources=[bucket.bucket_arn, f"{bucket.bucket_arn}/*"] 225 | ) 226 | ]) 227 | } 228 | ) 229 | 230 | model = sagemaker.CfnModel( 231 | self, 232 | f"{model_name}-Model", 233 | execution_role_arn=model_execution_role.role_arn, 234 | containers=[ 235 | sagemaker.CfnModel.ContainerDefinitionProperty( 236 | image=f"{model_image_repo.repository_uri}:{image_tag}", 237 | environment={ 238 | "MMS_MAX_RESPONSE_SIZE": "20000000", 239 | "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", 240 | "SAGEMAKER_PROGRAM": "inference.py", 241 | "SAGEMAKER_REGION": f"{self.region}", 242 | "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code" 243 | } 244 | ) 245 | ], 246 | model_name=f"{project_name}-{model_name}-Model" 247 | ) 248 | model.node.add_dependency(trigger_resource_cr) 249 | 250 | model_config = sagemaker.CfnEndpointConfig( 251 | self, 252 | f"{project_name}-{model_name}-Config", 253 | endpoint_config_name=f"{project_name}-{model_name}-Config", 254 | production_variants=[ 255 | sagemaker.CfnEndpointConfig.ProductionVariantProperty( 256 | model_name=model.attr_model_name, 257 | variant_name="AllTraffic", 258 | initial_instance_count=1, 259 | initial_variant_weight=1, 260 | instance_type=model_instance_type 261 | ) 262 | ] 263 | ) 264 | 265 | model_endpoint = sagemaker.CfnEndpoint( 266 | self, 267 | f"{project_name}-{model_name}-Endpoint", 268 | endpoint_name=f"{project_name}-{model_name}-Endpoint", 269 | endpoint_config_name=model_config.attr_endpoint_config_name 270 | ) 271 | 272 | CfnOutput( 273 | self, 274 | f"{project_name}-{model_name}-endpoint", 275 | value=model_endpoint.endpoint_name 276 | ) 277 | 278 | #============================ 279 | # model_configure 280 | #============================ 281 | sagemaker_endpoint_configure_lambda = lambda_.Function( 282 | self, 283 | f"{project_name}-configure-sagemaker-endpoint-function", 284 | runtime=lambda_.Runtime.PYTHON_3_12, 285 | handler="configure_endpoint.lambda_handler", 286 | timeout=Duration.minutes(3), 287 | code=lambda_.Code.from_asset(os.path.join(os.path.abspath(os.curdir), "lambda/configure_endpoint")), 288 | environment={ 289 | "SAGEMAKER_ENDPOINT_NAME": model_endpoint.attr_endpoint_name, 290 | "MODEL_BUCKET_NAME": bucket.bucket_name, 291 
| "MODEL_BUCKET_KEY_NAME": model_bucket_key_full_name 292 | }, 293 | ) 294 | sagemaker_endpoint_configure_lambda.node.add_dependency(model_endpoint) 295 | 296 | sagemaker_endpoint_configure_lambda.add_to_role_policy(iam.PolicyStatement( 297 | actions=["sagemaker:InvokeEndpoint"], 298 | resources=[f"arn:aws:sagemaker:{self.region}:{self.account}:endpoint/*{model_name}*"] 299 | )) 300 | 301 | config_endpoint_cr_provider = cr.Provider( 302 | self, 303 | f"{project_name}-configure-sagemaker-endpoint-provider", 304 | on_event_handler=sagemaker_endpoint_configure_lambda, 305 | ) 306 | 307 | CustomResource( 308 | self, 309 | f"{project_name}-configure-sagemaker-endpoint-cr", 310 | service_token=config_endpoint_cr_provider.service_token 311 | ) --------------------------------------------------------------------------------