├── tests ├── __init__.py └── unit │ ├── __init__.py │ └── test_infrastructure_stack.py ├── requirements-dev.txt ├── docker ├── requirements.txt ├── serve ├── server.sh ├── image-push.sh ├── dockerfile ├── dockerfile-amd ├── image-build.sh └── main.py ├── requirements.txt ├── images └── genai-llm-sagemaker.png ├── .github ├── ISSUE_TEMPLATE │ └── issue.md └── pull_request_template.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── config.yaml ├── source.bat ├── multimodel_config.yaml ├── cb_buildspec ├── model_download_buildspec.yaml └── model_build_docker_buildspec.yaml ├── LICENSE ├── lambda ├── configure_endpoint │ └── configure_endpoint.py └── trigger_build │ └── trigger_build.py ├── cdk.json ├── multimodel_cdk.py ├── app.py ├── CONTRIBUTING.md ├── notebooks └── inference.ipynb ├── README.md └── infrastructure └── llama_cpp_stack.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | asgiref 3 | boto3 4 | starlette 5 | uvicorn 6 | requests -------------------------------------------------------------------------------- /docker/serve: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "serve" 3 | uvicorn 'main:asgi_app' --host 0.0.0.0 --port 8080 --workers 8 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.110.1 2 | constructs>=10.0.0,<11.0.0 3 | cdk-nag==2.27.214 4 | PyYAML==6.0.1 -------------------------------------------------------------------------------- /images/genai-llm-sagemaker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/genai-llm-cpu-sagemaker/HEAD/images/genai-llm-sagemaker.png -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue 3 | about: Report an issue 4 | title: '' 5 | labels: kind/issue 6 | assignees: '' 7 | --- -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .venv 6 | *.egg-info 7 | 8 | # CDK asset staging directory 9 | .cdk.staging 10 | cdk.out 11 | .DS_Store 12 | node_modules 13 | .local.* 14 | *.zip 15 | *.tar.gz 16 | *.gguf 17 | *.diff 18 | *.out 19 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | image: 3 | image_tag: arm-latest 4 | platform: ARM 5 | inference: 6 | instance_type: ml.c7g.8xlarge 7 | sagemaker_model_name: llama-2-7b-chat-arm 8 | model: 9 | full_name: llama-2-7b-chat.Q4_K_M.gguf 10 | hf_name: TheBloke/Llama-2-7b-Chat-GGUF 11 | name: llmcpp-llama-2-7b-chat 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | 6 | 7 | ## Checklist 8 | 9 | Please make sure you've completed the relevant tasks for this PR, out of the following list: 10 | 11 | * [ ] [Understand the repository structure](./README.md) 12 | * [ ] [Read our general contribution guidelines](./CONTRIBUTING.md) 13 | * [ ] [Read our code of conduct](./CODE_OF_CONDUCT.md) 14 | -------------------------------------------------------------------------------- /docker/server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "server.sh" 3 | echo "args: $1" 4 | 5 | # Check if NVIDIA GPU is available 6 | if lspci | grep -i nvidia &> /dev/null; then 7 | echo "NVIDIA GPU is available." 8 | NGL=999 9 | CPU_PER_SLOT=1 10 | else 11 | echo "No NVIDIA GPU found." 12 | NGL=0 13 | CPU_PER_SLOT=4 14 | fi 15 | 16 | killall llama-server 17 | /app/llama-server -m "$1" -c 2048 -t $(nproc --all) --host 0.0.0.0 --port 8081 -cb -np $(($(nproc --all) / $CPU_PER_SLOT)) -ngl $NGL & 18 | -------------------------------------------------------------------------------- /source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /docker/image-push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #COMMIT_HASH="latest" 4 | 5 | if [[ $# -ge 3 ]]; then 6 | export CDK_DEPLOY_ACCOUNT=$1 7 | export CDK_DEPLOY_REGION=$2 8 | export REPOSITORY_NAME=$3 9 | export IMAGE_TAG=$4 10 | shift; shift 11 | 12 | echo ==--------ECRPush---------== 13 | docker push "${CDK_DEPLOY_ACCOUNT}.dkr.ecr.${CDK_DEPLOY_REGION}.amazonaws.com/${REPOSITORY_NAME}:${IMAGE_TAG}" 14 | exit $? 15 | else 16 | echo 1>&2 "Provide account and region as first two args..." 17 | echo 1>&2 "followed by repositopry name and image tag." 
18 | exit 1 19 | fi -------------------------------------------------------------------------------- /tests/unit/test_infrastructure_stack.py: -------------------------------------------------------------------------------- 1 | import aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | 4 | from infrastructure.infrastructure_stack import InfrastructureStack 5 | 6 | # example tests. To run these tests, uncomment this file along with the example 7 | # resource in infrastructure/infrastructure_stack.py 8 | def test_sqs_queue_created(): 9 | app = core.App() 10 | stack = InfrastructureStack(app, "infrastructure") 11 | template = assertions.Template.from_stack(stack) 12 | 13 | # template.has_resource_properties("AWS::SQS::Queue", { 14 | # "VisibilityTimeout": 300 15 | # }) 16 | -------------------------------------------------------------------------------- /multimodel_config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | - name: "llmcpp-llama-2-7b-chat" 3 | model: 4 | hf_name: "TheBloke/Llama-2-7b-Chat-GGUF" 5 | full_name: "llama-2-7b-chat.Q4_K_M.gguf" 6 | image: 7 | platform: "ARM" 8 | image_tag: "arm-latest" 9 | inference: 10 | sagemaker_model_name: "llama-2-7b-chat-arm" 11 | instance_type: "ml.c7g.8xlarge" 12 | - name: "mistral-7b" 13 | model: 14 | hf_name: "TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF" 15 | full_name: "capybarahermes-2.5-mistral-7b.Q4_K_M.gguf" 16 | image: 17 | platform: "AMD" 18 | image_tag: "amd-latest" 19 | inference: 20 | sagemaker_model_name: "mistral-7b-g5" 21 | instance_type: "ml.g5.xlarge" -------------------------------------------------------------------------------- /cb_buildspec/model_download_buildspec.yaml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | env: 3 | shell: bash 4 | phases: 5 | install: 6 | commands: 7 | - echo Entered the install phase... 8 | - sudo apt-get update 9 | - sudo apt-get install -y python3-pip 10 | - pip3 install huggingface-hub>=0.17.1 hf_transfer 11 | build: 12 | on-failure: CONTINUE 13 | commands: 14 | - echo Entered the build phase... 15 | - echo Downloading model 16 | - HUGGINGFACE_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download ${MODEL_HUGGING_FACE_NAME} ${MODEL_BUCKET_KEY_FULL_NAME} --local-dir . --local-dir-use-symlinks False 17 | - echo Copying uncompressed file 18 | - aws s3 cp ${MODEL_BUCKET_KEY_FULL_NAME} s3://${MODEL_BUCKET_NAME}/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /cb_buildspec/model_build_docker_buildspec.yaml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | env: 4 | shell: bash 5 | 6 | phases: 7 | install: 8 | commands: 9 | - mkdir -p $HOME/.docker/cli-plugins 10 | - export BUILDX_VERSION=$(curl --silent "https://api.github.com/repos/docker/buildx/releases/latest" |jq -r .tag_name) 11 | - wget -O $HOME/.docker/cli-plugins/docker-buildx https://github.com/docker/buildx/releases/download/$BUILDX_VERSION/buildx-$BUILDX_VERSION.linux-arm64 12 | - chmod a+rx $HOME/.docker/cli-plugins/docker-buildx 13 | - docker run --privileged --rm public.ecr.aws/eks-distro-build-tooling/binfmt-misc:qemu-v7.0.0 --install arm64, amd64 14 | - export DOCKER_BUILDKIT=1 15 | - export DOCKER_CLI_EXPERIMENTAL=enabled 16 | build: 17 | commands: 18 | - echo Entered the build phase... 19 | - bash ./image-build.sh $CDK_DEPLOY_ACCOUNT $CDK_DEPLOY_REGION $REPOSITORY_NAME $IMAGE_TAG $PLATFORM 20 | - echo Entered the post_build phase... 21 | - bash ./image-push.sh $CDK_DEPLOY_ACCOUNT $CDK_DEPLOY_REGION $REPOSITORY_NAME $IMAGE_TAG -------------------------------------------------------------------------------- /docker/dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/ggerganov/llama.cpp:full 2 | 3 | # Sets dumping log messages directly to stream instead of buffering 4 | ENV PYTHONUNBUFFERED=1 5 | # Set MODELPATH environment variable 6 | ENV MODELPATH=/app/llm_model.bin 7 | 8 | ENV PATH=$PATH:/app 9 | 10 | # The working directory in the Docker image 11 | WORKDIR /app 12 | 13 | RUN apt-get update 14 | RUN apt-get upgrade -y 15 | RUN apt-get remove linux-libc-dev -y 16 | 17 | # Install system dependencies 18 | RUN apt-get install -y \ 19 | unzip \ 20 | psmisc \ 21 | pciutils 22 | 23 | # Copy requirements.txt and install Python dependencies 24 | COPY requirements.txt ./requirements.txt 25 | #main application file 26 | COPY main.py /app/ 27 | #sagemaker endpoints expects serve file to run the application 28 | COPY serve /app/ 29 | COPY server.sh /app/ 30 | 31 | RUN chmod u+x serve 32 | RUN chmod u+x server.sh 33 | 34 | RUN pip3 install -r requirements.txt 35 | RUN export PATH=/app:$PATH 36 | 37 | ENTRYPOINT ["/bin/bash"] 38 | 39 | # Expose port for the application to run on, has to be 8080 40 | EXPOSE 8080 41 | -------------------------------------------------------------------------------- /docker/dockerfile-amd: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/ggerganov/llama.cpp:full-cuda 2 | 3 | # Sets dumping log messages directly to stream instead of buffering 4 | ENV PYTHONUNBUFFERED=1 5 | # Set MODELPATH environment variable 6 | ENV MODELPATH=/app/llm_model.bin 7 | 8 | ENV PATH=$PATH:/app 9 | 10 | # The working directory in the Docker image 11 | WORKDIR /app 12 | 13 | # Install system dependencies 14 | RUN apt-get update && apt-get install -y \ 15 | unzip \ 16 | libcurl4-openssl-dev \ 17 | python3 \ 18 | python3-pip \ 19 | python3-dev \ 20 | git \ 21 | psmisc \ 22 | pciutils 23 | 24 | # Copy requirements.txt and install Python dependencies 25 | COPY requirements.txt 
./requirements.txt 26 | #main application file 27 | COPY main.py /app/ 28 | #sagemaker endpoints expects serve file to run the application 29 | COPY serve /app/ 30 | COPY server.sh /app/ 31 | 32 | RUN chmod u+x serve 33 | RUN chmod u+x server.sh 34 | 35 | RUN pip3 install -r requirements.txt 36 | RUN export PATH=/app:$PATH 37 | 38 | ENTRYPOINT ["/bin/bash"] 39 | 40 | # Expose port for the application to run on, has to be 8080 41 | EXPOSE 8080 42 | -------------------------------------------------------------------------------- /lambda/configure_endpoint/configure_endpoint.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from os import environ 3 | import json 4 | 5 | sagemaker_client = boto3.client('sagemaker-runtime') 6 | 7 | def lambda_handler(event, context): 8 | print(f'event : {event}') 9 | event_type = event['RequestType'] 10 | endpoint_name = environ['SAGEMAKER_ENDPOINT_NAME'] 11 | payload = { 12 | "configure": { 13 | "bucket": environ['MODEL_BUCKET_NAME'], 14 | "key": environ['MODEL_BUCKET_KEY_NAME'] 15 | } 16 | } 17 | print(f' payload : {json.dumps(payload, default=str)}') 18 | 19 | if event_type in ['Create']: 20 | response = sagemaker_client.invoke_endpoint( 21 | EndpointName=endpoint_name, 22 | ContentType='application/json', 23 | Body=json.dumps(payload) 24 | ) 25 | print(f"response: {response}") 26 | 27 | return { 28 | 'statusCode': 200, 29 | 'Response': json.dumps(response, default=str) 30 | } 31 | 32 | return{ 33 | 'statusCode': 200, 34 | 'Response': 'Not a create request!' 35 | } -------------------------------------------------------------------------------- /lambda/trigger_build/trigger_build.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from os import environ 3 | import json 4 | 5 | sfn_client = boto3.client('stepfunctions') 6 | 7 | def lambda_handler(event, context): 8 | print(f'event : {event}') 9 | event_type = event['RequestType'] 10 | sm_arn = environ['STATE_MACHINE_ARN'] 11 | 12 | if event_type in ['Create', 'Delete']: 13 | res = sfn_client.list_executions( 14 | stateMachineArn=sm_arn, 15 | maxResults=1 16 | ) 17 | if res['executions']: 18 | print(f'exections exists : {res["executions"]}') 19 | if res['executions'][0]['status'] == 'RUNNING': 20 | print(f'execution still running. IsComplete: False.') 21 | return { 22 | 'statusCode': 200, 23 | 'IsComplete': False 24 | } 25 | else: 26 | print(f'execution not running. IsComplete: True.') 27 | return { 28 | 'statusCode': 200, 29 | 'IsComplete': True 30 | } 31 | else: 32 | print(f'execution doens\'t exist. 
Executing stepfunction statemachine.') 33 | response = sfn_client.start_execution(stateMachineArn=sm_arn) 34 | print(response) 35 | 36 | return { 37 | 'statusCode': 200, 38 | 'Response': json.dumps(response, default=str) 39 | } 40 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ], 39 | "config_file": "config.yaml" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /docker/image-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #COMMIT_HASH="latest" #"v29.10.1" 4 | IMAGE_NAME="llama-cpp-image" 5 | 6 | if [[ $# -ge 4 ]]; then 7 | export CDK_DEPLOY_ACCOUNT=$1 8 | export CDK_DEPLOY_REGION=$2 9 | export REPOSITORY_NAME=$3 10 | export IMAGE_TAG=$4 11 | export PLATFORM=$5 12 | 13 | export DOCKER_BUILDKIT=1 14 | export DOCKER_CLI_EXPERIMENTAL=enabled 15 | 16 | export PLATFORM_PARAMETER_VALUE="linux/arm64" 17 | export DOCKER_FILE_NAME="dockerfile" 18 | 19 | if [[ $PLATFORM == *"arm"* ]] 20 | then 21 | PLATFORM_PARAMETER_VALUE="linux/arm64" 22 | DOCKER_FILE_NAME="dockerfile" 23 | echo "[INFO] Building an image for ARM platform" 24 | elif [[ $PLATFORM == *"amd"* ]] 25 | then 26 | PLATFORM_PARAMETER_VALUE="linux/amd64" 27 | DOCKER_FILE_NAME="dockerfile-amd" 28 | echo "[INFO] Building an image for AMD platform" 29 | else 30 | echo "[ERROR] Platform {$PLATFORM} not supported." 31 | exit 0 32 | fi 33 | 34 | shift; shift 35 | echo ==--------ECRLogin---------== 36 | aws ecr get-login-password --region "${CDK_DEPLOY_REGION}" | docker login --username AWS --password-stdin "${CDK_DEPLOY_ACCOUNT}.dkr.ecr.${CDK_DEPLOY_REGION}.amazonaws.com" 37 | 38 | echo ==--------ECRBuild---------== 39 | docker buildx build --platform "${PLATFORM_PARAMETER_VALUE}" -t "${IMAGE_NAME}:${IMAGE_TAG}" -f "${DOCKER_FILE_NAME}" . 40 | 41 | echo ==--------ECRTag---------== 42 | docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${CDK_DEPLOY_ACCOUNT}.dkr.ecr.${CDK_DEPLOY_REGION}.amazonaws.com/${REPOSITORY_NAME}:${IMAGE_TAG}" 43 | exit $? 
44 | else 45 | echo 1>&2 "Provide account and region as first two args..." 46 | echo 1>&2 "followed by repositopry name, image tag and platform." 47 | exit 1 48 | fi -------------------------------------------------------------------------------- /multimodel_cdk.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from concurrent.futures import ThreadPoolExecutor 3 | import yaml 4 | import tempfile 5 | import os 6 | import argparse 7 | 8 | def execute_command(command): 9 | try: 10 | # Execute the command and capture its output 11 | output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, universal_newlines=True) 12 | return command, output 13 | except subprocess.CalledProcessError as e: 14 | # Capture error output if the command fails 15 | return command, e.output 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="LlamaCpp Multimodel Deploy utility") 19 | parser.add_argument("--deploy", action="store_true", help="Deploy model stacks") 20 | parser.add_argument("--destroy", action="store_true", help="Destroy model stacks") 21 | parser.add_argument("--config",help="Multimodel config file", default="multimodel_config.yaml" ) 22 | parser.add_argument("--output-dir", help="Output directory for model deployment assets", default="./cdk.out/.multimodel_deploy") 23 | args = parser.parse_args() 24 | 25 | # list of cdk stacks 26 | dotfiles_dir = args.output_dir 27 | with open(args.config, 'r') as f: 28 | project_config = yaml.safe_load(f) 29 | 30 | os.makedirs(dotfiles_dir, exist_ok=True) 31 | 32 | dotfiles = [] 33 | for idx, project in enumerate(project_config['project']): 34 | dotfile = tempfile.NamedTemporaryFile(prefix='.', suffix='.yaml', delete=False, dir=dotfiles_dir) 35 | dotfiles.append(dotfile.name) 36 | with open(dotfile.name, 'w') as f: 37 | yaml.dump({'project': project}, f) 38 | 39 | # List of commands to execute 40 | commands = [] 41 | print('Running following in parallel : ') 42 | for idx, config_file in enumerate(dotfiles): 43 | output_dir_name = os.path.splitext(config_file)[0] 44 | if args.deploy: 45 | commands.append(f"cdk deploy --context config_file='{config_file}' --output='{output_dir_name}' --require-approval=never") 46 | elif args.destroy: 47 | commands.append(f"cdk destroy --context config_file='{config_file}' --output='{output_dir_name}' --require-approval=never --force") 48 | else: 49 | parser.print_help() 50 | return 51 | print(commands[idx]) 52 | 53 | 54 | # Maximum number of threads to use 55 | max_threads = 5 56 | 57 | # Execute commands in parallel using a ThreadPoolExecutor 58 | with ThreadPoolExecutor(max_threads) as executor: 59 | # Submit each command to the executor 60 | futures = [executor.submit(execute_command, cmd) for cmd in commands] 61 | 62 | # Wait for all commands to complete and collect results 63 | for future in futures: 64 | command, result = future.result() 65 | print(f"Output of command '{command}':") 66 | print(result) 67 | print("=" * 50) 68 | 69 | if __name__ == "__main__": 70 | main() -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from os import getenv 3 | 4 | import aws_cdk as cdk 5 | from cdk_nag import AwsSolutionsChecks, NagSuppressions, NagPackSuppression 6 | 7 | import yaml 8 | 9 | from infrastructure.llama_cpp_stack import LlamaCppStack 10 | 11 | ### Set environment 12 | 
environment=cdk.Environment( 13 | region=getenv("AWS_REGION", getenv("CDK_DEFAULT_REGION")), 14 | account=getenv("AWS_ACCOUNT_ID", getenv("CDK_DEFAULT_ACCOUNT")), 15 | ) 16 | 17 | # cdk app 18 | app = cdk.App() 19 | 20 | ## Read config 21 | with open(app.node.try_get_context('config_file'), 'r') as f: 22 | config = yaml.safe_load(f)['project'] 23 | 24 | project_name = config['name'] 25 | model_hugging_face_name = config['model']['hf_name'] 26 | model_bucket_key_full_name = config['model']['full_name'] 27 | platform = config['image']['platform'].lower() 28 | image_tag = config['image']['image_tag'] 29 | sagemaker_model_name = config['inference']['sagemaker_model_name'] 30 | sagemaker_instance_type = config['inference']['instance_type'] 31 | 32 | ### Validate input 33 | if platform not in ["arm", "amd"]: 34 | raise ValueError(f"[ERROR] Value {platform} of the \"image.platform\" parameter does not match one of the suported values: ['arm', 'amd']") 35 | if platform not in ["arm"] and "g" in sagemaker_instance_type.split(".")[1] and sagemaker_instance_type.split(".")[1] not in ["g5"]: 36 | print("[WARNING] Platfrom for the image is not set to ARM, however, instance type potentially belongs to the AWS Graviton family.") 37 | 38 | # stack 39 | llamaCppStack = LlamaCppStack(app, 40 | f"{project_name}-LlamaCppStack", 41 | env=environment, 42 | project_name=project_name, 43 | model_bucket_key_full_name=model_bucket_key_full_name, 44 | model_hugging_face_name=model_hugging_face_name, 45 | image_tag=image_tag, 46 | image_platform=platform, 47 | model_name=sagemaker_model_name, 48 | model_instance_type=sagemaker_instance_type 49 | ) 50 | 51 | # tags 52 | tags = { 53 | "SolutionName": "LlamacppSagemakerEndpoint", 54 | "SolutionVersion": "v1.0.0", 55 | "SolutionIaC": "CDK v2" 56 | } 57 | 58 | for key, val in tags.items(): 59 | cdk.Tags.of(app).add(key,val) 60 | 61 | # cdk-nag checks 62 | nag_suppressions = [ 63 | {"id": "AwsSolutions-IAM5", "reason": "CodePipeline policy needs to have full access to assets S3 bucket."}, 64 | {"id": "AwsSolutions-IAM4", "reason": "CustomeResource Lambda function using managed policy, following least previleges."}, 65 | {"id": "AwsSolutions-L1", "reason": "CDK CustomResource limitation."}, 66 | {"id": "AwsSolutions-SF1", "reason": "State machine used for trigger CodeBuild job in sync, thus logging ALL events is not needed."}, 67 | {"id": "AwsSolutions-SF2", "reason": "State machine used for trigger CodeBuild job in sync, thus X-ray is not needed."}, 68 | {"id": "AwsSolutions-CB4", "reason": "CodeBuild does not have to encrypt data for the purpose of this sample code. Adding KMS key would incur additional cost."} 69 | ] 70 | 71 | for supression in nag_suppressions: 72 | NagSuppressions.add_stack_suppressions(llamaCppStack, [NagPackSuppression(id=supression["id"], reason=supression["reason"])]) 73 | 74 | # cdk.Aspects.of(app).add(AwsSolutionsChecks(verbose=True)) 75 | 76 | app.synth() 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 
5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | -------------------------------------------------------------------------------- /notebooks/inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\"\"\"\n", 10 | "Here we define the functionality to interact with endpoint. \n", 11 | "we use different function for handling streaming response as the output format is different.\n", 12 | "define \"endpoint_name\" variable below based on the cloudformation stack output.\n", 13 | "\"\"\"\n", 14 | "\n", 15 | "import boto3\n", 16 | "import json\n", 17 | "\n", 18 | "sagemaker_runtime = boto3.client('sagemaker-runtime', region_name='us-east-1')\n", 19 | "endpoint_name='llmcpp-llama-2-7b-chat-llama-2-7b-chat-arm-Endpoint'\n", 20 | "\n", 21 | "def invoke_sagemaker_endpoint(endpoint_name, llama_args):\n", 22 | " payload = {\n", 23 | " 'inference': True,\n", 24 | " 'configure': False,\n", 25 | " 'args': llama_args\n", 26 | " }\n", 27 | " response = sagemaker_runtime.invoke_endpoint(\n", 28 | " EndpointName=endpoint_name,\n", 29 | " Body=json.dumps(llama_args),\n", 30 | " ContentType='application/json',\n", 31 | " )\n", 32 | " response_body = json.loads(response['Body'].read().decode())\n", 33 | " return response_body\n", 34 | "\n", 35 | "def invoke_sagemaker_streaming_endpoint(endpoint_name, payload):\n", 36 | " response = sagemaker_runtime.invoke_endpoint_with_response_stream(\n", 37 | " EndpointName=endpoint_name,\n", 38 | " Body=json.dumps(payload),\n", 39 | " ContentType='application/json',\n", 40 | " ) \n", 41 | " event_stream = response['Body']\n", 42 | " for line in event_stream:\n", 43 | " itm = line['PayloadPart']['Bytes'][6:]\n", 44 | " try:\n", 45 | " res = json.loads(itm, strict=False )\n", 46 | " print(res[\"choices\"][0][\"text\"], end='')\n", 47 | " except:\n", 48 | " #non-valid json, e.g. empty token \n", 49 | " pass\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 6, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "'\\n everybody has their own unique preferences and interests, but there are some places that consistently top the lists of must-see destinations in Europe. From ancient ruins to modern cities , here is a list 10 most popular destination for travelers: Rome Italy - The Eternal City boasts an incredible history dating back centuries . Visit iconic landmarks like Colosseum and Vatican city, indulge...\\nRomeItaly Europe Travel Destinations Top Lists'" 61 | ] 62 | }, 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "\"\"\"\n", 70 | "Non-streaming inference example. \n", 71 | "\"\"\"\n", 72 | "\n", 73 | "\n", 74 | "llama_args = {\n", 75 | " \"prompt\": \"What are top 10 destinations to visit in Europe?\",\n", 76 | " \"max_tokens\": 128,\n", 77 | " \"temperature\": 0.1,\n", 78 | " \"repeat_penalty\":1.5,\n", 79 | " \"frequency_penalty\":1.1,\n", 80 | " \"top_p\": 0.5\n", 81 | "}\n", 82 | "\n", 83 | "inference = invoke_sagemaker_endpoint(endpoint_name,llama_args)\n", 84 | "inference['choices'][0]['text']" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 7, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "\n", 97 | " obviously there many other destinations to visit in Europe. 
But here is a list of the top 10 most popular and highly recommended places: Paris France Rome Italy Barcelona Spain Amsterdam Netherlands Berlin Germany Prague Czech Republic Athens Greece Each country has its unique culture history architecture food wine beaches scenic views landmarks museums art galleries festivals events parks forests lakes rivers mountains valleys coastlines islands.\n", 98 | "Top 10 Destinations to Visit in Europe: A Comprehensive Guide (2023)" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "\"\"\"\n", 104 | "Streaming inference example\n", 105 | "to enable streaming mode, set stream=True\n", 106 | "\"\"\"\n", 107 | "\n", 108 | "llama_args = {\n", 109 | " \"prompt\": \"What are top 10 destinations to visit in Europe?\",\n", 110 | " \"max_tokens\": 300,\n", 111 | " \"temperature\": 0.1,\n", 112 | " \"repeat_penalty\":1.5,\n", 113 | " \"frequency_penalty\":1.1,\n", 114 | " \"top_p\": 0.5,\n", 115 | " \"stream\": True\n", 116 | "}\n", 117 | "\n", 118 | "invoke_sagemaker_streaming_endpoint(endpoint_name,llama_args)" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.9.6" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Large Language Models (LLMs) on CPU as SageMaker Endpoints 2 | 3 | This code demonstrates how you can run Large Language Models (LLMs) on CPU-only instances, including Graviton. We use the [Llama.cpp project](https://github.com/ggerganov/llama.cpp) and expose a SageMaker endpoint API for inference. Models are downloaded from the [Hugging Face model hub](https://huggingface.co/models). 4 | The project can be deployed to be compatible with both ARM64 and x86 architectures. 5 | 6 | ## Project Overview 7 | 8 | This project is built using the [AWS Cloud Development Kit](https://aws.amazon.com/cdk/) (AWS CDK) with Python. 9 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 10 | 11 | ### Configuration 12 | 13 | AWS CDK app configuration values are in `config.yaml`: 14 | 15 | | Parameter | Description | Example value | 16 | | :--- | :--- | :--- | 17 | | project.name | Used as a prefix for AWS resources created with this app | cpu-llm | 18 | | model.hf_name | [HuggingFace](https://huggingface.co) model name | TheBloke/Llama-2-7b-Chat-GGUF | 19 | | model.full_name | [HuggingFace](https://huggingface.co) model file full name | llama-2-7b-chat.Q4_K_M.gguf | 20 | | image.platform | Platform used to run inference and build the image; values: ["ARM", "AMD"] | ARM | 21 | | image.image_tag | Tag applied to the built image | arm-latest | 22 | | inference.sagemaker_model_name | SageMaker endpoint name for model inference | llama-2-7b-chat | 23 | | inference.instance_type | Instance type used for the SageMaker endpoint | "ml.c7g.8xlarge" for the ARM platform or "ml.g5.xlarge" for the AMD platform | 24 | 25 | At the moment the only supported options are ARM-based inference on Amazon Graviton processors and AMD-based inference for CUDA-based GPUs (G5 instances are highly recommended).
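For example, a `config.yaml` targeting the GPU-backed option could look like the sketch below; the model, tag, and instance values are taken from the second entry of the sample `multimodel_config.yaml` and should be replaced with your own choices:

```yaml
project:
  image:
    image_tag: amd-latest
    platform: AMD
  inference:
    instance_type: ml.g5.xlarge
    sagemaker_model_name: mistral-7b-g5
  model:
    full_name: capybarahermes-2.5-mistral-7b.Q4_K_M.gguf
    hf_name: TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF
  name: mistral-7b
```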
Note that for GPU inference, sharding model weights across multiple GPU cards is not supported. 26 | 27 | ### Architecture 28 | 29 | ![architecture diagram](images/genai-llm-sagemaker.png) 30 | 31 | The stack can be found in the `./infrastructure` directory. 32 | 33 | ## Prerequisites 34 | 35 | Before proceeding any further, you need to identify and designate an AWS account required for the solution to work. 36 | 37 | ### Deploying from your local machine 38 | 39 | You need to create an AWS account profile in ~/.aws/credentials for the designated AWS account, if you don't already have one. The profile needs to have sufficient permissions to run an [AWS Cloud Development Kit](https://aws.amazon.com/cdk/) (AWS CDK) stack. We recommend removing the profile when you're finished testing. For more information about creating an AWS account profile, see [Configuring the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 40 | 41 | Python 3.11.x or later has to be installed on the machine used to run the CDK code. 42 | You will also need to install the AWS CDK CLI as described in the [documentation](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html) and [bootstrap](https://docs.aws.amazon.com/cdk/v2/guide/bootstrapping.html) your environment. 43 | 44 | 45 | 46 | ### Deploying from a Cloud9 instance 47 | 48 | If you don't want to install the necessary software locally, you can spin up a [Cloud9](https://docs.aws.amazon.com/cloud9/latest/user-guide/create-environment-main.html) instance that already has all the necessary software preinstalled. However, if this is your first CDK deployment in the account and/or region, you will need to [bootstrap](https://docs.aws.amazon.com/cdk/v2/guide/bootstrapping.html) your environment. 49 | 50 | ## CDK deployment 51 | ### To Create Resources / Deploy Stack 52 | 53 | Open the terminal and run the following commands: 54 | 55 | ```bash 56 | # uncomment the line below if you need to bootstrap your environment 57 | # replace ACCOUNT_ID and REGION placeholders with your actual 58 | # AWS account id and region where you deploy the application 59 | 60 | # cdk bootstrap aws://ACCOUNT_ID/REGION 61 | 62 | git clone https://github.com/aws-samples/genai-llm-cpu-sagemaker llamacpp 63 | cd llamacpp 64 | python3 -m venv .venv 65 | source .venv/bin/activate 66 | pip3 install -r requirements.txt 67 | cdk deploy 68 | ``` 69 | 70 | ### To Destroy Resources / Clean-up 71 | 72 | Delete the stack from the CloudFormation console. 73 | 74 | ### Model Selection / Change 75 | 76 | Changing only the model does not require rebuilding the image, and takes approximately 30% less time than redeploying the whole application. You can use the following process: 77 | 78 | 1. Navigate to https://huggingface.co/TheBloke and choose a GGUF model of your choice, for example https://huggingface.co/TheBloke/llama-2-7B-Arguments-GGUF, and scroll to the provided files. Usually Q4_K_M is a good enough compromise (based on our testing, but feel free to try other quantizations yourself). 79 | 80 | 2. Update the values of the variables in `config.yaml` to use the new model: 81 | * model.hf_name - set the Hugging Face model name, e.g. "TheBloke/llama-2-7B-Arguments-GGUF" 82 | * model.full_name - set the Hugging Face file full name, e.g. "llama-2-7b-chat.Q4_K_M.gguf" 83 | 84 | 3. Re-deploy the stack by running `cdk deploy` 85 | 86 | ### Platform Selection / Change 87 | 88 | 1. Update the values of the variables in `config.yaml` to use a different platform: 89 | * platform - set the platform (not case sensitive), e.g. "AMD" 90 | * instance_type - set an instance type that matches the platform, e.g. "ml.g5.xlarge" 91 | * image_tag - (optional) update the image tag, e.g. "amd-latest" 92 | 93 | 2. Re-deploy the stack by running `cdk deploy` 94 | 95 | 96 | ## Multi-Model Deployment 97 | 98 | Sometimes you want to try multiple models from Hugging Face to compare response quality or latency. For this you can specify several models in `multimodel_config.yaml` and then use the provided Python script to start multiple model deployments in parallel. 99 | 100 | ```bash 101 | python3 multimodel_cdk.py --deploy 102 | ``` 103 | 104 | ## Inference 105 | 106 | Use `notebooks/inference.ipynb` as an example. The IAM credentials / IAM role that you use to run the notebook have to allow `sagemaker:InvokeEndpoint` API calls. 107 | 108 | If you don't have an existing environment for running Jupyter notebooks, the easiest way to run the notebook is to create a new SageMaker [notebook instance](https://docs.aws.amazon.com/sagemaker/latest/dg/howitworks-create-ws.html) using default settings, letting SageMaker create the necessary IAM role with enough permissions to interact with the provisioned LLM endpoint. 109 | 110 | 111 | ## Limitations 112 | 113 | At the moment there is a [25 GB limit](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-byoi-specs.html) on custom Docker image size. Please make sure the size of the GGUF model file you want to use is below that limit. 114 | 115 | ## Security 116 | 117 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 118 | 119 | ## License 120 | 121 | This library is licensed under the MIT-0 License. See the LICENSE file. 122 | -------------------------------------------------------------------------------- /docker/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from asgiref.wsgi import WsgiToAsgi 4 | from flask import Flask, jsonify, request, Response 5 | import urllib.parse 6 | import requests 7 | import time 8 | import json 9 | import boto3 10 | import os 11 | import subprocess 12 | import traceback 13 | 14 | 15 | app = Flask(__name__) 16 | slot_id = -1 17 | 18 | parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.") 19 | parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant.
The assistant follows the given rules no matter what.\\n') 20 | parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ") 21 | parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ") 22 | parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ") 23 | parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '')", default="") 24 | parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8081)", default='http://127.0.0.1:8081') 25 | parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="") 26 | parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1') 27 | parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8080)", default=8080) 28 | 29 | args, unknown = parser.parse_known_args() 30 | 31 | def is_present(json, key): 32 | try: 33 | buf = json[key] 34 | except KeyError: 35 | return False 36 | if json[key] == None: 37 | return False 38 | return True 39 | 40 | #convert chat to prompt 41 | def convert_chat(messages): 42 | prompt = "" + args.chat_prompt.replace("\\n", "\n") 43 | 44 | system_n = args.system_name.replace("\\n", "\n") 45 | user_n = args.user_name.replace("\\n", "\n") 46 | ai_n = args.ai_name.replace("\\n", "\n") 47 | stop = args.stop.replace("\\n", "\n") 48 | 49 | 50 | for line in messages: 51 | if (line["role"] == "system"): 52 | prompt += f"{system_n}{line['content']}" 53 | if (line["role"] == "user"): 54 | prompt += f"{user_n}{line['content']}" 55 | if (line["role"] == "assistant"): 56 | prompt += f"{ai_n}{line['content']}{stop}" 57 | prompt += ai_n.rstrip() 58 | 59 | return prompt 60 | 61 | def make_postData(body, chat=False, stream=False): 62 | postData = {} 63 | if (chat): 64 | postData["prompt"] = convert_chat(body["messages"]) 65 | else: 66 | postData["prompt"] = body["prompt"] 67 | if(is_present(body, "temperature")): postData["temperature"] = body["temperature"] 68 | if(is_present(body, "top_k")): postData["top_k"] = body["top_k"] 69 | if(is_present(body, "top_p")): postData["top_p"] = body["top_p"] 70 | if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"] 71 | if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"] 72 | if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"] 73 | if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"] 74 | if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"] 75 | if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"] 76 | if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"] 77 | if(is_present(body, "seed")): postData["seed"] = body["seed"] 78 | if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()] 79 | if (args.stop != ""): 80 | postData["stop"] = [args.stop] 81 | else: 82 | postData["stop"] = [] 83 | if(is_present(body, "stop")): postData["stop"] += body["stop"] 84 | postData["n_keep"] = -1 85 | postData["stream"] = stream 86 | 
postData["cache_prompt"] = True 87 | postData["slot_id"] = slot_id 88 | return postData 89 | 90 | def make_resData(data, chat=False, promptToken=[]): 91 | resData = { 92 | "id": "chatcmpl" if (chat) else "cmpl", 93 | "object": "chat.completion" if (chat) else "text_completion", 94 | "created": int(time.time()), 95 | "truncated": data["truncated"], 96 | "model": "LLaMA_CPP", 97 | "usage": { 98 | "prompt_tokens": data["tokens_evaluated"], 99 | "completion_tokens": data["tokens_predicted"], 100 | "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"] 101 | } 102 | } 103 | if (len(promptToken) != 0): 104 | resData["promptToken"] = promptToken 105 | if (chat): 106 | #only one choice is supported 107 | resData["choices"] = [{ 108 | "index": 0, 109 | "message": { 110 | "role": "assistant", 111 | "content": data["content"], 112 | }, 113 | "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 114 | }] 115 | else: 116 | #only one choice is supported 117 | resData["choices"] = [{ 118 | "text": data["content"], 119 | "index": 0, 120 | "logprobs": None, 121 | "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 122 | }] 123 | return resData 124 | 125 | def make_resData_stream(data, chat=False, time_now = 0, start=False): 126 | resData = { 127 | "id": "chatcmpl" if (chat) else "cmpl", 128 | "object": "chat.completion.chunk" if (chat) else "text_completion.chunk", 129 | "created": time_now, 130 | "model": "LLaMA_CPP", 131 | "choices": [ 132 | { 133 | "finish_reason": None, 134 | "index": 0 135 | } 136 | ] 137 | } 138 | if (chat): 139 | if (start): 140 | resData["choices"][0]["delta"] = { 141 | "role": "assistant" 142 | } 143 | else: 144 | resData["choices"][0]["delta"] = { 145 | "content": data["content"] 146 | } 147 | if (data["stop"]): 148 | resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 149 | else: 150 | resData["choices"][0]["text"] = data["content"] 151 | if (data["stop"]): 152 | resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" 153 | 154 | return resData 155 | 156 | def update_model(bucket, key): 157 | try: 158 | s3 = boto3.client('s3') 159 | s3.download_file(bucket, key, os.environ.get('MODELPATH')) 160 | subprocess.run(["/app/server.sh", os.environ.get('MODELPATH')]) 161 | return True 162 | except Exception as e: 163 | print(e) 164 | print(str(traceback.format_exc())) 165 | return False 166 | 167 | @app.route('/ping', methods=['GET']) 168 | def ping(): 169 | return Response(status=200) 170 | 171 | @app.route("/invocations", methods=['POST']) 172 | def completion(): 173 | if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): 174 | return Response(status=403) 175 | body = request.get_json() 176 | stream = False 177 | tokenize = False 178 | if (is_present(body, "configure")): 179 | res = update_model(body["configure"]["bucket"], body["configure"]["key"]) 180 | return Response(status=200) if (res) else Response(status=500) 181 | if(is_present(body, "stream")): stream = body["stream"] 182 | if(is_present(body, "tokenize")): tokenize = body["tokenize"] 183 | postData = make_postData(body, chat=False, stream=stream) 184 | 185 | promptToken = [] 186 | if (tokenize): 187 | tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() 188 | promptToken = tokenData["tokens"] 189 | 190 | if 
(not stream): 191 | data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) 192 | print(data.json()) 193 | resData = make_resData(data.json(), chat=False, promptToken=promptToken) 194 | return jsonify(resData) 195 | else: 196 | def generate(): 197 | data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) 198 | time_now = int(time.time()) 199 | for line in data.iter_lines(): 200 | if line: 201 | decoded_line = line.decode('utf-8') 202 | resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now) 203 | yield 'data: {}\n'.format(json.dumps(resData)) 204 | return Response(generate(), mimetype='text/event-stream') 205 | 206 | asgi_app = WsgiToAsgi(app) 207 | 208 | #if __name__ == '__main__': 209 | # app.run(args.host, port=args.port) 210 | -------------------------------------------------------------------------------- /infrastructure/llama_cpp_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | Stack, 3 | RemovalPolicy, 4 | aws_s3 as s3, 5 | aws_codebuild as cb, 6 | aws_iam as iam, 7 | aws_lambda as lambda_, 8 | custom_resources as cr, 9 | aws_stepfunctions as sfn, 10 | aws_stepfunctions_tasks as tasks, 11 | aws_ecr as ecr, 12 | aws_s3_assets as s3_assets, 13 | aws_sagemaker as sagemaker, 14 | CustomResource, Duration, RemovalPolicy, Stack, CfnOutput 15 | ) 16 | 17 | from constructs import Construct 18 | 19 | import json 20 | import os 21 | 22 | class LlamaCppStack(Stack): 23 | def __init__(self, scope: Construct, construct_id: str, 24 | project_name: str, 25 | model_bucket_key_full_name: str, 26 | model_hugging_face_name: str, 27 | image_tag: str, 28 | image_platform: str, 29 | model_name: str, 30 | model_instance_type: str, 31 | **kwargs) -> None: 32 | super().__init__(scope, construct_id, **kwargs) 33 | 34 | #============================ 35 | # model_download 36 | #============================ 37 | bucket = s3.Bucket( 38 | self, 39 | f"{project_name}-bucket", 40 | versioned=True, 41 | removal_policy=RemovalPolicy.DESTROY, 42 | enforce_ssl=True, 43 | encryption=s3.BucketEncryption.S3_MANAGED, 44 | auto_delete_objects=True 45 | ) 46 | 47 | model_download_build_project = cb.Project( 48 | self, 49 | f"{project_name}-model-download", 50 | build_spec=cb.BuildSpec.from_asset(os.path.join(os.path.abspath(os.curdir), "cb_buildspec/model_download_buildspec.yaml")), 51 | environment=cb.BuildEnvironment( 52 | privileged=True, 53 | build_image=cb.LinuxBuildImage.STANDARD_6_0 54 | ), 55 | environment_variables={ 56 | "CDK_DEPLOY_ACCOUNT": cb.BuildEnvironmentVariable(value=self.account), 57 | "CDK_DEPLOY_REGION": cb.BuildEnvironmentVariable(value=self.region), 58 | "MODEL_BUCKET_NAME": cb.BuildEnvironmentVariable(value=bucket.bucket_name), 59 | "MODEL_BUCKET_KEY_FULL_NAME": cb.BuildEnvironmentVariable(value=model_bucket_key_full_name), 60 | "MODEL_HUGGING_FACE_NAME": cb.BuildEnvironmentVariable(value=model_hugging_face_name), 61 | "TAG": cb.BuildEnvironmentVariable(value='cdk') 62 | }, 63 | description='Download Large Language Model files to object store', 64 | timeout=Duration.minutes(60), 65 | ) 66 | 67 | bucket.grant_read_write(model_download_build_project) 68 | 69 | sfn_model_download_task = tasks.CodeBuildStartBuild( 70 | self, 71 | f"{project_name}-start-model-download", 72 | project=model_download_build_project, 73 | integration_pattern=sfn.IntegrationPattern.RUN_JOB 74 | ) 75 
| 76 | 77 | #============================ 78 | # model_build 79 | #============================ 80 | model_image_repo = ecr.Repository( 81 | self, 82 | f"{project_name}-model-image-repo", 83 | removal_policy=RemovalPolicy.DESTROY, 84 | auto_delete_images=True 85 | ) 86 | 87 | model_asset_bucket = s3_assets.Asset( 88 | self, 89 | f"{project_name}-model-build-docker-assets", 90 | path = os.path.join(os.path.abspath(os.curdir), "docker"), 91 | ) 92 | 93 | model_build_cb_project = cb.Project( 94 | self, 95 | f"{project_name}-model-build", 96 | source=cb.Source.s3( 97 | bucket=model_asset_bucket.bucket, 98 | path=model_asset_bucket.s3_object_key 99 | ), 100 | build_spec=cb.BuildSpec.from_asset(os.path.join(os.path.abspath(os.curdir), "cb_buildspec/model_build_docker_buildspec.yaml")), 101 | environment=cb.BuildEnvironment( 102 | privileged=True, 103 | build_image=cb.LinuxBuildImage.STANDARD_6_0, 104 | compute_type=cb.ComputeType.X2_LARGE # to decrease wait time 105 | ), 106 | environment_variables={ 107 | "CDK_DEPLOY_ACCOUNT": cb.BuildEnvironmentVariable(value=self.account), 108 | "CDK_DEPLOY_REGION": cb.BuildEnvironmentVariable(value=self.region), 109 | "REPOSITORY_NAME": cb.BuildEnvironmentVariable(value=model_image_repo.repository_name), 110 | "PLATFORM": cb.BuildEnvironmentVariable(value=image_platform), 111 | "IMAGE_TAG": cb.BuildEnvironmentVariable(value=image_tag), 112 | "ECR": cb.BuildEnvironmentVariable(value=model_image_repo.repository_uri), 113 | "TAG": cb.BuildEnvironmentVariable(value='cdk') 114 | }, 115 | description='Project to build and push images to container registry', 116 | timeout=Duration.minutes(60), 117 | ) 118 | model_image_repo.grant_pull_push(model_build_cb_project) 119 | 120 | sfn_model_build_task = tasks.CodeBuildStartBuild( 121 | self, 122 | f"{project_name}-start-model-docker-build", 123 | project=model_build_cb_project, 124 | integration_pattern=sfn.IntegrationPattern.RUN_JOB 125 | ) 126 | 127 | #========================================== 128 | # model_download_build_deployment 129 | #========================================== 130 | # llama-cpp-sm 131 | chain = sfn_model_download_task.next( 132 | sfn_model_build_task 133 | ) 134 | 135 | state_machine = sfn.StateMachine( 136 | self, 137 | f"{project_name}-llama-cpp-statemachine", 138 | definition_body=sfn.DefinitionBody.from_chainable(chain) 139 | ) 140 | 141 | trigger_lambda = lambda_.Function( 142 | self, 143 | f"{project_name}-trigger-llama-cpp-sm", 144 | runtime=lambda_.Runtime.PYTHON_3_12, 145 | handler="trigger_build.lambda_handler", 146 | code=lambda_.Code.from_asset(os.path.join(os.path.abspath(os.curdir), "lambda/trigger_build")), 147 | environment={ 148 | "STATE_MACHINE_ARN": state_machine.state_machine_arn 149 | } 150 | ) 151 | 152 | trigger_lambda.add_to_role_policy(iam.PolicyStatement( 153 | actions=["states:StartExecution","states:ListExecutions"], 154 | resources=[state_machine.state_machine_arn] 155 | )) 156 | 157 | cr_provider = cr.Provider( 158 | self, 159 | f"{project_name}-trigger-resource-provider", 160 | on_event_handler=trigger_lambda, 161 | is_complete_handler=trigger_lambda, 162 | query_interval=Duration.seconds(30) 163 | ) 164 | 165 | trigger_resource_cr = CustomResource( 166 | self, 167 | f"{project_name}-trigger-resource", 168 | service_token=cr_provider.service_token 169 | ) 170 | 171 | #============================ 172 | # model_serve 173 | #============================ 174 | model_execution_role = iam.Role( 175 | self, 176 | f"{project_name}-model-execution-role", 177 | 
assumed_by=iam.ServicePrincipal("sagemaker.amazonaws.com"), 178 | inline_policies={ 179 | "ResourcePolicy": iam.PolicyDocument(statements=[ 180 | iam.PolicyStatement( 181 | actions=[ 182 | "cloudwatch:PutMetricData", 183 | "logs:CreateLogStream", 184 | "logs:PutLogEvents", 185 | "logs:CreateLogGroup", 186 | "logs:DescribeLogStreams", 187 | "ecr:GetAuthorizationToken" 188 | ], 189 | resources=[ 190 | f"arn:aws:logs:{self.region}:{self.account}:log-group:/aws/sagemaker/Endpoints/{project_name}-{model_name}-Endpoint:*" 191 | ] 192 | ), 193 | iam.PolicyStatement( 194 | actions=[ 195 | "ecr:GetAuthorizationToken" 196 | ], 197 | resources=["*"] 198 | ), 199 | iam.PolicyStatement( 200 | actions=[ 201 | "ecr:ListTagsForResource", 202 | "ecr:ListImages", 203 | "ecr:DescribeRepositories", 204 | "ecr:BatchCheckLayerAvailability", 205 | "ecr:GetLifecyclePolicy", 206 | "ecr:DescribeImageScanFindings", 207 | "ecr:GetLifecyclePolicyPreview", 208 | "ecr:GetDownloadUrlForLayer", 209 | "ecr:BatchGetImage", 210 | "ecr:DescribeImages", 211 | "ecr:GetRepositoryPolicy" 212 | ], 213 | resources=[model_image_repo.repository_arn] 214 | ), 215 | iam.PolicyStatement( 216 | actions=[ 217 | "s3:GetObject", 218 | "s3:HeadObject", 219 | "s3:ListBucket", 220 | "s3:ListBucketVersions", 221 | "s3:GetBucketPolicy", 222 | "s3:GetBucketAcl", 223 | ], 224 | resources=[bucket.bucket_arn, f"{bucket.bucket_arn}/*"] 225 | ) 226 | ]) 227 | } 228 | ) 229 | 230 | model = sagemaker.CfnModel( 231 | self, 232 | f"{model_name}-Model", 233 | execution_role_arn=model_execution_role.role_arn, 234 | containers=[ 235 | sagemaker.CfnModel.ContainerDefinitionProperty( 236 | image=f"{model_image_repo.repository_uri}:{image_tag}", 237 | environment={ 238 | "MMS_MAX_RESPONSE_SIZE": "20000000", 239 | "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", 240 | "SAGEMAKER_PROGRAM": "inference.py", 241 | "SAGEMAKER_REGION": f"{self.region}", 242 | "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code" 243 | } 244 | ) 245 | ], 246 | model_name=f"{project_name}-{model_name}-Model" 247 | ) 248 | model.node.add_dependency(trigger_resource_cr) 249 | 250 | model_config = sagemaker.CfnEndpointConfig( 251 | self, 252 | f"{project_name}-{model_name}-Config", 253 | endpoint_config_name=f"{project_name}-{model_name}-Config", 254 | production_variants=[ 255 | sagemaker.CfnEndpointConfig.ProductionVariantProperty( 256 | model_name=model.attr_model_name, 257 | variant_name="AllTraffic", 258 | initial_instance_count=1, 259 | initial_variant_weight=1, 260 | instance_type=model_instance_type 261 | ) 262 | ] 263 | ) 264 | 265 | model_endpoint = sagemaker.CfnEndpoint( 266 | self, 267 | f"{project_name}-{model_name}-Endpoint", 268 | endpoint_name=f"{project_name}-{model_name}-Endpoint", 269 | endpoint_config_name=model_config.attr_endpoint_config_name 270 | ) 271 | 272 | CfnOutput( 273 | self, 274 | f"{project_name}-{model_name}-endpoint", 275 | value=model_endpoint.endpoint_name 276 | ) 277 | 278 | #============================ 279 | # model_configure 280 | #============================ 281 | sagemaker_endpoint_configure_lambda = lambda_.Function( 282 | self, 283 | f"{project_name}-configure-sagemaker-endpoint-function", 284 | runtime=lambda_.Runtime.PYTHON_3_12, 285 | handler="configure_endpoint.lambda_handler", 286 | timeout=Duration.minutes(3), 287 | code=lambda_.Code.from_asset(os.path.join(os.path.abspath(os.curdir), "lambda/configure_endpoint")), 288 | environment={ 289 | "SAGEMAKER_ENDPOINT_NAME": model_endpoint.attr_endpoint_name, 290 | "MODEL_BUCKET_NAME": bucket.bucket_name, 291 
| "MODEL_BUCKET_KEY_NAME": model_bucket_key_full_name 292 | }, 293 | ) 294 | sagemaker_endpoint_configure_lambda.node.add_dependency(model_endpoint) 295 | 296 | sagemaker_endpoint_configure_lambda.add_to_role_policy(iam.PolicyStatement( 297 | actions=["sagemaker:InvokeEndpoint"], 298 | resources=[f"arn:aws:sagemaker:{self.region}:{self.account}:endpoint/*{model_name}*"] 299 | )) 300 | 301 | config_endpoint_cr_provider = cr.Provider( 302 | self, 303 | f"{project_name}-configure-sagemaker-endpoint-provider", 304 | on_event_handler=sagemaker_endpoint_configure_lambda, 305 | ) 306 | 307 | CustomResource( 308 | self, 309 | f"{project_name}-configure-sagemaker-endpoint-cr", 310 | service_token=config_endpoint_cr_provider.service_token 311 | ) --------------------------------------------------------------------------------