├── src ├── api │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── base.py │ │ └── bedrock.py │ ├── routers │ │ ├── __init__.py │ │ ├── embeddings.py │ │ ├── model.py │ │ └── chat.py │ ├── setting.py │ ├── auth.py │ ├── app.py │ └── schema.py ├── requirements.txt ├── Dockerfile └── Dockerfile_ecs ├── assets ├── arch.png └── obj-detect.png ├── .github ├── PULL_REQUEST_TEMPLATE.md └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── .pre-commit-config.yaml ├── THIRD_PARTY ├── CODE_OF_CONDUCT.md ├── ruff.toml ├── docker-compose.yml ├── LICENSE ├── CONTRIBUTING.md ├── docs ├── Security.md ├── Troubleshooting.md ├── Usage_CN.md └── Usage.md ├── .gitignore ├── scripts └── push-to-ecr.sh ├── deployment ├── BedrockProxy.template └── BedrockProxyFargate.template └── README.md /src/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/api/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/api/routers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/bedrock-access-gateway/HEAD/assets/arch.png -------------------------------------------------------------------------------- /assets/obj-detect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/bedrock-access-gateway/HEAD/assets/obj-detect.png -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.116.1 2 | pydantic==2.11.4 3 | uvicorn==0.29.0 4 | mangum==0.17.0 5 | tiktoken==0.9.0 6 | requests==2.32.4 7 | numpy==2.2.5 8 | boto3==1.40.4 9 | botocore==1.40.4 10 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.9.10 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [python, pyi] 9 | # Run the formatter. 10 | - id: ruff-format 11 | -------------------------------------------------------------------------------- /THIRD_PARTY: -------------------------------------------------------------------------------- 1 | certifi 2 | 3 | SPDX-License-Identifier: MPL-2.0 4 | This Source Code Form is subject to the terms of the Mozilla Public 5 | License, v. 2.0. If a copy of the MPL was not distributed with this 6 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 
7 | 8 | https://github.com/certifi/python-certifi -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 120 2 | indent-width = 4 3 | target-version = "py312" 4 | 5 | exclude = [ 6 | ".venv", 7 | ".vscode", 8 | "test/*" 9 | ] 10 | 11 | [lint] 12 | select = ["E", "F", "I"] 13 | ignore = [ 14 | "E501", 15 | "C901", 16 | "F401", 17 | ] 18 | 19 | [format] 20 | # use double quotes for strings. 21 | quote-style = "double" -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | bedrock-access-gateway: 5 | build: 6 | context: ./src 7 | dockerfile: Dockerfile_ecs 8 | ports: 9 | - "127.0.0.1:8000:8080" 10 | environment: 11 | - ENABLE_PROMPT_CACHING=true 12 | - API_KEY=${OPENAI_API_KEY} 13 | - AWS_PROFILE 14 | - AWS_ACCESS_KEY_ID 15 | - AWS_SECRET_ACCESS_KEY 16 | - AWS_SESSION_TOKEN 17 | volumes: 18 | - ${HOME}/.aws:/home/appuser/.aws 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this solution 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the feature you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /src/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.12 2 | 3 | # Add Lambda Web Adapter for API Gateway response streaming 4 | COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.9.1 /lambda-adapter /opt/extensions/lambda-adapter 5 | 6 | COPY ./api ./api 7 | 8 | COPY requirements.txt . 
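# Install the API's Python dependencies; --no-cache-dir keeps the resulting image smaller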
9 | 10 | RUN pip3 install -r requirements.txt -U --no-cache-dir 11 | 12 | # Lambda Web Adapter requires overriding the Lambda base image entrypoint 13 | # to run the web app directly instead of the Lambda runtime handler 14 | ENTRYPOINT [] 15 | CMD ["python", "-m", "uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8080"] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | 14 | **Please complete the following information:** 15 | - [ ] Which API you used: [e.g. /chat/completions] 16 | - [ ] Which model you used: [e.g. anthropic.claude-3-sonnet-20240229-v1:0] 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behavior. If possible, please share an example request. 20 | 21 | **Expected behavior** 22 | A clear and concise description of what you expected to happen. 23 | 24 | 25 | **Screenshots** 26 | If applicable, add screenshots to help explain your problem (please **DO NOT include sensitive information**). 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /src/Dockerfile_ecs: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/docker/library/python:3.13-slim 2 | 3 | WORKDIR /app 4 | 5 | COPY ./requirements.txt /app/requirements.txt 6 | 7 | RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt 8 | 9 | COPY ./api /app/api 10 | 11 | # Create non-root user 12 | RUN groupadd -r appuser && useradd -r -g appuser appuser && \ 13 | chown -R appuser:appuser /app 14 | 15 | USER appuser 16 | 17 | # Preload tiktoken encoding: https://github.com/aws-samples/bedrock-access-gateway/issues/118 18 | ENV TIKTOKEN_CACHE_DIR=/app/.cache/tiktoken 19 | RUN python3 -c 'import tiktoken_ext.openai_public as tke; tke.cl100k_base()' 20 | 21 | ENV PORT=8080 22 | 23 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 24 | CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/health').read()" 25 | 26 | CMD ["sh", "-c", "uvicorn api.app:app --host 0.0.0.0 --port ${PORT}"] 27 | -------------------------------------------------------------------------------- /src/api/setting.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | API_ROUTE_PREFIX = os.environ.get("API_ROUTE_PREFIX", "/api/v1") 4 | 5 | TITLE = "Amazon Bedrock Proxy APIs" 6 | SUMMARY = "OpenAI-Compatible RESTful APIs for Amazon Bedrock" 7 | VERSION = "0.1.0" 8 | DESCRIPTION = """ 9 | Use OpenAI-Compatible RESTful APIs for Amazon Bedrock models. 
10 | """ 11 | 12 | DEBUG = os.environ.get("DEBUG", "false").lower() != "false" 13 | AWS_REGION = os.environ.get("AWS_REGION", "us-west-2") 14 | DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "anthropic.claude-3-sonnet-20240229-v1:0") 15 | DEFAULT_EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL", "cohere.embed-multilingual-v3") 16 | ENABLE_CROSS_REGION_INFERENCE = os.environ.get("ENABLE_CROSS_REGION_INFERENCE", "true").lower() != "false" 17 | ENABLE_APPLICATION_INFERENCE_PROFILES = os.environ.get("ENABLE_APPLICATION_INFERENCE_PROFILES", "true").lower() != "false" 18 | ENABLE_PROMPT_CACHING = os.environ.get("ENABLE_PROMPT_CACHING", "false").lower() != "false" 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | -------------------------------------------------------------------------------- /src/api/routers/embeddings.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from fastapi import APIRouter, Body, Depends 4 | 5 | from api.auth import api_key_auth 6 | from api.models.bedrock import get_embeddings_model 7 | from api.schema import EmbeddingsRequest, EmbeddingsResponse 8 | from api.setting import DEFAULT_EMBEDDING_MODEL 9 | 10 | router = APIRouter( 11 | prefix="/embeddings", 12 | dependencies=[Depends(api_key_auth)], 13 | ) 14 | 15 | 16 | @router.post("", response_model=EmbeddingsResponse) 17 | async def embeddings( 18 | embeddings_request: Annotated[ 19 | EmbeddingsRequest, 20 | Body( 21 | examples=[ 22 | { 23 | "model": "cohere.embed-multilingual-v3", 24 | "input": ["Your text string goes here"], 25 | } 26 | ], 27 | ), 28 | ], 29 | ): 30 | if embeddings_request.model.lower().startswith("text-embedding-"): 31 | embeddings_request.model = DEFAULT_EMBEDDING_MODEL 32 | # Exception will be raised if model not supported. 
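    # get_embeddings_model returns a Bedrock-backed embeddings model instance for the requested model ID.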
33 | model = get_embeddings_model(embeddings_request.model) 34 | return model.embed(embeddings_request) 35 | -------------------------------------------------------------------------------- /src/api/routers/model.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from fastapi import APIRouter, Depends, HTTPException, Path 4 | 5 | from api.auth import api_key_auth 6 | from api.models.bedrock import BedrockModel 7 | from api.schema import Model, Models 8 | 9 | router = APIRouter( 10 | prefix="/models", 11 | dependencies=[Depends(api_key_auth)], 12 | # responses={404: {"description": "Not found"}}, 13 | ) 14 | 15 | chat_model = BedrockModel() 16 | 17 | 18 | async def validate_model_id(model_id: str): 19 | if model_id not in chat_model.list_models(): 20 | raise HTTPException(status_code=500, detail="Unsupported Model Id") 21 | 22 | 23 | @router.get("", response_model=Models) 24 | async def list_models(): 25 | model_list = [Model(id=model_id) for model_id in chat_model.list_models()] 26 | return Models(data=model_list) 27 | 28 | 29 | @router.get( 30 | "/{model_id}", 31 | response_model=Model, 32 | ) 33 | async def get_model( 34 | model_id: Annotated[ 35 | str, 36 | Path(description="Model ID", example="anthropic.claude-3-sonnet-20240229-v1:0"), 37 | ], 38 | ): 39 | await validate_model_id(model_id) 40 | return Model(id=model_id) 41 | -------------------------------------------------------------------------------- /src/api/routers/chat.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from fastapi import APIRouter, Body, Depends 4 | from fastapi.responses import StreamingResponse 5 | 6 | from api.auth import api_key_auth 7 | from api.models.bedrock import BedrockModel 8 | from api.schema import ChatRequest, ChatResponse, ChatStreamResponse, Error 9 | from api.setting import DEFAULT_MODEL 10 | 11 | router = APIRouter( 12 | prefix="/chat", 13 | dependencies=[Depends(api_key_auth)], 14 | # responses={404: {"description": "Not found"}}, 15 | ) 16 | 17 | 18 | @router.post( 19 | "/completions", response_model=ChatResponse | ChatStreamResponse | Error, response_model_exclude_unset=True 20 | ) 21 | async def chat_completions( 22 | chat_request: Annotated[ 23 | ChatRequest, 24 | Body( 25 | examples=[ 26 | { 27 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 28 | "messages": [ 29 | {"role": "system", "content": "You are a helpful assistant."}, 30 | {"role": "user", "content": "Hello!"}, 31 | ], 32 | } 33 | ], 34 | ), 35 | ], 36 | ): 37 | if chat_request.model.lower().startswith("gpt-"): 38 | chat_request.model = DEFAULT_MODEL 39 | 40 | # Exception will be raised if model not supported. 
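    # BedrockModel translates the OpenAI-style request into an Amazon Bedrock call; validate() checks the request before the model is invoked.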
41 | model = BedrockModel() 42 | model.validate(chat_request) 43 | if chat_request.stream: 44 | return StreamingResponse(content=model.chat_stream(chat_request), media_type="text/event-stream") 45 | return await model.chat(chat_request) 46 | -------------------------------------------------------------------------------- /src/api/auth.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Annotated 4 | 5 | import boto3 6 | from botocore.exceptions import ClientError 7 | from fastapi import Depends, HTTPException, status 8 | from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer 9 | 10 | api_key_param = os.environ.get("API_KEY_PARAM_NAME") 11 | api_key_secret_arn = os.environ.get("API_KEY_SECRET_ARN") 12 | api_key_env = os.environ.get("API_KEY") 13 | if api_key_param: 14 | # For backward compatibility. 15 | # Please now use secrets manager instead. 16 | ssm = boto3.client("ssm") 17 | api_key = ssm.get_parameter(Name=api_key_param, WithDecryption=True)["Parameter"]["Value"] 18 | elif api_key_secret_arn: 19 | sm = boto3.client("secretsmanager") 20 | try: 21 | response = sm.get_secret_value(SecretId=api_key_secret_arn) 22 | if "SecretString" in response: 23 | secret = json.loads(response["SecretString"]) 24 | api_key = secret["api_key"] 25 | except ClientError: 26 | raise RuntimeError("Unable to retrieve API KEY, please ensure the secret ARN is correct") 27 | except KeyError: 28 | raise RuntimeError('Please ensure the secret contains a "api_key" field') 29 | elif api_key_env: 30 | api_key = api_key_env 31 | else: 32 | raise RuntimeError( 33 | "API Key is not configured. Please set up your API Key." 34 | ) 35 | 36 | security = HTTPBearer() 37 | 38 | 39 | def api_key_auth( 40 | credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)], 41 | ): 42 | if credentials.credentials != api_key: 43 | raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API Key") 44 | -------------------------------------------------------------------------------- /src/api/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import uvicorn 5 | from fastapi import FastAPI 6 | from fastapi.exceptions import RequestValidationError 7 | from fastapi.middleware.cors import CORSMiddleware 8 | from fastapi.responses import PlainTextResponse 9 | from mangum import Mangum 10 | 11 | from api.routers import chat, embeddings, model 12 | from api.setting import API_ROUTE_PREFIX, DESCRIPTION, SUMMARY, TITLE, VERSION 13 | 14 | config = { 15 | "title": TITLE, 16 | "description": DESCRIPTION, 17 | "summary": SUMMARY, 18 | "version": VERSION, 19 | } 20 | 21 | logging.basicConfig( 22 | level=logging.INFO, 23 | format="%(asctime)s [%(levelname)s] %(message)s", 24 | ) 25 | app = FastAPI(**config) 26 | 27 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", "*") 28 | origins_list = [origin.strip() for origin in allowed_origins.split(",")] if allowed_origins != "*" else ["*"] 29 | 30 | # Warn if CORS allows all origins 31 | if origins_list == ["*"]: 32 | logging.warning("CORS is configured to allow all origins (*). 
Set ALLOWED_ORIGINS environment variable to restrict access.") 33 | 34 | app.add_middleware( 35 | CORSMiddleware, 36 | allow_origins=origins_list, # nosec - configurable via ALLOWED_ORIGINS env var 37 | allow_credentials=True, 38 | allow_methods=["*"], 39 | allow_headers=["*"], 40 | ) 41 | 42 | 43 | app.include_router(model.router, prefix=API_ROUTE_PREFIX) 44 | app.include_router(chat.router, prefix=API_ROUTE_PREFIX) 45 | app.include_router(embeddings.router, prefix=API_ROUTE_PREFIX) 46 | 47 | 48 | @app.get("/health") 49 | async def health(): 50 | """For health check if needed""" 51 | return {"status": "OK"} 52 | 53 | 54 | @app.exception_handler(RequestValidationError) 55 | async def validation_exception_handler(request, exc): 56 | logger = logging.getLogger(__name__) 57 | 58 | # Log essential info only - avoid sensitive data and performance overhead 59 | logger.warning( 60 | "Request validation failed: %s %s - %s", 61 | request.method, 62 | request.url.path, 63 | str(exc).split('\n')[0] # First line only 64 | ) 65 | 66 | return PlainTextResponse(str(exc), status_code=400) 67 | 68 | 69 | handler = Mangum(app) 70 | 71 | if __name__ == "__main__": 72 | # Bind to 0.0.0.0 for container environments, network is handled by network policies and load balancers 73 | uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False) # nosec B104 74 | -------------------------------------------------------------------------------- /src/api/models/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import uuid 4 | from abc import ABC, abstractmethod 5 | from typing import AsyncIterable 6 | 7 | from api.schema import ( 8 | # Chat 9 | ChatRequest, 10 | ChatResponse, 11 | ChatStreamResponse, 12 | # Embeddings 13 | EmbeddingsRequest, 14 | EmbeddingsResponse, 15 | Error, 16 | ) 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class BaseChatModel(ABC): 22 | """Represent a basic chat model 23 | 24 | Currently, only Bedrock model is supported, but may be used for SageMaker models if needed. 
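    Subclasses must implement chat() and chat_stream(); list_models() and validate() can be overridden as needed.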
25 | """ 26 | 27 | def list_models(self) -> list[str]: 28 | """Return a list of supported models""" 29 | return [] 30 | 31 | def validate(self, chat_request: ChatRequest): 32 | """Validate chat completion requests.""" 33 | pass 34 | 35 | @abstractmethod 36 | async def chat(self, chat_request: ChatRequest) -> ChatResponse: 37 | """Handle a basic chat completion requests.""" 38 | pass 39 | 40 | @abstractmethod 41 | async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]: 42 | """Handle a basic chat completion requests with stream response.""" 43 | pass 44 | 45 | @staticmethod 46 | def generate_message_id() -> str: 47 | return "chatcmpl-" + str(uuid.uuid4())[:8] 48 | 49 | @staticmethod 50 | def stream_response_to_bytes(response: ChatStreamResponse | Error | None = None) -> bytes: 51 | if isinstance(response, Error): 52 | logger.error("Stream error: %s", response.error.message if response.error else "Unknown error") 53 | data = response.model_dump_json() 54 | elif isinstance(response, ChatStreamResponse): 55 | # to populate other fields when using exclude_unset=True 56 | response.system_fingerprint = "fp" 57 | response.object = "chat.completion.chunk" 58 | response.created = int(time.time()) 59 | data = response.model_dump_json(exclude_unset=True) 60 | else: 61 | data = "[DONE]" 62 | 63 | return f"data: {data}\n\n".encode("utf-8") 64 | 65 | 66 | class BaseEmbeddingsModel(ABC): 67 | """Represents a basic embeddings model. 68 | 69 | Currently, only Bedrock-provided models are supported, but it may be used for SageMaker models if needed. 70 | """ 71 | 72 | @abstractmethod 73 | def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse: 74 | """Handle a basic embeddings request.""" 75 | pass 76 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. 
Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /docs/Security.md: -------------------------------------------------------------------------------- 1 | # Security 2 | 3 | This document details the security configuration required for the solution. In particular, it covers: 4 | 5 | - **HTTPS Setup** 6 | 7 | Following these guidelines will help ensure that traffic is encrypted over the public network. 8 | 9 | --- 10 | 11 | ## 1. HTTPS Authentication with the ALB 12 | 13 | ### Overview 14 | 15 | Using HTTPS on your ALB guarantees that all client-to-ALB communication is encrypted. This is achieved by: 16 | - **Obtaining and managing SSL/TLS certificates** using AWS Certificate Manager (ACM). You'll need a domain but you can request a free certificate. 17 | - **Configuring HTTPS listeners** on the ALB 18 | - **Automating HTTP to HTTPS redirect** for clients that inadvertently access HTTP endpoints 19 | - **Allowing traffic in the Security Group of the ALB** 20 | 21 | ### Step-by-Step Setup 22 | 23 | #### 1.1. Request an SSL/TLS Certificate via ACM 24 | 25 | 1. **Navigate to AWS Certificate Manager (ACM):** 26 | In the AWS Management Console, go to ACM in the region where your ALB is deployed. 27 | 28 | 2. **Request the Certificate:** 29 | - Click on **"Request a certificate"**. 30 | - Choose **"Request a public certificate"** (or a private one if using a private CA). 31 | - Enter your domain names (e.g., `example.com`, `*.example.com`). 32 | - Complete the validation (via DNS or email). DNS validation is generally preferred for automation purposes. 33 | 34 | 3. **Certificate Validation:** 35 | Ensure that the certificate status becomes **"Issued"** before proceeding. 
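**Example AWS CLI command to request a certificate** (a sketch; replace the example domain names with your own):

```bash
aws acm request-certificate \
  --domain-name example.com \
  --subject-alternative-names "*.example.com" \
  --validation-method DNS
```

After submitting the request, create the CNAME validation records that ACM returns so the certificate can move to the **"Issued"** status.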
36 | 37 | #### 1.2. Configure the ALB for HTTPS 38 | 39 | 1. **Create or Modify the ALB Listener:** 40 |    - Open the **EC2 Dashboard** and navigate to [Load Balancers](https://console.aws.amazon.com/ec2/home?#LoadBalancers:). 41 |    - If you already have an ALB, select it; otherwise, create a new ALB. 42 |    - Under the **Listeners** tab, click **Manage listener** > **Edit Listener**. 43 |    - Configure the listener protocol to **HTTPS** with port **443**. 44 |    - Select the certificate you requested from ACM. 45 | 46 | #### 1.3. (Optional) Redirect HTTP Traffic to HTTPS 47 | 48 | To enhance security, ensure that any HTTP requests are automatically redirected to HTTPS. 49 | 50 | 1. **Create an HTTP Listener on Port 80:** 51 |    - Add a listener on port **80**. 52 |    - In the listener settings, add a rule to redirect all traffic to port **443** with the protocol changed to **HTTPS**. 53 | 54 | **Example AWS CLI command for redirection:** 55 | ```bash 56 | aws elbv2 create-listener \ 57 |   --load-balancer-arn <YOUR_ALB_ARN> \ 58 |   --protocol HTTP \ 59 |   --port 80 \ 60 |   --default-actions Type=redirect,RedirectConfig="{Protocol=HTTPS,Port=443,StatusCode=HTTP_301}" 61 | ``` 62 | 63 | #### 1.4. Allow traffic in the Security Group of the ALB 64 | 65 | 1. **Update the ALB Security Group:** 66 |    - Go to the CloudFormation stack you originally used to deploy, select **Resources** and search for **ProxyALBSecurityGroup**. 67 |    - Click on the Security Group. 68 |    - Edit the Inbound Rules to allow traffic on Port 443 from `0.0.0.0/0` and (optionally) delete the Inbound Rule on Port 80. **Note**: If you delete the rule on port 80, you will need to update the base URL to use HTTPS only, as it won't redirect HTTP traffic to HTTPS. 69 | 70 | Now you should be able to test your application! Use a base URL like: 71 | 72 | ``` 73 | https://<your-domain-name>/api/v1 74 | ``` 75 | 76 | --- 77 | 78 | By following the steps outlined in this guide, you can configure a secure environment that uses HTTPS via ALB for encrypted traffic. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/ 161 | 162 | Config 163 | .vscode/launch.json 164 | -------------------------------------------------------------------------------- /docs/Troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting Guide 2 | 3 | This guide helps you troubleshoot common issues you might encounter when using the Bedrock Access Gateway. 4 | 5 | ## Common Issues 6 | 7 | ### 1. Parameter Store Access Error 8 | 9 | To see errors, first you need to access the CloudWatch Logs of the Lambda/Fargate instance. 10 | 11 | 1. Go to the [CloudWatch Console](https://console.aws.amazon.com/cloudwatch/home?#logsV2:log-groups/) 12 | 2. Search for `/aws/lambda/BedrockProxyAPI` 13 | 3. Click on the `Log Stream` to see the error details 14 | 15 | ```python 16 | botocore.exceptions.ClientError: An error occurred (ParameterNotFound) when calling the GetParameter operation: Parameter /BedrockProxyAPIKey not found. 17 | ``` 18 | 19 | This error occurs when the Lambda function cannot access the API key parameter in Parameter Store. 20 | 21 | **Possible solutions:** 22 | - Verify that you created the parameter in Parameter Store with the correct name 23 | - Check that the parameter name in the CloudFormation stack matches the one in Parameter Store 24 | - Ensure the Lambda function's IAM role has permission to access Parameter Store 25 | - If you didn't set up an API key, leave the `ApiKeyParam` field blank during deployment 26 | 27 | ### 2. Model Access Issues 28 | 29 | If you receive an error about model access: 30 | 31 | ``` 32 | {"error": {"message": "User: arn:aws:iam::XXXX:role/XXX is not authorized to perform: bedrock:InvokeModel on resource: arn:aws:bedrock:REGION::foundation-model/XXX", "type": "auth_error", "code": 401}} 33 | ``` 34 | 35 | **Possible solutions:** 36 | - Ensure you have requested access to the model in Amazon Bedrock 37 | - Verify the Lambda/Fargate role has the necessary permissions to invoke Bedrock models 38 | - Check that you're using the correct model ID 39 | - Verify the model is available in your chosen region 40 | 41 | ### 3. API Key Authentication Failures 42 | 43 | If you receive a 401 Unauthorized error: 44 | 45 | ``` 46 | {"detail": "Could not validate credentials"} 47 | ``` 48 | 49 | **Possible solutions:** 50 | - Verify you're using the correct API key in your requests 51 | - Check that the `Authorization` header is properly formatted (`Bearer YOUR-API-KEY`) 52 | - If using environment variables, ensure `OPENAI_API_KEY` is set correctly 53 | 54 | ### 4. Cross-Region Access Issues 55 | 56 | If you're trying to access models in a different region: 57 | 58 | ``` 59 | {"error": {"message": "Region 'us-east-1' is not enabled for your account", "type": "invalid_request_error", "code": 400}} 60 | ``` 61 | 62 | **Possible solutions:** 63 | - Ensure the target region is enabled for your AWS account 64 | - Verify the model you're trying to access is available in that region 65 | - Check that your IAM roles have the necessary cross-region permissions 66 | 67 | ### 5. 
Rate Limiting and Quotas 68 | 69 | If you're experiencing throttling or quota issues: 70 | 71 | ``` 72 | {"error": {"message": "Rate limit exceeded", "type": "rate_limit_error", "code": 429}} 73 | ``` 74 | 75 | **Possible solutions:** 76 | - Check your Bedrock service quotas in the AWS Console 77 | - Consider implementing retry logic in your application 78 | - Request a quota increase if needed 79 | 80 | ## Getting Help 81 | 82 | If you're still experiencing issues: 83 | 84 | 1. Check the CloudWatch Logs for detailed error messages 85 | 2. Verify your AWS credentials and permissions 86 | 3. Review the [Usage Guide](./Usage.md) for correct API usage 87 | 4. Open a [GitHub issue](https://github.com/aws-samples/bedrock-access-gateway/issues/new?template=bug_report.md) with: 88 | - Detailed error message 89 | - Steps to reproduce 90 | - Your deployment configuration (region, model, etc.) 91 | - Any relevant CloudWatch logs 92 | 93 | ## Additional Resources 94 | 95 | - [Amazon Bedrock Documentation](https://docs.aws.amazon.com/bedrock/) 96 | - [AWS IAM Documentation](https://docs.aws.amazon.com/IAM/latest/UserGuide/) 97 | - [AWS Systems Manager Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html) 98 | -------------------------------------------------------------------------------- /scripts/push-to-ecr.sh: -------------------------------------------------------------------------------- 1 | # NOTE: The script will try to create the ECR repository if it doesn't exist. Please grant the necessary permissions to the IAM user or role. 2 | # Usage: 3 | # cd scripts 4 | # bash ./push-to-ecr.sh 5 | 6 | set -o errexit # exit on first error 7 | set -o nounset # exit on using unset variables 8 | set -o pipefail # exit on any error in a pipeline 9 | 10 | # Change to the directory where the script is located 11 | cd "$(dirname "$0")" 12 | 13 | # Prompt user for inputs 14 | echo "================================================" 15 | echo "Bedrock Access Gateway - Build and Push to ECR" 16 | echo "================================================" 17 | echo "" 18 | 19 | # Get repository name for Lambda version 20 | read -p "Enter ECR repository name for Lambda (default: bedrock-proxy-api): " LAMBDA_REPO 21 | LAMBDA_REPO=${LAMBDA_REPO:-bedrock-proxy-api} 22 | 23 | # Get repository name for ECS/Fargate version 24 | read -p "Enter ECR repository name for ECS/Fargate (default: bedrock-proxy-api-ecs): " ECS_REPO 25 | ECS_REPO=${ECS_REPO:-bedrock-proxy-api-ecs} 26 | 27 | # Get image tag 28 | read -p "Enter image tag (default: latest): " TAG 29 | TAG=${TAG:-latest} 30 | 31 | # Get AWS region 32 | read -p "Enter AWS region (default: us-east-1): " AWS_REGION 33 | AWS_REGION=${AWS_REGION:-us-east-1} 34 | 35 | echo "" 36 | echo "Configuration:" 37 | echo " Lambda Repository: $LAMBDA_REPO" 38 | echo " ECS/Fargate Repository: $ECS_REPO" 39 | echo " Image Tag: $TAG" 40 | echo " AWS Region: $AWS_REGION" 41 | echo "" 42 | read -p "Continue with these settings? (y/n): " CONFIRM 43 | if [[ ! "$CONFIRM" =~ ^[Yy]$ ]]; then 44 | echo "Aborted." 45 | exit 1 46 | fi 47 | echo "" 48 | 49 | # Acknowledgment about ECR repository creation 50 | echo "ℹ️ NOTICE: This script will automatically create ECR repositories if they don't exist." 
51 | echo " The repositories will be created with the following default settings:" 52 | echo " - Image tag mutability: MUTABLE (allows overwriting tags)" 53 | echo " - Image scanning: Disabled" 54 | echo " - Encryption: AES256 (AWS managed encryption)" 55 | echo "" 56 | echo " You can modify these settings later in the AWS ECR Console if needed." 57 | echo " Required IAM permissions: ecr:CreateRepository, ecr:GetAuthorizationToken," 58 | echo " ecr:BatchCheckLayerAvailability, ecr:InitiateLayerUpload, ecr:UploadLayerPart," 59 | echo " ecr:CompleteLayerUpload, ecr:PutImage" 60 | echo "" 61 | read -p "Do you acknowledge and want to proceed? (y/n): " ACK_CONFIRM 62 | if [[ ! "$ACK_CONFIRM" =~ ^[Yy]$ ]]; then 63 | echo "Aborted." 64 | exit 1 65 | fi 66 | echo "" 67 | 68 | # Define variables 69 | ARCHS=("arm64") # Single architecture for simplicity 70 | 71 | build_and_push_image() { 72 | local IMAGE_NAME=$1 73 | local TAG=$2 74 | local DOCKERFILE_PATH=$3 75 | local REGION=$AWS_REGION 76 | local ARCH=${ARCHS[0]} 77 | 78 | echo "Building $IMAGE_NAME:$TAG..." 79 | 80 | # Build Docker image 81 | # Note: --provenance=false and --sbom=false are required for Lambda compatibility 82 | # Without these flags, Docker BuildKit (especially with docker-container driver) may create 83 | # OCI image manifests with attestations that AWS Lambda does not support. 84 | # Lambda requires Docker V2 Schema 2 format without multi-manifest index. 85 | # See: https://github.com/aws-samples/bedrock-access-gateway/issues/206 86 | docker buildx build \ 87 | --platform linux/$ARCH \ 88 | --provenance=false \ 89 | --sbom=false \ 90 | -t $IMAGE_NAME:$TAG \ 91 | -f $DOCKERFILE_PATH \ 92 | --load \ 93 | ../src/ 94 | 95 | # Get the account ID 96 | ACCOUNT_ID=$(aws sts get-caller-identity --region $REGION --query Account --output text) 97 | 98 | # Create repository URI 99 | REPOSITORY_URI="${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/${IMAGE_NAME}" 100 | 101 | echo "Creating ECR repository if it doesn't exist..." 102 | # Create ECR repository if it doesn't exist 103 | aws ecr create-repository --repository-name "${IMAGE_NAME}" --region $REGION || true 104 | 105 | echo "Logging in to ECR..." 106 | # Log in to ECR 107 | aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $REPOSITORY_URI 108 | 109 | echo "Pushing image to ECR..." 110 | # Tag the image for ECR 111 | docker tag $IMAGE_NAME:$TAG $REPOSITORY_URI:$TAG 112 | 113 | # Push the image to ECR 114 | docker push $REPOSITORY_URI:$TAG 115 | 116 | echo "✅ Successfully pushed $IMAGE_NAME:$TAG to $REPOSITORY_URI" 117 | echo "" 118 | } 119 | 120 | echo "Building and pushing Lambda image..." 121 | build_and_push_image "$LAMBDA_REPO" "$TAG" "../src/Dockerfile" 122 | 123 | echo "Building and pushing ECS/Fargate image..." 124 | build_and_push_image "$ECS_REPO" "$TAG" "../src/Dockerfile_ecs" 125 | 126 | echo "================================================" 127 | echo "✅ All images successfully pushed!" 128 | echo "================================================" 129 | echo "" 130 | echo "Your container image URIs:" 131 | ACCOUNT_ID=$(aws sts get-caller-identity --region $AWS_REGION --query Account --output text) 132 | echo " Lambda: ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${LAMBDA_REPO}:${TAG}" 133 | echo " ECS/Fargate: ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECS_REPO}:${TAG}" 134 | echo "" 135 | echo "Next steps:" 136 | echo " 1. Download the CloudFormation templates from deployment/ folder" 137 | echo " 2. 
Update the ContainerImageUri parameter with your image URI above" 138 | echo " 3. Deploy the stack via AWS CloudFormation Console" 139 | echo "" 140 | -------------------------------------------------------------------------------- /src/api/schema.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Iterable, Literal 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from api.setting import DEFAULT_MODEL 7 | 8 | 9 | class Model(BaseModel): 10 | id: str 11 | created: int = Field(default_factory=lambda: int(time.time())) 12 | object: str | None = "model" 13 | owned_by: str | None = "bedrock" 14 | 15 | 16 | class Models(BaseModel): 17 | object: str | None = "list" 18 | data: list[Model] = [] 19 | 20 | 21 | class ResponseFunction(BaseModel): 22 | name: str | None = None 23 | arguments: str 24 | 25 | 26 | class ToolCall(BaseModel): 27 | index: int | None = None 28 | id: str | None = None 29 | type: Literal["function"] = "function" 30 | function: ResponseFunction 31 | 32 | 33 | class TextContent(BaseModel): 34 | type: Literal["text"] = "text" 35 | text: str 36 | 37 | 38 | class ImageUrl(BaseModel): 39 | url: str 40 | detail: str | None = "auto" 41 | 42 | 43 | class ImageContent(BaseModel): 44 | type: Literal["image_url"] = "image" 45 | image_url: ImageUrl 46 | 47 | 48 | class ToolContent(BaseModel): 49 | type: Literal["text"] = "text" 50 | text: str 51 | 52 | 53 | class SystemMessage(BaseModel): 54 | name: str | None = None 55 | role: Literal["system"] = "system" 56 | content: str 57 | 58 | 59 | class UserMessage(BaseModel): 60 | name: str | None = None 61 | role: Literal["user"] = "user" 62 | content: str | list[TextContent | ImageContent] 63 | 64 | 65 | class AssistantMessage(BaseModel): 66 | name: str | None = None 67 | role: Literal["assistant"] = "assistant" 68 | content: str | list[TextContent | ImageContent] | None = None 69 | tool_calls: list[ToolCall] | None = None 70 | 71 | 72 | class ToolMessage(BaseModel): 73 | role: Literal["tool"] = "tool" 74 | content: str | list[ToolContent] | list[dict] 75 | tool_call_id: str 76 | 77 | 78 | class DeveloperMessage(BaseModel): 79 | name: str | None = None 80 | role: Literal["developer"] = "developer" 81 | content: str 82 | 83 | 84 | class Function(BaseModel): 85 | name: str 86 | description: str | None = None 87 | parameters: object 88 | 89 | 90 | class Tool(BaseModel): 91 | type: Literal["function"] = "function" 92 | function: Function 93 | 94 | 95 | class StreamOptions(BaseModel): 96 | include_usage: bool = True 97 | 98 | 99 | class ChatRequest(BaseModel): 100 | messages: list[SystemMessage | UserMessage | AssistantMessage | ToolMessage | DeveloperMessage] 101 | model: str = DEFAULT_MODEL 102 | frequency_penalty: float | None = Field(default=0.0, le=2.0, ge=-2.0) # Not used 103 | presence_penalty: float | None = Field(default=0.0, le=2.0, ge=-2.0) # Not used 104 | stream: bool | None = False 105 | stream_options: StreamOptions | None = None 106 | temperature: float | None = Field(default=None, le=2.0, ge=0.0) 107 | top_p: float | None = Field(default=None, le=1.0, ge=0.0) 108 | user: str | None = None # Not used 109 | max_tokens: int | None = 2048 110 | max_completion_tokens: int | None = None 111 | reasoning_effort: Literal["low", "medium", "high"] | None = None 112 | n: int | None = 1 # Not used 113 | tools: list[Tool] | None = None 114 | tool_choice: str | object = "auto" 115 | stop: list[str] | str | None = None 116 | extra_body: dict | None = None 117 | 118 | 119 | 
class PromptTokensDetails(BaseModel): 120 | """Details about prompt tokens usage, following OpenAI API format.""" 121 | cached_tokens: int = 0 122 | audio_tokens: int = 0 123 | 124 | 125 | class CompletionTokensDetails(BaseModel): 126 | """Details about completion tokens usage, following OpenAI API format.""" 127 | reasoning_tokens: int = 0 128 | audio_tokens: int = 0 129 | 130 | 131 | class Usage(BaseModel): 132 | prompt_tokens: int 133 | completion_tokens: int 134 | total_tokens: int 135 | prompt_tokens_details: PromptTokensDetails | None = None 136 | completion_tokens_details: CompletionTokensDetails | None = None 137 | 138 | 139 | class ChatResponseMessage(BaseModel): 140 | # tool_calls 141 | role: Literal["assistant"] | None = None 142 | content: str | None = None 143 | tool_calls: list[ToolCall] | None = None 144 | reasoning_content: str | None = None 145 | 146 | 147 | class BaseChoice(BaseModel): 148 | index: int | None = 0 149 | finish_reason: str | None = None 150 | logprobs: dict | None = None 151 | 152 | 153 | class Choice(BaseChoice): 154 | message: ChatResponseMessage 155 | 156 | 157 | class ChoiceDelta(BaseChoice): 158 | delta: ChatResponseMessage 159 | 160 | 161 | class BaseChatResponse(BaseModel): 162 | # id: str = Field(default_factory=lambda: "chatcmpl-" + str(uuid.uuid4())[:8]) 163 | id: str 164 | created: int = Field(default_factory=lambda: int(time.time())) 165 | model: str 166 | system_fingerprint: str = "fp" 167 | 168 | 169 | class ChatResponse(BaseChatResponse): 170 | choices: list[Choice] 171 | object: Literal["chat.completion"] = "chat.completion" 172 | usage: Usage 173 | 174 | 175 | class ChatStreamResponse(BaseChatResponse): 176 | choices: list[ChoiceDelta] 177 | object: Literal["chat.completion.chunk"] = "chat.completion.chunk" 178 | usage: Usage | None = None 179 | 180 | 181 | class EmbeddingsRequest(BaseModel): 182 | input: str | list[str] | Iterable[int | Iterable[int]] 183 | model: str 184 | encoding_format: Literal["float", "base64"] = "float" 185 | dimensions: int | None = None # not used. 186 | user: str | None = None # not used. 
187 | 188 | 189 | class Embedding(BaseModel): 190 | object: Literal["embedding"] = "embedding" 191 | embedding: list[float] | bytes 192 | index: int 193 | 194 | 195 | class EmbeddingsUsage(BaseModel): 196 | prompt_tokens: int 197 | total_tokens: int 198 | 199 | 200 | class EmbeddingsResponse(BaseModel): 201 | object: Literal["list"] = "list" 202 | data: list[Embedding] 203 | model: str 204 | usage: EmbeddingsUsage 205 | 206 | 207 | class ErrorMessage(BaseModel): 208 | message: str 209 | 210 | 211 | class Error(BaseModel): 212 | error: ErrorMessage 213 | -------------------------------------------------------------------------------- /deployment/BedrockProxy.template: -------------------------------------------------------------------------------- 1 | Description: Bedrock Access Gateway - OpenAI-compatible RESTful APIs for Amazon Bedrock (API Gateway + Lambda with Streaming) 2 | Parameters: 3 | ApiKeySecretArn: 4 | Type: String 5 | AllowedPattern: ^arn:aws:secretsmanager:.*$ 6 | Description: The secret ARN in Secrets Manager used to store the API Key 7 | ContainerImageUri: 8 | Type: String 9 | Description: The ECR image URI for the Lambda function (e.g., 123456789012.dkr.ecr.us-east-1.amazonaws.com/bedrock-proxy-api:latest) 10 | DefaultModelId: 11 | Type: String 12 | Default: anthropic.claude-3-sonnet-20240229-v1:0 13 | Description: The default model ID, please make sure the model ID is supported in the current region 14 | EnablePromptCaching: 15 | Type: String 16 | Default: "false" 17 | AllowedValues: 18 | - "true" 19 | - "false" 20 | Description: Enable prompt caching for supported models (Claude, Nova). When enabled, adds cachePoint to system prompts and messages for cost savings. 21 | Resources: 22 | # IAM Role for Lambda 23 | ProxyApiHandlerServiceRole: 24 | Type: AWS::IAM::Role 25 | Properties: 26 | AssumeRolePolicyDocument: 27 | Statement: 28 | - Action: sts:AssumeRole 29 | Effect: Allow 30 | Principal: 31 | Service: lambda.amazonaws.com 32 | Version: "2012-10-17" 33 | ManagedPolicyArns: 34 | - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" 35 | 36 | ProxyApiHandlerServiceRoleDefaultPolicy: 37 | Type: AWS::IAM::Policy 38 | Properties: 39 | PolicyDocument: 40 | Statement: 41 | - Action: 42 | - bedrock:ListFoundationModels 43 | - bedrock:ListInferenceProfiles 44 | Effect: Allow 45 | Resource: "*" 46 | - Action: 47 | - bedrock:InvokeModel 48 | - bedrock:InvokeModelWithResponseStream 49 | Effect: Allow 50 | Resource: 51 | - arn:aws:bedrock:*::foundation-model/* 52 | - arn:aws:bedrock:*:*:inference-profile/* 53 | - arn:aws:bedrock:*:*:application-inference-profile/* 54 | - Action: 55 | - secretsmanager:GetSecretValue 56 | - secretsmanager:DescribeSecret 57 | Effect: Allow 58 | Resource: !Ref ApiKeySecretArn 59 | Version: "2012-10-17" 60 | PolicyName: ProxyApiHandlerServiceRoleDefaultPolicy 61 | Roles: 62 | - !Ref ProxyApiHandlerServiceRole 63 | 64 | # Lambda Function with Lambda Web Adapter for streaming 65 | ProxyApiHandler: 66 | Type: AWS::Lambda::Function 67 | Properties: 68 | Architectures: 69 | - arm64 70 | Code: 71 | ImageUri: !Ref ContainerImageUri 72 | Description: Bedrock Proxy API Handler with Response Streaming 73 | Environment: 74 | Variables: 75 | # Lambda Web Adapter settings 76 | AWS_LWA_INVOKE_MODE: RESPONSE_STREAM 77 | AWS_LWA_READINESS_CHECK_PATH: /health 78 | AWS_LWA_ASYNC_INIT: "true" 79 | PORT: "8080" 80 | # Application settings 81 | DEBUG: "false" 82 | API_KEY_SECRET_ARN: !Ref ApiKeySecretArn 83 | DEFAULT_MODEL: !Ref 
DefaultModelId 84 | DEFAULT_EMBEDDING_MODEL: cohere.embed-multilingual-v3 85 | ENABLE_CROSS_REGION_INFERENCE: "true" 86 | ENABLE_APPLICATION_INFERENCE_PROFILES: "true" 87 | ENABLE_PROMPT_CACHING: !Ref EnablePromptCaching 88 | API_ROUTE_PREFIX: /v1 89 | MemorySize: 1024 90 | PackageType: Image 91 | Role: !GetAtt ProxyApiHandlerServiceRole.Arn 92 | Timeout: 600 93 | DependsOn: 94 | - ProxyApiHandlerServiceRoleDefaultPolicy 95 | - ProxyApiHandlerServiceRole 96 | 97 | # API Gateway REST API (Regional) 98 | RestApi: 99 | Type: AWS::ApiGateway::RestApi 100 | Properties: 101 | Name: BedrockProxyApi 102 | Description: Bedrock Access Gateway - OpenAI-compatible API with streaming support 103 | EndpointConfiguration: 104 | Types: 105 | - REGIONAL 106 | Body: 107 | openapi: "3.0.1" 108 | info: 109 | title: BedrockProxyApi 110 | version: "1.0" 111 | paths: 112 | /{proxy+}: 113 | x-amazon-apigateway-any-method: 114 | parameters: 115 | - name: proxy 116 | in: path 117 | required: true 118 | schema: 119 | type: string 120 | x-amazon-apigateway-integration: 121 | type: aws_proxy 122 | httpMethod: POST 123 | uri: !Sub "arn:aws:apigateway:${AWS::Region}:lambda:path/2021-11-15/functions/${ProxyApiHandler.Arn}/response-streaming-invocations" 124 | passthroughBehavior: when_no_match 125 | timeoutInMillis: 600000 126 | responseTransferMode: STREAM 127 | responses: 128 | default: 129 | description: Default response 130 | /: 131 | x-amazon-apigateway-any-method: 132 | x-amazon-apigateway-integration: 133 | type: aws_proxy 134 | httpMethod: POST 135 | uri: !Sub "arn:aws:apigateway:${AWS::Region}:lambda:path/2021-11-15/functions/${ProxyApiHandler.Arn}/response-streaming-invocations" 136 | passthroughBehavior: when_no_match 137 | timeoutInMillis: 600000 138 | responseTransferMode: STREAM 139 | responses: 140 | default: 141 | description: Default response 142 | 143 | # Lambda Permission for API Gateway 144 | LambdaPermission: 145 | Type: AWS::Lambda::Permission 146 | Properties: 147 | FunctionName: !Ref ProxyApiHandler 148 | Action: lambda:InvokeFunction 149 | Principal: apigateway.amazonaws.com 150 | SourceArn: !Sub "arn:aws:execute-api:${AWS::Region}:${AWS::AccountId}:${RestApi}/*" 151 | 152 | # API Gateway Deployment 153 | ApiDeployment: 154 | Type: AWS::ApiGateway::Deployment 155 | Properties: 156 | RestApiId: !Ref RestApi 157 | DependsOn: 158 | - RestApi 159 | 160 | # API Gateway Stage 161 | ApiStage: 162 | Type: AWS::ApiGateway::Stage 163 | Properties: 164 | RestApiId: !Ref RestApi 165 | DeploymentId: !Ref ApiDeployment 166 | StageName: api 167 | Description: API Stage with streaming support 168 | 169 | Outputs: 170 | APIBaseUrl: 171 | Description: Proxy API Base URL (OPENAI_API_BASE) 172 | Value: !Sub "https://${RestApi}.execute-api.${AWS::Region}.amazonaws.com/api/v1" 173 | RestApiId: 174 | Description: API Gateway REST API ID 175 | Value: !Ref RestApi 176 | LambdaFunctionArn: 177 | Description: Lambda Function ARN 178 | Value: !GetAtt ProxyApiHandler.Arn 179 | -------------------------------------------------------------------------------- /deployment/BedrockProxyFargate.template: -------------------------------------------------------------------------------- 1 | Description: Bedrock Access Gateway - OpenAI-compatible RESTful APIs for Amazon Bedrock 2 | Parameters: 3 | ApiKeySecretArn: 4 | Type: String 5 | AllowedPattern: ^arn:aws:secretsmanager:.*$ 6 | Description: The secret ARN in Secrets Manager used to store the API Key 7 | ContainerImageUri: 8 | Type: String 9 | Description: The ECR image URI for the 
ECS/Fargate task (e.g., 123456789012.dkr.ecr.us-east-1.amazonaws.com/bedrock-proxy-api-ecs:latest) 10 | DefaultModelId: 11 | Type: String 12 | Default: anthropic.claude-3-sonnet-20240229-v1:0 13 | Description: The default model ID, please make sure the model ID is supported in the current region 14 | EnablePromptCaching: 15 | Type: String 16 | Default: "false" 17 | AllowedValues: 18 | - "true" 19 | - "false" 20 | Description: Enable prompt caching for supported models (Claude, Nova). When enabled, adds cachePoint to system prompts and messages for cost savings. 21 | Resources: 22 | VPCB9E5F0B4: 23 | Type: AWS::EC2::VPC 24 | Properties: 25 | CidrBlock: 10.250.0.0/16 26 | EnableDnsHostnames: true 27 | EnableDnsSupport: true 28 | InstanceTenancy: default 29 | Tags: 30 | - Key: Name 31 | Value: BedrockProxyFargate/VPC 32 | VPCPublicSubnet1SubnetB4246D30: 33 | Type: AWS::EC2::Subnet 34 | Properties: 35 | AvailabilityZone: 36 | Fn::Select: 37 | - 0 38 | - Fn::GetAZs: "" 39 | CidrBlock: 10.250.0.0/24 40 | MapPublicIpOnLaunch: true 41 | Tags: 42 | - Key: aws-cdk:subnet-name 43 | Value: Public 44 | - Key: aws-cdk:subnet-type 45 | Value: Public 46 | - Key: Name 47 | Value: BedrockProxyFargate/VPC/PublicSubnet1 48 | VpcId: 49 | Ref: VPCB9E5F0B4 50 | VPCPublicSubnet1RouteTableFEE4B781: 51 | Type: AWS::EC2::RouteTable 52 | Properties: 53 | Tags: 54 | - Key: Name 55 | Value: BedrockProxyFargate/VPC/PublicSubnet1 56 | VpcId: 57 | Ref: VPCB9E5F0B4 58 | VPCPublicSubnet1RouteTableAssociation0B0896DC: 59 | Type: AWS::EC2::SubnetRouteTableAssociation 60 | Properties: 61 | RouteTableId: 62 | Ref: VPCPublicSubnet1RouteTableFEE4B781 63 | SubnetId: 64 | Ref: VPCPublicSubnet1SubnetB4246D30 65 | VPCPublicSubnet1DefaultRoute91CEF279: 66 | Type: AWS::EC2::Route 67 | Properties: 68 | DestinationCidrBlock: 0.0.0.0/0 69 | GatewayId: 70 | Ref: VPCIGWB7E252D3 71 | RouteTableId: 72 | Ref: VPCPublicSubnet1RouteTableFEE4B781 73 | DependsOn: 74 | - VPCVPCGW99B986DC 75 | VPCPublicSubnet2Subnet74179F39: 76 | Type: AWS::EC2::Subnet 77 | Properties: 78 | AvailabilityZone: 79 | Fn::Select: 80 | - 1 81 | - Fn::GetAZs: "" 82 | CidrBlock: 10.250.1.0/24 83 | MapPublicIpOnLaunch: true 84 | Tags: 85 | - Key: aws-cdk:subnet-name 86 | Value: Public 87 | - Key: aws-cdk:subnet-type 88 | Value: Public 89 | - Key: Name 90 | Value: BedrockProxyFargate/VPC/PublicSubnet2 91 | VpcId: 92 | Ref: VPCB9E5F0B4 93 | VPCPublicSubnet2RouteTable6F1A15F1: 94 | Type: AWS::EC2::RouteTable 95 | Properties: 96 | Tags: 97 | - Key: Name 98 | Value: BedrockProxyFargate/VPC/PublicSubnet2 99 | VpcId: 100 | Ref: VPCB9E5F0B4 101 | VPCPublicSubnet2RouteTableAssociation5A808732: 102 | Type: AWS::EC2::SubnetRouteTableAssociation 103 | Properties: 104 | RouteTableId: 105 | Ref: VPCPublicSubnet2RouteTable6F1A15F1 106 | SubnetId: 107 | Ref: VPCPublicSubnet2Subnet74179F39 108 | VPCPublicSubnet2DefaultRouteB7481BBA: 109 | Type: AWS::EC2::Route 110 | Properties: 111 | DestinationCidrBlock: 0.0.0.0/0 112 | GatewayId: 113 | Ref: VPCIGWB7E252D3 114 | RouteTableId: 115 | Ref: VPCPublicSubnet2RouteTable6F1A15F1 116 | DependsOn: 117 | - VPCVPCGW99B986DC 118 | VPCIGWB7E252D3: 119 | Type: AWS::EC2::InternetGateway 120 | Properties: 121 | Tags: 122 | - Key: Name 123 | Value: BedrockProxyFargate/VPC 124 | VPCVPCGW99B986DC: 125 | Type: AWS::EC2::VPCGatewayAttachment 126 | Properties: 127 | InternetGatewayId: 128 | Ref: VPCIGWB7E252D3 129 | VpcId: 130 | Ref: VPCB9E5F0B4 131 | ProxyExecRole6947A5BE: 132 | Type: AWS::IAM::Role 133 | Properties: 134 | AssumeRolePolicyDocument: 135 | 
Statement: 136 | - Action: sts:AssumeRole 137 | Effect: Allow 138 | Principal: 139 | Service: ecs-tasks.amazonaws.com 140 | Version: "2012-10-17" 141 | ProxyExecRoleDefaultPolicyED41DFE7: 142 | Type: AWS::IAM::Policy 143 | Properties: 144 | PolicyDocument: 145 | Statement: 146 | - Action: 147 | - logs:CreateLogStream 148 | - logs:PutLogEvents 149 | Effect: Allow 150 | Resource: "*" 151 | - Action: 152 | - secretsmanager:GetSecretValue 153 | - secretsmanager:DescribeSecret 154 | Effect: Allow 155 | Resource: 156 | Ref: ApiKeySecretArn 157 | - Action: 158 | - ecr:BatchCheckLayerAvailability 159 | - ecr:GetDownloadUrlForLayer 160 | - ecr:BatchGetImage 161 | Effect: Allow 162 | Resource: 163 | Fn::Join: 164 | - "" 165 | - - "arn:aws:ecr:" 166 | - Fn::Select: 167 | - 3 168 | - Fn::Split: 169 | - "." 170 | - Fn::Select: 171 | - 0 172 | - Fn::Split: 173 | - "/" 174 | - Ref: ContainerImageUri 175 | - ":" 176 | - Fn::Select: 177 | - 0 178 | - Fn::Split: 179 | - "." 180 | - Fn::Select: 181 | - 0 182 | - Fn::Split: 183 | - "/" 184 | - Ref: ContainerImageUri 185 | - ":repository/" 186 | - Fn::Select: 187 | - 0 188 | - Fn::Split: 189 | - ":" 190 | - Fn::Select: 191 | - 1 192 | - Fn::Split: 193 | - "/" 194 | - Ref: ContainerImageUri 195 | - Action: ecr:GetAuthorizationToken 196 | Effect: Allow 197 | Resource: "*" 198 | Version: "2012-10-17" 199 | PolicyName: ProxyExecRoleDefaultPolicyED41DFE7 200 | Roles: 201 | - Ref: ProxyExecRole6947A5BE 202 | ProxyTaskRole5DB6A540: 203 | Type: AWS::IAM::Role 204 | Properties: 205 | AssumeRolePolicyDocument: 206 | Statement: 207 | - Action: sts:AssumeRole 208 | Effect: Allow 209 | Principal: 210 | Service: ecs-tasks.amazonaws.com 211 | Version: "2012-10-17" 212 | ProxyTaskRoleDefaultPolicy933321B8: 213 | Type: AWS::IAM::Policy 214 | Properties: 215 | PolicyDocument: 216 | Statement: 217 | - Action: 218 | - bedrock:ListFoundationModels 219 | - bedrock:ListInferenceProfiles 220 | Effect: Allow 221 | Resource: "*" 222 | - Action: 223 | - bedrock:InvokeModel 224 | - bedrock:InvokeModelWithResponseStream 225 | Effect: Allow 226 | Resource: 227 | - arn:aws:bedrock:*::foundation-model/* 228 | - arn:aws:bedrock:*:*:inference-profile/* 229 | - arn:aws:bedrock:*:*:application-inference-profile/* 230 | Version: "2012-10-17" 231 | PolicyName: ProxyTaskRoleDefaultPolicy933321B8 232 | Roles: 233 | - Ref: ProxyTaskRole5DB6A540 234 | ProxyBedrockCluster893F4261: 235 | Type: AWS::ECS::Cluster 236 | ProxyBedrockClusterD9C31EFF: 237 | Type: AWS::ECS::ClusterCapacityProviderAssociations 238 | Properties: 239 | CapacityProviders: 240 | - FARGATE 241 | - FARGATE_SPOT 242 | Cluster: 243 | Ref: ProxyBedrockCluster893F4261 244 | DefaultCapacityProviderStrategy: [] 245 | ProxyTaskDef9F2A72E5: 246 | Type: AWS::ECS::TaskDefinition 247 | Properties: 248 | ContainerDefinitions: 249 | - Environment: 250 | - Name: DEBUG 251 | Value: "false" 252 | - Name: DEFAULT_MODEL 253 | Value: 254 | Ref: DefaultModelId 255 | - Name: DEFAULT_EMBEDDING_MODEL 256 | Value: cohere.embed-multilingual-v3 257 | - Name: ENABLE_CROSS_REGION_INFERENCE 258 | Value: "true" 259 | - Name: ENABLE_APPLICATION_INFERENCE_PROFILES 260 | Value: "true" 261 | - Name: ENABLE_PROMPT_CACHING 262 | Value: 263 | Ref: EnablePromptCaching 264 | Essential: true 265 | Image: 266 | Ref: ContainerImageUri 267 | Name: proxy-api 268 | PortMappings: 269 | - ContainerPort: 8080 270 | HostPort: 8080 271 | Protocol: tcp 272 | Secrets: 273 | - Name: API_KEY 274 | ValueFrom: 275 | Fn::Join: 276 | - "" 277 | - - Ref: ApiKeySecretArn 278 | - ":api_key::" 
279 | Cpu: "1024" 280 | ExecutionRoleArn: 281 | Fn::GetAtt: 282 | - ProxyExecRole6947A5BE 283 | - Arn 284 | Family: BedrockProxyFargateProxyTaskDefCD902792 285 | Memory: "2048" 286 | NetworkMode: awsvpc 287 | RequiresCompatibilities: 288 | - FARGATE 289 | RuntimePlatform: 290 | CpuArchitecture: ARM64 291 | OperatingSystemFamily: LINUX 292 | TaskRoleArn: 293 | Fn::GetAtt: 294 | - ProxyTaskRole5DB6A540 295 | - Arn 296 | ProxyApiService8651D882: 297 | Type: AWS::ECS::Service 298 | Properties: 299 | CapacityProviderStrategy: 300 | - CapacityProvider: FARGATE 301 | Weight: 1 302 | Cluster: 303 | Ref: ProxyBedrockCluster893F4261 304 | DeploymentConfiguration: 305 | Alarms: 306 | AlarmNames: [] 307 | Enable: false 308 | Rollback: false 309 | MaximumPercent: 200 310 | MinimumHealthyPercent: 50 311 | DesiredCount: 1 312 | EnableECSManagedTags: false 313 | HealthCheckGracePeriodSeconds: 60 314 | LoadBalancers: 315 | - ContainerName: proxy-api 316 | ContainerPort: 8080 317 | TargetGroupArn: 318 | Ref: ProxyALBListenerTargetsGroup187739FA 319 | NetworkConfiguration: 320 | AwsvpcConfiguration: 321 | AssignPublicIp: ENABLED 322 | SecurityGroups: 323 | - Fn::GetAtt: 324 | - ProxyApiServiceSecurityGroup51EBD9B8 325 | - GroupId 326 | Subnets: 327 | - Ref: VPCPublicSubnet1SubnetB4246D30 328 | - Ref: VPCPublicSubnet2Subnet74179F39 329 | TaskDefinition: 330 | Ref: ProxyTaskDef9F2A72E5 331 | DependsOn: 332 | - ProxyALBListener933E9515 333 | - ProxyALBListenerTargetsGroup187739FA 334 | - ProxyTaskRoleDefaultPolicy933321B8 335 | - ProxyTaskRole5DB6A540 336 | ProxyApiServiceSecurityGroup51EBD9B8: 337 | Type: AWS::EC2::SecurityGroup 338 | Properties: 339 | GroupDescription: BedrockProxyFargate/Proxy/ApiService/SecurityGroup 340 | SecurityGroupEgress: 341 | - CidrIp: 0.0.0.0/0 342 | Description: Allow all outbound traffic by default 343 | IpProtocol: "-1" 344 | VpcId: 345 | Ref: VPCB9E5F0B4 346 | DependsOn: 347 | - ProxyTaskRoleDefaultPolicy933321B8 348 | - ProxyTaskRole5DB6A540 349 | ProxyApiServiceSecurityGroupfromBedrockProxyFargateProxyALBSecurityGroup9C12825880081F8FE2: 350 | Type: AWS::EC2::SecurityGroupIngress 351 | Properties: 352 | Description: Load balancer to target 353 | FromPort: 8080 354 | GroupId: 355 | Fn::GetAtt: 356 | - ProxyApiServiceSecurityGroup51EBD9B8 357 | - GroupId 358 | IpProtocol: tcp 359 | SourceSecurityGroupId: 360 | Fn::GetAtt: 361 | - ProxyALBSecurityGroup0D6CA3DA 362 | - GroupId 363 | ToPort: 8080 364 | DependsOn: 365 | - ProxyTaskRoleDefaultPolicy933321B8 366 | - ProxyTaskRole5DB6A540 367 | ProxyALB87756780: 368 | Type: AWS::ElasticLoadBalancingV2::LoadBalancer 369 | Properties: 370 | LoadBalancerAttributes: 371 | - Key: deletion_protection.enabled 372 | Value: "false" 373 | - Key: idle_timeout.timeout_seconds 374 | Value: "600" 375 | Scheme: internet-facing 376 | SecurityGroups: 377 | - Fn::GetAtt: 378 | - ProxyALBSecurityGroup0D6CA3DA 379 | - GroupId 380 | Subnets: 381 | - Ref: VPCPublicSubnet1SubnetB4246D30 382 | - Ref: VPCPublicSubnet2Subnet74179F39 383 | Type: application 384 | DependsOn: 385 | - VPCPublicSubnet1DefaultRoute91CEF279 386 | - VPCPublicSubnet1RouteTableAssociation0B0896DC 387 | - VPCPublicSubnet2DefaultRouteB7481BBA 388 | - VPCPublicSubnet2RouteTableAssociation5A808732 389 | ProxyALBSecurityGroup0D6CA3DA: 390 | Type: AWS::EC2::SecurityGroup 391 | Properties: 392 | GroupDescription: Automatically created Security Group for ELB BedrockProxyFargateProxyALB481672E7 393 | SecurityGroupIngress: 394 | - CidrIp: 0.0.0.0/0 395 | Description: Allow from anyone on port 80 
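# This ingress entry allows plain HTTP on port 80 from any IPv4 address (0.0.0.0/0).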
396 | FromPort: 80 397 | IpProtocol: tcp 398 | ToPort: 80 399 | VpcId: 400 | Ref: VPCB9E5F0B4 401 | ProxyALBSecurityGrouptoBedrockProxyFargateProxyApiServiceSecurityGroupDDA1C56480393D1E44: 402 | Type: AWS::EC2::SecurityGroupEgress 403 | Properties: 404 | Description: Load balancer to target 405 | DestinationSecurityGroupId: 406 | Fn::GetAtt: 407 | - ProxyApiServiceSecurityGroup51EBD9B8 408 | - GroupId 409 | FromPort: 8080 410 | GroupId: 411 | Fn::GetAtt: 412 | - ProxyALBSecurityGroup0D6CA3DA 413 | - GroupId 414 | IpProtocol: tcp 415 | ToPort: 8080 416 | ProxyALBListener933E9515: 417 | Type: AWS::ElasticLoadBalancingV2::Listener 418 | Properties: 419 | DefaultActions: 420 | - TargetGroupArn: 421 | Ref: ProxyALBListenerTargetsGroup187739FA 422 | Type: forward 423 | LoadBalancerArn: 424 | Ref: ProxyALB87756780 425 | Port: 80 426 | Protocol: HTTP 427 | ProxyALBListenerTargetsGroup187739FA: 428 | Type: AWS::ElasticLoadBalancingV2::TargetGroup 429 | Properties: 430 | HealthCheckEnabled: true 431 | HealthCheckIntervalSeconds: 60 432 | HealthCheckPath: /health 433 | HealthCheckTimeoutSeconds: 30 434 | Port: 8080 435 | Protocol: HTTP 436 | TargetGroupAttributes: 437 | - Key: stickiness.enabled 438 | Value: "false" 439 | TargetType: ip 440 | VpcId: 441 | Ref: VPCB9E5F0B4 442 | Outputs: 443 | APIBaseUrl: 444 | Description: Proxy API Base URL (OPENAI_API_BASE) 445 | Value: 446 | Fn::Join: 447 | - "" 448 | - - http:// 449 | - Fn::GetAtt: 450 | - ProxyALB87756780 451 | - DNSName 452 | - /api/v1 453 | 454 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bedrock Access Gateway 2 | 3 | OpenAI-compatible RESTful APIs for Amazon Bedrock 4 | 5 | ## What's New 🔥 6 | 7 | **API Gateway Response Streaming Support** - You can now deploy with Amazon API Gateway REST API instead of ALB, enabling true response streaming for better latency and cost optimization. See [Deployment Options](#deployment-options) for details. 8 | 9 | **Latest Models Supported:** 10 | - **Claude 4.5 Family**: Opus 4.5, Sonnet 4.5, Haiku 4.5 - Anthropic's most intelligent models with enhanced coding and agent capabilities 11 | - **Amazon Nova**: Nova Micro, Nova Lite, Nova Pro, Nova Premier - Amazon's native foundation models with multimodal support 12 | - **DeepSeek**: DeepSeek-R1 (reasoning), DeepSeek-V3.1 - Advanced reasoning and general-purpose models 13 | - **Qwen 3**: Qwen3-32B, Qwen3-235B, Qwen3-Coder-30B, Qwen3-Coder-480B - Alibaba's latest language and coding models 14 | - **OpenAI OSS**: gpt-oss-20b, gpt-oss-120b - Open-source GPT models available via Bedrock 15 | 16 | It also supports reasoning for **Claude 4/4.5** (extended thinking and interleaved thinking) and **DeepSeek R1**. Check [How to Use](./docs/Usage.md#reasoning) for more details. You need to first run the Models API to refresh the model list. 17 | 18 | ## Overview 19 | 20 | Amazon Bedrock offers a wide range of foundation models (such as Claude 3 Opus/Sonnet/Haiku, Llama 2/3, Mistral/Mixtral, 21 | etc.) and a broad set of capabilities for you to build generative AI applications. Check the [Amazon Bedrock](https://aws.amazon.com/bedrock) landing page for additional information. 22 | 23 | Sometimes, you might have applications developed using OpenAI APIs or SDKs, and you want to experiment with Amazon Bedrock without modifying your codebase. 
Or you may simply wish to evaluate the capabilities of these foundation models in tools like AutoGen etc. Well, this repository allows you to access Amazon Bedrock models seamlessly through OpenAI APIs and SDKs, enabling you to test these models without code changes. 24 | 25 | If you find this GitHub repository useful, please consider giving it a free star ⭐ to show your appreciation and support for the project. 26 | 27 | **Features:** 28 | 29 | - [x] Support streaming response via server-sent events (SSE) 30 | - [x] Support Model APIs 31 | - [x] Support Chat Completion APIs 32 | - [x] Support Tool Call 33 | - [x] Support Embedding API 34 | - [x] Support Multimodal API 35 | - [x] Support Cross-Region Inference 36 | - [x] Support Application Inference Profiles (**new**) 37 | - [x] Support Reasoning (**new**) 38 | - [x] Support Interleaved thinking (**new**) 39 | - [x] Support Prompt Caching (**new**) 40 | 41 | Please check [Usage Guide](./docs/Usage.md) for more details about how to use the new APIs. 42 | 43 | 44 | ## Get Started 45 | 46 | ### Prerequisites 47 | 48 | Please make sure you have met below prerequisites: 49 | 50 | - Access to Amazon Bedrock foundation models. 51 | 52 | > For more information on how to request model access, please refer to the [Amazon Bedrock User Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) (Set Up > Model access) 53 | 54 | ### Architecture 55 | 56 | The following diagram illustrates the reference architecture. It uses [Amazon API Gateway response streaming](https://aws.amazon.com/blogs/compute/building-responsive-apis-with-amazon-api-gateway-response-streaming/) with Lambda for SSE support. 57 | 58 | ![Architecture](assets/arch.png) 59 | 60 | ### Deployment Options 61 | 62 | | Option | Pros | Cons | Best For | 63 | |--------|------|------|----------| 64 | | **API Gateway + Lambda** | No VPC required, pay-per-request, native streaming support, lower operational overhead | Potential cold starts | Most use cases, cost-sensitive deployments | 65 | | **ALB + Fargate** | Lowest streaming latency, no cold starts | Higher cost, requires VPC | High-throughput, latency-sensitive workloads | 66 | 67 | You can also use Lambda Function URL as an alternative, see [example](https://github.com/awslabs/aws-lambda-web-adapter/tree/main/examples/fastapi-response-streaming) 68 | 69 | ### Deployment 70 | 71 | Please follow the steps below to deploy the Bedrock Proxy APIs into your AWS account. Only supports regions where Amazon Bedrock is available (such as `us-west-2`). The deployment will take approximately **10-15 minutes** 🕒. 72 | 73 | **Step 1: Create your own API key in Secrets Manager (MUST)** 74 | 75 | > **Note:** This step is to use any string (without spaces) you like to create a custom API Key (credential) that will be used to access the proxy API later. This key does not have to match your actual OpenAI key, and you don't need to have an OpenAI API key. please keep the key safe and private. 76 | 77 | 1. Open the AWS Management Console and navigate to the AWS Secrets Manager service. 78 | 2. Click on "Store a new secret" button. 79 | 3. In the "Choose secret type" page, select: 80 | 81 | Secret type: Other type of secret 82 | Key/value pairs: 83 | - Key: api_key 84 | - Value: Enter your API key value 85 | 86 | Click "Next" 87 | 4. In the "Configure secret" page: 88 | Secret name: Enter a name (e.g., "BedrockProxyAPIKey") 89 | Description: (Optional) Add a description of your secret 90 | 5. 
Click "Next" and review all your settings and click "Store" 91 | 92 | After creation, you'll see your secret in the Secrets Manager console. Make note of the secret ARN. 93 | 94 | **Step 2: Build and push container images to ECR** 95 | 96 | 1. Clone this repository: 97 | ```bash 98 | git clone https://github.com/aws-samples/bedrock-access-gateway.git 99 | cd bedrock-access-gateway 100 | ``` 101 | 102 | 2. Run the build and push script: 103 | ```bash 104 | cd scripts 105 | bash ./push-to-ecr.sh 106 | ``` 107 | 108 | 3. Follow the prompts to configure: 109 | - ECR repository names (or use defaults) 110 | - Image tag (or use default: `latest`) 111 | - AWS region (or use default: `us-east-1`) 112 | 113 | 4. The script will build and push both Lambda and ECS/Fargate images to your ECR repositories. 114 | 115 | 5. **Important**: Copy the image URIs displayed at the end of the script output. You'll need these in the next step. 116 | 117 | **Step 3: Deploy the CloudFormation stack** 118 | 119 | 1. Download the CloudFormation template you want to use: 120 | - For API Gateway + Lambda: [`deployment/BedrockProxy.template`](deployment/BedrockProxy.template) 121 | - For ALB + Fargate: [`deployment/BedrockProxyFargate.template`](deployment/BedrockProxyFargate.template) 122 | 123 | 2. Sign in to AWS Management Console and navigate to the CloudFormation service in your target region. 124 | 125 | 3. Click "Create stack" → "With new resources (standard)". 126 | 127 | 4. Upload the template file you downloaded. 128 | 129 | 5. On the "Specify stack details" page, provide the following information: 130 | - **Stack name**: Enter a stack name (e.g., "BedrockProxyAPI") 131 | - **ApiKeySecretArn**: Enter the secret ARN from Step 1 132 | - **ContainerImageUri**: Enter the ECR image URI from Step 2 output 133 | - **DefaultModelId**: (Optional) Change the default model if needed 134 | 135 | Click "Next". 136 | 137 | 6. On the "Configure stack options" page, you can leave the default settings or customize them according to your needs. Click "Next". 138 | 139 | 7. On the "Review" page, review all details. Check the "I acknowledge that AWS CloudFormation might create IAM resources" checkbox at the bottom. Click "Submit". 140 | 141 | That is it! 🎉 Once deployed, click the CloudFormation stack and go to **Outputs** tab, you can find the API Base URL from `APIBaseUrl`, the value should look like `http://xxxx.xxx.elb.amazonaws.com/api/v1`. 142 | 143 | ### Troubleshooting 144 | 145 | If you encounter any issues, please check the [Troubleshooting Guide](./docs/Troubleshooting.md) for more details. 146 | 147 | ### SDK/API Usage 148 | 149 | All you need is the API Key and the API Base URL. If you didn't set up your own key following Step 1, the application will fail to start with an error message indicating that the API Key is not configured. 150 | 151 | Now, you can try out the proxy APIs. Let's say you want to test Claude 3 Sonnet model (model ID: `anthropic.claude-3-sonnet-20240229-v1:0`)... 
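If you are unsure which model IDs are enabled in your region, you can list them first with the same credentials. The sketch below uses the OpenAI Python SDK and assumes the `OPENAI_API_KEY` and `OPENAI_BASE_URL` environment variables shown in the next section are already exported.

```python
from openai import OpenAI

client = OpenAI()  # picks up OPENAI_API_KEY and OPENAI_BASE_URL from the environment

# List the Bedrock model IDs exposed by the gateway (same data as the Models API)
for model in client.models.list().data:
    print(model.id)
```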
152 | 153 | **Example API Usage** 154 | 155 | ```bash 156 | export OPENAI_API_KEY= 157 | export OPENAI_BASE_URL= 158 | # For older versions 159 | # https://github.com/openai/openai-python/issues/624 160 | export OPENAI_API_BASE= 161 | ``` 162 | 163 | ```bash 164 | curl $OPENAI_BASE_URL/chat/completions \ 165 | -H "Content-Type: application/json" \ 166 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 167 | -d '{ 168 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 169 | "messages": [ 170 | { 171 | "role": "user", 172 | "content": "Hello!" 173 | } 174 | ] 175 | }' 176 | ``` 177 | 178 | **Example SDK Usage** 179 | 180 | ```python 181 | from openai import OpenAI 182 | 183 | client = OpenAI() 184 | completion = client.chat.completions.create( 185 | model="anthropic.claude-3-sonnet-20240229-v1:0", 186 | messages=[{"role": "user", "content": "Hello!"}], 187 | ) 188 | 189 | print(completion.choices[0].message.content) 190 | ``` 191 | 192 | Please check [Usage Guide](./docs/Usage.md) for more details about how to use embedding API, multimodal API and tool call. 193 | 194 | ### Application Inference Profiles 195 | 196 | This proxy now supports **Application Inference Profiles**, which allow you to track usage and costs for your model invocations. You can use application inference profiles created in your AWS account for cost tracking and monitoring purposes. 197 | 198 | **Using Application Inference Profiles:** 199 | 200 | ```bash 201 | # Use an application inference profile ARN as the model ID 202 | curl $OPENAI_BASE_URL/chat/completions \ 203 | -H "Content-Type: application/json" \ 204 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 205 | -d '{ 206 | "model": "arn:aws:bedrock:us-west-2:123456789012:application-inference-profile/your-profile-id", 207 | "messages": [ 208 | { 209 | "role": "user", 210 | "content": "Hello!" 211 | } 212 | ] 213 | }' 214 | ``` 215 | 216 | **SDK Usage with Application Inference Profiles:** 217 | 218 | ```python 219 | from openai import OpenAI 220 | 221 | client = OpenAI() 222 | completion = client.chat.completions.create( 223 | model="arn:aws:bedrock:us-west-2:123456789012:application-inference-profile/your-profile-id", 224 | messages=[{"role": "user", "content": "Hello!"}], 225 | ) 226 | 227 | print(completion.choices[0].message.content) 228 | ``` 229 | 230 | **Benefits of Application Inference Profiles:** 231 | - **Cost Tracking**: Track usage and costs for specific applications or use cases 232 | - **Usage Monitoring**: Monitor model invocation metrics through CloudWatch 233 | - **Tag-based Cost Allocation**: Use AWS cost allocation tags for detailed billing analysis 234 | 235 | For more information about creating and managing application inference profiles, see the [Amazon Bedrock User Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/inference-profiles-create.html). 236 | 237 | ### Prompt Caching 238 | 239 | This proxy now supports **Prompt Caching** for Claude and Nova models, which can reduce costs by up to 90% and latency by up to 85% for workloads with repeated prompts. 240 | 241 | **Supported Models:** 242 | - Claude models (Claude 3.5 Haiku, Claude 4, Claude 4.5, etc.) 243 | - Nova models (Nova Micro, Nova Lite, Nova Pro, Nova Premier) 244 | 245 | **Enabling Prompt Caching:** 246 | 247 | You can enable prompt caching in two ways: 248 | 249 | 1. **Globally via Environment Variable** (set in ECS Task Definition or Lambda): 250 | ```bash 251 | ENABLE_PROMPT_CACHING=true 252 | ``` 253 | 254 | 2. 
**Per-request via `extra_body`** : 255 | 256 | **Python SDK:** 257 | ```python 258 | from openai import OpenAI 259 | 260 | client = OpenAI() 261 | 262 | # Cache system prompts 263 | response = client.chat.completions.create( 264 | model="global.anthropic.claude-haiku-4-5-20251001-v1:0", 265 | messages=[ 266 | {"role": "system", "content": "You are an expert assistant with knowledge of..."}, 267 | {"role": "user", "content": "Help me with this task"} 268 | ], 269 | extra_body={ 270 | "prompt_caching": {"system": True} 271 | } 272 | ) 273 | 274 | # Check cache hit 275 | if response.usage.prompt_tokens_details: 276 | cached_tokens = response.usage.prompt_tokens_details.cached_tokens 277 | print(f"Cached tokens: {cached_tokens}") 278 | ``` 279 | 280 | **cURL:** 281 | ```bash 282 | curl $OPENAI_BASE_URL/chat/completions \ 283 | -H "Content-Type: application/json" \ 284 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 285 | -d '{ 286 | "model": "global.anthropic.claude-haiku-4-5-20251001-v1:0", 287 | "messages": [ 288 | {"role": "system", "content": "Long system prompt..."}, 289 | {"role": "user", "content": "Question"} 290 | ], 291 | "extra_body": { 292 | "prompt_caching": {"system": true} 293 | } 294 | }' 295 | ``` 296 | 297 | **Cache Options:** 298 | - `"prompt_caching": {"system": true}` - Cache system prompts 299 | - `"prompt_caching": {"messages": true}` - Cache user messages 300 | - `"prompt_caching": {"system": true, "messages": true}` - Cache both 301 | 302 | **Requirements:** 303 | - Prompt must be ≥1,024 tokens to enable caching 304 | - Cache TTL is 5 minutes (resets on each cache hit) 305 | - Nova models have a 20,000 token caching limit 306 | 307 | For more information, see the [Amazon Bedrock Prompt Caching Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html). 308 | 309 | ## Other Examples 310 | 311 | ### LangChain 312 | 313 | Make sure you use `ChatOpenAI(...)` instead of `OpenAI(...)` 314 | 315 | ```python 316 | # pip install langchain-openai 317 | import os 318 | 319 | from langchain.chains import LLMChain 320 | from langchain.prompts import PromptTemplate 321 | from langchain_openai import ChatOpenAI 322 | 323 | chat = ChatOpenAI( 324 | model="anthropic.claude-3-sonnet-20240229-v1:0", 325 | temperature=0, 326 | openai_api_key=os.environ['OPENAI_API_KEY'], 327 | openai_api_base=os.environ['OPENAI_BASE_URL'], 328 | ) 329 | 330 | template = """Question: {question} 331 | 332 | Answer: Let's think step by step.""" 333 | 334 | prompt = PromptTemplate.from_template(template) 335 | llm_chain = LLMChain(prompt=prompt, llm=chat) 336 | 337 | question = "What NFL team won the Super Bowl in the year Justin Beiber was born?" 338 | response = llm_chain.invoke(question) 339 | print(response) 340 | 341 | ``` 342 | 343 | ## FAQs 344 | 345 | ### About Privacy 346 | 347 | This application does not collect any of your data. Furthermore, it does not log any requests or responses by default. 348 | 349 | ### Why choose API Gateway vs ALB? 350 | 351 | **API Gateway + Lambda** uses [API Gateway response streaming](https://aws.amazon.com/blogs/compute/building-responsive-apis-with-amazon-api-gateway-response-streaming/) with [Lambda Web Adapter](https://github.com/awslabs/aws-lambda-web-adapter) to support SSE streaming without requiring a VPC. This is a cost-effective, serverless option with up to 10 minutes timeout. 352 | 353 | **ALB + Fargate** provides the lowest streaming latency with no cold starts, ideal for high-throughput workloads. 
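Both deployment options expose the same OpenAI-compatible streaming interface. As a quick end-to-end check that SSE streaming works, here is a minimal sketch with the OpenAI Python SDK (it assumes the `OPENAI_API_KEY` and `OPENAI_BASE_URL` environment variables from the SDK/API Usage section are set):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY and OPENAI_BASE_URL from the environment

# stream=True makes the gateway return server-sent events; print tokens as they arrive
stream = client.chat.completions.create(
    model="anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[{"role": "user", "content": "Write a haiku about streaming APIs."}],
    stream=True,
)

for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```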
354 | 355 | ### Which regions are supported? 356 | 357 | Generally speaking, all regions that Amazon Bedrock supports will also be supported, if not, please raise an issue in Github. 358 | 359 | Note that not all models are available in those regions. 360 | 361 | ### Which models are supported? 362 | 363 | You can use the [Models API](./docs/Usage.md#models-api) to get/refresh a list of supported models in the current region. 364 | 365 | ### Can I run this locally 366 | 367 | Yes, you can run this locally, e.g. run below command under `src` folder: 368 | 369 | ```bash 370 | uvicorn api.app:app --host 0.0.0.0 --port 8000 371 | ``` 372 | 373 | The API base url should look like `http://localhost:8000/api/v1`. 374 | 375 | ### Any performance sacrifice or latency increase by using the proxy APIs 376 | 377 | Compared with direct AWS SDK calls, the proxy architecture will add some latency. The default API Gateway + Lambda deployment provides good streaming performance with Lambda response streaming. 378 | 379 | For lowest latency on streaming responses, consider the ALB + Fargate deployment option which eliminates cold starts and provides consistent performance. 380 | 381 | ### Any plan to support SageMaker models? 382 | 383 | Currently, there is no plan to support SageMaker models. This may change provided there's a demand from customers. 384 | 385 | ### Any plan to support Bedrock custom models? 386 | 387 | Fine-tuned models and models with Provisioned Throughput are currently not supported. You can clone the repo and make the customization if needed. 388 | 389 | ### How to upgrade? 390 | 391 | To use the latest features, you need follow the deployment guide to redeploy the application. You can upgrade the existing CloudFormation stack to get the latest changes. 392 | 393 | ## Security 394 | 395 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 396 | 397 | ## License 398 | 399 | This library is licensed under the MIT-0 License. See the LICENSE file. 400 | -------------------------------------------------------------------------------- /docs/Usage_CN.md: -------------------------------------------------------------------------------- 1 | [English](./Usage.md) 2 | 3 | # Usage Guide 4 | 5 | 假设您在部署后已设置以下环境变量: 6 | 7 | ```bash 8 | export OPENAI_API_KEY= 9 | export OPENAI_BASE_URL= 10 | ``` 11 | 12 | **API 示例:** 13 | - [Models API](#models-api) 14 | - [Embedding API](#embedding-api) 15 | - [Multimodal API](#multimodal-api) 16 | - [Tool Call](#tool-call) 17 | - [Reasoning](#reasoning) 18 | - [Interleaved thinking (beta)](#Interleaved thinking (beta)) 19 | 20 | 21 | ## Models API 22 | 23 | 你可以通过这个API 获取支持的models 列表。 另外,如果Amazon Bedrock有新模型加入后,你也可以用它来更新刷新模型列表。 24 | 25 | **Request 示例** 26 | 27 | ```bash 28 | curl -s $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | jq .data 29 | ``` 30 | 31 | **Response 示例** 32 | 33 | ```bash 34 | [ 35 | ... 36 | { 37 | "id": "anthropic.claude-3-5-sonnet-20240620-v1:0", 38 | "created": 1734416893, 39 | "object": "model", 40 | "owned_by": "bedrock" 41 | }, 42 | { 43 | "id": "us.anthropic.claude-3-5-sonnet-20240620-v1:0", 44 | "created": 1734416893, 45 | "object": "model", 46 | "owned_by": "bedrock" 47 | }, 48 | ... 
49 | ] 50 | ``` 51 | 52 | ## Chat Completions API 53 | 54 | ### Claude Sonnet 4.5 基础示例 55 | 56 | Claude Sonnet 4.5 是 Anthropic 最智能的模型,在编码、复杂推理和基于代理的任务方面表现出色。它通过全球跨区域推理配置文件提供。 57 | 58 | **Request 示例** 59 | 60 | ```bash 61 | curl $OPENAI_BASE_URL/chat/completions \ 62 | -H "Content-Type: application/json" \ 63 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 64 | -d '{ 65 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 66 | "messages": [ 67 | { 68 | "role": "user", 69 | "content": "编写一个使用动态规划计算斐波那契数列的Python函数。" 70 | } 71 | ] 72 | }' 73 | ``` 74 | 75 | **SDK 使用示例** 76 | 77 | ```python 78 | from openai import OpenAI 79 | 80 | client = OpenAI() 81 | completion = client.chat.completions.create( 82 | model="global.anthropic.claude-sonnet-4-5-20250929-v1:0", 83 | messages=[{"role": "user", "content": "编写一个使用动态规划计算斐波那契数列的Python函数。"}], 84 | ) 85 | 86 | print(completion.choices[0].message.content) 87 | ``` 88 | 89 | ## Embedding API 90 | 91 | **重要**: 在使用此代理 API 之前,请仔细阅读以下几点: 92 | 93 | 1. 如果您之前使用 OpenAI Embedding模型来创建向量,请注意切换到新模型可能没有那么直接。不同模型具有不同的维度(例如,embed-multilingual-v3.0 有 1024 个维度),即使对于相同的文本,它们也可能产生不同的结果。 94 | 2. 如果您使用 OpenAI Embedding模型传入的是整数编码(例如与 LangChain 一起使用),此方案将尝试使用 `tiktoken` 进行解码以检索原始文本。但是,无法保证解码后的文本准确无误。 95 | 3. 如果您对长文本使用 OpenAI Embedding,您应该验证 Bedrock 模型支持的最大Token数,例如为获得最佳性能,Bedrock 建议将文本长度限制在少于 512 个Token。 96 | 97 | **Request 示例** 98 | 99 | ```bash 100 | curl $OPENAI_BASE_URL/embeddings \ 101 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 102 | -H "Content-Type: application/json" \ 103 | -d '{ 104 | "input": "The food was delicious and the waiter...", 105 | "model": "text-embedding-ada-002", 106 | "encoding_format": "float" 107 | }' 108 | ``` 109 | 110 | **Response 示例** 111 | 112 | ```json 113 | { 114 | "object": "list", 115 | "data": [ 116 | { 117 | "object": "embedding", 118 | "embedding": [ 119 | -0.02279663, 120 | -0.024612427, 121 | 0.012863159, 122 | ... 123 | 0.01612854, 124 | 0.0038928986 125 | ], 126 | "index": 0 127 | } 128 | ], 129 | "model": "cohere.embed-multilingual-v3", 130 | "usage": { 131 | "prompt_tokens": 0, 132 | "total_tokens": 0 133 | } 134 | } 135 | ``` 136 | 137 | 或者你可以使用OpenAI 的SDK 138 | 139 | ```python 140 | from openai import OpenAI 141 | 142 | client = OpenAI() 143 | 144 | def get_embedding(text, model="text-embedding-3-small"): 145 | text = text.replace("\n", " ") 146 | return client.embeddings.create(input=[text], model=model).data[0].embedding 147 | 148 | text = "hello" 149 | # will output like [0.003578186, 0.028717041, 0.031021118, -0.0014066696,...] 150 | print(get_embedding(text)) 151 | ``` 152 | 153 | 或者 LangChain 154 | 155 | ```python 156 | from langchain_openai import OpenAIEmbeddings 157 | 158 | embeddings = OpenAIEmbeddings( 159 | model="text-embedding-3-large", 160 | ) 161 | text = "This is a test document." 
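# embed_query returns a single embedding vector; embed_documents returns one vector per input document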
162 | query_result = embeddings.embed_query(text) 163 | print(query_result[:5]) 164 | doc_result = embeddings.embed_documents([text]) 165 | print(doc_result[0][:5]) 166 | ``` 167 | 168 | ## Multimodal API 169 | 170 | **Request 示例** 171 | 172 | ```bash 173 | curl $OPENAI_BASE_URL/chat/completions \ 174 | -H "Content-Type: application/json" \ 175 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 176 | -d '{ 177 | "model": "gpt-3.5-turbo", 178 | "messages": [ 179 | { 180 | "role": "user", 181 | "content": [ 182 | { 183 | "type": "text", 184 | "text": "please identify and count all the objects in this images, list all the names" 185 | }, 186 | { 187 | "type": "image_url", 188 | "image_url": { 189 | "url": "https://github.com/aws-samples/bedrock-access-gateway/blob/main/assets/obj-detect.png?raw=true" 190 | } 191 | } 192 | ] 193 | } 194 | ] 195 | }' 196 | ``` 197 | 198 | 如果您需要使用此API处理非公开图像,您可以先对图像进行base64编码,然后传递编码后的字符串。 199 | 将"image/jpeg"替换为实际的内容类型(content type)。目前仅支持"image/jpeg"、"image/png"、"image/gif"或"image/webp"。 200 | 201 | ```bash 202 | curl $OPENAI_BASE_URL/chat/completions \ 203 | -H "Content-Type: application/json" \ 204 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 205 | -d '{ 206 | "model": "gpt-3.5-turbo", 207 | "messages": [ 208 | { 209 | "role": "user", 210 | "content": [ 211 | { 212 | "type": "text", 213 | "text": "please identify and count all the objects in this images, list all the names" 214 | }, 215 | { 216 | "type": "image_url", 217 | "image_url": { 218 | "url": "data:image/jpeg;base64," 219 | } 220 | } 221 | ] 222 | } 223 | ] 224 | }' 225 | ``` 226 | 227 | **Response 示例** 228 | 229 | ```json 230 | { 231 | "id": "msg_01BY3wcz41x7XrKhxY3VzWke", 232 | "created": 1712543069, 233 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 234 | "system_fingerprint": "fp", 235 | "choices": [ 236 | { 237 | "index": 0, 238 | "finish_reason": "stop", 239 | "message": { 240 | "role": "assistant", 241 | "content": "The image contains the following objects:\n\n1. A peach-colored short-sleeve button-up shirt\n2. An olive green plaid long coat/jacket\n3. A pair of white sneakers or canvas shoes\n4. A brown shoulder bag or purse\n5. A makeup brush or cosmetic applicator\n6. A tube or container (possibly lipstick or lip balm)\n7. A pair of sunglasses\n8. A thought bubble icon\n9. A footprint icon\n10. A leaf or plant icon\n11. A flower icon\n12. A cloud icon\n\nIn total, there are 12 distinct objects depicted in the illustrated scene." 242 | } 243 | } 244 | ], 245 | "object": "chat.completion", 246 | "usage": { 247 | "prompt_tokens": 197, 248 | "completion_tokens": 147, 249 | "total_tokens": 344 250 | } 251 | } 252 | ``` 253 | 254 | 255 | ## Tool Call 256 | 257 | **重要**:在使用此代理API进行Tool Call之前,请仔细阅读以下几点: 258 | 259 | 1. OpenAI 已经废弃使用Function Call,而推荐使用Tool Call,因此Function Call在此处不受支持,您应该改为Tool Call。 260 | 261 | **Request 示例** 262 | 263 | ```bash 264 | curl $OPENAI_BASE_URL/chat/completions \ 265 | -H "Content-Type: application/json" \ 266 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 267 | -d '{ 268 | "model": "gpt-3.5-turbo", 269 | "messages": [ 270 | { 271 | "role": "user", 272 | "content": "What is the weather like in Shanghai today?" 273 | } 274 | ], 275 | "tools": [ 276 | { 277 | "type": "function", 278 | "function": { 279 | "name": "get_current_weather", 280 | "description": "Get the current weather in a given location", 281 | "parameters": { 282 | "type": "object", 283 | "properties": { 284 | "location": { 285 | "type": "string", 286 | "description": "The city or state which is required." 
287 | }, 288 | "unit": { 289 | "type": "string", 290 | "enum": [ 291 | "celsius", 292 | "fahrenheit" 293 | ] 294 | } 295 | }, 296 | "required": [ 297 | "location" 298 | ] 299 | } 300 | } 301 | }, 302 | { 303 | "type": "function", 304 | "function": { 305 | "name": "get_current_location", 306 | "description": "Use this tool to get the current location if user does not provide a location", 307 | "parameters": { 308 | "type": "object", 309 | "properties": {} 310 | } 311 | } 312 | } 313 | ], 314 | "tool_choice": "auto" 315 | }' 316 | ``` 317 | 318 | **Response 示例** 319 | 320 | ```json 321 | { 322 | "id": "msg_01PjrKDWhYGsrTNdeqzWd6D9", 323 | "created": 1712543689, 324 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 325 | "system_fingerprint": "fp", 326 | "choices": [ 327 | { 328 | "index": 0, 329 | "finish_reason": "stop", 330 | "message": { 331 | "role": "assistant", 332 | "tool_calls": [ 333 | { 334 | "id": "0", 335 | "type": "function", 336 | "function": { 337 | "name": "get_current_weather", 338 | "arguments": "{\"location\": \"Shanghai\", \"unit\": \"celsius\"}" 339 | } 340 | } 341 | ] 342 | } 343 | } 344 | ], 345 | "object": "chat.completion", 346 | "usage": { 347 | "prompt_tokens": 256, 348 | "completion_tokens": 64, 349 | "total_tokens": 320 350 | } 351 | } 352 | ``` 353 | 354 | You can try it with different questions, such as: 355 | 1. Hello, who are you? (No tools are needed) 356 | 2. What is the weather like today? (Should use get_current_location tool first) 357 | 358 | ## Reasoning 359 | 360 | 361 | **重要**: 使用此 reasoning 推理模式前,请仔细阅读以下要点。 362 | 363 | - 目前仅 Claude 3.7 Sonnet / Deepseek R1 模型支持推理功能。使用前请确保所用模型支持推理。 364 | - Claude 3.7 Sonnet 推理模式(或思考模式)默认未启用,您必须在请求中传递额外的 reasoning_effort 参数,参数值可选:low,medium, high。另外,请在请求中提供正确的 max_tokens(或 max_completion_tokens)参数。budget_tokens 基于 reasoning_effort 设置(低:30%,中:60%,高:100% 的max tokens),确保最小 budget_tokens 为 1,024,Anthropic 建议至少使用 4,000 个令牌以获得全面的推理。详情请参阅 [Bedrock Document](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-37.html)。 365 | - Deepseek R1 会自动使用推理模式,不需要在中传递额外的 reasoning_effort 参数(否则会报错) 366 | - 推理结果(思维链结果、思考过程)被添加到名为 'reasoning_content' 的额外标签中,这不是 OpenAI 官方支持的格式。此设计遵循 [Deepseek Reasoning Model](https://api-docs.deepseek.com/guides/reasoning_model#api-example) 的规范。未来可能会有所变动。 367 | 368 | **Request 示例** 369 | 370 | - Claude 3.7 Sonnet 371 | 372 | ```bash 373 | curl $OPENAI_BASE_URL/chat/completions \ 374 | -H "Content-Type: application/json" \ 375 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 376 | -d '{ 377 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 378 | "messages": [ 379 | { 380 | "role": "user", 381 | "content": "which one is bigger, 3.9 or 3.11?" 382 | } 383 | ], 384 | "max_completion_tokens": 4096, 385 | "reasoning_effort": "low", 386 | "stream": false 387 | }' 388 | ``` 389 | 390 | - DeepSeek R1 391 | 392 | ```bash 393 | curl $OPENAI_BASE_URL/chat/completions \ 394 | -H "Content-Type: application/json" \ 395 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 396 | -d '{ 397 | "model": "us.deepseek.r1-v1:0", 398 | "messages": [ 399 | { 400 | "role": "user", 401 | "content": "which one is bigger, 3.9 or 3.11?" 
402 | } 403 | ], 404 | "stream": false 405 | }' 406 | ``` 407 | 408 | 409 | **Response 示例** 410 | 411 | ```json 412 | { 413 | "id": "chatcmpl-83fb7a88", 414 | "created": 1740545278, 415 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 416 | "system_fingerprint": "fp", 417 | "choices": [ 418 | { 419 | "index": 0, 420 | "finish_reason": "stop", 421 | "logprobs": null, 422 | "message": { 423 | "role": "assistant", 424 | "content": "3.9 is bigger than 3.11.\n\nWhen comparing decimal numbers, we need to understand what these numbers actually represent:...", 425 | "reasoning_content": "I need to compare the decimal numbers 3.9 and 3.11.\n\nFor decimal numbers, we first compare the whole number parts, and if they're equal, we compare the decimal parts. \n\nBoth numbers ..." 426 | } 427 | } 428 | ], 429 | "object": "chat.completion", 430 | "usage": { 431 | "prompt_tokens": 51, 432 | "completion_tokens": 565, 433 | "total_tokens": 616 434 | } 435 | } 436 | ``` 437 | 438 | 或者使用 OpenAI SDK (请先运行`pip3 install -U openai` 升级到最新版本) 439 | 440 | - Non-Streaming 441 | 442 | ```python 443 | from openai import OpenAI 444 | client = OpenAI() 445 | 446 | messages = [{"role": "user", "content": "which one is bigger, 3.9 or 3.11?"}] 447 | response = client.chat.completions.create( 448 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 449 | messages=messages, 450 | reasoning_effort="low", 451 | max_completion_tokens=4096, 452 | ) 453 | 454 | reasoning_content = response.choices[0].message.reasoning_content 455 | content = response.choices[0].message.content 456 | ``` 457 | 458 | - Streaming 459 | 460 | ```python 461 | from openai import OpenAI 462 | client = OpenAI() 463 | 464 | messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] 465 | response = client.chat.completions.create( 466 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 467 | messages=messages, 468 | reasoning_effort="low", 469 | max_completion_tokens=4096, 470 | stream=True, 471 | ) 472 | 473 | reasoning_content = "" 474 | content = "" 475 | 476 | for chunk in response: 477 | if hasattr(chunk.choices[0].delta, 'reasoning_content') and chunk.choices[0].delta.reasoning_content: 478 | reasoning_content += chunk.choices[0].delta.reasoning_content 479 | elif chunk.choices[0].delta.content: 480 | content += chunk.choices[0].delta.content 481 | ``` 482 | 483 | ## Interleaved thinking (beta) 484 | 485 | **重要提示**:在使用 Chat Completion API 的推理模式(reasoning mode)前,请务必仔细阅读以下内容。 486 | 487 | Claude 4 模型支持借助工具使用的扩展思维功能(Extended Thinking),其中包含交错思考([interleaved thinking](https://docs.aws.amazon.com/bedrock/latest/userguide/claude-messages-extended-thinking.html#claude-messages-extended-thinking-tool-use-interleaved) )。该功能使 Claude 4 可以在多次调用工具之间进行思考,并在收到工具结果后执行更复杂的推理,这对处理更复杂的 Agentic AI 交互非常有帮助。 488 | 489 | 在交错思考模式下,budget_tokens 可以超过 max_tokens 参数,因为它代表一次助手回合中所有思考块的总 Token 预算。 490 | 491 | **支持的模型**: Claude Sonnet 4, Claude Sonnet 4.5 492 | 493 | **Request 示例** 494 | 495 | - Non-Streaming (Claude Sonnet 4.5) 496 | 497 | ```bash 498 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 499 | -H "Content-Type: application/json" \ 500 | -H "Authorization: Bearer bedrock" \ 501 | -d '{ 502 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 503 | "max_tokens": 2048, 504 | "messages": [{ 505 | "role": "user", 506 | "content": "解释如何实现一个具有自平衡功能的二叉搜索树。" 507 | }], 508 | "extra_body": { 509 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 510 | "thinking": {"type": "enabled", "budget_tokens": 4096} 511 | } 512 | }' 513 
| ``` 514 | 515 | - Non-Streaming (Claude Sonnet 4) 516 | 517 | ```bash 518 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 519 | -H "Content-Type: application/json" \ 520 | -H "Authorization: Bearer bedrock" \ 521 | -d '{ 522 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 523 | "max_tokens": 2048, 524 | "messages": [{ 525 | "role": "user", 526 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 527 | }], 528 | "extra_body": { 529 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 530 | "thinking": {"type": "enabled", "budget_tokens": 4096} 531 | } 532 | }' 533 | ``` 534 | 535 | - Streaming (Claude Sonnet 4.5) 536 | 537 | ```bash 538 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 539 | -H "Content-Type: application/json" \ 540 | -H "Authorization: Bearer bedrock" \ 541 | -d '{ 542 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 543 | "max_tokens": 2048, 544 | "messages": [{ 545 | "role": "user", 546 | "content": "解释如何实现一个具有自平衡功能的二叉搜索树。" 547 | }], 548 | "stream": true, 549 | "extra_body": { 550 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 551 | "thinking": {"type": "enabled", "budget_tokens": 4096} 552 | } 553 | }' 554 | ``` 555 | 556 | - Streaming (Claude Sonnet 4) 557 | 558 | ```bash 559 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 560 | -H "Content-Type: application/json" \ 561 | -H "Authorization: Bearer bedrock" \ 562 | -d '{ 563 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 564 | "max_tokens": 2048, 565 | "messages": [{ 566 | "role": "user", 567 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 568 | }], 569 | "stream": true, 570 | "extra_body": { 571 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 572 | "thinking": {"type": "enabled", "budget_tokens": 4096} 573 | } 574 | }' 575 | ``` 576 | -------------------------------------------------------------------------------- /docs/Usage.md: -------------------------------------------------------------------------------- 1 | [中文](./Usage_CN.md) 2 | 3 | # Usage Guide 4 | 5 | Assuming you have set up below environment variables after deployed: 6 | 7 | ```bash 8 | export OPENAI_API_KEY= 9 | export OPENAI_BASE_URL= 10 | ``` 11 | 12 | **API Example:** 13 | - [Models API](#models-api) 14 | - [Embedding API](#embedding-api) 15 | - [Multimodal API](#multimodal-api) 16 | - [Tool Call](#tool-call) 17 | - [Reasoning](#reasoning) 18 | - [Interleaved thinking (beta)](#Interleaved thinking (beta)) 19 | 20 | ## Models API 21 | 22 | You can use this API to get a list of supported model IDs. 23 | 24 | Also, you can use this API to refresh the model list if new models are added to Amazon Bedrock. 25 | 26 | 27 | **Example Request** 28 | 29 | ```bash 30 | curl -s $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | jq .data 31 | ``` 32 | 33 | **Example Response** 34 | 35 | ```bash 36 | [ 37 | ... 38 | { 39 | "id": "anthropic.claude-3-5-sonnet-20240620-v1:0", 40 | "created": 1734416893, 41 | "object": "model", 42 | "owned_by": "bedrock" 43 | }, 44 | { 45 | "id": "us.anthropic.claude-3-5-sonnet-20240620-v1:0", 46 | "created": 1734416893, 47 | "object": "model", 48 | "owned_by": "bedrock" 49 | }, 50 | ... 
51 | ] 52 | ``` 53 | 54 | ## Chat Completions API 55 | 56 | ### Basic Example with Claude Sonnet 4.5 57 | 58 | Claude Sonnet 4.5 is Anthropic's most intelligent model, excelling at coding, complex reasoning, and agent-based tasks. It's available via global cross-region inference profiles. 59 | 60 | **Example Request** 61 | 62 | ```bash 63 | curl $OPENAI_BASE_URL/chat/completions \ 64 | -H "Content-Type: application/json" \ 65 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 66 | -d '{ 67 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 68 | "messages": [ 69 | { 70 | "role": "user", 71 | "content": "Write a Python function to calculate the Fibonacci sequence using dynamic programming." 72 | } 73 | ] 74 | }' 75 | ``` 76 | 77 | **Example SDK Usage** 78 | 79 | ```python 80 | from openai import OpenAI 81 | 82 | client = OpenAI() 83 | completion = client.chat.completions.create( 84 | model="global.anthropic.claude-sonnet-4-5-20250929-v1:0", 85 | messages=[{"role": "user", "content": "Write a Python function to calculate the Fibonacci sequence using dynamic programming."}], 86 | ) 87 | 88 | print(completion.choices[0].message.content) 89 | ``` 90 | 91 | ## Embedding API 92 | 93 | **Important Notice**: Please carefully review the following points before using this proxy API for embedding. 94 | 95 | 1. If you have previously used OpenAI embedding models to create vectors, be aware that switching to a new model may not be straightforward. Different models have varying dimensions (e.g., embed-multilingual-v3.0 has 1024 dimensions), and even for the same text, they may produce different results. 96 | 2. If you are using OpenAI embedding models for encoded integers (such as with LangChain), this solution will attempt to decode the integers using `tiktoken` to retrieve the original text. However, there is no guarantee that the decoded text will be accurate. 97 | 3. If you are using OpenAI embedding models for long texts, you should verify the maximum number of tokens supported for Bedrock models, e.g. for optimal performance, Bedrock recommends limiting the text length to less than 512 tokens. 98 | 99 | 100 | **Example Request** 101 | 102 | ```bash 103 | curl $OPENAI_BASE_URL/embeddings \ 104 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 105 | -H "Content-Type: application/json" \ 106 | -d '{ 107 | "input": "The food was delicious and the waiter...", 108 | "model": "text-embedding-ada-002", 109 | "encoding_format": "float" 110 | }' 111 | ``` 112 | 113 | **Example Response** 114 | 115 | ```json 116 | { 117 | "object": "list", 118 | "data": [ 119 | { 120 | "object": "embedding", 121 | "embedding": [ 122 | -0.02279663, 123 | -0.024612427, 124 | 0.012863159, 125 | ... 126 | 0.01612854, 127 | 0.0038928986 128 | ], 129 | "index": 0 130 | } 131 | ], 132 | "model": "cohere.embed-multilingual-v3", 133 | "usage": { 134 | "prompt_tokens": 0, 135 | "total_tokens": 0 136 | } 137 | } 138 | ``` 139 | 140 | Alternatively, you can use the OpenAI SDK 141 | 142 | ```python 143 | from openai import OpenAI 144 | 145 | client = OpenAI() 146 | 147 | def get_embedding(text, model="text-embedding-3-small"): 148 | text = text.replace("\n", " ") 149 | return client.embeddings.create(input=[text], model=model).data[0].embedding 150 | 151 | text = "hello" 152 | # will output like [0.003578186, 0.028717041, 0.031021118, -0.0014066696,...] 
153 | print(get_embedding(text)) 154 | ``` 155 | 156 | Or LangChain 157 | 158 | ```python 159 | from langchain_openai import OpenAIEmbeddings 160 | 161 | embeddings = OpenAIEmbeddings( 162 | model="text-embedding-3-large", 163 | ) 164 | text = "This is a test document." 165 | query_result = embeddings.embed_query(text) 166 | print(query_result[:5]) 167 | doc_result = embeddings.embed_documents([text]) 168 | print(doc_result[0][:5]) 169 | ``` 170 | 171 | ## Multimodal API 172 | 173 | **Example Request** 174 | 175 | ```bash 176 | curl $OPENAI_BASE_URL/chat/completions \ 177 | curl $OPENAI_BASE_URL/chat/completions \ 178 | -H "Content-Type: application/json" \ 179 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 180 | -d '{ 181 | "model": "gpt-3.5-turbo", 182 | "messages": [ 183 | { 184 | "role": "user", 185 | "content": [ 186 | { 187 | "type": "text", 188 | "text": "please identify and count all the objects in these images, list all the names" 189 | }, 190 | { 191 | "type": "image_url", 192 | "image_url": { 193 | "url": "https://github.com/aws-samples/bedrock-access-gateway/blob/main/assets/obj-detect.png?raw=true" 194 | } 195 | } 196 | ] 197 | } 198 | ] 199 | }' 200 | ``` 201 | 202 | If you need to use this API with non-public images, you can do base64 the image first and pass the encoded string. 203 | Replace `image/jpeg` with the actual content type. Currently, only 'image/jpeg', 'image/png', 'image/gif' or 'image/webp' is supported. 204 | 205 | ```bash 206 | curl $OPENAI_BASE_URL/chat/completions \ 207 | -H "Content-Type: application/json" \ 208 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 209 | -d '{ 210 | "model": "gpt-3.5-turbo", 211 | "messages": [ 212 | { 213 | "role": "user", 214 | "content": [ 215 | { 216 | "type": "text", 217 | "text": "please identify and count all the objects in this images, list all the names" 218 | }, 219 | { 220 | "type": "image_url", 221 | "image_url": { 222 | "url": "data:image/jpeg;base64," 223 | } 224 | } 225 | ] 226 | } 227 | ] 228 | }' 229 | ``` 230 | 231 | **Example Response** 232 | 233 | ```json 234 | { 235 | "id": "msg_01BY3wcz41x7XrKhxY3VzWke", 236 | "created": 1712543069, 237 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 238 | "system_fingerprint": "fp", 239 | "choices": [ 240 | { 241 | "index": 0, 242 | "finish_reason": "stop", 243 | "message": { 244 | "role": "assistant", 245 | "content": "The image contains the following objects:\n\n1. A peach-colored short-sleeve button-up shirt\n2. An olive green plaid long coat/jacket\n3. A pair of white sneakers or canvas shoes\n4. A brown shoulder bag or purse\n5. A makeup brush or cosmetic applicator\n6. A tube or container (possibly lipstick or lip balm)\n7. A pair of sunglasses\n8. A thought bubble icon\n9. A footprint icon\n10. A leaf or plant icon\n11. A flower icon\n12. A cloud icon\n\nIn total, there are 12 distinct objects depicted in the illustrated scene." 246 | } 247 | } 248 | ], 249 | "object": "chat.completion", 250 | "usage": { 251 | "prompt_tokens": 197, 252 | "completion_tokens": 147, 253 | "total_tokens": 344 254 | } 255 | } 256 | ``` 257 | 258 | 259 | ## Tool Call 260 | 261 | **Important Notice**: Please carefully review the following points before using this Tool Call for Chat completion API. 262 | 263 | 1. Function Call is now deprecated in favor of Tool Call by OpenAI, hence it's not supported here, you should use Tool Call instead. 
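The raw HTTP request and response pair is shown below. For completeness, here is a minimal sketch of the full round trip with the OpenAI Python SDK, including the second call that returns the tool result to the model. The `get_current_weather` implementation is a hypothetical stand-in for your own code, not part of this repository.

```python
import json

from openai import OpenAI

client = OpenAI()

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "The city or state which is required."},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]


def get_current_weather(location, unit="celsius"):
    # Hypothetical local implementation; replace with a real weather lookup.
    return json.dumps({"location": location, "temperature": 22, "unit": unit})


messages = [{"role": "user", "content": "What is the weather like in Shanghai today?"}]

# First call: the model decides whether a tool is needed and returns tool_calls
response = client.chat.completions.create(
    model="anthropic.claude-3-sonnet-20240229-v1:0",
    messages=messages,
    tools=tools,
    tool_choice="auto",
)
assistant_message = response.choices[0].message

if assistant_message.tool_calls:
    # Keep the assistant turn (with its tool_calls) in the conversation history
    messages.append(assistant_message)
    for tool_call in assistant_message.tool_calls:
        args = json.loads(tool_call.function.arguments)
        # Second leg: execute the tool locally and send the result back as a "tool" message
        messages.append(
            {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": get_current_weather(**args),
            }
        )
    final = client.chat.completions.create(
        model="anthropic.claude-3-sonnet-20240229-v1:0",
        messages=messages,
        tools=tools,
    )
    print(final.choices[0].message.content)
else:
    print(assistant_message.content)
```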
264 | 265 | **Example Request** 266 | 267 | ```bash 268 | curl $OPENAI_BASE_URL/chat/completions \ 269 | -H "Content-Type: application/json" \ 270 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 271 | -d '{ 272 | "model": "gpt-3.5-turbo", 273 | "messages": [ 274 | { 275 | "role": "user", 276 | "content": "What is the weather like in Shanghai today?" 277 | } 278 | ], 279 | "tools": [ 280 | { 281 | "type": "function", 282 | "function": { 283 | "name": "get_current_weather", 284 | "description": "Get the current weather in a given location", 285 | "parameters": { 286 | "type": "object", 287 | "properties": { 288 | "location": { 289 | "type": "string", 290 | "description": "The city or state which is required." 291 | }, 292 | "unit": { 293 | "type": "string", 294 | "enum": [ 295 | "celsius", 296 | "fahrenheit" 297 | ] 298 | } 299 | }, 300 | "required": [ 301 | "location" 302 | ] 303 | } 304 | } 305 | }, 306 | { 307 | "type": "function", 308 | "function": { 309 | "name": "get_current_location", 310 | "description": "Use this tool to get the current location if user does not provide a location", 311 | "parameters": { 312 | "type": "object", 313 | "properties": {} 314 | } 315 | } 316 | } 317 | ], 318 | "tool_choice": "auto" 319 | }' 320 | ``` 321 | 322 | **Example Response** 323 | 324 | ```json 325 | { 326 | "id": "msg_01PjrKDWhYGsrTNdeqzWd6D9", 327 | "created": 1712543689, 328 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 329 | "system_fingerprint": "fp", 330 | "choices": [ 331 | { 332 | "index": 0, 333 | "finish_reason": "stop", 334 | "message": { 335 | "role": "assistant", 336 | "tool_calls": [ 337 | { 338 | "id": "0", 339 | "type": "function", 340 | "function": { 341 | "name": "get_current_weather", 342 | "arguments": "{\"location\": \"Shanghai\", \"unit\": \"celsius\"}" 343 | } 344 | } 345 | ] 346 | } 347 | } 348 | ], 349 | "object": "chat.completion", 350 | "usage": { 351 | "prompt_tokens": 256, 352 | "completion_tokens": 64, 353 | "total_tokens": 320 354 | } 355 | } 356 | ``` 357 | 358 | You can try it with different questions, such as: 359 | 1. Hello, who are you? (No tools are needed) 360 | 2. What is the weather like today? (Should use get_current_location tool first) 361 | 362 | 363 | ## Reasoning 364 | 365 | **Important Notice**: Please carefully review the following points before using reasoning mode for Chat completion API. 366 | - Only Claude 3.7 Sonnet (extended thinking) and DeepSeek R1 support Reasoning so far. Please make sure the model supports reasoning before use. 367 | - For Claude 3.7 Sonnet, the reasoning mode (or thinking mode) is not enabled by default, you must pass additional `reasoning_effort` parameter in your request. Please also provide the right max_tokens (or max_completion_tokens) in your request. The budget_tokens is based on reasoning_effort (low: 30%, medium: 60%, high: 100% of max tokens), ensuring minimum budget_tokens of 1,024 with Anthropic recommending at least 4,000 tokens for comprehensive reasoning. Check [Bedrock Document](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-37.html) for more details. 368 | - For DeepSeek R1, you don't need additional reasoning_effort parameter, otherwise, you may get an error. 369 | - The reasoning response (CoT, thoughts) is added in an additional tag 'reasoning_content' which is not officially supported by OpenAI. This is to follow [Deepseek Reasoning Model](https://api-docs.deepseek.com/guides/reasoning_model#api-example). This may be changed in the future. 
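As a rough illustration of the budget mapping described above (an approximation based on the stated percentages, not an exact trace of the proxy's internals):

```python
# Illustrative only: approximate reasoning_effort -> budget_tokens mapping
# (low: 30%, medium: 60%, high: 100% of max tokens, floored at 1,024 tokens).
def approx_budget_tokens(max_tokens, effort):
    ratio = {"low": 0.3, "medium": 0.6, "high": 1.0}[effort]
    return max(1024, int(max_tokens * ratio))


print(approx_budget_tokens(4096, "low"))   # 1228
print(approx_budget_tokens(4096, "high"))  # 4096
```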
370 | 371 | **Example Request** 372 | 373 | - Claude 3.7 Sonnet 374 | 375 | ```bash 376 | curl $OPENAI_BASE_URL/chat/completions \ 377 | -H "Content-Type: application/json" \ 378 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 379 | -d '{ 380 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 381 | "messages": [ 382 | "role": "user", 383 | "content": "which one is bigger, 3.9 or 3.11?" 384 | } 385 | ], 386 | "max_completion_tokens": 4096, 387 | "reasoning_effort": "low", 388 | "stream": false 389 | }' 390 | ``` 391 | 392 | - DeepSeek R1 393 | 394 | ```bash 395 | curl $OPENAI_BASE_URL/chat/completions \ 396 | -H "Content-Type: application/json" \ 397 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 398 | -d '{ 399 | "model": "us.deepseek.r1-v1:0", 400 | "messages": [ 401 | { 402 | "role": "user", 403 | "content": "which one is bigger, 3.9 or 3.11?" 404 | } 405 | ], 406 | "stream": false 407 | }' 408 | ``` 409 | 410 | **Example Response** 411 | 412 | ```json 413 | { 414 | "id": "chatcmpl-83fb7a88", 415 | "created": 1740545278, 416 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 417 | "system_fingerprint": "fp", 418 | "choices": [ 419 | { 420 | "index": 0, 421 | "finish_reason": "stop", 422 | "logprobs": null, 423 | "message": { 424 | "role": "assistant", 425 | "content": "3.9 is bigger than 3.11.\n\nWhen comparing decimal numbers, we need to understand what these numbers actually represent:...", 426 | "reasoning_content": "I need to compare the decimal numbers 3.9 and 3.11.\n\nFor decimal numbers, we first compare the whole number parts, and if they're equal, we compare the decimal parts. \n\nBoth numbers ..." 427 | } 428 | } 429 | ], 430 | "object": "chat.completion", 431 | "usage": { 432 | "prompt_tokens": 51, 433 | "completion_tokens": 565, 434 | "total_tokens": 616 435 | } 436 | } 437 | ``` 438 | 439 | You can also use OpenAI SDK (run `pip3 install -U openai` first ) 440 | 441 | - Non-Streaming 442 | 443 | ```python 444 | from openai import OpenAI 445 | client = OpenAI() 446 | 447 | messages = [{"role": "user", "content": "which one is bigger, 3.9 or 3.11?"}] 448 | response = client.chat.completions.create( 449 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 450 | messages=messages, 451 | reasoning_effort="low", 452 | max_completion_tokens=4096, 453 | ) 454 | 455 | reasoning_content = response.choices[0].message.reasoning_content 456 | content = response.choices[0].message.content 457 | ``` 458 | 459 | - Streaming 460 | 461 | ```python 462 | from openai import OpenAI 463 | client = OpenAI() 464 | 465 | messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] 466 | response = client.chat.completions.create( 467 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 468 | messages=messages, 469 | reasoning_effort="low", 470 | max_completion_tokens=4096, 471 | stream=True, 472 | ) 473 | 474 | reasoning_content = "" 475 | content = "" 476 | 477 | for chunk in response: 478 | if hasattr(chunk.choices[0].delta, 'reasoning_content') and chunk.choices[0].delta.reasoning_content: 479 | reasoning_content += chunk.choices[0].delta.reasoning_content 480 | elif chunk.choices[0].delta.content: 481 | content += chunk.choices[0].delta.content 482 | ``` 483 | 484 | ## Interleaved thinking (beta) 485 | 486 | **Important Notice**: Please carefully review the following points before using reasoning mode for Chat completion API. 
487 | 488 | Extended thinking with tool use in Claude 4 models supports [interleaved thinking](https://docs.aws.amazon.com/bedrock/latest/userguide/claude-messages-extended-thinking.html#claude-messages-extended-thinking-tool-use-interleaved), which enables Claude 4 models to think between tool calls and run more sophisticated reasoning after receiving tool results. This is helpful for more complex agentic interactions. 489 | With interleaved thinking, the `budget_tokens` can exceed the `max_tokens` parameter because it represents the total budget across all thinking blocks within one assistant turn. 490 | 491 | **Supported Models**: Claude Sonnet 4, Claude Sonnet 4.5 492 | 493 | **Example Request** 494 | 495 | - Non-Streaming (Claude Sonnet 4.5) 496 | 497 | ```bash 498 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 499 | -H "Content-Type: application/json" \ 500 | -H "Authorization: Bearer bedrock" \ 501 | -d '{ 502 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 503 | "max_tokens": 2048, 504 | "messages": [{ 505 | "role": "user", 506 | "content": "Explain how to implement a binary search tree with self-balancing capabilities." 507 | }], 508 | "extra_body": { 509 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 510 | "thinking": {"type": "enabled", "budget_tokens": 4096} 511 | } 512 | }' 513 | ``` 514 | 515 | - Non-Streaming (Claude Sonnet 4) 516 | 517 | ```bash 518 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 519 | -H "Content-Type: application/json" \ 520 | -H "Authorization: Bearer bedrock" \ 521 | -d '{ 522 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 523 | "max_tokens": 2048, 524 | "messages": [{ 525 | "role": "user", 526 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 527 | }], 528 | "extra_body": { 529 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 530 | "thinking": {"type": "enabled", "budget_tokens": 4096} 531 | } 532 | }' 533 | ``` 534 | 535 | - Streaming (Claude Sonnet 4.5) 536 | 537 | ```bash 538 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 539 | -H "Content-Type: application/json" \ 540 | -H "Authorization: Bearer bedrock" \ 541 | -d '{ 542 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 543 | "max_tokens": 2048, 544 | "messages": [{ 545 | "role": "user", 546 | "content": "Explain how to implement a binary search tree with self-balancing capabilities." 547 | }], 548 | "stream": true, 549 | "extra_body": { 550 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 551 | "thinking": {"type": "enabled", "budget_tokens": 4096} 552 | } 553 | }' 554 | ``` 555 | 556 | - Streaming (Claude Sonnet 4) 557 | 558 | ```bash 559 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 560 | -H "Content-Type: application/json" \ 561 | -H "Authorization: Bearer bedrock" \ 562 | -d '{ 563 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 564 | "max_tokens": 2048, 565 | "messages": [{ 566 | "role": "user", 567 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 
568 | }], 569 | "stream": true, 570 | "extra_body": { 571 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 572 | "thinking": {"type": "enabled", "budget_tokens": 4096} 573 | } 574 | }' 575 | ``` 576 | -------------------------------------------------------------------------------- /src/api/models/bedrock.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import logging 4 | import re 5 | import time 6 | from abc import ABC 7 | from typing import AsyncIterable, Iterable, Literal 8 | 9 | import boto3 10 | import numpy as np 11 | import requests 12 | import tiktoken 13 | from botocore.config import Config 14 | from fastapi import HTTPException 15 | from starlette.concurrency import run_in_threadpool 16 | 17 | from api.models.base import BaseChatModel, BaseEmbeddingsModel 18 | from api.schema import ( 19 | AssistantMessage, 20 | ChatRequest, 21 | ChatResponse, 22 | ChatResponseMessage, 23 | ChatStreamResponse, 24 | Choice, 25 | ChoiceDelta, 26 | CompletionTokensDetails, 27 | DeveloperMessage, 28 | Embedding, 29 | EmbeddingsRequest, 30 | EmbeddingsResponse, 31 | EmbeddingsUsage, 32 | Error, 33 | ErrorMessage, 34 | Function, 35 | ImageContent, 36 | PromptTokensDetails, 37 | ResponseFunction, 38 | TextContent, 39 | ToolCall, 40 | ToolContent, 41 | ToolMessage, 42 | Usage, 43 | UserMessage, 44 | ) 45 | from api.setting import ( 46 | AWS_REGION, 47 | DEBUG, 48 | DEFAULT_MODEL, 49 | ENABLE_CROSS_REGION_INFERENCE, 50 | ENABLE_APPLICATION_INFERENCE_PROFILES, 51 | ENABLE_PROMPT_CACHING, 52 | ) 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | config = Config( 57 | connect_timeout=60, # Connection timeout: 60 seconds 58 | read_timeout=900, # Read timeout: 15 minutes (suitable for long streaming responses) 59 | retries={ 60 | 'max_attempts': 8, # Maximum retry attempts 61 | 'mode': 'adaptive' # Adaptive retry mode 62 | }, 63 | max_pool_connections=50 # Maximum connection pool size 64 | ) 65 | 66 | bedrock_runtime = boto3.client( 67 | service_name="bedrock-runtime", 68 | region_name=AWS_REGION, 69 | config=config, 70 | ) 71 | bedrock_client = boto3.client( 72 | service_name="bedrock", 73 | region_name=AWS_REGION, 74 | config=config, 75 | ) 76 | 77 | SUPPORTED_BEDROCK_EMBEDDING_MODELS = { 78 | "cohere.embed-multilingual-v3": "Cohere Embed Multilingual", 79 | "cohere.embed-english-v3": "Cohere Embed English", 80 | "amazon.titan-embed-text-v1": "Titan Embeddings G1 - Text", 81 | "amazon.titan-embed-text-v2:0": "Titan Embeddings G2 - Text", 82 | # Disable Titan embedding. 83 | # "amazon.titan-embed-image-v1": "Titan Multimodal Embeddings G1" 84 | } 85 | 86 | ENCODER = tiktoken.get_encoding("cl100k_base") 87 | 88 | # Global mapping: Profile ID/ARN → Foundation Model ID 89 | # Handles both SYSTEM_DEFINED (cross-region) and APPLICATION profiles 90 | # This enables feature detection for all profile types without pattern matching 91 | profile_metadata = {} 92 | 93 | # Models that don't support both temperature and topP simultaneously 94 | # When both are provided, temperature takes precedence and topP is removed 95 | TEMPERATURE_TOPP_CONFLICT_MODELS = { 96 | "claude-sonnet-4-5", 97 | "claude-haiku-4-5", 98 | "claude-opus-4-5", 99 | } 100 | 101 | 102 | def list_bedrock_models() -> dict: 103 | """Automatically getting a list of supported models. 104 | 105 | Returns a model list combines: 106 | - ON_DEMAND models. 
107 | - Cross-Region Inference Profiles (if enabled via Env) 108 | - Application Inference Profiles (if enabled via Env) 109 | """ 110 | model_list = {} 111 | try: 112 | if ENABLE_CROSS_REGION_INFERENCE: 113 | # List system defined inference profile IDs and store underlying model mapping 114 | paginator = bedrock_client.get_paginator('list_inference_profiles') 115 | for page in paginator.paginate(maxResults=1000, typeEquals="SYSTEM_DEFINED"): 116 | for profile in page["inferenceProfileSummaries"]: 117 | profile_id = profile.get("inferenceProfileId") 118 | if not profile_id: 119 | continue 120 | 121 | # Extract underlying model from first model in the profile 122 | models = profile.get("models", []) 123 | if models: 124 | model_arn = models[0].get("modelArn", "") 125 | if model_arn: 126 | # Extract foundation model ID from ARN 127 | model_id = model_arn.split('/')[-1] 128 | profile_metadata[profile_id] = { 129 | "underlying_model_id": model_id, 130 | "profile_type": "SYSTEM_DEFINED", 131 | } 132 | 133 | if ENABLE_APPLICATION_INFERENCE_PROFILES: 134 | # List application defined inference profile IDs and create mapping 135 | paginator = bedrock_client.get_paginator('list_inference_profiles') 136 | for page in paginator.paginate(maxResults=1000, typeEquals="APPLICATION"): 137 | for profile in page["inferenceProfileSummaries"]: 138 | try: 139 | profile_arn = profile.get("inferenceProfileArn") 140 | if not profile_arn: 141 | continue 142 | 143 | # Process all models in the profile 144 | models = profile.get("models", []) 145 | if not models: 146 | logger.warning(f"Application profile {profile_arn} has no models") 147 | continue 148 | 149 | # Take first model - all models in array are same type (regional instances) 150 | first_model = models[0] 151 | model_arn = first_model.get("modelArn", "") 152 | if not model_arn: 153 | continue 154 | 155 | # Extract model ID from ARN (works for both foundation models and cross-region profiles) 156 | model_id = model_arn.split('/')[-1] if '/' in model_arn else model_arn 157 | 158 | # Store in unified profile metadata for feature detection 159 | profile_metadata[profile_arn] = { 160 | "underlying_model_id": model_id, 161 | "profile_type": "APPLICATION", 162 | "profile_name": profile.get("inferenceProfileName", ""), 163 | } 164 | except Exception as e: 165 | logger.warning(f"Error processing application profile: {e}") 166 | continue 167 | 168 | # List foundation models, only cares about text outputs here. 
169 | response = bedrock_client.list_foundation_models(byOutputModality="TEXT") 170 | 171 | for model in response["modelSummaries"]: 172 | model_id = model.get("modelId", "N/A") 173 | stream_supported = model.get("responseStreamingSupported", True) 174 | status = model["modelLifecycle"].get("status", "ACTIVE") 175 | 176 | # currently, use this to filter out rerank models and legacy models 177 | if not stream_supported or status not in ["ACTIVE", "LEGACY"]: 178 | continue 179 | 180 | inference_types = model.get("inferenceTypesSupported", []) 181 | input_modalities = model["inputModalities"] 182 | # Add on-demand model list 183 | if "ON_DEMAND" in inference_types: 184 | model_list[model_id] = {"modalities": input_modalities} 185 | 186 | # Add all inference profiles (cross-region and application) for this model 187 | for profile_id, metadata in profile_metadata.items(): 188 | if metadata.get("underlying_model_id") == model_id: 189 | model_list[profile_id] = {"modalities": input_modalities} 190 | 191 | except Exception as e: 192 | logger.error(f"Unable to list models: {str(e)}") 193 | 194 | if not model_list: 195 | # In case stack not updated. 196 | model_list[DEFAULT_MODEL] = {"modalities": ["TEXT", "IMAGE"]} 197 | 198 | return model_list 199 | 200 | 201 | # Initialize the model list. 202 | bedrock_model_list = list_bedrock_models() 203 | 204 | 205 | class BedrockModel(BaseChatModel): 206 | def list_models(self) -> list[str]: 207 | """Always refresh the latest model list""" 208 | global bedrock_model_list 209 | bedrock_model_list = list_bedrock_models() 210 | return list(bedrock_model_list.keys()) 211 | 212 | def validate(self, chat_request: ChatRequest): 213 | """Perform basic validation on requests""" 214 | error = "" 215 | # check if model is supported 216 | if chat_request.model not in bedrock_model_list.keys(): 217 | # Provide helpful error for application profiles 218 | if "application-inference-profile" in chat_request.model: 219 | error = ( 220 | f"Application profile {chat_request.model} not found. " 221 | f"Available profiles can be listed via GET /models API. " 222 | f"Ensure ENABLE_APPLICATION_INFERENCE_PROFILES=true and " 223 | f"the profile exists in your AWS account." 224 | ) 225 | else: 226 | error = f"Unsupported model {chat_request.model}, please use models API to get a list of supported models" 227 | logger.error("Unsupported model: %s", chat_request.model) 228 | 229 | # Validate profile has resolvable underlying model 230 | if not error and chat_request.model in profile_metadata: 231 | resolved = self._resolve_to_foundation_model(chat_request.model) 232 | if resolved == chat_request.model: 233 | logger.warning( 234 | f"Could not resolve profile {chat_request.model} " 235 | f"to underlying model. Some features may not work correctly." 236 | ) 237 | 238 | if error: 239 | raise HTTPException( 240 | status_code=400, 241 | detail=error, 242 | ) 243 | 244 | def _resolve_to_foundation_model(self, model_id: str) -> str: 245 | """ 246 | Resolve any model identifier to foundation model ID for feature detection. 247 | 248 | Handles: 249 | - Cross-region profiles (us.*, eu.*, apac.*, global.*) 250 | - Application profiles (arn:aws:bedrock:...:application-inference-profile/...) 251 | - Foundation models (pass through unchanged) 252 | 253 | No pattern matching needed - just dictionary lookup. 254 | Unknown identifiers pass through unchanged (graceful fallback). 
255 | 256 | Args: 257 | model_id: Can be foundation model ID, cross-region profile, or app profile ARN 258 | 259 | Returns: 260 | Foundation model ID if mapping exists, otherwise original model_id 261 | """ 262 | if model_id in profile_metadata: 263 | return profile_metadata[model_id]["underlying_model_id"] 264 | return model_id 265 | 266 | def _supports_prompt_caching(self, model_id: str) -> bool: 267 | """ 268 | Check if model supports prompt caching based on model ID pattern. 269 | 270 | Uses pattern matching instead of hardcoded whitelist for better maintainability. 271 | Automatically supports new models following the naming convention. 272 | 273 | Supported models: 274 | - Claude: anthropic.claude-* (excluding very old versions) 275 | - Nova: amazon.nova-* 276 | 277 | Returns: 278 | bool: True if model supports prompt caching 279 | """ 280 | # Resolve profile to underlying model for feature detection 281 | resolved_model = self._resolve_to_foundation_model(model_id) 282 | model_lower = resolved_model.lower() 283 | 284 | # Claude models pattern matching 285 | if "anthropic.claude" in model_lower: 286 | # Exclude very old models that don't support caching 287 | excluded_patterns = ["claude-instant", "claude-v1", "claude-v2"] 288 | if any(pattern in model_lower for pattern in excluded_patterns): 289 | return False 290 | return True 291 | 292 | # Nova models pattern matching 293 | if "amazon.nova" in model_lower: 294 | return True 295 | 296 | # Future providers can be added here 297 | # Example: if "provider.model-name" in model_lower: return True 298 | 299 | return False 300 | 301 | def _get_max_cache_tokens(self, model_id: str) -> int | None: 302 | """ 303 | Get maximum cacheable tokens limit for the model. 304 | 305 | Different models have different caching limits: 306 | - Claude: No explicit limit mentioned in docs 307 | - Nova: 20,000 tokens max 308 | 309 | Returns: 310 | int | None: Max tokens, or None if unlimited 311 | """ 312 | # Resolve profile to underlying model for feature detection 313 | resolved_model = self._resolve_to_foundation_model(model_id) 314 | model_lower = resolved_model.lower() 315 | 316 | # Nova models have 20K limit 317 | if "amazon.nova" in model_lower: 318 | return 20_000 319 | 320 | # Claude: No explicit limit 321 | if "anthropic.claude" in model_lower: 322 | return None 323 | 324 | return None 325 | 326 | async def _invoke_bedrock(self, chat_request: ChatRequest, stream=False): 327 | """Common logic for invoke bedrock models""" 328 | if DEBUG: 329 | logger.info("Raw request: " + chat_request.model_dump_json()) 330 | 331 | # Log profile resolution for debugging 332 | if chat_request.model in profile_metadata: 333 | resolved = self._resolve_to_foundation_model(chat_request.model) 334 | profile_type = profile_metadata[chat_request.model].get("profile_type", "UNKNOWN") 335 | logger.info( 336 | f"Profile resolution: {chat_request.model} ({profile_type}) → {resolved}" 337 | ) 338 | 339 | # convert OpenAI chat request to Bedrock SDK request 340 | args = self._parse_request(chat_request) 341 | if DEBUG: 342 | logger.info("Bedrock request: " + json.dumps(str(args))) 343 | 344 | try: 345 | if stream: 346 | # Run the blocking boto3 call in a thread pool 347 | response = await run_in_threadpool( 348 | bedrock_runtime.converse_stream, **args 349 | ) 350 | else: 351 | # Run the blocking boto3 call in a thread pool 352 | response = await run_in_threadpool(bedrock_runtime.converse, **args) 353 | except bedrock_runtime.exceptions.ValidationException as e: 354 | 
logger.error("Bedrock validation error for model %s: %s", chat_request.model, str(e)) 355 | raise HTTPException(status_code=400, detail=str(e)) 356 | except bedrock_runtime.exceptions.ThrottlingException as e: 357 | logger.warning("Bedrock throttling for model %s: %s", chat_request.model, str(e)) 358 | raise HTTPException(status_code=429, detail=str(e)) 359 | except Exception as e: 360 | logger.error("Bedrock invocation failed for model %s: %s", chat_request.model, str(e)) 361 | raise HTTPException(status_code=500, detail=str(e)) 362 | return response 363 | 364 | async def chat(self, chat_request: ChatRequest) -> ChatResponse: 365 | """Default implementation for Chat API.""" 366 | 367 | message_id = self.generate_message_id() 368 | response = await self._invoke_bedrock(chat_request) 369 | 370 | output_message = response["output"]["message"] 371 | usage = response["usage"] 372 | 373 | # Extract all token counts 374 | output_tokens = usage["outputTokens"] 375 | total_tokens = usage["totalTokens"] 376 | finish_reason = response["stopReason"] 377 | 378 | # Extract prompt caching metrics if available 379 | cache_read_tokens = usage.get("cacheReadInputTokens", 0) 380 | cache_creation_tokens = usage.get("cacheWriteInputTokens", 0) 381 | 382 | # Calculate actual prompt tokens 383 | # Bedrock's totalTokens includes all: inputTokens + cacheRead + cacheWrite + outputTokens 384 | # So: prompt_tokens = totalTokens - outputTokens 385 | actual_prompt_tokens = total_tokens - output_tokens 386 | 387 | chat_response = self._create_response( 388 | model=chat_request.model, 389 | message_id=message_id, 390 | content=output_message["content"], 391 | finish_reason=finish_reason, 392 | input_tokens=actual_prompt_tokens, 393 | output_tokens=output_tokens, 394 | total_tokens=total_tokens, 395 | cache_read_tokens=cache_read_tokens, 396 | cache_creation_tokens=cache_creation_tokens, 397 | ) 398 | if DEBUG: 399 | logger.info("Proxy response :" + chat_response.model_dump_json()) 400 | return chat_response 401 | 402 | async def _async_iterate(self, stream): 403 | """Helper method to convert sync iterator to async iterator""" 404 | for chunk in stream: 405 | await run_in_threadpool(lambda: chunk) 406 | yield chunk 407 | 408 | async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]: 409 | """Default implementation for Chat Stream API""" 410 | try: 411 | response = await self._invoke_bedrock(chat_request, stream=True) 412 | message_id = self.generate_message_id() 413 | stream = response.get("stream") 414 | self.think_emitted = False 415 | async for chunk in self._async_iterate(stream): 416 | args = {"model_id": chat_request.model, "message_id": message_id, "chunk": chunk} 417 | stream_response = self._create_response_stream(**args) 418 | if not stream_response: 419 | continue 420 | if DEBUG: 421 | logger.info("Proxy response :" + stream_response.model_dump_json()) 422 | if stream_response.choices: 423 | yield self.stream_response_to_bytes(stream_response) 424 | elif chat_request.stream_options and chat_request.stream_options.include_usage: 425 | # An empty choices for Usage as per OpenAI doc below: 426 | # if you set stream_options: {"include_usage": true}. 427 | # an additional chunk will be streamed before the data: [DONE] message. 428 | # The usage field on this chunk shows the token usage statistics for the entire request, 429 | # and the choices field will always be an empty array. 430 | # All other chunks will also include a usage field, but with a null value. 
431 | yield self.stream_response_to_bytes(stream_response) 432 | 433 | # return an [DONE] message at the end. 434 | yield self.stream_response_to_bytes() 435 | self.think_emitted = False # Cleanup 436 | except Exception as e: 437 | logger.error("Stream error for model %s: %s", chat_request.model, str(e)) 438 | error_event = Error(error=ErrorMessage(message=str(e))) 439 | yield self.stream_response_to_bytes(error_event) 440 | 441 | def _parse_system_prompts(self, chat_request: ChatRequest) -> list[dict[str, str]]: 442 | """Create system prompts with optional prompt caching support. 443 | 444 | Prompt caching can be enabled via: 445 | 1. ENABLE_PROMPT_CACHING environment variable (global default) 446 | 2. extra_body.prompt_caching.system = True/False (per-request override) 447 | 448 | Only adds cachePoint if: 449 | - Model supports caching (Claude, Nova) 450 | - Caching is enabled (ENV or extra_body) 451 | - System prompts exist and meet minimum token requirements 452 | 453 | Example output: [{"text" : system_prompt}, {"cachePoint": {"type": "default"}}] 454 | 455 | See: https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html 456 | """ 457 | system_prompts = [] 458 | for message in chat_request.messages: 459 | if message.role not in ("system", "developer"): 460 | continue 461 | if not isinstance(message.content, str): 462 | raise TypeError(f"System message content must be a string, got {type(message.content).__name__}") 463 | system_prompts.append({"text": message.content}) 464 | 465 | if not system_prompts: 466 | return system_prompts 467 | 468 | # Check if model supports prompt caching 469 | if not self._supports_prompt_caching(chat_request.model): 470 | return system_prompts 471 | 472 | # Determine if caching should be enabled 473 | cache_enabled = ENABLE_PROMPT_CACHING # Default from ENV 474 | 475 | # Check for extra_body override 476 | if chat_request.extra_body and isinstance(chat_request.extra_body, dict): 477 | prompt_caching = chat_request.extra_body.get("prompt_caching", {}) 478 | if "system" in prompt_caching: 479 | # extra_body explicitly controls caching 480 | cache_enabled = prompt_caching.get("system") is True 481 | 482 | if not cache_enabled: 483 | return system_prompts 484 | 485 | # Estimate total tokens for limit check 486 | total_text = " ".join(p.get("text", "") for p in system_prompts) 487 | estimated_tokens = len(total_text.split()) * 1.3 # Rough estimate 488 | 489 | # Check token limits (Nova has 20K limit) 490 | max_tokens = self._get_max_cache_tokens(chat_request.model) 491 | if max_tokens and estimated_tokens > max_tokens: 492 | logger.warning( 493 | f"System prompts (~{estimated_tokens:.0f} tokens) exceed model cache limit ({max_tokens} tokens). " 494 | f"Caching will still be attempted but may not work optimally." 495 | ) 496 | # Still add cachePoint - let Bedrock handle the limit 497 | 498 | # Add cache checkpoint after system prompts 499 | system_prompts.append({"cachePoint": {"type": "default"}}) 500 | 501 | if DEBUG: 502 | logger.info(f"Added cachePoint to system prompts for model {chat_request.model}") 503 | 504 | return system_prompts 505 | 506 | def _parse_messages(self, chat_request: ChatRequest) -> list[dict]: 507 | """ 508 | Converse API only support user and assistant messages. 
509 | 510 | example output: [{ 511 | "role": "user", 512 | "content": [{"text": input_text}] 513 | }] 514 | 515 | See example: 516 | https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference.html#message-inference-examples 517 | """ 518 | messages = [] 519 | for message in chat_request.messages: 520 | if isinstance(message, UserMessage): 521 | messages.append( 522 | { 523 | "role": message.role, 524 | "content": self._parse_content_parts( 525 | message, chat_request.model 526 | ), 527 | } 528 | ) 529 | elif isinstance(message, AssistantMessage): 530 | # Check if message has content that's not empty 531 | has_content = False 532 | if isinstance(message.content, str): 533 | has_content = message.content.strip() != "" 534 | elif isinstance(message.content, list): 535 | has_content = len(message.content) > 0 536 | elif message.content is not None: 537 | has_content = True 538 | 539 | if has_content: 540 | # Text message 541 | messages.append( 542 | { 543 | "role": message.role, 544 | "content": self._parse_content_parts( 545 | message, chat_request.model 546 | ), 547 | } 548 | ) 549 | if message.tool_calls: 550 | # Tool use message 551 | for tool_call in message.tool_calls: 552 | tool_input = json.loads(tool_call.function.arguments) 553 | messages.append( 554 | { 555 | "role": message.role, 556 | "content": [ 557 | { 558 | "toolUse": { 559 | "toolUseId": tool_call.id, 560 | "name": tool_call.function.name, 561 | "input": tool_input, 562 | } 563 | } 564 | ], 565 | } 566 | ) 567 | elif isinstance(message, ToolMessage): 568 | # Bedrock does not support tool role, 569 | # Add toolResult to content 570 | # https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ToolResultBlock.html 571 | 572 | # Handle different content formats from OpenAI SDK 573 | tool_content = self._extract_tool_content(message.content) 574 | 575 | messages.append( 576 | { 577 | "role": "user", 578 | "content": [ 579 | { 580 | "toolResult": { 581 | "toolUseId": message.tool_call_id, 582 | "content": [{"text": tool_content}], 583 | } 584 | } 585 | ], 586 | } 587 | ) 588 | 589 | else: 590 | # ignore others, such as system messages 591 | continue 592 | return self._reframe_multi_payloard(messages, chat_request) 593 | 594 | def _extract_tool_content(self, content) -> str: 595 | """Extract text content from various OpenAI SDK tool message formats. 
596 | 597 | Handles: 598 | - String content (legacy format) 599 | - List of content objects (OpenAI SDK 1.91.0+) 600 | - Nested JSON structures within text content 601 | """ 602 | try: 603 | if isinstance(content, str): 604 | return content 605 | 606 | if isinstance(content, list): 607 | text_parts = [] 608 | for i, item in enumerate(content): 609 | if isinstance(item, dict): 610 | # Handle dict with 'text' field 611 | if "text" in item: 612 | item_text = item["text"] 613 | if isinstance(item_text, str): 614 | # Try to parse as JSON if it looks like JSON 615 | if item_text.strip().startswith('{') and item_text.strip().endswith('}'): 616 | try: 617 | parsed_json = json.loads(item_text) 618 | # Convert JSON object to readable text 619 | text_parts.append(json.dumps(parsed_json, indent=2)) 620 | except json.JSONDecodeError: 621 | # Silently fallback to original text 622 | text_parts.append(item_text) 623 | else: 624 | text_parts.append(item_text) 625 | else: 626 | text_parts.append(str(item_text)) 627 | else: 628 | # Handle other dict formats - convert to JSON string 629 | text_parts.append(json.dumps(item, indent=2)) 630 | elif hasattr(item, 'text'): 631 | # Handle ToolContent objects 632 | text_parts.append(item.text) 633 | else: 634 | # Convert any other type to string 635 | text_parts.append(str(item)) 636 | return "\n".join(text_parts) 637 | 638 | # Fallback for any other type 639 | return str(content) 640 | except Exception as e: 641 | logger.warning("Tool content extraction failed: %s", str(e)) 642 | # Return a safe fallback 643 | return str(content) if content is not None else "" 644 | 645 | def _reframe_multi_payloard(self, messages: list, chat_request: ChatRequest = None) -> list: 646 | """Receive messages and reformat them to comply with the Claude format 647 | 648 | With OpenAI format requests, it's not a problem to repeatedly receive messages from the same role, but 649 | with Claude format requests, you cannot repeatedly receive messages from the same role. 650 | 651 | This method searches through the OpenAI format messages in order and reformats them to the Claude format. 
652 | 653 | ``` 654 | openai_format_messages=[ 655 | {"role": "user", "content": "Hello"}, 656 | {"role": "user", "content": "Who are you?"}, 657 | ] 658 | 659 | bedrock_format_messages=[ 660 | { 661 | "role": "user", 662 | "content": [ 663 | {"text": "Hello"}, 664 | {"text": "Who are you?"} 665 | ] 666 | }, 667 | ] 668 | """ 669 | reformatted_messages = [] 670 | current_role = None 671 | current_content = [] 672 | 673 | # Search through the list of messages and combine messages from the same role into one list 674 | for message in messages: 675 | next_role = message["role"] 676 | next_content = message["content"] 677 | 678 | # If the next role is different from the previous message, add the previous role's messages to the list 679 | if next_role != current_role: 680 | if current_content: 681 | reformatted_messages.append( 682 | {"role": current_role, "content": current_content} 683 | ) 684 | # Switch to the new role 685 | current_role = next_role 686 | current_content = [] 687 | 688 | # Add the message content to current_content 689 | if isinstance(next_content, str): 690 | current_content.append({"text": next_content}) 691 | elif isinstance(next_content, list): 692 | current_content.extend(next_content) 693 | 694 | # Add the last role's messages to the list 695 | if current_content: 696 | reformatted_messages.append( 697 | {"role": current_role, "content": current_content} 698 | ) 699 | 700 | # Add cachePoint to messages if enabled and supported 701 | if chat_request and reformatted_messages: 702 | if not self._supports_prompt_caching(chat_request.model): 703 | return reformatted_messages 704 | 705 | # Determine if messages caching should be enabled 706 | cache_enabled = ENABLE_PROMPT_CACHING 707 | 708 | if chat_request.extra_body and isinstance(chat_request.extra_body, dict): 709 | prompt_caching = chat_request.extra_body.get("prompt_caching", {}) 710 | if "messages" in prompt_caching: 711 | cache_enabled = prompt_caching.get("messages") is True 712 | 713 | if cache_enabled: 714 | # Add cachePoint to the last user message content 715 | for msg in reversed(reformatted_messages): 716 | if msg["role"] == "user" and msg.get("content"): 717 | # Add cachePoint at the end of user message content 718 | msg["content"].append({"cachePoint": {"type": "default"}}) 719 | if DEBUG: 720 | logger.info(f"Added cachePoint to last user message for model {chat_request.model}") 721 | break 722 | 723 | return reformatted_messages 724 | 725 | def _parse_request(self, chat_request: ChatRequest) -> dict: 726 | """Create default converse request body. 727 | 728 | Also perform validations to tool call etc. 729 | 730 | Ref: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html 731 | """ 732 | messages = self._parse_messages(chat_request) 733 | system_prompts = self._parse_system_prompts(chat_request) 734 | 735 | # Base inference parameters. 
736 | inference_config = { 737 | "maxTokens": chat_request.max_tokens, 738 | } 739 | 740 | # Only include optional parameters when specified 741 | if chat_request.temperature is not None: 742 | inference_config["temperature"] = chat_request.temperature 743 | if chat_request.top_p is not None: 744 | inference_config["topP"] = chat_request.top_p 745 | 746 | # Some models (Claude Sonnet 4.5, Haiku 4.5) don't support both temperature and topP 747 | # When both are provided, keep temperature and remove topP 748 | # Resolve profile to underlying model for feature detection 749 | resolved_model = self._resolve_to_foundation_model(chat_request.model) 750 | model_lower = resolved_model.lower() 751 | 752 | # Check if model is in the conflict list and both parameters are present 753 | if "temperature" in inference_config and "topP" in inference_config: 754 | if any(conflict_model in model_lower for conflict_model in TEMPERATURE_TOPP_CONFLICT_MODELS): 755 | inference_config.pop("topP", None) 756 | if DEBUG: 757 | logger.info(f"Removed topP for {chat_request.model} (conflicts with temperature)") 758 | 759 | if chat_request.stop is not None: 760 | stop = chat_request.stop 761 | if isinstance(stop, str): 762 | stop = [stop] 763 | inference_config["stopSequences"] = stop 764 | 765 | args = { 766 | "modelId": chat_request.model, 767 | "messages": messages, 768 | "system": system_prompts, 769 | "inferenceConfig": inference_config, 770 | } 771 | if chat_request.reasoning_effort: 772 | # reasoning_effort is supported by Claude and DeepSeek v3 773 | # Different models use different formats 774 | # Resolve profile to underlying model for feature detection 775 | resolved_model = self._resolve_to_foundation_model(chat_request.model) 776 | model_lower = resolved_model.lower() 777 | 778 | if "anthropic.claude" in model_lower: 779 | # Claude format: reasoning_config = object with budget_tokens 780 | max_tokens = ( 781 | chat_request.max_completion_tokens 782 | if chat_request.max_completion_tokens 783 | else chat_request.max_tokens 784 | ) 785 | budget_tokens = self._calc_budget_tokens( 786 | max_tokens, chat_request.reasoning_effort 787 | ) 788 | inference_config["maxTokens"] = max_tokens 789 | # unset topP - Not supported 790 | inference_config.pop("topP", None) 791 | 792 | args["additionalModelRequestFields"] = { 793 | "reasoning_config": {"type": "enabled", "budget_tokens": budget_tokens} 794 | } 795 | elif "deepseek.v3" in model_lower or "deepseek.deepseek-v3" in model_lower: 796 | # DeepSeek v3 format: reasoning_config = string ('low', 'medium', 'high') 797 | # From Bedrock Playground: {"reasoning_config": "high"} 798 | args["additionalModelRequestFields"] = { 799 | "reasoning_config": chat_request.reasoning_effort # Direct string: low/medium/high 800 | } 801 | if DEBUG: 802 | logger.info(f"Applied reasoning_config={chat_request.reasoning_effort} for DeepSeek v3") 803 | else: 804 | # For other models (Qwen, etc.), ignore reasoning_effort parameter 805 | if DEBUG: 806 | logger.info(f"reasoning_effort parameter ignored for model {chat_request.model} (not supported)") 807 | # add tool config 808 | if chat_request.tools: 809 | tool_config = {"tools": [self._convert_tool_spec(t.function) for t in chat_request.tools]} 810 | 811 | if chat_request.tool_choice and not chat_request.model.startswith( 812 | "meta.llama3-1-" 813 | ): 814 | if isinstance(chat_request.tool_choice, str): 815 | # auto (default) is mapped to {"auto" : {}} 816 | # required is mapped to {"any" : {}} 817 | if chat_request.tool_choice == 
"required": 818 | tool_config["toolChoice"] = {"any": {}} 819 | else: 820 | tool_config["toolChoice"] = {"auto": {}} 821 | else: 822 | # Specific tool to use 823 | if "function" not in chat_request.tool_choice: 824 | raise ValueError("tool_choice must contain 'function' key when specifying a specific tool") 825 | tool_config["toolChoice"] = {"tool": {"name": chat_request.tool_choice["function"].get("name", "")}} 826 | args["toolConfig"] = tool_config 827 | # Add additional fields to enable extend thinking or other model-specific features 828 | if chat_request.extra_body: 829 | # Filter out prompt_caching (our control field, not for Bedrock) 830 | additional_fields = { 831 | k: v for k, v in chat_request.extra_body.items() 832 | if k != "prompt_caching" 833 | } 834 | 835 | if additional_fields: 836 | # Only set additionalModelRequestFields if there are actual fields to pass 837 | args["additionalModelRequestFields"] = additional_fields 838 | 839 | # Extended thinking doesn't support both temperature and topP 840 | # Remove topP to avoid validation error 841 | if "thinking" in additional_fields: 842 | inference_config.pop("topP", None) 843 | 844 | return args 845 | 846 | def _estimate_reasoning_tokens(self, content: list[dict]) -> int: 847 | """ 848 | Estimate reasoning tokens from reasoningContent blocks. 849 | 850 | Bedrock doesn't separately report reasoning tokens, so we estimate 851 | them using tiktoken to maintain OpenAI API compatibility. 852 | """ 853 | reasoning_text = "" 854 | for block in content: 855 | if "reasoningContent" in block: 856 | reasoning_text += block["reasoningContent"]["reasoningText"].get("text", "") 857 | 858 | if reasoning_text: 859 | # Use tiktoken to estimate token count 860 | return len(ENCODER.encode(reasoning_text)) 861 | return 0 862 | 863 | def _create_response( 864 | self, 865 | model: str, 866 | message_id: str, 867 | content: list[dict] | None = None, 868 | finish_reason: str | None = None, 869 | input_tokens: int = 0, 870 | output_tokens: int = 0, 871 | total_tokens: int = 0, 872 | cache_read_tokens: int = 0, 873 | cache_creation_tokens: int = 0, 874 | ) -> ChatResponse: 875 | message = ChatResponseMessage( 876 | role="assistant", 877 | ) 878 | if finish_reason == "tool_use": 879 | # https://docs.aws.amazon.com/bedrock/latest/userguide/tool-use.html#tool-use-examples 880 | tool_calls = [] 881 | for part in content: 882 | if "toolUse" in part: 883 | tool = part["toolUse"] 884 | tool_calls.append( 885 | ToolCall( 886 | id=tool["toolUseId"], 887 | type="function", 888 | function=ResponseFunction( 889 | name=tool["name"], 890 | arguments=json.dumps(tool["input"]), 891 | ), 892 | ) 893 | ) 894 | message.tool_calls = tool_calls 895 | message.content = None 896 | else: 897 | message.content = "" 898 | for c in content: 899 | if "reasoningContent" in c: 900 | message.reasoning_content = c["reasoningContent"][ 901 | "reasoningText" 902 | ].get("text", "") 903 | elif "text" in c: 904 | message.content = c["text"] 905 | else: 906 | logger.warning( 907 | "Unknown tag in message content " + ",".join(c.keys()) 908 | ) 909 | if message.reasoning_content: 910 | message.content = f"{message.reasoning_content}{message.content}" 911 | message.reasoning_content = None 912 | 913 | # Create prompt_tokens_details if cache metrics are available 914 | prompt_tokens_details = None 915 | if cache_read_tokens > 0 or cache_creation_tokens > 0: 916 | # Map Bedrock cache metrics to OpenAI format 917 | # cached_tokens represents tokens read from cache (cache hits) 918 | 
prompt_tokens_details = PromptTokensDetails( 919 | cached_tokens=cache_read_tokens, 920 | audio_tokens=0, 921 | ) 922 | 923 | # Create completion_tokens_details if reasoning content exists 924 | completion_tokens_details = None 925 | reasoning_tokens = self._estimate_reasoning_tokens(content) if content else 0 926 | if reasoning_tokens > 0: 927 | completion_tokens_details = CompletionTokensDetails( 928 | reasoning_tokens=reasoning_tokens, 929 | audio_tokens=0, 930 | ) 931 | 932 | response = ChatResponse( 933 | id=message_id, 934 | model=model, 935 | choices=[ 936 | Choice( 937 | index=0, 938 | message=message, 939 | finish_reason=self._convert_finish_reason(finish_reason), 940 | logprobs=None, 941 | ) 942 | ], 943 | usage=Usage( 944 | prompt_tokens=input_tokens, 945 | completion_tokens=output_tokens, 946 | total_tokens=total_tokens if total_tokens > 0 else input_tokens + output_tokens, 947 | prompt_tokens_details=prompt_tokens_details, 948 | completion_tokens_details=completion_tokens_details, 949 | ), 950 | ) 951 | response.system_fingerprint = "fp" 952 | response.object = "chat.completion" 953 | response.created = int(time.time()) 954 | return response 955 | 956 | def _create_response_stream( 957 | self, model_id: str, message_id: str, chunk: dict 958 | ) -> ChatStreamResponse | None: 959 | """Parsing the Bedrock stream response chunk. 960 | 961 | Ref: https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference.html#message-inference-examples 962 | """ 963 | if DEBUG: 964 | logger.info("Bedrock response chunk: " + str(chunk)) 965 | 966 | finish_reason = None 967 | message = None 968 | usage = None 969 | 970 | if "messageStart" in chunk: 971 | message = ChatResponseMessage( 972 | role=chunk["messageStart"]["role"], 973 | content="", 974 | ) 975 | 976 | if "contentBlockStart" in chunk: 977 | # tool call start 978 | delta = chunk["contentBlockStart"]["start"] 979 | if "toolUse" in delta: 980 | # first index is content 981 | index = chunk["contentBlockStart"]["contentBlockIndex"] - 1 982 | message = ChatResponseMessage( 983 | tool_calls=[ 984 | ToolCall( 985 | index=index, 986 | type="function", 987 | id=delta["toolUse"]["toolUseId"], 988 | function=ResponseFunction( 989 | name=delta["toolUse"]["name"], 990 | arguments="", 991 | ), 992 | ) 993 | ] 994 | ) 995 | 996 | if "contentBlockDelta" in chunk: 997 | delta = chunk["contentBlockDelta"]["delta"] 998 | if "text" in delta: 999 | # Regular text content - close thinking tag if open 1000 | content = delta["text"] 1001 | if self.think_emitted: 1002 | # Transition from reasoning to regular text 1003 | content = "" + content 1004 | self.think_emitted = False 1005 | message = ChatResponseMessage(content=content) 1006 | elif "reasoningContent" in delta: 1007 | if "text" in delta["reasoningContent"]: 1008 | content = delta["reasoningContent"]["text"] 1009 | if not self.think_emitted: 1010 | # Start of reasoning content 1011 | content = "" + content 1012 | self.think_emitted = True 1013 | message = ChatResponseMessage(content=content) 1014 | elif "signature" in delta["reasoningContent"]: 1015 | # Port of "signature_delta" (for models that send it) 1016 | if self.think_emitted: 1017 | message = ChatResponseMessage(content="") 1018 | self.think_emitted = False 1019 | else: 1020 | return None # Ignore signature if no started 1021 | else: 1022 | # tool use 1023 | index = chunk["contentBlockDelta"]["contentBlockIndex"] - 1 1024 | message = ChatResponseMessage( 1025 | tool_calls=[ 1026 | ToolCall( 1027 | index=index, 1028 | 
function=ResponseFunction( 1029 | arguments=delta["toolUse"]["input"], 1030 | ), 1031 | ) 1032 | ] 1033 | ) 1034 | 1035 | if "messageStop" in chunk: 1036 | # Safety check: Close any open thinking tags before message stops 1037 | if self.think_emitted: 1038 | self.think_emitted = False 1039 | return ChatStreamResponse( 1040 | id=message_id, 1041 | model=model_id, 1042 | choices=[ 1043 | ChoiceDelta( 1044 | index=0, 1045 | delta=ChatResponseMessage(content=""), 1046 | logprobs=None, 1047 | finish_reason=None, 1048 | ) 1049 | ], 1050 | ) 1051 | message = ChatResponseMessage() 1052 | finish_reason = chunk["messageStop"]["stopReason"] 1053 | 1054 | if "metadata" in chunk: 1055 | # usage information in metadata. 1056 | metadata = chunk["metadata"] 1057 | if "usage" in metadata: 1058 | # token usage 1059 | usage_data = metadata["usage"] 1060 | 1061 | # Extract prompt caching metrics if available 1062 | cache_read_tokens = usage_data.get("cacheReadInputTokens", 0) 1063 | cache_creation_tokens = usage_data.get("cacheWriteInputTokens", 0) 1064 | 1065 | # Create prompt_tokens_details if cache metrics are available 1066 | prompt_tokens_details = None 1067 | if cache_read_tokens > 0 or cache_creation_tokens > 0: 1068 | prompt_tokens_details = PromptTokensDetails( 1069 | cached_tokens=cache_read_tokens, 1070 | audio_tokens=0, 1071 | ) 1072 | 1073 | # Calculate actual prompt tokens 1074 | # Bedrock's totalTokens includes all tokens 1075 | # prompt_tokens = totalTokens - outputTokens 1076 | total_tokens = usage_data["totalTokens"] 1077 | output_tokens = usage_data["outputTokens"] 1078 | actual_prompt_tokens = total_tokens - output_tokens 1079 | 1080 | return ChatStreamResponse( 1081 | id=message_id, 1082 | model=model_id, 1083 | choices=[], 1084 | usage=Usage( 1085 | prompt_tokens=actual_prompt_tokens, 1086 | completion_tokens=output_tokens, 1087 | total_tokens=total_tokens, 1088 | prompt_tokens_details=prompt_tokens_details, 1089 | ), 1090 | ) 1091 | 1092 | if message: 1093 | return ChatStreamResponse( 1094 | id=message_id, 1095 | model=model_id, 1096 | choices=[ 1097 | ChoiceDelta( 1098 | index=0, 1099 | delta=message, 1100 | logprobs=None, 1101 | finish_reason=self._convert_finish_reason(finish_reason), 1102 | ) 1103 | ], 1104 | usage=usage, 1105 | ) 1106 | 1107 | return None 1108 | 1109 | def _parse_image(self, image_url: str) -> tuple[bytes, str]: 1110 | """Try to get the raw data from an image url. 1111 | 1112 | Ref: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ImageSource.html 1113 | returns a tuple of (Image Data, Content Type) 1114 | """ 1115 | pattern = r"^data:(image/[a-z]*);base64,\s*" 1116 | content_type = re.search(pattern, image_url) 1117 | # if already base64 encoded. 
1118 | # Only supports 'image/jpeg', 'image/png', 'image/gif' or 'image/webp' 1119 | if content_type: 1120 | image_data = re.sub(pattern, "", image_url) 1121 | return base64.b64decode(image_data), content_type.group(1) 1122 | 1123 | # Send a request to the image URL 1124 | response = requests.get(image_url, timeout=30) 1125 | # Check if the request was successful 1126 | if response.status_code == 200: 1127 | content_type = response.headers.get("Content-Type") 1128 | if not content_type.startswith("image"): 1129 | content_type = "image/jpeg" 1130 | # Get the image content 1131 | image_content = response.content 1132 | return image_content, content_type 1133 | else: 1134 | raise HTTPException( 1135 | status_code=500, detail="Unable to access the image url" 1136 | ) 1137 | 1138 | def _parse_content_parts( 1139 | self, 1140 | message: UserMessage | AssistantMessage, 1141 | model_id: str, 1142 | ) -> list[dict]: 1143 | if isinstance(message.content, str): 1144 | return [ 1145 | { 1146 | "text": message.content, 1147 | } 1148 | ] 1149 | content_parts = [] 1150 | for part in message.content: 1151 | if isinstance(part, TextContent): 1152 | content_parts.append( 1153 | { 1154 | "text": part.text, 1155 | } 1156 | ) 1157 | elif isinstance(part, ImageContent): 1158 | if not self.is_supported_modality(model_id, modality="IMAGE"): 1159 | raise HTTPException( 1160 | status_code=400, 1161 | detail=f"Multimodal message is currently not supported by {model_id}", 1162 | ) 1163 | image_data, content_type = self._parse_image(part.image_url.url) 1164 | content_parts.append( 1165 | { 1166 | "image": { 1167 | "format": content_type[6:], # image/ 1168 | "source": {"bytes": image_data}, 1169 | }, 1170 | } 1171 | ) 1172 | else: 1173 | # Ignore.. 1174 | continue 1175 | return content_parts 1176 | 1177 | @staticmethod 1178 | def is_supported_modality(model_id: str, modality: str = "IMAGE") -> bool: 1179 | model = bedrock_model_list.get(model_id, {}) 1180 | modalities = model.get("modalities", []) 1181 | if modality in modalities: 1182 | return True 1183 | return False 1184 | 1185 | def _convert_tool_spec(self, func: Function) -> dict: 1186 | return { 1187 | "toolSpec": { 1188 | "name": func.name, 1189 | "description": func.description, 1190 | "inputSchema": { 1191 | "json": func.parameters, 1192 | }, 1193 | } 1194 | } 1195 | 1196 | def _calc_budget_tokens( 1197 | self, max_tokens: int, reasoning_effort: Literal["low", "medium", "high"] 1198 | ) -> int: 1199 | # Helper function to calculate budget_tokens based on the max_tokens. 1200 | # Ratio for efforts: Low - 30%, medium - 60%, High: Max token - 1 1201 | # Note that The minimum budget_tokens is 1,024 tokens so far. 1202 | # But it may be changed for different models in the future. 
1203 | if reasoning_effort == "low": 1204 | return int(max_tokens * 0.3) 1205 | elif reasoning_effort == "medium": 1206 | return int(max_tokens * 0.6) 1207 | else: 1208 | return max_tokens - 1 1209 | 1210 | def _convert_finish_reason(self, finish_reason: str | None) -> str | None: 1211 | """ 1212 | Below is a list of finish reason according to OpenAI doc: 1213 | 1214 | - stop: if the model hit a natural stop point or a provided stop sequence, 1215 | - length: if the maximum number of tokens specified in the request was reached, 1216 | - content_filter: if content was omitted due to a flag from our content filters, 1217 | - tool_calls: if the model called a tool 1218 | """ 1219 | if finish_reason: 1220 | finish_reason_mapping = { 1221 | "tool_use": "tool_calls", 1222 | "finished": "stop", 1223 | "end_turn": "stop", 1224 | "max_tokens": "length", 1225 | "stop_sequence": "stop", 1226 | "complete": "stop", 1227 | "content_filtered": "content_filter", 1228 | } 1229 | return finish_reason_mapping.get( 1230 | finish_reason.lower(), finish_reason.lower() 1231 | ) 1232 | return None 1233 | 1234 | 1235 | class BedrockEmbeddingsModel(BaseEmbeddingsModel, ABC): 1236 | accept = "application/json" 1237 | content_type = "application/json" 1238 | 1239 | def _invoke_model(self, args: dict, model_id: str): 1240 | body = json.dumps(args) 1241 | if DEBUG: 1242 | logger.info("Invoke Bedrock Model: " + model_id) 1243 | logger.info("Bedrock request body: " + body) 1244 | try: 1245 | return bedrock_runtime.invoke_model( 1246 | body=body, 1247 | modelId=model_id, 1248 | accept=self.accept, 1249 | contentType=self.content_type, 1250 | ) 1251 | except bedrock_runtime.exceptions.ValidationException as e: 1252 | logger.error("Validation Error: " + str(e)) 1253 | raise HTTPException(status_code=400, detail=str(e)) 1254 | except bedrock_runtime.exceptions.ThrottlingException as e: 1255 | logger.error("Throttling Error: " + str(e)) 1256 | raise HTTPException(status_code=429, detail=str(e)) 1257 | except Exception as e: 1258 | logger.error(e) 1259 | raise HTTPException(status_code=500, detail=str(e)) 1260 | 1261 | def _create_response( 1262 | self, 1263 | embeddings: list[float], 1264 | model: str, 1265 | input_tokens: int = 0, 1266 | output_tokens: int = 0, 1267 | encoding_format: Literal["float", "base64"] = "float", 1268 | ) -> EmbeddingsResponse: 1269 | data = [] 1270 | for i, embedding in enumerate(embeddings): 1271 | if encoding_format == "base64": 1272 | arr = np.array(embedding, dtype=np.float32) 1273 | arr_bytes = arr.tobytes() 1274 | encoded_embedding = base64.b64encode(arr_bytes) 1275 | data.append(Embedding(index=i, embedding=encoded_embedding)) 1276 | else: 1277 | data.append(Embedding(index=i, embedding=embedding)) 1278 | response = EmbeddingsResponse( 1279 | data=data, 1280 | model=model, 1281 | usage=EmbeddingsUsage( 1282 | prompt_tokens=input_tokens, 1283 | total_tokens=input_tokens + output_tokens, 1284 | ), 1285 | ) 1286 | if DEBUG: 1287 | logger.info("Proxy response :" + response.model_dump_json()) 1288 | return response 1289 | 1290 | 1291 | class CohereEmbeddingsModel(BedrockEmbeddingsModel): 1292 | def _parse_args(self, embeddings_request: EmbeddingsRequest) -> dict: 1293 | texts = [] 1294 | if isinstance(embeddings_request.input, str): 1295 | texts = [embeddings_request.input] 1296 | elif isinstance(embeddings_request.input, list): 1297 | texts = embeddings_request.input 1298 | elif isinstance(embeddings_request.input, Iterable): 1299 | # For encoded input 1300 | # The workaround is to use tiktoken 
to decode to get the original text. 1301 | encodings = [] 1302 | for inner in embeddings_request.input: 1303 | if isinstance(inner, int): 1304 | # Iterable[int] 1305 | encodings.append(inner) 1306 | else: 1307 | # Iterable[Iterable[int]] 1308 | text = ENCODER.decode(list(inner)) 1309 | texts.append(text) 1310 | if encodings: 1311 | texts.append(ENCODER.decode(encodings)) 1312 | 1313 | # Maximum of 2048 characters 1314 | args = { 1315 | "texts": texts, 1316 | "input_type": "search_document", 1317 | "truncate": "END", # "NONE|START|END" 1318 | } 1319 | return args 1320 | 1321 | def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse: 1322 | response = self._invoke_model( 1323 | args=self._parse_args(embeddings_request), model_id=embeddings_request.model 1324 | ) 1325 | response_body = json.loads(response.get("body").read()) 1326 | if DEBUG: 1327 | logger.info("Bedrock response body: " + str(response_body)) 1328 | 1329 | return self._create_response( 1330 | embeddings=response_body["embeddings"], 1331 | model=embeddings_request.model, 1332 | encoding_format=embeddings_request.encoding_format, 1333 | ) 1334 | 1335 | 1336 | class TitanEmbeddingsModel(BedrockEmbeddingsModel): 1337 | def _parse_args(self, embeddings_request: EmbeddingsRequest) -> dict: 1338 | if isinstance(embeddings_request.input, str): 1339 | input_text = embeddings_request.input 1340 | elif ( 1341 | isinstance(embeddings_request.input, list) 1342 | and len(embeddings_request.input) == 1 1343 | ): 1344 | input_text = embeddings_request.input[0] 1345 | else: 1346 | raise ValueError( 1347 | "Amazon Titan Embeddings models support only single strings as input." 1348 | ) 1349 | args = { 1350 | "inputText": input_text, 1351 | # Note: inputImage is not supported! 1352 | } 1353 | if embeddings_request.model == "amazon.titan-embed-image-v1": 1354 | args["embeddingConfig"] = ( 1355 | embeddings_request.embedding_config 1356 | if embeddings_request.embedding_config 1357 | else {"outputEmbeddingLength": 1024} 1358 | ) 1359 | return args 1360 | 1361 | def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse: 1362 | response = self._invoke_model( 1363 | args=self._parse_args(embeddings_request), model_id=embeddings_request.model 1364 | ) 1365 | response_body = json.loads(response.get("body").read()) 1366 | if DEBUG: 1367 | logger.info("Bedrock response body: " + str(response_body)) 1368 | 1369 | return self._create_response( 1370 | embeddings=[response_body["embedding"]], 1371 | model=embeddings_request.model, 1372 | input_tokens=response_body["inputTextTokenCount"], 1373 | ) 1374 | 1375 | 1376 | def get_embeddings_model(model_id: str) -> BedrockEmbeddingsModel: 1377 | model_name = SUPPORTED_BEDROCK_EMBEDDING_MODELS.get(model_id, "") 1378 | if DEBUG: 1379 | logger.info("model name is " + model_name) 1380 | match model_name: 1381 | case "Cohere Embed Multilingual" | "Cohere Embed English": 1382 | return CohereEmbeddingsModel() 1383 | case "Titan Embeddings G2 - Text": 1384 | return TitanEmbeddingsModel() 1385 | case _: 1386 | logger.error("Unsupported model id " + model_id) 1387 | raise HTTPException( 1388 | status_code=400, 1389 | detail="Unsupported embedding model id " + model_id, 1390 | ) 1391 | --------------------------------------------------------------------------------