├── src ├── api │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── base.py │ │ └── bedrock.py │ ├── routers │ │ ├── __init__.py │ │ ├── embeddings.py │ │ ├── model.py │ │ └── chat.py │ ├── setting.py │ ├── auth.py │ ├── app.py │ └── schema.py ├── requirements.txt ├── Dockerfile └── Dockerfile_ecs ├── assets ├── arch.png └── obj-detect.png ├── .github ├── PULL_REQUEST_TEMPLATE.md └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── .pre-commit-config.yaml ├── THIRD_PARTY ├── CODE_OF_CONDUCT.md ├── ruff.toml ├── docker-compose.yml ├── LICENSE ├── CONTRIBUTING.md ├── docs ├── Security.md ├── Troubleshooting.md ├── Usage_CN.md └── Usage.md ├── .gitignore ├── scripts └── push-to-ecr.sh ├── deployment ├── BedrockProxy.template └── BedrockProxyFargate.template └── README.md /src/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/api/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/api/routers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/bedrock-access-gateway/HEAD/assets/arch.png -------------------------------------------------------------------------------- /assets/obj-detect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/bedrock-access-gateway/HEAD/assets/obj-detect.png -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.116.1 2 | pydantic==2.11.4 3 | uvicorn==0.29.0 4 | mangum==0.17.0 5 | tiktoken==0.9.0 6 | requests==2.32.4 7 | numpy==2.2.5 8 | boto3==1.40.4 9 | botocore==1.40.4 10 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.9.10 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [python, pyi] 9 | # Run the formatter. 10 | - id: ruff-format 11 | -------------------------------------------------------------------------------- /THIRD_PARTY: -------------------------------------------------------------------------------- 1 | certifi 2 | 3 | SPDX-License-Identifier: MPL-2.0 4 | This Source Code Form is subject to the terms of the Mozilla Public 5 | License, v. 2.0. If a copy of the MPL was not distributed with this 6 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 
7 | 8 | https://github.com/certifi/python-certifi -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 120 2 | indent-width = 4 3 | target-version = "py312" 4 | 5 | exclude = [ 6 | ".venv", 7 | ".vscode", 8 | "test/*" 9 | ] 10 | 11 | [lint] 12 | select = ["E", "F", "I"] 13 | ignore = [ 14 | "E501", 15 | "C901", 16 | "F401", 17 | ] 18 | 19 | [format] 20 | # use double quotes for strings. 21 | quote-style = "double" -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | bedrock-access-gateway: 5 | build: 6 | context: ./src 7 | dockerfile: Dockerfile_ecs 8 | ports: 9 | - "127.0.0.1:8000:8080" 10 | environment: 11 | - ENABLE_PROMPT_CACHING=true 12 | - API_KEY=${OPENAI_API_KEY} 13 | - AWS_PROFILE 14 | - AWS_ACCESS_KEY_ID 15 | - AWS_SECRET_ACCESS_KEY 16 | - AWS_SESSION_TOKEN 17 | volumes: 18 | - ${HOME}/.aws:/home/appuser/.aws 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this solution 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the feature you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /src/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.12 2 | 3 | # Add Lambda Web Adapter for API Gateway response streaming 4 | COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.9.1 /lambda-adapter /opt/extensions/lambda-adapter 5 | 6 | COPY ./api ./api 7 | 8 | COPY requirements.txt . 
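# Install the API's Python dependencies; --no-cache-dir keeps the resulting image smaller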
9 | 10 | RUN pip3 install -r requirements.txt -U --no-cache-dir 11 | 12 | # Lambda Web Adapter requires overriding the Lambda base image entrypoint 13 | # to run the web app directly instead of the Lambda runtime handler 14 | ENTRYPOINT [] 15 | CMD ["python", "-m", "uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8080"] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | 14 | **Please complete the following information:** 15 | - [ ] Which API you used: [e.g. /chat/completions] 16 | - [ ] Which model you used: [e.g. anthropic.claude-3-sonnet-20240229-v1:0] 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behavior. If possible, please share an example request. 20 | 21 | **Expected behavior** 22 | A clear and concise description of what you expected to happen. 23 | 24 | 25 | **Screenshots** 26 | If applicable, add screenshots to help explain your problem (please **DO NOT include sensitive information**). 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /src/Dockerfile_ecs: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/docker/library/python:3.13-slim 2 | 3 | WORKDIR /app 4 | 5 | COPY ./requirements.txt /app/requirements.txt 6 | 7 | RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt 8 | 9 | COPY ./api /app/api 10 | 11 | # Create non-root user 12 | RUN groupadd -r appuser && useradd -r -g appuser appuser && \ 13 | chown -R appuser:appuser /app 14 | 15 | USER appuser 16 | 17 | # Preload tiktoken encoding: https://github.com/aws-samples/bedrock-access-gateway/issues/118 18 | ENV TIKTOKEN_CACHE_DIR=/app/.cache/tiktoken 19 | RUN python3 -c 'import tiktoken_ext.openai_public as tke; tke.cl100k_base()' 20 | 21 | ENV PORT=8080 22 | 23 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 24 | CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/health').read()" 25 | 26 | CMD ["sh", "-c", "uvicorn api.app:app --host 0.0.0.0 --port ${PORT}"] 27 | -------------------------------------------------------------------------------- /src/api/setting.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | API_ROUTE_PREFIX = os.environ.get("API_ROUTE_PREFIX", "/api/v1") 4 | 5 | TITLE = "Amazon Bedrock Proxy APIs" 6 | SUMMARY = "OpenAI-Compatible RESTful APIs for Amazon Bedrock" 7 | VERSION = "0.1.0" 8 | DESCRIPTION = """ 9 | Use OpenAI-Compatible RESTful APIs for Amazon Bedrock models. 
10 | """ 11 | 12 | DEBUG = os.environ.get("DEBUG", "false").lower() != "false" 13 | AWS_REGION = os.environ.get("AWS_REGION", "us-west-2") 14 | DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "anthropic.claude-3-sonnet-20240229-v1:0") 15 | DEFAULT_EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL", "cohere.embed-multilingual-v3") 16 | ENABLE_CROSS_REGION_INFERENCE = os.environ.get("ENABLE_CROSS_REGION_INFERENCE", "true").lower() != "false" 17 | ENABLE_APPLICATION_INFERENCE_PROFILES = os.environ.get("ENABLE_APPLICATION_INFERENCE_PROFILES", "true").lower() != "false" 18 | ENABLE_PROMPT_CACHING = os.environ.get("ENABLE_PROMPT_CACHING", "false").lower() != "false" 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | -------------------------------------------------------------------------------- /src/api/routers/embeddings.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from fastapi import APIRouter, Body, Depends 4 | 5 | from api.auth import api_key_auth 6 | from api.models.bedrock import get_embeddings_model 7 | from api.schema import EmbeddingsRequest, EmbeddingsResponse 8 | from api.setting import DEFAULT_EMBEDDING_MODEL 9 | 10 | router = APIRouter( 11 | prefix="/embeddings", 12 | dependencies=[Depends(api_key_auth)], 13 | ) 14 | 15 | 16 | @router.post("", response_model=EmbeddingsResponse) 17 | async def embeddings( 18 | embeddings_request: Annotated[ 19 | EmbeddingsRequest, 20 | Body( 21 | examples=[ 22 | { 23 | "model": "cohere.embed-multilingual-v3", 24 | "input": ["Your text string goes here"], 25 | } 26 | ], 27 | ), 28 | ], 29 | ): 30 | if embeddings_request.model.lower().startswith("text-embedding-"): 31 | embeddings_request.model = DEFAULT_EMBEDDING_MODEL 32 | # Exception will be raised if model not supported. 
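    # get_embeddings_model returns a Bedrock-backed embeddings model instance for the requested model ID.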
33 | model = get_embeddings_model(embeddings_request.model) 34 | return model.embed(embeddings_request) 35 | -------------------------------------------------------------------------------- /src/api/routers/model.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from fastapi import APIRouter, Depends, HTTPException, Path 4 | 5 | from api.auth import api_key_auth 6 | from api.models.bedrock import BedrockModel 7 | from api.schema import Model, Models 8 | 9 | router = APIRouter( 10 | prefix="/models", 11 | dependencies=[Depends(api_key_auth)], 12 | # responses={404: {"description": "Not found"}}, 13 | ) 14 | 15 | chat_model = BedrockModel() 16 | 17 | 18 | async def validate_model_id(model_id: str): 19 | if model_id not in chat_model.list_models(): 20 | raise HTTPException(status_code=500, detail="Unsupported Model Id") 21 | 22 | 23 | @router.get("", response_model=Models) 24 | async def list_models(): 25 | model_list = [Model(id=model_id) for model_id in chat_model.list_models()] 26 | return Models(data=model_list) 27 | 28 | 29 | @router.get( 30 | "/{model_id}", 31 | response_model=Model, 32 | ) 33 | async def get_model( 34 | model_id: Annotated[ 35 | str, 36 | Path(description="Model ID", example="anthropic.claude-3-sonnet-20240229-v1:0"), 37 | ], 38 | ): 39 | await validate_model_id(model_id) 40 | return Model(id=model_id) 41 | -------------------------------------------------------------------------------- /src/api/routers/chat.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from fastapi import APIRouter, Body, Depends 4 | from fastapi.responses import StreamingResponse 5 | 6 | from api.auth import api_key_auth 7 | from api.models.bedrock import BedrockModel 8 | from api.schema import ChatRequest, ChatResponse, ChatStreamResponse, Error 9 | from api.setting import DEFAULT_MODEL 10 | 11 | router = APIRouter( 12 | prefix="/chat", 13 | dependencies=[Depends(api_key_auth)], 14 | # responses={404: {"description": "Not found"}}, 15 | ) 16 | 17 | 18 | @router.post( 19 | "/completions", response_model=ChatResponse | ChatStreamResponse | Error, response_model_exclude_unset=True 20 | ) 21 | async def chat_completions( 22 | chat_request: Annotated[ 23 | ChatRequest, 24 | Body( 25 | examples=[ 26 | { 27 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 28 | "messages": [ 29 | {"role": "system", "content": "You are a helpful assistant."}, 30 | {"role": "user", "content": "Hello!"}, 31 | ], 32 | } 33 | ], 34 | ), 35 | ], 36 | ): 37 | if chat_request.model.lower().startswith("gpt-"): 38 | chat_request.model = DEFAULT_MODEL 39 | 40 | # Exception will be raised if model not supported. 
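    # BedrockModel translates the OpenAI-style request into an Amazon Bedrock call; validate() checks the request before the model is invoked.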
41 | model = BedrockModel() 42 | model.validate(chat_request) 43 | if chat_request.stream: 44 | return StreamingResponse(content=model.chat_stream(chat_request), media_type="text/event-stream") 45 | return await model.chat(chat_request) 46 | -------------------------------------------------------------------------------- /src/api/auth.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Annotated 4 | 5 | import boto3 6 | from botocore.exceptions import ClientError 7 | from fastapi import Depends, HTTPException, status 8 | from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer 9 | 10 | api_key_param = os.environ.get("API_KEY_PARAM_NAME") 11 | api_key_secret_arn = os.environ.get("API_KEY_SECRET_ARN") 12 | api_key_env = os.environ.get("API_KEY") 13 | if api_key_param: 14 | # For backward compatibility. 15 | # Please now use secrets manager instead. 16 | ssm = boto3.client("ssm") 17 | api_key = ssm.get_parameter(Name=api_key_param, WithDecryption=True)["Parameter"]["Value"] 18 | elif api_key_secret_arn: 19 | sm = boto3.client("secretsmanager") 20 | try: 21 | response = sm.get_secret_value(SecretId=api_key_secret_arn) 22 | if "SecretString" in response: 23 | secret = json.loads(response["SecretString"]) 24 | api_key = secret["api_key"] 25 | except ClientError: 26 | raise RuntimeError("Unable to retrieve API KEY, please ensure the secret ARN is correct") 27 | except KeyError: 28 | raise RuntimeError('Please ensure the secret contains a "api_key" field') 29 | elif api_key_env: 30 | api_key = api_key_env 31 | else: 32 | raise RuntimeError( 33 | "API Key is not configured. Please set up your API Key." 34 | ) 35 | 36 | security = HTTPBearer() 37 | 38 | 39 | def api_key_auth( 40 | credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)], 41 | ): 42 | if credentials.credentials != api_key: 43 | raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API Key") 44 | -------------------------------------------------------------------------------- /src/api/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import uvicorn 5 | from fastapi import FastAPI 6 | from fastapi.exceptions import RequestValidationError 7 | from fastapi.middleware.cors import CORSMiddleware 8 | from fastapi.responses import PlainTextResponse 9 | from mangum import Mangum 10 | 11 | from api.routers import chat, embeddings, model 12 | from api.setting import API_ROUTE_PREFIX, DESCRIPTION, SUMMARY, TITLE, VERSION 13 | 14 | config = { 15 | "title": TITLE, 16 | "description": DESCRIPTION, 17 | "summary": SUMMARY, 18 | "version": VERSION, 19 | } 20 | 21 | logging.basicConfig( 22 | level=logging.INFO, 23 | format="%(asctime)s [%(levelname)s] %(message)s", 24 | ) 25 | app = FastAPI(**config) 26 | 27 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", "*") 28 | origins_list = [origin.strip() for origin in allowed_origins.split(",")] if allowed_origins != "*" else ["*"] 29 | 30 | # Warn if CORS allows all origins 31 | if origins_list == ["*"]: 32 | logging.warning("CORS is configured to allow all origins (*). 
Set ALLOWED_ORIGINS environment variable to restrict access.") 33 | 34 | app.add_middleware( 35 | CORSMiddleware, 36 | allow_origins=origins_list, # nosec - configurable via ALLOWED_ORIGINS env var 37 | allow_credentials=True, 38 | allow_methods=["*"], 39 | allow_headers=["*"], 40 | ) 41 | 42 | 43 | app.include_router(model.router, prefix=API_ROUTE_PREFIX) 44 | app.include_router(chat.router, prefix=API_ROUTE_PREFIX) 45 | app.include_router(embeddings.router, prefix=API_ROUTE_PREFIX) 46 | 47 | 48 | @app.get("/health") 49 | async def health(): 50 | """For health check if needed""" 51 | return {"status": "OK"} 52 | 53 | 54 | @app.exception_handler(RequestValidationError) 55 | async def validation_exception_handler(request, exc): 56 | logger = logging.getLogger(__name__) 57 | 58 | # Log essential info only - avoid sensitive data and performance overhead 59 | logger.warning( 60 | "Request validation failed: %s %s - %s", 61 | request.method, 62 | request.url.path, 63 | str(exc).split('\n')[0] # First line only 64 | ) 65 | 66 | return PlainTextResponse(str(exc), status_code=400) 67 | 68 | 69 | handler = Mangum(app) 70 | 71 | if __name__ == "__main__": 72 | # Bind to 0.0.0.0 for container environments, network is handled by network policies and load balancers 73 | uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False) # nosec B104 74 | -------------------------------------------------------------------------------- /src/api/models/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import uuid 4 | from abc import ABC, abstractmethod 5 | from typing import AsyncIterable 6 | 7 | from api.schema import ( 8 | # Chat 9 | ChatRequest, 10 | ChatResponse, 11 | ChatStreamResponse, 12 | # Embeddings 13 | EmbeddingsRequest, 14 | EmbeddingsResponse, 15 | Error, 16 | ) 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class BaseChatModel(ABC): 22 | """Represent a basic chat model 23 | 24 | Currently, only Bedrock model is supported, but may be used for SageMaker models if needed. 
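    Subclasses must implement chat() and chat_stream(); list_models() and validate() can be overridden as needed.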
25 | """ 26 | 27 | def list_models(self) -> list[str]: 28 | """Return a list of supported models""" 29 | return [] 30 | 31 | def validate(self, chat_request: ChatRequest): 32 | """Validate chat completion requests.""" 33 | pass 34 | 35 | @abstractmethod 36 | async def chat(self, chat_request: ChatRequest) -> ChatResponse: 37 | """Handle a basic chat completion requests.""" 38 | pass 39 | 40 | @abstractmethod 41 | async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]: 42 | """Handle a basic chat completion requests with stream response.""" 43 | pass 44 | 45 | @staticmethod 46 | def generate_message_id() -> str: 47 | return "chatcmpl-" + str(uuid.uuid4())[:8] 48 | 49 | @staticmethod 50 | def stream_response_to_bytes(response: ChatStreamResponse | Error | None = None) -> bytes: 51 | if isinstance(response, Error): 52 | logger.error("Stream error: %s", response.error.message if response.error else "Unknown error") 53 | data = response.model_dump_json() 54 | elif isinstance(response, ChatStreamResponse): 55 | # to populate other fields when using exclude_unset=True 56 | response.system_fingerprint = "fp" 57 | response.object = "chat.completion.chunk" 58 | response.created = int(time.time()) 59 | data = response.model_dump_json(exclude_unset=True) 60 | else: 61 | data = "[DONE]" 62 | 63 | return f"data: {data}\n\n".encode("utf-8") 64 | 65 | 66 | class BaseEmbeddingsModel(ABC): 67 | """Represents a basic embeddings model. 68 | 69 | Currently, only Bedrock-provided models are supported, but it may be used for SageMaker models if needed. 70 | """ 71 | 72 | @abstractmethod 73 | def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse: 74 | """Handle a basic embeddings request.""" 75 | pass 76 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. 
Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /docs/Security.md: -------------------------------------------------------------------------------- 1 | # Security 2 | 3 | This document details the security configuration required for the solution. In particular, it covers: 4 | 5 | - **HTTPS Setup** 6 | 7 | Following these guidelines will help ensure that traffic is encrypted over the public network. 8 | 9 | --- 10 | 11 | ## 1. HTTPS Authentication with the ALB 12 | 13 | ### Overview 14 | 15 | Using HTTPS on your ALB guarantees that all client-to-ALB communication is encrypted. This is achieved by: 16 | - **Obtaining and managing SSL/TLS certificates** using AWS Certificate Manager (ACM). You'll need a domain but you can request a free certificate. 17 | - **Configuring HTTPS listeners** on the ALB 18 | - **Automating HTTP to HTTPS redirect** for clients that inadvertently access HTTP endpoints 19 | - **Allowing traffic in the Security Group of the ALB** 20 | 21 | ### Step-by-Step Setup 22 | 23 | #### 1.1. Request an SSL/TLS Certificate via ACM 24 | 25 | 1. **Navigate to AWS Certificate Manager (ACM):** 26 | In the AWS Management Console, go to ACM in the region where your ALB is deployed. 27 | 28 | 2. **Request the Certificate:** 29 | - Click on **"Request a certificate"**. 30 | - Choose **"Request a public certificate"** (or a private one if using a private CA). 31 | - Enter your domain names (e.g., `example.com`, `*.example.com`). 32 | - Complete the validation (via DNS or email). DNS validation is generally preferred for automation purposes. 33 | 34 | 3. **Certificate Validation:** 35 | Ensure that the certificate status becomes **"Issued"** before proceeding. 
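**Example AWS CLI command to request a certificate** (a sketch; replace the example domain names with your own):

```bash
aws acm request-certificate \
  --domain-name example.com \
  --subject-alternative-names "*.example.com" \
  --validation-method DNS
```

After submitting the request, create the CNAME validation records that ACM returns so the certificate can move to the **"Issued"** status.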
36 | 37 | #### 1.2. Configure the ALB for HTTPS 38 | 39 | 1. **Create or Modify the ALB Listener:** 40 |    - Open the **EC2 Dashboard** and navigate to [Load Balancers](https://console.aws.amazon.com/ec2/home?#LoadBalancers:). 41 |    - If you already have an ALB, select it; otherwise, create a new ALB. 42 |    - Under the **Listeners** tab, click **Manage listener** > **Edit Listener**. 43 |    - Configure the listener protocol to **HTTPS** with port **443**. 44 |    - Select the certificate you requested from ACM. 45 | 46 | #### 1.3. (Optional) Redirect HTTP Traffic to HTTPS 47 | 48 | To enhance security, ensure that any HTTP requests are automatically redirected to HTTPS. 49 | 50 | 1. **Create an HTTP Listener on Port 80:** 51 |    - Add a listener on port **80**. 52 |    - In the listener settings, add a rule to redirect all traffic to port **443** with the protocol changed to **HTTPS**. 53 | 54 | **Example AWS CLI command for redirection:** 55 | ```bash 56 | aws elbv2 create-listener \ 57 |   --load-balancer-arn <YOUR_ALB_ARN> \ 58 |   --protocol HTTP \ 59 |   --port 80 \ 60 |   --default-actions Type=redirect,RedirectConfig="{Protocol=HTTPS,Port=443,StatusCode=HTTP_301}" 61 | ``` 62 | 63 | #### 1.4. Allow traffic in the Security Group of the ALB 64 | 65 | 1. **Update the ALB Security Group:** 66 |    - Go to the CloudFormation stack you originally used to deploy, select **Resources** and search for **ProxyALBSecurityGroup**. 67 |    - Click on the Security Group. 68 |    - Edit the Inbound Rules to allow traffic on Port 443 from `0.0.0.0/0` and (optionally) delete the Inbound Rule on Port 80. **Note**: If you delete the rule on port 80, you will need to update the base URL to use HTTPS only, as it won't redirect HTTP traffic to HTTPS. 69 | 70 | Now you should be able to test your application! Use a base URL like: 71 | 72 | ``` 73 | https://<your-domain-name>/api/v1 74 | ``` 75 | 76 | --- 77 | 78 | By following the steps outlined in this guide, you can configure a secure environment that uses HTTPS via ALB for encrypted traffic. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/ 161 | 162 | Config 163 | .vscode/launch.json 164 | -------------------------------------------------------------------------------- /docs/Troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting Guide 2 | 3 | This guide helps you troubleshoot common issues you might encounter when using the Bedrock Access Gateway. 4 | 5 | ## Common Issues 6 | 7 | ### 1. Parameter Store Access Error 8 | 9 | To see errors, first you need to access the CloudWatch Logs of the Lambda/Fargate instance. 10 | 11 | 1. Go to the [CloudWatch Console](https://console.aws.amazon.com/cloudwatch/home?#logsV2:log-groups/) 12 | 2. Search for `/aws/lambda/BedrockProxyAPI` 13 | 3. Click on the `Log Stream` to see the error details 14 | 15 | ```python 16 | botocore.exceptions.ClientError: An error occurred (ParameterNotFound) when calling the GetParameter operation: Parameter /BedrockProxyAPIKey not found. 17 | ``` 18 | 19 | This error occurs when the Lambda function cannot access the API key parameter in Parameter Store. 20 | 21 | **Possible solutions:** 22 | - Verify that you created the parameter in Parameter Store with the correct name 23 | - Check that the parameter name in the CloudFormation stack matches the one in Parameter Store 24 | - Ensure the Lambda function's IAM role has permission to access Parameter Store 25 | - If you didn't set up an API key, leave the `ApiKeyParam` field blank during deployment 26 | 27 | ### 2. Model Access Issues 28 | 29 | If you receive an error about model access: 30 | 31 | ``` 32 | {"error": {"message": "User: arn:aws:iam::XXXX:role/XXX is not authorized to perform: bedrock:InvokeModel on resource: arn:aws:bedrock:REGION::foundation-model/XXX", "type": "auth_error", "code": 401}} 33 | ``` 34 | 35 | **Possible solutions:** 36 | - Ensure you have requested access to the model in Amazon Bedrock 37 | - Verify the Lambda/Fargate role has the necessary permissions to invoke Bedrock models 38 | - Check that you're using the correct model ID 39 | - Verify the model is available in your chosen region 40 | 41 | ### 3. API Key Authentication Failures 42 | 43 | If you receive a 401 Unauthorized error: 44 | 45 | ``` 46 | {"detail": "Could not validate credentials"} 47 | ``` 48 | 49 | **Possible solutions:** 50 | - Verify you're using the correct API key in your requests 51 | - Check that the `Authorization` header is properly formatted (`Bearer YOUR-API-KEY`) 52 | - If using environment variables, ensure `OPENAI_API_KEY` is set correctly 53 | 54 | ### 4. Cross-Region Access Issues 55 | 56 | If you're trying to access models in a different region: 57 | 58 | ``` 59 | {"error": {"message": "Region 'us-east-1' is not enabled for your account", "type": "invalid_request_error", "code": 400}} 60 | ``` 61 | 62 | **Possible solutions:** 63 | - Ensure the target region is enabled for your AWS account 64 | - Verify the model you're trying to access is available in that region 65 | - Check that your IAM roles have the necessary cross-region permissions 66 | 67 | ### 5. 
Rate Limiting and Quotas 68 | 69 | If you're experiencing throttling or quota issues: 70 | 71 | ``` 72 | {"error": {"message": "Rate limit exceeded", "type": "rate_limit_error", "code": 429}} 73 | ``` 74 | 75 | **Possible solutions:** 76 | - Check your Bedrock service quotas in the AWS Console 77 | - Consider implementing retry logic in your application 78 | - Request a quota increase if needed 79 | 80 | ## Getting Help 81 | 82 | If you're still experiencing issues: 83 | 84 | 1. Check the CloudWatch Logs for detailed error messages 85 | 2. Verify your AWS credentials and permissions 86 | 3. Review the [Usage Guide](./Usage.md) for correct API usage 87 | 4. Open a [GitHub issue](https://github.com/aws-samples/bedrock-access-gateway/issues/new?template=bug_report.md) with: 88 | - Detailed error message 89 | - Steps to reproduce 90 | - Your deployment configuration (region, model, etc.) 91 | - Any relevant CloudWatch logs 92 | 93 | ## Additional Resources 94 | 95 | - [Amazon Bedrock Documentation](https://docs.aws.amazon.com/bedrock/) 96 | - [AWS IAM Documentation](https://docs.aws.amazon.com/IAM/latest/UserGuide/) 97 | - [AWS Systems Manager Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html) 98 | -------------------------------------------------------------------------------- /scripts/push-to-ecr.sh: -------------------------------------------------------------------------------- 1 | # NOTE: The script will try to create the ECR repository if it doesn't exist. Please grant the necessary permissions to the IAM user or role. 2 | # Usage: 3 | # cd scripts 4 | # bash ./push-to-ecr.sh 5 | 6 | set -o errexit # exit on first error 7 | set -o nounset # exit on using unset variables 8 | set -o pipefail # exit on any error in a pipeline 9 | 10 | # Change to the directory where the script is located 11 | cd "$(dirname "$0")" 12 | 13 | # Prompt user for inputs 14 | echo "================================================" 15 | echo "Bedrock Access Gateway - Build and Push to ECR" 16 | echo "================================================" 17 | echo "" 18 | 19 | # Get repository name for Lambda version 20 | read -p "Enter ECR repository name for Lambda (default: bedrock-proxy-api): " LAMBDA_REPO 21 | LAMBDA_REPO=${LAMBDA_REPO:-bedrock-proxy-api} 22 | 23 | # Get repository name for ECS/Fargate version 24 | read -p "Enter ECR repository name for ECS/Fargate (default: bedrock-proxy-api-ecs): " ECS_REPO 25 | ECS_REPO=${ECS_REPO:-bedrock-proxy-api-ecs} 26 | 27 | # Get image tag 28 | read -p "Enter image tag (default: latest): " TAG 29 | TAG=${TAG:-latest} 30 | 31 | # Get AWS region 32 | read -p "Enter AWS region (default: us-east-1): " AWS_REGION 33 | AWS_REGION=${AWS_REGION:-us-east-1} 34 | 35 | echo "" 36 | echo "Configuration:" 37 | echo " Lambda Repository: $LAMBDA_REPO" 38 | echo " ECS/Fargate Repository: $ECS_REPO" 39 | echo " Image Tag: $TAG" 40 | echo " AWS Region: $AWS_REGION" 41 | echo "" 42 | read -p "Continue with these settings? (y/n): " CONFIRM 43 | if [[ ! "$CONFIRM" =~ ^[Yy]$ ]]; then 44 | echo "Aborted." 45 | exit 1 46 | fi 47 | echo "" 48 | 49 | # Acknowledgment about ECR repository creation 50 | echo "ℹ️ NOTICE: This script will automatically create ECR repositories if they don't exist." 
51 | echo " The repositories will be created with the following default settings:" 52 | echo " - Image tag mutability: MUTABLE (allows overwriting tags)" 53 | echo " - Image scanning: Disabled" 54 | echo " - Encryption: AES256 (AWS managed encryption)" 55 | echo "" 56 | echo " You can modify these settings later in the AWS ECR Console if needed." 57 | echo " Required IAM permissions: ecr:CreateRepository, ecr:GetAuthorizationToken," 58 | echo " ecr:BatchCheckLayerAvailability, ecr:InitiateLayerUpload, ecr:UploadLayerPart," 59 | echo " ecr:CompleteLayerUpload, ecr:PutImage" 60 | echo "" 61 | read -p "Do you acknowledge and want to proceed? (y/n): " ACK_CONFIRM 62 | if [[ ! "$ACK_CONFIRM" =~ ^[Yy]$ ]]; then 63 | echo "Aborted." 64 | exit 1 65 | fi 66 | echo "" 67 | 68 | # Define variables 69 | ARCHS=("arm64") # Single architecture for simplicity 70 | 71 | build_and_push_image() { 72 | local IMAGE_NAME=$1 73 | local TAG=$2 74 | local DOCKERFILE_PATH=$3 75 | local REGION=$AWS_REGION 76 | local ARCH=${ARCHS[0]} 77 | 78 | echo "Building $IMAGE_NAME:$TAG..." 79 | 80 | # Build Docker image 81 | # Note: --provenance=false and --sbom=false are required for Lambda compatibility 82 | # Without these flags, Docker BuildKit (especially with docker-container driver) may create 83 | # OCI image manifests with attestations that AWS Lambda does not support. 84 | # Lambda requires Docker V2 Schema 2 format without multi-manifest index. 85 | # See: https://github.com/aws-samples/bedrock-access-gateway/issues/206 86 | docker buildx build \ 87 | --platform linux/$ARCH \ 88 | --provenance=false \ 89 | --sbom=false \ 90 | -t $IMAGE_NAME:$TAG \ 91 | -f $DOCKERFILE_PATH \ 92 | --load \ 93 | ../src/ 94 | 95 | # Get the account ID 96 | ACCOUNT_ID=$(aws sts get-caller-identity --region $REGION --query Account --output text) 97 | 98 | # Create repository URI 99 | REPOSITORY_URI="${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/${IMAGE_NAME}" 100 | 101 | echo "Creating ECR repository if it doesn't exist..." 102 | # Create ECR repository if it doesn't exist 103 | aws ecr create-repository --repository-name "${IMAGE_NAME}" --region $REGION || true 104 | 105 | echo "Logging in to ECR..." 106 | # Log in to ECR 107 | aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $REPOSITORY_URI 108 | 109 | echo "Pushing image to ECR..." 110 | # Tag the image for ECR 111 | docker tag $IMAGE_NAME:$TAG $REPOSITORY_URI:$TAG 112 | 113 | # Push the image to ECR 114 | docker push $REPOSITORY_URI:$TAG 115 | 116 | echo "✅ Successfully pushed $IMAGE_NAME:$TAG to $REPOSITORY_URI" 117 | echo "" 118 | } 119 | 120 | echo "Building and pushing Lambda image..." 121 | build_and_push_image "$LAMBDA_REPO" "$TAG" "../src/Dockerfile" 122 | 123 | echo "Building and pushing ECS/Fargate image..." 124 | build_and_push_image "$ECS_REPO" "$TAG" "../src/Dockerfile_ecs" 125 | 126 | echo "================================================" 127 | echo "✅ All images successfully pushed!" 128 | echo "================================================" 129 | echo "" 130 | echo "Your container image URIs:" 131 | ACCOUNT_ID=$(aws sts get-caller-identity --region $AWS_REGION --query Account --output text) 132 | echo " Lambda: ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${LAMBDA_REPO}:${TAG}" 133 | echo " ECS/Fargate: ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECS_REPO}:${TAG}" 134 | echo "" 135 | echo "Next steps:" 136 | echo " 1. Download the CloudFormation templates from deployment/ folder" 137 | echo " 2. 
Update the ContainerImageUri parameter with your image URI above" 138 | echo " 3. Deploy the stack via AWS CloudFormation Console" 139 | echo "" 140 | -------------------------------------------------------------------------------- /src/api/schema.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Iterable, Literal 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from api.setting import DEFAULT_MODEL 7 | 8 | 9 | class Model(BaseModel): 10 | id: str 11 | created: int = Field(default_factory=lambda: int(time.time())) 12 | object: str | None = "model" 13 | owned_by: str | None = "bedrock" 14 | 15 | 16 | class Models(BaseModel): 17 | object: str | None = "list" 18 | data: list[Model] = [] 19 | 20 | 21 | class ResponseFunction(BaseModel): 22 | name: str | None = None 23 | arguments: str 24 | 25 | 26 | class ToolCall(BaseModel): 27 | index: int | None = None 28 | id: str | None = None 29 | type: Literal["function"] = "function" 30 | function: ResponseFunction 31 | 32 | 33 | class TextContent(BaseModel): 34 | type: Literal["text"] = "text" 35 | text: str 36 | 37 | 38 | class ImageUrl(BaseModel): 39 | url: str 40 | detail: str | None = "auto" 41 | 42 | 43 | class ImageContent(BaseModel): 44 | type: Literal["image_url"] = "image" 45 | image_url: ImageUrl 46 | 47 | 48 | class ToolContent(BaseModel): 49 | type: Literal["text"] = "text" 50 | text: str 51 | 52 | 53 | class SystemMessage(BaseModel): 54 | name: str | None = None 55 | role: Literal["system"] = "system" 56 | content: str 57 | 58 | 59 | class UserMessage(BaseModel): 60 | name: str | None = None 61 | role: Literal["user"] = "user" 62 | content: str | list[TextContent | ImageContent] 63 | 64 | 65 | class AssistantMessage(BaseModel): 66 | name: str | None = None 67 | role: Literal["assistant"] = "assistant" 68 | content: str | list[TextContent | ImageContent] | None = None 69 | tool_calls: list[ToolCall] | None = None 70 | 71 | 72 | class ToolMessage(BaseModel): 73 | role: Literal["tool"] = "tool" 74 | content: str | list[ToolContent] | list[dict] 75 | tool_call_id: str 76 | 77 | 78 | class DeveloperMessage(BaseModel): 79 | name: str | None = None 80 | role: Literal["developer"] = "developer" 81 | content: str 82 | 83 | 84 | class Function(BaseModel): 85 | name: str 86 | description: str | None = None 87 | parameters: object 88 | 89 | 90 | class Tool(BaseModel): 91 | type: Literal["function"] = "function" 92 | function: Function 93 | 94 | 95 | class StreamOptions(BaseModel): 96 | include_usage: bool = True 97 | 98 | 99 | class ChatRequest(BaseModel): 100 | messages: list[SystemMessage | UserMessage | AssistantMessage | ToolMessage | DeveloperMessage] 101 | model: str = DEFAULT_MODEL 102 | frequency_penalty: float | None = Field(default=0.0, le=2.0, ge=-2.0) # Not used 103 | presence_penalty: float | None = Field(default=0.0, le=2.0, ge=-2.0) # Not used 104 | stream: bool | None = False 105 | stream_options: StreamOptions | None = None 106 | temperature: float | None = Field(default=None, le=2.0, ge=0.0) 107 | top_p: float | None = Field(default=None, le=1.0, ge=0.0) 108 | user: str | None = None # Not used 109 | max_tokens: int | None = 2048 110 | max_completion_tokens: int | None = None 111 | reasoning_effort: Literal["low", "medium", "high"] | None = None 112 | n: int | None = 1 # Not used 113 | tools: list[Tool] | None = None 114 | tool_choice: str | object = "auto" 115 | stop: list[str] | str | None = None 116 | extra_body: dict | None = None 117 | 118 | 119 | 
class PromptTokensDetails(BaseModel): 120 | """Details about prompt tokens usage, following OpenAI API format.""" 121 | cached_tokens: int = 0 122 | audio_tokens: int = 0 123 | 124 | 125 | class CompletionTokensDetails(BaseModel): 126 | """Details about completion tokens usage, following OpenAI API format.""" 127 | reasoning_tokens: int = 0 128 | audio_tokens: int = 0 129 | 130 | 131 | class Usage(BaseModel): 132 | prompt_tokens: int 133 | completion_tokens: int 134 | total_tokens: int 135 | prompt_tokens_details: PromptTokensDetails | None = None 136 | completion_tokens_details: CompletionTokensDetails | None = None 137 | 138 | 139 | class ChatResponseMessage(BaseModel): 140 | # tool_calls 141 | role: Literal["assistant"] | None = None 142 | content: str | None = None 143 | tool_calls: list[ToolCall] | None = None 144 | reasoning_content: str | None = None 145 | 146 | 147 | class BaseChoice(BaseModel): 148 | index: int | None = 0 149 | finish_reason: str | None = None 150 | logprobs: dict | None = None 151 | 152 | 153 | class Choice(BaseChoice): 154 | message: ChatResponseMessage 155 | 156 | 157 | class ChoiceDelta(BaseChoice): 158 | delta: ChatResponseMessage 159 | 160 | 161 | class BaseChatResponse(BaseModel): 162 | # id: str = Field(default_factory=lambda: "chatcmpl-" + str(uuid.uuid4())[:8]) 163 | id: str 164 | created: int = Field(default_factory=lambda: int(time.time())) 165 | model: str 166 | system_fingerprint: str = "fp" 167 | 168 | 169 | class ChatResponse(BaseChatResponse): 170 | choices: list[Choice] 171 | object: Literal["chat.completion"] = "chat.completion" 172 | usage: Usage 173 | 174 | 175 | class ChatStreamResponse(BaseChatResponse): 176 | choices: list[ChoiceDelta] 177 | object: Literal["chat.completion.chunk"] = "chat.completion.chunk" 178 | usage: Usage | None = None 179 | 180 | 181 | class EmbeddingsRequest(BaseModel): 182 | input: str | list[str] | Iterable[int | Iterable[int]] 183 | model: str 184 | encoding_format: Literal["float", "base64"] = "float" 185 | dimensions: int | None = None # not used. 186 | user: str | None = None # not used. 
187 | 188 | 189 | class Embedding(BaseModel): 190 | object: Literal["embedding"] = "embedding" 191 | embedding: list[float] | bytes 192 | index: int 193 | 194 | 195 | class EmbeddingsUsage(BaseModel): 196 | prompt_tokens: int 197 | total_tokens: int 198 | 199 | 200 | class EmbeddingsResponse(BaseModel): 201 | object: Literal["list"] = "list" 202 | data: list[Embedding] 203 | model: str 204 | usage: EmbeddingsUsage 205 | 206 | 207 | class ErrorMessage(BaseModel): 208 | message: str 209 | 210 | 211 | class Error(BaseModel): 212 | error: ErrorMessage 213 | -------------------------------------------------------------------------------- /deployment/BedrockProxy.template: -------------------------------------------------------------------------------- 1 | Description: Bedrock Access Gateway - OpenAI-compatible RESTful APIs for Amazon Bedrock (API Gateway + Lambda with Streaming) 2 | Parameters: 3 | ApiKeySecretArn: 4 | Type: String 5 | AllowedPattern: ^arn:aws:secretsmanager:.*$ 6 | Description: The secret ARN in Secrets Manager used to store the API Key 7 | ContainerImageUri: 8 | Type: String 9 | Description: The ECR image URI for the Lambda function (e.g., 123456789012.dkr.ecr.us-east-1.amazonaws.com/bedrock-proxy-api:latest) 10 | DefaultModelId: 11 | Type: String 12 | Default: anthropic.claude-3-sonnet-20240229-v1:0 13 | Description: The default model ID, please make sure the model ID is supported in the current region 14 | EnablePromptCaching: 15 | Type: String 16 | Default: "false" 17 | AllowedValues: 18 | - "true" 19 | - "false" 20 | Description: Enable prompt caching for supported models (Claude, Nova). When enabled, adds cachePoint to system prompts and messages for cost savings. 21 | Resources: 22 | # IAM Role for Lambda 23 | ProxyApiHandlerServiceRole: 24 | Type: AWS::IAM::Role 25 | Properties: 26 | AssumeRolePolicyDocument: 27 | Statement: 28 | - Action: sts:AssumeRole 29 | Effect: Allow 30 | Principal: 31 | Service: lambda.amazonaws.com 32 | Version: "2012-10-17" 33 | ManagedPolicyArns: 34 | - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" 35 | 36 | ProxyApiHandlerServiceRoleDefaultPolicy: 37 | Type: AWS::IAM::Policy 38 | Properties: 39 | PolicyDocument: 40 | Statement: 41 | - Action: 42 | - bedrock:ListFoundationModels 43 | - bedrock:ListInferenceProfiles 44 | Effect: Allow 45 | Resource: "*" 46 | - Action: 47 | - bedrock:InvokeModel 48 | - bedrock:InvokeModelWithResponseStream 49 | Effect: Allow 50 | Resource: 51 | - arn:aws:bedrock:*::foundation-model/* 52 | - arn:aws:bedrock:*:*:inference-profile/* 53 | - arn:aws:bedrock:*:*:application-inference-profile/* 54 | - Action: 55 | - secretsmanager:GetSecretValue 56 | - secretsmanager:DescribeSecret 57 | Effect: Allow 58 | Resource: !Ref ApiKeySecretArn 59 | Version: "2012-10-17" 60 | PolicyName: ProxyApiHandlerServiceRoleDefaultPolicy 61 | Roles: 62 | - !Ref ProxyApiHandlerServiceRole 63 | 64 | # Lambda Function with Lambda Web Adapter for streaming 65 | ProxyApiHandler: 66 | Type: AWS::Lambda::Function 67 | Properties: 68 | Architectures: 69 | - arm64 70 | Code: 71 | ImageUri: !Ref ContainerImageUri 72 | Description: Bedrock Proxy API Handler with Response Streaming 73 | Environment: 74 | Variables: 75 | # Lambda Web Adapter settings 76 | AWS_LWA_INVOKE_MODE: RESPONSE_STREAM 77 | AWS_LWA_READINESS_CHECK_PATH: /health 78 | AWS_LWA_ASYNC_INIT: "true" 79 | PORT: "8080" 80 | # Application settings 81 | DEBUG: "false" 82 | API_KEY_SECRET_ARN: !Ref ApiKeySecretArn 83 | DEFAULT_MODEL: !Ref 
DefaultModelId 84 | DEFAULT_EMBEDDING_MODEL: cohere.embed-multilingual-v3 85 | ENABLE_CROSS_REGION_INFERENCE: "true" 86 | ENABLE_APPLICATION_INFERENCE_PROFILES: "true" 87 | ENABLE_PROMPT_CACHING: !Ref EnablePromptCaching 88 | API_ROUTE_PREFIX: /v1 89 | MemorySize: 1024 90 | PackageType: Image 91 | Role: !GetAtt ProxyApiHandlerServiceRole.Arn 92 | Timeout: 600 93 | DependsOn: 94 | - ProxyApiHandlerServiceRoleDefaultPolicy 95 | - ProxyApiHandlerServiceRole 96 | 97 | # API Gateway REST API (Regional) 98 | RestApi: 99 | Type: AWS::ApiGateway::RestApi 100 | Properties: 101 | Name: BedrockProxyApi 102 | Description: Bedrock Access Gateway - OpenAI-compatible API with streaming support 103 | EndpointConfiguration: 104 | Types: 105 | - REGIONAL 106 | Body: 107 | openapi: "3.0.1" 108 | info: 109 | title: BedrockProxyApi 110 | version: "1.0" 111 | paths: 112 | /{proxy+}: 113 | x-amazon-apigateway-any-method: 114 | parameters: 115 | - name: proxy 116 | in: path 117 | required: true 118 | schema: 119 | type: string 120 | x-amazon-apigateway-integration: 121 | type: aws_proxy 122 | httpMethod: POST 123 | uri: !Sub "arn:aws:apigateway:${AWS::Region}:lambda:path/2021-11-15/functions/${ProxyApiHandler.Arn}/response-streaming-invocations" 124 | passthroughBehavior: when_no_match 125 | timeoutInMillis: 600000 126 | responseTransferMode: STREAM 127 | responses: 128 | default: 129 | description: Default response 130 | /: 131 | x-amazon-apigateway-any-method: 132 | x-amazon-apigateway-integration: 133 | type: aws_proxy 134 | httpMethod: POST 135 | uri: !Sub "arn:aws:apigateway:${AWS::Region}:lambda:path/2021-11-15/functions/${ProxyApiHandler.Arn}/response-streaming-invocations" 136 | passthroughBehavior: when_no_match 137 | timeoutInMillis: 600000 138 | responseTransferMode: STREAM 139 | responses: 140 | default: 141 | description: Default response 142 | 143 | # Lambda Permission for API Gateway 144 | LambdaPermission: 145 | Type: AWS::Lambda::Permission 146 | Properties: 147 | FunctionName: !Ref ProxyApiHandler 148 | Action: lambda:InvokeFunction 149 | Principal: apigateway.amazonaws.com 150 | SourceArn: !Sub "arn:aws:execute-api:${AWS::Region}:${AWS::AccountId}:${RestApi}/*" 151 | 152 | # API Gateway Deployment 153 | ApiDeployment: 154 | Type: AWS::ApiGateway::Deployment 155 | Properties: 156 | RestApiId: !Ref RestApi 157 | DependsOn: 158 | - RestApi 159 | 160 | # API Gateway Stage 161 | ApiStage: 162 | Type: AWS::ApiGateway::Stage 163 | Properties: 164 | RestApiId: !Ref RestApi 165 | DeploymentId: !Ref ApiDeployment 166 | StageName: api 167 | Description: API Stage with streaming support 168 | 169 | Outputs: 170 | APIBaseUrl: 171 | Description: Proxy API Base URL (OPENAI_API_BASE) 172 | Value: !Sub "https://${RestApi}.execute-api.${AWS::Region}.amazonaws.com/api/v1" 173 | RestApiId: 174 | Description: API Gateway REST API ID 175 | Value: !Ref RestApi 176 | LambdaFunctionArn: 177 | Description: Lambda Function ARN 178 | Value: !GetAtt ProxyApiHandler.Arn 179 | -------------------------------------------------------------------------------- /deployment/BedrockProxyFargate.template: -------------------------------------------------------------------------------- 1 | Description: Bedrock Access Gateway - OpenAI-compatible RESTful APIs for Amazon Bedrock 2 | Parameters: 3 | ApiKeySecretArn: 4 | Type: String 5 | AllowedPattern: ^arn:aws:secretsmanager:.*$ 6 | Description: The secret ARN in Secrets Manager used to store the API Key 7 | ContainerImageUri: 8 | Type: String 9 | Description: The ECR image URI for the 
ECS/Fargate task (e.g., 123456789012.dkr.ecr.us-east-1.amazonaws.com/bedrock-proxy-api-ecs:latest) 10 | DefaultModelId: 11 | Type: String 12 | Default: anthropic.claude-3-sonnet-20240229-v1:0 13 | Description: The default model ID, please make sure the model ID is supported in the current region 14 | EnablePromptCaching: 15 | Type: String 16 | Default: "false" 17 | AllowedValues: 18 | - "true" 19 | - "false" 20 | Description: Enable prompt caching for supported models (Claude, Nova). When enabled, adds cachePoint to system prompts and messages for cost savings. 21 | Resources: 22 | VPCB9E5F0B4: 23 | Type: AWS::EC2::VPC 24 | Properties: 25 | CidrBlock: 10.250.0.0/16 26 | EnableDnsHostnames: true 27 | EnableDnsSupport: true 28 | InstanceTenancy: default 29 | Tags: 30 | - Key: Name 31 | Value: BedrockProxyFargate/VPC 32 | VPCPublicSubnet1SubnetB4246D30: 33 | Type: AWS::EC2::Subnet 34 | Properties: 35 | AvailabilityZone: 36 | Fn::Select: 37 | - 0 38 | - Fn::GetAZs: "" 39 | CidrBlock: 10.250.0.0/24 40 | MapPublicIpOnLaunch: true 41 | Tags: 42 | - Key: aws-cdk:subnet-name 43 | Value: Public 44 | - Key: aws-cdk:subnet-type 45 | Value: Public 46 | - Key: Name 47 | Value: BedrockProxyFargate/VPC/PublicSubnet1 48 | VpcId: 49 | Ref: VPCB9E5F0B4 50 | VPCPublicSubnet1RouteTableFEE4B781: 51 | Type: AWS::EC2::RouteTable 52 | Properties: 53 | Tags: 54 | - Key: Name 55 | Value: BedrockProxyFargate/VPC/PublicSubnet1 56 | VpcId: 57 | Ref: VPCB9E5F0B4 58 | VPCPublicSubnet1RouteTableAssociation0B0896DC: 59 | Type: AWS::EC2::SubnetRouteTableAssociation 60 | Properties: 61 | RouteTableId: 62 | Ref: VPCPublicSubnet1RouteTableFEE4B781 63 | SubnetId: 64 | Ref: VPCPublicSubnet1SubnetB4246D30 65 | VPCPublicSubnet1DefaultRoute91CEF279: 66 | Type: AWS::EC2::Route 67 | Properties: 68 | DestinationCidrBlock: 0.0.0.0/0 69 | GatewayId: 70 | Ref: VPCIGWB7E252D3 71 | RouteTableId: 72 | Ref: VPCPublicSubnet1RouteTableFEE4B781 73 | DependsOn: 74 | - VPCVPCGW99B986DC 75 | VPCPublicSubnet2Subnet74179F39: 76 | Type: AWS::EC2::Subnet 77 | Properties: 78 | AvailabilityZone: 79 | Fn::Select: 80 | - 1 81 | - Fn::GetAZs: "" 82 | CidrBlock: 10.250.1.0/24 83 | MapPublicIpOnLaunch: true 84 | Tags: 85 | - Key: aws-cdk:subnet-name 86 | Value: Public 87 | - Key: aws-cdk:subnet-type 88 | Value: Public 89 | - Key: Name 90 | Value: BedrockProxyFargate/VPC/PublicSubnet2 91 | VpcId: 92 | Ref: VPCB9E5F0B4 93 | VPCPublicSubnet2RouteTable6F1A15F1: 94 | Type: AWS::EC2::RouteTable 95 | Properties: 96 | Tags: 97 | - Key: Name 98 | Value: BedrockProxyFargate/VPC/PublicSubnet2 99 | VpcId: 100 | Ref: VPCB9E5F0B4 101 | VPCPublicSubnet2RouteTableAssociation5A808732: 102 | Type: AWS::EC2::SubnetRouteTableAssociation 103 | Properties: 104 | RouteTableId: 105 | Ref: VPCPublicSubnet2RouteTable6F1A15F1 106 | SubnetId: 107 | Ref: VPCPublicSubnet2Subnet74179F39 108 | VPCPublicSubnet2DefaultRouteB7481BBA: 109 | Type: AWS::EC2::Route 110 | Properties: 111 | DestinationCidrBlock: 0.0.0.0/0 112 | GatewayId: 113 | Ref: VPCIGWB7E252D3 114 | RouteTableId: 115 | Ref: VPCPublicSubnet2RouteTable6F1A15F1 116 | DependsOn: 117 | - VPCVPCGW99B986DC 118 | VPCIGWB7E252D3: 119 | Type: AWS::EC2::InternetGateway 120 | Properties: 121 | Tags: 122 | - Key: Name 123 | Value: BedrockProxyFargate/VPC 124 | VPCVPCGW99B986DC: 125 | Type: AWS::EC2::VPCGatewayAttachment 126 | Properties: 127 | InternetGatewayId: 128 | Ref: VPCIGWB7E252D3 129 | VpcId: 130 | Ref: VPCB9E5F0B4 131 | ProxyExecRole6947A5BE: 132 | Type: AWS::IAM::Role 133 | Properties: 134 | AssumeRolePolicyDocument: 135 | 
Statement: 136 | - Action: sts:AssumeRole 137 | Effect: Allow 138 | Principal: 139 | Service: ecs-tasks.amazonaws.com 140 | Version: "2012-10-17" 141 | ProxyExecRoleDefaultPolicyED41DFE7: 142 | Type: AWS::IAM::Policy 143 | Properties: 144 | PolicyDocument: 145 | Statement: 146 | - Action: 147 | - logs:CreateLogStream 148 | - logs:PutLogEvents 149 | Effect: Allow 150 | Resource: "*" 151 | - Action: 152 | - secretsmanager:GetSecretValue 153 | - secretsmanager:DescribeSecret 154 | Effect: Allow 155 | Resource: 156 | Ref: ApiKeySecretArn 157 | - Action: 158 | - ecr:BatchCheckLayerAvailability 159 | - ecr:GetDownloadUrlForLayer 160 | - ecr:BatchGetImage 161 | Effect: Allow 162 | Resource: 163 | Fn::Join: 164 | - "" 165 | - - "arn:aws:ecr:" 166 | - Fn::Select: 167 | - 3 168 | - Fn::Split: 169 | - "." 170 | - Fn::Select: 171 | - 0 172 | - Fn::Split: 173 | - "/" 174 | - Ref: ContainerImageUri 175 | - ":" 176 | - Fn::Select: 177 | - 0 178 | - Fn::Split: 179 | - "." 180 | - Fn::Select: 181 | - 0 182 | - Fn::Split: 183 | - "/" 184 | - Ref: ContainerImageUri 185 | - ":repository/" 186 | - Fn::Select: 187 | - 0 188 | - Fn::Split: 189 | - ":" 190 | - Fn::Select: 191 | - 1 192 | - Fn::Split: 193 | - "/" 194 | - Ref: ContainerImageUri 195 | - Action: ecr:GetAuthorizationToken 196 | Effect: Allow 197 | Resource: "*" 198 | Version: "2012-10-17" 199 | PolicyName: ProxyExecRoleDefaultPolicyED41DFE7 200 | Roles: 201 | - Ref: ProxyExecRole6947A5BE 202 | ProxyTaskRole5DB6A540: 203 | Type: AWS::IAM::Role 204 | Properties: 205 | AssumeRolePolicyDocument: 206 | Statement: 207 | - Action: sts:AssumeRole 208 | Effect: Allow 209 | Principal: 210 | Service: ecs-tasks.amazonaws.com 211 | Version: "2012-10-17" 212 | ProxyTaskRoleDefaultPolicy933321B8: 213 | Type: AWS::IAM::Policy 214 | Properties: 215 | PolicyDocument: 216 | Statement: 217 | - Action: 218 | - bedrock:ListFoundationModels 219 | - bedrock:ListInferenceProfiles 220 | Effect: Allow 221 | Resource: "*" 222 | - Action: 223 | - bedrock:InvokeModel 224 | - bedrock:InvokeModelWithResponseStream 225 | Effect: Allow 226 | Resource: 227 | - arn:aws:bedrock:*::foundation-model/* 228 | - arn:aws:bedrock:*:*:inference-profile/* 229 | - arn:aws:bedrock:*:*:application-inference-profile/* 230 | Version: "2012-10-17" 231 | PolicyName: ProxyTaskRoleDefaultPolicy933321B8 232 | Roles: 233 | - Ref: ProxyTaskRole5DB6A540 234 | ProxyBedrockCluster893F4261: 235 | Type: AWS::ECS::Cluster 236 | ProxyBedrockClusterD9C31EFF: 237 | Type: AWS::ECS::ClusterCapacityProviderAssociations 238 | Properties: 239 | CapacityProviders: 240 | - FARGATE 241 | - FARGATE_SPOT 242 | Cluster: 243 | Ref: ProxyBedrockCluster893F4261 244 | DefaultCapacityProviderStrategy: [] 245 | ProxyTaskDef9F2A72E5: 246 | Type: AWS::ECS::TaskDefinition 247 | Properties: 248 | ContainerDefinitions: 249 | - Environment: 250 | - Name: DEBUG 251 | Value: "false" 252 | - Name: DEFAULT_MODEL 253 | Value: 254 | Ref: DefaultModelId 255 | - Name: DEFAULT_EMBEDDING_MODEL 256 | Value: cohere.embed-multilingual-v3 257 | - Name: ENABLE_CROSS_REGION_INFERENCE 258 | Value: "true" 259 | - Name: ENABLE_APPLICATION_INFERENCE_PROFILES 260 | Value: "true" 261 | - Name: ENABLE_PROMPT_CACHING 262 | Value: 263 | Ref: EnablePromptCaching 264 | Essential: true 265 | Image: 266 | Ref: ContainerImageUri 267 | Name: proxy-api 268 | PortMappings: 269 | - ContainerPort: 8080 270 | HostPort: 8080 271 | Protocol: tcp 272 | Secrets: 273 | - Name: API_KEY 274 | ValueFrom: 275 | Fn::Join: 276 | - "" 277 | - - Ref: ApiKeySecretArn 278 | - ":api_key::" 
279 | Cpu: "1024" 280 | ExecutionRoleArn: 281 | Fn::GetAtt: 282 | - ProxyExecRole6947A5BE 283 | - Arn 284 | Family: BedrockProxyFargateProxyTaskDefCD902792 285 | Memory: "2048" 286 | NetworkMode: awsvpc 287 | RequiresCompatibilities: 288 | - FARGATE 289 | RuntimePlatform: 290 | CpuArchitecture: ARM64 291 | OperatingSystemFamily: LINUX 292 | TaskRoleArn: 293 | Fn::GetAtt: 294 | - ProxyTaskRole5DB6A540 295 | - Arn 296 | ProxyApiService8651D882: 297 | Type: AWS::ECS::Service 298 | Properties: 299 | CapacityProviderStrategy: 300 | - CapacityProvider: FARGATE 301 | Weight: 1 302 | Cluster: 303 | Ref: ProxyBedrockCluster893F4261 304 | DeploymentConfiguration: 305 | Alarms: 306 | AlarmNames: [] 307 | Enable: false 308 | Rollback: false 309 | MaximumPercent: 200 310 | MinimumHealthyPercent: 50 311 | DesiredCount: 1 312 | EnableECSManagedTags: false 313 | HealthCheckGracePeriodSeconds: 60 314 | LoadBalancers: 315 | - ContainerName: proxy-api 316 | ContainerPort: 8080 317 | TargetGroupArn: 318 | Ref: ProxyALBListenerTargetsGroup187739FA 319 | NetworkConfiguration: 320 | AwsvpcConfiguration: 321 | AssignPublicIp: ENABLED 322 | SecurityGroups: 323 | - Fn::GetAtt: 324 | - ProxyApiServiceSecurityGroup51EBD9B8 325 | - GroupId 326 | Subnets: 327 | - Ref: VPCPublicSubnet1SubnetB4246D30 328 | - Ref: VPCPublicSubnet2Subnet74179F39 329 | TaskDefinition: 330 | Ref: ProxyTaskDef9F2A72E5 331 | DependsOn: 332 | - ProxyALBListener933E9515 333 | - ProxyALBListenerTargetsGroup187739FA 334 | - ProxyTaskRoleDefaultPolicy933321B8 335 | - ProxyTaskRole5DB6A540 336 | ProxyApiServiceSecurityGroup51EBD9B8: 337 | Type: AWS::EC2::SecurityGroup 338 | Properties: 339 | GroupDescription: BedrockProxyFargate/Proxy/ApiService/SecurityGroup 340 | SecurityGroupEgress: 341 | - CidrIp: 0.0.0.0/0 342 | Description: Allow all outbound traffic by default 343 | IpProtocol: "-1" 344 | VpcId: 345 | Ref: VPCB9E5F0B4 346 | DependsOn: 347 | - ProxyTaskRoleDefaultPolicy933321B8 348 | - ProxyTaskRole5DB6A540 349 | ProxyApiServiceSecurityGroupfromBedrockProxyFargateProxyALBSecurityGroup9C12825880081F8FE2: 350 | Type: AWS::EC2::SecurityGroupIngress 351 | Properties: 352 | Description: Load balancer to target 353 | FromPort: 8080 354 | GroupId: 355 | Fn::GetAtt: 356 | - ProxyApiServiceSecurityGroup51EBD9B8 357 | - GroupId 358 | IpProtocol: tcp 359 | SourceSecurityGroupId: 360 | Fn::GetAtt: 361 | - ProxyALBSecurityGroup0D6CA3DA 362 | - GroupId 363 | ToPort: 8080 364 | DependsOn: 365 | - ProxyTaskRoleDefaultPolicy933321B8 366 | - ProxyTaskRole5DB6A540 367 | ProxyALB87756780: 368 | Type: AWS::ElasticLoadBalancingV2::LoadBalancer 369 | Properties: 370 | LoadBalancerAttributes: 371 | - Key: deletion_protection.enabled 372 | Value: "false" 373 | - Key: idle_timeout.timeout_seconds 374 | Value: "600" 375 | Scheme: internet-facing 376 | SecurityGroups: 377 | - Fn::GetAtt: 378 | - ProxyALBSecurityGroup0D6CA3DA 379 | - GroupId 380 | Subnets: 381 | - Ref: VPCPublicSubnet1SubnetB4246D30 382 | - Ref: VPCPublicSubnet2Subnet74179F39 383 | Type: application 384 | DependsOn: 385 | - VPCPublicSubnet1DefaultRoute91CEF279 386 | - VPCPublicSubnet1RouteTableAssociation0B0896DC 387 | - VPCPublicSubnet2DefaultRouteB7481BBA 388 | - VPCPublicSubnet2RouteTableAssociation5A808732 389 | ProxyALBSecurityGroup0D6CA3DA: 390 | Type: AWS::EC2::SecurityGroup 391 | Properties: 392 | GroupDescription: Automatically created Security Group for ELB BedrockProxyFargateProxyALB481672E7 393 | SecurityGroupIngress: 394 | - CidrIp: 0.0.0.0/0 395 | Description: Allow from anyone on port 80 
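# This ingress entry allows plain HTTP on port 80 from any IPv4 address (0.0.0.0/0).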
396 | FromPort: 80 397 | IpProtocol: tcp 398 | ToPort: 80 399 | VpcId: 400 | Ref: VPCB9E5F0B4 401 | ProxyALBSecurityGrouptoBedrockProxyFargateProxyApiServiceSecurityGroupDDA1C56480393D1E44: 402 | Type: AWS::EC2::SecurityGroupEgress 403 | Properties: 404 | Description: Load balancer to target 405 | DestinationSecurityGroupId: 406 | Fn::GetAtt: 407 | - ProxyApiServiceSecurityGroup51EBD9B8 408 | - GroupId 409 | FromPort: 8080 410 | GroupId: 411 | Fn::GetAtt: 412 | - ProxyALBSecurityGroup0D6CA3DA 413 | - GroupId 414 | IpProtocol: tcp 415 | ToPort: 8080 416 | ProxyALBListener933E9515: 417 | Type: AWS::ElasticLoadBalancingV2::Listener 418 | Properties: 419 | DefaultActions: 420 | - TargetGroupArn: 421 | Ref: ProxyALBListenerTargetsGroup187739FA 422 | Type: forward 423 | LoadBalancerArn: 424 | Ref: ProxyALB87756780 425 | Port: 80 426 | Protocol: HTTP 427 | ProxyALBListenerTargetsGroup187739FA: 428 | Type: AWS::ElasticLoadBalancingV2::TargetGroup 429 | Properties: 430 | HealthCheckEnabled: true 431 | HealthCheckIntervalSeconds: 60 432 | HealthCheckPath: /health 433 | HealthCheckTimeoutSeconds: 30 434 | Port: 8080 435 | Protocol: HTTP 436 | TargetGroupAttributes: 437 | - Key: stickiness.enabled 438 | Value: "false" 439 | TargetType: ip 440 | VpcId: 441 | Ref: VPCB9E5F0B4 442 | Outputs: 443 | APIBaseUrl: 444 | Description: Proxy API Base URL (OPENAI_API_BASE) 445 | Value: 446 | Fn::Join: 447 | - "" 448 | - - http:// 449 | - Fn::GetAtt: 450 | - ProxyALB87756780 451 | - DNSName 452 | - /api/v1 453 | 454 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bedrock Access Gateway 2 | 3 | OpenAI-compatible RESTful APIs for Amazon Bedrock 4 | 5 | ## What's New 🔥 6 | 7 | **API Gateway Response Streaming Support** - You can now deploy with Amazon API Gateway REST API instead of ALB, enabling true response streaming for better latency and cost optimization. See [Deployment Options](#deployment-options) for details. 8 | 9 | **Latest Models Supported:** 10 | - **Claude 4.5 Family**: Opus 4.5, Sonnet 4.5, Haiku 4.5 - Anthropic's most intelligent models with enhanced coding and agent capabilities 11 | - **Amazon Nova**: Nova Micro, Nova Lite, Nova Pro, Nova Premier - Amazon's native foundation models with multimodal support 12 | - **DeepSeek**: DeepSeek-R1 (reasoning), DeepSeek-V3.1 - Advanced reasoning and general-purpose models 13 | - **Qwen 3**: Qwen3-32B, Qwen3-235B, Qwen3-Coder-30B, Qwen3-Coder-480B - Alibaba's latest language and coding models 14 | - **OpenAI OSS**: gpt-oss-20b, gpt-oss-120b - Open-source GPT models available via Bedrock 15 | 16 | It also supports reasoning for **Claude 4/4.5** (extended thinking and interleaved thinking) and **DeepSeek R1**. Check [How to Use](./docs/Usage.md#reasoning) for more details. You need to first run the Models API to refresh the model list. 17 | 18 | ## Overview 19 | 20 | Amazon Bedrock offers a wide range of foundation models (such as Claude 3 Opus/Sonnet/Haiku, Llama 2/3, Mistral/Mixtral, 21 | etc.) and a broad set of capabilities for you to build generative AI applications. Check the [Amazon Bedrock](https://aws.amazon.com/bedrock) landing page for additional information. 22 | 23 | Sometimes, you might have applications developed using OpenAI APIs or SDKs, and you want to experiment with Amazon Bedrock without modifying your codebase. 
Or you may simply wish to evaluate the capabilities of these foundation models in tools like AutoGen etc. Well, this repository allows you to access Amazon Bedrock models seamlessly through OpenAI APIs and SDKs, enabling you to test these models without code changes. 24 | 25 | If you find this GitHub repository useful, please consider giving it a free star ⭐ to show your appreciation and support for the project. 26 | 27 | **Features:** 28 | 29 | - [x] Support streaming response via server-sent events (SSE) 30 | - [x] Support Model APIs 31 | - [x] Support Chat Completion APIs 32 | - [x] Support Tool Call 33 | - [x] Support Embedding API 34 | - [x] Support Multimodal API 35 | - [x] Support Cross-Region Inference 36 | - [x] Support Application Inference Profiles (**new**) 37 | - [x] Support Reasoning (**new**) 38 | - [x] Support Interleaved thinking (**new**) 39 | - [x] Support Prompt Caching (**new**) 40 | 41 | Please check [Usage Guide](./docs/Usage.md) for more details about how to use the new APIs. 42 | 43 | 44 | ## Get Started 45 | 46 | ### Prerequisites 47 | 48 | Please make sure you have met below prerequisites: 49 | 50 | - Access to Amazon Bedrock foundation models. 51 | 52 | > For more information on how to request model access, please refer to the [Amazon Bedrock User Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) (Set Up > Model access) 53 | 54 | ### Architecture 55 | 56 | The following diagram illustrates the reference architecture. It uses [Amazon API Gateway response streaming](https://aws.amazon.com/blogs/compute/building-responsive-apis-with-amazon-api-gateway-response-streaming/) with Lambda for SSE support. 57 | 58 | ![Architecture](assets/arch.png) 59 | 60 | ### Deployment Options 61 | 62 | | Option | Pros | Cons | Best For | 63 | |--------|------|------|----------| 64 | | **API Gateway + Lambda** | No VPC required, pay-per-request, native streaming support, lower operational overhead | Potential cold starts | Most use cases, cost-sensitive deployments | 65 | | **ALB + Fargate** | Lowest streaming latency, no cold starts | Higher cost, requires VPC | High-throughput, latency-sensitive workloads | 66 | 67 | You can also use Lambda Function URL as an alternative, see [example](https://github.com/awslabs/aws-lambda-web-adapter/tree/main/examples/fastapi-response-streaming) 68 | 69 | ### Deployment 70 | 71 | Please follow the steps below to deploy the Bedrock Proxy APIs into your AWS account. Only supports regions where Amazon Bedrock is available (such as `us-west-2`). The deployment will take approximately **10-15 minutes** 🕒. 72 | 73 | **Step 1: Create your own API key in Secrets Manager (MUST)** 74 | 75 | > **Note:** This step is to use any string (without spaces) you like to create a custom API Key (credential) that will be used to access the proxy API later. This key does not have to match your actual OpenAI key, and you don't need to have an OpenAI API key. please keep the key safe and private. 76 | 77 | 1. Open the AWS Management Console and navigate to the AWS Secrets Manager service. 78 | 2. Click on "Store a new secret" button. 79 | 3. In the "Choose secret type" page, select: 80 | 81 | Secret type: Other type of secret 82 | Key/value pairs: 83 | - Key: api_key 84 | - Value: Enter your API key value 85 | 86 | Click "Next" 87 | 4. In the "Configure secret" page: 88 | Secret name: Enter a name (e.g., "BedrockProxyAPIKey") 89 | Description: (Optional) Add a description of your secret 90 | 5. 
Click "Next" and review all your settings and click "Store" 91 | 92 | After creation, you'll see your secret in the Secrets Manager console. Make note of the secret ARN. 93 | 94 | **Step 2: Build and push container images to ECR** 95 | 96 | 1. Clone this repository: 97 | ```bash 98 | git clone https://github.com/aws-samples/bedrock-access-gateway.git 99 | cd bedrock-access-gateway 100 | ``` 101 | 102 | 2. Run the build and push script: 103 | ```bash 104 | cd scripts 105 | bash ./push-to-ecr.sh 106 | ``` 107 | 108 | 3. Follow the prompts to configure: 109 | - ECR repository names (or use defaults) 110 | - Image tag (or use default: `latest`) 111 | - AWS region (or use default: `us-east-1`) 112 | 113 | 4. The script will build and push both Lambda and ECS/Fargate images to your ECR repositories. 114 | 115 | 5. **Important**: Copy the image URIs displayed at the end of the script output. You'll need these in the next step. 116 | 117 | **Step 3: Deploy the CloudFormation stack** 118 | 119 | 1. Download the CloudFormation template you want to use: 120 | - For API Gateway + Lambda: [`deployment/BedrockProxy.template`](deployment/BedrockProxy.template) 121 | - For ALB + Fargate: [`deployment/BedrockProxyFargate.template`](deployment/BedrockProxyFargate.template) 122 | 123 | 2. Sign in to AWS Management Console and navigate to the CloudFormation service in your target region. 124 | 125 | 3. Click "Create stack" → "With new resources (standard)". 126 | 127 | 4. Upload the template file you downloaded. 128 | 129 | 5. On the "Specify stack details" page, provide the following information: 130 | - **Stack name**: Enter a stack name (e.g., "BedrockProxyAPI") 131 | - **ApiKeySecretArn**: Enter the secret ARN from Step 1 132 | - **ContainerImageUri**: Enter the ECR image URI from Step 2 output 133 | - **DefaultModelId**: (Optional) Change the default model if needed 134 | 135 | Click "Next". 136 | 137 | 6. On the "Configure stack options" page, you can leave the default settings or customize them according to your needs. Click "Next". 138 | 139 | 7. On the "Review" page, review all details. Check the "I acknowledge that AWS CloudFormation might create IAM resources" checkbox at the bottom. Click "Submit". 140 | 141 | That is it! 🎉 Once deployed, click the CloudFormation stack and go to **Outputs** tab, you can find the API Base URL from `APIBaseUrl`, the value should look like `http://xxxx.xxx.elb.amazonaws.com/api/v1`. 142 | 143 | ### Troubleshooting 144 | 145 | If you encounter any issues, please check the [Troubleshooting Guide](./docs/Troubleshooting.md) for more details. 146 | 147 | ### SDK/API Usage 148 | 149 | All you need is the API Key and the API Base URL. If you didn't set up your own key following Step 1, the application will fail to start with an error message indicating that the API Key is not configured. 150 | 151 | Now, you can try out the proxy APIs. Let's say you want to test Claude 3 Sonnet model (model ID: `anthropic.claude-3-sonnet-20240229-v1:0`)... 
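If you are unsure which model IDs are enabled in your region, you can list them first with the same credentials. The sketch below uses the OpenAI Python SDK and assumes the `OPENAI_API_KEY` and `OPENAI_BASE_URL` environment variables shown in the next section are already exported.

```python
from openai import OpenAI

client = OpenAI()  # picks up OPENAI_API_KEY and OPENAI_BASE_URL from the environment

# List the Bedrock model IDs exposed by the gateway (same data as the Models API)
for model in client.models.list().data:
    print(model.id)
```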
152 | 153 | **Example API Usage** 154 | 155 | ```bash 156 | export OPENAI_API_KEY= 157 | export OPENAI_BASE_URL= 158 | # For older versions 159 | # https://github.com/openai/openai-python/issues/624 160 | export OPENAI_API_BASE= 161 | ``` 162 | 163 | ```bash 164 | curl $OPENAI_BASE_URL/chat/completions \ 165 | -H "Content-Type: application/json" \ 166 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 167 | -d '{ 168 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 169 | "messages": [ 170 | { 171 | "role": "user", 172 | "content": "Hello!" 173 | } 174 | ] 175 | }' 176 | ``` 177 | 178 | **Example SDK Usage** 179 | 180 | ```python 181 | from openai import OpenAI 182 | 183 | client = OpenAI() 184 | completion = client.chat.completions.create( 185 | model="anthropic.claude-3-sonnet-20240229-v1:0", 186 | messages=[{"role": "user", "content": "Hello!"}], 187 | ) 188 | 189 | print(completion.choices[0].message.content) 190 | ``` 191 | 192 | Please check [Usage Guide](./docs/Usage.md) for more details about how to use embedding API, multimodal API and tool call. 193 | 194 | ### Application Inference Profiles 195 | 196 | This proxy now supports **Application Inference Profiles**, which allow you to track usage and costs for your model invocations. You can use application inference profiles created in your AWS account for cost tracking and monitoring purposes. 197 | 198 | **Using Application Inference Profiles:** 199 | 200 | ```bash 201 | # Use an application inference profile ARN as the model ID 202 | curl $OPENAI_BASE_URL/chat/completions \ 203 | -H "Content-Type: application/json" \ 204 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 205 | -d '{ 206 | "model": "arn:aws:bedrock:us-west-2:123456789012:application-inference-profile/your-profile-id", 207 | "messages": [ 208 | { 209 | "role": "user", 210 | "content": "Hello!" 211 | } 212 | ] 213 | }' 214 | ``` 215 | 216 | **SDK Usage with Application Inference Profiles:** 217 | 218 | ```python 219 | from openai import OpenAI 220 | 221 | client = OpenAI() 222 | completion = client.chat.completions.create( 223 | model="arn:aws:bedrock:us-west-2:123456789012:application-inference-profile/your-profile-id", 224 | messages=[{"role": "user", "content": "Hello!"}], 225 | ) 226 | 227 | print(completion.choices[0].message.content) 228 | ``` 229 | 230 | **Benefits of Application Inference Profiles:** 231 | - **Cost Tracking**: Track usage and costs for specific applications or use cases 232 | - **Usage Monitoring**: Monitor model invocation metrics through CloudWatch 233 | - **Tag-based Cost Allocation**: Use AWS cost allocation tags for detailed billing analysis 234 | 235 | For more information about creating and managing application inference profiles, see the [Amazon Bedrock User Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/inference-profiles-create.html). 236 | 237 | ### Prompt Caching 238 | 239 | This proxy now supports **Prompt Caching** for Claude and Nova models, which can reduce costs by up to 90% and latency by up to 85% for workloads with repeated prompts. 240 | 241 | **Supported Models:** 242 | - Claude models (Claude 3.5 Haiku, Claude 4, Claude 4.5, etc.) 243 | - Nova models (Nova Micro, Nova Lite, Nova Pro, Nova Premier) 244 | 245 | **Enabling Prompt Caching:** 246 | 247 | You can enable prompt caching in two ways: 248 | 249 | 1. **Globally via Environment Variable** (set in ECS Task Definition or Lambda): 250 | ```bash 251 | ENABLE_PROMPT_CACHING=true 252 | ``` 253 | 254 | 2. 
**Per-request via `extra_body`** : 255 | 256 | **Python SDK:** 257 | ```python 258 | from openai import OpenAI 259 | 260 | client = OpenAI() 261 | 262 | # Cache system prompts 263 | response = client.chat.completions.create( 264 | model="global.anthropic.claude-haiku-4-5-20251001-v1:0", 265 | messages=[ 266 | {"role": "system", "content": "You are an expert assistant with knowledge of..."}, 267 | {"role": "user", "content": "Help me with this task"} 268 | ], 269 | extra_body={ 270 | "prompt_caching": {"system": True} 271 | } 272 | ) 273 | 274 | # Check cache hit 275 | if response.usage.prompt_tokens_details: 276 | cached_tokens = response.usage.prompt_tokens_details.cached_tokens 277 | print(f"Cached tokens: {cached_tokens}") 278 | ``` 279 | 280 | **cURL:** 281 | ```bash 282 | curl $OPENAI_BASE_URL/chat/completions \ 283 | -H "Content-Type: application/json" \ 284 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 285 | -d '{ 286 | "model": "global.anthropic.claude-haiku-4-5-20251001-v1:0", 287 | "messages": [ 288 | {"role": "system", "content": "Long system prompt..."}, 289 | {"role": "user", "content": "Question"} 290 | ], 291 | "extra_body": { 292 | "prompt_caching": {"system": true} 293 | } 294 | }' 295 | ``` 296 | 297 | **Cache Options:** 298 | - `"prompt_caching": {"system": true}` - Cache system prompts 299 | - `"prompt_caching": {"messages": true}` - Cache user messages 300 | - `"prompt_caching": {"system": true, "messages": true}` - Cache both 301 | 302 | **Requirements:** 303 | - Prompt must be ≥1,024 tokens to enable caching 304 | - Cache TTL is 5 minutes (resets on each cache hit) 305 | - Nova models have a 20,000 token caching limit 306 | 307 | For more information, see the [Amazon Bedrock Prompt Caching Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html). 308 | 309 | ## Other Examples 310 | 311 | ### LangChain 312 | 313 | Make sure you use `ChatOpenAI(...)` instead of `OpenAI(...)` 314 | 315 | ```python 316 | # pip install langchain-openai 317 | import os 318 | 319 | from langchain.chains import LLMChain 320 | from langchain.prompts import PromptTemplate 321 | from langchain_openai import ChatOpenAI 322 | 323 | chat = ChatOpenAI( 324 | model="anthropic.claude-3-sonnet-20240229-v1:0", 325 | temperature=0, 326 | openai_api_key=os.environ['OPENAI_API_KEY'], 327 | openai_api_base=os.environ['OPENAI_BASE_URL'], 328 | ) 329 | 330 | template = """Question: {question} 331 | 332 | Answer: Let's think step by step.""" 333 | 334 | prompt = PromptTemplate.from_template(template) 335 | llm_chain = LLMChain(prompt=prompt, llm=chat) 336 | 337 | question = "What NFL team won the Super Bowl in the year Justin Beiber was born?" 338 | response = llm_chain.invoke(question) 339 | print(response) 340 | 341 | ``` 342 | 343 | ## FAQs 344 | 345 | ### About Privacy 346 | 347 | This application does not collect any of your data. Furthermore, it does not log any requests or responses by default. 348 | 349 | ### Why choose API Gateway vs ALB? 350 | 351 | **API Gateway + Lambda** uses [API Gateway response streaming](https://aws.amazon.com/blogs/compute/building-responsive-apis-with-amazon-api-gateway-response-streaming/) with [Lambda Web Adapter](https://github.com/awslabs/aws-lambda-web-adapter) to support SSE streaming without requiring a VPC. This is a cost-effective, serverless option with up to 10 minutes timeout. 352 | 353 | **ALB + Fargate** provides the lowest streaming latency with no cold starts, ideal for high-throughput workloads. 
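Both deployment options expose the same OpenAI-compatible streaming interface. As a quick end-to-end check that SSE streaming works, here is a minimal sketch with the OpenAI Python SDK (it assumes the `OPENAI_API_KEY` and `OPENAI_BASE_URL` environment variables from the SDK/API Usage section are set):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY and OPENAI_BASE_URL from the environment

# stream=True makes the gateway return server-sent events; print tokens as they arrive
stream = client.chat.completions.create(
    model="anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[{"role": "user", "content": "Write a haiku about streaming APIs."}],
    stream=True,
)

for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```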
354 | 355 | ### Which regions are supported? 356 | 357 | Generally speaking, all regions that Amazon Bedrock supports will also be supported, if not, please raise an issue in Github. 358 | 359 | Note that not all models are available in those regions. 360 | 361 | ### Which models are supported? 362 | 363 | You can use the [Models API](./docs/Usage.md#models-api) to get/refresh a list of supported models in the current region. 364 | 365 | ### Can I run this locally 366 | 367 | Yes, you can run this locally, e.g. run below command under `src` folder: 368 | 369 | ```bash 370 | uvicorn api.app:app --host 0.0.0.0 --port 8000 371 | ``` 372 | 373 | The API base url should look like `http://localhost:8000/api/v1`. 374 | 375 | ### Any performance sacrifice or latency increase by using the proxy APIs 376 | 377 | Compared with direct AWS SDK calls, the proxy architecture will add some latency. The default API Gateway + Lambda deployment provides good streaming performance with Lambda response streaming. 378 | 379 | For lowest latency on streaming responses, consider the ALB + Fargate deployment option which eliminates cold starts and provides consistent performance. 380 | 381 | ### Any plan to support SageMaker models? 382 | 383 | Currently, there is no plan to support SageMaker models. This may change provided there's a demand from customers. 384 | 385 | ### Any plan to support Bedrock custom models? 386 | 387 | Fine-tuned models and models with Provisioned Throughput are currently not supported. You can clone the repo and make the customization if needed. 388 | 389 | ### How to upgrade? 390 | 391 | To use the latest features, you need follow the deployment guide to redeploy the application. You can upgrade the existing CloudFormation stack to get the latest changes. 392 | 393 | ## Security 394 | 395 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 396 | 397 | ## License 398 | 399 | This library is licensed under the MIT-0 License. See the LICENSE file. 400 | -------------------------------------------------------------------------------- /docs/Usage_CN.md: -------------------------------------------------------------------------------- 1 | [English](./Usage.md) 2 | 3 | # Usage Guide 4 | 5 | 假设您在部署后已设置以下环境变量: 6 | 7 | ```bash 8 | export OPENAI_API_KEY= 9 | export OPENAI_BASE_URL= 10 | ``` 11 | 12 | **API 示例:** 13 | - [Models API](#models-api) 14 | - [Embedding API](#embedding-api) 15 | - [Multimodal API](#multimodal-api) 16 | - [Tool Call](#tool-call) 17 | - [Reasoning](#reasoning) 18 | - [Interleaved thinking (beta)](#Interleaved thinking (beta)) 19 | 20 | 21 | ## Models API 22 | 23 | 你可以通过这个API 获取支持的models 列表。 另外,如果Amazon Bedrock有新模型加入后,你也可以用它来更新刷新模型列表。 24 | 25 | **Request 示例** 26 | 27 | ```bash 28 | curl -s $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | jq .data 29 | ``` 30 | 31 | **Response 示例** 32 | 33 | ```bash 34 | [ 35 | ... 36 | { 37 | "id": "anthropic.claude-3-5-sonnet-20240620-v1:0", 38 | "created": 1734416893, 39 | "object": "model", 40 | "owned_by": "bedrock" 41 | }, 42 | { 43 | "id": "us.anthropic.claude-3-5-sonnet-20240620-v1:0", 44 | "created": 1734416893, 45 | "object": "model", 46 | "owned_by": "bedrock" 47 | }, 48 | ... 
49 | ] 50 | ``` 51 | 52 | ## Chat Completions API 53 | 54 | ### Claude Sonnet 4.5 基础示例 55 | 56 | Claude Sonnet 4.5 是 Anthropic 最智能的模型,在编码、复杂推理和基于代理的任务方面表现出色。它通过全球跨区域推理配置文件提供。 57 | 58 | **Request 示例** 59 | 60 | ```bash 61 | curl $OPENAI_BASE_URL/chat/completions \ 62 | -H "Content-Type: application/json" \ 63 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 64 | -d '{ 65 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 66 | "messages": [ 67 | { 68 | "role": "user", 69 | "content": "编写一个使用动态规划计算斐波那契数列的Python函数。" 70 | } 71 | ] 72 | }' 73 | ``` 74 | 75 | **SDK 使用示例** 76 | 77 | ```python 78 | from openai import OpenAI 79 | 80 | client = OpenAI() 81 | completion = client.chat.completions.create( 82 | model="global.anthropic.claude-sonnet-4-5-20250929-v1:0", 83 | messages=[{"role": "user", "content": "编写一个使用动态规划计算斐波那契数列的Python函数。"}], 84 | ) 85 | 86 | print(completion.choices[0].message.content) 87 | ``` 88 | 89 | ## Embedding API 90 | 91 | **重要**: 在使用此代理 API 之前,请仔细阅读以下几点: 92 | 93 | 1. 如果您之前使用 OpenAI Embedding模型来创建向量,请注意切换到新模型可能没有那么直接。不同模型具有不同的维度(例如,embed-multilingual-v3.0 有 1024 个维度),即使对于相同的文本,它们也可能产生不同的结果。 94 | 2. 如果您使用 OpenAI Embedding模型传入的是整数编码(例如与 LangChain 一起使用),此方案将尝试使用 `tiktoken` 进行解码以检索原始文本。但是,无法保证解码后的文本准确无误。 95 | 3. 如果您对长文本使用 OpenAI Embedding,您应该验证 Bedrock 模型支持的最大Token数,例如为获得最佳性能,Bedrock 建议将文本长度限制在少于 512 个Token。 96 | 97 | **Request 示例** 98 | 99 | ```bash 100 | curl $OPENAI_BASE_URL/embeddings \ 101 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 102 | -H "Content-Type: application/json" \ 103 | -d '{ 104 | "input": "The food was delicious and the waiter...", 105 | "model": "text-embedding-ada-002", 106 | "encoding_format": "float" 107 | }' 108 | ``` 109 | 110 | **Response 示例** 111 | 112 | ```json 113 | { 114 | "object": "list", 115 | "data": [ 116 | { 117 | "object": "embedding", 118 | "embedding": [ 119 | -0.02279663, 120 | -0.024612427, 121 | 0.012863159, 122 | ... 123 | 0.01612854, 124 | 0.0038928986 125 | ], 126 | "index": 0 127 | } 128 | ], 129 | "model": "cohere.embed-multilingual-v3", 130 | "usage": { 131 | "prompt_tokens": 0, 132 | "total_tokens": 0 133 | } 134 | } 135 | ``` 136 | 137 | 或者你可以使用OpenAI 的SDK 138 | 139 | ```python 140 | from openai import OpenAI 141 | 142 | client = OpenAI() 143 | 144 | def get_embedding(text, model="text-embedding-3-small"): 145 | text = text.replace("\n", " ") 146 | return client.embeddings.create(input=[text], model=model).data[0].embedding 147 | 148 | text = "hello" 149 | # will output like [0.003578186, 0.028717041, 0.031021118, -0.0014066696,...] 150 | print(get_embedding(text)) 151 | ``` 152 | 153 | 或者 LangChain 154 | 155 | ```python 156 | from langchain_openai import OpenAIEmbeddings 157 | 158 | embeddings = OpenAIEmbeddings( 159 | model="text-embedding-3-large", 160 | ) 161 | text = "This is a test document." 
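# embed_query returns a single embedding vector; embed_documents returns one vector per input document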
162 | query_result = embeddings.embed_query(text) 163 | print(query_result[:5]) 164 | doc_result = embeddings.embed_documents([text]) 165 | print(doc_result[0][:5]) 166 | ``` 167 | 168 | ## Multimodal API 169 | 170 | **Request 示例** 171 | 172 | ```bash 173 | curl $OPENAI_BASE_URL/chat/completions \ 174 | -H "Content-Type: application/json" \ 175 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 176 | -d '{ 177 | "model": "gpt-3.5-turbo", 178 | "messages": [ 179 | { 180 | "role": "user", 181 | "content": [ 182 | { 183 | "type": "text", 184 | "text": "please identify and count all the objects in this images, list all the names" 185 | }, 186 | { 187 | "type": "image_url", 188 | "image_url": { 189 | "url": "https://github.com/aws-samples/bedrock-access-gateway/blob/main/assets/obj-detect.png?raw=true" 190 | } 191 | } 192 | ] 193 | } 194 | ] 195 | }' 196 | ``` 197 | 198 | 如果您需要使用此API处理非公开图像,您可以先对图像进行base64编码,然后传递编码后的字符串。 199 | 将"image/jpeg"替换为实际的内容类型(content type)。目前仅支持"image/jpeg"、"image/png"、"image/gif"或"image/webp"。 200 | 201 | ```bash 202 | curl $OPENAI_BASE_URL/chat/completions \ 203 | -H "Content-Type: application/json" \ 204 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 205 | -d '{ 206 | "model": "gpt-3.5-turbo", 207 | "messages": [ 208 | { 209 | "role": "user", 210 | "content": [ 211 | { 212 | "type": "text", 213 | "text": "please identify and count all the objects in this images, list all the names" 214 | }, 215 | { 216 | "type": "image_url", 217 | "image_url": { 218 | "url": "data:image/jpeg;base64," 219 | } 220 | } 221 | ] 222 | } 223 | ] 224 | }' 225 | ``` 226 | 227 | **Response 示例** 228 | 229 | ```json 230 | { 231 | "id": "msg_01BY3wcz41x7XrKhxY3VzWke", 232 | "created": 1712543069, 233 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 234 | "system_fingerprint": "fp", 235 | "choices": [ 236 | { 237 | "index": 0, 238 | "finish_reason": "stop", 239 | "message": { 240 | "role": "assistant", 241 | "content": "The image contains the following objects:\n\n1. A peach-colored short-sleeve button-up shirt\n2. An olive green plaid long coat/jacket\n3. A pair of white sneakers or canvas shoes\n4. A brown shoulder bag or purse\n5. A makeup brush or cosmetic applicator\n6. A tube or container (possibly lipstick or lip balm)\n7. A pair of sunglasses\n8. A thought bubble icon\n9. A footprint icon\n10. A leaf or plant icon\n11. A flower icon\n12. A cloud icon\n\nIn total, there are 12 distinct objects depicted in the illustrated scene." 242 | } 243 | } 244 | ], 245 | "object": "chat.completion", 246 | "usage": { 247 | "prompt_tokens": 197, 248 | "completion_tokens": 147, 249 | "total_tokens": 344 250 | } 251 | } 252 | ``` 253 | 254 | 255 | ## Tool Call 256 | 257 | **重要**:在使用此代理API进行Tool Call之前,请仔细阅读以下几点: 258 | 259 | 1. OpenAI 已经废弃使用Function Call,而推荐使用Tool Call,因此Function Call在此处不受支持,您应该改为Tool Call。 260 | 261 | **Request 示例** 262 | 263 | ```bash 264 | curl $OPENAI_BASE_URL/chat/completions \ 265 | -H "Content-Type: application/json" \ 266 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 267 | -d '{ 268 | "model": "gpt-3.5-turbo", 269 | "messages": [ 270 | { 271 | "role": "user", 272 | "content": "What is the weather like in Shanghai today?" 273 | } 274 | ], 275 | "tools": [ 276 | { 277 | "type": "function", 278 | "function": { 279 | "name": "get_current_weather", 280 | "description": "Get the current weather in a given location", 281 | "parameters": { 282 | "type": "object", 283 | "properties": { 284 | "location": { 285 | "type": "string", 286 | "description": "The city or state which is required." 
287 | }, 288 | "unit": { 289 | "type": "string", 290 | "enum": [ 291 | "celsius", 292 | "fahrenheit" 293 | ] 294 | } 295 | }, 296 | "required": [ 297 | "location" 298 | ] 299 | } 300 | } 301 | }, 302 | { 303 | "type": "function", 304 | "function": { 305 | "name": "get_current_location", 306 | "description": "Use this tool to get the current location if user does not provide a location", 307 | "parameters": { 308 | "type": "object", 309 | "properties": {} 310 | } 311 | } 312 | } 313 | ], 314 | "tool_choice": "auto" 315 | }' 316 | ``` 317 | 318 | **Response 示例** 319 | 320 | ```json 321 | { 322 | "id": "msg_01PjrKDWhYGsrTNdeqzWd6D9", 323 | "created": 1712543689, 324 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 325 | "system_fingerprint": "fp", 326 | "choices": [ 327 | { 328 | "index": 0, 329 | "finish_reason": "stop", 330 | "message": { 331 | "role": "assistant", 332 | "tool_calls": [ 333 | { 334 | "id": "0", 335 | "type": "function", 336 | "function": { 337 | "name": "get_current_weather", 338 | "arguments": "{\"location\": \"Shanghai\", \"unit\": \"celsius\"}" 339 | } 340 | } 341 | ] 342 | } 343 | } 344 | ], 345 | "object": "chat.completion", 346 | "usage": { 347 | "prompt_tokens": 256, 348 | "completion_tokens": 64, 349 | "total_tokens": 320 350 | } 351 | } 352 | ``` 353 | 354 | You can try it with different questions, such as: 355 | 1. Hello, who are you? (No tools are needed) 356 | 2. What is the weather like today? (Should use get_current_location tool first) 357 | 358 | ## Reasoning 359 | 360 | 361 | **重要**: 使用此 reasoning 推理模式前,请仔细阅读以下要点。 362 | 363 | - 目前仅 Claude 3.7 Sonnet / Deepseek R1 模型支持推理功能。使用前请确保所用模型支持推理。 364 | - Claude 3.7 Sonnet 推理模式(或思考模式)默认未启用,您必须在请求中传递额外的 reasoning_effort 参数,参数值可选:low,medium, high。另外,请在请求中提供正确的 max_tokens(或 max_completion_tokens)参数。budget_tokens 基于 reasoning_effort 设置(低:30%,中:60%,高:100% 的max tokens),确保最小 budget_tokens 为 1,024,Anthropic 建议至少使用 4,000 个令牌以获得全面的推理。详情请参阅 [Bedrock Document](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-37.html)。 365 | - Deepseek R1 会自动使用推理模式,不需要在中传递额外的 reasoning_effort 参数(否则会报错) 366 | - 推理结果(思维链结果、思考过程)被添加到名为 'reasoning_content' 的额外标签中,这不是 OpenAI 官方支持的格式。此设计遵循 [Deepseek Reasoning Model](https://api-docs.deepseek.com/guides/reasoning_model#api-example) 的规范。未来可能会有所变动。 367 | 368 | **Request 示例** 369 | 370 | - Claude 3.7 Sonnet 371 | 372 | ```bash 373 | curl $OPENAI_BASE_URL/chat/completions \ 374 | -H "Content-Type: application/json" \ 375 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 376 | -d '{ 377 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 378 | "messages": [ 379 | { 380 | "role": "user", 381 | "content": "which one is bigger, 3.9 or 3.11?" 382 | } 383 | ], 384 | "max_completion_tokens": 4096, 385 | "reasoning_effort": "low", 386 | "stream": false 387 | }' 388 | ``` 389 | 390 | - DeepSeek R1 391 | 392 | ```bash 393 | curl $OPENAI_BASE_URL/chat/completions \ 394 | -H "Content-Type: application/json" \ 395 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 396 | -d '{ 397 | "model": "us.deepseek.r1-v1:0", 398 | "messages": [ 399 | { 400 | "role": "user", 401 | "content": "which one is bigger, 3.9 or 3.11?" 
402 | } 403 | ], 404 | "stream": false 405 | }' 406 | ``` 407 | 408 | 409 | **Response 示例** 410 | 411 | ```json 412 | { 413 | "id": "chatcmpl-83fb7a88", 414 | "created": 1740545278, 415 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 416 | "system_fingerprint": "fp", 417 | "choices": [ 418 | { 419 | "index": 0, 420 | "finish_reason": "stop", 421 | "logprobs": null, 422 | "message": { 423 | "role": "assistant", 424 | "content": "3.9 is bigger than 3.11.\n\nWhen comparing decimal numbers, we need to understand what these numbers actually represent:...", 425 | "reasoning_content": "I need to compare the decimal numbers 3.9 and 3.11.\n\nFor decimal numbers, we first compare the whole number parts, and if they're equal, we compare the decimal parts. \n\nBoth numbers ..." 426 | } 427 | } 428 | ], 429 | "object": "chat.completion", 430 | "usage": { 431 | "prompt_tokens": 51, 432 | "completion_tokens": 565, 433 | "total_tokens": 616 434 | } 435 | } 436 | ``` 437 | 438 | 或者使用 OpenAI SDK (请先运行`pip3 install -U openai` 升级到最新版本) 439 | 440 | - Non-Streaming 441 | 442 | ```python 443 | from openai import OpenAI 444 | client = OpenAI() 445 | 446 | messages = [{"role": "user", "content": "which one is bigger, 3.9 or 3.11?"}] 447 | response = client.chat.completions.create( 448 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 449 | messages=messages, 450 | reasoning_effort="low", 451 | max_completion_tokens=4096, 452 | ) 453 | 454 | reasoning_content = response.choices[0].message.reasoning_content 455 | content = response.choices[0].message.content 456 | ``` 457 | 458 | - Streaming 459 | 460 | ```python 461 | from openai import OpenAI 462 | client = OpenAI() 463 | 464 | messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] 465 | response = client.chat.completions.create( 466 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 467 | messages=messages, 468 | reasoning_effort="low", 469 | max_completion_tokens=4096, 470 | stream=True, 471 | ) 472 | 473 | reasoning_content = "" 474 | content = "" 475 | 476 | for chunk in response: 477 | if hasattr(chunk.choices[0].delta, 'reasoning_content') and chunk.choices[0].delta.reasoning_content: 478 | reasoning_content += chunk.choices[0].delta.reasoning_content 479 | elif chunk.choices[0].delta.content: 480 | content += chunk.choices[0].delta.content 481 | ``` 482 | 483 | ## Interleaved thinking (beta) 484 | 485 | **重要提示**:在使用 Chat Completion API 的推理模式(reasoning mode)前,请务必仔细阅读以下内容。 486 | 487 | Claude 4 模型支持借助工具使用的扩展思维功能(Extended Thinking),其中包含交错思考([interleaved thinking](https://docs.aws.amazon.com/bedrock/latest/userguide/claude-messages-extended-thinking.html#claude-messages-extended-thinking-tool-use-interleaved) )。该功能使 Claude 4 可以在多次调用工具之间进行思考,并在收到工具结果后执行更复杂的推理,这对处理更复杂的 Agentic AI 交互非常有帮助。 488 | 489 | 在交错思考模式下,budget_tokens 可以超过 max_tokens 参数,因为它代表一次助手回合中所有思考块的总 Token 预算。 490 | 491 | **支持的模型**: Claude Sonnet 4, Claude Sonnet 4.5 492 | 493 | **Request 示例** 494 | 495 | - Non-Streaming (Claude Sonnet 4.5) 496 | 497 | ```bash 498 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 499 | -H "Content-Type: application/json" \ 500 | -H "Authorization: Bearer bedrock" \ 501 | -d '{ 502 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 503 | "max_tokens": 2048, 504 | "messages": [{ 505 | "role": "user", 506 | "content": "解释如何实现一个具有自平衡功能的二叉搜索树。" 507 | }], 508 | "extra_body": { 509 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 510 | "thinking": {"type": "enabled", "budget_tokens": 4096} 511 | } 512 | }' 513 
| ``` 514 | 515 | - Non-Streaming (Claude Sonnet 4) 516 | 517 | ```bash 518 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 519 | -H "Content-Type: application/json" \ 520 | -H "Authorization: Bearer bedrock" \ 521 | -d '{ 522 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 523 | "max_tokens": 2048, 524 | "messages": [{ 525 | "role": "user", 526 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 527 | }], 528 | "extra_body": { 529 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 530 | "thinking": {"type": "enabled", "budget_tokens": 4096} 531 | } 532 | }' 533 | ``` 534 | 535 | - Streaming (Claude Sonnet 4.5) 536 | 537 | ```bash 538 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 539 | -H "Content-Type: application/json" \ 540 | -H "Authorization: Bearer bedrock" \ 541 | -d '{ 542 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 543 | "max_tokens": 2048, 544 | "messages": [{ 545 | "role": "user", 546 | "content": "解释如何实现一个具有自平衡功能的二叉搜索树。" 547 | }], 548 | "stream": true, 549 | "extra_body": { 550 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 551 | "thinking": {"type": "enabled", "budget_tokens": 4096} 552 | } 553 | }' 554 | ``` 555 | 556 | - Streaming (Claude Sonnet 4) 557 | 558 | ```bash 559 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 560 | -H "Content-Type: application/json" \ 561 | -H "Authorization: Bearer bedrock" \ 562 | -d '{ 563 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 564 | "max_tokens": 2048, 565 | "messages": [{ 566 | "role": "user", 567 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 568 | }], 569 | "stream": true, 570 | "extra_body": { 571 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 572 | "thinking": {"type": "enabled", "budget_tokens": 4096} 573 | } 574 | }' 575 | ``` 576 | -------------------------------------------------------------------------------- /docs/Usage.md: -------------------------------------------------------------------------------- 1 | [中文](./Usage_CN.md) 2 | 3 | # Usage Guide 4 | 5 | Assuming you have set up below environment variables after deployed: 6 | 7 | ```bash 8 | export OPENAI_API_KEY= 9 | export OPENAI_BASE_URL= 10 | ``` 11 | 12 | **API Example:** 13 | - [Models API](#models-api) 14 | - [Embedding API](#embedding-api) 15 | - [Multimodal API](#multimodal-api) 16 | - [Tool Call](#tool-call) 17 | - [Reasoning](#reasoning) 18 | - [Interleaved thinking (beta)](#Interleaved thinking (beta)) 19 | 20 | ## Models API 21 | 22 | You can use this API to get a list of supported model IDs. 23 | 24 | Also, you can use this API to refresh the model list if new models are added to Amazon Bedrock. 25 | 26 | 27 | **Example Request** 28 | 29 | ```bash 30 | curl -s $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | jq .data 31 | ``` 32 | 33 | **Example Response** 34 | 35 | ```bash 36 | [ 37 | ... 38 | { 39 | "id": "anthropic.claude-3-5-sonnet-20240620-v1:0", 40 | "created": 1734416893, 41 | "object": "model", 42 | "owned_by": "bedrock" 43 | }, 44 | { 45 | "id": "us.anthropic.claude-3-5-sonnet-20240620-v1:0", 46 | "created": 1734416893, 47 | "object": "model", 48 | "owned_by": "bedrock" 49 | }, 50 | ... 
51 | ] 52 | ``` 53 | 54 | ## Chat Completions API 55 | 56 | ### Basic Example with Claude Sonnet 4.5 57 | 58 | Claude Sonnet 4.5 is Anthropic's most intelligent model, excelling at coding, complex reasoning, and agent-based tasks. It's available via global cross-region inference profiles. 59 | 60 | **Example Request** 61 | 62 | ```bash 63 | curl $OPENAI_BASE_URL/chat/completions \ 64 | -H "Content-Type: application/json" \ 65 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 66 | -d '{ 67 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 68 | "messages": [ 69 | { 70 | "role": "user", 71 | "content": "Write a Python function to calculate the Fibonacci sequence using dynamic programming." 72 | } 73 | ] 74 | }' 75 | ``` 76 | 77 | **Example SDK Usage** 78 | 79 | ```python 80 | from openai import OpenAI 81 | 82 | client = OpenAI() 83 | completion = client.chat.completions.create( 84 | model="global.anthropic.claude-sonnet-4-5-20250929-v1:0", 85 | messages=[{"role": "user", "content": "Write a Python function to calculate the Fibonacci sequence using dynamic programming."}], 86 | ) 87 | 88 | print(completion.choices[0].message.content) 89 | ``` 90 | 91 | ## Embedding API 92 | 93 | **Important Notice**: Please carefully review the following points before using this proxy API for embedding. 94 | 95 | 1. If you have previously used OpenAI embedding models to create vectors, be aware that switching to a new model may not be straightforward. Different models have varying dimensions (e.g., embed-multilingual-v3.0 has 1024 dimensions), and even for the same text, they may produce different results. 96 | 2. If you are using OpenAI embedding models for encoded integers (such as with LangChain), this solution will attempt to decode the integers using `tiktoken` to retrieve the original text. However, there is no guarantee that the decoded text will be accurate. 97 | 3. If you are using OpenAI embedding models for long texts, you should verify the maximum number of tokens supported for Bedrock models, e.g. for optimal performance, Bedrock recommends limiting the text length to less than 512 tokens. 98 | 99 | 100 | **Example Request** 101 | 102 | ```bash 103 | curl $OPENAI_BASE_URL/embeddings \ 104 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 105 | -H "Content-Type: application/json" \ 106 | -d '{ 107 | "input": "The food was delicious and the waiter...", 108 | "model": "text-embedding-ada-002", 109 | "encoding_format": "float" 110 | }' 111 | ``` 112 | 113 | **Example Response** 114 | 115 | ```json 116 | { 117 | "object": "list", 118 | "data": [ 119 | { 120 | "object": "embedding", 121 | "embedding": [ 122 | -0.02279663, 123 | -0.024612427, 124 | 0.012863159, 125 | ... 126 | 0.01612854, 127 | 0.0038928986 128 | ], 129 | "index": 0 130 | } 131 | ], 132 | "model": "cohere.embed-multilingual-v3", 133 | "usage": { 134 | "prompt_tokens": 0, 135 | "total_tokens": 0 136 | } 137 | } 138 | ``` 139 | 140 | Alternatively, you can use the OpenAI SDK 141 | 142 | ```python 143 | from openai import OpenAI 144 | 145 | client = OpenAI() 146 | 147 | def get_embedding(text, model="text-embedding-3-small"): 148 | text = text.replace("\n", " ") 149 | return client.embeddings.create(input=[text], model=model).data[0].embedding 150 | 151 | text = "hello" 152 | # will output like [0.003578186, 0.028717041, 0.031021118, -0.0014066696,...] 
153 | print(get_embedding(text)) 154 | ``` 155 | 156 | Or LangChain 157 | 158 | ```python 159 | from langchain_openai import OpenAIEmbeddings 160 | 161 | embeddings = OpenAIEmbeddings( 162 | model="text-embedding-3-large", 163 | ) 164 | text = "This is a test document." 165 | query_result = embeddings.embed_query(text) 166 | print(query_result[:5]) 167 | doc_result = embeddings.embed_documents([text]) 168 | print(doc_result[0][:5]) 169 | ``` 170 | 171 | ## Multimodal API 172 | 173 | **Example Request** 174 | 175 | ```bash 176 | curl $OPENAI_BASE_URL/chat/completions \ 177 | curl $OPENAI_BASE_URL/chat/completions \ 178 | -H "Content-Type: application/json" \ 179 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 180 | -d '{ 181 | "model": "gpt-3.5-turbo", 182 | "messages": [ 183 | { 184 | "role": "user", 185 | "content": [ 186 | { 187 | "type": "text", 188 | "text": "please identify and count all the objects in these images, list all the names" 189 | }, 190 | { 191 | "type": "image_url", 192 | "image_url": { 193 | "url": "https://github.com/aws-samples/bedrock-access-gateway/blob/main/assets/obj-detect.png?raw=true" 194 | } 195 | } 196 | ] 197 | } 198 | ] 199 | }' 200 | ``` 201 | 202 | If you need to use this API with non-public images, you can do base64 the image first and pass the encoded string. 203 | Replace `image/jpeg` with the actual content type. Currently, only 'image/jpeg', 'image/png', 'image/gif' or 'image/webp' is supported. 204 | 205 | ```bash 206 | curl $OPENAI_BASE_URL/chat/completions \ 207 | -H "Content-Type: application/json" \ 208 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 209 | -d '{ 210 | "model": "gpt-3.5-turbo", 211 | "messages": [ 212 | { 213 | "role": "user", 214 | "content": [ 215 | { 216 | "type": "text", 217 | "text": "please identify and count all the objects in this images, list all the names" 218 | }, 219 | { 220 | "type": "image_url", 221 | "image_url": { 222 | "url": "data:image/jpeg;base64," 223 | } 224 | } 225 | ] 226 | } 227 | ] 228 | }' 229 | ``` 230 | 231 | **Example Response** 232 | 233 | ```json 234 | { 235 | "id": "msg_01BY3wcz41x7XrKhxY3VzWke", 236 | "created": 1712543069, 237 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 238 | "system_fingerprint": "fp", 239 | "choices": [ 240 | { 241 | "index": 0, 242 | "finish_reason": "stop", 243 | "message": { 244 | "role": "assistant", 245 | "content": "The image contains the following objects:\n\n1. A peach-colored short-sleeve button-up shirt\n2. An olive green plaid long coat/jacket\n3. A pair of white sneakers or canvas shoes\n4. A brown shoulder bag or purse\n5. A makeup brush or cosmetic applicator\n6. A tube or container (possibly lipstick or lip balm)\n7. A pair of sunglasses\n8. A thought bubble icon\n9. A footprint icon\n10. A leaf or plant icon\n11. A flower icon\n12. A cloud icon\n\nIn total, there are 12 distinct objects depicted in the illustrated scene." 246 | } 247 | } 248 | ], 249 | "object": "chat.completion", 250 | "usage": { 251 | "prompt_tokens": 197, 252 | "completion_tokens": 147, 253 | "total_tokens": 344 254 | } 255 | } 256 | ``` 257 | 258 | 259 | ## Tool Call 260 | 261 | **Important Notice**: Please carefully review the following points before using this Tool Call for Chat completion API. 262 | 263 | 1. Function Call is now deprecated in favor of Tool Call by OpenAI, hence it's not supported here, you should use Tool Call instead. 
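The raw HTTP request and response pair is shown below. For completeness, here is a minimal sketch of the full round trip with the OpenAI Python SDK, including the second call that returns the tool result to the model. The `get_current_weather` implementation is a hypothetical stand-in for your own code, not part of this repository.

```python
import json

from openai import OpenAI

client = OpenAI()

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "The city or state which is required."},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]


def get_current_weather(location, unit="celsius"):
    # Hypothetical local implementation; replace with a real weather lookup.
    return json.dumps({"location": location, "temperature": 22, "unit": unit})


messages = [{"role": "user", "content": "What is the weather like in Shanghai today?"}]

# First call: the model decides whether a tool is needed and returns tool_calls
response = client.chat.completions.create(
    model="anthropic.claude-3-sonnet-20240229-v1:0",
    messages=messages,
    tools=tools,
    tool_choice="auto",
)
assistant_message = response.choices[0].message

if assistant_message.tool_calls:
    # Keep the assistant turn (with its tool_calls) in the conversation history
    messages.append(assistant_message)
    for tool_call in assistant_message.tool_calls:
        args = json.loads(tool_call.function.arguments)
        # Second leg: execute the tool locally and send the result back as a "tool" message
        messages.append(
            {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": get_current_weather(**args),
            }
        )
    final = client.chat.completions.create(
        model="anthropic.claude-3-sonnet-20240229-v1:0",
        messages=messages,
        tools=tools,
    )
    print(final.choices[0].message.content)
else:
    print(assistant_message.content)
```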
264 | 265 | **Example Request** 266 | 267 | ```bash 268 | curl $OPENAI_BASE_URL/chat/completions \ 269 | -H "Content-Type: application/json" \ 270 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 271 | -d '{ 272 | "model": "gpt-3.5-turbo", 273 | "messages": [ 274 | { 275 | "role": "user", 276 | "content": "What is the weather like in Shanghai today?" 277 | } 278 | ], 279 | "tools": [ 280 | { 281 | "type": "function", 282 | "function": { 283 | "name": "get_current_weather", 284 | "description": "Get the current weather in a given location", 285 | "parameters": { 286 | "type": "object", 287 | "properties": { 288 | "location": { 289 | "type": "string", 290 | "description": "The city or state which is required." 291 | }, 292 | "unit": { 293 | "type": "string", 294 | "enum": [ 295 | "celsius", 296 | "fahrenheit" 297 | ] 298 | } 299 | }, 300 | "required": [ 301 | "location" 302 | ] 303 | } 304 | } 305 | }, 306 | { 307 | "type": "function", 308 | "function": { 309 | "name": "get_current_location", 310 | "description": "Use this tool to get the current location if user does not provide a location", 311 | "parameters": { 312 | "type": "object", 313 | "properties": {} 314 | } 315 | } 316 | } 317 | ], 318 | "tool_choice": "auto" 319 | }' 320 | ``` 321 | 322 | **Example Response** 323 | 324 | ```json 325 | { 326 | "id": "msg_01PjrKDWhYGsrTNdeqzWd6D9", 327 | "created": 1712543689, 328 | "model": "anthropic.claude-3-sonnet-20240229-v1:0", 329 | "system_fingerprint": "fp", 330 | "choices": [ 331 | { 332 | "index": 0, 333 | "finish_reason": "stop", 334 | "message": { 335 | "role": "assistant", 336 | "tool_calls": [ 337 | { 338 | "id": "0", 339 | "type": "function", 340 | "function": { 341 | "name": "get_current_weather", 342 | "arguments": "{\"location\": \"Shanghai\", \"unit\": \"celsius\"}" 343 | } 344 | } 345 | ] 346 | } 347 | } 348 | ], 349 | "object": "chat.completion", 350 | "usage": { 351 | "prompt_tokens": 256, 352 | "completion_tokens": 64, 353 | "total_tokens": 320 354 | } 355 | } 356 | ``` 357 | 358 | You can try it with different questions, such as: 359 | 1. Hello, who are you? (No tools are needed) 360 | 2. What is the weather like today? (Should use get_current_location tool first) 361 | 362 | 363 | ## Reasoning 364 | 365 | **Important Notice**: Please carefully review the following points before using reasoning mode for Chat completion API. 366 | - Only Claude 3.7 Sonnet (extended thinking) and DeepSeek R1 support Reasoning so far. Please make sure the model supports reasoning before use. 367 | - For Claude 3.7 Sonnet, the reasoning mode (or thinking mode) is not enabled by default, you must pass additional `reasoning_effort` parameter in your request. Please also provide the right max_tokens (or max_completion_tokens) in your request. The budget_tokens is based on reasoning_effort (low: 30%, medium: 60%, high: 100% of max tokens), ensuring minimum budget_tokens of 1,024 with Anthropic recommending at least 4,000 tokens for comprehensive reasoning. Check [Bedrock Document](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-37.html) for more details. 368 | - For DeepSeek R1, you don't need additional reasoning_effort parameter, otherwise, you may get an error. 369 | - The reasoning response (CoT, thoughts) is added in an additional tag 'reasoning_content' which is not officially supported by OpenAI. This is to follow [Deepseek Reasoning Model](https://api-docs.deepseek.com/guides/reasoning_model#api-example). This may be changed in the future. 
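As a rough illustration of the budget mapping described above (an approximation based on the stated percentages, not an exact trace of the proxy's internals):

```python
# Illustrative only: approximate reasoning_effort -> budget_tokens mapping
# (low: 30%, medium: 60%, high: 100% of max tokens, floored at 1,024 tokens).
def approx_budget_tokens(max_tokens, effort):
    ratio = {"low": 0.3, "medium": 0.6, "high": 1.0}[effort]
    return max(1024, int(max_tokens * ratio))


print(approx_budget_tokens(4096, "low"))   # 1228
print(approx_budget_tokens(4096, "high"))  # 4096
```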
370 | 371 | **Example Request** 372 | 373 | - Claude 3.7 Sonnet 374 | 375 | ```bash 376 | curl $OPENAI_BASE_URL/chat/completions \ 377 | -H "Content-Type: application/json" \ 378 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 379 | -d '{ 380 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 381 | "messages": [ 382 | "role": "user", 383 | "content": "which one is bigger, 3.9 or 3.11?" 384 | } 385 | ], 386 | "max_completion_tokens": 4096, 387 | "reasoning_effort": "low", 388 | "stream": false 389 | }' 390 | ``` 391 | 392 | - DeepSeek R1 393 | 394 | ```bash 395 | curl $OPENAI_BASE_URL/chat/completions \ 396 | -H "Content-Type: application/json" \ 397 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 398 | -d '{ 399 | "model": "us.deepseek.r1-v1:0", 400 | "messages": [ 401 | { 402 | "role": "user", 403 | "content": "which one is bigger, 3.9 or 3.11?" 404 | } 405 | ], 406 | "stream": false 407 | }' 408 | ``` 409 | 410 | **Example Response** 411 | 412 | ```json 413 | { 414 | "id": "chatcmpl-83fb7a88", 415 | "created": 1740545278, 416 | "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 417 | "system_fingerprint": "fp", 418 | "choices": [ 419 | { 420 | "index": 0, 421 | "finish_reason": "stop", 422 | "logprobs": null, 423 | "message": { 424 | "role": "assistant", 425 | "content": "3.9 is bigger than 3.11.\n\nWhen comparing decimal numbers, we need to understand what these numbers actually represent:...", 426 | "reasoning_content": "I need to compare the decimal numbers 3.9 and 3.11.\n\nFor decimal numbers, we first compare the whole number parts, and if they're equal, we compare the decimal parts. \n\nBoth numbers ..." 427 | } 428 | } 429 | ], 430 | "object": "chat.completion", 431 | "usage": { 432 | "prompt_tokens": 51, 433 | "completion_tokens": 565, 434 | "total_tokens": 616 435 | } 436 | } 437 | ``` 438 | 439 | You can also use OpenAI SDK (run `pip3 install -U openai` first ) 440 | 441 | - Non-Streaming 442 | 443 | ```python 444 | from openai import OpenAI 445 | client = OpenAI() 446 | 447 | messages = [{"role": "user", "content": "which one is bigger, 3.9 or 3.11?"}] 448 | response = client.chat.completions.create( 449 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 450 | messages=messages, 451 | reasoning_effort="low", 452 | max_completion_tokens=4096, 453 | ) 454 | 455 | reasoning_content = response.choices[0].message.reasoning_content 456 | content = response.choices[0].message.content 457 | ``` 458 | 459 | - Streaming 460 | 461 | ```python 462 | from openai import OpenAI 463 | client = OpenAI() 464 | 465 | messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] 466 | response = client.chat.completions.create( 467 | model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", 468 | messages=messages, 469 | reasoning_effort="low", 470 | max_completion_tokens=4096, 471 | stream=True, 472 | ) 473 | 474 | reasoning_content = "" 475 | content = "" 476 | 477 | for chunk in response: 478 | if hasattr(chunk.choices[0].delta, 'reasoning_content') and chunk.choices[0].delta.reasoning_content: 479 | reasoning_content += chunk.choices[0].delta.reasoning_content 480 | elif chunk.choices[0].delta.content: 481 | content += chunk.choices[0].delta.content 482 | ``` 483 | 484 | ## Interleaved thinking (beta) 485 | 486 | **Important Notice**: Please carefully review the following points before using reasoning mode for Chat completion API. 
487 | 488 | Extended thinking with tool use in Claude 4 models supports [interleaved thinking](https://docs.aws.amazon.com/bedrock/latest/userguide/claude-messages-extended-thinking.html#claude-messages-extended-thinking-tool-use-interleaved), which enables Claude 4 models to think between tool calls and run more sophisticated reasoning after receiving tool results. This is helpful for more complex agentic interactions. 489 | With interleaved thinking, the `budget_tokens` can exceed the `max_tokens` parameter because it represents the total budget across all thinking blocks within one assistant turn. 490 | 491 | **Supported Models**: Claude Sonnet 4, Claude Sonnet 4.5 492 | 493 | **Example Request** 494 | 495 | - Non-Streaming (Claude Sonnet 4.5) 496 | 497 | ```bash 498 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 499 | -H "Content-Type: application/json" \ 500 | -H "Authorization: Bearer bedrock" \ 501 | -d '{ 502 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 503 | "max_tokens": 2048, 504 | "messages": [{ 505 | "role": "user", 506 | "content": "Explain how to implement a binary search tree with self-balancing capabilities." 507 | }], 508 | "extra_body": { 509 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 510 | "thinking": {"type": "enabled", "budget_tokens": 4096} 511 | } 512 | }' 513 | ``` 514 | 515 | - Non-Streaming (Claude Sonnet 4) 516 | 517 | ```bash 518 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 519 | -H "Content-Type: application/json" \ 520 | -H "Authorization: Bearer bedrock" \ 521 | -d '{ 522 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 523 | "max_tokens": 2048, 524 | "messages": [{ 525 | "role": "user", 526 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 527 | }], 528 | "extra_body": { 529 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 530 | "thinking": {"type": "enabled", "budget_tokens": 4096} 531 | } 532 | }' 533 | ``` 534 | 535 | - Streaming (Claude Sonnet 4.5) 536 | 537 | ```bash 538 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 539 | -H "Content-Type: application/json" \ 540 | -H "Authorization: Bearer bedrock" \ 541 | -d '{ 542 | "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0", 543 | "max_tokens": 2048, 544 | "messages": [{ 545 | "role": "user", 546 | "content": "Explain how to implement a binary search tree with self-balancing capabilities." 547 | }], 548 | "stream": true, 549 | "extra_body": { 550 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 551 | "thinking": {"type": "enabled", "budget_tokens": 4096} 552 | } 553 | }' 554 | ``` 555 | 556 | - Streaming (Claude Sonnet 4) 557 | 558 | ```bash 559 | curl http://127.0.0.1:8000/api/v1/chat/completions \ 560 | -H "Content-Type: application/json" \ 561 | -H "Authorization: Bearer bedrock" \ 562 | -d '{ 563 | "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", 564 | "max_tokens": 2048, 565 | "messages": [{ 566 | "role": "user", 567 | "content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?" 
568 | }], 569 | "stream": true, 570 | "extra_body": { 571 | "anthropic_beta": ["interleaved-thinking-2025-05-14"], 572 | "thinking": {"type": "enabled", "budget_tokens": 4096} 573 | } 574 | }' 575 | ``` 576 | -------------------------------------------------------------------------------- /src/api/models/bedrock.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import logging 4 | import re 5 | import time 6 | from abc import ABC 7 | from typing import AsyncIterable, Iterable, Literal 8 | 9 | import boto3 10 | import numpy as np 11 | import requests 12 | import tiktoken 13 | from botocore.config import Config 14 | from fastapi import HTTPException 15 | from starlette.concurrency import run_in_threadpool 16 | 17 | from api.models.base import BaseChatModel, BaseEmbeddingsModel 18 | from api.schema import ( 19 | AssistantMessage, 20 | ChatRequest, 21 | ChatResponse, 22 | ChatResponseMessage, 23 | ChatStreamResponse, 24 | Choice, 25 | ChoiceDelta, 26 | CompletionTokensDetails, 27 | DeveloperMessage, 28 | Embedding, 29 | EmbeddingsRequest, 30 | EmbeddingsResponse, 31 | EmbeddingsUsage, 32 | Error, 33 | ErrorMessage, 34 | Function, 35 | ImageContent, 36 | PromptTokensDetails, 37 | ResponseFunction, 38 | TextContent, 39 | ToolCall, 40 | ToolContent, 41 | ToolMessage, 42 | Usage, 43 | UserMessage, 44 | ) 45 | from api.setting import ( 46 | AWS_REGION, 47 | DEBUG, 48 | DEFAULT_MODEL, 49 | ENABLE_CROSS_REGION_INFERENCE, 50 | ENABLE_APPLICATION_INFERENCE_PROFILES, 51 | ENABLE_PROMPT_CACHING, 52 | ) 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | config = Config( 57 | connect_timeout=60, # Connection timeout: 60 seconds 58 | read_timeout=900, # Read timeout: 15 minutes (suitable for long streaming responses) 59 | retries={ 60 | 'max_attempts': 8, # Maximum retry attempts 61 | 'mode': 'adaptive' # Adaptive retry mode 62 | }, 63 | max_pool_connections=50 # Maximum connection pool size 64 | ) 65 | 66 | bedrock_runtime = boto3.client( 67 | service_name="bedrock-runtime", 68 | region_name=AWS_REGION, 69 | config=config, 70 | ) 71 | bedrock_client = boto3.client( 72 | service_name="bedrock", 73 | region_name=AWS_REGION, 74 | config=config, 75 | ) 76 | 77 | SUPPORTED_BEDROCK_EMBEDDING_MODELS = { 78 | "cohere.embed-multilingual-v3": "Cohere Embed Multilingual", 79 | "cohere.embed-english-v3": "Cohere Embed English", 80 | "amazon.titan-embed-text-v1": "Titan Embeddings G1 - Text", 81 | "amazon.titan-embed-text-v2:0": "Titan Embeddings G2 - Text", 82 | # Disable Titan embedding. 83 | # "amazon.titan-embed-image-v1": "Titan Multimodal Embeddings G1" 84 | } 85 | 86 | ENCODER = tiktoken.get_encoding("cl100k_base") 87 | 88 | # Global mapping: Profile ID/ARN → Foundation Model ID 89 | # Handles both SYSTEM_DEFINED (cross-region) and APPLICATION profiles 90 | # This enables feature detection for all profile types without pattern matching 91 | profile_metadata = {} 92 | 93 | # Models that don't support both temperature and topP simultaneously 94 | # When both are provided, temperature takes precedence and topP is removed 95 | TEMPERATURE_TOPP_CONFLICT_MODELS = { 96 | "claude-sonnet-4-5", 97 | "claude-haiku-4-5", 98 | "claude-opus-4-5", 99 | } 100 | 101 | 102 | def list_bedrock_models() -> dict: 103 | """Automatically getting a list of supported models. 104 | 105 | Returns a model list combines: 106 | - ON_DEMAND models. 
107 | - Cross-Region Inference Profiles (if enabled via Env) 108 | - Application Inference Profiles (if enabled via Env) 109 | """ 110 | model_list = {} 111 | try: 112 | if ENABLE_CROSS_REGION_INFERENCE: 113 | # List system defined inference profile IDs and store underlying model mapping 114 | paginator = bedrock_client.get_paginator('list_inference_profiles') 115 | for page in paginator.paginate(maxResults=1000, typeEquals="SYSTEM_DEFINED"): 116 | for profile in page["inferenceProfileSummaries"]: 117 | profile_id = profile.get("inferenceProfileId") 118 | if not profile_id: 119 | continue 120 | 121 | # Extract underlying model from first model in the profile 122 | models = profile.get("models", []) 123 | if models: 124 | model_arn = models[0].get("modelArn", "") 125 | if model_arn: 126 | # Extract foundation model ID from ARN 127 | model_id = model_arn.split('/')[-1] 128 | profile_metadata[profile_id] = { 129 | "underlying_model_id": model_id, 130 | "profile_type": "SYSTEM_DEFINED", 131 | } 132 | 133 | if ENABLE_APPLICATION_INFERENCE_PROFILES: 134 | # List application defined inference profile IDs and create mapping 135 | paginator = bedrock_client.get_paginator('list_inference_profiles') 136 | for page in paginator.paginate(maxResults=1000, typeEquals="APPLICATION"): 137 | for profile in page["inferenceProfileSummaries"]: 138 | try: 139 | profile_arn = profile.get("inferenceProfileArn") 140 | if not profile_arn: 141 | continue 142 | 143 | # Process all models in the profile 144 | models = profile.get("models", []) 145 | if not models: 146 | logger.warning(f"Application profile {profile_arn} has no models") 147 | continue 148 | 149 | # Take first model - all models in array are same type (regional instances) 150 | first_model = models[0] 151 | model_arn = first_model.get("modelArn", "") 152 | if not model_arn: 153 | continue 154 | 155 | # Extract model ID from ARN (works for both foundation models and cross-region profiles) 156 | model_id = model_arn.split('/')[-1] if '/' in model_arn else model_arn 157 | 158 | # Store in unified profile metadata for feature detection 159 | profile_metadata[profile_arn] = { 160 | "underlying_model_id": model_id, 161 | "profile_type": "APPLICATION", 162 | "profile_name": profile.get("inferenceProfileName", ""), 163 | } 164 | except Exception as e: 165 | logger.warning(f"Error processing application profile: {e}") 166 | continue 167 | 168 | # List foundation models, only cares about text outputs here. 
169 | response = bedrock_client.list_foundation_models(byOutputModality="TEXT") 170 | 171 | for model in response["modelSummaries"]: 172 | model_id = model.get("modelId", "N/A") 173 | stream_supported = model.get("responseStreamingSupported", True) 174 | status = model["modelLifecycle"].get("status", "ACTIVE") 175 | 176 | # currently, use this to filter out rerank models and legacy models 177 | if not stream_supported or status not in ["ACTIVE", "LEGACY"]: 178 | continue 179 | 180 | inference_types = model.get("inferenceTypesSupported", []) 181 | input_modalities = model["inputModalities"] 182 | # Add on-demand model list 183 | if "ON_DEMAND" in inference_types: 184 | model_list[model_id] = {"modalities": input_modalities} 185 | 186 | # Add all inference profiles (cross-region and application) for this model 187 | for profile_id, metadata in profile_metadata.items(): 188 | if metadata.get("underlying_model_id") == model_id: 189 | model_list[profile_id] = {"modalities": input_modalities} 190 | 191 | except Exception as e: 192 | logger.error(f"Unable to list models: {str(e)}") 193 | 194 | if not model_list: 195 | # In case stack not updated. 196 | model_list[DEFAULT_MODEL] = {"modalities": ["TEXT", "IMAGE"]} 197 | 198 | return model_list 199 | 200 | 201 | # Initialize the model list. 202 | bedrock_model_list = list_bedrock_models() 203 | 204 | 205 | class BedrockModel(BaseChatModel): 206 | def list_models(self) -> list[str]: 207 | """Always refresh the latest model list""" 208 | global bedrock_model_list 209 | bedrock_model_list = list_bedrock_models() 210 | return list(bedrock_model_list.keys()) 211 | 212 | def validate(self, chat_request: ChatRequest): 213 | """Perform basic validation on requests""" 214 | error = "" 215 | # check if model is supported 216 | if chat_request.model not in bedrock_model_list.keys(): 217 | # Provide helpful error for application profiles 218 | if "application-inference-profile" in chat_request.model: 219 | error = ( 220 | f"Application profile {chat_request.model} not found. " 221 | f"Available profiles can be listed via GET /models API. " 222 | f"Ensure ENABLE_APPLICATION_INFERENCE_PROFILES=true and " 223 | f"the profile exists in your AWS account." 224 | ) 225 | else: 226 | error = f"Unsupported model {chat_request.model}, please use models API to get a list of supported models" 227 | logger.error("Unsupported model: %s", chat_request.model) 228 | 229 | # Validate profile has resolvable underlying model 230 | if not error and chat_request.model in profile_metadata: 231 | resolved = self._resolve_to_foundation_model(chat_request.model) 232 | if resolved == chat_request.model: 233 | logger.warning( 234 | f"Could not resolve profile {chat_request.model} " 235 | f"to underlying model. Some features may not work correctly." 236 | ) 237 | 238 | if error: 239 | raise HTTPException( 240 | status_code=400, 241 | detail=error, 242 | ) 243 | 244 | def _resolve_to_foundation_model(self, model_id: str) -> str: 245 | """ 246 | Resolve any model identifier to foundation model ID for feature detection. 247 | 248 | Handles: 249 | - Cross-region profiles (us.*, eu.*, apac.*, global.*) 250 | - Application profiles (arn:aws:bedrock:...:application-inference-profile/...) 251 | - Foundation models (pass through unchanged) 252 | 253 | No pattern matching needed - just dictionary lookup. 254 | Unknown identifiers pass through unchanged (graceful fallback). 
255 | 256 | Args: 257 | model_id: Can be foundation model ID, cross-region profile, or app profile ARN 258 | 259 | Returns: 260 | Foundation model ID if mapping exists, otherwise original model_id 261 | """ 262 | if model_id in profile_metadata: 263 | return profile_metadata[model_id]["underlying_model_id"] 264 | return model_id 265 | 266 | def _supports_prompt_caching(self, model_id: str) -> bool: 267 | """ 268 | Check if model supports prompt caching based on model ID pattern. 269 | 270 | Uses pattern matching instead of hardcoded whitelist for better maintainability. 271 | Automatically supports new models following the naming convention. 272 | 273 | Supported models: 274 | - Claude: anthropic.claude-* (excluding very old versions) 275 | - Nova: amazon.nova-* 276 | 277 | Returns: 278 | bool: True if model supports prompt caching 279 | """ 280 | # Resolve profile to underlying model for feature detection 281 | resolved_model = self._resolve_to_foundation_model(model_id) 282 | model_lower = resolved_model.lower() 283 | 284 | # Claude models pattern matching 285 | if "anthropic.claude" in model_lower: 286 | # Exclude very old models that don't support caching 287 | excluded_patterns = ["claude-instant", "claude-v1", "claude-v2"] 288 | if any(pattern in model_lower for pattern in excluded_patterns): 289 | return False 290 | return True 291 | 292 | # Nova models pattern matching 293 | if "amazon.nova" in model_lower: 294 | return True 295 | 296 | # Future providers can be added here 297 | # Example: if "provider.model-name" in model_lower: return True 298 | 299 | return False 300 | 301 | def _get_max_cache_tokens(self, model_id: str) -> int | None: 302 | """ 303 | Get maximum cacheable tokens limit for the model. 304 | 305 | Different models have different caching limits: 306 | - Claude: No explicit limit mentioned in docs 307 | - Nova: 20,000 tokens max 308 | 309 | Returns: 310 | int | None: Max tokens, or None if unlimited 311 | """ 312 | # Resolve profile to underlying model for feature detection 313 | resolved_model = self._resolve_to_foundation_model(model_id) 314 | model_lower = resolved_model.lower() 315 | 316 | # Nova models have 20K limit 317 | if "amazon.nova" in model_lower: 318 | return 20_000 319 | 320 | # Claude: No explicit limit 321 | if "anthropic.claude" in model_lower: 322 | return None 323 | 324 | return None 325 | 326 | async def _invoke_bedrock(self, chat_request: ChatRequest, stream=False): 327 | """Common logic for invoke bedrock models""" 328 | if DEBUG: 329 | logger.info("Raw request: " + chat_request.model_dump_json()) 330 | 331 | # Log profile resolution for debugging 332 | if chat_request.model in profile_metadata: 333 | resolved = self._resolve_to_foundation_model(chat_request.model) 334 | profile_type = profile_metadata[chat_request.model].get("profile_type", "UNKNOWN") 335 | logger.info( 336 | f"Profile resolution: {chat_request.model} ({profile_type}) → {resolved}" 337 | ) 338 | 339 | # convert OpenAI chat request to Bedrock SDK request 340 | args = self._parse_request(chat_request) 341 | if DEBUG: 342 | logger.info("Bedrock request: " + json.dumps(str(args))) 343 | 344 | try: 345 | if stream: 346 | # Run the blocking boto3 call in a thread pool 347 | response = await run_in_threadpool( 348 | bedrock_runtime.converse_stream, **args 349 | ) 350 | else: 351 | # Run the blocking boto3 call in a thread pool 352 | response = await run_in_threadpool(bedrock_runtime.converse, **args) 353 | except bedrock_runtime.exceptions.ValidationException as e: 354 | 
logger.error("Bedrock validation error for model %s: %s", chat_request.model, str(e)) 355 | raise HTTPException(status_code=400, detail=str(e)) 356 | except bedrock_runtime.exceptions.ThrottlingException as e: 357 | logger.warning("Bedrock throttling for model %s: %s", chat_request.model, str(e)) 358 | raise HTTPException(status_code=429, detail=str(e)) 359 | except Exception as e: 360 | logger.error("Bedrock invocation failed for model %s: %s", chat_request.model, str(e)) 361 | raise HTTPException(status_code=500, detail=str(e)) 362 | return response 363 | 364 | async def chat(self, chat_request: ChatRequest) -> ChatResponse: 365 | """Default implementation for Chat API.""" 366 | 367 | message_id = self.generate_message_id() 368 | response = await self._invoke_bedrock(chat_request) 369 | 370 | output_message = response["output"]["message"] 371 | usage = response["usage"] 372 | 373 | # Extract all token counts 374 | output_tokens = usage["outputTokens"] 375 | total_tokens = usage["totalTokens"] 376 | finish_reason = response["stopReason"] 377 | 378 | # Extract prompt caching metrics if available 379 | cache_read_tokens = usage.get("cacheReadInputTokens", 0) 380 | cache_creation_tokens = usage.get("cacheWriteInputTokens", 0) 381 | 382 | # Calculate actual prompt tokens 383 | # Bedrock's totalTokens includes all: inputTokens + cacheRead + cacheWrite + outputTokens 384 | # So: prompt_tokens = totalTokens - outputTokens 385 | actual_prompt_tokens = total_tokens - output_tokens 386 | 387 | chat_response = self._create_response( 388 | model=chat_request.model, 389 | message_id=message_id, 390 | content=output_message["content"], 391 | finish_reason=finish_reason, 392 | input_tokens=actual_prompt_tokens, 393 | output_tokens=output_tokens, 394 | total_tokens=total_tokens, 395 | cache_read_tokens=cache_read_tokens, 396 | cache_creation_tokens=cache_creation_tokens, 397 | ) 398 | if DEBUG: 399 | logger.info("Proxy response :" + chat_response.model_dump_json()) 400 | return chat_response 401 | 402 | async def _async_iterate(self, stream): 403 | """Helper method to convert sync iterator to async iterator""" 404 | for chunk in stream: 405 | await run_in_threadpool(lambda: chunk) 406 | yield chunk 407 | 408 | async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]: 409 | """Default implementation for Chat Stream API""" 410 | try: 411 | response = await self._invoke_bedrock(chat_request, stream=True) 412 | message_id = self.generate_message_id() 413 | stream = response.get("stream") 414 | self.think_emitted = False 415 | async for chunk in self._async_iterate(stream): 416 | args = {"model_id": chat_request.model, "message_id": message_id, "chunk": chunk} 417 | stream_response = self._create_response_stream(**args) 418 | if not stream_response: 419 | continue 420 | if DEBUG: 421 | logger.info("Proxy response :" + stream_response.model_dump_json()) 422 | if stream_response.choices: 423 | yield self.stream_response_to_bytes(stream_response) 424 | elif chat_request.stream_options and chat_request.stream_options.include_usage: 425 | # An empty choices for Usage as per OpenAI doc below: 426 | # if you set stream_options: {"include_usage": true}. 427 | # an additional chunk will be streamed before the data: [DONE] message. 428 | # The usage field on this chunk shows the token usage statistics for the entire request, 429 | # and the choices field will always be an empty array. 430 | # All other chunks will also include a usage field, but with a null value. 
431 | yield self.stream_response_to_bytes(stream_response) 432 | 433 | # return an [DONE] message at the end. 434 | yield self.stream_response_to_bytes() 435 | self.think_emitted = False # Cleanup 436 | except Exception as e: 437 | logger.error("Stream error for model %s: %s", chat_request.model, str(e)) 438 | error_event = Error(error=ErrorMessage(message=str(e))) 439 | yield self.stream_response_to_bytes(error_event) 440 | 441 | def _parse_system_prompts(self, chat_request: ChatRequest) -> list[dict[str, str]]: 442 | """Create system prompts with optional prompt caching support. 443 | 444 | Prompt caching can be enabled via: 445 | 1. ENABLE_PROMPT_CACHING environment variable (global default) 446 | 2. extra_body.prompt_caching.system = True/False (per-request override) 447 | 448 | Only adds cachePoint if: 449 | - Model supports caching (Claude, Nova) 450 | - Caching is enabled (ENV or extra_body) 451 | - System prompts exist and meet minimum token requirements 452 | 453 | Example output: [{"text" : system_prompt}, {"cachePoint": {"type": "default"}}] 454 | 455 | See: https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html 456 | """ 457 | system_prompts = [] 458 | for message in chat_request.messages: 459 | if message.role not in ("system", "developer"): 460 | continue 461 | if not isinstance(message.content, str): 462 | raise TypeError(f"System message content must be a string, got {type(message.content).__name__}") 463 | system_prompts.append({"text": message.content}) 464 | 465 | if not system_prompts: 466 | return system_prompts 467 | 468 | # Check if model supports prompt caching 469 | if not self._supports_prompt_caching(chat_request.model): 470 | return system_prompts 471 | 472 | # Determine if caching should be enabled 473 | cache_enabled = ENABLE_PROMPT_CACHING # Default from ENV 474 | 475 | # Check for extra_body override 476 | if chat_request.extra_body and isinstance(chat_request.extra_body, dict): 477 | prompt_caching = chat_request.extra_body.get("prompt_caching", {}) 478 | if "system" in prompt_caching: 479 | # extra_body explicitly controls caching 480 | cache_enabled = prompt_caching.get("system") is True 481 | 482 | if not cache_enabled: 483 | return system_prompts 484 | 485 | # Estimate total tokens for limit check 486 | total_text = " ".join(p.get("text", "") for p in system_prompts) 487 | estimated_tokens = len(total_text.split()) * 1.3 # Rough estimate 488 | 489 | # Check token limits (Nova has 20K limit) 490 | max_tokens = self._get_max_cache_tokens(chat_request.model) 491 | if max_tokens and estimated_tokens > max_tokens: 492 | logger.warning( 493 | f"System prompts (~{estimated_tokens:.0f} tokens) exceed model cache limit ({max_tokens} tokens). " 494 | f"Caching will still be attempted but may not work optimally." 495 | ) 496 | # Still add cachePoint - let Bedrock handle the limit 497 | 498 | # Add cache checkpoint after system prompts 499 | system_prompts.append({"cachePoint": {"type": "default"}}) 500 | 501 | if DEBUG: 502 | logger.info(f"Added cachePoint to system prompts for model {chat_request.model}") 503 | 504 | return system_prompts 505 | 506 | def _parse_messages(self, chat_request: ChatRequest) -> list[dict]: 507 | """ 508 | Converse API only support user and assistant messages. 
509 | 510 | example output: [{ 511 | "role": "user", 512 | "content": [{"text": input_text}] 513 | }] 514 | 515 | See example: 516 | https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference.html#message-inference-examples 517 | """ 518 | messages = [] 519 | for message in chat_request.messages: 520 | if isinstance(message, UserMessage): 521 | messages.append( 522 | { 523 | "role": message.role, 524 | "content": self._parse_content_parts( 525 | message, chat_request.model 526 | ), 527 | } 528 | ) 529 | elif isinstance(message, AssistantMessage): 530 | # Check if message has content that's not empty 531 | has_content = False 532 | if isinstance(message.content, str): 533 | has_content = message.content.strip() != "" 534 | elif isinstance(message.content, list): 535 | has_content = len(message.content) > 0 536 | elif message.content is not None: 537 | has_content = True 538 | 539 | if has_content: 540 | # Text message 541 | messages.append( 542 | { 543 | "role": message.role, 544 | "content": self._parse_content_parts( 545 | message, chat_request.model 546 | ), 547 | } 548 | ) 549 | if message.tool_calls: 550 | # Tool use message 551 | for tool_call in message.tool_calls: 552 | tool_input = json.loads(tool_call.function.arguments) 553 | messages.append( 554 | { 555 | "role": message.role, 556 | "content": [ 557 | { 558 | "toolUse": { 559 | "toolUseId": tool_call.id, 560 | "name": tool_call.function.name, 561 | "input": tool_input, 562 | } 563 | } 564 | ], 565 | } 566 | ) 567 | elif isinstance(message, ToolMessage): 568 | # Bedrock does not support tool role, 569 | # Add toolResult to content 570 | # https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ToolResultBlock.html 571 | 572 | # Handle different content formats from OpenAI SDK 573 | tool_content = self._extract_tool_content(message.content) 574 | 575 | messages.append( 576 | { 577 | "role": "user", 578 | "content": [ 579 | { 580 | "toolResult": { 581 | "toolUseId": message.tool_call_id, 582 | "content": [{"text": tool_content}], 583 | } 584 | } 585 | ], 586 | } 587 | ) 588 | 589 | else: 590 | # ignore others, such as system messages 591 | continue 592 | return self._reframe_multi_payloard(messages, chat_request) 593 | 594 | def _extract_tool_content(self, content) -> str: 595 | """Extract text content from various OpenAI SDK tool message formats. 
596 | 597 | Handles: 598 | - String content (legacy format) 599 | - List of content objects (OpenAI SDK 1.91.0+) 600 | - Nested JSON structures within text content 601 | """ 602 | try: 603 | if isinstance(content, str): 604 | return content 605 | 606 | if isinstance(content, list): 607 | text_parts = [] 608 | for i, item in enumerate(content): 609 | if isinstance(item, dict): 610 | # Handle dict with 'text' field 611 | if "text" in item: 612 | item_text = item["text"] 613 | if isinstance(item_text, str): 614 | # Try to parse as JSON if it looks like JSON 615 | if item_text.strip().startswith('{') and item_text.strip().endswith('}'): 616 | try: 617 | parsed_json = json.loads(item_text) 618 | # Convert JSON object to readable text 619 | text_parts.append(json.dumps(parsed_json, indent=2)) 620 | except json.JSONDecodeError: 621 | # Silently fallback to original text 622 | text_parts.append(item_text) 623 | else: 624 | text_parts.append(item_text) 625 | else: 626 | text_parts.append(str(item_text)) 627 | else: 628 | # Handle other dict formats - convert to JSON string 629 | text_parts.append(json.dumps(item, indent=2)) 630 | elif hasattr(item, 'text'): 631 | # Handle ToolContent objects 632 | text_parts.append(item.text) 633 | else: 634 | # Convert any other type to string 635 | text_parts.append(str(item)) 636 | return "\n".join(text_parts) 637 | 638 | # Fallback for any other type 639 | return str(content) 640 | except Exception as e: 641 | logger.warning("Tool content extraction failed: %s", str(e)) 642 | # Return a safe fallback 643 | return str(content) if content is not None else "" 644 | 645 | def _reframe_multi_payloard(self, messages: list, chat_request: ChatRequest = None) -> list: 646 | """Receive messages and reformat them to comply with the Claude format 647 | 648 | With OpenAI format requests, it's not a problem to repeatedly receive messages from the same role, but 649 | with Claude format requests, you cannot repeatedly receive messages from the same role. 650 | 651 | This method searches through the OpenAI format messages in order and reformats them to the Claude format. 
652 | 653 | ``` 654 | openai_format_messages=[ 655 | {"role": "user", "content": "Hello"}, 656 | {"role": "user", "content": "Who are you?"}, 657 | ] 658 | 659 | bedrock_format_messages=[ 660 | { 661 | "role": "user", 662 | "content": [ 663 | {"text": "Hello"}, 664 | {"text": "Who are you?"} 665 | ] 666 | }, 667 | ] 668 | """ 669 | reformatted_messages = [] 670 | current_role = None 671 | current_content = [] 672 | 673 | # Search through the list of messages and combine messages from the same role into one list 674 | for message in messages: 675 | next_role = message["role"] 676 | next_content = message["content"] 677 | 678 | # If the next role is different from the previous message, add the previous role's messages to the list 679 | if next_role != current_role: 680 | if current_content: 681 | reformatted_messages.append( 682 | {"role": current_role, "content": current_content} 683 | ) 684 | # Switch to the new role 685 | current_role = next_role 686 | current_content = [] 687 | 688 | # Add the message content to current_content 689 | if isinstance(next_content, str): 690 | current_content.append({"text": next_content}) 691 | elif isinstance(next_content, list): 692 | current_content.extend(next_content) 693 | 694 | # Add the last role's messages to the list 695 | if current_content: 696 | reformatted_messages.append( 697 | {"role": current_role, "content": current_content} 698 | ) 699 | 700 | # Add cachePoint to messages if enabled and supported 701 | if chat_request and reformatted_messages: 702 | if not self._supports_prompt_caching(chat_request.model): 703 | return reformatted_messages 704 | 705 | # Determine if messages caching should be enabled 706 | cache_enabled = ENABLE_PROMPT_CACHING 707 | 708 | if chat_request.extra_body and isinstance(chat_request.extra_body, dict): 709 | prompt_caching = chat_request.extra_body.get("prompt_caching", {}) 710 | if "messages" in prompt_caching: 711 | cache_enabled = prompt_caching.get("messages") is True 712 | 713 | if cache_enabled: 714 | # Add cachePoint to the last user message content 715 | for msg in reversed(reformatted_messages): 716 | if msg["role"] == "user" and msg.get("content"): 717 | # Add cachePoint at the end of user message content 718 | msg["content"].append({"cachePoint": {"type": "default"}}) 719 | if DEBUG: 720 | logger.info(f"Added cachePoint to last user message for model {chat_request.model}") 721 | break 722 | 723 | return reformatted_messages 724 | 725 | def _parse_request(self, chat_request: ChatRequest) -> dict: 726 | """Create default converse request body. 727 | 728 | Also perform validations to tool call etc. 729 | 730 | Ref: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html 731 | """ 732 | messages = self._parse_messages(chat_request) 733 | system_prompts = self._parse_system_prompts(chat_request) 734 | 735 | # Base inference parameters. 
736 | inference_config = { 737 | "maxTokens": chat_request.max_tokens, 738 | } 739 | 740 | # Only include optional parameters when specified 741 | if chat_request.temperature is not None: 742 | inference_config["temperature"] = chat_request.temperature 743 | if chat_request.top_p is not None: 744 | inference_config["topP"] = chat_request.top_p 745 | 746 | # Some models (Claude Sonnet 4.5, Haiku 4.5) don't support both temperature and topP 747 | # When both are provided, keep temperature and remove topP 748 | # Resolve profile to underlying model for feature detection 749 | resolved_model = self._resolve_to_foundation_model(chat_request.model) 750 | model_lower = resolved_model.lower() 751 | 752 | # Check if model is in the conflict list and both parameters are present 753 | if "temperature" in inference_config and "topP" in inference_config: 754 | if any(conflict_model in model_lower for conflict_model in TEMPERATURE_TOPP_CONFLICT_MODELS): 755 | inference_config.pop("topP", None) 756 | if DEBUG: 757 | logger.info(f"Removed topP for {chat_request.model} (conflicts with temperature)") 758 | 759 | if chat_request.stop is not None: 760 | stop = chat_request.stop 761 | if isinstance(stop, str): 762 | stop = [stop] 763 | inference_config["stopSequences"] = stop 764 | 765 | args = { 766 | "modelId": chat_request.model, 767 | "messages": messages, 768 | "system": system_prompts, 769 | "inferenceConfig": inference_config, 770 | } 771 | if chat_request.reasoning_effort: 772 | # reasoning_effort is supported by Claude and DeepSeek v3 773 | # Different models use different formats 774 | # Resolve profile to underlying model for feature detection 775 | resolved_model = self._resolve_to_foundation_model(chat_request.model) 776 | model_lower = resolved_model.lower() 777 | 778 | if "anthropic.claude" in model_lower: 779 | # Claude format: reasoning_config = object with budget_tokens 780 | max_tokens = ( 781 | chat_request.max_completion_tokens 782 | if chat_request.max_completion_tokens 783 | else chat_request.max_tokens 784 | ) 785 | budget_tokens = self._calc_budget_tokens( 786 | max_tokens, chat_request.reasoning_effort 787 | ) 788 | inference_config["maxTokens"] = max_tokens 789 | # unset topP - Not supported 790 | inference_config.pop("topP", None) 791 | 792 | args["additionalModelRequestFields"] = { 793 | "reasoning_config": {"type": "enabled", "budget_tokens": budget_tokens} 794 | } 795 | elif "deepseek.v3" in model_lower or "deepseek.deepseek-v3" in model_lower: 796 | # DeepSeek v3 format: reasoning_config = string ('low', 'medium', 'high') 797 | # From Bedrock Playground: {"reasoning_config": "high"} 798 | args["additionalModelRequestFields"] = { 799 | "reasoning_config": chat_request.reasoning_effort # Direct string: low/medium/high 800 | } 801 | if DEBUG: 802 | logger.info(f"Applied reasoning_config={chat_request.reasoning_effort} for DeepSeek v3") 803 | else: 804 | # For other models (Qwen, etc.), ignore reasoning_effort parameter 805 | if DEBUG: 806 | logger.info(f"reasoning_effort parameter ignored for model {chat_request.model} (not supported)") 807 | # add tool config 808 | if chat_request.tools: 809 | tool_config = {"tools": [self._convert_tool_spec(t.function) for t in chat_request.tools]} 810 | 811 | if chat_request.tool_choice and not chat_request.model.startswith( 812 | "meta.llama3-1-" 813 | ): 814 | if isinstance(chat_request.tool_choice, str): 815 | # auto (default) is mapped to {"auto" : {}} 816 | # required is mapped to {"any" : {}} 817 | if chat_request.tool_choice == 
"required": 818 | tool_config["toolChoice"] = {"any": {}} 819 | else: 820 | tool_config["toolChoice"] = {"auto": {}} 821 | else: 822 | # Specific tool to use 823 | if "function" not in chat_request.tool_choice: 824 | raise ValueError("tool_choice must contain 'function' key when specifying a specific tool") 825 | tool_config["toolChoice"] = {"tool": {"name": chat_request.tool_choice["function"].get("name", "")}} 826 | args["toolConfig"] = tool_config 827 | # Add additional fields to enable extend thinking or other model-specific features 828 | if chat_request.extra_body: 829 | # Filter out prompt_caching (our control field, not for Bedrock) 830 | additional_fields = { 831 | k: v for k, v in chat_request.extra_body.items() 832 | if k != "prompt_caching" 833 | } 834 | 835 | if additional_fields: 836 | # Only set additionalModelRequestFields if there are actual fields to pass 837 | args["additionalModelRequestFields"] = additional_fields 838 | 839 | # Extended thinking doesn't support both temperature and topP 840 | # Remove topP to avoid validation error 841 | if "thinking" in additional_fields: 842 | inference_config.pop("topP", None) 843 | 844 | return args 845 | 846 | def _estimate_reasoning_tokens(self, content: list[dict]) -> int: 847 | """ 848 | Estimate reasoning tokens from reasoningContent blocks. 849 | 850 | Bedrock doesn't separately report reasoning tokens, so we estimate 851 | them using tiktoken to maintain OpenAI API compatibility. 852 | """ 853 | reasoning_text = "" 854 | for block in content: 855 | if "reasoningContent" in block: 856 | reasoning_text += block["reasoningContent"]["reasoningText"].get("text", "") 857 | 858 | if reasoning_text: 859 | # Use tiktoken to estimate token count 860 | return len(ENCODER.encode(reasoning_text)) 861 | return 0 862 | 863 | def _create_response( 864 | self, 865 | model: str, 866 | message_id: str, 867 | content: list[dict] | None = None, 868 | finish_reason: str | None = None, 869 | input_tokens: int = 0, 870 | output_tokens: int = 0, 871 | total_tokens: int = 0, 872 | cache_read_tokens: int = 0, 873 | cache_creation_tokens: int = 0, 874 | ) -> ChatResponse: 875 | message = ChatResponseMessage( 876 | role="assistant", 877 | ) 878 | if finish_reason == "tool_use": 879 | # https://docs.aws.amazon.com/bedrock/latest/userguide/tool-use.html#tool-use-examples 880 | tool_calls = [] 881 | for part in content: 882 | if "toolUse" in part: 883 | tool = part["toolUse"] 884 | tool_calls.append( 885 | ToolCall( 886 | id=tool["toolUseId"], 887 | type="function", 888 | function=ResponseFunction( 889 | name=tool["name"], 890 | arguments=json.dumps(tool["input"]), 891 | ), 892 | ) 893 | ) 894 | message.tool_calls = tool_calls 895 | message.content = None 896 | else: 897 | message.content = "" 898 | for c in content: 899 | if "reasoningContent" in c: 900 | message.reasoning_content = c["reasoningContent"][ 901 | "reasoningText" 902 | ].get("text", "") 903 | elif "text" in c: 904 | message.content = c["text"] 905 | else: 906 | logger.warning( 907 | "Unknown tag in message content " + ",".join(c.keys()) 908 | ) 909 | if message.reasoning_content: 910 | message.content = f"{message.reasoning_content}{message.content}" 911 | message.reasoning_content = None 912 | 913 | # Create prompt_tokens_details if cache metrics are available 914 | prompt_tokens_details = None 915 | if cache_read_tokens > 0 or cache_creation_tokens > 0: 916 | # Map Bedrock cache metrics to OpenAI format 917 | # cached_tokens represents tokens read from cache (cache hits) 918 | 
prompt_tokens_details = PromptTokensDetails( 919 | cached_tokens=cache_read_tokens, 920 | audio_tokens=0, 921 | ) 922 | 923 | # Create completion_tokens_details if reasoning content exists 924 | completion_tokens_details = None 925 | reasoning_tokens = self._estimate_reasoning_tokens(content) if content else 0 926 | if reasoning_tokens > 0: 927 | completion_tokens_details = CompletionTokensDetails( 928 | reasoning_tokens=reasoning_tokens, 929 | audio_tokens=0, 930 | ) 931 | 932 | response = ChatResponse( 933 | id=message_id, 934 | model=model, 935 | choices=[ 936 | Choice( 937 | index=0, 938 | message=message, 939 | finish_reason=self._convert_finish_reason(finish_reason), 940 | logprobs=None, 941 | ) 942 | ], 943 | usage=Usage( 944 | prompt_tokens=input_tokens, 945 | completion_tokens=output_tokens, 946 | total_tokens=total_tokens if total_tokens > 0 else input_tokens + output_tokens, 947 | prompt_tokens_details=prompt_tokens_details, 948 | completion_tokens_details=completion_tokens_details, 949 | ), 950 | ) 951 | response.system_fingerprint = "fp" 952 | response.object = "chat.completion" 953 | response.created = int(time.time()) 954 | return response 955 | 956 | def _create_response_stream( 957 | self, model_id: str, message_id: str, chunk: dict 958 | ) -> ChatStreamResponse | None: 959 | """Parsing the Bedrock stream response chunk. 960 | 961 | Ref: https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference.html#message-inference-examples 962 | """ 963 | if DEBUG: 964 | logger.info("Bedrock response chunk: " + str(chunk)) 965 | 966 | finish_reason = None 967 | message = None 968 | usage = None 969 | 970 | if "messageStart" in chunk: 971 | message = ChatResponseMessage( 972 | role=chunk["messageStart"]["role"], 973 | content="", 974 | ) 975 | 976 | if "contentBlockStart" in chunk: 977 | # tool call start 978 | delta = chunk["contentBlockStart"]["start"] 979 | if "toolUse" in delta: 980 | # first index is content 981 | index = chunk["contentBlockStart"]["contentBlockIndex"] - 1 982 | message = ChatResponseMessage( 983 | tool_calls=[ 984 | ToolCall( 985 | index=index, 986 | type="function", 987 | id=delta["toolUse"]["toolUseId"], 988 | function=ResponseFunction( 989 | name=delta["toolUse"]["name"], 990 | arguments="", 991 | ), 992 | ) 993 | ] 994 | ) 995 | 996 | if "contentBlockDelta" in chunk: 997 | delta = chunk["contentBlockDelta"]["delta"] 998 | if "text" in delta: 999 | # Regular text content - close thinking tag if open 1000 | content = delta["text"] 1001 | if self.think_emitted: 1002 | # Transition from reasoning to regular text 1003 | content = "" + content 1004 | self.think_emitted = False 1005 | message = ChatResponseMessage(content=content) 1006 | elif "reasoningContent" in delta: 1007 | if "text" in delta["reasoningContent"]: 1008 | content = delta["reasoningContent"]["text"] 1009 | if not self.think_emitted: 1010 | # Start of reasoning content 1011 | content = "" + content 1012 | self.think_emitted = True 1013 | message = ChatResponseMessage(content=content) 1014 | elif "signature" in delta["reasoningContent"]: 1015 | # Port of "signature_delta" (for models that send it) 1016 | if self.think_emitted: 1017 | message = ChatResponseMessage(content="") 1018 | self.think_emitted = False 1019 | else: 1020 | return None # Ignore signature if no started 1021 | else: 1022 | # tool use 1023 | index = chunk["contentBlockDelta"]["contentBlockIndex"] - 1 1024 | message = ChatResponseMessage( 1025 | tool_calls=[ 1026 | ToolCall( 1027 | index=index, 1028 | 
function=ResponseFunction( 1029 | arguments=delta["toolUse"]["input"], 1030 | ), 1031 | ) 1032 | ] 1033 | ) 1034 | 1035 | if "messageStop" in chunk: 1036 | # Safety check: Close any open thinking tags before message stops 1037 | if self.think_emitted: 1038 | self.think_emitted = False 1039 | return ChatStreamResponse( 1040 | id=message_id, 1041 | model=model_id, 1042 | choices=[ 1043 | ChoiceDelta( 1044 | index=0, 1045 | delta=ChatResponseMessage(content=""), 1046 | logprobs=None, 1047 | finish_reason=None, 1048 | ) 1049 | ], 1050 | ) 1051 | message = ChatResponseMessage() 1052 | finish_reason = chunk["messageStop"]["stopReason"] 1053 | 1054 | if "metadata" in chunk: 1055 | # usage information in metadata. 1056 | metadata = chunk["metadata"] 1057 | if "usage" in metadata: 1058 | # token usage 1059 | usage_data = metadata["usage"] 1060 | 1061 | # Extract prompt caching metrics if available 1062 | cache_read_tokens = usage_data.get("cacheReadInputTokens", 0) 1063 | cache_creation_tokens = usage_data.get("cacheWriteInputTokens", 0) 1064 | 1065 | # Create prompt_tokens_details if cache metrics are available 1066 | prompt_tokens_details = None 1067 | if cache_read_tokens > 0 or cache_creation_tokens > 0: 1068 | prompt_tokens_details = PromptTokensDetails( 1069 | cached_tokens=cache_read_tokens, 1070 | audio_tokens=0, 1071 | ) 1072 | 1073 | # Calculate actual prompt tokens 1074 | # Bedrock's totalTokens includes all tokens 1075 | # prompt_tokens = totalTokens - outputTokens 1076 | total_tokens = usage_data["totalTokens"] 1077 | output_tokens = usage_data["outputTokens"] 1078 | actual_prompt_tokens = total_tokens - output_tokens 1079 | 1080 | return ChatStreamResponse( 1081 | id=message_id, 1082 | model=model_id, 1083 | choices=[], 1084 | usage=Usage( 1085 | prompt_tokens=actual_prompt_tokens, 1086 | completion_tokens=output_tokens, 1087 | total_tokens=total_tokens, 1088 | prompt_tokens_details=prompt_tokens_details, 1089 | ), 1090 | ) 1091 | 1092 | if message: 1093 | return ChatStreamResponse( 1094 | id=message_id, 1095 | model=model_id, 1096 | choices=[ 1097 | ChoiceDelta( 1098 | index=0, 1099 | delta=message, 1100 | logprobs=None, 1101 | finish_reason=self._convert_finish_reason(finish_reason), 1102 | ) 1103 | ], 1104 | usage=usage, 1105 | ) 1106 | 1107 | return None 1108 | 1109 | def _parse_image(self, image_url: str) -> tuple[bytes, str]: 1110 | """Try to get the raw data from an image url. 1111 | 1112 | Ref: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ImageSource.html 1113 | returns a tuple of (Image Data, Content Type) 1114 | """ 1115 | pattern = r"^data:(image/[a-z]*);base64,\s*" 1116 | content_type = re.search(pattern, image_url) 1117 | # if already base64 encoded. 
1118 | # Only supports 'image/jpeg', 'image/png', 'image/gif' or 'image/webp' 1119 | if content_type: 1120 | image_data = re.sub(pattern, "", image_url) 1121 | return base64.b64decode(image_data), content_type.group(1) 1122 | 1123 | # Send a request to the image URL 1124 | response = requests.get(image_url, timeout=30) 1125 | # Check if the request was successful 1126 | if response.status_code == 200: 1127 | content_type = response.headers.get("Content-Type") 1128 | if not content_type.startswith("image"): 1129 | content_type = "image/jpeg" 1130 | # Get the image content 1131 | image_content = response.content 1132 | return image_content, content_type 1133 | else: 1134 | raise HTTPException( 1135 | status_code=500, detail="Unable to access the image url" 1136 | ) 1137 | 1138 | def _parse_content_parts( 1139 | self, 1140 | message: UserMessage | AssistantMessage, 1141 | model_id: str, 1142 | ) -> list[dict]: 1143 | if isinstance(message.content, str): 1144 | return [ 1145 | { 1146 | "text": message.content, 1147 | } 1148 | ] 1149 | content_parts = [] 1150 | for part in message.content: 1151 | if isinstance(part, TextContent): 1152 | content_parts.append( 1153 | { 1154 | "text": part.text, 1155 | } 1156 | ) 1157 | elif isinstance(part, ImageContent): 1158 | if not self.is_supported_modality(model_id, modality="IMAGE"): 1159 | raise HTTPException( 1160 | status_code=400, 1161 | detail=f"Multimodal message is currently not supported by {model_id}", 1162 | ) 1163 | image_data, content_type = self._parse_image(part.image_url.url) 1164 | content_parts.append( 1165 | { 1166 | "image": { 1167 | "format": content_type[6:], # image/ 1168 | "source": {"bytes": image_data}, 1169 | }, 1170 | } 1171 | ) 1172 | else: 1173 | # Ignore.. 1174 | continue 1175 | return content_parts 1176 | 1177 | @staticmethod 1178 | def is_supported_modality(model_id: str, modality: str = "IMAGE") -> bool: 1179 | model = bedrock_model_list.get(model_id, {}) 1180 | modalities = model.get("modalities", []) 1181 | if modality in modalities: 1182 | return True 1183 | return False 1184 | 1185 | def _convert_tool_spec(self, func: Function) -> dict: 1186 | return { 1187 | "toolSpec": { 1188 | "name": func.name, 1189 | "description": func.description, 1190 | "inputSchema": { 1191 | "json": func.parameters, 1192 | }, 1193 | } 1194 | } 1195 | 1196 | def _calc_budget_tokens( 1197 | self, max_tokens: int, reasoning_effort: Literal["low", "medium", "high"] 1198 | ) -> int: 1199 | # Helper function to calculate budget_tokens based on the max_tokens. 1200 | # Ratio for efforts: Low - 30%, medium - 60%, High: Max token - 1 1201 | # Note that The minimum budget_tokens is 1,024 tokens so far. 1202 | # But it may be changed for different models in the future. 
1203 | if reasoning_effort == "low": 1204 | return int(max_tokens * 0.3) 1205 | elif reasoning_effort == "medium": 1206 | return int(max_tokens * 0.6) 1207 | else: 1208 | return max_tokens - 1 1209 | 1210 | def _convert_finish_reason(self, finish_reason: str | None) -> str | None: 1211 | """ 1212 | Below is a list of finish reason according to OpenAI doc: 1213 | 1214 | - stop: if the model hit a natural stop point or a provided stop sequence, 1215 | - length: if the maximum number of tokens specified in the request was reached, 1216 | - content_filter: if content was omitted due to a flag from our content filters, 1217 | - tool_calls: if the model called a tool 1218 | """ 1219 | if finish_reason: 1220 | finish_reason_mapping = { 1221 | "tool_use": "tool_calls", 1222 | "finished": "stop", 1223 | "end_turn": "stop", 1224 | "max_tokens": "length", 1225 | "stop_sequence": "stop", 1226 | "complete": "stop", 1227 | "content_filtered": "content_filter", 1228 | } 1229 | return finish_reason_mapping.get( 1230 | finish_reason.lower(), finish_reason.lower() 1231 | ) 1232 | return None 1233 | 1234 | 1235 | class BedrockEmbeddingsModel(BaseEmbeddingsModel, ABC): 1236 | accept = "application/json" 1237 | content_type = "application/json" 1238 | 1239 | def _invoke_model(self, args: dict, model_id: str): 1240 | body = json.dumps(args) 1241 | if DEBUG: 1242 | logger.info("Invoke Bedrock Model: " + model_id) 1243 | logger.info("Bedrock request body: " + body) 1244 | try: 1245 | return bedrock_runtime.invoke_model( 1246 | body=body, 1247 | modelId=model_id, 1248 | accept=self.accept, 1249 | contentType=self.content_type, 1250 | ) 1251 | except bedrock_runtime.exceptions.ValidationException as e: 1252 | logger.error("Validation Error: " + str(e)) 1253 | raise HTTPException(status_code=400, detail=str(e)) 1254 | except bedrock_runtime.exceptions.ThrottlingException as e: 1255 | logger.error("Throttling Error: " + str(e)) 1256 | raise HTTPException(status_code=429, detail=str(e)) 1257 | except Exception as e: 1258 | logger.error(e) 1259 | raise HTTPException(status_code=500, detail=str(e)) 1260 | 1261 | def _create_response( 1262 | self, 1263 | embeddings: list[float], 1264 | model: str, 1265 | input_tokens: int = 0, 1266 | output_tokens: int = 0, 1267 | encoding_format: Literal["float", "base64"] = "float", 1268 | ) -> EmbeddingsResponse: 1269 | data = [] 1270 | for i, embedding in enumerate(embeddings): 1271 | if encoding_format == "base64": 1272 | arr = np.array(embedding, dtype=np.float32) 1273 | arr_bytes = arr.tobytes() 1274 | encoded_embedding = base64.b64encode(arr_bytes) 1275 | data.append(Embedding(index=i, embedding=encoded_embedding)) 1276 | else: 1277 | data.append(Embedding(index=i, embedding=embedding)) 1278 | response = EmbeddingsResponse( 1279 | data=data, 1280 | model=model, 1281 | usage=EmbeddingsUsage( 1282 | prompt_tokens=input_tokens, 1283 | total_tokens=input_tokens + output_tokens, 1284 | ), 1285 | ) 1286 | if DEBUG: 1287 | logger.info("Proxy response :" + response.model_dump_json()) 1288 | return response 1289 | 1290 | 1291 | class CohereEmbeddingsModel(BedrockEmbeddingsModel): 1292 | def _parse_args(self, embeddings_request: EmbeddingsRequest) -> dict: 1293 | texts = [] 1294 | if isinstance(embeddings_request.input, str): 1295 | texts = [embeddings_request.input] 1296 | elif isinstance(embeddings_request.input, list): 1297 | texts = embeddings_request.input 1298 | elif isinstance(embeddings_request.input, Iterable): 1299 | # For encoded input 1300 | # The workaround is to use tiktoken 
to decode to get the original text. 1301 | encodings = [] 1302 | for inner in embeddings_request.input: 1303 | if isinstance(inner, int): 1304 | # Iterable[int] 1305 | encodings.append(inner) 1306 | else: 1307 | # Iterable[Iterable[int]] 1308 | text = ENCODER.decode(list(inner)) 1309 | texts.append(text) 1310 | if encodings: 1311 | texts.append(ENCODER.decode(encodings)) 1312 | 1313 | # Maximum of 2048 characters 1314 | args = { 1315 | "texts": texts, 1316 | "input_type": "search_document", 1317 | "truncate": "END", # "NONE|START|END" 1318 | } 1319 | return args 1320 | 1321 | def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse: 1322 | response = self._invoke_model( 1323 | args=self._parse_args(embeddings_request), model_id=embeddings_request.model 1324 | ) 1325 | response_body = json.loads(response.get("body").read()) 1326 | if DEBUG: 1327 | logger.info("Bedrock response body: " + str(response_body)) 1328 | 1329 | return self._create_response( 1330 | embeddings=response_body["embeddings"], 1331 | model=embeddings_request.model, 1332 | encoding_format=embeddings_request.encoding_format, 1333 | ) 1334 | 1335 | 1336 | class TitanEmbeddingsModel(BedrockEmbeddingsModel): 1337 | def _parse_args(self, embeddings_request: EmbeddingsRequest) -> dict: 1338 | if isinstance(embeddings_request.input, str): 1339 | input_text = embeddings_request.input 1340 | elif ( 1341 | isinstance(embeddings_request.input, list) 1342 | and len(embeddings_request.input) == 1 1343 | ): 1344 | input_text = embeddings_request.input[0] 1345 | else: 1346 | raise ValueError( 1347 | "Amazon Titan Embeddings models support only single strings as input." 1348 | ) 1349 | args = { 1350 | "inputText": input_text, 1351 | # Note: inputImage is not supported! 1352 | } 1353 | if embeddings_request.model == "amazon.titan-embed-image-v1": 1354 | args["embeddingConfig"] = ( 1355 | embeddings_request.embedding_config 1356 | if embeddings_request.embedding_config 1357 | else {"outputEmbeddingLength": 1024} 1358 | ) 1359 | return args 1360 | 1361 | def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse: 1362 | response = self._invoke_model( 1363 | args=self._parse_args(embeddings_request), model_id=embeddings_request.model 1364 | ) 1365 | response_body = json.loads(response.get("body").read()) 1366 | if DEBUG: 1367 | logger.info("Bedrock response body: " + str(response_body)) 1368 | 1369 | return self._create_response( 1370 | embeddings=[response_body["embedding"]], 1371 | model=embeddings_request.model, 1372 | input_tokens=response_body["inputTextTokenCount"], 1373 | ) 1374 | 1375 | 1376 | def get_embeddings_model(model_id: str) -> BedrockEmbeddingsModel: 1377 | model_name = SUPPORTED_BEDROCK_EMBEDDING_MODELS.get(model_id, "") 1378 | if DEBUG: 1379 | logger.info("model name is " + model_name) 1380 | match model_name: 1381 | case "Cohere Embed Multilingual" | "Cohere Embed English": 1382 | return CohereEmbeddingsModel() 1383 | case "Titan Embeddings G2 - Text": 1384 | return TitanEmbeddingsModel() 1385 | case _: 1386 | logger.error("Unsupported model id " + model_id) 1387 | raise HTTPException( 1388 | status_code=400, 1389 | detail="Unsupported embedding model id " + model_id, 1390 | ) 1391 | --------------------------------------------------------------------------------