├── images └── architecture.png ├── .idea └── .gitignore ├── requirements.txt ├── LICENSE ├── app.py ├── .gitignore ├── README.md └── llm_multi_modal_invoke.py /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryanadoty/Amazon-Bedrock-Claude3-Multi-Modal-Sample/HEAD/images/architecture.png -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altair==5.2.0 2 | attrs==23.2.0 3 | blinker==1.7.0 4 | boto3==1.34.56 5 | botocore==1.34.56 6 | cachetools==5.3.3 7 | certifi==2024.2.2 8 | charset-normalizer==3.3.2 9 | click==8.1.7 10 | gitdb==4.0.11 11 | GitPython==3.1.42 12 | idna==3.6 13 | importlib-metadata==7.0.1 14 | Jinja2==3.1.3 15 | jmespath==1.0.1 16 | jsonschema==4.21.1 17 | jsonschema-specifications==2023.12.1 18 | markdown-it-py==3.0.0 19 | MarkupSafe==2.1.5 20 | mdurl==0.1.2 21 | numpy==1.26.4 22 | packaging==23.2 23 | pandas==2.2.1 24 | pillow==10.2.0 25 | pip==23.2.1 26 | protobuf==4.25.3 27 | pyarrow==15.0.0 28 | pydeck==0.8.1b0 29 | Pygments==2.17.2 30 | python-dateutil==2.9.0.post0 31 | python-dotenv==1.0.1 32 | pytz==2024.1 33 | referencing==0.33.0 34 | requests==2.31.0 35 | rich==13.7.1 36 | rpds-py==0.18.0 37 | s3transfer==0.10.0 38 | setuptools==68.2.0 39 | six==1.16.0 40 | smmap==5.0.1 41 | streamlit==1.31.1 42 | tenacity==8.2.3 43 | toml==0.10.2 44 | toolz==0.12.1 45 | tornado==6.4 46 | typing_extensions==4.10.0 47 | tzdata==2024.1 48 | 
tzlocal==5.2 49 | urllib3==1.26.18 50 | validators==0.22.0 51 | wheel==0.41.2 52 | zipp==3.17.0 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ryan Doty 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
import os
from pathlib import Path

import streamlit as st
from dotenv import load_dotenv

from llm_multi_modal_invoke import image_to_text, text_to_text

# Load environment variables (profile_name, save_folder) from the .env file.
load_dotenv()

# Title of the streamlit app.
st.title(":rainbow[Multi-Modal with Amazon Bedrock and Anthropic Claude 3]")
# Directions on what can be done with this streamlit app.
st.header("""Directions to use this application:
You have several options when it comes to leveraging Claude 3, you can either:
1. Upload an image, and ask a specific question about it by inserting the question into the text box.
2. Upload an image, and let the model describe the image without inserting text.
3. Insert a question in the text box, and let the model answer the question directly without uploading an image.

""", divider='rainbow')

# Default container that houses the image upload field and the question box.
with st.container():
    # Header that is shown on the web UI.
    st.subheader('Image File Upload:')
    # The image upload field; when an image is uploaded it is saved to disk so
    # the model-invocation layer can re-open and base64-encode it by path.
    uploaded_file = st.file_uploader('Upload an Image', type=["png", "jpg", "jpeg"], key="new")
    # Text box for a question about the uploaded image, or a standalone question.
    text = st.text_input("Do you have a question about the image? Or about anything in general?")
    # Button that triggers invocation of the model.
    result = st.button("Process Image or Answer Question or Both!")
    # If the button is pressed, the model is invoked and results are written
    # to the front end.
    if result:
        # If an image was uploaded, run the image_to_text path.
        if uploaded_file is not None:
            # Display the uploaded image back to the user.
            st.image(uploaded_file)
            # Folder where the upload is temporarily persisted. BUG FIX:
            # default to the current directory instead of crashing with
            # Path(None, ...) when save_folder is not configured in .env.
            save_folder = os.getenv("save_folder", ".")
            save_path = Path(save_folder, uploaded_file.name)
            # Write the uploaded bytes to disk so Pillow can open them by path.
            with open(save_path, mode='wb') as w:
                w.write(uploaded_file.getvalue())
            # Once the save path exists...
            if save_path.exists():
                st.success(f'Image {uploaded_file.name} is successfully saved!')
                # BUG FIX: pass the full saved path rather than the bare file
                # name, so the image is found even when save_folder is not the
                # current working directory.
                st.write(image_to_text(str(save_path), text))
                # Remove the temporarily saved image now that it is processed.
                os.remove(save_path)
        elif text:
            # No image uploaded but a question was typed: text-to-text path.
            st.write(text_to_text(text))
        else:
            # BUG FIX: neither an image nor text was provided — Claude rejects
            # empty text blocks, so show a friendly error instead of invoking
            # the model with an empty prompt.
            st.error("Please upload an image and/or enter a question first.")
.eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon-Bedrock-Claude3-Multi-Modal-Sample 2 | This is sample code demonstrating the use of Amazon Bedrock and Anthropic Claude 3 to satisfy multi-modal use cases. 
The application is constructed with a simple streamlit frontend where users can input zero shot requests to satisfy a broad range of use cases, including image to text multi-modal style use cases. 3 | 4 | # **Goal of this Repo:** 5 | The goal of this repo is to provide users the ability to use Amazon Bedrock (specifically Claude3) and generative AI to leverage its multi-modal capabilities, allowing users to insert text questions, images, or both to get a comprehensive description/or answer based on the image and/or question that was passed in. 6 | This repo comes with a basic frontend to help users stand up a proof of concept in just a few minutes. 7 | 8 | The architecture and flow of the sample application will be: 9 | 10 | ![Alt text](images/architecture.png "POC Architecture") 11 | 12 | When a user interacts with the GenAI app, the flow is as follows: 13 | 14 | 1. (1a) The user uploads an image file to the streamlit app, with or without a text question. (app.py). (1b) The user inserts a text question into to the streamlit app, with or without an image. (app.py). 15 | 2. The streamlit app, takes the image file and/or text and saves it. The image and/or text is passed into Amazon Bedrock (Anthropic Claude 3). (llm_multi_modal_invoke.py). 16 | 3. A natural language response is returned to the end user, either describing the image, answering a question about the image, or answering a question in general. (app.py). 17 | 18 | # How to use this Repo: 19 | 20 | ## Prerequisites: 21 | 1. Amazon Bedrock Access and CLI Credentials. 22 | 2. Ensure Python 3.9 installed on your machine, it is the most stable version of Python for the packages we will be using, it can be downloaded [here](https://www.python.org/downloads/release/python-3911/). 23 | 24 | ## Step 1: 25 | The first step of utilizing this repo is performing a git clone of the repository. 
26 | 27 | ``` 28 | git clone https://github.com/aws-rdoty/Amazon-Bedrock-Claude3-Multi-Modal-Sample.git 29 | ``` 30 | 31 | After cloning the repo onto your local machine, open it up in your favorite code editor. The file structure of this repo is broken into 3 key files, 32 | the app.py file, the llm_multi_modal_invoke.py file, and the requirements.txt. The app.py file houses the frontend application (a streamlit app). 33 | The llm_multi_modal_invoke.py file houses the logic of the application, including the image encoding and Amazon Bedrock API invocations. 34 | The requirements.txt file contains all necessary dependencies for this sample application to work. 35 | 36 | ## Step 2: 37 | Set up a python virtual environment in the root directory of the repository and ensure that you are using Python 3.9. This can be done by running the following commands: 38 | ``` 39 | pip install virtualenv 40 | python3.9 -m venv venv 41 | ``` 42 | The virtual environment will be extremely useful when you begin installing the requirements. If you need more clarification on the creation of the virtual environment please refer to this [blog](https://www.freecodecamp.org/news/how-to-setup-virtual-environments-in-python/). 43 | After the virtual environment is created, ensure that it is activated, following the activation steps of the virtual environment tool you are using. Likely: 44 | ``` 45 | cd venv 46 | cd bin 47 | source activate 48 | cd ../../ 49 | ``` 50 | After your virtual environment has been created and activated, you can install all the requirements found in the requirements.txt file by running this command in the root of this repos directory in your terminal: 51 | ``` 52 | pip install -r requirements.txt 53 | ``` 54 | 55 | ## Step 3: 56 | Now that the requirements have been successfully installed in your virtual environment we can begin configuring environment variables. 57 | You will first need to create a .env file in the root of this repo. 
Within the .env file you just created you will need to configure the .env to contain: 58 | 59 | ``` 60 | profile_name= 61 | save_folder= 62 | ``` 63 | Please ensure that your AWS CLI Profile has access to Amazon Bedrock! 64 | 65 | ## Step 4: 66 | As soon as you have successfully cloned the repo, created a virtual environment, activated it, installed the requirements.txt, and created a .env file, your application should be ready to go. 67 | To start up the application with its basic frontend you simply need to run the following command in your terminal while in the root of the repositories' directory: 68 | 69 | ``` 70 | streamlit run app.py 71 | ``` 72 | As soon as the application is up and running in your browser of choice you can begin uploading images and or text questions and generating natural language responses detailing the images or the specific questions that were asked.. 73 | 74 | ## ***The contents of this repository represent my viewpoints and not of my past or current employers, including Amazon Web Services (AWS). 
import base64
import io
import json
import os

import boto3
from dotenv import load_dotenv
from PIL import Image

# Loading in variables (profile_name) from the .env file.
load_dotenv()

# Instantiating the Bedrock runtime client, passing in the CLI profile.
boto3.setup_default_session(profile_name=os.getenv("profile_name"))
bedrock = boto3.client('bedrock-runtime', 'us-west-2',
                       endpoint_url='https://bedrock-runtime.us-west-2.amazonaws.com')

# Single model ID used by every invocation in this module (previously this
# string was duplicated inside each function).
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"


def image_base64_encoder(image_name):
    """
    Encode an uploaded image file to base64 for the Claude 3 messages API.

    :param image_name: Path to the image file that the user has uploaded.
    :return: A (file_type, image_base64) tuple where file_type is the media
        type string Claude 3 expects (e.g. "image/png") and image_base64 is
        the base64-encoded image payload.
    """
    # Opening the image file that was uploaded by the user.
    open_image = Image.open(image_name)
    # Re-serialize the image into an in-memory buffer so the raw bytes can be
    # base64-encoded.
    image_bytes = io.BytesIO()
    open_image.save(image_bytes, format=open_image.format)
    image_base64 = base64.b64encode(image_bytes.getvalue()).decode('utf-8')
    # Claude 3 expects the media type alongside the payload, e.g. "image/jpeg".
    file_type = f"image/{open_image.format.lower()}"
    return file_type, image_base64


def _invoke_claude(system_prompt, content) -> str:
    """
    Shared helper: build the Claude 3 messages payload, invoke the model on
    Amazon Bedrock, and extract the text of the response. Factors out the
    request/response handling that was previously duplicated in both
    image_to_text and text_to_text.

    :param system_prompt: System prompt used to steer the model.
    :param content: The "content" list for the single user message (text
        and/or image blocks).
    :return: The model's natural language response text.
    """
    prompt = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1000,
        "temperature": 0.5,
        "system": system_prompt,
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ]
    }
    # Invoking Claude 3, passing in the JSON-serialized prompt.
    response = bedrock.invoke_model(body=json.dumps(prompt), modelId=MODEL_ID,
                                    accept="application/json", contentType="application/json")
    # Parsing the streamed response body and returning the first text block.
    response_body = json.loads(response.get('body').read())
    return response_body['content'][0]['text']


def image_to_text(image_name, text) -> str:
    """
    This function is used to perform an image to text llm invocation against
    Claude 3. It can work with just an image and/or with text. If the user
    does not supply any text, a default prompt is passed in along with the
    system prompt, as Claude 3 expects text in the text block of the prompt.

    :param image_name: Path to the image file that the user has uploaded.
    :param text: Text the user inserted in the text box on the frontend.
    :return: A natural language response giving a detailed analysis of the
        image, or answering the specific question the user asked about it.
    """
    # Encode the image to base64 and get the media type string.
    file_type, image_base64 = image_base64_encoder(image_name)
    # The system prompt is used as a default prompt and is always passed in.
    # TODO: Edit the system prompt based on your specific use case
    system_prompt = """Describe every detail you can about this image, be extremely thorough and detail even the most minute aspects of the image.

If a more specific question is presented by the user, make sure to prioritize that answer.
"""
    # Claude expects text in the text block of the prompt, so substitute a
    # default when the user typed nothing.
    if text == "":
        text = "Use the system prompt"
    # User message containing the base64 image plus any user-supplied text.
    content = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": file_type,
                "data": image_base64
            }
        },
        {
            "type": "text",
            "text": text
        }
    ]
    return _invoke_claude(system_prompt, content)


def text_to_text(text):
    """
    This function is used if a user does not upload an image and only inserts
    text; the text is passed directly into Claude 3.

    :param text: The text that the user inserts on the frontend.
    :return: A natural language response to the question the user inserted.
    """
    # The system prompt is used as a default prompt and is always passed in.
    # TODO: Edit the system prompt based on your specific use case
    system_prompt = """Answer every aspect of the provided question as thoroughly as possible. Be extremely thorough and provide detailed answers to the user provided question.
"""
    # Text-only user message.
    return _invoke_claude(system_prompt, [{"type": "text", "text": text}])