├── images
    └── architecture.png
├── .idea
    └── .gitignore
├── requirements.txt
├── LICENSE
├── app.py
├── .gitignore
├── README.md
└── llm_multi_modal_invoke.py


/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryanadoty/Amazon-Bedrock-Claude3-Multi-Modal-Sample/HEAD/images/architecture.png


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | altair==5.2.0
 2 | attrs==23.2.0
 3 | blinker==1.7.0
 4 | boto3==1.34.56
 5 | botocore==1.34.56
 6 | cachetools==5.3.3
 7 | certifi==2024.2.2
 8 | charset-normalizer==3.3.2
 9 | click==8.1.7
10 | gitdb==4.0.11
11 | GitPython==3.1.42
12 | idna==3.6
13 | importlib-metadata==7.0.1
14 | Jinja2==3.1.3
15 | jmespath==1.0.1
16 | jsonschema==4.21.1
17 | jsonschema-specifications==2023.12.1
18 | markdown-it-py==3.0.0
19 | MarkupSafe==2.1.5
20 | mdurl==0.1.2
21 | numpy==1.26.4
22 | packaging==23.2
23 | pandas==2.2.1
24 | pillow==10.2.0
25 | pip==23.2.1
26 | protobuf==4.25.3
27 | pyarrow==15.0.0
28 | pydeck==0.8.1b0
29 | Pygments==2.17.2
30 | python-dateutil==2.9.0.post0
31 | python-dotenv==1.0.1
32 | pytz==2024.1
33 | referencing==0.33.0
34 | requests==2.31.0
35 | rich==13.7.1
36 | rpds-py==0.18.0
37 | s3transfer==0.10.0
38 | setuptools==68.2.0
39 | six==1.16.0
40 | smmap==5.0.1
41 | streamlit==1.31.1
42 | tenacity==8.2.3
43 | toml==0.10.2
44 | toolz==0.12.1
45 | tornado==6.4
46 | typing_extensions==4.10.0
47 | tzdata==2024.1
48 | tzlocal==5.2
49 | urllib3==1.26.18
50 | validators==0.22.0
51 | wheel==0.41.2
52 | zipp==3.17.0
53 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Ryan Doty
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from pathlib import Path
 3 | import os
 4 | from dotenv import load_dotenv
 5 | from llm_multi_modal_invoke import image_to_text, text_to_text
 6 | 
 7 | # load environment variables
 8 | load_dotenv()
 9 | # title of the streamlit app
10 | st.title(f""":rainbow[Multi-Modal with Amazon Bedrock and Anthropic Claude 3]""")
11 | # directions on what can be done with this streamlit app
12 | st.header(f"""Directions to use this application:
13 | You have several options when it comes to leveraging Claude 3, you can either:
14 | 1. Upload an image, and ask a specific question about it by inserting the question into the text box.
15 | 2. Upload an image, and let the model describe the image without inserting text.
16 | 3. Insert a question in the text box, and let the model answer the question directly without uploading an image.
17 | 
18 | """, divider='rainbow')
19 | # default container that houses the image upload field
20 | with st.container():
21 |     # header that is shown on the web UI
22 |     st.subheader('Image File Upload:')
23 |     # the image upload field, the specific ui element that allows you to upload an image
24 |     # when an image is uploaded it saves the file to the directory, and creates a path to that image
25 |     File = st.file_uploader('Upload an Image', type=["png", "jpg", "jpeg"], key="new")
26 |     # this is the text box that allows the user to insert a question about the uploaded image or a question in general
27 |     text = st.text_input("Do you have a question about the image? Or about anything in general?")
28 |     # this is the button that triggers the invocation of the model, processing of the image and/or question
29 |     result = st.button("Process Image or Answer Question or Both!")
30 |     # if the button is pressed, the model is invoked, and the results are output to the front end
31 |     if result:
32 |         # if an image is uploaded, a file will be present, triggering the image_to_text function
33 |         if File is not None:
34 |             # the image is displayed to the front end for the user to see
35 |             st.image(File)
36 |             # determine the path to temporarily save the image file that was uploaded
37 |             save_folder = os.getenv("save_folder")
38 |             # create a posix path of save_folder and the file name
39 |             save_path = Path(save_folder, File.name)
40 |             # write the uploaded image file to the save_folder you specified
41 |             with open(save_path, mode='wb') as w:
42 |                 w.write(File.getvalue())
43 |             # once the save path exists...
44 |             if save_path.exists():
45 |                 # write a success message saying the image has been successfully saved
46 |                 st.success(f'Image {File.name} is successfully saved!')
47 |                 # running the image to text task, and outputting the results to the front end
48 |                 st.write(image_to_text(File.name, text))
49 |                 # removing the image file that was temporarily saved to perform the question and answer task
50 |                 os.remove(save_path)
51 |         # if an Image is not uploaded, but a question is, the text_to_text function is invoked
52 |         else:
53 |             # running a text to text task, and outputting the results to the front end
54 |             st.write(text_to_text(text))


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Amazon-Bedrock-Claude3-Multi-Modal-Sample
 2 | This is sample code demonstrating the use of Amazon Bedrock and Anthropic Claude 3 to satisfy multi-modal use cases. The application is constructed with a simple streamlit frontend where users can input zero shot requests to satisfy a broad range of use cases, including image to text multi-modal style use cases.
 3 | 
 4 | # **Goal of this Repo:**
 5 | The goal of this repo is to provide users the ability to use Amazon Bedrock (specifically Claude3) and generative AI to leverage its multi-modal capabilities, allowing users to insert text questions, images, or both to get a comprehensive description/or answer based on the image and/or question that was passed in.
 6 | This repo comes with a basic frontend to help users stand up a proof of concept in just a few minutes.
 7 | 
 8 | The architecture and flow of the sample application will be:
 9 | 
10 | ![Alt text](images/architecture.png "POC Architecture")
11 | 
12 | When a user interacts with the GenAI app, the flow is as follows:
13 | 
14 | 1. (1a) The user uploads an image file to the streamlit app, with or without a text question. (app.py). (1b) The user inserts a text question into to the streamlit app, with or without an image. (app.py).
15 | 2. The streamlit app, takes the image file and/or text and saves it. The image and/or text is passed into Amazon Bedrock (Anthropic Claude 3). (llm_multi_modal_invoke.py).
16 | 3. A natural language response is returned to the end user, either describing the image, answering a question about the image, or answering a question in general. (app.py).
17 | 
18 | # How to use this Repo:
19 | 
20 | ## Prerequisites:
21 | 1. Amazon Bedrock Access and CLI Credentials.
22 | 2. Ensure Python 3.9 installed on your machine, it is the most stable version of Python for the packages we will be using, it can be downloaded [here](https://www.python.org/downloads/release/python-3911/).
23 | 
24 | ## Step 1:
25 | The first step of utilizing this repo is performing a git clone of the repository.
26 | 
27 | ```
28 | git clone https://github.com/aws-rdoty/Amazon-Bedrock-Claude3-Multi-Modal-Sample.git
29 | ```
30 | 
31 | After cloning the repo onto your local machine, open it up in your favorite code editor. The file structure of this repo is broken into 3 key files,
32 | the app.py file, the llm_multi_modal_invoke.py file, and the requirements.txt. The app.py file houses the frontend application (a streamlit app). 
33 | The llm_multi_modal_invoke.py file houses the logic of the application, including the image encoding and Amazon Bedrock API invocations.
34 | The requirements.txt file contains all necessary dependencies for this sample application to work.
35 | 
36 | ## Step 2:
37 | Set up a python virtual environment in the root directory of the repository and ensure that you are using Python 3.9. This can be done by running the following commands:
38 | ```
39 | pip install virtualenv
40 | python3.9 -m venv venv
41 | ```
42 | The virtual environment will be extremely useful when you begin installing the requirements. If you need more clarification on the creation of the virtual environment please refer to this [blog](https://www.freecodecamp.org/news/how-to-setup-virtual-environments-in-python/).
43 | After the virtual environment is created, ensure that it is activated, following the activation steps of the virtual environment tool you are using. Likely:
44 | ```
45 | cd venv
46 | cd bin
47 | source activate
48 | cd ../../ 
49 | ```
50 | After your virtual environment has been created and activated, you can install all the requirements found in the requirements.txt file by running this command in the root of this repos directory in your terminal:
51 | ```
52 | pip install -r requirements.txt
53 | ```
54 | 
55 | ## Step 3:
56 | Now that the requirements have been successfully installed in your virtual environment we can begin configuring environment variables.
57 | You will first need to create a .env file in the root of this repo. Within the .env file you just created you will need to configure the .env to contain:
58 | 
59 | ```
60 | profile_name=<AWS_CLI_PROFILE_NAME>
61 | save_folder=<PATH_TO_ROOT_OF_THIS_REPO>
62 | ```
63 | Please ensure that your AWS CLI Profile has access to Amazon Bedrock!
64 | 
65 | ## Step 4:
66 | As soon as you have successfully cloned the repo, created a virtual environment, activated it, installed the requirements.txt, and created a .env file, your application should be ready to go. 
67 | To start up the application with its basic frontend you simply need to run the following command in your terminal while in the root of the repositories' directory:
68 | 
69 | ```
70 | streamlit run app.py
71 | ```
72 | As soon as the application is up and running in your browser of choice you can begin uploading images and or text questions and generating natural language responses detailing the images or the specific questions that were asked.. 
73 | 
74 | ## ***The contents of this repository represent my viewpoints and not of my past or current employers, including Amazon Web Services (AWS). All third-party libraries, modules, plugins, and SDKs are the property of their respective owners.***
75 | 


--------------------------------------------------------------------------------
/llm_multi_modal_invoke.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import json
  3 | from dotenv import load_dotenv
  4 | import os
  5 | import base64
  6 | import io
  7 | from PIL import Image
  8 | 
  9 | # loading in variables from .env file
 10 | load_dotenv()
 11 | 
 12 | # instantiating the Bedrock client, and passing in the CLI profile
 13 | boto3.setup_default_session(profile_name=os.getenv("profile_name"))
 14 | bedrock = boto3.client('bedrock-runtime', 'us-west-2', endpoint_url='https://bedrock-runtime.us-west-2.amazonaws.com')
 15 | 
 16 | 
 17 | def image_base64_encoder(image_name):
 18 |     """
 19 |     This function takes in a string that represent the path to the image that has been uploaded by the user and the function
 20 |     is used to encode the image to base64. The base64 string is then returned.
 21 |     :param image_name: This is the path to the image file that the user has uploaded.
 22 |     :return: A base64 string of the image that was uploaded.
 23 |     """
 24 |     # opening the image file that was uploaded by the user
 25 |     open_image = Image.open(image_name)
 26 |     # creating a BytesIO object to store the image in memory
 27 |     image_bytes = io.BytesIO()
 28 |     # saving the image to the BytesIO object
 29 |     open_image.save(image_bytes, format=open_image.format)
 30 |     # converting the BytesIO object to a base64 string and returning it
 31 |     image_bytes = image_bytes.getvalue()
 32 |     image_base64 = base64.b64encode(image_bytes).decode('utf-8')
 33 |     # getting the appropriate file type as claude 3 expects the file type to be presented
 34 |     file_type = f"image/{open_image.format.lower()}"
 35 |     # returning both the formatted file type string, along with the base64 encoded image
 36 |     return file_type, image_base64
 37 | 
 38 | 
 39 | def image_to_text(image_name, text) -> str:
 40 |     """
 41 |     This function is used to perform an image to text llm invocation against Claude 3. It can work with just an image and/or with
 42 |     text. If the user does not use any text, a default prompt will be passed in along with the system prompt as Claude 3 expects
 43 |     text in the text block of the prompt.
 44 |     :param image_name: This is the path to the image file that the user has uploaded.
 45 |     :param text: This is the text the user inserted in the text box on the frontend.
 46 |     :return: A natural language response giving a detailed analysis of the image that was uploaded or answering a specific
 47 |     question that the user asked along with the image.
 48 |     """
 49 |     # invoking the image_base64_encoder function to encode the image to base64 and get the file type string
 50 |     file_type, image_base64 = image_base64_encoder(image_name)
 51 |     # the system prompt is used as a default prompt, and is always passed into to the model
 52 |     # TODO: Edit the system prompt based on your specific use case
 53 |     system_prompt = """Describe every detail you can about this image, be extremely thorough and detail even the most minute aspects of the image. 
 54 |     
 55 |     If a more specific question is presented by the user, make sure to prioritize that answer.
 56 |     """
 57 |     # checking if the user inserted any text along with the image, if not, we set text to a default since claude expects
 58 |     # text in the text block of the prompt.
 59 |     if text == "":
 60 |         text = "Use the system prompt"
 61 |     # this is the primary prompt passed into Claude3 with the system prompt, user uploaded image in base64 and any
 62 |     # text the user inserted
 63 |     prompt = {
 64 |         "anthropic_version": "bedrock-2023-05-31",
 65 |         "max_tokens": 1000,
 66 |         "temperature": 0.5,
 67 |         "system": system_prompt,
 68 |         "messages": [
 69 |             {
 70 |                 "role": "user",
 71 |                 "content": [
 72 |                     {
 73 |                         "type": "image",
 74 |                         "source": {
 75 |                             "type": "base64",
 76 |                             "media_type": file_type,
 77 |                             "data": image_base64
 78 |                         }
 79 |                     },
 80 |                     {
 81 |                         "type": "text",
 82 |                         "text": text
 83 |                     }
 84 |                 ]
 85 |             }
 86 |         ]
 87 |     }
 88 |     # formatting the prompt as a json string
 89 |     json_prompt = json.dumps(prompt)
 90 |     # invoking Claude3, passing in our prompt
 91 |     response = bedrock.invoke_model(body=json_prompt, modelId="anthropic.claude-3-sonnet-20240229-v1:0",
 92 |                                     accept="application/json", contentType="application/json")
 93 |     # getting the response from Claude3 and parsing it to return to the end user
 94 |     response_body = json.loads(response.get('body').read())
 95 |     # the final string returned to the end user
 96 |     llm_output = response_body['content'][0]['text']
 97 |     # returning the final string to the end user
 98 |     return llm_output
 99 | 
100 | 
101 | def text_to_text(text):
102 |     """
103 |     This function is used if a user does not upload an image, and only uploads text, this text is directly passed into
104 |     Claude3.
105 |     :param text: This is the text that the user inserts on the frontend.
106 |     :return: A natural language response to the question that the user inserted on the frontend.
107 |     """
108 |     # the system prompt is used as a default prompt, and is always passed into to the model
109 |     # TODO: Edit the system prompt based on your specific use case
110 |     system_prompt = """Answer every aspect of the provided question as thoroughly as possible. Be extremely thorough and provide detailed answers to the user provided question.
111 |     """
112 |     # this is the formatted prompt that contains both the system_prompt along with the text prompt that was inserted by the user.
113 |     prompt = {
114 |         "anthropic_version": "bedrock-2023-05-31",
115 |         "max_tokens": 1000,
116 |         "temperature": 0.5,
117 |         "system": system_prompt,
118 |         "messages": [
119 |             {
120 |                 "role": "user",
121 |                 "content": [
122 |                     {
123 |                         "type": "text",
124 |                         "text": text
125 |                     }
126 |                 ]
127 |             }
128 |         ]
129 |     }
130 |     # formatting the prompt as a json string
131 |     json_prompt = json.dumps(prompt)
132 |     # invoking Claude3, passing in our prompt
133 |     response = bedrock.invoke_model(body=json_prompt, modelId="anthropic.claude-3-sonnet-20240229-v1:0",
134 |                                     accept="application/json", contentType="application/json")
135 |     # getting the response from Claude3 and parsing it to return to the end user
136 |     response_body = json.loads(response.get('body').read())
137 |     # the final string returned to the end user
138 |     llm_output = response_body['content'][0]['text']
139 |     # returning the final string to the end user
140 |     return llm_output
141 | 


--------------------------------------------------------------------------------