├── NotoSansCJK-Regular.ttc ├── README.md ├── bbox.py ├── image └── README │ └── 1725058543677.png └── requirements.txt /NotoSansCJK-Regular.ttc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexZhangji/bonding_w_geimini/e1bfadc0a08654339ff55cb2acc85726db95cef0/NotoSansCJK-Regular.ttc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bounding with Gemini 2 | 3 | This Streamlit application allows you to upload an image, send it to the Gemini API for bounding box detection, and visualize the results with labeled bounding boxes drawn directly on the image. 4 | Automatically generated ReadME. This should be very straight forward. 5 | 6 | Example Screen cap: 7 | 8 | ![1725058543677](image/README/1725058543677.png) 9 | 10 | ## Features 11 | 12 | - Upload images in JPG, JPEG, or PNG format. 13 | - Send a prompt to the Gemini API to detect objects and return bounding box coordinates. 14 | - Visualize detected bounding boxes on the image with customizable labels. 15 | - Download the processed image with bounding boxes. 16 | 17 | ## Prerequisites 18 | 19 | Before running the application, ensure you have the following installed: 20 | 21 | - Python 3.9 or later 22 | - A valid Gemini API key 23 | 24 | ### Install Required Python Packages 25 | 26 | Install the necessary Python packages using `pip`: 27 | 28 | ```bash 29 | pip install streamlit Pillow google-generativeai 30 | ``` 31 | 32 | ## Running the Application 33 | 34 | 1. **Clone the Repository:** 35 | 36 | ```bash 37 | git clone https://github.com/AlexZhangji/bonding_w_geimini.git 38 | cd bonding_w_geimini 39 | ``` 40 | 2. **Run the Streamlit App:** 41 | Start the application with the following command: 42 | 43 | ```bash 44 | streamlit run bbox.py 45 | ``` 46 | 3. **Enter Your Gemini API Key:** 47 | 48 | - On the sidebar, enter your Gemini API key in the input box. This key is required to interact with the Gemini API. 49 | 4. **Upload an Image:** 50 | 51 | - Click "Browse files" to upload a JPG, JPEG, or PNG image from your local machine. 52 | 5. **Enter a Prompt:** 53 | 54 | - Enter a prompt in the text area that describes how you want the Gemini API to detect objects and return bounding boxes. 55 | 6. **Process the Image:** 56 | 57 | - Click the "Process" button to send the image and prompt to the Gemini API. 58 | - Wait for the results, which will display the image with bounding boxes drawn around detected objects. 59 | 7. **Download the Processed Image:** 60 | 61 | - After processing, you can download the image with the bounding boxes drawn by clicking the "Download Image with Bounding Boxes" button. 62 | -------------------------------------------------------------------------------- /bbox.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import json 3 | from PIL import Image, ImageDraw, ImageFont 4 | import io 5 | import google.generativeai as genai 6 | import random 7 | import os 8 | from google.api_core.exceptions import GoogleAPIError 9 | 10 | def resize_image(image, max_size=800): 11 | """ 12 | Resize the image maintaining the aspect ratio. If either dimension exceeds max_size, scale it down. 13 | """ 14 | width, height = image.size 15 | if width > height: 16 | if width > max_size: 17 | height = int((height * max_size) / width) 18 | width = max_size 19 | else: 20 | if height > max_size: 21 | width = int((width * max_size) / height) 22 | height = max_size 23 | return image.resize((width, height)) 24 | 25 | def generate_random_color(): 26 | """ 27 | Generate a random color in hexadecimal format. 28 | """ 29 | return "#{:06x}".format(random.randint(0, 0xFFFFFF)) 30 | 31 | def get_font(size=20): 32 | """ 33 | Get a font object for drawing text. Attempts to load NotoSansCJK-Regular.ttc. 34 | Falls back to default font if unavailable. 35 | """ 36 | font_files = ["NotoSansCJK-Regular.ttc"] 37 | 38 | for font_file in font_files: 39 | if os.path.exists(font_file): 40 | try: 41 | return ImageFont.truetype(font_file, size) 42 | except IOError: 43 | continue 44 | 45 | return ImageFont.load_default() 46 | 47 | def draw_text_with_outline(draw, text, position, font, text_color, outline_color): 48 | """ 49 | Draw text with an outline on the image. 50 | """ 51 | x, y = position 52 | # Draw outline 53 | draw.text((x-1, y-1), text, font=font, fill=outline_color) 54 | draw.text((x+1, y-1), text, font=font, fill=outline_color) 55 | draw.text((x-1, y+1), text, font=font, fill=outline_color) 56 | draw.text((x+1, y+1), text, font=font, fill=outline_color) 57 | # Draw text 58 | draw.text(position, text, font=font, fill=text_color) 59 | 60 | def draw_bounding_boxes(image, bboxes): 61 | """ 62 | Draw bounding boxes on the image using the coordinates provided in the bboxes dictionary. 63 | """ 64 | draw = ImageDraw.Draw(image) 65 | width, height = image.size 66 | 67 | font = get_font(20) 68 | 69 | for label, bbox in bboxes.items(): 70 | color = generate_random_color() 71 | ymin, xmin, ymax, xmax = [coord / 1000 * dim for coord, dim in zip(bbox, [height, width, height, width])] 72 | 73 | draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=3) 74 | 75 | # Calculate the area needed for the label and add padding 76 | label_bbox = font.getbbox(label) 77 | label_width = label_bbox[2] - label_bbox[0] + 10 # Adding padding 78 | label_height = label_bbox[3] - label_bbox[1] + 10 # Adding padding 79 | 80 | if xmax - xmin < label_width: 81 | xmax = xmin + label_width 82 | if ymax - ymin < label_height: 83 | ymax = ymin + label_height 84 | 85 | draw.rectangle([xmin, ymin, xmin + label_width, ymin + label_height], fill=color) 86 | draw_text_with_outline(draw, label, (xmin + 5, ymin + 5), font, text_color="white", outline_color="black") # Adding black outline to white text 87 | return image 88 | 89 | def extract_bounding_boxes(text): 90 | """ 91 | Extract bounding boxes from the given text, which is expected to be in JSON format. 92 | """ 93 | try: 94 | bboxes = json.loads(text) 95 | return bboxes 96 | except json.JSONDecodeError: 97 | import re 98 | pattern = r'"([^"]+)":\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]' 99 | matches = re.findall(pattern, text) 100 | return {label: list(map(int, coords)) for label, *coords in matches} 101 | 102 | def main(): 103 | st.title("Bounding with Gemini") 104 | 105 | with st.sidebar: 106 | st.header("Gemini API Configuration") 107 | # Keep the input box for API key 108 | api_key = st.text_input("Enter your Gemini API key", type="password", value=st.session_state.get('api_key', '')) 109 | 110 | if api_key: 111 | st.session_state['api_key'] = api_key 112 | genai.configure(api_key=api_key) 113 | st.success("API key configured!") 114 | else: 115 | st.warning("Please enter your Gemini API key to use the app.") 116 | 117 | model_options = { 118 | "gemini-1.5-pro-exp-0827": "gemini-1.5-pro-exp-0827", 119 | "gemini-1.5-pro": "gemini-1.5-pro", 120 | "gemini-1.5-flash-exp-0827": "gemini-1.5-flash-exp-0827", 121 | "gemini-1.5-flash-8b-exp-0827": "gemini-1.5-flash-8b-exp-0827" 122 | } 123 | selected_model = st.selectbox("Select Gemini Model", options=list(model_options.keys()), format_func=lambda x: model_options[x]) 124 | 125 | uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"]) 126 | 127 | prompt = st.text_area("Enter prompt for Gemini API", "Return bounding boxes as JSON arrays as name of object and its bounding boxes [ymin, xmin, ymax, xmax]. like 'name_1': [ymin, xmin, ymax, xmax]") 128 | 129 | if st.button("Process") and uploaded_file is not None and api_key: 130 | try: 131 | # Validate and open the uploaded image file 132 | try: 133 | original_image = Image.open(uploaded_file) 134 | except IOError: 135 | st.error("Uploaded file is not a valid image. Please upload a JPG, JPEG, or PNG file.") 136 | return 137 | 138 | resized_image = resize_image(original_image) 139 | 140 | model = genai.GenerativeModel(selected_model) 141 | 142 | with st.spinner("Processing the image..."): 143 | try: 144 | response = model.generate_content([prompt, resized_image]) 145 | except GoogleAPIError as api_error: 146 | st.error(f"API request failed: {api_error.message}") 147 | return 148 | 149 | bboxes = extract_bounding_boxes(response.text) 150 | if bboxes: 151 | image_with_boxes = draw_bounding_boxes(resized_image.copy(), bboxes) 152 | 153 | # Display the image with bounding boxes first 154 | st.image(image_with_boxes, caption="Image with Bounding Boxes", use_column_width=True) 155 | 156 | # Display the API response below the image 157 | st.subheader("Gemini API Response") 158 | st.write(response.text) 159 | 160 | buffered = io.BytesIO() 161 | image_with_boxes.save(buffered, format="PNG") 162 | st.download_button( 163 | label="Download Image with Bounding Boxes", 164 | data=buffered.getvalue(), 165 | file_name="image_with_bounding_boxes.png", 166 | mime="image/png" 167 | ) 168 | else: 169 | st.warning("No valid bounding box coordinates found in the response.") 170 | except Exception as e: 171 | st.error(f"An error occurred: {str(e)}") 172 | elif not api_key: 173 | st.info("Please enter your Gemini API key in the sidebar to proceed.") 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /image/README/1725058543677.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexZhangji/bonding_w_geimini/e1bfadc0a08654339ff55cb2acc85726db95cef0/image/README/1725058543677.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | Pillow 3 | google-generativeai 4 | --------------------------------------------------------------------------------