├── .gitignore
├── .streamlit
│   └── config.toml
├── requirements.txt
├── utils
│   ├── removemarkdownsyntax.py
│   ├── model.py
│   └── util.py
├── README.md
└── app.py

/.gitignore:
--------------------------------------------------------------------------------
venv
.env
*.mp4
*.mp3
*.pyc

--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
[server]
maxUploadSize = 2000 # 2000 MB = 2 GB

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
google-generativeai
streamlit
pillow
ipython
opencv-python
rich
python-dotenv
streamlit-chat

--------------------------------------------------------------------------------
/utils/removemarkdownsyntax.py:
--------------------------------------------------------------------------------
import re

def remove_markdown(text):
    """
    Remove Markdown formatting from the given text.

    Args:
        text (str): The input text containing Markdown.

    Returns:
        str: The text without any Markdown formatting.
    """
    # Remove headers (e.g., ###, ##, #), keeping the preceding whitespace
    text = re.sub(r'(^|\s)#+\s+', r'\1', text)

    # Remove emphasis (bold, italic, strikethrough)
    text = re.sub(r'(\*{1,2}|_{1,2}|~~)(.*?)\1', r'\2', text)

    # Remove code blocks with language specifiers (e.g., ```json)
    text = re.sub(r'```[a-zA-Z]*\n([\s\S]*?)\n```', r'\1', text)

    # Remove inline code
    text = re.sub(r'`{1,3}([^`]*)`{1,3}', r'\1', text)

    # Remove links [text](url)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove images ![alt text](url)
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)

    # Remove blockquotes
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

    # Remove horizontal rules (---, ***, ___)
    text = re.sub(r'(^|\n)(-{3,}|_{3,}|\*{3,})(\n|$)', r'\1', text)

    # Remove lists (unordered and ordered)
    text = re.sub(r'(^|\n)(\s*[-+*]|\d+\.)\s+', r'\1', text)

    # Remove any remaining Markdown-specific characters
    text = re.sub(r'[*_~`]', '', text)

    return text.strip()

--------------------------------------------------------------------------------
/utils/model.py:
--------------------------------------------------------------------------------
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from dotenv import load_dotenv
import os
from google.generativeai import caching
import datetime

def load_model(type, schemaType):
    load_dotenv()
    genai.configure(api_key=os.getenv('API_KEY'))
    if type is not None and schemaType is not None:
        # Configuration when both type and schemaType are provided
        generation_config = GenerationConfig(
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            candidate_count=1,
            max_output_tokens=8192,
            response_mime_type="application/json",
            response_schema=schemaType
        )
    else:
        # Default configuration when type or schemaType is not provided
        generation_config = GenerationConfig(
            temperature=0.9,
            top_p=1.0,
            top_k=32,
            candidate_count=1,
            max_output_tokens=8192
        )

    model_name = os.getenv('MODEL')
    model = genai.GenerativeModel(model_name=model_name, generation_config=generation_config)
    return model


def load_cached_content_model(contents, display_name, system_instruction, ttl_minutes=5):
    print('loading cached content model')
    load_dotenv()
    genai.configure(api_key=os.getenv('API_KEY'))
    # Create a cache with the specified TTL
    cache = caching.CachedContent.create(
        model=os.getenv('CACHING_MODEL'),
        display_name=display_name,
        system_instruction=system_instruction,
        contents=contents,
        ttl=datetime.timedelta(minutes=ttl_minutes),
    )
    print('cache', cache)
    # Construct a GenerativeModel which uses the created cache.
    model = genai.GenerativeModel.from_cached_content(cached_content=cache)
    return model

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Gemini Multimodal Streamlit Application

A comprehensive multimodal AI application built with Streamlit and Google's Gemini AI that provides video analysis, object detection, audio transcription, and file management capabilities through an intuitive web interface.

## Project Overview

This application leverages Google's Gemini AI models to process and analyze various types of media content. It provides four main functionalities:

- **Video Analysis**: Upload videos to automatically generate metadata including title, summary, duration, and tags
- **Object Detection**: Upload images to detect and locate objects with bounding box visualization
- **Audio Transcription**: Convert audio files to text with speaker identification
- **File API Management**: List, view, and delete files uploaded to the Gemini API

The application is designed to be user-friendly with a clean, tabbed interface that allows users to easily switch between different AI-powered features.
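
Under the hood, every tab follows the same Gemini File API pattern: upload the media, poll until processing finishes, then prompt the model with the file. Here is a minimal sketch of that flow outside Streamlit (assuming a valid `API_KEY` and `MODEL` in `.env`; `sample.mp4` is a hypothetical local file):

```python
import os
import time

import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()
genai.configure(api_key=os.getenv("API_KEY"))

# Upload the media file and wait until Gemini finishes processing it.
media = genai.upload_file("sample.mp4")  # hypothetical local file
while media.state.name == "PROCESSING":
    time.sleep(1)
    media = genai.get_file(media.name)

# Prompt the model with the processed file.
model = genai.GenerativeModel(os.getenv("MODEL"))
result = model.generate_content([media, "Summarize this video."])
print(result.text)
```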

## Features

### 🎥 Video Analysis

- Upload video files (MP4, MOV, AVI, MKV)
- Automatic metadata generation with structured JSON schema
- Extract video title, summary, duration, and relevant tags
- Real-time processing status updates

### 📸 Object Detection

- Upload images (JPG, JPEG, PNG) for object detection
- Specify custom objects to detect or detect all objects
- Visual bounding box annotations with object labels
- Pixel-perfect coordinate extraction and display
- Support for normalized coordinate conversion

### 🔊 Audio Transcription

- Support for multiple audio formats (MP3, WAV, AIFF, AAC, OGG, FLAC)
- Speaker identification and dialogue formatting
- Accurate transcription with filler-word preservation
- Optimized for interview and conversation transcription

### 📂 File API Management

- List all uploaded files with display names and file names
- Individual file deletion by name
- Bulk delete all files functionality
- Real-time file status monitoring

## Tech Stack

### Core Technologies

- **Python 3.x** - Primary programming language
- **Streamlit** - Web application framework for the user interface
- **Google Generative AI (Gemini)** - Core AI model for multimodal processing

### AI & Machine Learning

- **google-generativeai** - Official Google Gemini AI SDK
- **Pillow (PIL)** - Image processing and manipulation
- **OpenCV** - Computer vision operations

### Utilities & Support

- **python-dotenv** - Environment variable management
- **Rich** - Enhanced terminal output formatting
- **IPython** - Interactive Python environment
- **streamlit-chat** - Chat interface components

### Standard Library

- **JSON** - Data serialization and parsing
- **Regex (re)** - Text processing and markdown removal

## Project Structure

```
GeminiMultiModalStreamlit/
├── app.py                          # Main Streamlit application
├── requirements.txt                # Python dependencies
├── README.md                       # Project documentation
├── .streamlit/
│   └── config.toml                 # Streamlit settings (2 GB upload limit)
├── utils/                          # Utility modules
│   ├── model.py                    # AI model loading and configuration
│   ├── util.py                     # Core utility functions
│   └── removemarkdownsyntax.py     # Markdown text processing
└── temp/                           # Temporary file storage (auto-created)
```

### Key Components

- **`app.py`**: Main application entry point with Streamlit UI and tab management
- **`utils/model.py`**: Handles Gemini AI model initialization, configuration, and caching
- **`utils/util.py`**: Core utilities for file upload, processing, metadata generation, and image processing
- **`utils/removemarkdownsyntax.py`**: Text processing utilities for cleaning AI responses

## Setup Instructions

### Prerequisites

- Python 3.7 or higher
- Google AI API key (from Google AI Studio)
- Virtual environment (recommended)

### 1. Clone and Navigate

```bash
git clone <repository-url>
cd GeminiMultiModalStreamlit
```

### 2. Create Virtual Environment

```bash
# Windows
python -m venv venv
venv\Scripts\activate

# macOS/Linux
python -m venv venv
source venv/bin/activate
```

### 3. Install Dependencies

```bash
pip install -r requirements.txt
```

### 4. Environment Configuration

Create a `.env` file in the project root directory:

```plaintext
API_KEY=your_google_ai_api_key_here
MODEL=gemini-1.5-flash-latest
CACHING_MODEL=gemini-1.5-flash-001
```

**Important**:

- Replace `your_google_ai_api_key_here` with your actual Google AI API key
- You can obtain an API key from [Google AI Studio](https://aistudio.google.com/)
- The `MODEL` can be any supported Gemini model version
- `CACHING_MODEL` is used for cached content operations

### 5. Run the Application

```bash
streamlit run app.py
```

The application will start and be accessible at `http://localhost:8501`.

## Usage Guide

### Getting Started

1. Launch the application using the command above
2. The interface will display four tabs: Video, Image, Audio, and File API
3. Select the appropriate tab based on your use case

### Video Analysis

1. Navigate to the **Video** tab
2. Upload a video file using the file uploader
3. Click **"Analyze Video"** to start processing
4. Wait for file upload and processing completion
5. View the generated metadata including title, summary, duration, and tags

### Object Detection

1. Go to the **Image** tab
2. Upload an image using the sidebar file uploader
3. Choose detection mode:
   - **Specific Object**: Enter the object name you want to detect
   - **All Objects**: Check "Detect All Objects" to find everything
4. Click **"Detect Objects"** to process the image
5. View the annotated image with bounding boxes and coordinate details

### Audio Transcription

1. Select the **Audio** tab
2. Upload an audio file (supports multiple formats)
3. Click **"Transcribe Audio"** to start processing
4. Wait for upload and transcription completion
5. View the formatted transcription with speaker identification

### File Management

1. Access the **File API** tab
2. **List Files**: Click to view all uploaded files
3. **Delete Single File**: Enter file name and click delete
4. **Delete All Files**: Check the checkbox and confirm to remove all files

## Configuration

### Environment Variables

The application requires the following environment variables in your `.env` file:

| Variable        | Description                  | Required | Example                   |
| --------------- | ---------------------------- | -------- | ------------------------- |
| `API_KEY`       | Google AI API key            | Yes      | `AIza...`                 |
| `MODEL`         | Gemini model version         | Yes      | `gemini-1.5-flash-latest` |
| `CACHING_MODEL` | Model for caching operations | No       | `gemini-1.5-flash-001`    |

### Model Configuration

The application automatically configures different model settings based on use case:

- **Structured Output** (Video Analysis): JSON schema response with specific temperature and token limits
- **General Purpose** (Image/Audio): Standard configuration with higher creativity settings
- **Caching**: Optimized for repeated operations with TTL management

### File Handling

- Temporary files are automatically created and cleaned up
- Supported video formats: MP4, MOV, AVI, MKV
- Supported image formats: JPG, JPEG, PNG
- Supported audio formats: MP3, WAV, AIFF, AAC, OGG, FLAC
- Files are uploaded to Google's servers and processed remotely

### Performance Optimization

- Model instances are cached using Streamlit's `@st.cache_resource`
- File processing includes polling mechanisms for completion status
- Bounding box coordinates are normalized and converted to pixels for accuracy (see the appendix below)
- Error handling and validation throughout the processing pipeline

---

**Note**: This application requires an active internet connection and valid Google AI API credentials to function properly. Make sure your API key has sufficient quota for the operations you plan to perform.
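
### Appendix: Bounding Box Coordinate Conversion

As noted under Performance Optimization, the model returns bounding boxes normalized to a 0-1000 range, and `utils/util.py` scales them to pixel coordinates before drawing. A standalone sketch of that conversion, mirroring `convert_normalized_to_pixel`:

```python
def to_pixels(box, image_width, image_height):
    """Scale a 0-1000 normalized box (as returned by the model) to pixel coordinates."""
    return {
        "name": box["name"],
        "xmin": int(box["xmin"] / 1000 * image_width),
        "ymin": int(box["ymin"] / 1000 * image_height),
        "xmax": int(box["xmax"] / 1000 * image_width),
        "ymax": int(box["ymax"] / 1000 * image_height),
    }

# Example: xmin=250 on a 1920x1080 image lands at pixel x=480.
print(to_pixels({"name": "dog", "xmin": 250, "ymin": 100, "xmax": 750, "ymax": 900}, 1920, 1080))
```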

--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
import pathlib
import google.generativeai as genai
from typing import Optional, Dict, Any, List
import json
import time
import streamlit as st
import re
from PIL import Image, ImageDraw
import os

def upload_file_to_gemini(file) -> Optional[Dict[str, Any]]:
    """Uploads a file to Google Gemini."""
    try:
        temp_dir = pathlib.Path("temp")
        temp_dir.mkdir(exist_ok=True)
        file_path = temp_dir / file.name
        with open(file_path, "wb") as f:
            f.write(file.getbuffer())
        uploaded_file = genai.upload_file(file_path)
        os.remove(file_path)  # Remove the local copy after upload
        return uploaded_file
    except Exception as e:
        st.error(f"Error uploading file: {e}")
        return None


def poll_file_processing(uploaded_file) -> Optional[Dict[str, Any]]:
    """Polls the status of the uploaded file until processing is complete."""
    try:
        with st.spinner('Processing file...'):
            while uploaded_file.state.name == "PROCESSING":
                time.sleep(1)
                uploaded_file = genai.get_file(uploaded_file.name)
            if uploaded_file.state.name == "ACTIVE":
                st.success("File processing completed.")
                return uploaded_file
            elif uploaded_file.state.name == "FAILED":
                st.error("File processing failed.")
                return None
            else:
                st.error(f"Unexpected file state: {uploaded_file.state.name}")
                return None
    except Exception as e:
        st.error(f"Error during file processing: {e}")
        return None


def generate_metadata(model: Any, video_file) -> Optional[Dict[str, Any]]:
    """Generates metadata for the uploaded video using the Generative AI model."""
    try:
        prompt = "Provide the details based on the provided response schema"
        result = model.generate_content([video_file, prompt])
        if result.text:
            metadata = json.loads(result.text)
            return metadata
        else:
            st.error("No response received from the model.")
            return None
    except json.JSONDecodeError as je:
        st.error(f"Error decoding JSON response: {je}")
        return None
    except Exception as e:
        st.error(f"Error generating metadata: {e}")
        return None


def generate_transcription(model: Any, audio_file) -> Optional[str]:
    """Generates transcription for the uploaded audio using the Generative AI model."""
    try:
        prompt = """
        Please transcribe this interview in the following format:
        [Speaker Name or Speaker A/B]: [Dialogue or caption].
        If a speaker's name is mentioned or can be identified in the audio, map the actual names accordingly.
        If no names are given, use Speaker A, Speaker B, etc.
        Ensure the transcription captures all spoken words accurately, including filler words where appropriate.
        """
        responses = model.generate_content([audio_file, prompt])
        if responses.text:
            transcription = responses.text.strip()
            return transcription
        else:
            st.error("No response received from the model.")
            return None
    except Exception as e:
        st.error(f"Error generating transcription: {e}")
        return None


def remove_markdown(text):
    """
    Remove Markdown formatting from the given text.

    Args:
        text (str): The input text containing Markdown.

    Returns:
        str: The text without any Markdown formatting.
    """
    # Remove headers (e.g., ###, ##, #), keeping the preceding whitespace
    text = re.sub(r'(^|\s)#+\s+', r'\1', text)

    # Remove emphasis (bold, italic, strikethrough)
    text = re.sub(r'(\*{1,2}|_{1,2}|~~)(.*?)\1', r'\2', text)

    # Remove code blocks with language specifiers (e.g., ```json)
    text = re.sub(r'```[a-zA-Z]*\n([\s\S]*?)\n```', r'\1', text)

    # Remove inline code
    text = re.sub(r'`{1,3}([^`]*)`{1,3}', r'\1', text)

    # Remove links [text](url)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove images ![alt text](url)
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)

    # Remove blockquotes
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

    # Remove horizontal rules (---, ***, ___)
    text = re.sub(r'(^|\n)(-{3,}|_{3,}|\*{3,})(\n|$)', r'\1', text)

    # Remove lists (unordered and ordered)
    text = re.sub(r'(^|\n)(\s*[-+*]|\d+\.)\s+', r'\1', text)

    # Remove any remaining Markdown-specific characters
    text = re.sub(r'[*_~`]', '', text)

    return text.strip()


def parse_bounding_boxes(response_text):
    """
    Parses the JSON response to extract bounding boxes along with their names.

    Args:
        response_text (str): The raw text response from the model.

    Returns:
        list of dict: A list containing bounding box dictionaries with object names.

    Raises:
        ValueError: If JSON parsing fails or the structure is incorrect.
    """
    try:
        bounding_boxes = json.loads(response_text)

        # Validate that the response is a list
        if not isinstance(bounding_boxes, list):
            raise ValueError("Response JSON is not a list.")

        # Define the required keys and their expected types
        required_keys = {
            "name": str,
            "ymin": (int, float),
            "xmin": (int, float),
            "ymax": (int, float),
            "xmax": (int, float)
        }

        # Validate each bounding box
        for box in bounding_boxes:
            # Check if all required keys are present
            missing_keys = [key for key in required_keys if key not in box]
            if missing_keys:
                raise ValueError(f"Bounding box missing keys: {missing_keys} in {box}")

            # Validate the type of each key
            for key, expected_type in required_keys.items():
                if not isinstance(box[key], expected_type):
                    raise ValueError(f"Bounding box key '{key}' has incorrect type in {box}. Expected {expected_type}, got {type(box[key])}.")

        return bounding_boxes
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON response: {e}")


def convert_normalized_to_pixel(bounding_boxes, image_width, image_height):
    """
    Converts normalized bounding box coordinates to pixel values.

    Args:
        bounding_boxes (list of dict): List of bounding boxes with normalized coordinates.
        image_width (int): Width of the original image in pixels.
        image_height (int): Height of the original image in pixels.

    Returns:
        list of dict: List of bounding boxes with pixel coordinates.
    """
    converted_boxes = []
    for box in bounding_boxes:
        name = box['name']
        # Coordinates are normalized to a 0-1000 range; scale them to pixels.
        xmin = (box['xmin'] / 1000) * image_width
        ymin = (box['ymin'] / 1000) * image_height
        xmax = (box['xmax'] / 1000) * image_width
        ymax = (box['ymax'] / 1000) * image_height

        # Ensure coordinates are integers
        xmin, ymin, xmax, ymax = map(int, [xmin, ymin, xmax, ymax])

        # Validate coordinates
        if not (0 <= xmin < xmax <= image_width) or not (0 <= ymin < ymax <= image_height):
            print(f"Invalid bounding box coordinates after conversion: {box}")
            continue  # Skip invalid boxes

        converted_boxes.append({
            'name': name,
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax
        })

    return converted_boxes


def draw_bounding_boxes(image, bounding_boxes, output_path=None):
    """
    Draws multiple bounding boxes on the image with labeled text.

    Args:
        image (PIL.Image.Image): The original image.
        bounding_boxes (list of dict): List of bounding boxes with pixel coordinates.
        output_path (str, optional): Path to save the annotated image. If None, returns the image object.

    Returns:
        PIL.Image.Image: Image with bounding boxes and labels drawn.
    """
    draw = ImageDraw.Draw(image)

    # You may need to adjust the font size based on your needs
    try:
        from PIL import ImageFont
        font = ImageFont.truetype("arial.ttf", 20)
    except Exception:
        font = ImageFont.load_default()

    for idx, box in enumerate(bounding_boxes, 1):
        xmin = box['xmin']
        ymin = box['ymin']
        xmax = box['xmax']
        ymax = box['ymax']
        name = box['name']

        # Draw the red bounding box
        draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=1)

        # Prepare the label text
        label_text = f"{name}"

        # Get text size
        text_bbox = draw.textbbox((0, 0), label_text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Calculate text position (above the bounding box)
        text_x = xmin
        text_y = max(0, ymin - text_height - 2)  # 2 pixels padding

        # Draw yellow background for text
        draw.rectangle(
            [
                text_x - 2,  # 2 pixels padding
                text_y - 2,
                text_x + text_width + 2,
                text_y + text_height + 2
            ],
            fill="yellow"
        )

        # Draw black text on yellow background
        draw.text((text_x, text_y), label_text, fill="black", font=font)

    if output_path:
        image.save(output_path)
        print(f"Annotated image saved at '{output_path}'.")
    return image

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# app.py
import streamlit as st
from utils.util import (
    upload_file_to_gemini,
    poll_file_processing,
    generate_metadata,
    generate_transcription,
    remove_markdown,
    parse_bounding_boxes,
    convert_normalized_to_pixel,
    draw_bounding_boxes
)
from utils.model import load_model
from PIL import Image
from typing import TypedDict, Optional, List
import google.generativeai as genai

def main():
    st.set_page_config(page_title="Gemini Multimodal", layout="wide")
    st.title("Gemini Multimodal Application")

    # Tab selection using radio
    tab = st.radio("Navigation", ["Video", "Image", "Audio", "File API"], horizontal=True, label_visibility="collapsed")

    if tab == "Video":
        video_tab()
    elif tab == "Image":
        image_tab()
    elif tab == "Audio":
        audio_tab()
    elif tab == "File API":
        file_api_tab()

def video_tab():

    # Define the structure for Video Analysis metadata
    class VideoAnalysis(TypedDict):
        name: str
        title: str
        total_duration: float  # Duration in seconds
        summary: str
        small_summary: str
        tags: Optional[List[str]]

    def display_metadata(metadata: VideoAnalysis):
        """Displays the generated metadata in a user-friendly format."""
        st.header("Generated Metadata")
        st.subheader(f"Title: {metadata.get('title', 'N/A')}")
        st.write(f"**Name:** {metadata.get('name', 'N/A')}")
        st.write(f"**Total Duration:** {metadata.get('total_duration', 'N/A')} seconds")
        st.write(f"**Summary:** {metadata.get('summary', 'N/A')}")
        st.write(f"**Small Summary:** {metadata.get('small_summary', 'N/A')}")
        st.write(f"**Tags:** {', '.join(metadata.get('tags', [])) if metadata.get('tags') else 'N/A'}")

    st.header("📹 Video Metadata and Summary Generation")
    st.write("Upload a video to analyze its content and automatically generate metadata and a summary.")

    model = load_model(type="video", schemaType=VideoAnalysis)

    uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "mov", "avi", "mkv"])

    if uploaded_file is not None:
        col1, col2, col3 = st.columns([2, 6, 2])  # 20%, 60%, 20% width
        with col2:
            st.video(uploaded_file)
        uploaded_file.seek(0)
        if st.button("Analyze Video"):
            with st.spinner('Uploading video...'):
                uploaded_genai_file = upload_file_to_gemini(uploaded_file)
            if uploaded_genai_file is None:
                st.error("Failed to upload the video.")
                return
            st.success("File upload successful!")

            processed_file = poll_file_processing(uploaded_genai_file)
            if processed_file is None:
                st.error("Video processing failed.")
                return

            with st.spinner('Generating metadata...'):
                metadata = generate_metadata(model, processed_file)
                if metadata:
                    st.success("Metadata generation successful!")
                    display_metadata(metadata)
    else:
        st.info("Please upload a video file to begin analysis.")


def image_tab():
    @st.cache_resource
    def get_model():
        model = load_model(type=None, schemaType=None)
        return model

    def process_image(image: Image.Image, object_name: str, model):
        # Define the dynamic prompt with the user-specified object
        prompt = f"""
        You are given an image. Identify all {object_name} in the image and provide their bounding boxes.
        Return ONLY a valid JSON array in the exact format shown below.
        Return a specific name; for example, if it's a dog and you can identify the breed, return the breed name.
        Do NOT include any additional text, explanations, comments, trailing commas, or markdown formatting such as code blocks.
        Use this JSON schema:
        [
          {{
            "name": "string",
            "ymin": float,
            "xmin": float,
            "ymax": float,
            "xmax": float
          }}
        ]
        """
        try:
            response = model.generate_content([image, prompt])
        except Exception as e:
            st.error(f"Error generating content from the model: {e}")
            return None

        final_response = remove_markdown(response.text)

        try:
            bounding_boxes = parse_bounding_boxes(final_response)
        except ValueError as ve:
            st.error(f"Error parsing bounding boxes: {ve}")
            return None

        image_width, image_height = image.size
        converted_boxes = convert_normalized_to_pixel(bounding_boxes, image_width, image_height)
        return converted_boxes

    st.header("📸 Object Detection")
    st.write("""
    Upload an image, then specify the object you want to detect.
    The application will draw bounding boxes around the detected objects and display their coordinates.
    """)

    # Sidebar for user inputs
    st.sidebar.header("🔍 Detection Settings")

    # Radio buttons to select input method (camera capture is currently disabled)
    input_method = st.sidebar.radio(
        "Select Image Input Method",
        # ("Upload Image", "Use Camera")
        ("Upload Image",)  # trailing comma keeps this a tuple, not a plain string
    )

    # Initialize uploaded_image as None
    uploaded_image = None

    if input_method == "Upload Image":
        uploaded_file = st.sidebar.file_uploader("📂 Choose an image...", type=["jpg", "jpeg", "png"])
        if uploaded_file is not None:
            try:
                uploaded_image = Image.open(uploaded_file).convert("RGB")
                st.image(uploaded_image, caption='🖼️ Uploaded Image', use_container_width=True)
            except Exception as e:
                st.error(f"❌ Error opening image: {e}")
    # elif input_method == "Use Camera":
    #     captured_image = st.sidebar.camera_input("📸 Capture an image")
    #     if captured_image is not None:
    #         try:
    #             uploaded_image = Image.open(captured_image).convert("RGB")
    #             st.image(uploaded_image, caption='🖼️ Captured Image', use_container_width=True)
    #         except Exception as e:
    #             st.error(f"❌ Error capturing image: {e}")

    # Add detect-all checkbox
    detect_all = st.sidebar.checkbox("Detect All Objects")
    # Show object input only if detect all is not checked
    if not detect_all:
        object_name = st.sidebar.text_input("📝 Enter the object to detect", placeholder="e.g., cat, bottle")
    else:
        object_name = "all"  # Set object_name to "all" when detect all is checked

    detect_button = st.sidebar.button("🚀 Detect Objects")

    if detect_button:
        if uploaded_image is not None:
            if not detect_all and not object_name.strip():
                st.error("⚠️ Please enter a valid object name to detect.")
                st.stop()

            with st.spinner("🔄 Loading the model..."):
                model = get_model()

            with st.spinner("🔍 Detecting objects..."):
                converted_boxes = process_image(uploaded_image, object_name, model)

            if converted_boxes is None:
                st.error("❌ An error occurred during object detection.")
                st.stop()

            if converted_boxes:
                annotated_image = draw_bounding_boxes(uploaded_image.copy(), converted_boxes, output_path=None)
                st.image(annotated_image, caption='🖼️ Annotated Image', use_container_width=True)

                st.subheader("📍 Bounding Box Coordinates")
                for idx, box in enumerate(converted_boxes, start=1):
                    st.markdown(f"**{idx}. {box['name'].capitalize()}:**")
                    st.markdown(f"- ymin: {box['ymin']}")
                    st.markdown(f"- xmin: {box['xmin']}")
                    st.markdown(f"- ymax: {box['ymax']}")
                    st.markdown(f"- xmax: {box['xmax']}")
                    st.markdown("---")
            else:
                if detect_all:
                    st.warning("⚠️ No objects were detected in the image.")
                else:
                    st.warning(f"⚠️ No instances of '{object_name}' were found in the image.")
        else:
            st.error("⚠️ Please upload an image before running detection.")
            st.stop()


def audio_tab():

    st.header("🔊 Audio Transcription")
    st.write("Upload an audio file to transcribe its content.")

    model = load_model(type=None, schemaType=None)

    uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "aiff", "aac", "ogg", "flac"])

    if uploaded_audio is not None:
        st.audio(uploaded_audio, format=uploaded_audio.type)  # Use the file's own MIME type rather than assuming MP3
        if st.button("Transcribe Audio"):
            with st.spinner('Uploading audio...'):
                uploaded_genai_file = upload_file_to_gemini(uploaded_audio)
            if uploaded_genai_file is None:
                st.error("Failed to upload the audio.")
                return
            st.success("File upload successful!")

            processed_file = poll_file_processing(uploaded_genai_file)
            if processed_file is None:
                st.error("Audio processing failed.")
                return

            with st.spinner('Transcribing audio...'):
                transcription = generate_transcription(model, processed_file)
                if transcription:
                    st.success("Transcription successful!")
                    st.text_area("Transcription", transcription, height=300)
    else:
        st.info("Please upload an audio file to begin transcription.")


def file_api_tab():

    st.header("📂 File API Operations")
    st.write("List and manage files uploaded to the API.")

    # List files
    if st.button("List Files"):
        st.subheader("Uploaded Files:")
        try:
            files = genai.list_files()
            file_list = list(files)  # Convert generator to list

            # Check if the list has files
            if len(file_list) > 0:
                cols = st.columns(2)  # Create two columns
                with cols[0]:
                    st.markdown("**__Display Name__**")
                with cols[1]:
                    st.write("**__File Name__**")

                for f in file_list:
                    with cols[0]:
                        st.write(f"📄 {f.display_name}")  # Display Name
                    with cols[1]:
                        st.write(f"📄 {f.name}")  # File Name
            else:
                st.markdown("No files found.")  # Inform the user when there are no files
        except Exception as e:
            st.error(f"Error listing files: {e}")

    # Delete file
    st.subheader("Delete a Single File")
    file_name_to_delete = st.text_input("Enter the name of the file to delete")
    if st.button("Delete File"):
        if file_name_to_delete.strip():
            try:
                myfile = genai.get_file(file_name_to_delete)
                myfile.delete()
                st.success(f"File '{file_name_to_delete}' has been deleted.")
            except Exception as e:
                st.error(f"Error deleting file: {e}")
        else:
            st.error("Please enter a valid file name.")

    # Option to delete all files
    st.subheader("Delete All Files")
    delete_all = st.checkbox("Delete all files")
    if delete_all:
        if st.button("Confirm Delete All"):
            with st.spinner("Deleting all files..."):
                try:
                    files = genai.list_files()
                    for f in files:
                        myfile = genai.get_file(f.name)
                        myfile.delete()
                    st.success("All files have been deleted.")
                except Exception as e:
                    st.error(f"Error deleting all files: {e}")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------