├── .gitignore
├── .streamlit
│   └── config.toml
├── requirements.txt
├── utils
│   ├── removemarkdownsyntax.py
│   ├── model.py
│   └── util.py
├── README.md
└── app.py

/.gitignore:
--------------------------------------------------------------------------------
venv
.env
*.mp4
*.mp3
*.pyc

--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
[server]
maxUploadSize = 2000 # 2000 MB = 2 GB

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
google-generativeai
streamlit
pillow
ipython
opencv-python
rich
python-dotenv
streamlit-chat

--------------------------------------------------------------------------------
/utils/removemarkdownsyntax.py:
--------------------------------------------------------------------------------
import re

def remove_markdown(text):
    """
    Remove Markdown formatting from the given text.

    Args:
        text (str): The input text containing Markdown.

    Returns:
        str: The text without any Markdown formatting.
    """
    # Remove headers (e.g., ###, ##, #), keeping the preceding whitespace
    text = re.sub(r'(^|\s)#+\s+', r'\1', text)

    # Remove emphasis (bold, italic, strikethrough)
    text = re.sub(r'(\*{1,2}|_{1,2}|~~)(.*?)\1', r'\2', text)

    # Remove code blocks with language specifiers (e.g., ```json)
    text = re.sub(r'```[a-zA-Z]*\n([\s\S]*?)\n```', r'\1', text)

    # Remove inline code
    text = re.sub(r'`{1,3}([^`]*)`{1,3}', r'\1', text)

    # Remove links [text](url)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove images ![alt text](url)
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)

    # Remove blockquotes
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

    # Remove horizontal rules (---, ***, ___)
    text = re.sub(r'(^|\n)(-{3,}|_{3,}|\*{3,})(\n|$)', r'\1', text)

    # Remove lists (unordered and ordered)
    text = re.sub(r'(^|\n)(\s*[-+*]|\d+\.)\s+', r'\1', text)

    # Remove any remaining Markdown-specific characters
    text = re.sub(r'[*_~`]', '', text)

    return text.strip()

--------------------------------------------------------------------------------
/utils/model.py:
--------------------------------------------------------------------------------
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from dotenv import load_dotenv
import os
from google.generativeai import caching
import datetime

def load_model(type, schemaType):
    load_dotenv()
    genai.configure(api_key=os.getenv('API_KEY'))
    if type is not None and schemaType is not None:
        # Configuration when both type and schemaType are provided
        generation_config = GenerationConfig(
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            candidate_count=1,
            max_output_tokens=8192,
            response_mime_type="application/json",
            response_schema=schemaType
        )
    else:
        # Default configuration when type or schemaType is not provided
        generation_config = GenerationConfig(
            temperature=0.9,
            top_p=1.0,
            top_k=32,
            candidate_count=1,
            max_output_tokens=8192
        )

    model_name = os.getenv('MODEL')
    model = genai.GenerativeModel(model_name=model_name, generation_config=generation_config)
    return model


def load_cached_content_model(contents, display_name, system_instruction, ttl_minutes=5):
    print('loading cached content model')
    load_dotenv()
    genai.configure(api_key=os.getenv('API_KEY'))
    # Create a cache with the specified TTL
    cache = caching.CachedContent.create(
        model=os.getenv('CACHING_MODEL'),
        display_name=display_name,
        system_instruction=system_instruction,
        contents=contents,
        ttl=datetime.timedelta(minutes=ttl_minutes),
    )
    print('cache', cache)
    # Construct a GenerativeModel which uses the created cache.
    model = genai.GenerativeModel.from_cached_content(cached_content=cache)
    return model

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Gemini Multimodal Streamlit Application

A comprehensive multimodal AI application built with Streamlit and Google's Gemini AI that provides video analysis, object detection, audio transcription, and file management capabilities through an intuitive web interface.

## Project Overview

This application leverages Google's Gemini AI models to process and analyze various types of media content. It provides four main functionalities:

- **Video Analysis**: Upload videos to automatically generate metadata including title, summary, duration, and tags
- **Object Detection**: Upload images to detect and locate objects with bounding box visualization
- **Audio Transcription**: Convert audio files to text with speaker identification
- **File API Management**: List, view, and delete files uploaded to the Gemini API

The application is designed to be user-friendly with a clean, tabbed interface that allows users to easily switch between different AI-powered features.
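
Under the hood, every tab follows the same Gemini File API pattern: upload the media, poll until processing finishes, then prompt the model with the file. Here is a minimal sketch of that flow outside Streamlit (assuming a valid `API_KEY` and `MODEL` in `.env`; `sample.mp4` is a hypothetical local file):

```python
import os
import time

import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()
genai.configure(api_key=os.getenv("API_KEY"))

# Upload the media file and wait until Gemini finishes processing it.
media = genai.upload_file("sample.mp4")  # hypothetical local file
while media.state.name == "PROCESSING":
    time.sleep(1)
    media = genai.get_file(media.name)

# Prompt the model with the processed file.
model = genai.GenerativeModel(os.getenv("MODEL"))
result = model.generate_content([media, "Summarize this video."])
print(result.text)
```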

## Features

### 🎥 Video Analysis

- Upload video files (MP4, MOV, AVI, MKV)
- Automatic metadata generation with structured JSON schema
- Extract video title, summary, duration, and relevant tags
- Real-time processing status updates

### 📸 Object Detection

- Upload images (JPG, JPEG, PNG) for object detection
- Specify custom objects to detect or detect all objects
- Visual bounding box annotations with object labels
- Pixel-perfect coordinate extraction and display
- Support for normalized coordinate conversion

### 🔊 Audio Transcription

- Support for multiple audio formats (MP3, WAV, AIFF, AAC, OGG, FLAC)
- Speaker identification and dialogue formatting
- Accurate transcription with filler-word preservation
- Optimized for interview and conversation transcription

### 📂 File API Management

- List all uploaded files with display names and file names
- Individual file deletion by name
- Bulk delete all files functionality
- Real-time file status monitoring

## Tech Stack

### Core Technologies

- **Python 3.x** - Primary programming language
- **Streamlit** - Web application framework for the user interface
- **Google Generative AI (Gemini)** - Core AI model for multimodal processing

### AI & Machine Learning

- **google-generativeai** - Official Google Gemini AI SDK
- **Pillow (PIL)** - Image processing and manipulation
- **OpenCV** - Computer vision operations

### Utilities & Support

- **python-dotenv** - Environment variable management
- **Rich** - Enhanced terminal output formatting
- **IPython** - Interactive Python environment
- **streamlit-chat** - Chat interface components

### Standard Library

- **JSON** - Data serialization and parsing
- **Regex (re)** - Text processing and markdown removal

## Project Structure

```
GeminiMultiModalStreamlit/
├── app.py                          # Main Streamlit application
├── requirements.txt                # Python dependencies
├── README.md                       # Project documentation
├── .streamlit/
│   └── config.toml                 # Streamlit settings (2 GB upload limit)
├── utils/                          # Utility modules
│   ├── model.py                    # AI model loading and configuration
│   ├── util.py                     # Core utility functions
│   └── removemarkdownsyntax.py     # Markdown text processing
└── temp/                           # Temporary file storage (auto-created)
```

### Key Components

- **`app.py`**: Main application entry point with Streamlit UI and tab management
- **`utils/model.py`**: Handles Gemini AI model initialization, configuration, and caching
- **`utils/util.py`**: Core utilities for file upload, processing, metadata generation, and image processing
- **`utils/removemarkdownsyntax.py`**: Text processing utilities for cleaning AI responses

## Setup Instructions

### Prerequisites

- Python 3.7 or higher
- Google AI API key (from Google AI Studio)
- Virtual environment (recommended)

### 1. Clone and Navigate

```bash
git clone <repository-url>
cd GeminiMultiModalStreamlit
```

### 2. Create Virtual Environment

```bash
# Windows
python -m venv venv
venv\Scripts\activate

# macOS/Linux
python -m venv venv
source venv/bin/activate
```

### 3. Install Dependencies

```bash
pip install -r requirements.txt
```

### 4. Environment Configuration

Create a `.env` file in the project root directory:

```plaintext
API_KEY=your_google_ai_api_key_here
MODEL=gemini-1.5-flash-latest
CACHING_MODEL=gemini-1.5-flash-001
```

**Important**:

- Replace `your_google_ai_api_key_here` with your actual Google AI API key
- You can obtain an API key from [Google AI Studio](https://aistudio.google.com/)
- The `MODEL` can be any supported Gemini model version
- `CACHING_MODEL` is used for cached content operations

### 5. Run the Application

```bash
streamlit run app.py
```

The application will start and be accessible at `http://localhost:8501`.

## Usage Guide

### Getting Started

1. Launch the application using the command above
2. The interface will display four tabs: Video, Image, Audio, and File API
3. Select the appropriate tab based on your use case

### Video Analysis

1. Navigate to the **Video** tab
2. Upload a video file using the file uploader
3. Click **"Analyze Video"** to start processing
4. Wait for file upload and processing completion
5. View the generated metadata including title, summary, duration, and tags

### Object Detection

1. Go to the **Image** tab
2. Upload an image using the sidebar file uploader
3. Choose detection mode:
   - **Specific Object**: Enter the object name you want to detect
   - **All Objects**: Check "Detect All Objects" to find everything
4. Click **"Detect Objects"** to process the image
5. View the annotated image with bounding boxes and coordinate details

### Audio Transcription

1. Select the **Audio** tab
2. Upload an audio file (supports multiple formats)
3. Click **"Transcribe Audio"** to start processing
4. Wait for upload and transcription completion
5. View the formatted transcription with speaker identification

### File Management

1. Access the **File API** tab
2. **List Files**: Click to view all uploaded files
3. **Delete Single File**: Enter file name and click delete
4. **Delete All Files**: Check the checkbox and confirm to remove all files

## Configuration

### Environment Variables

The application requires the following environment variables in your `.env` file:

| Variable        | Description                  | Required | Example                   |
| --------------- | ---------------------------- | -------- | ------------------------- |
| `API_KEY`       | Google AI API key            | Yes      | `AIza...`                 |
| `MODEL`         | Gemini model version         | Yes      | `gemini-1.5-flash-latest` |
| `CACHING_MODEL` | Model for caching operations | No       | `gemini-1.5-flash-001`    |

### Model Configuration

The application automatically configures different model settings based on use case:

- **Structured Output** (Video Analysis): JSON schema response with specific temperature and token limits
- **General Purpose** (Image/Audio): Standard configuration with higher creativity settings
- **Caching**: Optimized for repeated operations with TTL management

### File Handling

- Temporary files are automatically created and cleaned up
- Supported video formats: MP4, MOV, AVI, MKV
- Supported image formats: JPG, JPEG, PNG
- Supported audio formats: MP3, WAV, AIFF, AAC, OGG, FLAC
- Files are uploaded to Google's servers and processed remotely

### Performance Optimization

- Model instances are cached using Streamlit's `@st.cache_resource`
- File processing includes polling mechanisms for completion status
- Bounding box coordinates are normalized and converted to pixels for accuracy (see the appendix below)
- Error handling and validation throughout the processing pipeline

---

**Note**: This application requires an active internet connection and valid Google AI API credentials to function properly. Make sure your API key has sufficient quota for the operations you plan to perform.
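
### Appendix: Bounding Box Coordinate Conversion

As noted under Performance Optimization, the model returns bounding boxes normalized to a 0-1000 range, and `utils/util.py` scales them to pixel coordinates before drawing. A standalone sketch of that conversion, mirroring `convert_normalized_to_pixel`:

```python
def to_pixels(box, image_width, image_height):
    """Scale a 0-1000 normalized box (as returned by the model) to pixel coordinates."""
    return {
        "name": box["name"],
        "xmin": int(box["xmin"] / 1000 * image_width),
        "ymin": int(box["ymin"] / 1000 * image_height),
        "xmax": int(box["xmax"] / 1000 * image_width),
        "ymax": int(box["ymax"] / 1000 * image_height),
    }

# Example: xmin=250 on a 1920x1080 image lands at pixel x=480.
print(to_pixels({"name": "dog", "xmin": 250, "ymin": 100, "xmax": 750, "ymax": 900}, 1920, 1080))
```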

--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
import pathlib
import google.generativeai as genai
from typing import Optional, Dict, Any, List
import json
import time
import streamlit as st
import re
from PIL import Image, ImageDraw
import os

def upload_file_to_gemini(file) -> Optional[Dict[str, Any]]:
    """Uploads a file to Google Gemini."""
    try:
        temp_dir = pathlib.Path("temp")
        temp_dir.mkdir(exist_ok=True)
        file_path = temp_dir / file.name
        with open(file_path, "wb") as f:
            f.write(file.getbuffer())
        uploaded_file = genai.upload_file(file_path)
        os.remove(file_path)  # Remove the local copy after upload
        return uploaded_file
    except Exception as e:
        st.error(f"Error uploading file: {e}")
        return None


def poll_file_processing(uploaded_file) -> Optional[Dict[str, Any]]:
    """Polls the status of the uploaded file until processing is complete."""
    try:
        with st.spinner('Processing file...'):
            while uploaded_file.state.name == "PROCESSING":
                time.sleep(1)
                uploaded_file = genai.get_file(uploaded_file.name)
            if uploaded_file.state.name == "ACTIVE":
                st.success("File processing completed.")
                return uploaded_file
            elif uploaded_file.state.name == "FAILED":
                st.error("File processing failed.")
                return None
            else:
                st.error(f"Unexpected file state: {uploaded_file.state.name}")
                return None
    except Exception as e:
        st.error(f"Error during file processing: {e}")
        return None


def generate_metadata(model: Any, video_file) -> Optional[Dict[str, Any]]:
    """Generates metadata for the uploaded video using the Generative AI model."""
    try:
        prompt = "Provide the details based on the provided response schema"
        result = model.generate_content([video_file, prompt])
        if result.text:
            metadata = json.loads(result.text)
            return metadata
        else:
            st.error("No response received from the model.")
            return None
    except json.JSONDecodeError as je:
        st.error(f"Error decoding JSON response: {je}")
        return None
    except Exception as e:
        st.error(f"Error generating metadata: {e}")
        return None


def generate_transcription(model: Any, audio_file) -> Optional[str]:
    """Generates transcription for the uploaded audio using the Generative AI model."""
    try:
        prompt = """
        Please transcribe this interview in the following format:
        [Speaker Name or Speaker A/B]: [Dialogue or caption].
        If a speaker's name is mentioned or can be identified in the audio, map the actual names accordingly.
        If no names are given, use Speaker A, Speaker B, etc.
        Ensure the transcription captures all spoken words accurately, including filler words where appropriate.
        """
        responses = model.generate_content([audio_file, prompt])
        if responses.text:
            transcription = responses.text.strip()
            return transcription
        else:
            st.error("No response received from the model.")
            return None
    except Exception as e:
        st.error(f"Error generating transcription: {e}")
        return None


def remove_markdown(text):
    """
    Remove Markdown formatting from the given text.

    Args:
        text (str): The input text containing Markdown.

    Returns:
        str: The text without any Markdown formatting.
    """
    # Remove headers (e.g., ###, ##, #), keeping the preceding whitespace
    text = re.sub(r'(^|\s)#+\s+', r'\1', text)

    # Remove emphasis (bold, italic, strikethrough)
    text = re.sub(r'(\*{1,2}|_{1,2}|~~)(.*?)\1', r'\2', text)

    # Remove code blocks with language specifiers (e.g., ```json)
    text = re.sub(r'```[a-zA-Z]*\n([\s\S]*?)\n```', r'\1', text)

    # Remove inline code
    text = re.sub(r'`{1,3}([^`]*)`{1,3}', r'\1', text)

    # Remove links [text](url)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove images ![alt text](url)
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)

    # Remove blockquotes
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

    # Remove horizontal rules (---, ***, ___)
    text = re.sub(r'(^|\n)(-{3,}|_{3,}|\*{3,})(\n|$)', r'\1', text)

    # Remove lists (unordered and ordered)
    text = re.sub(r'(^|\n)(\s*[-+*]|\d+\.)\s+', r'\1', text)

    # Remove any remaining Markdown-specific characters
    text = re.sub(r'[*_~`]', '', text)

    return text.strip()


def parse_bounding_boxes(response_text):
    """
    Parses the JSON response to extract bounding boxes along with their names.

    Args:
        response_text (str): The raw text response from the model.

    Returns:
        list of dict: A list containing bounding box dictionaries with object names.

    Raises:
        ValueError: If JSON parsing fails or the structure is incorrect.
    """
    try:
        bounding_boxes = json.loads(response_text)

        # Validate that the response is a list
        if not isinstance(bounding_boxes, list):
            raise ValueError("Response JSON is not a list.")

        # Define the required keys and their expected types
        required_keys = {
            "name": str,
            "ymin": (int, float),
            "xmin": (int, float),
            "ymax": (int, float),
            "xmax": (int, float)
        }

        # Validate each bounding box
        for box in bounding_boxes:
            # Check if all required keys are present
            missing_keys = [key for key in required_keys if key not in box]
            if missing_keys:
                raise ValueError(f"Bounding box missing keys: {missing_keys} in {box}")

            # Validate the type of each key
            for key, expected_type in required_keys.items():
                if not isinstance(box[key], expected_type):
                    raise ValueError(f"Bounding box key '{key}' has incorrect type in {box}. Expected {expected_type}, got {type(box[key])}.")

        return bounding_boxes
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON response: {e}")


def convert_normalized_to_pixel(bounding_boxes, image_width, image_height):
    """
    Converts normalized bounding box coordinates to pixel values.

    Args:
        bounding_boxes (list of dict): List of bounding boxes with normalized coordinates.
        image_width (int): Width of the original image in pixels.
        image_height (int): Height of the original image in pixels.

    Returns:
        list of dict: List of bounding boxes with pixel coordinates.
    """
    converted_boxes = []
    for box in bounding_boxes:
        name = box['name']
        # Coordinates are normalized to a 0-1000 range; scale them to pixels.
        xmin = (box['xmin'] / 1000) * image_width
        ymin = (box['ymin'] / 1000) * image_height
        xmax = (box['xmax'] / 1000) * image_width
        ymax = (box['ymax'] / 1000) * image_height

        # Ensure coordinates are integers
        xmin, ymin, xmax, ymax = map(int, [xmin, ymin, xmax, ymax])

        # Validate coordinates
        if not (0 <= xmin < xmax <= image_width) or not (0 <= ymin < ymax <= image_height):
            print(f"Invalid bounding box coordinates after conversion: {box}")
            continue  # Skip invalid boxes

        converted_boxes.append({
            'name': name,
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax
        })

    return converted_boxes


def draw_bounding_boxes(image, bounding_boxes, output_path=None):
    """
    Draws multiple bounding boxes on the image with labeled text.

    Args:
        image (PIL.Image.Image): The original image.
        bounding_boxes (list of dict): List of bounding boxes with pixel coordinates.
        output_path (str, optional): Path to save the annotated image. If None, returns the image object.

    Returns:
        PIL.Image.Image: Image with bounding boxes and labels drawn.
    """
    draw = ImageDraw.Draw(image)

    # You may need to adjust the font size based on your needs
    try:
        from PIL import ImageFont
        font = ImageFont.truetype("arial.ttf", 20)
    except Exception:
        font = ImageFont.load_default()

    for idx, box in enumerate(bounding_boxes, 1):
        xmin = box['xmin']
        ymin = box['ymin']
        xmax = box['xmax']
        ymax = box['ymax']
        name = box['name']

        # Draw the red bounding box
        draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=1)

        # Prepare the label text
        label_text = f"{name}"

        # Get text size
        text_bbox = draw.textbbox((0, 0), label_text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Calculate text position (above the bounding box)
        text_x = xmin
        text_y = max(0, ymin - text_height - 2)  # 2 pixels padding

        # Draw yellow background for text
        draw.rectangle(
            [
                text_x - 2,  # 2 pixels padding
                text_y - 2,
                text_x + text_width + 2,
                text_y + text_height + 2
            ],
            fill="yellow"
        )

        # Draw black text on yellow background
        draw.text((text_x, text_y), label_text, fill="black", font=font)

    if output_path:
        image.save(output_path)
        print(f"Annotated image saved at '{output_path}'.")
    return image

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# app.py
import streamlit as st
from utils.util import (
    upload_file_to_gemini,
    poll_file_processing,
    generate_metadata,
    generate_transcription,
    remove_markdown,
    parse_bounding_boxes,
    convert_normalized_to_pixel,
    draw_bounding_boxes
)
from utils.model import load_model
from PIL import Image
from typing import TypedDict, Optional, List
import google.generativeai as genai

def main():
    st.set_page_config(page_title="Gemini Multimodal", layout="wide")
    st.title("Gemini Multimodal Application")

    # Tab selection using radio
    tab = st.radio("Navigation", ["Video", "Image", "Audio", "File API"], horizontal=True, label_visibility="collapsed")

    if tab == "Video":
        video_tab()
    elif tab == "Image":
        image_tab()
    elif tab == "Audio":
        audio_tab()
    elif tab == "File API":
        file_api_tab()

def video_tab():

    # Define the structure for Video Analysis metadata
    class VideoAnalysis(TypedDict):
        name: str
        title: str
        total_duration: float  # Duration in seconds
        summary: str
        small_summary: str
        tags: Optional[List[str]]

    def display_metadata(metadata: VideoAnalysis):
        """Displays the generated metadata in a user-friendly format."""
        st.header("Generated Metadata")
        st.subheader(f"Title: {metadata.get('title', 'N/A')}")
        st.write(f"**Name:** {metadata.get('name', 'N/A')}")
        st.write(f"**Total Duration:** {metadata.get('total_duration', 'N/A')} seconds")
        st.write(f"**Summary:** {metadata.get('summary', 'N/A')}")
        st.write(f"**Small Summary:** {metadata.get('small_summary', 'N/A')}")
        st.write(f"**Tags:** {', '.join(metadata.get('tags', [])) if metadata.get('tags') else 'N/A'}")

    st.header("📹 Video Metadata and Summary Generation")
    st.write("Upload a video to analyze its content and automatically generate metadata and a summary.")

    model = load_model(type="video", schemaType=VideoAnalysis)

    uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "mov", "avi", "mkv"])

    if uploaded_file is not None:
        col1, col2, col3 = st.columns([2, 6, 2])  # 20%, 60%, 20% width
        with col2:
            st.video(uploaded_file)
        uploaded_file.seek(0)
        if st.button("Analyze Video"):
            with st.spinner('Uploading video...'):
                uploaded_genai_file = upload_file_to_gemini(uploaded_file)
            if uploaded_genai_file is None:
                st.error("Failed to upload the video.")
                return
            st.success("File upload successful!")

            processed_file = poll_file_processing(uploaded_genai_file)
            if processed_file is None:
                st.error("Video processing failed.")
                return

            with st.spinner('Generating metadata...'):
                metadata = generate_metadata(model, processed_file)
                if metadata:
                    st.success("Metadata generation successful!")
                    display_metadata(metadata)
    else:
        st.info("Please upload a video file to begin analysis.")


def image_tab():
    @st.cache_resource
    def get_model():
        model = load_model(type=None, schemaType=None)
        return model

    def process_image(image: Image.Image, object_name: str, model):
        # Define the dynamic prompt with the user-specified object
        prompt = f"""
        You are given an image. Identify all {object_name} in the image and provide their bounding boxes.
        Return ONLY a valid JSON array in the exact format shown below.
        Return a specific name; for example, if it's a dog and you can identify the breed, return the breed name.
        Do NOT include any additional text, explanations, comments, trailing commas, or markdown formatting such as code blocks.
        Use this JSON schema:
        [
          {{
            "name": "string",
            "ymin": float,
            "xmin": float,
            "ymax": float,
            "xmax": float
          }}
        ]
        """
        try:
            response = model.generate_content([image, prompt])
        except Exception as e:
            st.error(f"Error generating content from the model: {e}")
            return None

        final_response = remove_markdown(response.text)

        try:
            bounding_boxes = parse_bounding_boxes(final_response)
        except ValueError as ve:
            st.error(f"Error parsing bounding boxes: {ve}")
            return None

        image_width, image_height = image.size
        converted_boxes = convert_normalized_to_pixel(bounding_boxes, image_width, image_height)
        return converted_boxes

    st.header("📸 Object Detection")
    st.write("""
    Upload an image, then specify the object you want to detect.
    The application will draw bounding boxes around the detected objects and display their coordinates.
    """)

    # Sidebar for user inputs
    st.sidebar.header("🔍 Detection Settings")

    # Radio buttons to select input method (camera capture is currently disabled)
    input_method = st.sidebar.radio(
        "Select Image Input Method",
        # ("Upload Image", "Use Camera")
        ("Upload Image",)  # trailing comma keeps this a tuple, not a plain string
    )

    # Initialize uploaded_image as None
    uploaded_image = None

    if input_method == "Upload Image":
        uploaded_file = st.sidebar.file_uploader("📂 Choose an image...", type=["jpg", "jpeg", "png"])
        if uploaded_file is not None:
            try:
                uploaded_image = Image.open(uploaded_file).convert("RGB")
                st.image(uploaded_image, caption='🖼️ Uploaded Image', use_container_width=True)
            except Exception as e:
                st.error(f"❌ Error opening image: {e}")
    # elif input_method == "Use Camera":
    #     captured_image = st.sidebar.camera_input("📸 Capture an image")
    #     if captured_image is not None:
    #         try:
    #             uploaded_image = Image.open(captured_image).convert("RGB")
    #             st.image(uploaded_image, caption='🖼️ Captured Image', use_container_width=True)
    #         except Exception as e:
    #             st.error(f"❌ Error capturing image: {e}")

    # Add detect-all checkbox
    detect_all = st.sidebar.checkbox("Detect All Objects")
    # Show object input only if detect all is not checked
    if not detect_all:
        object_name = st.sidebar.text_input("📝 Enter the object to detect", placeholder="e.g., cat, bottle")
    else:
        object_name = "all"  # Set object_name to "all" when detect all is checked

    detect_button = st.sidebar.button("🚀 Detect Objects")

    if detect_button:
        if uploaded_image is not None:
            if not detect_all and not object_name.strip():
                st.error("⚠️ Please enter a valid object name to detect.")
                st.stop()

            with st.spinner("🔄 Loading the model..."):
                model = get_model()

            with st.spinner("🔍 Detecting objects..."):
                converted_boxes = process_image(uploaded_image, object_name, model)

            if converted_boxes is None:
                st.error("❌ An error occurred during object detection.")
                st.stop()

            if converted_boxes:
                annotated_image = draw_bounding_boxes(uploaded_image.copy(), converted_boxes, output_path=None)
                st.image(annotated_image, caption='🖼️ Annotated Image', use_container_width=True)

                st.subheader("📍 Bounding Box Coordinates")
                for idx, box in enumerate(converted_boxes, start=1):
                    st.markdown(f"**{idx}. {box['name'].capitalize()}:**")
                    st.markdown(f"- ymin: {box['ymin']}")
                    st.markdown(f"- xmin: {box['xmin']}")
                    st.markdown(f"- ymax: {box['ymax']}")
                    st.markdown(f"- xmax: {box['xmax']}")
                    st.markdown("---")
            else:
                if detect_all:
                    st.warning("⚠️ No objects were detected in the image.")
                else:
                    st.warning(f"⚠️ No instances of '{object_name}' were found in the image.")
        else:
            st.error("⚠️ Please upload an image before running detection.")
            st.stop()


def audio_tab():

    st.header("🔊 Audio Transcription")
    st.write("Upload an audio file to transcribe its content.")

    model = load_model(type=None, schemaType=None)

    uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "aiff", "aac", "ogg", "flac"])

    if uploaded_audio is not None:
        st.audio(uploaded_audio, format=uploaded_audio.type)  # Use the file's own MIME type rather than assuming MP3
        if st.button("Transcribe Audio"):
            with st.spinner('Uploading audio...'):
                uploaded_genai_file = upload_file_to_gemini(uploaded_audio)
            if uploaded_genai_file is None:
                st.error("Failed to upload the audio.")
                return
            st.success("File upload successful!")

            processed_file = poll_file_processing(uploaded_genai_file)
            if processed_file is None:
                st.error("Audio processing failed.")
                return

            with st.spinner('Transcribing audio...'):
                transcription = generate_transcription(model, processed_file)
                if transcription:
                    st.success("Transcription successful!")
                    st.text_area("Transcription", transcription, height=300)
    else:
        st.info("Please upload an audio file to begin transcription.")


def file_api_tab():

    st.header("📂 File API Operations")
    st.write("List and manage files uploaded to the API.")

    # List files
    if st.button("List Files"):
        st.subheader("Uploaded Files:")
        try:
            files = genai.list_files()
            file_list = list(files)  # Convert generator to list

            # Check if the list has files
            if len(file_list) > 0:
                cols = st.columns(2)  # Create two columns
                with cols[0]:
                    st.markdown("**__Display Name__**")
                with cols[1]:
                    st.write("**__File Name__**")

                for f in file_list:
                    with cols[0]:
                        st.write(f"📄 {f.display_name}")  # Display Name
                    with cols[1]:
                        st.write(f"📄 {f.name}")  # File Name
            else:
                st.markdown("No files found.")  # Inform the user when there are no files
        except Exception as e:
            st.error(f"Error listing files: {e}")

    # Delete file
    st.subheader("Delete a Single File")
    file_name_to_delete = st.text_input("Enter the name of the file to delete")
    if st.button("Delete File"):
        if file_name_to_delete.strip():
            try:
                myfile = genai.get_file(file_name_to_delete)
                myfile.delete()
                st.success(f"File '{file_name_to_delete}' has been deleted.")
            except Exception as e:
                st.error(f"Error deleting file: {e}")
        else:
            st.error("Please enter a valid file name.")

    # Option to delete all files
    st.subheader("Delete All Files")
    delete_all = st.checkbox("Delete all files")
    if delete_all:
        if st.button("Confirm Delete All"):
            with st.spinner("Deleting all files..."):
                try:
                    files = genai.list_files()
                    for f in files:
                        myfile = genai.get_file(f.name)
                        myfile.delete()
                    st.success("All files have been deleted.")
                except Exception as e:
                    st.error(f"Error deleting all files: {e}")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------