├── app ├── favicon.ico ├── globals.css ├── layout.tsx └── page.tsx ├── postcss.config.mjs ├── public ├── groq-labs-logo.png ├── vercel.svg ├── window.svg ├── file.svg ├── globe.svg └── next.svg ├── next.config.ts ├── backend ├── requirements.txt ├── models │ └── requests.py ├── utils │ ├── config.py │ └── file_utils.py ├── services │ ├── video_processing_service.py │ ├── translation_service.py │ ├── transcription_service.py │ └── subtitle_service.py └── main.py ├── package.json ├── tsconfig.json ├── .gitignore ├── LICENSE ├── start.sh ├── setup.sh └── README.md /app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/build-with-groq/groq-subtitle-generator/HEAD/app/favicon.ico -------------------------------------------------------------------------------- /postcss.config.mjs: -------------------------------------------------------------------------------- 1 | const config = { 2 | plugins: ["@tailwindcss/postcss"], 3 | }; 4 | 5 | export default config; 6 | -------------------------------------------------------------------------------- /public/groq-labs-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/build-with-groq/groq-subtitle-generator/HEAD/public/groq-labs-logo.png -------------------------------------------------------------------------------- /public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /next.config.ts: -------------------------------------------------------------------------------- 1 | import type { NextConfig } from "next"; 2 | 3 | const nextConfig: NextConfig = { 4 | /* config options here */ 5 | }; 6 | 7 | export default nextConfig; 8 | -------------------------------------------------------------------------------- /backend/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.104.1 2 | uvicorn[standard]==0.24.0 3 | python-multipart==0.0.6 4 | groq==0.29.0 5 | ffmpeg-python==0.2.0 6 | python-dotenv==1.0.0 7 | pydantic==2.5.0 8 | pydantic-settings==2.1.0 9 | pysrt==1.1.2 10 | webvtt-py==0.4.6 -------------------------------------------------------------------------------- /public/window.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/file.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/globals.css: -------------------------------------------------------------------------------- 1 | @import "tailwindcss"; 2 | 3 | :root { 4 | --background: #ffffff; 5 | --foreground: #171717; 6 | } 7 | 8 | @theme inline { 9 | --color-background: var(--background); 10 | --color-foreground: var(--foreground); 11 | --font-sans: var(--font-montserrat); 12 | --font-mono: var(--font-montserrat); 13 | } 14 | 15 | @media (prefers-color-scheme: dark) { 16 | :root { 17 | --background: #0a0a0a; 18 | --foreground: #ededed; 19 | } 20 | } 21 | 22 | body { 23 | background: var(--background); 24 | color: var(--foreground); 25 | font-family: Montserrat; 26 | } 27 | -------------------------------------------------------------------------------- /package.json: 
--------------------------------------------------------------------------------
1 | {
2 |   "name": "movie-subtitles",
3 |   "version": "0.1.0",
4 |   "private": true,
5 |   "scripts": {
6 |     "dev": "next dev --turbopack",
7 |     "build": "next build",
8 |     "start": "next start",
9 |     "lint": "next lint"
10 |   },
11 |   "dependencies": {
12 |     "lucide-react": "^0.534.0",
13 |     "next": "15.3.5",
14 |     "react": "^19.0.0",
15 |     "react-dom": "^19.0.0"
16 |   },
17 |   "devDependencies": {
18 |     "@tailwindcss/postcss": "^4",
19 |     "@types/node": "^20",
20 |     "@types/react": "^19",
21 |     "@types/react-dom": "^19",
22 |     "tailwindcss": "^4",
23 |     "typescript": "^5"
24 |   }
25 | }
26 | 
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "target": "ES2017",
4 |     "lib": ["dom", "dom.iterable", "esnext"],
5 |     "allowJs": true,
6 |     "skipLibCheck": true,
7 |     "strict": true,
8 |     "noEmit": true,
9 |     "esModuleInterop": true,
10 |     "module": "esnext",
11 |     "moduleResolution": "bundler",
12 |     "resolveJsonModule": true,
13 |     "isolatedModules": true,
14 |     "jsx": "preserve",
15 |     "incremental": true,
16 |     "plugins": [
17 |       {
18 |         "name": "next"
19 |       }
20 |     ],
21 |     "paths": {
22 |       "@/*": ["./*"]
23 |     }
24 |   },
25 |   "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
26 |   "exclude": ["node_modules"]
27 | }
28 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 | 
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.*
7 | .yarn/*
8 | !.yarn/patches
9 | !.yarn/plugins
10 | !.yarn/releases
11 | !.yarn/versions
12 | 
13 | # testing
14 | /coverage
15 | 
16 | # next.js
17 | /.next/
18 | /out/
19 | 
20 | # production
21 | /build
22 | 
23 | # misc
24 | .DS_Store
25 | *.pem
26 | 
27 | # debug
28 | npm-debug.log*
29 | yarn-debug.log*
30 | yarn-error.log*
31 | .pnpm-debug.log*
32 | 
33 | # env files (can opt-in for committing if needed)
34 | .env*
35 | 
36 | # vercel
37 | .vercel
38 | 
39 | # typescript
40 | *.tsbuildinfo
41 | next-env.d.ts
42 | 
43 | __pycache__/
44 | *.pyc
45 | *.pyo
46 | venv/
47 | backend/temp/
48 | 
--------------------------------------------------------------------------------
/app/layout.tsx:
--------------------------------------------------------------------------------
1 | import type { Metadata } from "next";
2 | import { Montserrat } from "next/font/google";
3 | import "./globals.css";
4 | 
5 | 
6 | const montserrat = Montserrat({
7 |   variable: "--font-montserrat",
8 |   subsets: ["latin"],
9 |   weight: ["300", "400", "500", "600", "700"],
10 | });
11 | 
12 | export const metadata: Metadata = {
13 |   title: "Auto Multilingual Subtitle Generator",
14 |   description: "Lightning-fast AI-powered multilingual subtitles.",
15 | };
16 | 
17 | export default function RootLayout({
18 |   children,
19 | }: Readonly<{
20 |   children: React.ReactNode;
21 | }>) {
22 |   return (
23 |     <html lang="en">
24 |       <body
25 |         className={`${montserrat.variable} antialiased`}
26 |       >
27 |         {children}
28 |       </body>
29 |     </html>
30 |   );
31 | }
--------------------------------------------------------------------------------
/public/globe.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Build With Groq
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/public/next.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/backend/models/requests.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from typing import Optional, List, Dict
3 | from enum import Enum
4 | 
5 | class ProcessingStatus(str, Enum):
6 |     UPLOADED = "uploaded"
7 |     PROCESSING = "processing"
8 |     EXTRACTING_AUDIO = "extracting_audio"
9 |     TRANSCRIBING = "transcribing"
10 |     TRANSLATING = "translating"
11 |     GENERATING_SUBTITLES = "generating_subtitles"
12 |     RENDERING_VIDEO = "rendering_video"
13 |     COMPLETED = "completed"
14 |     FAILED = "failed"
15 | 
16 | class ProcessVideoRequest(BaseModel):
17 |     target_language: str
18 |     source_language: Optional[str] = None
19 |     subtitle_format: Optional[str] = "srt"
20 | 
21 | class VideoProcessingStatus(BaseModel):
22 |     job_id: str
23 |     status: str
24 |     progress: int = 0
25 |     message: str = ""
26 |     file_path: Optional[str] = None
27 |     output_path: Optional[str] = None
28 |     subtitle_path: Optional[str] = None
29 |     target_language: Optional[str] = None
30 |     source_language: Optional[str] = None
31 |     error: Optional[str] = None
32 | 
33 |     class Config:
34 |         use_enum_values = True
35 | 
36 | class TranscriptionSegment(BaseModel):
37 |     start: float
38 |     end: float
39 |     text: str
40 |     confidence: Optional[float] = None
41 | 
42 | class TranscriptionResult(BaseModel):
43 |     text: str
44 |     segments: List[TranscriptionSegment]
45 |     detected_language: Optional[str] = None
46 |     confidence: Optional[float] = None
47 | 
48 | class TranslationRequest(BaseModel):
49 |     text: str
50 |     source_language: str
51 |     target_language: str
52 |     context: Optional[str] = None
53 | 
54 | class TranslationResult(BaseModel):
55 |     translated_text: str
56 |     source_language: str
57 |     target_language: str
58 |     confidence: Optional[float] = None
59 | 
60 | class SubtitleEntry(BaseModel):
61 |     start_time: str
62 |     end_time: str
63 |     text: str
64 |     index: int
65 | 
66 | class FileInfo(BaseModel):
67 |     filename: str
68 |     size: int
69 |     duration: Optional[float] = None
70 |     format: Optional[str] = None
71 |     resolution: Optional[str] = None
72 |     fps: Optional[float] = None
73 | 
74 | class ErrorResponse(BaseModel):
75 |     error: str
76 |     message: str
77 |     details: Optional[Dict] = None
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Video Subtitle Generator - Startup Script
4 | 
5 | echo "Starting Video Subtitle Generator..."
6 | 
7 | # Check if Python is installed
8 | if ! command -v python3 &> /dev/null; then
9 |     echo "Python 3 is not installed. Please install Python 3.8+"
10 |     exit 1
11 | fi
12 | 
13 | # Check if Node.js is installed
14 | if ! command -v node &> /dev/null; then
15 |     echo "Node.js is not installed. Please install Node.js 18+"
16 |     exit 1
17 | fi
18 | 
19 | # Check if FFmpeg is installed
20 | if ! command -v ffmpeg &> /dev/null; then
21 |     echo "FFmpeg is not installed. Please install FFmpeg first."
22 |     echo "  macOS: brew install ffmpeg"
23 |     echo "  Ubuntu/Debian: sudo apt install ffmpeg"
24 |     echo "  Windows: Download from https://ffmpeg.org/download.html"
25 |     exit 1
26 | fi
27 | 
28 | # Check if backend environment exists
29 | if [ ! -d "backend/venv" ]; then
30 |     echo "Creating Python virtual environment..."
31 |     cd backend
32 |     python3 -m venv venv
33 |     cd ..
34 | fi
35 | 
36 | # Check if backend dependencies are installed (compgen expands the python* glob)
37 | if ! compgen -G "backend/venv/lib/python*/site-packages/fastapi" > /dev/null; then
38 |     echo "Installing Python dependencies..."
39 |     cd backend
40 |     source venv/bin/activate
41 |     pip install -r requirements.txt
42 |     cd ..
43 | fi
44 | 
45 | # Check if frontend dependencies are installed
46 | if [ ! -d "node_modules" ]; then
47 |     echo "Installing Node.js dependencies..."
48 |     npm install
49 | fi
50 | 
51 | # Check if .env file exists
52 | if [ ! -f "backend/.env" ]; then
53 |     echo ".env file not found. Creating basic configuration..."
54 |     cat > backend/.env << 'EOF'
55 | # Groq API Configuration
56 | GROQ_API_KEY=your_groq_api_key_here
57 | 
58 | # Model Configuration
59 | GROQ_MODEL=qwen/qwen3-32b
60 | GROQ_WHISPER_MODEL=whisper-large-v3
61 | EOF
62 |     echo "Please edit backend/.env and add your Groq API key before continuing."
63 |     echo "GROQ_API_KEY=your_groq_api_key_here"
64 |     echo ""
65 |     echo "You can get a free API key at: https://groq.com"
66 |     echo ""
67 |     read -p "Press Enter after setting up your API key..."
68 | fi
69 | 
70 | # Function to cleanup background processes
71 | cleanup() {
72 |     echo ""
73 |     echo "Shutting down servers..."
74 |     kill $BACKEND_PID 2>/dev/null
75 |     kill $FRONTEND_PID 2>/dev/null
76 |     exit 0
77 | }
78 | 
79 | # Set up signal handlers
80 | trap cleanup SIGINT SIGTERM
81 | 
82 | echo "🔧 Starting backend server..."
83 | cd backend
84 | source venv/bin/activate
85 | python main.py &
86 | BACKEND_PID=$!
87 | cd ..
88 | 
89 | # Wait for backend to start
90 | echo "⏳ Waiting for backend to start..."
91 | sleep 3
92 | 
93 | # Check if backend is running
94 | if ! curl -s http://localhost:8000 > /dev/null; then
95 |     echo "Backend failed to start. Check the logs above."
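    # Tip: running the backend in the foreground prints the full traceback:
    #   cd backend && source venv/bin/activate && python main.py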
96 | kill $BACKEND_PID 2>/dev/null 97 | exit 1 98 | fi 99 | 100 | echo "Backend server started at http://localhost:8000" 101 | 102 | echo "Starting frontend server..." 103 | npm run dev & 104 | FRONTEND_PID=$! 105 | 106 | # Wait for frontend to start 107 | echo "Waiting for frontend to start..." 108 | sleep 5 109 | 110 | echo "Frontend server started at http://localhost:3000" 111 | echo "" 112 | echo " Video Subtitle Generator is ready!" 113 | echo " Frontend: http://localhost:3000" 114 | echo " Backend API: http://localhost:8000" 115 | echo " API Docs: http://localhost:8000/docs" 116 | echo "" 117 | echo "Press Ctrl+C to stop both servers" 118 | 119 | # Wait for both processes 120 | wait $BACKEND_PID $FRONTEND_PID -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Video Subtitle Generator - Setup Script 4 | 5 | echo "🛠️ Setting up Video Subtitle Generator..." 6 | echo "" 7 | 8 | # Check if Python is installed 9 | if ! command -v python3 &> /dev/null; then 10 | echo " Python 3 is not installed. Please install Python 3.8+" 11 | echo " macOS: brew install python" 12 | echo " Ubuntu/Debian: sudo apt install python3 python3-pip python3-venv" 13 | echo " Windows: Download from https://python.org" 14 | exit 1 15 | fi 16 | 17 | # Check if uv is installed, install if not 18 | if ! command -v uv &> /dev/null; then 19 | echo "📦 uv package manager not found. Installing uv..." 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | 22 | # Source the environment to make uv available 23 | export PATH="$HOME/.cargo/bin:$PATH" 24 | 25 | # Check if uv is now available 26 | if ! command -v uv &> /dev/null; then 27 | echo "❌ Failed to install uv. Please install manually:" 28 | echo " curl -LsSf https://astral.sh/uv/install.sh | sh" 29 | echo " Then restart your terminal and run this script again." 30 | exit 1 31 | fi 32 | echo "✅ uv installed successfully" 33 | else 34 | echo "✅ uv package manager found" 35 | fi 36 | 37 | 38 | # Check if Node.js is installed 39 | if ! command -v node &> /dev/null; then 40 | echo " Node.js is not installed. Please install Node.js 18+" 41 | echo " macOS: brew install node" 42 | echo " Ubuntu/Debian: curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - && sudo apt-get install -y nodejs" 43 | echo " Windows: Download from https://nodejs.org" 44 | exit 1 45 | fi 46 | 47 | # Check if FFmpeg is installed 48 | if ! command -v ffmpeg &> /dev/null; then 49 | echo " FFmpeg is not installed. Please install FFmpeg first." 50 | echo " macOS: brew install ffmpeg" 51 | echo " Ubuntu/Debian: sudo apt install ffmpeg" 52 | echo " Windows: Download from https://ffmpeg.org/download.html" 53 | exit 1 54 | fi 55 | 56 | echo "All system dependencies are installed!" 57 | echo "" 58 | 59 | # Create backend virtual environment 60 | echo "Setting up Python virtual environment..." 61 | cd backend 62 | if [ ! -d "venv" ]; then 63 | python3 -m venv venv 64 | echo "Virtual environment created" 65 | else 66 | echo "Virtual environment already exists" 67 | fi 68 | 69 | # Activate virtual environment and install dependencies 70 | echo "Installing Python dependencies..." 71 | source venv/bin/activate 72 | uv pip install -r requirements.txt 73 | echo "Python dependencies installed" 74 | 75 | cd .. 76 | 77 | # Install Node.js dependencies 78 | echo "Installing Node.js dependencies..." 
79 | npm install 80 | echo "Node.js dependencies installed" 81 | 82 | # Create .env file if it doesn't exist 83 | if [ ! -f "backend/.env" ]; then 84 | echo "Creating environment configuration..." 85 | cat > backend/.env << 'EOF' 86 | # Groq API Configuration 87 | GROQ_API_KEY=your_groq_api_key_here 88 | 89 | # Model Configuration 90 | GROQ_MODEL=qwen/qwen3-32b 91 | GROQ_WHISPER_MODEL=whisper-large-v3 92 | EOF 93 | echo "Environment file created" 94 | else 95 | echo "Environment file already exists" 96 | fi 97 | 98 | echo "" 99 | echo "Setup complete!" 100 | echo "" 101 | echo "Next steps:" 102 | echo "1. Edit backend/.env and add your Groq API key:" 103 | echo " GROQ_API_KEY=your_groq_api_key_here" 104 | echo "" 105 | echo "2. Get a free API key at: https://groq.com" 106 | echo "" 107 | echo "3. Run the application:" 108 | echo " ./start.sh" 109 | echo "" 110 | echo "For more information, see README.md" -------------------------------------------------------------------------------- /backend/utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | from pydantic_settings import BaseSettings 4 | from dotenv import load_dotenv 5 | 6 | # load environment variables from .env file 7 | load_dotenv() 8 | 9 | class Settings(BaseSettings): 10 | groq_api_key: str = os.getenv("GROQ_API_KEY") 11 | groq_model: str = os.getenv("GROQ_MODEL", "qwen/qwen3-32b") 12 | groq_whisper_model: str = os.getenv("GROQ_WHISPER_MODEL", "whisper-large-v3") 13 | 14 | max_file_size: int = int(os.getenv("MAX_FILE_SIZE", str(100 * 1024 * 1024))) # 100MB default 15 | supported_video_formats: list = [ 16 | "mp4", "mov", "avi" 17 | ] 18 | 19 | temp_dir: str = os.getenv("TEMP_DIR", "temp") 20 | max_concurrent_jobs: int = int(os.getenv("MAX_CONCURRENT_JOBS", "5")) 21 | 22 | 23 | # Supported languages 24 | supported_languages: dict = { 25 | "en": "English", 26 | "es": "Spanish", 27 | "fr": "French", 28 | "de": "German", 29 | "it": "Italian", 30 | "pt": "Portuguese", 31 | "ru": "Russian", 32 | "ja": "Japanese", 33 | "ko": "Korean", 34 | "zh": "Chinese", 35 | "ar": "Arabic", 36 | "hi": "Hindi", 37 | "th": "Thai", 38 | "vi": "Vietnamese", 39 | "nl": "Dutch", 40 | "sv": "Swedish", 41 | "no": "Norwegian", 42 | "da": "Danish", 43 | "fi": "Finnish", 44 | "pl": "Polish", 45 | "tr": "Turkish", 46 | "cs": "Czech", 47 | "hu": "Hungarian", 48 | "ro": "Romanian", 49 | "bg": "Bulgarian", 50 | "hr": "Croatian", 51 | "sk": "Slovak", 52 | "sl": "Slovenian", 53 | "et": "Estonian", 54 | "lv": "Latvian", 55 | "lt": "Lithuanian", 56 | "mt": "Maltese", 57 | "ga": "Irish", 58 | "cy": "Welsh", 59 | "eu": "Basque", 60 | "ca": "Catalan", 61 | "gl": "Galician", 62 | "is": "Icelandic", 63 | "mk": "Macedonian", 64 | "sq": "Albanian", 65 | "be": "Belarusian", 66 | "uk": "Ukrainian", 67 | "he": "Hebrew", 68 | "fa": "Persian", 69 | "ur": "Urdu", 70 | "bn": "Bengali", 71 | "ta": "Tamil", 72 | "te": "Telugu", 73 | "ml": "Malayalam", 74 | "kn": "Kannada", 75 | "gu": "Gujarati", 76 | "mr": "Marathi", 77 | "ne": "Nepali", 78 | "si": "Sinhala", 79 | "my": "Burmese", 80 | "km": "Khmer", 81 | "lo": "Lao", 82 | "ka": "Georgian", 83 | "am": "Amharic", 84 | "sw": "Swahili", 85 | "zu": "Zulu", 86 | "af": "Afrikaans", 87 | "ms": "Malay", 88 | "tl": "Filipino", 89 | "id": "Indonesian" 90 | } 91 | 92 | # FFmpeg settings 93 | ffmpeg_path: str = os.getenv("FFMPEG_PATH", "ffmpeg") 94 | subtitle_font: str = os.getenv("SUBTITLE_FONT", "Arial") 95 | subtitle_font_size: int = 
int(os.getenv("SUBTITLE_FONT_SIZE", "24")) 96 | subtitle_color: str = os.getenv("SUBTITLE_COLOR", "white") 97 | subtitle_outline_color: str = os.getenv("SUBTITLE_OUTLINE_COLOR", "black") 98 | subtitle_outline_width: int = int(os.getenv("SUBTITLE_OUTLINE_WIDTH", "2")) 99 | 100 | class Config: 101 | env_file = ".env" 102 | case_sensitive = False 103 | 104 | # Global settings instance 105 | _settings = None 106 | 107 | def get_settings() -> Settings: 108 | global _settings 109 | if _settings is None: 110 | _settings = Settings() 111 | return _settings 112 | 113 | def validate_groq_key(): 114 | """Validate that Groq API key is provided""" 115 | settings = get_settings() 116 | if not settings.groq_api_key: 117 | raise ValueError("GROQ_API_KEY environment variable is required") 118 | return True 119 | 120 | def get_language_code(language_name: str) -> str: 121 | """Get language code from language name""" 122 | settings = get_settings() 123 | for code, name in settings.supported_languages.items(): 124 | if name.lower() == language_name.lower(): 125 | return code 126 | return language_name.lower() 127 | 128 | def get_language_name(language_code: str) -> str: 129 | """Get language name from language code""" 130 | settings = get_settings() 131 | return settings.supported_languages.get(language_code.lower(), language_code) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Groq Subtitle Generator (Project SubLingo) 2 | The Groq Subtitle Generator (Project SubLingo) is a demo that showcases Groq in action through high-speed transcription and translation, allowing users to generate burned-in subtitles across languages in just seconds. Users can upload a video in any of the 50+ supported languages, choose the same or a different language for subtitles, review the transcription, and watch the magic unfold. 3 | 4 | Once a video is uploaded, FFmpeg converts it to a WAV audio file, which is then passed to the Video Processing Service. This audio is transcribed using OpenAI’s Whisper Large V3-turbo model, powered by Groq. Users can edit individual segments to ensure accuracy. If translation is selected, the content is then translated using the Qwen3-32B model through the Groq API. The final subtitles are formatted as an SRT file and rendered onto the video using FFmpeg to produce the final output. Here is a [sample video](https://github.com/user-attachments/assets/1d81f956-c0e7-4995-83ac-856aec1b8b58) file to test the demo out! 
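
In code, that pipeline boils down to four calls. Here is a minimal sketch — assuming `GROQ_API_KEY` is set, an `input.mp4` is on hand, and a `subs.srt` has already been written from the reviewed segments; the production implementation lives in `backend/services/`:

```python
import os
import subprocess
from groq import Groq

client = Groq(api_key=os.environ["GROQ_API_KEY"])

# 1. extract 16 kHz mono WAV audio from the video (same flags as the backend)
subprocess.run(["ffmpeg", "-i", "input.mp4", "-vn", "-acodec", "pcm_s16le",
                "-ar", "16000", "-ac", "1", "-y", "audio.wav"], check=True)

# 2. transcribe with Whisper on Groq; verbose_json includes segment timestamps
with open("audio.wav", "rb") as audio:
    transcription = client.audio.transcriptions.create(
        file=("audio.wav", audio.read()),
        model="whisper-large-v3",
        response_format="verbose_json",
    )

# 3. translate the transcript with Qwen3-32B on Groq ("Spanish" is an example target)
completion = client.chat.completions.create(
    model="qwen/qwen3-32b",
    messages=[{"role": "user",
               "content": f"Translate these subtitles to Spanish:\n{transcription.text}"}],
    temperature=0.3,
)
print(completion.choices[0].message.content)

# 4. burn the formatted SRT into the video (subs.srt comes from the subtitle service)
subprocess.run(["ffmpeg", "-i", "input.mp4", "-vf", "subtitles=subs.srt",
                "-c:v", "libx264", "-c:a", "copy", "-y", "output.mp4"], check=True)
```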
5 | 6 | https://github.com/user-attachments/assets/08ebc0af-4a5e-40a3-affa-8652bedfd6f4 7 | 8 | ## Features 9 | - **Multi-language Support**: 50+ languages with automatic detection 10 | - **Ultra-Fast Processing**: Whisper API powered by Groq for lightning-fast transcription 11 | - **Advanced Translation**: Qwen3-32b by Groq for accurate multilingual translation 12 | - **Video Processing**: Supports MP4, MOV, AVI 13 | - **Burned-in Subtitles**: Create videos with permanently embedded subtitles 14 | - **Edit Before Translation**: Review and edit transcription for perfect accuracy 15 | 16 | ## 🛠️ Tech Stack 17 | 18 | ### Backend 19 | - **FastAPI** 20 | - **Groq API** 21 | - **FFmpeg** 22 | - **Pydantic** 23 | 24 | ### Frontend 25 | - **Next.js 15** 26 | - **TypeScript** 27 | - **Tailwind CSS** 28 | - **Lucide React** 29 | 30 | ## 🚀 Quick Start 31 | 32 | ### Prerequisites 33 | - Python 3.8+ 34 | - Node.js 18+ 35 | - FFmpeg installed 36 | - Groq API key (get free access at [groq.com](https://groq.com)) 37 | 38 | ### Install FFmpeg 39 | 40 | **macOS**: 41 | ```bash 42 | brew install ffmpeg 43 | ``` 44 | 45 | **Ubuntu/Debian**: 46 | ```bash 47 | sudo apt update 48 | sudo apt install ffmpeg 49 | ``` 50 | 51 | **Windows**: 52 | Download from [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html) 53 | 54 | ## 🎯 Setup & Run 55 | 56 | Choose your preferred setup method: 57 | 58 | ### Option 1: Automated Setup (Recommended) 59 | 60 | **Easy one-command setup:** 61 | 62 | 1. **Clone and setup everything**: 63 | ```bash 64 | git clone https://github.com/build-with-groq/groq-subtitle-generator 65 | cd groq-subtitle-generator 66 | chmod +x setup.sh start.sh 67 | ./setup.sh 68 | ``` 69 | 70 | 2. **Add your Groq API key**: 71 | ```bash 72 | # Edit backend/.env and add your API key 73 | GROQ_API_KEY=your_groq_api_key_here 74 | ``` 75 | 76 | 3. **Start the application**: 77 | ```bash 78 | ./start.sh 79 | ``` 80 | 81 | The scripts handle all dependency installation, virtual environment setup, and server management automatically! 82 | 83 | ### Option 2: Manual Setup 84 | 85 | **For those who prefer manual control:** 86 | 87 | 1. **Clone the repository**: 88 | ```bash 89 | git clone https://github.com/build-with-groq/groq-subtitle-generator 90 | cd groq-subtitle-generator 91 | ``` 92 | 93 | 2. **Setup Python environment**: 94 | ```bash 95 | cd backend 96 | python3 -m venv venv 97 | source venv/bin/activate # On Windows: venv\Scripts\activate 98 | pip install -r requirements.txt 99 | cd .. 100 | ``` 101 | 102 | 3. **Setup Node.js dependencies**: 103 | ```bash 104 | npm install 105 | ``` 106 | 107 | 4. **Create environment file**: 108 | ```bash 109 | # Create backend/.env file 110 | cat > backend/.env << 'EOF' 111 | GROQ_API_KEY=your_groq_api_key_here 112 | GROQ_MODEL=qwen/qwen3-32b 113 | GROQ_WHISPER_MODEL=whisper-large-v3 114 | EOF 115 | ``` 116 | 117 | 5. **Start backend server**: 118 | ```bash 119 | cd backend 120 | source venv/bin/activate 121 | python main.py 122 | ``` 123 | 124 | 6. **In a new terminal, start frontend**: 125 | ```bash 126 | npm run dev 127 | ``` 128 | 129 | That's it! 
🎉 The application will be available at `http://localhost:3000` 130 | 131 | ## ⚙️ Configuration Options 132 | 133 | The `backend/.env` file supports these settings: 134 | 135 | ```env 136 | # Required 137 | GROQ_API_KEY=your_groq_api_key_here 138 | 139 | # Model Selection (optional) 140 | GROQ_MODEL=qwen/qwen3-32b 141 | GROQ_WHISPER_MODEL=whisper-large-v3 # Options: whisper-large-v3, whisper-large-v3-turbo, distil-whisper-large-v3-en 142 | ``` 143 | 144 | ## 🎬 Usage Workflow 145 | 146 | 1. **📤 Upload Video**: Drag and drop or select a video file 147 | 2. **🌐 Configure Languages**: 148 | - Source language 149 | - Target language for subtitles 150 | 3. **🎵 Transcription**: Whisper transcribes the audio with timestamps 151 | 4. **✏️ Edit & Review**: Review and edit transcription for perfect accuracy 152 | - Edit individual segments with timestamps 153 | - Ensure quality before translation 154 | 5. **🔄 Translation**: Qwen3-32b translates to your target language 155 | 6. **🎬 Generation**: Create subtitled video with burned-in subtitles 156 | 7. **📥 Download**: Get your subtitled video 157 | 158 | 159 | ## 🔍 Troubleshooting 160 | 161 | **Scripts not executable?** 162 | ```bash 163 | chmod +x setup.sh start.sh 164 | ``` 165 | 166 | **FFmpeg not found?** 167 | Make sure FFmpeg is installed and available in your PATH. 168 | 169 | **API key issues?** 170 | Ensure your Groq API key is correctly set in `backend/.env`. 171 | 172 | **Port conflicts?** 173 | The app uses ports 3000 (frontend) and 8000 (backend). Make sure these are available. 174 | 175 | 176 | ## 🙏 Acknowledgments 177 | 178 | - [Groq](https://groq.com) for ultra-fast AI inference 179 | - [FFmpeg](https://ffmpeg.org) for video processing 180 | - [FastAPI](https://fastapi.tiangolo.com) & [Next.js](https://nextjs.org) for the frameworks 181 | 182 | ## 👨💻 Author 183 | Created by **Krish Desai**, AI Applications Engineer Intern at **Groq**. 184 | Connect with him on [X (formerly Twitter)](https://x.com/thekrishdesai) and [LinkedIn](https://linkedin.com/in/desaikrish). 185 | 186 | ## 📄 License 187 | This project is licensed under the **MIT License**. 
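
## 🔌 Scripting the API

The same job flow the frontend drives can be scripted against the endpoints in `backend/main.py`. A minimal sketch of the upload → transcribe → review → continue loop (`input.mp4`, the Spanish target, and the `requests` library are illustrative assumptions):

```python
import time
import requests

BASE = "http://localhost:8000"

# upload the video; the server keeps it in memory under a job_id
with open("input.mp4", "rb") as f:
    job_id = requests.post(f"{BASE}/upload",
                           files={"file": ("input.mp4", f, "video/mp4")}).json()["job_id"]

# start processing; languages are sent as form fields, not JSON
requests.post(f"{BASE}/process/{job_id}", data={"target_language": "es"})

# poll until the transcription is ready for review (or the job fails)
while True:
    status = requests.get(f"{BASE}/status/{job_id}").json()["status"]
    if status in ("transcription_complete", "failed"):
        break
    time.sleep(2)

# fetch the transcript, optionally edit segments, then resume the job
review = requests.get(f"{BASE}/transcription/{job_id}").json()
transcription = review["transcription"]
transcription["segments"][0]["text"] = transcription["segments"][0]["text"].strip()
requests.post(f"{BASE}/transcription/{job_id}/continue", json=transcription)
```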
188 | -------------------------------------------------------------------------------- /backend/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mimetypes 3 | import subprocess 4 | import json 5 | from typing import Optional, Dict, Any 6 | from fastapi import UploadFile 7 | import ffmpeg 8 | from pathlib import Path 9 | 10 | from models.requests import FileInfo 11 | from utils.config import get_settings 12 | 13 | def validate_video_file(file: UploadFile) -> bool: 14 | """ 15 | validate uploaded video file format and type 16 | """ 17 | settings = get_settings() 18 | 19 | if not file.filename: 20 | return False 21 | 22 | file_extension = file.filename.split('.')[-1].lower() 23 | if file_extension not in settings.supported_video_formats: 24 | return False 25 | 26 | mime_type, _ = mimetypes.guess_type(file.filename) 27 | if mime_type and not mime_type.startswith('video/'): 28 | return False 29 | 30 | return True 31 | 32 | def get_file_info(file_path: str) -> FileInfo: 33 | """ 34 | extract video file information using ffmpeg.probe 35 | """ 36 | try: 37 | # file size 38 | file_size = os.path.getsize(file_path) 39 | filename = os.path.basename(file_path) 40 | 41 | probe = ffmpeg.probe(file_path) 42 | video_stream = next((stream for stream in probe['streams'] 43 | if stream['codec_type'] == 'video'), None) 44 | 45 | duration = None 46 | format_name = None 47 | resolution = None 48 | fps = None 49 | 50 | if 'format' in probe: 51 | duration = float(probe['format'].get('duration', 0)) 52 | format_name = probe['format'].get('format_name', 'unknown') 53 | 54 | if video_stream: 55 | width = video_stream.get('width') 56 | height = video_stream.get('height') 57 | if width and height: 58 | resolution = f"{width}x{height}" 59 | 60 | # calculate FPS 61 | fps_str = video_stream.get('r_frame_rate', '0/1') 62 | if '/' in fps_str: 63 | num, den = fps_str.split('/') 64 | if int(den) != 0: 65 | fps = float(num) / float(den) 66 | 67 | return FileInfo( 68 | filename=filename, 69 | size=file_size, 70 | duration=duration, 71 | format=format_name, 72 | resolution=resolution, 73 | fps=fps 74 | ) 75 | 76 | except Exception as e: 77 | return FileInfo( 78 | filename=os.path.basename(file_path), 79 | size=os.path.getsize(file_path) if os.path.exists(file_path) else 0 80 | ) 81 | 82 | def get_video_duration(file_path: str) -> Optional[float]: 83 | """ 84 | get video duration in seconds using ffprobe 85 | """ 86 | try: 87 | cmd = [ 88 | 'ffprobe', '-v', 'quiet', '-print_format', 'json', 89 | '-show_format', file_path 90 | ] 91 | result = subprocess.run(cmd, capture_output=True, text=True) 92 | 93 | if result.returncode == 0: 94 | data = json.loads(result.stdout) 95 | return float(data['format']['duration']) 96 | 97 | return None 98 | except Exception: 99 | return None 100 | 101 | def is_video_file(file_path: str) -> bool: 102 | """ 103 | check if file is a valid video file 104 | """ 105 | try: 106 | probe = ffmpeg.probe(file_path) 107 | video_streams = [stream for stream in probe['streams'] 108 | if stream['codec_type'] == 'video'] 109 | return len(video_streams) > 0 110 | except Exception: 111 | return False 112 | 113 | def get_video_codec(file_path: str) -> Optional[str]: 114 | """ 115 | get video codec information 116 | """ 117 | try: 118 | probe = ffmpeg.probe(file_path) 119 | video_stream = next((stream for stream in probe['streams'] 120 | if stream['codec_type'] == 'video'), None) 121 | if video_stream: 122 | return 
video_stream.get('codec_name') 123 | return None 124 | except Exception: 125 | return None 126 | 127 | def ensure_directory_exists(directory: str) -> None: 128 | """ 129 | create directory if it doesn't exist 130 | """ 131 | Path(directory).mkdir(parents=True, exist_ok=True) 132 | 133 | def clean_filename(filename: str) -> str: 134 | """ 135 | clean filename to remove invalid characters 136 | """ 137 | import re 138 | cleaned = re.sub(r'[<>:"/\\|?*]', '_', filename) 139 | return cleaned 140 | 141 | def get_safe_filename(base_name: str, extension: str, directory: str) -> str: 142 | """ 143 | generate a safe filename that doesn't conflict with existing files 144 | """ 145 | counter = 1 146 | base_path = os.path.join(directory, f"{base_name}.{extension}") 147 | 148 | while os.path.exists(base_path): 149 | base_path = os.path.join(directory, f"{base_name}_{counter}.{extension}") 150 | counter += 1 151 | 152 | return base_path 153 | 154 | def format_file_size(size_bytes: int) -> str: 155 | """ 156 | format file size in human-readable format 157 | """ 158 | if size_bytes == 0: 159 | return "0 B" 160 | 161 | size_names = ["B", "KB", "MB", "GB", "TB"] 162 | i = 0 163 | 164 | while size_bytes >= 1024 and i < len(size_names) - 1: 165 | size_bytes /= 1024.0 166 | i += 1 167 | 168 | return f"{size_bytes:.2f} {size_names[i]}" 169 | 170 | def format_duration(seconds: float) -> str: 171 | """ 172 | format duration in HH:MM:SS format 173 | """ 174 | hours = int(seconds // 3600) 175 | minutes = int((seconds % 3600) // 60) 176 | seconds = int(seconds % 60) 177 | 178 | if hours > 0: 179 | return f"{hours:02d}:{minutes:02d}:{seconds:02d}" 180 | else: 181 | return f"{minutes:02d}:{seconds:02d}" 182 | 183 | def check_ffmpeg_installed() -> bool: 184 | """ 185 | check if ffmpeg is installed and available 186 | """ 187 | try: 188 | subprocess.run(['ffmpeg', '-version'], 189 | capture_output=True, check=True) 190 | return True 191 | except (subprocess.CalledProcessError, FileNotFoundError): 192 | return False 193 | 194 | def check_disk_space(directory: str, required_space: int) -> bool: 195 | """ 196 | check if there's enough disk space for processing 197 | """ 198 | try: 199 | stat = os.statvfs(directory) 200 | available_space = stat.f_bavail * stat.f_frsize 201 | return available_space >= required_space 202 | except Exception: 203 | return True -------------------------------------------------------------------------------- /backend/services/video_processing_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import subprocess 4 | import logging 5 | from typing import Optional, List, Dict 6 | from pathlib import Path 7 | import asyncio 8 | from contextlib import asynccontextmanager 9 | 10 | from services.transcription_service import TranscriptionService 11 | from services.translation_service import TranslationService 12 | from services.subtitle_service import SubtitleService 13 | from models.requests import TranscriptionResult 14 | from utils.config import get_settings 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class VideoProcessingService: 19 | def __init__(self): 20 | self.settings = get_settings() 21 | self.transcription_service = TranscriptionService() 22 | self.translation_service = TranslationService() 23 | self.subtitle_service = SubtitleService() 24 | 25 | @asynccontextmanager 26 | async def temporary_file(self, suffix: str = ""): 27 | """context manager for temporary files that are automatically cleaned up""" 28 
| temp_file = None 29 | try: 30 | temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix) 31 | temp_file.close() 32 | yield temp_file.name 33 | finally: 34 | if temp_file and os.path.exists(temp_file.name): 35 | try: 36 | os.unlink(temp_file.name) 37 | logger.debug(f"Cleaned up temporary file: {temp_file.name}") 38 | except Exception as e: 39 | logger.warning(f"Failed to cleanup temporary file {temp_file.name}: {str(e)}") 40 | 41 | #processes video with streaming approach so no permanent files are saved 42 | async def process_video_streaming(self, video_data: bytes, target_language: str, 43 | source_language: Optional[str] = None) -> bytes: 44 | """ 45 | process video 46 | """ 47 | try: 48 | logger.info(f"Starting streaming video processing") 49 | 50 | # save video temporarily for processing 51 | async with self.temporary_file(suffix=".mp4") as temp_video_path: 52 | with open(temp_video_path, 'wb') as f: 53 | f.write(video_data) 54 | 55 | # extract audio temporarily 56 | async with self.temporary_file(suffix=".wav") as temp_audio_path: 57 | await self._extract_audio(temp_video_path, temp_audio_path) 58 | 59 | # transcribe audio 60 | transcription_result = await self.transcription_service.transcribe_audio(temp_audio_path) 61 | 62 | # detect language if not provided 63 | if not source_language: 64 | source_language = transcription_result.detected_language or "en" 65 | 66 | # translate if needed 67 | if source_language != target_language: 68 | translated_segments = await self.translation_service.translate_segments( 69 | transcription_result.segments, source_language, target_language 70 | ) 71 | 72 | final_transcription = TranscriptionResult( 73 | text=" ".join([segment.text for segment in translated_segments]), 74 | segments=translated_segments, 75 | detected_language=source_language, 76 | confidence=transcription_result.confidence 77 | ) 78 | else: 79 | final_transcription = transcription_result 80 | 81 | srt_content = await self.subtitle_service.generate_srt_content(final_transcription) 82 | 83 | async with self.temporary_file(suffix=".srt") as temp_subtitle_path: 84 | with open(temp_subtitle_path, 'w', encoding='utf-8') as f: 85 | f.write(srt_content) 86 | 87 | return await self._render_video_to_bytes(temp_video_path, temp_subtitle_path) 88 | 89 | except Exception as e: 90 | logger.error(f"Error in streaming video processing: {str(e)}") 91 | raise 92 | 93 | async def _extract_audio(self, video_path: str, audio_path: str): 94 | """extract audio from video""" 95 | try: 96 | cmd = [ 97 | 'ffmpeg', '-i', video_path, 98 | '-vn', '-acodec', 'pcm_s16le', 99 | '-ar', '16000', '-ac', '1', 100 | '-y', audio_path 101 | ] 102 | 103 | process = await asyncio.create_subprocess_exec( 104 | *cmd, 105 | stdout=asyncio.subprocess.PIPE, 106 | stderr=asyncio.subprocess.PIPE 107 | ) 108 | 109 | stdout, stderr = await process.communicate() 110 | 111 | if process.returncode != 0: 112 | raise Exception(f"Audio extraction failed: {stderr.decode()}") 113 | 114 | logger.info("Audio extraction completed") 115 | 116 | except Exception as e: 117 | logger.error(f"Error extracting audio: {str(e)}") 118 | raise 119 | 120 | async def _render_video_to_bytes(self, video_path: str, subtitle_path: str) -> bytes: 121 | """render video with subtitles and return as bytes""" 122 | try: 123 | async with self.temporary_file(suffix=".mp4") as temp_output_path: 124 | cmd = [ 125 | 'ffmpeg', '-i', video_path, 126 | '-vf', f'subtitles={subtitle_path}', 127 | '-c:a', 'copy', 128 | '-c:v', 'libx264', 129 | '-y', 
temp_output_path
130 |                 ]
131 | 
132 |                 process = await asyncio.create_subprocess_exec(
133 |                     *cmd,
134 |                     stdout=asyncio.subprocess.PIPE,
135 |                     stderr=asyncio.subprocess.PIPE
136 |                 )
137 | 
138 |                 stdout, stderr = await process.communicate()
139 | 
140 |                 if process.returncode != 0:
141 |                     raise Exception(f"Video rendering failed: {stderr.decode()}")
142 | 
143 |                 # read the rendered video into memory
144 |                 with open(temp_output_path, 'rb') as f:
145 |                     video_bytes = f.read()
146 | 
147 |                 logger.info("Video rendering completed")
148 |                 return video_bytes
149 | 
150 |         except Exception as e:
151 |             logger.error(f"Error rendering video: {str(e)}")
152 |             raise
153 | 
154 |     async def get_video_info(self, video_data: bytes) -> Dict:
155 |         """Get video information from bytes"""
156 |         try:
157 |             async with self.temporary_file(suffix=".mp4") as temp_video_path:
158 |                 # write video data to temporary file
159 |                 with open(temp_video_path, 'wb') as f:
160 |                     f.write(video_data)
161 | 
162 |                 cmd = [
163 |                     'ffprobe', '-v', 'quiet', '-print_format', 'json',
164 |                     '-show_format', '-show_streams', temp_video_path
165 |                 ]
166 | 
167 |                 process = await asyncio.create_subprocess_exec(
168 |                     *cmd,
169 |                     stdout=asyncio.subprocess.PIPE,
170 |                     stderr=asyncio.subprocess.PIPE
171 |                 )
172 | 
173 |                 stdout, stderr = await process.communicate()
174 | 
175 |                 if process.returncode != 0:
176 |                     raise Exception(f"Failed to get video info: {stderr.decode()}")
177 | 
178 |                 import json
179 |                 return json.loads(stdout.decode())
180 | 
181 |         except Exception as e:
182 |             logger.error(f"Error getting video info: {str(e)}")
183 |             raise
184 | 
185 |     async def detect_language_from_video(self, video_data: bytes) -> str:
186 |         """detect language from video bytes"""
187 |         try:
188 |             async with self.temporary_file(suffix=".mp4") as temp_video_path:
189 |                 # write video data to temporary file
190 |                 with open(temp_video_path, 'wb') as f:
191 |                     f.write(video_data)
192 | 
193 |                 # extract audio temporarily
194 |                 async with self.temporary_file(suffix=".wav") as temp_audio_path:
195 |                     await self._extract_audio(temp_video_path, temp_audio_path)
196 |                     return await self.transcription_service.detect_language(temp_audio_path)
197 | 
198 |         except Exception as e:
199 |             logger.error(f"Error detecting language: {str(e)}")
200 |             raise
--------------------------------------------------------------------------------
/backend/services/translation_service.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import logging
3 | from typing import List, Optional, Dict
4 | from groq import Groq
5 | import json
6 | import re
7 | 
8 | from models.requests import TranslationResult, TranscriptionSegment
9 | from utils.config import get_settings, get_language_name, get_language_code, validate_groq_key
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | class TranslationService:
14 |     """
15 |     service for translating text
16 |     """
17 | 
18 |     def __init__(self):
19 |         self.settings = get_settings()
20 |         validate_groq_key()
21 |         self.client = Groq(api_key=self.settings.groq_api_key)
22 |         self.model = self.settings.groq_model
23 | 
24 |     #main public interface for translating text (asynchronous method)
25 |     #translates a given text from a source language to a target language
26 |     #returns a TranslationResult
27 |     async def translate_text(self, text: str, source_language: str, target_language: str,
28 |                             context: Optional[str] = None) -> TranslationResult:
29 |         """
30 |         translate text from source to target language
31 |         """
32 |         try:
33 | 
logger.info(f"Translating text from {source_language} to {target_language}") 34 | 35 | # skip translation if source and target are the same 36 | if source_language.lower() == target_language.lower(): 37 | return TranslationResult( 38 | translated_text=text, 39 | source_language=source_language, 40 | target_language=target_language, 41 | confidence=1.0 42 | ) 43 | 44 | # get language names for better context 45 | source_lang_name = get_language_name(source_language) 46 | target_lang_name = get_language_name(target_language) 47 | 48 | prompt = self._create_translation_prompt( 49 | text, source_lang_name, target_lang_name, context 50 | ) 51 | 52 | loop = asyncio.get_event_loop() 53 | translated_text = await loop.run_in_executor( 54 | None, 55 | self._translate_with_groq, 56 | prompt 57 | ) 58 | 59 | # post-process the translation 60 | translated_text = self._post_process_translation(translated_text) 61 | 62 | logger.info(f"Translation completed. Original: {len(text)} chars, Translated: {len(translated_text)} chars") 63 | 64 | return TranslationResult( 65 | translated_text=translated_text, 66 | source_language=source_language, 67 | target_language=target_language, 68 | confidence=0.9 69 | ) 70 | 71 | except Exception as e: 72 | logger.error(f"Translation error: {str(e)}") 73 | raise Exception(f"Translation failed: {str(e)}") 74 | 75 | #translates multiple transcription segments with TranscriptionSegment objects 76 | #returns a list of translated segments 77 | async def translate_segments(self, segments: List[TranscriptionSegment], 78 | source_language: str, target_language: str) -> List[TranscriptionSegment]: 79 | """ 80 | translate multiple segments with context awareness 81 | """ 82 | try: 83 | logger.info(f"Translating {len(segments)} segments") 84 | 85 | # skip translation if source and target are the same 86 | if source_language.lower() == target_language.lower(): 87 | return segments 88 | 89 | batch_size = 10 90 | translated_segments = [] 91 | 92 | for i in range(0, len(segments), batch_size): 93 | batch = segments[i:i + batch_size] 94 | 95 | batch_text = self._create_batch_text(batch) 96 | 97 | # translate the batch 98 | translated_batch = await self.translate_text( 99 | batch_text, 100 | source_language, 101 | target_language, 102 | context="This is a batch of subtitle segments from a video. Maintain natural flow and context." 103 | ) 104 | 105 | # parse the translated batch back to segments 106 | translated_batch_segments = self._parse_batch_translation( 107 | translated_batch.translated_text, batch 108 | ) 109 | 110 | translated_segments.extend(translated_batch_segments) 111 | 112 | logger.info(f"Segment translation completed: {len(translated_segments)} segments") 113 | 114 | return translated_segments 115 | 116 | except Exception as e: 117 | logger.error(f"Segment translation error: {str(e)}") 118 | raise Exception(f"Segment translation failed: {str(e)}") 119 | 120 | #creates a prompt for the translation 121 | def _create_translation_prompt(self, text: str, source_lang: str, target_lang: str, 122 | context: Optional[str] = None) -> str: 123 | """ 124 | Create an optimized prompt for Groq translation 125 | """ 126 | prompt = f"""You are a professional translator specializing in subtitle translation for videos. 127 | 128 | Task: Translate the following text from {source_lang} to {target_lang}. 129 | 130 | Instructions: 131 | 1. Maintain natural flow and readability for subtitles 132 | 2. Keep cultural context and nuances 133 | 3. 
Use appropriate subtitle conventions for {target_lang}
134 | 4. Preserve timing and pacing suitable for spoken dialogue
135 | 5. Handle names, places, and technical terms appropriately
136 | 6. Keep similar length when possible for subtitle display
137 | 7. Add the appropriate substitutes for interjections.
138 | 8. ENSURE THAT THE ENTIRE TEXT IS TRANSLATED TO THE TARGET LANGUAGE.
139 | 
140 | {f"Context: {context}" if context else ""}
141 | 
142 | Source text ({source_lang}):
143 | {text}
144 | 
145 | Translated text ({target_lang}):"""
146 | 
147 |         return prompt
148 | 
149 |     #actual translation method that calls the groq api
150 |     def _translate_with_groq(self, prompt: str) -> str:
151 |         """
152 |         Perform the actual translation using Groq
153 |         """
154 |         try:
155 |             response = self.client.chat.completions.create(
156 |                 model=self.model,
157 |                 messages=[
158 |                     {
159 |                         "role": "user",
160 |                         "content": prompt
161 |                     }
162 |                 ],
163 |                 temperature=0.3,
164 |                 max_tokens=2000,
165 |                 top_p=0.9,
166 |                 frequency_penalty=0.0,
167 |                 presence_penalty=0.0
168 |             )
169 | 
170 |             return response.choices[0].message.content.strip()
171 | 
172 |         except Exception as e:
173 |             logger.error(f"Groq translation error: {str(e)}")
174 |             raise
175 | 
176 |     #post-process the translation to clean up common issues
177 |     def _post_process_translation(self, translated_text: str) -> str:
178 |         """
179 |         Post-process the translated text to clean up common issues
180 |         """
181 |         prefixes_to_remove = [
182 |             "Translated text:",
183 |             "Translation:",
184 |             "Here's the translation:",
185 |             "The translation is:",
186 |             "In English:",
187 |             "In Spanish:",
188 |             "In French:",
189 |             "In German:",
190 |             "In Italian:",
191 |             "In Portuguese:",
192 |             "In Russian:",
193 |             "In Japanese:",
194 |             "In Korean:",
195 |             "In Chinese:",
196 |             "In Arabic:",
197 |             "In Hindi:",
198 |         ]
199 | 
200 |         for prefix in prefixes_to_remove:
201 |             if translated_text.lower().startswith(prefix.lower()):
202 |                 translated_text = translated_text[len(prefix):].strip()
203 | 
204 |         if translated_text.startswith('"') and translated_text.endswith('"'):
205 |             translated_text = translated_text[1:-1]
206 | 
207 |         if translated_text.startswith("'") and translated_text.endswith("'"):
208 |             translated_text = translated_text[1:-1]
209 | 
210 |         return translated_text.strip()
211 | 
212 |     def _create_batch_text(self, segments: List[TranscriptionSegment]) -> str:
213 |         """
214 |         create a batch text from segments with markers for parsing
215 |         """
216 |         batch_lines = []
217 |         for i, segment in enumerate(segments):
218 |             batch_lines.append(f"[{i}] {segment.text}")
219 | 
220 |         return "\n".join(batch_lines)
221 | 
222 |     def _parse_batch_translation(self, translated_text: str,
223 |                                 original_segments: List[TranscriptionSegment]) -> List[TranscriptionSegment]:
224 |         """
225 |         parse the translated batch back to individual segments
226 |         """
227 |         try:
228 |             lines = translated_text.strip().split('\n')
229 |             translated_segments = []
230 | 
231 |             segment_map = {}
232 |             for line in lines:
233 |                 line = line.strip()
234 |                 if line and line.startswith('[') and ']' in line:
235 |                     # Extract segment index and text
236 |                     marker_end = line.find(']')
237 |                     if marker_end > 0:
238 |                         try:
239 |                             index = int(line[1:marker_end])
240 |                             text = line[marker_end + 1:].strip()
241 |                             segment_map[index] = text
242 |                         except ValueError:
243 |                             continue
244 | 
245 |             for i, original_segment in enumerate(original_segments):
246 |                 translated_text = segment_map.get(i, original_segment.text)
247 | 
248 | 
translated_segments.append(TranscriptionSegment( 249 | start=original_segment.start, 250 | end=original_segment.end, 251 | text=translated_text, 252 | confidence=original_segment.confidence 253 | )) 254 | 255 | return translated_segments 256 | 257 | except Exception as e: 258 | logger.error(f"Error parsing batch translation: {str(e)}") 259 | return original_segments 260 | 261 | async def detect_language_groq(self, text: str) -> str: 262 | """ 263 | use groq to detect the language of text 264 | """ 265 | try: 266 | prompt = f"""Detect the language of the following text and respond with only the language code (e.g., 'en' for English, 'es' for Spanish, 'fr' for French, etc.). 267 | 268 | Text: {text[:500]} 269 | 270 | Language code:""" 271 | 272 | loop = asyncio.get_event_loop() 273 | response = await loop.run_in_executor( 274 | None, 275 | self._translate_with_groq, 276 | prompt 277 | ) 278 | 279 | # extract language code from response 280 | detected_lang = response.strip().lower() 281 | 282 | # validate against supported languages 283 | if detected_lang in self.settings.supported_languages: 284 | return detected_lang 285 | 286 | # try to match against language names 287 | for code, name in self.settings.supported_languages.items(): 288 | if name.lower() in detected_lang.lower(): 289 | return code 290 | 291 | logger.warning(f"Unknown language detected: {detected_lang}, defaulting to 'en'") 292 | return "en" 293 | 294 | except Exception as e: 295 | logger.error(f"Language detection error: {str(e)}") 296 | return "en" 297 | 298 | async def improve_translation_quality(self, original_text: str, translated_text: str, 299 | source_language: str, target_language: str) -> str: 300 | """ 301 | use groq to improve translation quality by reviewing and refining 302 | """ 303 | try: 304 | source_lang_name = get_language_name(source_language) 305 | target_lang_name = get_language_name(target_language) 306 | 307 | prompt = f"""Review and improve the following translation for subtitle display: 308 | 309 | You are a professional subtitle translator. Your task is to improve subtitle translations with high accuracy and natural fluency. 310 | 311 | Original ({source_lang_name}): "{original_text}" 312 | Current Translation ({target_lang_name}): "{translated_text}" 313 | 314 | Guidelines: 315 | 1. Return **only the improved subtitle translation**, nothing else (no explanations or reasoning). 316 | 2. If the original text is background music, sound effects, or irrelevant (e.g., intro/outro credits, logos, gibberish), return an **empty string** "" to omit it from subtitles. 317 | 3. If a character repeats a word for emphasis (e.g., "no no no"), keep it natural but **limit repetition** to 2–3 times max unless it’s critical to context. 318 | 4. Keep translation concise and natural for subtitle display. Avoid overly long sentences. 319 | 5. Do not insert random characters, numbers, or non-speech text. 320 | 6. ENSURE THAT THE ENTIRE TEXT IS TRANSLATED TO THE TARGET LANGUAGE. 
321 | 322 | Improved translation:""" 323 | 324 | loop = asyncio.get_event_loop() 325 | improved_translation = await loop.run_in_executor( 326 | None, 327 | self._translate_with_groq, 328 | prompt 329 | ) 330 | 331 | return self._post_process_translation(improved_translation) 332 | 333 | except Exception as e: 334 | logger.error(f"Translation improvement error: {str(e)}") 335 | return translated_text 336 | 337 | def get_supported_languages(self) -> Dict[str, str]: 338 | """ 339 | Get list of supported languages for translation 340 | """ 341 | return self.settings.supported_languages -------------------------------------------------------------------------------- /backend/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, File, UploadFile, HTTPException, Form, BackgroundTasks 2 | from fastapi.responses import StreamingResponse 3 | from fastapi.middleware.cors import CORSMiddleware 4 | import uvicorn 5 | import logging 6 | from typing import Optional, Dict 7 | import asyncio 8 | import json 9 | from datetime import datetime 10 | import io 11 | 12 | from services.video_processing_service import VideoProcessingService 13 | from utils.config import get_settings 14 | 15 | logging.basicConfig( 16 | level=logging.INFO, 17 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 18 | ) 19 | logger = logging.getLogger(__name__) 20 | 21 | app = FastAPI(title="Video Subtitle Generator", version="1.0.0") 22 | 23 | # Configure CORS 24 | app.add_middleware( 25 | CORSMiddleware, 26 | allow_origins=["http://localhost:3000"], 27 | allow_credentials=True, 28 | allow_methods=["*"], 29 | allow_headers=["*"], 30 | ) 31 | 32 | video_service = VideoProcessingService() 33 | settings = get_settings() 34 | 35 | active_jobs: Dict[str, Dict] = {} 36 | 37 | @app.get("/") 38 | async def root(): 39 | return {"message": "Video Subtitle Generator API"} 40 | 41 | @app.post("/upload") 42 | async def upload_video(file: UploadFile = File(...)): 43 | """Upload video file and return job ID""" 44 | try: 45 | # validate 46 | if not file.content_type.startswith('video/'): 47 | raise HTTPException(status_code=400, detail="File must be a video") 48 | 49 | # read video data 50 | video_data = await file.read() 51 | 52 | # check file size 53 | if len(video_data) > settings.max_file_size: 54 | max_size_mb = settings.max_file_size / (1024 * 1024) 55 | raise HTTPException(status_code=400, detail=f"File size exceeds {max_size_mb:.0f}MB limit") 56 | 57 | import uuid 58 | job_id = str(uuid.uuid4()) 59 | 60 | active_jobs[job_id] = { 61 | "status": "uploaded", 62 | "filename": file.filename, 63 | "video_data": video_data, 64 | "created_at": datetime.now().isoformat(), 65 | "progress": 0 66 | } 67 | 68 | logger.info(f"Video uploaded successfully: {file.filename} (Job ID: {job_id})") 69 | 70 | return { 71 | "job_id": job_id, 72 | "filename": file.filename, 73 | "size": len(video_data), 74 | "status": "uploaded" 75 | } 76 | 77 | except Exception as e: 78 | logger.error(f"Error uploading video: {str(e)}") 79 | raise HTTPException(status_code=500, detail=str(e)) 80 | 81 | @app.post("/process/{job_id}") 82 | async def process_video( 83 | job_id: str, 84 | background_tasks: BackgroundTasks, 85 | target_language: str = Form(...), 86 | source_language: Optional[str] = Form(None) 87 | ): 88 | """Start video processing""" 89 | try: 90 | if job_id not in active_jobs: 91 | raise HTTPException(status_code=404, detail="Job not found") 92 | 93 | job = active_jobs[job_id] 94 | 95 | if 
job["status"] != "uploaded": 96 | raise HTTPException(status_code=400, detail="Job not ready for processing") 97 | 98 | job["target_language"] = target_language 99 | 100 | background_tasks.add_task( 101 | process_video_background, 102 | job_id, 103 | job["video_data"], 104 | target_language, 105 | source_language 106 | ) 107 | 108 | logger.info(f"Started processing for job: {job_id}") 109 | 110 | return { 111 | "job_id": job_id, 112 | "status": "processing_started", 113 | "message": "Video processing started" 114 | } 115 | 116 | except Exception as e: 117 | logger.error(f"Error starting processing: {str(e)}") 118 | raise HTTPException(status_code=500, detail=str(e)) 119 | 120 | async def process_video_background(job_id: str, video_data: bytes, target_language: str, source_language: Optional[str]): 121 | """Background task for video processing""" 122 | try: 123 | job = active_jobs[job_id] 124 | 125 | job["progress"] = 20 126 | job["status"] = "extracting_audio" 127 | 128 | async with video_service.temporary_file(suffix=".mp4") as temp_video_path: 129 | with open(temp_video_path, 'wb') as f: 130 | f.write(video_data) 131 | 132 | async with video_service.temporary_file(suffix=".wav") as temp_audio_path: 133 | await video_service._extract_audio(temp_video_path, temp_audio_path) 134 | 135 | job["progress"] = 50 136 | job["status"] = "transcribing" 137 | 138 | logger.info(f"Starting transcription for job {job_id}") 139 | transcription_result = await video_service.transcription_service.transcribe_audio( 140 | temp_audio_path, 141 | language=source_language 142 | ) 143 | 144 | logger.info(f"Transcription completed for job {job_id}. Segments: {len(transcription_result.segments)}, Text length: {len(transcription_result.text)}") 145 | 146 | # detect language if not provided 147 | if not source_language: 148 | source_language = transcription_result.detected_language or "en" 149 | 150 | # store transcription result for user review 151 | job["transcription_result"] = { 152 | "text": transcription_result.text, 153 | "segments": [ 154 | { 155 | "start": seg.start, 156 | "end": seg.end, 157 | "text": seg.text, 158 | "confidence": seg.confidence 159 | } for seg in transcription_result.segments 160 | ], 161 | "detected_language": transcription_result.detected_language, 162 | "confidence": transcription_result.confidence 163 | } 164 | job["source_language"] = source_language 165 | job["status"] = "transcription_complete" 166 | job["progress"] = 60 167 | 168 | logger.info(f"Transcription completed for job {job_id}, waiting for user review") 169 | 170 | except Exception as e: 171 | logger.error(f"Error processing video {job_id}: {str(e)}") 172 | if job_id in active_jobs: 173 | active_jobs[job_id]["status"] = "failed" 174 | active_jobs[job_id]["error"] = str(e) 175 | active_jobs[job_id]["progress"] = 0 176 | 177 | @app.get("/status/{job_id}") 178 | async def get_job_status(job_id: str): 179 | """Get job status""" 180 | try: 181 | if job_id not in active_jobs: 182 | raise HTTPException(status_code=404, detail="Job not found") 183 | 184 | job = active_jobs[job_id] 185 | 186 | return { 187 | "job_id": job_id, 188 | "status": job["status"], 189 | "progress": job["progress"], 190 | "message": job.get("error", ""), 191 | "filename": job.get("filename", "") 192 | } 193 | 194 | except Exception as e: 195 | logger.error(f"Error getting job status: {str(e)}") 196 | raise HTTPException(status_code=500, detail=str(e)) 197 | 198 | @app.get("/transcription/{job_id}") 199 | async def get_transcription(job_id: str): 200 | 
"""Get transcription result for user review""" 201 | try: 202 | if job_id not in active_jobs: 203 | raise HTTPException(status_code=404, detail="Job not found") 204 | 205 | job = active_jobs[job_id] 206 | 207 | if job["status"] != "transcription_complete": 208 | raise HTTPException(status_code=400, detail="Transcription not ready for review") 209 | 210 | return { 211 | "job_id": job_id, 212 | "transcription": job["transcription_result"], 213 | "source_language": job["source_language"], 214 | "target_language": job["target_language"], 215 | "filename": job["filename"] 216 | } 217 | 218 | except Exception as e: 219 | logger.error(f"Error getting transcription: {str(e)}") 220 | raise HTTPException(status_code=500, detail=str(e)) 221 | 222 | @app.post("/transcription/{job_id}/continue") 223 | async def continue_with_transcription( 224 | job_id: str, 225 | background_tasks: BackgroundTasks, 226 | transcription: dict 227 | ): 228 | """Continue processing with edited transcription""" 229 | try: 230 | if job_id not in active_jobs: 231 | raise HTTPException(status_code=404, detail="Job not found") 232 | 233 | job = active_jobs[job_id] 234 | 235 | if job["status"] != "transcription_complete": 236 | raise HTTPException(status_code=400, detail="Job not ready for transcription continuation") 237 | 238 | # start background processing with edited transcription 239 | background_tasks.add_task( 240 | continue_processing_after_transcription, 241 | job_id, 242 | transcription 243 | ) 244 | 245 | logger.info(f"Continuing processing with edited transcription for job: {job_id}") 246 | 247 | return { 248 | "job_id": job_id, 249 | "status": "processing_continued", 250 | "message": "Processing continued with edited transcription" 251 | } 252 | 253 | except Exception as e: 254 | logger.error(f"Error continuing with transcription: {str(e)}") 255 | raise HTTPException(status_code=500, detail=str(e)) 256 | 257 | async def continue_processing_after_transcription(job_id: str, edited_transcription: dict): 258 | """Continue processing after user has reviewed/edited transcription""" 259 | try: 260 | job = active_jobs[job_id] 261 | 262 | # recreate transcription result from edited data 263 | from models.requests import TranscriptionResult, TranscriptionSegment 264 | 265 | segments = [ 266 | TranscriptionSegment( 267 | start=seg["start"], 268 | end=seg["end"], 269 | text=seg["text"], 270 | confidence=seg["confidence"] 271 | ) for seg in edited_transcription["segments"] 272 | ] 273 | 274 | transcription_result = TranscriptionResult( 275 | text=edited_transcription["text"], 276 | segments=segments, 277 | detected_language=edited_transcription["detected_language"], 278 | confidence=edited_transcription["confidence"] 279 | ) 280 | 281 | job["status"] = "translating" 282 | job["progress"] = 70 283 | 284 | source_language = job["source_language"] 285 | target_language = job["target_language"] 286 | 287 | if source_language != target_language: 288 | translated_segments = await video_service.translation_service.translate_segments( 289 | transcription_result.segments, source_language, target_language 290 | ) 291 | 292 | final_transcription = TranscriptionResult( 293 | text=" ".join([segment.text for segment in translated_segments]), 294 | segments=translated_segments, 295 | detected_language=source_language, 296 | confidence=transcription_result.confidence 297 | ) 298 | else: 299 | final_transcription = transcription_result 300 | 301 | job["status"] = "generating_subtitles" 302 | job["progress"] = 80 303 | 304 | # generate subtitles 
async def continue_processing_after_transcription(job_id: str, edited_transcription: dict):
    """Continue processing after the user has reviewed/edited the transcription"""
    try:
        job = active_jobs[job_id]

        # recreate the transcription result from the edited data
        from models.requests import TranscriptionResult, TranscriptionSegment

        segments = [
            TranscriptionSegment(
                start=seg["start"],
                end=seg["end"],
                text=seg["text"],
                confidence=seg["confidence"]
            ) for seg in edited_transcription["segments"]
        ]

        transcription_result = TranscriptionResult(
            text=edited_transcription["text"],
            segments=segments,
            detected_language=edited_transcription["detected_language"],
            confidence=edited_transcription["confidence"]
        )

        job["status"] = "translating"
        job["progress"] = 70

        source_language = job["source_language"]
        target_language = job["target_language"]

        # translate only when the target differs from the source
        if source_language != target_language:
            translated_segments = await video_service.translation_service.translate_segments(
                transcription_result.segments, source_language, target_language
            )

            final_transcription = TranscriptionResult(
                text=" ".join([segment.text for segment in translated_segments]),
                segments=translated_segments,
                detected_language=source_language,
                confidence=transcription_result.confidence
            )
        else:
            final_transcription = transcription_result

        job["status"] = "generating_subtitles"
        job["progress"] = 80

        # generate subtitles and render them into the video
        srt_content = await video_service.subtitle_service.generate_srt_content(final_transcription)

        video_data = job["video_data"]

        async with video_service.temporary_file(suffix=".mp4") as temp_video_path:
            with open(temp_video_path, 'wb') as f:
                f.write(video_data)

            async with video_service.temporary_file(suffix=".srt") as temp_subtitle_path:
                with open(temp_subtitle_path, 'w', encoding='utf-8') as f:
                    f.write(srt_content)

                job["status"] = "rendering_video"
                job["progress"] = 90

                result_video_bytes = await video_service._render_video_to_bytes(temp_video_path, temp_subtitle_path)

                # update the job with the result
                job["status"] = "completed"
                job["progress"] = 100
                job["result_video"] = result_video_bytes
                job["completed_at"] = datetime.now().isoformat()

                # drop large intermediates to save memory
                del job["video_data"]
                del job["transcription_result"]

                logger.info(f"Job completed successfully: {job_id}")

    except Exception as e:
        logger.error(f"Error continuing processing for job {job_id}: {str(e)}")
        if job_id in active_jobs:
            active_jobs[job_id]["status"] = "failed"
            active_jobs[job_id]["error"] = str(e)
            active_jobs[job_id]["progress"] = 0

@app.get("/download/{job_id}")
async def download_video(job_id: str):
    """Download the processed video with subtitles"""
    try:
        if job_id not in active_jobs:
            raise HTTPException(status_code=404, detail="Job not found")

        job = active_jobs[job_id]

        if job["status"] != "completed":
            raise HTTPException(status_code=400, detail="Job not completed yet")

        if "result_video" not in job:
            raise HTTPException(status_code=404, detail="Processed video not found")

        video_bytes = job["result_video"]

        # build "<name>_subtitled.<ext>"; fall back gracefully if there is no extension
        original_filename = job["filename"]
        if '.' in original_filename:
            name, ext = original_filename.rsplit('.', 1)
            download_filename = f"{name}_subtitled.{ext}"
        else:
            download_filename = f"{original_filename}_subtitled.mp4"

        def cleanup_job():
            if job_id in active_jobs:
                del active_jobs[job_id]
                logger.info(f"Cleaned up job from memory: {job_id}")

        def generate():
            yield video_bytes
            cleanup_job()

        return StreamingResponse(
            generate(),
            media_type="video/mp4",
            headers={
                "Content-Disposition": f"attachment; filename={download_filename}",
                "Content-Length": str(len(video_bytes))
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error downloading video: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
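# --- Illustrative usage (not part of the service) --------------------------------
# A hedged sketch that streams the finished file to disk once /status reports
# "completed". Host, output path, and chunk size are assumptions.
def _example_download(job_id: str, out_path: str = "subtitled.mp4") -> None:
    import requests

    with requests.get(f"http://localhost:8000/download/{job_id}", stream=True) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1 << 20):  # 1 MiB chunks
                f.write(chunk)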
preview: {str(e)}") 412 | raise HTTPException(status_code=500, detail=str(e)) 413 | 414 | if __name__ == "__main__": 415 | uvicorn.run(app, host="0.0.0.0", port=8000) -------------------------------------------------------------------------------- /backend/services/transcription_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import logging 4 | from typing import List, Optional, Dict 5 | from groq import Groq 6 | import tempfile 7 | import json 8 | import re 9 | 10 | from models.requests import TranscriptionResult, TranscriptionSegment 11 | from utils.config import get_settings, get_language_code, validate_groq_key 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class TranscriptionService: 16 | """transcribe audio using whisper""" 17 | 18 | def __init__(self): 19 | self.settings = get_settings() 20 | validate_groq_key() 21 | self.client = Groq(api_key=self.settings.groq_api_key) 22 | self.model = self.settings.groq_whisper_model 23 | 24 | #main public interface for transcribing audio (asynchronous method) 25 | async def transcribe_audio(self, audio_path: str, language: Optional[str] = None) -> TranscriptionResult: 26 | """transcribe audio file to text with timestamps using whisper""" 27 | try: 28 | logger.info(f"Transcribing audio: {audio_path} with model: {self.model}") 29 | if language: 30 | logger.info(f"Using specified language: {language}") 31 | 32 | #read audio file and run executor to avoid blocking 33 | with open(audio_path, "rb") as file: 34 | audio_data = file.read() 35 | logger.info(f"Audio file size: {len(audio_data)} bytes") 36 | 37 | loop = asyncio.get_event_loop() 38 | result = await loop.run_in_executor( 39 | None, 40 | self._transcribe_with_groq, 41 | audio_data, 42 | language 43 | ) 44 | 45 | transcription_result = self._convert_groq_result(result, language) 46 | 47 | logger.info(f"Transcription completed successfully:") 48 | logger.info(f" - Text length: {len(transcription_result.text)} characters") 49 | logger.info(f" - Number of segments: {len(transcription_result.segments)}") 50 | logger.info(f" - Detected language: {transcription_result.detected_language}") 51 | logger.info(f" - Overall confidence: {transcription_result.confidence:.2f}") 52 | 53 | return transcription_result 54 | 55 | except Exception as e: 56 | logger.error(f"Error transcribing audio: {str(e)}") 57 | raise Exception(f"Transcription failed: {str(e)}") 58 | 59 | #this is the actual transcription method that calls the groq api 60 | def _transcribe_with_groq(self, audio_data: bytes, language: Optional[str] = None) -> dict: 61 | """perform the actual transcription""" 62 | try: 63 | with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file: 64 | temp_file.write(audio_data) 65 | temp_file.flush() 66 | 67 | logger.info(f"Calling Groq API with model: {self.model}") 68 | 69 | with open(temp_file.name, "rb") as file: 70 | transcription = self.client.audio.transcriptions.create( 71 | file=file, 72 | model=self.model, 73 | language=language, 74 | response_format="verbose_json", 75 | temperature=0.0 76 | ) 77 | 78 | os.unlink(temp_file.name) 79 | 80 | if hasattr(transcription, 'model_dump'): 81 | result = transcription.model_dump() 82 | else: 83 | result = transcription.__dict__ 84 | 85 | logger.info(f"Groq API response received:") 86 | logger.info(f" - Response keys: {list(result.keys())}") 87 | logger.info(f" - Text length: {len(result.get('text', ''))}") 88 | 89 | if 'segments' in result: 90 | 
logger.info(f" - Number of segments: {len(result['segments'])}") 91 | else: 92 | logger.warning(" - No segments in response (text-only)") 93 | 94 | return result 95 | 96 | except Exception as e: 97 | logger.error(f"Groq API transcription error: {str(e)}") 98 | raise 99 | 100 | #converts the output from the groq api to the intended format 101 | def _convert_groq_result(self, result: dict, expected_language: Optional[str] = None) -> TranscriptionResult: 102 | """Convert Groq API result to our TranscriptionResult format""" 103 | try: 104 | segments = [] 105 | 106 | # handle segmented response 107 | if 'segments' in result and result['segments']: 108 | logger.info("Processing segmented transcription") 109 | for i, segment in enumerate(result['segments']): 110 | start_time = float(segment.get('start', 0.0)) 111 | end_time = float(segment.get('end', 0.0)) 112 | text = segment.get('text', '').strip() 113 | 114 | if not text: 115 | logger.debug(f"Skipping empty segment {i}") 116 | continue 117 | 118 | # random thing to check for hallucinations (not sure if this is needed) 119 | if self._is_likely_hallucination(text, expected_language): 120 | logger.warning(f"Potential hallucination detected in segment {i}: '{text}' - skipping") 121 | continue 122 | 123 | # calculate confidence 124 | confidence = 0.8 # default 125 | if 'avg_logprob' in segment: 126 | logprob = float(segment['avg_logprob']) 127 | # convert logprob to confidence (recheck calculation as this is a rough approximation) 128 | confidence = max(0.1, min(1.0, (logprob + 3.0) / 3.0)) 129 | elif 'confidence' in segment: 130 | confidence = float(segment.get('confidence', 0.8)) 131 | 132 | confidence = max(0.1, min(1.0, confidence)) 133 | 134 | # ensure end time is after start time 135 | if end_time <= start_time: 136 | words = len(text.split()) 137 | estimated_duration = max(1.0, words / 2.5) 138 | end_time = start_time + estimated_duration 139 | logger.debug(f"Estimated duration for segment {i}: {estimated_duration:.2f}s") 140 | 141 | # add the segment to the list 142 | segments.append(TranscriptionSegment( 143 | start=start_time, 144 | end=end_time, 145 | text=text, 146 | confidence=confidence 147 | )) 148 | 149 | logger.debug(f"Segment {i}: {start_time:.2f}-{end_time:.2f}s, confidence: {confidence:.2f}") 150 | 151 | # handle text-only response 152 | elif 'text' in result and result['text'].strip(): 153 | logger.info("Processing text-only transcription (no segments)") 154 | full_text = result['text'].strip() 155 | 156 | # split text into reasonable chunks for subtitle display 157 | sentences = self._split_text_into_sentences(full_text) 158 | 159 | # estimate total duration based on word count 160 | words = len(full_text.split()) 161 | estimated_total_duration = max(5.0, words / 2.5) # 2.5 words per second 162 | 163 | current_time = 0.0 164 | for i, sentence in enumerate(sentences): 165 | sentence_words = len(sentence.split()) 166 | sentence_duration = max(1.0, sentence_words / 2.5) 167 | 168 | segments.append(TranscriptionSegment( 169 | start=current_time, 170 | end=current_time + sentence_duration, 171 | text=sentence, 172 | confidence=0.8 173 | )) 174 | 175 | current_time += sentence_duration 176 | logger.debug(f"Text segment {i}: {sentence_words} words, {sentence_duration:.2f}s") 177 | 178 | # calculate overall confidence 179 | if segments: 180 | overall_confidence = sum(seg.confidence for seg in segments) / len(segments) 181 | else: 182 | overall_confidence = 0.0 183 | 184 | # clean the final text 185 | final_text = 
    # conservative heuristic for catching only the most obvious hallucinations
    def _is_likely_hallucination(self, text: str, expected_language: Optional[str] = None) -> bool:
        """Detect potential hallucinations using very conservative heuristics"""
        if not text or len(text.strip()) < 2:
            return False

        text = text.strip().lower()

        # only catch the most obvious hallucinations - be very conservative
        hallucination_indicators = [
            # single meaningless characters or very short nonsense
            len(text) <= 2 and not text.isalnum(),
            # only flag extremely obvious repeated patterns
            len(text) > 20 and len(set(text.replace(' ', ''))) == 1,
        ]

        return any(hallucination_indicators)

    # helper for spotting mixed-language output (not referenced elsewhere in this file)
    def _contains_multiple_scripts(self, text: str) -> bool:
        """Check if text contains multiple writing scripts (indicating mixed languages)"""
        # simple check for mixed scripts - only flag obvious cases
        has_latin = any('a' <= c <= 'z' or 'A' <= c <= 'Z' for c in text)
        has_chinese = any('\u4e00' <= c <= '\u9fff' for c in text)
        has_arabic = any('\u0600' <= c <= '\u06ff' for c in text)
        has_cyrillic = any('\u0400' <= c <= '\u04ff' for c in text)

        script_count = sum([has_latin, has_chinese, has_arabic, has_cyrillic])
        return script_count > 1

    def _clean_transcription_text(self, text: str) -> str:
        """Clean transcription text of common artifacts"""
        if not text:
            return ""

        # collapse runs of whitespace and trim the ends
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def _split_text_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences for better subtitle timing"""
        # split on sentence endings
        sentences = re.split(r'[.!?]+', text)

        cleaned_sentences = []
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence and len(sentence) > 3:  # minimum sentence length
                cleaned_sentences.append(sentence)

        # fallback: split on commas if no sentences were found
        if not cleaned_sentences:
            if ',' in text:
                parts = text.split(',')
                cleaned_sentences = [part.strip() for part in parts if part.strip()]
            else:
                # last resort: split into fixed-size chunks of words
                words = text.split()
                chunk_size = 10
                for i in range(0, len(words), chunk_size):
                    chunk = ' '.join(words[i:i + chunk_size])
                    if chunk.strip():
                        cleaned_sentences.append(chunk.strip())

        return cleaned_sentences or [text]
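    # --- Illustrative note (not used by the service) ------------------------------
    # A hedged sketch of what _split_text_into_sentences produces; the sample
    # string and expected output are illustrative only:
    @staticmethod
    def _example_sentence_splitting():
        sample = "Hello there. How are you? Fine!"
        parts = [p.strip() for p in re.split(r'[.!?]+', sample) if len(p.strip()) > 3]
        print(parts)  # ['Hello there', 'How are you', 'Fine']
        # A string with no sentence-ending punctuation ("one, two, three") is kept
        # whole, and very short inputs fall through to the word-chunk fallback.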
    async def detect_language(self, audio_path: str) -> str:
        """Detect the language of the audio file when the user does not specify one"""
        try:
            logger.info(f"Detecting language for audio: {audio_path}")

            # transcribe a short sample for language detection
            transcription_result = await self._transcribe_sample(audio_path, max_duration=30)

            if transcription_result.detected_language:
                detected_lang = get_language_code(transcription_result.detected_language)
                logger.info(f"Language detected: {detected_lang}")
                return detected_lang

            logger.warning("No language detected, defaulting to English")
            return "en"

        except Exception as e:
            logger.error(f"Error detecting language: {str(e)}")
            return "en"

    # transcribe a short sample so language detection stays cheap
    async def _transcribe_sample(self, audio_path: str, max_duration: int = 30) -> TranscriptionResult:
        """Transcribe a short sample for language detection"""
        import subprocess

        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            temp_path = temp_file.name

        try:
            # use ffmpeg to extract a short mono 16 kHz sample
            cmd = [
                'ffmpeg', '-i', audio_path,
                '-ss', '0', '-t', str(max_duration),
                '-ar', '16000', '-ac', '1',
                '-y', temp_path
            ]

            subprocess.run(cmd, capture_output=True, check=True)

            return await self.transcribe_audio(temp_path)

        except Exception as e:
            logger.error(f"Error transcribing sample: {str(e)}")
            raise
        finally:
            # remove the sample file even if transcription fails
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    async def get_supported_languages(self) -> List[str]:
        """Get the list of supported language codes"""
        return [
            "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh",
            "ar", "hi", "th", "vi", "nl", "sv", "no", "da", "fi", "pl",
            "tr", "cs", "hu", "ro", "bg", "hr", "sk", "sl", "et", "lv",
            "lt", "uk", "he", "fa", "ur", "bn", "ta", "te", "ml", "kn",
            "gu", "mr", "ne", "si", "my", "km", "lo", "ka", "am", "sw",
            "zu", "af", "ms", "tl", "id"
        ]

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the current model"""
        model_info = {
            "whisper-large-v3": {
                "name": "Whisper Large v3",
                "description": "Most accurate, supports 99+ languages",
                "speed": "Medium",
                "accuracy": "Highest"
            },
            "whisper-large-v3-turbo": {
                "name": "Whisper Large v3 Turbo",
                "description": "Faster version of Large v3",
                "speed": "Fast",
                "accuracy": "High"
            },
            "distil-whisper-large-v3-en": {
                "name": "Distil-Whisper Large v3 EN",
                "description": "English-only, very fast",
                "speed": "Very Fast",
                "accuracy": "High (English only)"
            }
        }

        return model_info.get(self.model, {
            "name": self.model,
            "description": "Unknown model",
            "speed": "Unknown",
            "accuracy": "Unknown"
        })
--------------------------------------------------------------------------------
/backend/services/subtitle_service.py:
--------------------------------------------------------------------------------
import os
import asyncio
import logging
from typing import List, Optional
import pysrt
import webvtt
from datetime import timedelta

from models.requests import TranscriptionSegment, TranscriptionResult, 
SubtitleEntry 10 | from utils.config import get_settings 11 | from utils.file_utils import ensure_directory_exists, get_safe_filename 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class SubtitleService: 16 | """ 17 | create subtitle files from transcription segments 18 | """ 19 | 20 | def __init__(self): 21 | self.settings = get_settings() 22 | 23 | async def create_subtitle_file(self, transcription_result: TranscriptionResult, 24 | job_id: str, format: str = "srt") -> str: 25 | """ 26 | create a subtitle file from transcription result 27 | """ 28 | try: 29 | logger.info(f"Creating {format.upper()} subtitle file for job {job_id}") 30 | # optimize segments for subtitle display 31 | optimized_segments = self._optimize_segments_for_subtitles( 32 | transcription_result.segments 33 | ) 34 | 35 | # generate subtitle file based on format 36 | if format.lower() == "srt": 37 | subtitle_path = await self._create_srt_file(optimized_segments, job_id) 38 | elif format.lower() == "vtt": 39 | subtitle_path = await self._create_vtt_file(optimized_segments, job_id) 40 | else: 41 | raise ValueError(f"Unsupported subtitle format: {format}") 42 | 43 | logger.info(f"Subtitle file created: {subtitle_path}") 44 | return subtitle_path 45 | 46 | except Exception as e: 47 | logger.error(f"Error creating subtitle file: {str(e)}") 48 | raise Exception(f"Subtitle creation failed: {str(e)}") 49 | 50 | async def _create_srt_file(self, segments: List[TranscriptionSegment], job_id: str) -> str: 51 | """ 52 | create SRT format subtitle file 53 | """ 54 | try: 55 | # generate output path 56 | subtitle_path = get_safe_filename( 57 | f"subtitles_{job_id}", 58 | "srt", 59 | f"{self.settings.temp_dir}/subtitles" 60 | ) 61 | 62 | # create SRT subtitle file 63 | srt_subs = pysrt.SubRipFile() 64 | 65 | for i, segment in enumerate(segments, 1): 66 | # convert seconds to SRT time format 67 | start_time = self._seconds_to_srt_time(segment.start) 68 | end_time = self._seconds_to_srt_time(segment.end) 69 | 70 | # create subtitle item 71 | subtitle_item = pysrt.SubRipItem( 72 | index=i, 73 | start=start_time, 74 | end=end_time, 75 | text=segment.text 76 | ) 77 | 78 | srt_subs.append(subtitle_item) 79 | 80 | # save to file 81 | srt_subs.save(subtitle_path, encoding='utf-8') 82 | 83 | return subtitle_path 84 | 85 | except Exception as e: 86 | logger.error(f"Error creating SRT file: {str(e)}") 87 | raise 88 | 89 | async def _create_vtt_file(self, segments: List[TranscriptionSegment], job_id: str) -> str: 90 | """ 91 | Create VTT format subtitle file 92 | """ 93 | try: 94 | # generate output path 95 | subtitle_path = get_safe_filename( 96 | f"subtitles_{job_id}", 97 | "vtt", 98 | f"{self.settings.temp_dir}/subtitles" 99 | ) 100 | 101 | # create VTT subtitle file 102 | vtt_subs = webvtt.WebVTT() 103 | 104 | for segment in segments: 105 | # convert seconds to VTT time format 106 | start_time = self._seconds_to_vtt_time(segment.start) 107 | end_time = self._seconds_to_vtt_time(segment.end) 108 | 109 | # create caption 110 | caption = webvtt.Caption( 111 | start=start_time, 112 | end=end_time, 113 | text=segment.text 114 | ) 115 | 116 | vtt_subs.captions.append(caption) 117 | 118 | # save to file 119 | vtt_subs.save(subtitle_path) 120 | 121 | return subtitle_path 122 | 123 | except Exception as e: 124 | logger.error(f"Error creating VTT file: {str(e)}") 125 | raise 126 | 127 | def _optimize_segments_for_subtitles(self, segments: List[TranscriptionSegment]) -> List[TranscriptionSegment]: 128 | """ 129 | optimize segments for subtitle 
display 130 | """ 131 | optimized_segments = [] 132 | 133 | for segment in segments: 134 | # skip empty segments 135 | if not segment.text.strip(): 136 | continue 137 | 138 | duration = segment.end - segment.start 139 | 140 | min_duration = max(1.0, len(segment.text) * 0.05) # 50ms per character 141 | 142 | if duration < min_duration: 143 | segment.end = segment.start + min_duration 144 | 145 | max_duration = 7.0 146 | if duration > max_duration: 147 | # split long segments 148 | split_segments = self._split_long_segment(segment, max_duration) 149 | optimized_segments.extend(split_segments) 150 | else: 151 | optimized_segments.append(segment) 152 | 153 | # ensure no overlapping segments 154 | return self._remove_overlaps(optimized_segments) 155 | 156 | def _split_long_segment(self, segment: TranscriptionSegment, max_duration: float) -> List[TranscriptionSegment]: 157 | """ 158 | split a long segment into multiple shorter ones 159 | """ 160 | segments = [] 161 | text = segment.text 162 | words = text.split() 163 | 164 | if len(words) <= 1: 165 | return [segment] 166 | 167 | # split into roughly equal parts 168 | duration = segment.end - segment.start 169 | num_parts = int(duration / max_duration) + 1 170 | words_per_part = len(words) // num_parts 171 | 172 | current_start = segment.start 173 | for i in range(num_parts): 174 | start_idx = i * words_per_part 175 | end_idx = (i + 1) * words_per_part if i < num_parts - 1 else len(words) 176 | 177 | if start_idx >= len(words): 178 | break 179 | 180 | part_text = ' '.join(words[start_idx:end_idx]) 181 | part_duration = (duration / num_parts) 182 | part_end = current_start + part_duration 183 | 184 | segments.append(TranscriptionSegment( 185 | start=current_start, 186 | end=part_end, 187 | text=part_text, 188 | confidence=segment.confidence 189 | )) 190 | 191 | current_start = part_end 192 | 193 | return segments 194 | 195 | def _remove_overlaps(self, segments: List[TranscriptionSegment]) -> List[TranscriptionSegment]: 196 | """ 197 | remove overlapping segments 198 | """ 199 | if not segments: 200 | return segments 201 | 202 | # sort by start time 203 | segments.sort(key=lambda x: x.start) 204 | 205 | cleaned_segments = [] 206 | for segment in segments: 207 | if not cleaned_segments: 208 | cleaned_segments.append(segment) 209 | continue 210 | 211 | last_segment = cleaned_segments[-1] 212 | 213 | # check for overlap 214 | if segment.start < last_segment.end: 215 | # adjust start time to avoid overlap 216 | gap = 0.1 217 | segment.start = last_segment.end + gap 218 | 219 | # ensure end time is still after start time 220 | if segment.end <= segment.start: 221 | segment.end = segment.start + 1.0 222 | 223 | cleaned_segments.append(segment) 224 | 225 | return cleaned_segments 226 | 227 | def _seconds_to_srt_time(self, seconds: float) -> pysrt.SubRipTime: 228 | """ 229 | convert seconds to SRT time format 230 | """ 231 | hours = int(seconds // 3600) 232 | minutes = int((seconds % 3600) // 60) 233 | secs = int(seconds % 60) 234 | milliseconds = int((seconds - int(seconds)) * 1000) 235 | 236 | return pysrt.SubRipTime(hours, minutes, secs, milliseconds) 237 | 238 | def _seconds_to_vtt_time(self, seconds: float) -> str: 239 | """ 240 | convert seconds to VTT time format 241 | """ 242 | hours = int(seconds // 3600) 243 | minutes = int((seconds % 3600) // 60) 244 | secs = seconds % 60 245 | 246 | return f"{hours:02d}:{minutes:02d}:{secs:06.3f}" 247 | 248 | def validate_subtitle_file(self, file_path: str) -> bool: 249 | """ 250 | validate a subtitle file 251 
| """ 252 | try: 253 | if not os.path.exists(file_path): 254 | return False 255 | 256 | file_extension = os.path.splitext(file_path)[1].lower() 257 | 258 | if file_extension == '.srt': 259 | # try to parse SRT file 260 | subs = pysrt.open(file_path) 261 | return len(subs) > 0 262 | elif file_extension == '.vtt': 263 | # try to parse VTT file 264 | vtt = webvtt.read(file_path) 265 | return len(vtt.captions) > 0 266 | 267 | return False 268 | 269 | except Exception as e: 270 | logger.error(f"Error validating subtitle file: {str(e)}") 271 | return False 272 | 273 | def get_subtitle_stats(self, file_path: str) -> dict: 274 | """ 275 | get statistics about a subtitle file 276 | """ 277 | try: 278 | if not os.path.exists(file_path): 279 | return {} 280 | 281 | file_extension = os.path.splitext(file_path)[1].lower() 282 | 283 | if file_extension == '.srt': 284 | subs = pysrt.open(file_path) 285 | total_duration = 0 286 | total_chars = 0 287 | 288 | for sub in subs: 289 | duration = (sub.end - sub.start).total_seconds() 290 | total_duration += duration 291 | total_chars += len(sub.text) 292 | 293 | return { 294 | 'format': 'SRT', 295 | 'subtitle_count': len(subs), 296 | 'total_duration': total_duration, 297 | 'total_characters': total_chars, 298 | 'average_duration': total_duration / len(subs) if subs else 0, 299 | 'average_characters': total_chars / len(subs) if subs else 0 300 | } 301 | 302 | elif file_extension == '.vtt': 303 | vtt = webvtt.read(file_path) 304 | total_duration = 0 305 | total_chars = 0 306 | 307 | for caption in vtt.captions: 308 | start_seconds = self._vtt_time_to_seconds(caption.start) 309 | end_seconds = self._vtt_time_to_seconds(caption.end) 310 | duration = end_seconds - start_seconds 311 | total_duration += duration 312 | total_chars += len(caption.text) 313 | 314 | return { 315 | 'format': 'VTT', 316 | 'subtitle_count': len(vtt.captions), 317 | 'total_duration': total_duration, 318 | 'total_characters': total_chars, 319 | 'average_duration': total_duration / len(vtt.captions) if vtt.captions else 0, 320 | 'average_characters': total_chars / len(vtt.captions) if vtt.captions else 0 321 | } 322 | 323 | return {} 324 | 325 | except Exception as e: 326 | logger.error(f"error getting subtitle stats: {str(e)}") 327 | return {} 328 | 329 | def _vtt_time_to_seconds(self, vtt_time: str) -> float: 330 | """ 331 | convert VTT time format to seconds 332 | """ 333 | try: 334 | parts = vtt_time.split(':') 335 | hours = int(parts[0]) 336 | minutes = int(parts[1]) 337 | seconds = float(parts[2]) 338 | 339 | return hours * 3600 + minutes * 60 + seconds 340 | 341 | except Exception: 342 | return 0.0 343 | 344 | def convert_subtitle_format(self, input_path: str, output_format: str, job_id: str) -> str: 345 | """ 346 | convert subtitle file from one format to another 347 | """ 348 | try: 349 | input_extension = os.path.splitext(input_path)[1].lower() 350 | 351 | # generate output path 352 | output_path = get_safe_filename( 353 | f"converted_{job_id}", 354 | output_format, 355 | f"{self.settings.temp_dir}/subtitles" 356 | ) 357 | 358 | if input_extension == '.srt' and output_format == 'vtt': 359 | # convert SRT to VTT 360 | srt_subs = pysrt.open(input_path) 361 | vtt_subs = webvtt.WebVTT() 362 | 363 | for sub in srt_subs: 364 | start_time = self._srt_time_to_vtt_time(sub.start) 365 | end_time = self._srt_time_to_vtt_time(sub.end) 366 | 367 | caption = webvtt.Caption( 368 | start=start_time, 369 | end=end_time, 370 | text=sub.text 371 | ) 372 | 373 | vtt_subs.captions.append(caption) 374 | 
375 | vtt_subs.save(output_path) 376 | 377 | elif input_extension == '.vtt' and output_format == 'srt': 378 | # convert VTT to SRT 379 | vtt_subs = webvtt.read(input_path) 380 | srt_subs = pysrt.SubRipFile() 381 | 382 | for i, caption in enumerate(vtt_subs.captions, 1): 383 | start_time = self._vtt_time_to_srt_time(caption.start) 384 | end_time = self._vtt_time_to_srt_time(caption.end) 385 | 386 | subtitle_item = pysrt.SubRipItem( 387 | index=i, 388 | start=start_time, 389 | end=end_time, 390 | text=caption.text 391 | ) 392 | 393 | srt_subs.append(subtitle_item) 394 | 395 | srt_subs.save(output_path, encoding='utf-8') 396 | 397 | else: 398 | raise ValueError(f"unsupported conversion: {input_extension} to {output_format}") 399 | 400 | logger.info(f"subtitle converted from {input_extension} to {output_format}: {output_path}") 401 | return output_path 402 | 403 | except Exception as e: 404 | logger.error(f"error converting subtitle format: {str(e)}") 405 | raise 406 | 407 | def _srt_time_to_vtt_time(self, srt_time: pysrt.SubRipTime) -> str: 408 | """ 409 | convert SRT time to VTT time format 410 | """ 411 | total_seconds = srt_time.hours * 3600 + srt_time.minutes * 60 + srt_time.seconds + srt_time.milliseconds / 1000 412 | return self._seconds_to_vtt_time(total_seconds) 413 | 414 | def _vtt_time_to_srt_time(self, vtt_time: str) -> pysrt.SubRipTime: 415 | """ 416 | convert VTT time to SRT time format 417 | """ 418 | seconds = self._vtt_time_to_seconds(vtt_time) 419 | return self._seconds_to_srt_time(seconds) 420 | 421 | async def generate_srt_content(self, transcription_result: TranscriptionResult) -> str: 422 | """ 423 | generate SRT subtitle content as a string without saving to disk 424 | """ 425 | try: 426 | logger.info("generating SRT content in memory") 427 | 428 | optimized_segments = self._optimize_segments_for_subtitles( 429 | transcription_result.segments 430 | ) 431 | 432 | srt_lines = [] 433 | 434 | for i, segment in enumerate(optimized_segments, 1): 435 | start_time = self._seconds_to_srt_time_string(segment.start) 436 | end_time = self._seconds_to_srt_time_string(segment.end) 437 | 438 | srt_lines.append(str(i)) 439 | srt_lines.append(f"{start_time} --> {end_time}") 440 | srt_lines.append(segment.text) 441 | srt_lines.append("") 442 | 443 | srt_content = "\n".join(srt_lines) 444 | logger.info(f"generated SRT content with {len(optimized_segments)} segments") 445 | 446 | return srt_content 447 | 448 | except Exception as e: 449 | logger.error(f"error generating SRT content: {str(e)}") 450 | raise Exception(f"SRT generation failed: {str(e)}") 451 | 452 | def _seconds_to_srt_time_string(self, seconds: float) -> str: 453 | """ 454 | convert seconds to SRT time format string 455 | """ 456 | hours = int(seconds // 3600) 457 | minutes = int((seconds % 3600) // 60) 458 | secs = int(seconds % 60) 459 | milliseconds = int((seconds - int(seconds)) * 1000) 460 | 461 | return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}" -------------------------------------------------------------------------------- /app/page.tsx: -------------------------------------------------------------------------------- 1 | 'use client' 2 | import React, { useState, useCallback, useRef, memo } from 'react' 3 | import { Upload, Play, Download, Globe, Clock, FileText, CheckCircle, AlertCircle, Eye } from 'lucide-react' 4 | 5 | //types for managing the status and file info 6 | interface JobStatus { 7 | job_id: string 8 | status: string 9 | progress: number 10 | message: string 11 | output_path?: string 12 | 
subtitle_path?: string 13 | } 14 | 15 | interface FileInfo { 16 | filename: string 17 | size: number 18 | duration?: number 19 | format?: string 20 | resolution?: string 21 | fps?: number 22 | } 23 | 24 | interface FileInfoCardProps { 25 | icon: React.ComponentType<{ className?: string }> 26 | iconColor: string 27 | label: string 28 | value: string 29 | title?: string 30 | } 31 | 32 | interface FileInfoSectionProps { 33 | fileInfo: FileInfo 34 | title: string 35 | icon: React.ComponentType<{ className?: string }> 36 | gradientFrom: string 37 | gradientTo: string 38 | borderColor: string 39 | iconColor: string 40 | formatFileSize: (bytes: number) => string 41 | formatDuration: (seconds: number) => string 42 | getFileExtension: (filename: string) => string 43 | getAspectRatio: (resolution: string) => string 44 | getEstimatedBitrate: (size: number, duration: number) => string 45 | } 46 | 47 | 48 | //list of supported languages by qwen3-32b. if you change the model, please update this list. 49 | const SUPPORTED_LANGUAGES = { 50 | 'en': 'English', 51 | 'es': 'Spanish', 52 | 'fr': 'French', 53 | 'de': 'German', 54 | 'it': 'Italian', 55 | 'pt': 'Portuguese', 56 | 'ru': 'Russian', 57 | 'ja': 'Japanese', 58 | 'ko': 'Korean', 59 | 'zh': 'Chinese', 60 | 'ar': 'Arabic', 61 | 'hi': 'Hindi', 62 | 'th': 'Thai', 63 | 'vi': 'Vietnamese', 64 | 'nl': 'Dutch', 65 | 'sv': 'Swedish', 66 | 'no': 'Norwegian', 67 | 'da': 'Danish', 68 | 'fi': 'Finnish', 69 | 'pl': 'Polish', 70 | 'tr': 'Turkish', 71 | 'cs': 'Czech', 72 | 'hu': 'Hungarian', 73 | 'ro': 'Romanian', 74 | 'bg': 'Bulgarian', 75 | 'hr': 'Croatian', 76 | 'sk': 'Slovak', 77 | 'sl': 'Slovenian', 78 | 'et': 'Estonian', 79 | 'lv': 'Latvian', 80 | 'lt': 'Lithuanian', 81 | 'mt': 'Maltese', 82 | 'ga': 'Irish', 83 | 'cy': 'Welsh', 84 | 'eu': 'Basque', 85 | 'ca': 'Catalan', 86 | 'gl': 'Galician', 87 | 'is': 'Icelandic', 88 | 'mk': 'Macedonian', 89 | 'sq': 'Albanian', 90 | 'be': 'Belarusian', 91 | 'uk': 'Ukrainian', 92 | 'he': 'Hebrew', 93 | 'fa': 'Persian', 94 | 'ur': 'Urdu', 95 | 'bn': 'Bengali', 96 | 'ta': 'Tamil', 97 | 'te': 'Telugu', 98 | 'ml': 'Malayalam', 99 | 'kn': 'Kannada', 100 | 'gu': 'Gujarati', 101 | 'mr': 'Marathi', 102 | 'ne': 'Nepali', 103 | 'si': 'Sinhala', 104 | 'my': 'Burmese', 105 | 'km': 'Khmer', 106 | 'lo': 'Lao', 107 | 'ka': 'Georgian', 108 | 'am': 'Amharic', 109 | 'sw': 'Swahili', 110 | 'zu': 'Zulu', 111 | 'af': 'Afrikaans', 112 | 'ms': 'Malay', 113 | 'tl': 'Filipino', 114 | 'id': 'Indonesian' 115 | } 116 | 117 | //info card component 118 | const FileInfoCard = memo
/* [Truncated in this dump: FileInfoCard is cut off mid-definition, and the rest of
   page.tsx (the FileInfoCard/FileInfoSection components and the main page markup,
   original lines ~118-814) survives only as stripped JSX text. Recoverable UI copy,
   in order of appearance:
   "Lightning-fast AI-powered multilingual subtitles." /
   "Powered by Groq." /
   a file-size readout rendered via {formatFileSize(selectedFile.size)} /
   "Drop your video file here or click to browse" /
   "Supports MP4, MOV or AVI up to 25MB" /
   "Keep videos under 5-10 minutes for optimal performance" /
   "Please review the transcription below and make any necessary corrections before continuing." /
   "Detected Language: {transcription.detected_language}"] */
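--------------------------------------------------------------------------------
# --- Illustrative end-to-end client (not part of the repo) -----------------------
# A hedged Python sketch tying the backend endpoints together: upload, start
# processing, poll until the review checkpoint, accept the machine transcription
# unchanged, then download. Host, output filename, and poll interval are
# assumptions for the example; `requests` is a third-party package.
import time
import requests

BASE = "http://localhost:8000"  # assumed backend address

def _wait_for(job_id: str, wanted: str) -> None:
    while True:
        status = requests.get(f"{BASE}/status/{job_id}").json()["status"]
        if status == wanted:
            return
        if status == "failed":
            raise RuntimeError(f"job {job_id} failed")
        time.sleep(2)

def run_pipeline(video_path: str, target_language: str = "es") -> str:
    with open(video_path, "rb") as f:
        job_id = requests.post(
            f"{BASE}/upload",
            files={"file": (video_path, f, "video/mp4")},
        ).json()["job_id"]

    # target_language is a form field, not JSON
    requests.post(f"{BASE}/process/{job_id}", data={"target_language": target_language})

    _wait_for(job_id, "transcription_complete")

    # accept the machine transcription as-is and continue
    transcription = requests.get(f"{BASE}/transcription/{job_id}").json()["transcription"]
    requests.post(f"{BASE}/transcription/{job_id}/continue", json=transcription)

    _wait_for(job_id, "completed")

    out_path = "subtitled.mp4"
    with open(out_path, "wb") as out:
        out.write(requests.get(f"{BASE}/download/{job_id}").content)
    return out_path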