├── .gitattributes ├── .gitignore ├── GEMINI_INSIGHTS.md ├── INSTALLATION.md ├── LICENSE ├── README.md ├── app.py ├── install.bat ├── install.py ├── install.sh ├── requirements.txt └── utils ├── audio_processing.py ├── cache.py ├── diarization.py ├── export.py ├── gpu_utils.py ├── keyword_extraction.py ├── ollama_integration.py ├── summarization.py ├── transcription.py ├── translation.py └── validation.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python virtual environment 2 | venv/ 3 | __pycache__/ 4 | *.pyc 5 | 6 | # IDE files 7 | .vscode/ 8 | .idea/ 9 | 10 | # OS files 11 | .env 12 | .DS_Store 13 | Thumbs.db 14 | -------------------------------------------------------------------------------- /GEMINI_INSIGHTS.md: -------------------------------------------------------------------------------- 1 | # Gemini Insights: OBS Recording Transcriber 2 | 3 | ## Project Overview 4 | The OBS Recording Transcriber is a Python application built with Streamlit that processes video recordings (particularly from OBS Studio) to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization. 5 | 6 | ## Key Improvement Areas 7 | 8 | ### 1. UI Enhancements 9 | - **Implemented:** 10 | - Responsive layout with columns for better organization 11 | - Expanded sidebar with categorized settings 12 | - Custom CSS for improved button styling 13 | - Spinner for long-running operations 14 | - Expanded transcript view by default 15 | 16 | - **Additional Recommendations:** 17 | - Add a dark mode toggle 18 | - Implement progress bars for each processing step 19 | - Add tooltips for complex options 20 | - Create a dashboard view for batch processing results 21 | - Add visualization of transcript segments with timestamps 22 | 23 | ### 2. Ollama Local API Integration 24 | - **Implemented:** 25 | - Local API integration for offline summarization 26 | - Model selection from available Ollama models 27 | - Chunking for long texts 28 | - Fallback to online models when Ollama fails 29 | 30 | - **Additional Recommendations:** 31 | - Add temperature and other generation parameters as advanced options 32 | - Implement streaming responses for real-time feedback 33 | - Cache results to avoid reprocessing 34 | - Add support for custom Ollama model creation with specific instructions 35 | - Implement parallel processing for multiple chunks 36 | 37 | ### 3. Subtitle Export Formats 38 | - **Implemented:** 39 | - SRT export with proper formatting 40 | - ASS export with basic styling 41 | - Multi-format export options 42 | - Automatic segment creation from plain text 43 | 44 | - **Additional Recommendations:** 45 | - Add customizable styling options for ASS subtitles 46 | - Implement subtitle editing before export 47 | - Add support for VTT format for web videos 48 | - Implement subtitle timing adjustment 49 | - Add batch export for multiple files 50 | 51 | ### 4. 
Architecture and Code Quality 52 | - **Recommendations:** 53 | - Implement proper error handling and logging throughout 54 | - Add unit tests for critical components 55 | - Create a configuration file for default settings 56 | - Implement caching for processed files 57 | - Add type hints throughout the codebase 58 | - Document API endpoints for potential future web service 59 | 60 | ### 5. Performance Optimizations 61 | - **Recommendations:** 62 | - Implement parallel processing for batch operations 63 | - Add GPU acceleration configuration options 64 | - Optimize memory usage for large files 65 | - Implement incremental processing for very long recordings 66 | - Add compression options for exported files 67 | 68 | ### 6. Additional Features 69 | - **Recommendations:** 70 | - Speaker diarization (identifying different speakers) 71 | - Language detection and translation 72 | - Keyword extraction and timestamp linking 73 | - Integration with video editing software 74 | - Batch processing queue with email notifications 75 | - Custom vocabulary for domain-specific terminology 76 | 77 | ## Implementation Roadmap 78 | 1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export 79 | 2. **Phase 2 (Completed):** Performance optimizations and additional export formats 80 | - Added WebVTT export format for web videos 81 | - Implemented GPU acceleration with automatic device selection 82 | - Added caching system for faster processing of previously transcribed files 83 | - Optimized memory usage with configurable memory limits 84 | - Added compression options for exported files 85 | - Enhanced ASS subtitle styling options 86 | - Added progress indicators for better user feedback 87 | 3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation 88 | - Implemented speaker diarization to identify different speakers in recordings 89 | - Added language detection and translation capabilities 90 | - Integrated keyword extraction with timestamp linking 91 | - Created interactive transcript with keyword highlighting 92 | - Added named entity recognition for better content analysis 93 | - Generated keyword index with timestamp references 94 | - Provided speaker statistics and word count analysis 95 | 4. **Phase 4:** Integration with other tools and services 96 | 97 | ## Technical Considerations 98 | - Ensure compatibility with different Whisper model sizes 99 | - Handle large files efficiently to prevent memory issues 100 | - Provide graceful degradation when optional dependencies are missing 101 | - Maintain backward compatibility with existing workflows 102 | - Consider containerization for easier deployment 103 | 104 | ## Conclusion 105 | The OBS Recording Transcriber has a solid foundation but can be significantly enhanced with the suggested improvements. The focus should be on improving user experience, adding offline processing capabilities, and expanding export options to make the tool more versatile for different use cases. -------------------------------------------------------------------------------- /INSTALLATION.md: -------------------------------------------------------------------------------- 1 | # Installation Guide for OBS Recording Transcriber 2 | 3 | This guide will help you install all the necessary dependencies for the OBS Recording Transcriber application, including the advanced features from Phase 3. 4 | 5 | ## Prerequisites 6 | 7 | Before installing the Python packages, you need to set up some prerequisites: 8 | 9 | ### 1. 
Python 3.8 or higher 10 | 11 | Make sure you have Python 3.8 or higher installed. You can download it from [python.org](https://www.python.org/downloads/). 12 | 13 | ### 2. FFmpeg 14 | 15 | FFmpeg is required for audio processing: 16 | 17 | - **Windows**: 18 | - Download from [gyan.dev/ffmpeg/builds](https://www.gyan.dev/ffmpeg/builds/) 19 | - Extract the ZIP file 20 | - Add the `bin` folder to your system PATH 21 | 22 | - **macOS**: 23 | ```bash 24 | brew install ffmpeg 25 | ``` 26 | 27 | - **Linux**: 28 | ```bash 29 | sudo apt update 30 | sudo apt install ffmpeg 31 | ``` 32 | 33 | ### 3. Visual C++ Build Tools (Windows only) 34 | 35 | Some packages like `tokenizers` require C++ build tools: 36 | 37 | 1. Download and install [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) 38 | 2. During installation, select "Desktop development with C++" 39 | 40 | ## Installation Steps 41 | 42 | ### 1. Create a Virtual Environment (Recommended) 43 | 44 | ```bash 45 | # Create a virtual environment 46 | python -m venv venv 47 | 48 | # Activate the virtual environment 49 | # Windows 50 | venv\Scripts\activate 51 | # macOS/Linux 52 | source venv/bin/activate 53 | ``` 54 | 55 | ### 2. Install PyTorch 56 | 57 | For better performance, install PyTorch with CUDA support if you have an NVIDIA GPU: 58 | 59 | ```bash 60 | # Windows/Linux with CUDA 61 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 62 | 63 | # macOS or CPU-only 64 | pip install torch torchvision torchaudio 65 | ``` 66 | 67 | ### 3. Install Dependencies 68 | 69 | ```bash 70 | # Install all dependencies from requirements.txt 71 | pip install -r requirements.txt 72 | ``` 73 | 74 | ### 4. Troubleshooting Common Issues 75 | 76 | #### Tokenizers Installation Issues 77 | 78 | If you encounter issues with `tokenizers` installation: 79 | 80 | 1. Make sure you have Visual C++ Build Tools installed (Windows) 81 | 2. Try installing Rust: [rustup.rs](https://rustup.rs/) 82 | 3. Install tokenizers separately: 83 | ```bash 84 | pip install tokenizers --no-binary tokenizers 85 | ``` 86 | 87 | #### PyAnnote.Audio Access 88 | 89 | To use speaker diarization, you need a HuggingFace token with access to the pyannote models: 90 | 91 | 1. Create an account on [HuggingFace](https://huggingface.co/) 92 | 2. Generate an access token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) 93 | 3. Request access to [pyannote/speaker-diarization-3.0](https://huggingface.co/pyannote/speaker-diarization-3.0) 94 | 4. Set the token in the application when prompted or as an environment variable: 95 | ```bash 96 | # Windows 97 | set HF_TOKEN=your_token_here 98 | # macOS/Linux 99 | export HF_TOKEN=your_token_here 100 | ``` 101 | 102 | #### Memory Issues with Large Files 103 | 104 | If you encounter memory issues with large files: 105 | 106 | 1. Use a smaller Whisper model (e.g., "base" instead of "large") 107 | 2. Reduce the GPU memory fraction in the application settings 108 | 3. Increase your system's swap space/virtual memory 109 | 110 | ## Running the Application 111 | 112 | After installation, run the application with: 113 | 114 | ```bash 115 | streamlit run app.py 116 | ``` 117 | 118 | ## Optional: Ollama Setup for Local Summarization 119 | 120 | To use Ollama for local summarization: 121 | 122 | 1. Install Ollama from [ollama.ai](https://ollama.ai/) 123 | 2. Pull a model: 124 | ```bash 125 | ollama pull llama3 126 | ``` 127 | 3. 
Uncomment the Ollama line in requirements.txt and install: 128 | ```bash 129 | pip install ollama 130 | ``` 131 | 132 | ## Verifying Installation 133 | 134 | To verify that all components are working correctly: 135 | 136 | 1. Run the application 137 | 2. Check that GPU acceleration is available (if applicable) 138 | 3. Test a small video file with basic transcription 139 | 4. Gradually enable advanced features like diarization and translation 140 | 141 | If you encounter any issues, check the application logs for specific error messages. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 DataAnts-AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Video Transcriber 2 | 3 | ## Project Overview 4 | The video Recording Transcriber is a Python application built with Streamlit that processes video recordings (particularly from OBS Studio) to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization. 5 | 6 | 7 | ![SuiteQL_query_UI-1-Thumbnail](https://github.com/user-attachments/assets/72aaf238-6615-4739-b77f-c4eb9ff96996) 8 | 9 | Demo here 10 | 11 | https://github.com/user-attachments/assets/990e63fc-232e-46a0-afdf-ca8836d46a13 12 | 13 | 14 | ## Installation 15 | 16 | ### Easy Installation (Recommended) 17 | 18 | #### Windows 19 | 1. Download or clone the repository 20 | 2. Run `install.bat` by double-clicking it 21 | 3. Follow the on-screen instructions 22 | 23 | #### Linux/macOS 24 | 1. Download or clone the repository 25 | 2. Open a terminal in the project directory 26 | 3. Make the install script executable: `chmod +x install.sh` 27 | 4. Run the script: `./install.sh` 28 | 5. Follow the on-screen instructions 29 | 30 | ### Manual Installation 31 | 1. Clone the repo. 32 | ``` 33 | git clone https://github.com/DataAnts-AI/VideoTranscriber.git 34 | cd VideoTranscriber 35 | ``` 36 | 37 | 2. Install dependencies: 38 | ``` 39 | pip install -r requirements.txt 40 | ``` 41 | 42 | Notes: 43 | - Ensure that the versions align with the features you use and your system compatibility. 
44 | - torch version should match the capabilities of your hardware (e.g., CUDA support for GPUs). 45 | - For advanced features like speaker diarization, you'll need a HuggingFace token. 46 | - See `INSTALLATION.md` for detailed instructions and troubleshooting. 47 | 48 | 3. Run the application: 49 | ``` 50 | streamlit run app.py 51 | ``` 52 | 53 | ## Usage 54 | 1. Set your base folder where OBS recordings are stored 55 | 2. Select a recording from the dropdown 56 | 3. Choose transcription and summarization models 57 | 4. Configure performance settings (GPU acceleration, caching) 58 | 5. Select export formats and compression options 59 | 6. Click "Process Recording" to start 60 | 61 | ## Advanced Features 62 | - **Speaker Diarization**: Identify and label different speakers in your recordings 63 | - **Translation**: Automatically detect language and translate to multiple languages 64 | - **Keyword Extraction**: Extract important keywords with timestamp links 65 | - **Interactive Transcript**: Navigate through the transcript with keyword highlighting 66 | - **GPU Acceleration**: Utilize your GPU for faster processing 67 | - **Caching**: Save processing time by caching results 68 | 69 | 70 | 71 | ## Key Improvement Areas 72 | 73 | ### 1. UI Enhancements 74 | - **Implemented:** 75 | - Responsive layout with columns for better organization 76 | - Expanded sidebar with categorized settings 77 | - Custom CSS for improved button styling 78 | - Spinner for long-running operations 79 | - Expanded transcript view by default 80 | 81 | - **Additional Recommendations:** 82 | - Add a dark mode toggle 83 | - Implement progress bars for each processing step 84 | - Add tooltips for complex options 85 | - Create a dashboard view for batch processing results 86 | - Add visualization of transcript segments with timestamps 87 | 88 | ### 2. Ollama Local API Integration 89 | - **Implemented:** 90 | - Local API integration for offline summarization 91 | - Model selection from available Ollama models 92 | - Chunking for long texts 93 | - Fallback to online models when Ollama fails 94 | 95 | - **Additional Recommendations:** 96 | - Add temperature and other generation parameters as advanced options 97 | - Implement streaming responses for real-time feedback 98 | - Cache results to avoid reprocessing 99 | - Add support for custom Ollama model creation with specific instructions 100 | - Implement parallel processing for multiple chunks 101 | 102 | ### 3. Subtitle Export Formats 103 | - **Implemented:** 104 | - SRT export with proper formatting 105 | - ASS export with basic styling 106 | - Multi-format export options 107 | - Automatic segment creation from plain text 108 | 109 | - **Additional Recommendations:** 110 | - Add customizable styling options for ASS subtitles 111 | - Implement subtitle editing before export 112 | - Add support for VTT format for web videos 113 | - Implement subtitle timing adjustment 114 | - Add batch export for multiple files 115 | 116 | ### 4. Architecture and Code Quality 117 | - **Recommendations:** 118 | - Implement proper error handling and logging throughout 119 | - Add unit tests for critical components 120 | - Create a configuration file for default settings 121 | - Implement caching for processed files 122 | - Add type hints throughout the codebase 123 | - Document API endpoints for potential future web service 124 | 125 | ### 5. 
Performance Optimizations 126 | - **Recommendations:** 127 | - Implement parallel processing for batch operations 128 | - Add GPU acceleration configuration options 129 | - Optimize memory usage for large files 130 | - Implement incremental processing for very long recordings 131 | - Add compression options for exported files 132 | 133 | ### 6. Additional Features 134 | - **Recommendations:** 135 | - Speaker diarization (identifying different speakers) 136 | - Language detection and translation 137 | - Keyword extraction and timestamp linking 138 | - Integration with video editing software 139 | - Batch processing queue with email notifications 140 | - Custom vocabulary for domain-specific terminology 141 | 142 | ## Implementation Roadmap 143 | 1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export 144 | 2. **Phase 2 (Completed):** Performance optimizations and additional export formats 145 | - Added WebVTT export format for web videos 146 | - Implemented GPU acceleration with automatic device selection 147 | - Added caching system for faster processing of previously transcribed files 148 | - Optimized memory usage with configurable memory limits 149 | - Added compression options for exported files 150 | - Enhanced ASS subtitle styling options 151 | - Added progress indicators for better user feedback 152 | 3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation 153 | - Implemented speaker diarization to identify different speakers in recordings 154 | - Added language detection and translation capabilities 155 | - Integrated keyword extraction with timestamp linking 156 | - Created interactive transcript with keyword highlighting 157 | - Added named entity recognition for better content analysis 158 | - Generated keyword index with timestamp references 159 | - Provided speaker statistics and word count analysis 160 | 4. **Phase 4:** Integration with other tools and services (In progress) 161 | 162 | 163 | Reach out to support@dataants.org if you need assistance with any AI solutions - we offer support for n8n workflows, local RAG chatbots, and ERP and financial reporting.
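
## Programmatic Usage (Sketch)

The Streamlit app in `app.py` drives the `utils` modules, and the same functions can be called from a short Python script. The sketch below mirrors the calls made in `app.py`; the recording path is a placeholder and the keyword arguments shown are simply the ones the app passes, so treat it as a starting point rather than a guaranteed API.

```python
from pathlib import Path

from utils.transcription import transcribe_audio
from utils.summarization import summarize_text
from utils.export import export_transcript

# Hypothetical recording path - point this at one of your own OBS files.
recording = Path.home() / "Videos" / "example.mp4"

# Transcribe the recording: returns timed segments and the full transcript text.
segments, transcript = transcribe_audio(
    recording,
    model="base",          # Whisper model size: tiny/base/small/medium/large
    use_cache=True,        # reuse cached results for files processed before
    use_gpu=True,          # request GPU acceleration when available
    memory_fraction=0.8,   # cap on GPU memory usage, as exposed in the app
)

# Summarize the transcript with the Hugging Face pipeline.
summary = summarize_text(transcript, use_gpu=True, memory_fraction=0.8)
print(summary)

# Export an SRT subtitle file; the function returns the path it wrote.
srt_path = export_transcript(
    transcript,
    recording.stem,
    "srt",
    segments=segments,
    compress=False,
    compression_type=None,
)
print(f"Subtitles written to {srt_path}")
```

Run the script from the repository root so the `utils` package is importable.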
164 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from utils.audio_processing import extract_audio 3 | from utils.transcription import transcribe_audio 4 | from utils.summarization import summarize_text 5 | from utils.validation import validate_environment 6 | from utils.export import export_transcript 7 | from pathlib import Path 8 | import os 9 | import logging 10 | import humanize 11 | from datetime import timedelta 12 | 13 | # Configure logging 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | # Try to import Ollama integration, but don't fail if it's not available 18 | try: 19 | from utils.ollama_integration import check_ollama_available, list_available_models, chunk_and_summarize 20 | OLLAMA_AVAILABLE = check_ollama_available() 21 | except ImportError: 22 | OLLAMA_AVAILABLE = False 23 | 24 | # Try to import GPU utilities, but don't fail if not available 25 | try: 26 | from utils.gpu_utils import get_gpu_info, configure_gpu 27 | GPU_UTILS_AVAILABLE = True 28 | except ImportError: 29 | GPU_UTILS_AVAILABLE = False 30 | 31 | # Try to import caching utilities, but don't fail if not available 32 | try: 33 | from utils.cache import get_cache_size, clear_cache 34 | CACHE_AVAILABLE = True 35 | except ImportError: 36 | CACHE_AVAILABLE = False 37 | 38 | # Try to import diarization utilities, but don't fail if not available 39 | try: 40 | from utils.diarization import transcribe_with_diarization 41 | DIARIZATION_AVAILABLE = True 42 | except ImportError: 43 | DIARIZATION_AVAILABLE = False 44 | 45 | # Try to import translation utilities, but don't fail if not available 46 | try: 47 | from utils.translation import transcribe_and_translate, get_language_name 48 | TRANSLATION_AVAILABLE = True 49 | except ImportError: 50 | TRANSLATION_AVAILABLE = False 51 | 52 | # Try to import keyword extraction utilities, but don't fail if not available 53 | try: 54 | from utils.keyword_extraction import extract_keywords_from_transcript, generate_keyword_index, generate_interactive_transcript 55 | KEYWORD_EXTRACTION_AVAILABLE = True 56 | except ImportError: 57 | KEYWORD_EXTRACTION_AVAILABLE = False 58 | 59 | def main(): 60 | # Set page configuration 61 | st.set_page_config( 62 | page_title="OBS Recording Transcriber", 63 | page_icon="🎥", 64 | layout="wide", 65 | initial_sidebar_state="expanded" 66 | ) 67 | 68 | # Custom CSS for better UI 69 | st.markdown(""" 70 | 102 | """, unsafe_allow_html=True) 103 | 104 | st.title("🎥 OBS Recording Transcriber") 105 | st.caption("Process your OBS recordings with AI transcription and summarization") 106 | 107 | # Sidebar configuration 108 | st.sidebar.header("Settings") 109 | 110 | # Allow the user to select a base folder 111 | base_folder = st.sidebar.text_input( 112 | "Enter the base folder path:", 113 | value=str(Path.home()) 114 | ) 115 | 116 | base_path = Path(base_folder) 117 | 118 | # Model selection 119 | st.sidebar.subheader("Model Settings") 120 | 121 | # Transcription model selection 122 | transcription_model = st.sidebar.selectbox( 123 | "Transcription Model", 124 | ["tiny", "base", "small", "medium", "large"], 125 | index=1, 126 | help="Select the Whisper model size. Larger models are more accurate but slower." 
127 | ) 128 | 129 | # Summarization model selection 130 | summarization_options = ["Hugging Face (Online)", "Ollama (Local)"] if OLLAMA_AVAILABLE else ["Hugging Face (Online)"] 131 | summarization_method = st.sidebar.selectbox( 132 | "Summarization Method", 133 | summarization_options, 134 | index=0, 135 | help="Select the summarization method. Ollama runs locally but requires installation." 136 | ) 137 | 138 | # If Ollama is selected, show model selection 139 | ollama_model = None 140 | if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)": 141 | available_models = list_available_models() 142 | if available_models: 143 | ollama_model = st.sidebar.selectbox( 144 | "Ollama Model", 145 | available_models, 146 | index=0 if "llama3" in available_models else 0, 147 | help="Select the Ollama model to use for summarization." 148 | ) 149 | else: 150 | st.sidebar.warning("No Ollama models found. Please install models using 'ollama pull model_name'.") 151 | 152 | # Advanced features 153 | st.sidebar.subheader("Advanced Features") 154 | 155 | # Speaker diarization 156 | use_diarization = st.sidebar.checkbox( 157 | "Speaker Diarization", 158 | value=False, 159 | disabled=not DIARIZATION_AVAILABLE, 160 | help="Identify different speakers in the recording." 161 | ) 162 | 163 | # Show HF token input if diarization is enabled 164 | hf_token = None 165 | if use_diarization and DIARIZATION_AVAILABLE: 166 | hf_token = st.sidebar.text_input( 167 | "HuggingFace Token", 168 | type="password", 169 | help="Required for speaker diarization. Get your token at huggingface.co/settings/tokens" 170 | ) 171 | 172 | num_speakers = st.sidebar.number_input( 173 | "Number of Speakers", 174 | min_value=1, 175 | max_value=10, 176 | value=2, 177 | help="Specify the number of speakers if known, or leave at default for auto-detection." 178 | ) 179 | 180 | # Translation 181 | use_translation = st.sidebar.checkbox( 182 | "Translation", 183 | value=False, 184 | disabled=not TRANSLATION_AVAILABLE, 185 | help="Translate the transcript to another language." 186 | ) 187 | 188 | # Target language selection if translation is enabled 189 | target_lang = None 190 | if use_translation and TRANSLATION_AVAILABLE: 191 | target_lang = st.sidebar.selectbox( 192 | "Target Language", 193 | ["en", "es", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko", "ar"], 194 | format_func=lambda x: f"{get_language_name(x)} ({x})", 195 | help="Select the language to translate to." 196 | ) 197 | 198 | # Keyword extraction 199 | use_keywords = st.sidebar.checkbox( 200 | "Keyword Extraction", 201 | value=False, 202 | disabled=not KEYWORD_EXTRACTION_AVAILABLE, 203 | help="Extract keywords and link them to timestamps." 204 | ) 205 | 206 | if use_keywords and KEYWORD_EXTRACTION_AVAILABLE: 207 | max_keywords = st.sidebar.slider( 208 | "Max Keywords", 209 | min_value=5, 210 | max_value=30, 211 | value=15, 212 | help="Maximum number of keywords to extract." 213 | ) 214 | 215 | # Performance settings 216 | st.sidebar.subheader("Performance Settings") 217 | 218 | # GPU acceleration 219 | use_gpu = st.sidebar.checkbox( 220 | "Use GPU Acceleration", 221 | value=True if GPU_UTILS_AVAILABLE else False, 222 | disabled=not GPU_UTILS_AVAILABLE, 223 | help="Use GPU for faster processing if available." 
224 | ) 225 | 226 | # Show GPU info if available 227 | if GPU_UTILS_AVAILABLE and use_gpu: 228 | gpu_info = get_gpu_info() 229 | if gpu_info["cuda_available"]: 230 | gpu_devices = [f"{d['name']} ({humanize.naturalsize(d['total_memory'])})" for d in gpu_info["cuda_devices"]] 231 | st.sidebar.info(f"GPU(s) available: {', '.join(gpu_devices)}") 232 | elif gpu_info["mps_available"]: 233 | st.sidebar.info("Apple Silicon GPU (MPS) available") 234 | else: 235 | st.sidebar.warning("No GPU detected. Using CPU.") 236 | 237 | # Memory usage 238 | memory_fraction = st.sidebar.slider( 239 | "GPU Memory Usage", 240 | min_value=0.1, 241 | max_value=1.0, 242 | value=0.8, 243 | step=0.1, 244 | disabled=not (GPU_UTILS_AVAILABLE and use_gpu), 245 | help="Fraction of GPU memory to use. Lower if you encounter out-of-memory errors." 246 | ) 247 | 248 | # Caching options 249 | use_cache = st.sidebar.checkbox( 250 | "Use Caching", 251 | value=True if CACHE_AVAILABLE else False, 252 | disabled=not CACHE_AVAILABLE, 253 | help="Cache transcription results to avoid reprocessing the same files." 254 | ) 255 | 256 | # Cache management 257 | if CACHE_AVAILABLE and use_cache: 258 | cache_size, cache_files = get_cache_size() 259 | if cache_size > 0: 260 | st.sidebar.info(f"Cache: {humanize.naturalsize(cache_size)} ({cache_files} files)") 261 | if st.sidebar.button("Clear Cache"): 262 | cleared = clear_cache() 263 | st.sidebar.success(f"Cleared {cleared} cache files") 264 | 265 | # Export options 266 | st.sidebar.subheader("Export Options") 267 | export_format = st.sidebar.multiselect( 268 | "Export Formats", 269 | ["TXT", "SRT", "VTT", "ASS"], 270 | default=["TXT"], 271 | help="Select the formats to export the transcript." 272 | ) 273 | 274 | # Compression options 275 | compress_exports = st.sidebar.checkbox( 276 | "Compress Exports", 277 | value=False, 278 | help="Compress exported files to save space." 279 | ) 280 | 281 | if compress_exports: 282 | compression_type = st.sidebar.radio( 283 | "Compression Format", 284 | ["gzip", "zip"], 285 | index=0, 286 | help="Select the compression format for exported files." 
287 | ) 288 | else: 289 | compression_type = None 290 | 291 | # ASS subtitle styling 292 | if "ASS" in export_format: 293 | st.sidebar.subheader("ASS Subtitle Styling") 294 | show_style_options = st.sidebar.checkbox("Customize ASS Style", value=False) 295 | 296 | if show_style_options: 297 | ass_style = {} 298 | ass_style["fontname"] = st.sidebar.selectbox( 299 | "Font", 300 | ["Arial", "Helvetica", "Times New Roman", "Courier New", "Comic Sans MS"], 301 | index=0 302 | ) 303 | ass_style["fontsize"] = st.sidebar.slider("Font Size", 12, 72, 48) 304 | ass_style["alignment"] = st.sidebar.selectbox( 305 | "Alignment", 306 | ["2 (Bottom Center)", "1 (Bottom Left)", "3 (Bottom Right)", "8 (Top Center)"], 307 | index=0 308 | ).split()[0] # Extract just the number 309 | ass_style["bold"] = "-1" if st.sidebar.checkbox("Bold", value=True) else "0" 310 | ass_style["italic"] = "-1" if st.sidebar.checkbox("Italic", value=False) else "0" 311 | else: 312 | ass_style = None 313 | 314 | # Validate environment 315 | env_errors = validate_environment(base_path) 316 | if env_errors: 317 | st.error("## Environment Issues") 318 | for error in env_errors: 319 | st.markdown(f"- {error}") 320 | return 321 | 322 | # File selection 323 | recordings = list(base_path.glob("*.mp4")) 324 | if not recordings: 325 | st.warning(f"📂 No recordings found in the folder: {base_folder}!") 326 | return 327 | 328 | selected_file = st.selectbox("Choose a recording", recordings) 329 | 330 | # Process button with spinner 331 | if st.button("🚀 Start Processing"): 332 | # Create a progress bar 333 | progress_bar = st.progress(0) 334 | status_text = st.empty() 335 | 336 | try: 337 | # Update progress 338 | status_text.text("Extracting audio...") 339 | progress_bar.progress(10) 340 | 341 | # Process based on selected features 342 | if use_diarization and DIARIZATION_AVAILABLE and hf_token: 343 | # Transcribe with speaker diarization 344 | status_text.text("Transcribing with speaker diarization...") 345 | num_speakers_arg = int(num_speakers) if num_speakers > 0 else None 346 | diarized_segments, diarized_transcript = transcribe_with_diarization( 347 | selected_file, 348 | whisper_model=transcription_model, 349 | num_speakers=num_speakers_arg, 350 | use_gpu=use_gpu, 351 | hf_token=hf_token 352 | ) 353 | segments = diarized_segments 354 | transcript = diarized_transcript 355 | elif use_translation and TRANSLATION_AVAILABLE: 356 | # Transcribe and translate 357 | status_text.text("Transcribing and translating...") 358 | original_segments, translated_segments, original_transcript, translated_transcript = transcribe_and_translate( 359 | selected_file, 360 | whisper_model=transcription_model, 361 | target_lang=target_lang, 362 | use_gpu=use_gpu 363 | ) 364 | segments = translated_segments 365 | transcript = translated_transcript 366 | # Store original for display 367 | original_text = original_transcript 368 | else: 369 | # Standard transcription 370 | status_text.text("Transcribing audio...") 371 | segments, transcript = transcribe_audio( 372 | selected_file, 373 | model=transcription_model, 374 | use_cache=use_cache, 375 | use_gpu=use_gpu, 376 | memory_fraction=memory_fraction 377 | ) 378 | 379 | progress_bar.progress(50) 380 | 381 | if transcript: 382 | # Extract keywords if requested 383 | keyword_timestamps = None 384 | entity_timestamps = None 385 | if use_keywords and KEYWORD_EXTRACTION_AVAILABLE: 386 | status_text.text("Extracting keywords...") 387 | keyword_timestamps, entity_timestamps = extract_keywords_from_transcript( 388 | 
transcript, 389 | segments, 390 | max_keywords=max_keywords, 391 | use_gpu=use_gpu 392 | ) 393 | 394 | # Generate keyword index 395 | keyword_index = generate_keyword_index(keyword_timestamps, entity_timestamps) 396 | 397 | # Generate interactive transcript 398 | interactive_transcript = generate_interactive_transcript( 399 | segments, 400 | keyword_timestamps, 401 | entity_timestamps 402 | ) 403 | 404 | # Generate summary based on selected method 405 | status_text.text("Generating summary...") 406 | if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)" and ollama_model: 407 | summary = chunk_and_summarize(transcript, model=ollama_model) 408 | if not summary: 409 | st.warning("Ollama summarization failed. Falling back to Hugging Face.") 410 | summary = summarize_text( 411 | transcript, 412 | use_gpu=use_gpu, 413 | memory_fraction=memory_fraction 414 | ) 415 | else: 416 | summary = summarize_text( 417 | transcript, 418 | use_gpu=use_gpu, 419 | memory_fraction=memory_fraction 420 | ) 421 | 422 | progress_bar.progress(80) 423 | status_text.text("Preparing results...") 424 | 425 | # Display results in tabs 426 | tab1, tab2, tab3 = st.tabs(["Summary", "Transcript", "Advanced"]) 427 | 428 | with tab1: 429 | st.subheader("🖍 Summary") 430 | st.write(summary) 431 | 432 | # If translation was used, show original language 433 | if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals(): 434 | with st.expander("Original Language Summary"): 435 | original_summary = summarize_text( 436 | original_text, 437 | use_gpu=use_gpu, 438 | memory_fraction=memory_fraction 439 | ) 440 | st.write(original_summary) 441 | 442 | with tab2: 443 | st.subheader("📜 Full Transcript") 444 | 445 | # Show interactive transcript if keywords were extracted 446 | if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'interactive_transcript' in locals(): 447 | st.markdown(interactive_transcript, unsafe_allow_html=True) 448 | else: 449 | st.text(transcript) 450 | 451 | # If translation was used, show original language 452 | if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals(): 453 | with st.expander("Original Language Transcript"): 454 | st.text(original_text) 455 | 456 | with tab3: 457 | # Show keyword index if available 458 | if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'keyword_index' in locals(): 459 | st.subheader("🔑 Keyword Index") 460 | st.markdown(keyword_index) 461 | 462 | # Show speaker information if available 463 | if use_diarization and DIARIZATION_AVAILABLE: 464 | st.subheader("🎙️ Speaker Information") 465 | speakers = set(segment.get('speaker', 'UNKNOWN') for segment in segments) 466 | st.write(f"Detected {len(speakers)} speakers: {', '.join(speakers)}") 467 | 468 | # Count words per speaker 469 | speaker_words = {} 470 | for segment in segments: 471 | speaker = segment.get('speaker', 'UNKNOWN') 472 | words = len(segment['text'].split()) 473 | if speaker in speaker_words: 474 | speaker_words[speaker] += words 475 | else: 476 | speaker_words[speaker] = words 477 | 478 | # Display speaker statistics 479 | st.write("### Speaker Statistics") 480 | for speaker, words in speaker_words.items(): 481 | st.write(f"- **{speaker}**: {words} words") 482 | 483 | # Export options 484 | st.subheader("💾 Export Options") 485 | export_cols = st.columns(len(export_format)) 486 | 487 | output_base = Path(selected_file).stem 488 | 489 | for i, format_type in enumerate(export_format): 490 | with export_cols[i]: 491 | if format_type == "TXT": 492 | st.download_button( 493 | 
label=f"Download {format_type}", 494 | data=transcript, 495 | file_name=f"{output_base}_transcript.txt", 496 | mime="text/plain" 497 | ) 498 | elif format_type in ["SRT", "VTT", "ASS"]: 499 | # Export to subtitle format 500 | output_path = export_transcript( 501 | transcript, 502 | output_base, 503 | format_type.lower(), 504 | segments=segments, 505 | compress=compress_exports, 506 | compression_type=compression_type, 507 | style=ass_style if format_type == "ASS" and ass_style else None 508 | ) 509 | 510 | # Read the exported file for download 511 | with open(output_path, 'rb') as f: 512 | subtitle_content = f.read() 513 | 514 | # Determine file extension 515 | file_ext = f".{format_type.lower()}" 516 | if compress_exports: 517 | file_ext += ".gz" if compression_type == "gzip" else ".zip" 518 | 519 | st.download_button( 520 | label=f"Download {format_type}", 521 | data=subtitle_content, 522 | file_name=f"{output_base}{file_ext}", 523 | mime="application/octet-stream" 524 | ) 525 | 526 | # Clean up the temporary file 527 | os.remove(output_path) 528 | 529 | # Complete progress 530 | progress_bar.progress(100) 531 | status_text.text("Processing complete!") 532 | else: 533 | st.error("❌ Failed to process recording") 534 | except Exception as e: 535 | st.error(f"An error occurred: {e}") 536 | st.write(e) # This will show the traceback in the Streamlit app 537 | 538 | if __name__ == "__main__": 539 | main() 540 | -------------------------------------------------------------------------------- /install.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo =================================================== 3 | echo OBS Recording Transcriber - Windows Installation 4 | echo =================================================== 5 | echo. 6 | 7 | :: Check for Python 8 | python --version > nul 2>&1 9 | if %errorlevel% neq 0 ( 10 | echo Python not found! Please install Python 3.8 or higher. 11 | echo Download from: https://www.python.org/downloads/ 12 | echo Make sure to check "Add Python to PATH" during installation. 13 | pause 14 | exit /b 1 15 | ) 16 | 17 | :: Run the installation script 18 | echo Running installation script... 19 | python install.py 20 | 21 | echo. 22 | echo If the installation was successful, you can run the application with: 23 | echo streamlit run app.py 24 | echo. 25 | pause -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Installation script for OBS Recording Transcriber. 4 | This script helps install all required dependencies and checks for common issues. 
5 | """ 6 | 7 | import os 8 | import sys 9 | import platform 10 | import subprocess 11 | import shutil 12 | from pathlib import Path 13 | 14 | def print_header(text): 15 | """Print a formatted header.""" 16 | print("\n" + "=" * 80) 17 | print(f" {text}") 18 | print("=" * 80) 19 | 20 | def print_step(text): 21 | """Print a step in the installation process.""" 22 | print(f"\n>> {text}") 23 | 24 | def run_command(command, check=True): 25 | """Run a shell command and return the result.""" 26 | try: 27 | result = subprocess.run( 28 | command, 29 | shell=True, 30 | check=check, 31 | stdout=subprocess.PIPE, 32 | stderr=subprocess.PIPE, 33 | text=True 34 | ) 35 | return result 36 | except subprocess.CalledProcessError as e: 37 | print(f"Error executing command: {command}") 38 | print(f"Error message: {e.stderr}") 39 | return None 40 | 41 | def check_python_version(): 42 | """Check if Python version is 3.8 or higher.""" 43 | print_step("Checking Python version") 44 | version = sys.version_info 45 | if version.major < 3 or (version.major == 3 and version.minor < 8): 46 | print(f"Python 3.8 or higher is required. You have {sys.version}") 47 | print("Please upgrade your Python installation.") 48 | return False 49 | print(f"Python version: {sys.version}") 50 | return True 51 | 52 | def check_ffmpeg(): 53 | """Check if FFmpeg is installed.""" 54 | print_step("Checking FFmpeg installation") 55 | result = shutil.which("ffmpeg") 56 | if result is None: 57 | print("FFmpeg not found in PATH.") 58 | print("Please install FFmpeg:") 59 | if platform.system() == "Windows": 60 | print(" - Download from: https://www.gyan.dev/ffmpeg/builds/") 61 | print(" - Extract and add the bin folder to your PATH") 62 | elif platform.system() == "Darwin": # macOS 63 | print(" - Install with Homebrew: brew install ffmpeg") 64 | else: # Linux 65 | print(" - Install with apt: sudo apt update && sudo apt install ffmpeg") 66 | return False 67 | 68 | # Check FFmpeg version 69 | version_result = run_command("ffmpeg -version") 70 | if version_result: 71 | print(f"FFmpeg is installed: {version_result.stdout.splitlines()[0]}") 72 | return True 73 | return False 74 | 75 | def check_gpu(): 76 | """Check for GPU availability.""" 77 | print_step("Checking GPU availability") 78 | 79 | # Check for NVIDIA GPU 80 | if platform.system() == "Windows": 81 | nvidia_smi = shutil.which("nvidia-smi") 82 | if nvidia_smi: 83 | result = run_command("nvidia-smi", check=False) 84 | if result and result.returncode == 0: 85 | print("NVIDIA GPU detected:") 86 | for line in result.stdout.splitlines()[:10]: 87 | print(f" {line}") 88 | return "nvidia" 89 | 90 | # Check for Apple Silicon 91 | if platform.system() == "Darwin" and platform.machine() == "arm64": 92 | print("Apple Silicon (M1/M2) detected") 93 | return "apple" 94 | 95 | print("No GPU detected or GPU drivers not installed. CPU will be used for processing.") 96 | return "cpu" 97 | 98 | def setup_virtual_env(): 99 | """Set up a virtual environment.""" 100 | print_step("Setting up virtual environment") 101 | 102 | # Check if venv module is available 103 | try: 104 | import venv 105 | print("Python venv module is available") 106 | except ImportError: 107 | print("Python venv module is not available. 
Please install it.") 108 | return False 109 | 110 | # Create virtual environment if it doesn't exist 111 | venv_path = Path("venv") 112 | if venv_path.exists(): 113 | print(f"Virtual environment already exists at {venv_path}") 114 | activate_venv() 115 | return True 116 | 117 | print(f"Creating virtual environment at {venv_path}") 118 | try: 119 | subprocess.run([sys.executable, "-m", "venv", "venv"], check=True) 120 | print("Virtual environment created successfully") 121 | activate_venv() 122 | return True 123 | except subprocess.CalledProcessError as e: 124 | print(f"Error creating virtual environment: {e}") 125 | return False 126 | 127 | def activate_venv(): 128 | """Activate the virtual environment.""" 129 | print_step("Activating virtual environment") 130 | 131 | venv_path = Path("venv") 132 | if not venv_path.exists(): 133 | print("Virtual environment not found") 134 | return False 135 | 136 | # Get the path to the activate script 137 | if platform.system() == "Windows": 138 | activate_script = venv_path / "Scripts" / "activate.bat" 139 | activate_cmd = f"call {activate_script}" 140 | else: 141 | activate_script = venv_path / "bin" / "activate" 142 | activate_cmd = f"source {activate_script}" 143 | 144 | print(f"To activate the virtual environment, run:") 145 | print(f" {activate_cmd}") 146 | 147 | # We can't actually activate the venv in this script because it would only 148 | # affect the subprocess, not the parent process. We just provide instructions. 149 | return True 150 | 151 | def install_pytorch(gpu_type): 152 | """Install PyTorch with appropriate GPU support.""" 153 | print_step("Installing PyTorch") 154 | 155 | if gpu_type == "nvidia": 156 | print("Installing PyTorch with CUDA support") 157 | cmd = "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118" 158 | elif gpu_type == "apple": 159 | print("Installing PyTorch with MPS support") 160 | cmd = "pip install torch torchvision torchaudio" 161 | else: 162 | print("Installing PyTorch (CPU version)") 163 | cmd = "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu" 164 | 165 | result = run_command(cmd) 166 | if result and result.returncode == 0: 167 | print("PyTorch installed successfully") 168 | return True 169 | else: 170 | print("Failed to install PyTorch") 171 | return False 172 | 173 | def install_dependencies(): 174 | """Install dependencies from requirements.txt.""" 175 | print_step("Installing dependencies from requirements.txt") 176 | 177 | requirements_path = Path("requirements.txt") 178 | if not requirements_path.exists(): 179 | print("requirements.txt not found") 180 | return False 181 | 182 | result = run_command("pip install -r requirements.txt") 183 | if result and result.returncode == 0: 184 | print("Dependencies installed successfully") 185 | return True 186 | else: 187 | print("Some dependencies failed to install. 
See error messages above.") 188 | return False 189 | 190 | def install_tokenizers(): 191 | """Install tokenizers package separately.""" 192 | print_step("Installing tokenizers package") 193 | 194 | # First try the normal installation 195 | result = run_command("pip install tokenizers", check=False) 196 | if result and result.returncode == 0: 197 | print("Tokenizers installed successfully") 198 | return True 199 | 200 | # If that fails, try the no-binary option 201 | print("Standard installation failed, trying alternative method...") 202 | result = run_command("pip install tokenizers --no-binary tokenizers", check=False) 203 | if result and result.returncode == 0: 204 | print("Tokenizers installed successfully with alternative method") 205 | return True 206 | 207 | print("Failed to install tokenizers. You may need to install Rust or Visual C++ Build Tools.") 208 | if platform.system() == "Windows": 209 | print("Download Visual C++ Build Tools: https://visualstudio.microsoft.com/visual-cpp-build-tools/") 210 | print("Install Rust: https://rustup.rs/") 211 | return False 212 | 213 | def check_installation(): 214 | """Verify the installation by importing key packages.""" 215 | print_step("Verifying installation") 216 | 217 | packages_to_check = [ 218 | "streamlit", 219 | "torch", 220 | "transformers", 221 | "whisper", 222 | "numpy", 223 | "sklearn" 224 | ] 225 | 226 | all_successful = True 227 | for package in packages_to_check: 228 | try: 229 | __import__(package) 230 | print(f"✓ {package} imported successfully") 231 | except ImportError: 232 | print(f"✗ Failed to import {package}") 233 | all_successful = False 234 | 235 | # Check optional packages 236 | optional_packages = [ 237 | "pyannote.audio", 238 | "iso639" 239 | ] 240 | 241 | print("\nChecking optional packages:") 242 | for package in optional_packages: 243 | try: 244 | if package == "pyannote.audio": 245 | # Just try to import pyannote 246 | __import__("pyannote") 247 | else: 248 | __import__(package) 249 | print(f"✓ {package} imported successfully") 250 | except ImportError: 251 | print(f"⚠ {package} not available (required for some advanced features)") 252 | 253 | return all_successful 254 | 255 | def main(): 256 | """Main installation function.""" 257 | print_header("OBS Recording Transcriber - Installation Script") 258 | 259 | # Check prerequisites 260 | if not check_python_version(): 261 | return 262 | 263 | ffmpeg_available = check_ffmpeg() 264 | gpu_type = check_gpu() 265 | 266 | # Setup environment 267 | if not setup_virtual_env(): 268 | print("Failed to set up virtual environment. Continuing with system Python...") 269 | 270 | # Install packages 271 | print("\nReady to install packages. Make sure your virtual environment is activated.") 272 | input("Press Enter to continue...") 273 | 274 | install_pytorch(gpu_type) 275 | install_dependencies() 276 | install_tokenizers() 277 | 278 | # Verify installation 279 | success = check_installation() 280 | 281 | print_header("Installation Summary") 282 | print(f"Python: {'✓ OK' if check_python_version() else '✗ Needs upgrade'}") 283 | print(f"FFmpeg: {'✓ Installed' if ffmpeg_available else '✗ Not found'}") 284 | print(f"GPU Support: {gpu_type.upper()}") 285 | print(f"Dependencies: {'✓ Installed' if success else '⚠ Some issues'}") 286 | 287 | print("\nNext steps:") 288 | if not ffmpeg_available: 289 | print("1. Install FFmpeg (required for audio processing)") 290 | 291 | print("1. 
Activate your virtual environment:") 292 | if platform.system() == "Windows": 293 | print(" venv\\Scripts\\activate") 294 | else: 295 | print(" source venv/bin/activate") 296 | 297 | print("2. Run the application:") 298 | print(" streamlit run app.py") 299 | 300 | print("\nFor advanced features like speaker diarization:") 301 | print("1. Get a HuggingFace token: https://huggingface.co/settings/tokens") 302 | print("2. Request access to pyannote models: https://huggingface.co/pyannote/speaker-diarization-3.0") 303 | 304 | print("\nSee INSTALLATION.md for more details and troubleshooting.") 305 | 306 | if __name__ == "__main__": 307 | main() -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "===================================================" 4 | echo " OBS Recording Transcriber - Unix Installation" 5 | echo "===================================================" 6 | echo 7 | 8 | # Check for Python 9 | if ! command -v python3 &> /dev/null; then 10 | echo "Python 3 not found! Please install Python 3.8 or higher." 11 | echo "For Ubuntu/Debian: sudo apt update && sudo apt install python3 python3-pip python3-venv" 12 | echo "For macOS: brew install python3" 13 | exit 1 14 | fi 15 | 16 | # Make the script executable 17 | chmod +x install.py 18 | 19 | # Run the installation script 20 | echo "Running installation script..." 21 | python3 ./install.py 22 | 23 | echo 24 | echo "If the installation was successful, you can run the application with:" 25 | echo "streamlit run app.py" 26 | echo -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # OBS Recording Transcriber Dependencies 2 | # Core dependencies 3 | streamlit==1.26.0 4 | moviepy==1.0.3 5 | openai-whisper==20231117 6 | transformers>=4.21.1 7 | torch>=1.7.0 8 | torchaudio>=0.7.0 9 | requests>=2.28.0 10 | humanize>=4.6.0 11 | 12 | # Phase 2 dependencies 13 | scikit-learn>=1.0.0 14 | numpy>=1.20.0 15 | 16 | # Phase 3 dependencies 17 | pyannote.audio>=2.1.1 18 | iso639>=0.1.4 19 | protobuf>=3.20.0,<4.0.0 20 | tokenizers>=0.13.2 21 | scipy>=1.7.0 22 | matplotlib>=3.5.0 23 | soundfile>=0.10.3 24 | ffmpeg-python>=0.2.0 25 | 26 | # Optional: Ollama Python client (uncomment to install) 27 | # ollama 28 | 29 | # Installation notes: 30 | # 1. For Windows users, you may need to install PyTorch separately: 31 | # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 32 | # 33 | # 2. For tokenizers issues, try installing Visual C++ Build Tools: 34 | # https://visualstudio.microsoft.com/visual-cpp-build-tools/ 35 | # 36 | # 3. For pyannote.audio, you'll need a HuggingFace token with access to: 37 | # https://huggingface.co/pyannote/speaker-diarization-3.0 38 | # 39 | # 4. 
FFmpeg is required for audio processing: 40 | # Windows: https://www.gyan.dev/ffmpeg/builds/ 41 | # Mac: brew install ffmpeg 42 | # Linux: apt-get install ffmpeg 43 | -------------------------------------------------------------------------------- /utils/audio_processing.py: -------------------------------------------------------------------------------- 1 | from moviepy.editor import AudioFileClip 2 | from pathlib import Path 3 | 4 | def extract_audio(video_path: Path): 5 | """Extract audio from a video file.""" 6 | try: 7 | audio = AudioFileClip(str(video_path)) 8 | audio_path = video_path.parent / f"{video_path.stem}_audio.wav" 9 | audio.write_audiofile(str(audio_path), verbose=False, logger=None) 10 | return audio_path 11 | except Exception as e: 12 | raise RuntimeError(f"Audio extraction failed: {e}") 13 | -------------------------------------------------------------------------------- /utils/cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | Caching utilities for the OBS Recording Transcriber. 3 | Provides functions to cache and retrieve transcription and summarization results. 4 | """ 5 | 6 | import json 7 | import hashlib 8 | import os 9 | from pathlib import Path 10 | import logging 11 | import time 12 | 13 | # Configure logging 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | # Default cache directory 18 | CACHE_DIR = Path.home() / ".obs_transcriber_cache" 19 | 20 | 21 | def get_file_hash(file_path): 22 | """ 23 | Generate a hash for a file based on its content and modification time. 24 | 25 | Args: 26 | file_path (Path): Path to the file 27 | 28 | Returns: 29 | str: Hash string representing the file 30 | """ 31 | file_path = Path(file_path) 32 | if not file_path.exists(): 33 | return None 34 | 35 | # Get file stats 36 | stats = file_path.stat() 37 | file_size = stats.st_size 38 | mod_time = stats.st_mtime 39 | 40 | # Create a hash based on path, size and modification time 41 | # This is faster than hashing the entire file content 42 | hash_input = f"{file_path.absolute()}|{file_size}|{mod_time}" 43 | return hashlib.md5(hash_input.encode()).hexdigest() 44 | 45 | 46 | def get_cache_path(file_path, model=None, operation=None): 47 | """ 48 | Get the cache file path for a given input file and operation. 49 | 50 | Args: 51 | file_path (Path): Path to the original file 52 | model (str, optional): Model used for processing 53 | operation (str, optional): Operation type (e.g., 'transcribe', 'summarize') 54 | 55 | Returns: 56 | Path: Path to the cache file 57 | """ 58 | file_path = Path(file_path) 59 | file_hash = get_file_hash(file_path) 60 | 61 | if not file_hash: 62 | return None 63 | 64 | # Create cache directory if it doesn't exist 65 | cache_dir = CACHE_DIR 66 | cache_dir.mkdir(parents=True, exist_ok=True) 67 | 68 | # Create a cache filename based on the hash and optional parameters 69 | cache_name = file_hash 70 | if model: 71 | cache_name += f"_{model}" 72 | if operation: 73 | cache_name += f"_{operation}" 74 | 75 | return cache_dir / f"{cache_name}.json" 76 | 77 | 78 | def save_to_cache(file_path, data, model=None, operation=None): 79 | """ 80 | Save data to cache. 
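Entries are keyed on a hash of the file's absolute path, size, and modification time (plus the optional model and operation names) and stored as JSON files under ~/.obs_transcriber_cache.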
81 | 82 | Args: 83 | file_path (Path): Path to the original file 84 | data (dict): Data to cache 85 | model (str, optional): Model used for processing 86 | operation (str, optional): Operation type 87 | 88 | Returns: 89 | bool: True if successful, False otherwise 90 | """ 91 | cache_path = get_cache_path(file_path, model, operation) 92 | if not cache_path: 93 | return False 94 | 95 | try: 96 | # Add metadata to the cached data 97 | cache_data = { 98 | "original_file": str(Path(file_path).absolute()), 99 | "timestamp": time.time(), 100 | "model": model, 101 | "operation": operation, 102 | "data": data 103 | } 104 | 105 | with open(cache_path, 'w', encoding='utf-8') as f: 106 | json.dump(cache_data, f, ensure_ascii=False, indent=2) 107 | 108 | logger.info(f"Cached data saved to {cache_path}") 109 | return True 110 | except Exception as e: 111 | logger.error(f"Error saving cache: {e}") 112 | return False 113 | 114 | 115 | def load_from_cache(file_path, model=None, operation=None, max_age=None): 116 | """ 117 | Load data from cache if available and not expired. 118 | 119 | Args: 120 | file_path (Path): Path to the original file 121 | model (str, optional): Model used for processing 122 | operation (str, optional): Operation type 123 | max_age (float, optional): Maximum age of cache in seconds 124 | 125 | Returns: 126 | dict or None: Cached data or None if not available 127 | """ 128 | cache_path = get_cache_path(file_path, model, operation) 129 | if not cache_path or not cache_path.exists(): 130 | return None 131 | 132 | try: 133 | with open(cache_path, 'r', encoding='utf-8') as f: 134 | cache_data = json.load(f) 135 | 136 | # Check if cache is expired 137 | if max_age is not None: 138 | cache_time = cache_data.get("timestamp", 0) 139 | if time.time() - cache_time > max_age: 140 | logger.info(f"Cache expired for {file_path}") 141 | return None 142 | 143 | logger.info(f"Loaded data from cache: {cache_path}") 144 | return cache_data.get("data") 145 | except Exception as e: 146 | logger.error(f"Error loading cache: {e}") 147 | return None 148 | 149 | 150 | def clear_cache(max_age=None): 151 | """ 152 | Clear all cache files or only expired ones. 153 | 154 | Args: 155 | max_age (float, optional): Maximum age of cache in seconds 156 | 157 | Returns: 158 | int: Number of files deleted 159 | """ 160 | if not CACHE_DIR.exists(): 161 | return 0 162 | 163 | count = 0 164 | for cache_file in CACHE_DIR.glob("*.json"): 165 | try: 166 | if max_age is not None: 167 | # Check if file is expired 168 | with open(cache_file, 'r', encoding='utf-8') as f: 169 | cache_data = json.load(f) 170 | 171 | cache_time = cache_data.get("timestamp", 0) 172 | if time.time() - cache_time <= max_age: 173 | continue # Skip non-expired files 174 | 175 | # Delete the file 176 | os.remove(cache_file) 177 | count += 1 178 | except Exception as e: 179 | logger.error(f"Error deleting cache file {cache_file}: {e}") 180 | 181 | logger.info(f"Cleared {count} cache files") 182 | return count 183 | 184 | 185 | def get_cache_size(): 186 | """ 187 | Get the total size of the cache directory. 
188 | 189 | Returns: 190 | tuple: (size_bytes, file_count) 191 | """ 192 | if not CACHE_DIR.exists(): 193 | return 0, 0 194 | 195 | total_size = 0 196 | file_count = 0 197 | 198 | for cache_file in CACHE_DIR.glob("*.json"): 199 | try: 200 | total_size += cache_file.stat().st_size 201 | file_count += 1 202 | except Exception: 203 | pass 204 | 205 | return total_size, file_count -------------------------------------------------------------------------------- /utils/diarization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Speaker diarization utilities for the OBS Recording Transcriber. 3 | Provides functions to identify different speakers in audio recordings. 4 | """ 5 | 6 | import logging 7 | import os 8 | import numpy as np 9 | from pathlib import Path 10 | import torch 11 | from pyannote.audio import Pipeline 12 | from pyannote.core import Segment 13 | import whisper 14 | 15 | # Configure logging 16 | logging.basicConfig(level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | # Try to import GPU utilities, but don't fail if not available 20 | try: 21 | from utils.gpu_utils import get_optimal_device 22 | GPU_UTILS_AVAILABLE = True 23 | except ImportError: 24 | GPU_UTILS_AVAILABLE = False 25 | 26 | # Default HuggingFace auth token environment variable 27 | HF_TOKEN_ENV = "HF_TOKEN" 28 | 29 | 30 | def get_diarization_pipeline(use_gpu=True, hf_token=None): 31 | """ 32 | Initialize the speaker diarization pipeline. 33 | 34 | Args: 35 | use_gpu (bool): Whether to use GPU acceleration if available 36 | hf_token (str, optional): HuggingFace API token for accessing the model 37 | 38 | Returns: 39 | Pipeline or None: Diarization pipeline if successful, None otherwise 40 | """ 41 | # Check if token is provided or in environment 42 | if hf_token is None: 43 | hf_token = os.environ.get(HF_TOKEN_ENV) 44 | if hf_token is None: 45 | logger.error(f"HuggingFace token not provided. Set {HF_TOKEN_ENV} environment variable or pass token directly.") 46 | return None 47 | 48 | try: 49 | # Configure device 50 | device = torch.device("cpu") 51 | if use_gpu and GPU_UTILS_AVAILABLE: 52 | device = get_optimal_device() 53 | logger.info(f"Using device: {device} for diarization") 54 | 55 | # Initialize the pipeline 56 | pipeline = Pipeline.from_pretrained( 57 | "pyannote/speaker-diarization-3.0", 58 | use_auth_token=hf_token 59 | ) 60 | 61 | # Move to appropriate device 62 | if device.type == "cuda": 63 | pipeline = pipeline.to(torch.device(device)) 64 | 65 | return pipeline 66 | except Exception as e: 67 | logger.error(f"Error initializing diarization pipeline: {e}") 68 | return None 69 | 70 | 71 | def diarize_audio(audio_path, pipeline=None, num_speakers=None, use_gpu=True, hf_token=None): 72 | """ 73 | Perform speaker diarization on an audio file. 
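    Example (illustrative; the audio path and token below are placeholders):
        >>> speakers = diarize_audio("meeting_audio.wav", num_speakers=2, hf_token="hf_...")
        >>> # speakers maps (start, end) tuples to labels such as "SPEAKER_00",
        >>> # or is None if the pipeline could not be initialised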
74 | 75 | Args: 76 | audio_path (Path): Path to the audio file 77 | pipeline (Pipeline, optional): Pre-initialized diarization pipeline 78 | num_speakers (int, optional): Number of speakers (if known) 79 | use_gpu (bool): Whether to use GPU acceleration if available 80 | hf_token (str, optional): HuggingFace API token 81 | 82 | Returns: 83 | dict: Dictionary mapping time segments to speaker IDs 84 | """ 85 | audio_path = Path(audio_path) 86 | 87 | # Initialize pipeline if not provided 88 | if pipeline is None: 89 | pipeline = get_diarization_pipeline(use_gpu, hf_token) 90 | if pipeline is None: 91 | return None 92 | 93 | try: 94 | # Run diarization 95 | logger.info(f"Running speaker diarization on {audio_path}") 96 | diarization = pipeline(audio_path, num_speakers=num_speakers) 97 | 98 | # Extract speaker segments 99 | speaker_segments = {} 100 | for turn, _, speaker in diarization.itertracks(yield_label=True): 101 | segment = (turn.start, turn.end) 102 | speaker_segments[segment] = speaker 103 | 104 | return speaker_segments 105 | except Exception as e: 106 | logger.error(f"Error during diarization: {e}") 107 | return None 108 | 109 | 110 | def apply_diarization_to_transcript(transcript_segments, speaker_segments): 111 | """ 112 | Apply speaker diarization results to transcript segments. 113 | 114 | Args: 115 | transcript_segments (list): List of transcript segments with timing info 116 | speaker_segments (dict): Dictionary mapping time segments to speaker IDs 117 | 118 | Returns: 119 | list: Updated transcript segments with speaker information 120 | """ 121 | if not speaker_segments: 122 | return transcript_segments 123 | 124 | # Convert speaker segments to a more usable format 125 | speaker_ranges = [(Segment(start, end), speaker) 126 | for (start, end), speaker in speaker_segments.items()] 127 | 128 | # Update transcript segments with speaker information 129 | for segment in transcript_segments: 130 | segment_start = segment['start'] 131 | segment_end = segment['end'] 132 | segment_range = Segment(segment_start, segment_end) 133 | 134 | # Find overlapping speaker segments 135 | overlaps = [] 136 | for (spk_range, speaker) in speaker_ranges: 137 | overlap = segment_range.intersect(spk_range) 138 | if overlap: 139 | overlaps.append((overlap.duration, speaker)) 140 | 141 | # Assign the speaker with the most overlap 142 | if overlaps: 143 | overlaps.sort(reverse=True) # Sort by duration (descending) 144 | segment['speaker'] = overlaps[0][1] 145 | else: 146 | segment['speaker'] = "UNKNOWN" 147 | 148 | return transcript_segments 149 | 150 | 151 | def format_transcript_with_speakers(transcript_segments): 152 | """ 153 | Format transcript with speaker labels. 154 | 155 | Args: 156 | transcript_segments (list): List of transcript segments with speaker info 157 | 158 | Returns: 159 | str: Formatted transcript with speaker labels 160 | """ 161 | formatted_lines = [] 162 | current_speaker = None 163 | 164 | for segment in transcript_segments: 165 | speaker = segment.get('speaker', 'UNKNOWN') 166 | text = segment['text'].strip() 167 | 168 | # Add speaker label when speaker changes 169 | if speaker != current_speaker: 170 | formatted_lines.append(f"\n[{speaker}]") 171 | current_speaker = speaker 172 | 173 | formatted_lines.append(text) 174 | 175 | return " ".join(formatted_lines) 176 | 177 | 178 | def transcribe_with_diarization(audio_path, whisper_model="base", num_speakers=None, 179 | use_gpu=True, hf_token=None): 180 | """ 181 | Transcribe audio with speaker diarization. 
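    Example (illustrative; the path and token below are placeholders):
        >>> segments, transcript = transcribe_with_diarization(
        ...     "interview.wav", whisper_model="base", num_speakers=2, hf_token="hf_...")
        >>> # transcript carries speaker labels when diarization succeeds;
        >>> # otherwise the plain Whisper transcript is returned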
182 | 183 | Args: 184 | audio_path (Path): Path to the audio file 185 | whisper_model (str): Whisper model size to use 186 | num_speakers (int, optional): Number of speakers (if known) 187 | use_gpu (bool): Whether to use GPU acceleration if available 188 | hf_token (str, optional): HuggingFace API token 189 | 190 | Returns: 191 | tuple: (diarized_segments, formatted_transcript) 192 | """ 193 | audio_path = Path(audio_path) 194 | 195 | # Configure device 196 | device = torch.device("cpu") 197 | if use_gpu and GPU_UTILS_AVAILABLE: 198 | device = get_optimal_device() 199 | 200 | try: 201 | # Step 1: Transcribe audio with Whisper 202 | logger.info(f"Transcribing audio with Whisper model: {whisper_model}") 203 | model = whisper.load_model(whisper_model, device=device if device.type != "mps" else "cpu") 204 | result = model.transcribe(str(audio_path)) 205 | transcript_segments = result["segments"] 206 | 207 | # Step 2: Perform speaker diarization 208 | logger.info("Performing speaker diarization") 209 | pipeline = get_diarization_pipeline(use_gpu, hf_token) 210 | if pipeline is None: 211 | logger.warning("Diarization pipeline not available, returning transcript without speakers") 212 | return transcript_segments, result["text"] 213 | 214 | speaker_segments = diarize_audio(audio_path, pipeline, num_speakers, use_gpu) 215 | 216 | # Step 3: Apply diarization to transcript 217 | if speaker_segments: 218 | diarized_segments = apply_diarization_to_transcript(transcript_segments, speaker_segments) 219 | formatted_transcript = format_transcript_with_speakers(diarized_segments) 220 | return diarized_segments, formatted_transcript 221 | else: 222 | return transcript_segments, result["text"] 223 | 224 | except Exception as e: 225 | logger.error(f"Error in transcribe_with_diarization: {e}") 226 | return None, None -------------------------------------------------------------------------------- /utils/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Subtitle export utilities for the OBS Recording Transcriber. 3 | Supports exporting transcripts to SRT, ASS, and WebVTT subtitle formats. 4 | """ 5 | 6 | from pathlib import Path 7 | import re 8 | from datetime import timedelta 9 | import gzip 10 | import zipfile 11 | import logging 12 | 13 | # Configure logging 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def format_timestamp_srt(timestamp_ms): 19 | """ 20 | Format a timestamp in milliseconds to SRT format (HH:MM:SS,mmm). 21 | 22 | Args: 23 | timestamp_ms (int): Timestamp in milliseconds 24 | 25 | Returns: 26 | str: Formatted timestamp string 27 | """ 28 | hours, remainder = divmod(timestamp_ms, 3600000) 29 | minutes, remainder = divmod(remainder, 60000) 30 | seconds, milliseconds = divmod(remainder, 1000) 31 | return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}" 32 | 33 | 34 | def format_timestamp_ass(timestamp_ms): 35 | """ 36 | Format a timestamp in milliseconds to ASS format (H:MM:SS.cc). 
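    Example (1 h, 1 min, 1.5 s):
        >>> format_timestamp_ass(3661500)
        '1:01:01.50'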
37 | 38 | Args: 39 | timestamp_ms (int): Timestamp in milliseconds 40 | 41 | Returns: 42 | str: Formatted timestamp string 43 | """ 44 | hours, remainder = divmod(timestamp_ms, 3600000) 45 | minutes, remainder = divmod(remainder, 60000) 46 | seconds, remainder = divmod(remainder, 1000) 47 | centiseconds = remainder // 10 48 | return f"{int(hours)}:{int(minutes):02d}:{int(seconds):02d}.{int(centiseconds):02d}" 49 | 50 | 51 | def format_timestamp_vtt(timestamp_ms): 52 | """ 53 | Format a timestamp in milliseconds to WebVTT format (HH:MM:SS.mmm). 54 | 55 | Args: 56 | timestamp_ms (int): Timestamp in milliseconds 57 | 58 | Returns: 59 | str: Formatted timestamp string 60 | """ 61 | hours, remainder = divmod(timestamp_ms, 3600000) 62 | minutes, remainder = divmod(remainder, 60000) 63 | seconds, milliseconds = divmod(remainder, 1000) 64 | return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}" 65 | 66 | 67 | def export_to_srt(segments, output_path): 68 | """ 69 | Export transcript segments to SRT format. 70 | 71 | Args: 72 | segments (list): List of transcript segments with start, end, and text 73 | output_path (Path): Path to save the SRT file 74 | 75 | Returns: 76 | Path: Path to the saved SRT file 77 | """ 78 | with open(output_path, 'w', encoding='utf-8') as f: 79 | for i, segment in enumerate(segments, 1): 80 | start_time = format_timestamp_srt(int(segment['start'] * 1000)) 81 | end_time = format_timestamp_srt(int(segment['end'] * 1000)) 82 | 83 | f.write(f"{i}\n") 84 | f.write(f"{start_time} --> {end_time}\n") 85 | f.write(f"{segment['text'].strip()}\n\n") 86 | 87 | return output_path 88 | 89 | 90 | def export_to_ass(segments, output_path, video_width=1920, video_height=1080, style=None): 91 | """ 92 | Export transcript segments to ASS format with styling. 
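    Example (illustrative; the segment values and output path below are placeholders):
        >>> segs = [{"start": 0.0, "end": 2.5, "text": "Hello and welcome."}]
        >>> export_to_ass(segs, Path("episode01.ass"), style={"fontsize": "36"})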
93 | 94 | Args: 95 | segments (list): List of transcript segments with start, end, and text 96 | output_path (Path): Path to save the ASS file 97 | video_width (int): Width of the video in pixels 98 | video_height (int): Height of the video in pixels 99 | style (dict, optional): Custom style parameters 100 | 101 | Returns: 102 | Path: Path to the saved ASS file 103 | """ 104 | # Default style 105 | default_style = { 106 | "fontname": "Arial", 107 | "fontsize": "48", 108 | "primary_color": "&H00FFFFFF", # White 109 | "secondary_color": "&H000000FF", # Blue 110 | "outline_color": "&H00000000", # Black 111 | "back_color": "&H80000000", # Semi-transparent black 112 | "bold": "-1", # True 113 | "italic": "0", # False 114 | "alignment": "2", # Bottom center 115 | } 116 | 117 | # Apply custom style if provided 118 | if style: 119 | default_style.update(style) 120 | 121 | # ASS header template 122 | ass_header = f"""[Script Info] 123 | Title: Transcription 124 | ScriptType: v4.00+ 125 | WrapStyle: 0 126 | PlayResX: {video_width} 127 | PlayResY: {video_height} 128 | ScaledBorderAndShadow: yes 129 | 130 | [V4+ Styles] 131 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 132 | Style: Default,{default_style['fontname']},{default_style['fontsize']},{default_style['primary_color']},{default_style['secondary_color']},{default_style['outline_color']},{default_style['back_color']},{default_style['bold']},{default_style['italic']},0,0,100,100,0,0,1,2,2,{default_style['alignment']},10,10,10,1 133 | 134 | [Events] 135 | Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 136 | """ 137 | 138 | with open(output_path, 'w', encoding='utf-8') as f: 139 | f.write(ass_header) 140 | 141 | for segment in segments: 142 | start_time = format_timestamp_ass(int(segment['start'] * 1000)) 143 | end_time = format_timestamp_ass(int(segment['end'] * 1000)) 144 | text = segment['text'].strip().replace('\n', '\\N') 145 | 146 | f.write(f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{text}\n") 147 | 148 | return output_path 149 | 150 | 151 | def export_to_vtt(segments, output_path): 152 | """ 153 | Export transcript segments to WebVTT format. 154 | 155 | Args: 156 | segments (list): List of transcript segments with start, end, and text 157 | output_path (Path): Path to save the WebVTT file 158 | 159 | Returns: 160 | Path: Path to the saved WebVTT file 161 | """ 162 | with open(output_path, 'w', encoding='utf-8') as f: 163 | # WebVTT header 164 | f.write("WEBVTT\n\n") 165 | 166 | for i, segment in enumerate(segments, 1): 167 | start_time = format_timestamp_vtt(int(segment['start'] * 1000)) 168 | end_time = format_timestamp_vtt(int(segment['end'] * 1000)) 169 | 170 | # Optional cue identifier 171 | f.write(f"{i}\n") 172 | f.write(f"{start_time} --> {end_time}\n") 173 | f.write(f"{segment['text'].strip()}\n\n") 174 | 175 | return output_path 176 | 177 | 178 | def transcript_to_segments(transcript, segment_duration=5.0): 179 | """ 180 | Convert a plain transcript to timed segments for subtitle export. 181 | Used when the original segments are not available. 
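    Example (illustrative):
        >>> segs = transcript_to_segments("First sentence here. Then a second one!")
        >>> # two segments; each duration is estimated from word count (about 2.5 words/s, 2 s minimum)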
182 | 183 | Args: 184 | transcript (str): Full transcript text 185 | segment_duration (float): Duration of each segment in seconds 186 | 187 | Returns: 188 | list: List of segments with start, end, and text 189 | """ 190 | # Split transcript into sentences 191 | sentences = re.split(r'(?<=[.!?])\s+', transcript) 192 | segments = [] 193 | 194 | current_time = 0.0 195 | for sentence in sentences: 196 | if not sentence.strip(): 197 | continue 198 | 199 | # Estimate duration based on word count (approx. 2.5 words per second) 200 | word_count = len(sentence.split()) 201 | duration = max(2.0, word_count / 2.5) 202 | 203 | segments.append({ 204 | 'start': current_time, 205 | 'end': current_time + duration, 206 | 'text': sentence 207 | }) 208 | 209 | current_time += duration 210 | 211 | return segments 212 | 213 | 214 | def compress_file(input_path, compression_type='gzip'): 215 | """ 216 | Compress a file using the specified compression method. 217 | 218 | Args: 219 | input_path (Path): Path to the file to compress 220 | compression_type (str): Type of compression ('gzip' or 'zip') 221 | 222 | Returns: 223 | Path: Path to the compressed file 224 | """ 225 | input_path = Path(input_path) 226 | 227 | if compression_type == 'gzip': 228 | output_path = input_path.with_suffix(input_path.suffix + '.gz') 229 | with open(input_path, 'rb') as f_in: 230 | with gzip.open(output_path, 'wb') as f_out: 231 | f_out.write(f_in.read()) 232 | return output_path 233 | 234 | elif compression_type == 'zip': 235 | output_path = input_path.with_suffix('.zip') 236 | with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf: 237 | zipf.write(input_path, arcname=input_path.name) 238 | return output_path 239 | 240 | else: 241 | logger.warning(f"Unsupported compression type: {compression_type}") 242 | return input_path 243 | 244 | 245 | def export_transcript(transcript, output_path, format_type='srt', segments=None, 246 | compress=False, compression_type='gzip', style=None): 247 | """ 248 | Export transcript to the specified subtitle format. 249 | 250 | Args: 251 | transcript (str): Full transcript text 252 | output_path (Path): Base path for the output file (without extension) 253 | format_type (str): 'srt', 'ass', or 'vtt' 254 | segments (list, optional): List of transcript segments with timing information 255 | compress (bool): Whether to compress the output file 256 | compression_type (str): Type of compression ('gzip' or 'zip') 257 | style (dict, optional): Custom style parameters for ASS format 258 | 259 | Returns: 260 | Path: Path to the saved subtitle file 261 | """ 262 | output_path = Path(output_path) 263 | 264 | # If segments are not provided, create them from the transcript 265 | if segments is None: 266 | segments = transcript_to_segments(transcript) 267 | 268 | if format_type.lower() == 'srt': 269 | output_file = output_path.with_suffix('.srt') 270 | result_path = export_to_srt(segments, output_file) 271 | elif format_type.lower() == 'ass': 272 | output_file = output_path.with_suffix('.ass') 273 | result_path = export_to_ass(segments, output_file, style=style) 274 | elif format_type.lower() == 'vtt': 275 | output_file = output_path.with_suffix('.vtt') 276 | result_path = export_to_vtt(segments, output_file) 277 | else: 278 | raise ValueError(f"Unsupported format type: {format_type}. 
Use 'srt', 'ass', or 'vtt'.") 279 | 280 | # Compress the file if requested 281 | if compress: 282 | result_path = compress_file(result_path, compression_type) 283 | 284 | return result_path -------------------------------------------------------------------------------- /utils/gpu_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPU utilities for the OBS Recording Transcriber. 3 | Provides functions to detect and configure GPU acceleration. 4 | """ 5 | 6 | import logging 7 | import os 8 | import platform 9 | import subprocess 10 | import torch 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def get_gpu_info(): 18 | """ 19 | Get information about available GPUs. 20 | 21 | Returns: 22 | dict: Information about available GPUs 23 | """ 24 | gpu_info = { 25 | "cuda_available": torch.cuda.is_available(), 26 | "cuda_device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0, 27 | "cuda_devices": [], 28 | "mps_available": hasattr(torch.backends, "mps") and torch.backends.mps.is_available() 29 | } 30 | 31 | # Get CUDA device information 32 | if gpu_info["cuda_available"]: 33 | for i in range(gpu_info["cuda_device_count"]): 34 | device_props = torch.cuda.get_device_properties(i) 35 | gpu_info["cuda_devices"].append({ 36 | "index": i, 37 | "name": device_props.name, 38 | "total_memory": device_props.total_memory, 39 | "compute_capability": f"{device_props.major}.{device_props.minor}" 40 | }) 41 | 42 | return gpu_info 43 | 44 | 45 | def get_optimal_device(): 46 | """ 47 | Get the optimal device for computation. 48 | 49 | Returns: 50 | torch.device: The optimal device (cuda, mps, or cpu) 51 | """ 52 | if torch.cuda.is_available(): 53 | # If multiple GPUs are available, select the one with the most memory 54 | if torch.cuda.device_count() > 1: 55 | max_memory = 0 56 | best_device = 0 57 | for i in range(torch.cuda.device_count()): 58 | device_props = torch.cuda.get_device_properties(i) 59 | if device_props.total_memory > max_memory: 60 | max_memory = device_props.total_memory 61 | best_device = i 62 | return torch.device(f"cuda:{best_device}") 63 | return torch.device("cuda:0") 64 | elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): 65 | return torch.device("mps") 66 | else: 67 | return torch.device("cpu") 68 | 69 | 70 | def set_memory_limits(memory_fraction=0.8): 71 | global torch 72 | import torch 73 | """ 74 | Set memory limits for GPU usage. 75 | 76 | Args: 77 | memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0) 78 | 79 | Returns: 80 | bool: True if successful, False otherwise 81 | """ 82 | if not torch.cuda.is_available(): 83 | return False 84 | 85 | try: 86 | # Set memory fraction for each device 87 | for i in range(torch.cuda.device_count()): 88 | torch.cuda.set_per_process_memory_fraction(memory_fraction, i) 89 | 90 | return True 91 | except Exception as e: 92 | logger.error(f"Error setting memory limits: {e}") 93 | return False 94 | 95 | 96 | def optimize_for_inference(): 97 | """ 98 | Apply optimizations for inference. 
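    Example (illustrative):
        >>> optimize_for_inference()
        >>> # note: this disables gradient tracking process-wide, so it is only
        >>> # appropriate in inference-only workflows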
99 | 100 | Returns: 101 | bool: True if successful, False otherwise 102 | """ 103 | try: 104 | # Set deterministic algorithms for reproducibility 105 | torch.backends.cudnn.deterministic = True 106 | 107 | # Enable cuDNN benchmark mode for optimized performance 108 | torch.backends.cudnn.benchmark = True 109 | 110 | # Disable gradient calculation for inference 111 | torch.set_grad_enabled(False) 112 | 113 | return True 114 | except Exception as e: 115 | logger.error(f"Error optimizing for inference: {e}") 116 | return False 117 | 118 | 119 | def get_recommended_batch_size(model_size="base"): 120 | """ 121 | Get recommended batch size based on available GPU memory. 122 | 123 | Args: 124 | model_size (str): Size of the model (tiny, base, small, medium, large) 125 | 126 | Returns: 127 | int: Recommended batch size 128 | """ 129 | # Default batch sizes for CPU 130 | default_batch_sizes = { 131 | "tiny": 16, 132 | "base": 8, 133 | "small": 4, 134 | "medium": 2, 135 | "large": 1 136 | } 137 | 138 | # If CUDA is not available, return default CPU batch size 139 | if not torch.cuda.is_available(): 140 | return default_batch_sizes.get(model_size, 1) 141 | 142 | # Approximate memory requirements in GB for different model sizes 143 | memory_requirements = { 144 | "tiny": 1, 145 | "base": 2, 146 | "small": 4, 147 | "medium": 8, 148 | "large": 16 149 | } 150 | 151 | # Get available GPU memory 152 | device = get_optimal_device() 153 | if device.type == "cuda": 154 | device_idx = device.index 155 | device_props = torch.cuda.get_device_properties(device_idx) 156 | available_memory_gb = device_props.total_memory / (1024 ** 3) 157 | 158 | # Calculate batch size based on available memory 159 | model_memory = memory_requirements.get(model_size, 2) 160 | max_batch_size = int(available_memory_gb / model_memory) 161 | 162 | # Ensure batch size is at least 1 163 | return max(1, max_batch_size) 164 | 165 | # For MPS or other devices, return default 166 | return default_batch_sizes.get(model_size, 1) 167 | 168 | 169 | def configure_gpu(model_size="base", memory_fraction=0.8): 170 | """ 171 | Configure GPU settings for optimal performance. 172 | 173 | Args: 174 | model_size (str): Size of the model (tiny, base, small, medium, large) 175 | memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0) 176 | 177 | Returns: 178 | dict: Configuration information 179 | """ 180 | gpu_info = get_gpu_info() 181 | device = get_optimal_device() 182 | 183 | # Set memory limits if using CUDA 184 | if device.type == "cuda": 185 | set_memory_limits(memory_fraction) 186 | 187 | # Apply inference optimizations 188 | optimize_for_inference() 189 | 190 | # Get recommended batch size 191 | batch_size = get_recommended_batch_size(model_size) 192 | 193 | config = { 194 | "device": device, 195 | "batch_size": batch_size, 196 | "gpu_info": gpu_info, 197 | "memory_fraction": memory_fraction if device.type == "cuda" else None 198 | } 199 | 200 | logger.info(f"GPU configuration: Using {device} with batch size {batch_size}") 201 | return config -------------------------------------------------------------------------------- /utils/keyword_extraction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Keyword extraction utilities for the OBS Recording Transcriber. 3 | Provides functions to extract keywords and link them to timestamps. 
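Typical flow (illustrative sketch; ``transcript`` and ``segments`` come from utils.transcription):

    keywords, entities = extract_keywords_from_transcript(transcript, segments)
    index_md = generate_keyword_index(keywords, entities)
    html = generate_interactive_transcript(segments, keywords, entities)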
4 | """ 5 | 6 | import logging 7 | import re 8 | import torch 9 | import numpy as np 10 | from pathlib import Path 11 | from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification 12 | from sklearn.feature_extraction.text import TfidfVectorizer 13 | from collections import Counter 14 | 15 | # Configure logging 16 | logging.basicConfig(level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | # Try to import GPU utilities, but don't fail if not available 20 | try: 21 | from utils.gpu_utils import get_optimal_device 22 | GPU_UTILS_AVAILABLE = True 23 | except ImportError: 24 | GPU_UTILS_AVAILABLE = False 25 | 26 | # Default models 27 | NER_MODEL = "dslim/bert-base-NER" 28 | 29 | 30 | def extract_keywords_tfidf(text, max_keywords=10, ngram_range=(1, 2)): 31 | """ 32 | Extract keywords using TF-IDF. 33 | 34 | Args: 35 | text (str): Text to extract keywords from 36 | max_keywords (int): Maximum number of keywords to extract 37 | ngram_range (tuple): Range of n-grams to consider 38 | 39 | Returns: 40 | list: List of (keyword, score) tuples 41 | """ 42 | try: 43 | # Preprocess text 44 | text = text.lower() 45 | 46 | # Remove common stopwords 47 | stopwords = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what', 48 | 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those', 49 | 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for', 50 | 'is', 'of', 'while', 'during', 'to', 'from', 'in', 'out', 'on', 'off', 'by'} 51 | 52 | # Create sentences for better TF-IDF analysis 53 | sentences = re.split(r'[.!?]', text) 54 | sentences = [s.strip() for s in sentences if s.strip()] 55 | 56 | if not sentences: 57 | return [] 58 | 59 | # Apply TF-IDF 60 | vectorizer = TfidfVectorizer( 61 | max_features=100, 62 | stop_words=stopwords, 63 | ngram_range=ngram_range 64 | ) 65 | 66 | try: 67 | tfidf_matrix = vectorizer.fit_transform(sentences) 68 | feature_names = vectorizer.get_feature_names_out() 69 | 70 | # Calculate average TF-IDF score across all sentences 71 | avg_tfidf = np.mean(tfidf_matrix.toarray(), axis=0) 72 | 73 | # Get top keywords 74 | keywords = [(feature_names[i], avg_tfidf[i]) for i in avg_tfidf.argsort()[::-1]] 75 | 76 | # Filter out single-character keywords and limit to max_keywords 77 | keywords = [(k, s) for k, s in keywords if len(k) > 1][:max_keywords] 78 | 79 | return keywords 80 | except ValueError as e: 81 | logger.warning(f"TF-IDF extraction failed: {e}") 82 | return [] 83 | 84 | except Exception as e: 85 | logger.error(f"Error extracting keywords with TF-IDF: {e}") 86 | return [] 87 | 88 | 89 | def extract_named_entities(text, model=NER_MODEL, use_gpu=True): 90 | """ 91 | Extract named entities from text. 
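    Example (illustrative; the exact spans and labels depend on the NER model):
        >>> extract_named_entities("OBS Studio was demoed in Berlin by Jane Doe.")
        >>> # e.g. [("OBS Studio", "ORG"), ("Berlin", "LOC"), ("Jane Doe", "PER")]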
92 | 93 | Args: 94 | text (str): Text to extract entities from 95 | model (str): Model to use for NER 96 | use_gpu (bool): Whether to use GPU acceleration if available 97 | 98 | Returns: 99 | list: List of (entity, type) tuples 100 | """ 101 | # Configure device 102 | device = torch.device("cpu") 103 | if use_gpu and GPU_UTILS_AVAILABLE: 104 | device = get_optimal_device() 105 | device_arg = 0 if device.type == "cuda" else -1 106 | else: 107 | device_arg = -1 108 | 109 | try: 110 | # Initialize the pipeline 111 | ner_pipeline = pipeline("ner", model=model, device=device_arg, aggregation_strategy="simple") 112 | 113 | # Split text into manageable chunks if too long 114 | max_length = 512 115 | if len(text) > max_length: 116 | chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)] 117 | else: 118 | chunks = [text] 119 | 120 | # Process each chunk 121 | all_entities = [] 122 | for chunk in chunks: 123 | entities = ner_pipeline(chunk) 124 | all_entities.extend(entities) 125 | 126 | # Extract entity text and type 127 | entity_info = [(entity["word"], entity["entity_group"]) for entity in all_entities] 128 | 129 | return entity_info 130 | except Exception as e: 131 | logger.error(f"Error extracting named entities: {e}") 132 | return [] 133 | 134 | 135 | def find_keyword_timestamps(segments, keywords): 136 | """ 137 | Find timestamps for keywords in transcript segments. 138 | 139 | Args: 140 | segments (list): List of transcript segments with timing info 141 | keywords (list): List of keywords to find 142 | 143 | Returns: 144 | dict: Dictionary mapping keywords to lists of timestamps 145 | """ 146 | keyword_timestamps = {} 147 | 148 | # Convert keywords to lowercase for case-insensitive matching 149 | if isinstance(keywords[0], tuple): 150 | # If keywords is a list of (keyword, score) tuples 151 | keywords_lower = [k.lower() for k, _ in keywords] 152 | else: 153 | # If keywords is just a list of keywords 154 | keywords_lower = [k.lower() for k in keywords] 155 | 156 | # Process each segment 157 | for segment in segments: 158 | segment_text = segment["text"].lower() 159 | start_time = segment["start"] 160 | end_time = segment["end"] 161 | 162 | # Check each keyword 163 | for i, keyword in enumerate(keywords_lower): 164 | if keyword in segment_text: 165 | # Get the original case of the keyword 166 | original_keyword = keywords[i][0] if isinstance(keywords[0], tuple) else keywords[i] 167 | 168 | # Initialize the list if this is the first occurrence 169 | if original_keyword not in keyword_timestamps: 170 | keyword_timestamps[original_keyword] = [] 171 | 172 | # Add the timestamp 173 | keyword_timestamps[original_keyword].append({ 174 | "start": start_time, 175 | "end": end_time, 176 | "context": segment["text"] 177 | }) 178 | 179 | return keyword_timestamps 180 | 181 | 182 | def extract_keywords_from_transcript(transcript, segments, max_keywords=15, use_gpu=True): 183 | """ 184 | Extract keywords from transcript and link them to timestamps. 
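    Example (illustrative; ``transcript`` and ``segments`` come from a prior transcription step):
        >>> keywords, entities = extract_keywords_from_transcript(transcript, segments, max_keywords=10)
        >>> # each value is a list of {"start", "end", "context"} dicts, one per segment
        >>> # in which the keyword or entity occurs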
185 | 186 | Args: 187 | transcript (str): Full transcript text 188 | segments (list): List of transcript segments with timing info 189 | max_keywords (int): Maximum number of keywords to extract 190 | use_gpu (bool): Whether to use GPU acceleration if available 191 | 192 | Returns: 193 | tuple: (keyword_timestamps, entities_with_timestamps) 194 | """ 195 | try: 196 | # Extract keywords using TF-IDF 197 | tfidf_keywords = extract_keywords_tfidf(transcript, max_keywords=max_keywords) 198 | 199 | # Extract named entities 200 | entities = extract_named_entities(transcript, use_gpu=use_gpu) 201 | 202 | # Count entity occurrences and get the most frequent ones 203 | entity_counter = Counter([entity for entity, _ in entities]) 204 | top_entities = [(entity, count) for entity, count in entity_counter.most_common(max_keywords)] 205 | 206 | # Find timestamps for keywords and entities 207 | keyword_timestamps = find_keyword_timestamps(segments, tfidf_keywords) 208 | entity_timestamps = find_keyword_timestamps(segments, top_entities) 209 | 210 | return keyword_timestamps, entity_timestamps 211 | 212 | except Exception as e: 213 | logger.error(f"Error extracting keywords from transcript: {e}") 214 | return {}, {} 215 | 216 | 217 | def generate_keyword_index(keyword_timestamps, entity_timestamps=None): 218 | """ 219 | Generate a keyword index with timestamps. 220 | 221 | Args: 222 | keyword_timestamps (dict): Dictionary mapping keywords to timestamp lists 223 | entity_timestamps (dict, optional): Dictionary mapping entities to timestamp lists 224 | 225 | Returns: 226 | str: Formatted keyword index 227 | """ 228 | lines = ["# Keyword Index\n"] 229 | 230 | # Add keywords section 231 | if keyword_timestamps: 232 | lines.append("## Keywords\n") 233 | for keyword, timestamps in sorted(keyword_timestamps.items()): 234 | if timestamps: 235 | times = [f"{int(ts['start'] // 60):02d}:{int(ts['start'] % 60):02d}" for ts in timestamps] 236 | lines.append(f"- **{keyword}**: {', '.join(times)}\n") 237 | 238 | # Add entities section 239 | if entity_timestamps: 240 | lines.append("\n## Named Entities\n") 241 | for entity, timestamps in sorted(entity_timestamps.items()): 242 | if timestamps: 243 | times = [f"{int(ts['start'] // 60):02d}:{int(ts['start'] % 60):02d}" for ts in timestamps] 244 | lines.append(f"- **{entity}**: {', '.join(times)}\n") 245 | 246 | return "".join(lines) 247 | 248 | 249 | def generate_interactive_transcript(segments, keyword_timestamps=None, entity_timestamps=None): 250 | """ 251 | Generate an interactive transcript with keyword highlighting. 252 | 253 | Args: 254 | segments (list): List of transcript segments with timing info 255 | keyword_timestamps (dict, optional): Dictionary mapping keywords to timestamp lists 256 | entity_timestamps (dict, optional): Dictionary mapping entities to timestamp lists 257 | 258 | Returns: 259 | str: HTML formatted interactive transcript 260 | """ 261 | # Combine keywords and entities 262 | all_keywords = {} 263 | if keyword_timestamps: 264 | all_keywords.update(keyword_timestamps) 265 | if entity_timestamps: 266 | all_keywords.update(entity_timestamps) 267 | 268 | # Generate HTML 269 | html = ["
"] 270 | 271 | for segment in segments: 272 | start_time = segment["start"] 273 | end_time = segment["end"] 274 | text = segment["text"] 275 | 276 | # Format timestamp 277 | timestamp = f"{int(start_time // 60):02d}:{int(start_time % 60):02d}" 278 | 279 | # Add speaker if available 280 | speaker = segment.get("speaker", "") 281 | speaker_html = f"[{speaker}] " if speaker else "" 282 | 283 | # Highlight keywords in text 284 | highlighted_text = text 285 | for keyword in all_keywords: 286 | # Use regex to match whole words only 287 | pattern = r'\b' + re.escape(keyword) + r'\b' 288 | replacement = f"{keyword}" 289 | highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.IGNORECASE) 290 | 291 | # Add segment to HTML 292 | html.append(f"
<div class='segment' data-start='{start_time}' data-end='{end_time}'>
") 293 | html.append(f"{timestamp} {speaker_html}{highlighted_text}") 294 | html.append("
</div>
") 295 | 296 | html.append("
") 297 | 298 | return "\n".join(html) 299 | 300 | 301 | def create_keyword_cloud_data(keyword_timestamps, entity_timestamps=None): 302 | """ 303 | Create data for a keyword cloud visualization. 304 | 305 | Args: 306 | keyword_timestamps (dict): Dictionary mapping keywords to timestamp lists 307 | entity_timestamps (dict, optional): Dictionary mapping entities to timestamp lists 308 | 309 | Returns: 310 | list: List of (keyword, weight) tuples for visualization 311 | """ 312 | cloud_data = [] 313 | 314 | # Process keywords 315 | for keyword, timestamps in keyword_timestamps.items(): 316 | weight = len(timestamps) # Weight by occurrence count 317 | cloud_data.append((keyword, weight)) 318 | 319 | # Process entities if provided 320 | if entity_timestamps: 321 | for entity, timestamps in entity_timestamps.items(): 322 | weight = len(timestamps) * 1.5 # Give entities slightly higher weight 323 | cloud_data.append((entity, weight)) 324 | 325 | return cloud_data -------------------------------------------------------------------------------- /utils/ollama_integration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ollama integration for local AI model inference. 3 | Provides functions to use Ollama's API for text summarization. 4 | """ 5 | 6 | import requests 7 | import json 8 | import logging 9 | from pathlib import Path 10 | import os 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | # Default Ollama API endpoint 17 | OLLAMA_API_URL = "http://localhost:11434/api" 18 | 19 | 20 | def check_ollama_available(): 21 | """ 22 | Check if Ollama service is available. 23 | 24 | Returns: 25 | bool: True if Ollama is available, False otherwise 26 | """ 27 | try: 28 | response = requests.get(f"{OLLAMA_API_URL}/tags", timeout=2) 29 | return response.status_code == 200 30 | except requests.exceptions.RequestException: 31 | return False 32 | 33 | 34 | def list_available_models(): 35 | """ 36 | List available models in Ollama. 37 | 38 | Returns: 39 | list: List of available model names 40 | """ 41 | try: 42 | response = requests.get(f"{OLLAMA_API_URL}/tags") 43 | if response.status_code == 200: 44 | models = response.json().get('models', []) 45 | return [model['name'] for model in models] 46 | return [] 47 | except requests.exceptions.RequestException as e: 48 | logger.error(f"Error listing Ollama models: {e}") 49 | return [] 50 | 51 | 52 | def summarize_with_ollama(text, model="llama3", max_length=150): 53 | """ 54 | Summarize text using Ollama's local API. 55 | 56 | Args: 57 | text (str): Text to summarize 58 | model (str): Ollama model to use 59 | max_length (int): Maximum length of the summary 60 | 61 | Returns: 62 | str: Summarized text or None if failed 63 | """ 64 | if not check_ollama_available(): 65 | logger.warning("Ollama service is not available") 66 | return None 67 | 68 | # Check if the model is available 69 | available_models = list_available_models() 70 | if model not in available_models: 71 | logger.warning(f"Model {model} not available in Ollama. 
Available models: {available_models}") 72 | return None 73 | 74 | # Prepare the prompt for summarization 75 | prompt = f"Summarize the following text in about {max_length} words:\n\n{text}" 76 | 77 | try: 78 | # Make the API request 79 | response = requests.post( 80 | f"{OLLAMA_API_URL}/generate", 81 | json={ 82 | "model": model, 83 | "prompt": prompt, 84 | "stream": False, 85 | "options": { 86 | "temperature": 0.3, 87 | "top_p": 0.9, 88 | "max_tokens": max_length * 2 # Approximate token count 89 | } 90 | } 91 | ) 92 | 93 | if response.status_code == 200: 94 | result = response.json() 95 | return result.get('response', '').strip() 96 | else: 97 | logger.error(f"Ollama API error: {response.status_code} - {response.text}") 98 | return None 99 | except requests.exceptions.RequestException as e: 100 | logger.error(f"Error communicating with Ollama: {e}") 101 | return None 102 | 103 | 104 | def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150): 105 | """ 106 | Chunk long text and summarize each chunk, then combine the summaries. 107 | 108 | Args: 109 | text (str): Text to summarize 110 | model (str): Ollama model to use 111 | chunk_size (int): Maximum size of each chunk in characters 112 | max_length (int): Maximum length of the final summary 113 | 114 | Returns: 115 | str: Combined summary or None if failed 116 | """ 117 | if len(text) <= chunk_size: 118 | return summarize_with_ollama(text, model, max_length) 119 | 120 | # Split text into chunks 121 | words = text.split() 122 | chunks = [] 123 | current_chunk = [] 124 | current_length = 0 125 | 126 | for word in words: 127 | if current_length + len(word) + 1 <= chunk_size: 128 | current_chunk.append(word) 129 | current_length += len(word) + 1 130 | else: 131 | chunks.append(' '.join(current_chunk)) 132 | current_chunk = [word] 133 | current_length = len(word) + 1 134 | 135 | if current_chunk: 136 | chunks.append(' '.join(current_chunk)) 137 | 138 | # Summarize each chunk 139 | chunk_summaries = [] 140 | for i, chunk in enumerate(chunks): 141 | logger.info(f"Summarizing chunk {i+1}/{len(chunks)}") 142 | summary = summarize_with_ollama(chunk, model, max_length // len(chunks)) 143 | if summary: 144 | chunk_summaries.append(summary) 145 | 146 | if not chunk_summaries: 147 | return None 148 | 149 | # If there's only one chunk summary, return it 150 | if len(chunk_summaries) == 1: 151 | return chunk_summaries[0] 152 | 153 | # Otherwise, combine the summaries and summarize again 154 | combined_summary = " ".join(chunk_summaries) 155 | return summarize_with_ollama(combined_summary, model, max_length) -------------------------------------------------------------------------------- /utils/summarization.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline, AutoTokenizer 2 | import torch 3 | import logging 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | SUMMARY_MODEL = "Falconsai/text_summarization" 10 | 11 | def chunk_text(text, max_tokens, tokenizer): 12 | """ 13 | Splits the text into a list of chunks based on token limits. 
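    Example (illustrative; ``long_transcript`` is a placeholder string):
        >>> tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
        >>> chunks = chunk_text(long_transcript, max_tokens=512, tokenizer=tokenizer)
        >>> # each chunk is kept within the model's 512-token input window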
14 | 15 | Args: 16 | text (str): Text to chunk 17 | max_tokens (int): Maximum tokens per chunk 18 | tokenizer (AutoTokenizer): Tokenizer to use 19 | 20 | Returns: 21 | list: List of text chunks 22 | """ 23 | words = text.split() 24 | chunks = [] 25 | current_chunk = [] 26 | current_length = 0 27 | 28 | for word in words: 29 | hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2 30 | if hypothetical_length <= max_tokens: 31 | current_chunk.append(word) 32 | current_length = hypothetical_length 33 | else: 34 | chunks.append(' '.join(current_chunk)) 35 | current_chunk = [word] 36 | current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2 37 | 38 | if current_chunk: 39 | chunks.append(' '.join(current_chunk)) 40 | 41 | return chunks 42 | 43 | def summarize_text(text, use_gpu=True, memory_fraction=0.8): 44 | """ 45 | Summarize text using a Hugging Face pipeline with chunking support. 46 | 47 | Args: 48 | text (str): Text to summarize 49 | use_gpu (bool): Whether to use GPU if available 50 | memory_fraction (float): Fraction of GPU memory to use 51 | 52 | Returns: 53 | str: Summarized text 54 | """ 55 | # Determine device 56 | device = -1 # Default to CPU 57 | if use_gpu and torch.cuda.is_available(): 58 | device = 0 # Use first GPU 59 | if torch.cuda.is_available(): 60 | torch.cuda.set_per_process_memory_fraction(memory_fraction) 61 | 62 | logger.info(f"Using device {device} for summarization") 63 | 64 | try: 65 | # Initialize the pipeline and tokenizer 66 | summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device) 67 | tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL) 68 | 69 | # Check if text needs to be chunked 70 | max_tokens = 512 71 | tokens = tokenizer(text, return_tensors='pt') 72 | num_tokens = len(tokens['input_ids'][0]) 73 | 74 | if num_tokens > max_tokens: 75 | chunks = chunk_text(text, max_tokens, tokenizer) 76 | summaries = [] 77 | 78 | for i, chunk in enumerate(chunks): 79 | logger.info(f"Summarizing chunk {i+1}/{len(chunks)}") 80 | summary_output = summarizer( 81 | "summarize: " + chunk, 82 | max_length=150, 83 | min_length=30, 84 | do_sample=False 85 | ) 86 | summaries.append(summary_output[0]['summary_text']) 87 | 88 | # If multiple chunks, summarize the combined summaries 89 | if len(summaries) > 1: 90 | logger.info("Generating final summary from chunk summaries") 91 | combined_text = " ".join(summaries) 92 | return summarizer( 93 | "summarize: " + combined_text, 94 | max_length=150, 95 | min_length=30, 96 | do_sample=False 97 | )[0]['summary_text'] 98 | return summaries[0] 99 | else: 100 | return summarizer( 101 | "summarize: " + text, 102 | max_length=150, 103 | min_length=30, 104 | do_sample=False 105 | )[0]['summary_text'] 106 | 107 | except Exception as e: 108 | logger.error(f"Error during summarization: {e}") 109 | # Fallback to CPU if GPU fails 110 | if device != -1: 111 | logger.info("Falling back to CPU") 112 | return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction) 113 | raise 114 | -------------------------------------------------------------------------------- /utils/transcription.py: -------------------------------------------------------------------------------- 1 | import whisper 2 | from pathlib import Path 3 | from transformers import pipeline, AutoTokenizer 4 | from utils.audio_processing import extract_audio 5 | from utils.summarization import summarize_text 6 | import logging 7 | import torch 8 | 9 | # Try to import GPU utilities, but don't 
fail if not available 10 | try: 11 | from utils.gpu_utils import configure_gpu, get_optimal_device 12 | GPU_UTILS_AVAILABLE = True 13 | except ImportError: 14 | GPU_UTILS_AVAILABLE = False 15 | 16 | # Try to import caching utilities, but don't fail if not available 17 | try: 18 | from utils.cache import load_from_cache, save_to_cache 19 | CACHE_AVAILABLE = True 20 | except ImportError: 21 | CACHE_AVAILABLE = False 22 | 23 | # Configure logging 24 | logging.basicConfig(level=logging.INFO) 25 | logger = logging.getLogger(__name__) 26 | 27 | WHISPER_MODEL = "base" 28 | 29 | def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None, 30 | use_gpu=True, memory_fraction=0.8): 31 | """ 32 | Transcribe audio using Whisper and return both segments and full transcript. 33 | 34 | Args: 35 | audio_path (Path): Path to the audio or video file 36 | model (str): Whisper model size to use (tiny, base, small, medium, large) 37 | use_cache (bool): Whether to use caching 38 | cache_max_age (float, optional): Maximum age of cache in seconds 39 | use_gpu (bool): Whether to use GPU acceleration if available 40 | memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0) 41 | 42 | Returns: 43 | tuple: (segments, transcript) where segments is a list of dicts with timing info 44 | """ 45 | audio_path = Path(audio_path) 46 | 47 | # Check cache first if enabled 48 | if use_cache and CACHE_AVAILABLE: 49 | cached_data = load_from_cache(audio_path, model, "transcribe", cache_max_age) 50 | if cached_data: 51 | logger.info(f"Using cached transcription for {audio_path}") 52 | return cached_data.get("segments", []), cached_data.get("transcript", "") 53 | 54 | # Extract audio if the input is a video file 55 | if audio_path.suffix.lower() in ['.mp4', '.avi', '.mov', '.mkv']: 56 | audio_path = extract_audio(audio_path) 57 | 58 | # Configure GPU if available and requested 59 | device = torch.device("cpu") 60 | if use_gpu and GPU_UTILS_AVAILABLE: 61 | gpu_config = configure_gpu(model, memory_fraction) 62 | device = gpu_config["device"] 63 | logger.info(f"Using device: {device} for transcription") 64 | 65 | # Load the specified Whisper model 66 | logger.info(f"Loading Whisper model: {model}") 67 | whisper_model = whisper.load_model(model, device=device if device.type != "mps" else "cpu") 68 | 69 | # Transcribe the audio 70 | logger.info(f"Transcribing audio: {audio_path}") 71 | result = whisper_model.transcribe(str(audio_path)) 72 | 73 | # Extract the full transcript and segments 74 | transcript = result["text"] 75 | segments = result["segments"] 76 | 77 | # Cache the results if caching is enabled 78 | if use_cache and CACHE_AVAILABLE: 79 | cache_data = { 80 | "transcript": transcript, 81 | "segments": segments 82 | } 83 | save_to_cache(audio_path, cache_data, model, "transcribe") 84 | 85 | return segments, transcript -------------------------------------------------------------------------------- /utils/translation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Translation utilities for the OBS Recording Transcriber. 3 | Provides functions for language detection and translation. 
4 | """ 5 | 6 | import logging 7 | import torch 8 | from pathlib import Path 9 | from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, M2M100ForConditionalGeneration 10 | import whisper 11 | import iso639 12 | 13 | # Configure logging 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | # Try to import GPU utilities, but don't fail if not available 18 | try: 19 | from utils.gpu_utils import get_optimal_device 20 | GPU_UTILS_AVAILABLE = True 21 | except ImportError: 22 | GPU_UTILS_AVAILABLE = False 23 | 24 | # Default models 25 | TRANSLATION_MODEL = "facebook/m2m100_418M" 26 | LANGUAGE_DETECTION_MODEL = "papluca/xlm-roberta-base-language-detection" 27 | 28 | # ISO language code mapping 29 | def get_language_name(code): 30 | """ 31 | Get the language name from ISO code. 32 | 33 | Args: 34 | code (str): ISO language code 35 | 36 | Returns: 37 | str: Language name or original code if not found 38 | """ 39 | try: 40 | return iso639.languages.get(part1=code).name 41 | except (KeyError, AttributeError): 42 | try: 43 | return iso639.languages.get(part2b=code).name 44 | except (KeyError, AttributeError): 45 | return code 46 | 47 | 48 | def detect_language(text, model=LANGUAGE_DETECTION_MODEL, use_gpu=True): 49 | """ 50 | Detect the language of a text. 51 | 52 | Args: 53 | text (str): Text to detect language for 54 | model (str): Model to use for language detection 55 | use_gpu (bool): Whether to use GPU acceleration if available 56 | 57 | Returns: 58 | tuple: (language_code, confidence) 59 | """ 60 | # Configure device 61 | device = torch.device("cpu") 62 | if use_gpu and GPU_UTILS_AVAILABLE: 63 | device = get_optimal_device() 64 | device_arg = 0 if device.type == "cuda" else -1 65 | else: 66 | device_arg = -1 67 | 68 | try: 69 | # Initialize the pipeline 70 | classifier = pipeline("text-classification", model=model, device=device_arg) 71 | 72 | # Truncate text if too long 73 | max_length = 512 74 | if len(text) > max_length: 75 | text = text[:max_length] 76 | 77 | # Detect language 78 | result = classifier(text)[0] 79 | language_code = result["label"] 80 | confidence = result["score"] 81 | 82 | return language_code, confidence 83 | except Exception as e: 84 | logger.error(f"Error detecting language: {e}") 85 | return None, 0.0 86 | 87 | 88 | def translate_text(text, source_lang=None, target_lang="en", model=TRANSLATION_MODEL, use_gpu=True): 89 | """ 90 | Translate text from source language to target language. 
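    Example (illustrative):
        >>> translate_text("Guten Morgen, willkommen zum Stream.", target_lang="en")
        >>> # the source language is auto-detected (German here) and the English
        >>> # translation is returned; the original text is returned on failure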
91 | 92 | Args: 93 | text (str): Text to translate 94 | source_lang (str, optional): Source language code (auto-detect if None) 95 | target_lang (str): Target language code 96 | model (str): Model to use for translation 97 | use_gpu (bool): Whether to use GPU acceleration if available 98 | 99 | Returns: 100 | str: Translated text 101 | """ 102 | # Auto-detect source language if not provided 103 | if source_lang is None: 104 | detected_lang, confidence = detect_language(text, use_gpu=use_gpu) 105 | if detected_lang and confidence > 0.5: 106 | source_lang = detected_lang 107 | logger.info(f"Detected language: {get_language_name(source_lang)} ({source_lang}) with confidence {confidence:.2f}") 108 | else: 109 | logger.warning("Could not reliably detect language, defaulting to English") 110 | source_lang = "en" 111 | 112 | # Skip translation if source and target are the same 113 | if source_lang == target_lang: 114 | logger.info(f"Source and target languages are the same ({source_lang}), skipping translation") 115 | return text 116 | 117 | # Configure device 118 | device = torch.device("cpu") 119 | if use_gpu and GPU_UTILS_AVAILABLE: 120 | device = get_optimal_device() 121 | 122 | try: 123 | # Load model and tokenizer 124 | tokenizer = AutoTokenizer.from_pretrained(model) 125 | model = M2M100ForConditionalGeneration.from_pretrained(model) 126 | 127 | # Move model to device 128 | model = model.to(device) 129 | 130 | # Prepare for translation 131 | tokenizer.src_lang = source_lang 132 | 133 | # Split text into manageable chunks if too long 134 | max_length = 512 135 | if len(text) > max_length: 136 | chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)] 137 | else: 138 | chunks = [text] 139 | 140 | # Translate each chunk 141 | translated_chunks = [] 142 | for chunk in chunks: 143 | encoded = tokenizer(chunk, return_tensors="pt").to(device) 144 | generated_tokens = model.generate( 145 | **encoded, 146 | forced_bos_token_id=tokenizer.get_lang_id(target_lang), 147 | max_length=max_length 148 | ) 149 | translated_chunk = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] 150 | translated_chunks.append(translated_chunk) 151 | 152 | # Combine translated chunks 153 | translated_text = " ".join(translated_chunks) 154 | 155 | return translated_text 156 | except Exception as e: 157 | logger.error(f"Error translating text: {e}") 158 | return text 159 | 160 | 161 | def translate_segments(segments, source_lang=None, target_lang="en", use_gpu=True): 162 | """ 163 | Translate transcript segments. 
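    Example (illustrative; ``segments`` comes from utils.transcription.transcribe_audio):
        >>> translated = translate_segments(segments, target_lang="es")
        >>> # each returned segment keeps its timing and gains "original_text",
        >>> # "source_lang" and "target_lang" fields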
164 | 165 | Args: 166 | segments (list): List of transcript segments 167 | source_lang (str, optional): Source language code (auto-detect if None) 168 | target_lang (str): Target language code 169 | use_gpu (bool): Whether to use GPU acceleration if available 170 | 171 | Returns: 172 | list: Translated segments 173 | """ 174 | if not segments: 175 | return [] 176 | 177 | # Auto-detect source language from combined text if not provided 178 | if source_lang is None: 179 | combined_text = " ".join([segment["text"] for segment in segments]) 180 | detected_lang, _ = detect_language(combined_text, use_gpu=use_gpu) 181 | source_lang = detected_lang if detected_lang else "en" 182 | 183 | # Skip translation if source and target are the same 184 | if source_lang == target_lang: 185 | return segments 186 | 187 | try: 188 | # Initialize translation pipeline 189 | translated_segments = [] 190 | 191 | # Translate each segment 192 | for segment in segments: 193 | translated_text = translate_text( 194 | segment["text"], 195 | source_lang=source_lang, 196 | target_lang=target_lang, 197 | use_gpu=use_gpu 198 | ) 199 | 200 | # Create a new segment with translated text 201 | translated_segment = segment.copy() 202 | translated_segment["text"] = translated_text 203 | translated_segment["original_text"] = segment["text"] 204 | translated_segment["source_lang"] = source_lang 205 | translated_segment["target_lang"] = target_lang 206 | 207 | translated_segments.append(translated_segment) 208 | 209 | return translated_segments 210 | except Exception as e: 211 | logger.error(f"Error translating segments: {e}") 212 | return segments 213 | 214 | 215 | def transcribe_and_translate(audio_path, whisper_model="base", target_lang="en", 216 | use_gpu=True, detect_source=True): 217 | """ 218 | Transcribe audio and translate to target language. 
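    Example (illustrative; the recording path is a placeholder):
        >>> orig_segs, trans_segs, orig_text, trans_text = transcribe_and_translate(
        ...     "stream_recording.mp4", whisper_model="base", target_lang="en")
        >>> # all four values are None if transcription or translation raised an error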
219 | 220 | Args: 221 | audio_path (Path): Path to the audio file 222 | whisper_model (str): Whisper model size to use 223 | target_lang (str): Target language code 224 | use_gpu (bool): Whether to use GPU acceleration if available 225 | detect_source (bool): Whether to auto-detect source language 226 | 227 | Returns: 228 | tuple: (original_segments, translated_segments, original_transcript, translated_transcript) 229 | """ 230 | audio_path = Path(audio_path) 231 | 232 | # Configure device 233 | device = torch.device("cpu") 234 | if use_gpu and GPU_UTILS_AVAILABLE: 235 | device = get_optimal_device() 236 | 237 | try: 238 | # Step 1: Transcribe audio with Whisper 239 | logger.info(f"Transcribing audio with Whisper model: {whisper_model}") 240 | model = whisper.load_model(whisper_model, device=device if device.type != "mps" else "cpu") 241 | 242 | # Use Whisper's built-in language detection if requested 243 | if detect_source: 244 | # First, detect language with Whisper 245 | audio = whisper.load_audio(str(audio_path)) 246 | audio = whisper.pad_or_trim(audio) 247 | mel = whisper.log_mel_spectrogram(audio).to(device if device.type != "mps" else "cpu") 248 | _, probs = model.detect_language(mel) 249 | source_lang = max(probs, key=probs.get) 250 | logger.info(f"Whisper detected language: {get_language_name(source_lang)} ({source_lang})") 251 | 252 | # Transcribe with detected language 253 | result = model.transcribe(str(audio_path), language=source_lang) 254 | else: 255 | # Transcribe without language specification 256 | result = model.transcribe(str(audio_path)) 257 | source_lang = result.get("language", "en") 258 | 259 | original_segments = result["segments"] 260 | original_transcript = result["text"] 261 | 262 | # Step 2: Translate if needed 263 | if source_lang != target_lang: 264 | logger.info(f"Translating from {source_lang} to {target_lang}") 265 | translated_segments = translate_segments( 266 | original_segments, 267 | source_lang=source_lang, 268 | target_lang=target_lang, 269 | use_gpu=use_gpu 270 | ) 271 | 272 | # Create full translated transcript 273 | translated_transcript = " ".join([segment["text"] for segment in translated_segments]) 274 | else: 275 | logger.info(f"Source and target languages are the same ({source_lang}), skipping translation") 276 | translated_segments = original_segments 277 | translated_transcript = original_transcript 278 | 279 | return original_segments, translated_segments, original_transcript, translated_transcript 280 | 281 | except Exception as e: 282 | logger.error(f"Error in transcribe_and_translate: {e}") 283 | return None, None, None, None -------------------------------------------------------------------------------- /utils/validation.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | def validate_environment(obs_path: Path): 4 | """Validate environment and prerequisites.""" 5 | errors = [] 6 | if not obs_path.exists(): 7 | errors.append(f"OBS directory not found: {obs_path}") 8 | return errors 9 | --------------------------------------------------------------------------------