├── .python-version
├── transwithai.ico
├── 运行(CPU).bat
├── 运行(GPU).bat
├── 运行(GPU,低显存模式).bat
├── 运行(GPU)(输出到当前文件夹).bat
├── 运行(GPU,高显存加速模式).bat
├── infer.py
├── src
│   └── faster_whisper_transwithai_chickenrice
│       ├── __init__.py
│       ├── injection.py
│       ├── i18n_modern.py
│       ├── vad_manager.py
│       └── infer.py
├── environment-cuda128.yml
├── environment-cuda122.yml
├── LICENSE
├── environment-cuda118.yml
├── generation_config.json5
├── runtime_hook.py
├── 使用说明.txt
├── patches
│   └── batch-transcribe.patch
├── README.md
├── .gitignore
├── locales
│   ├── zh-CN
│   │   └── messages.json
│   └── en-US
│       └── messages.json
├── RELEASE_NOTES_CN.md
├── project.spec
├── download_models.py
└── .github
    └── workflows
        └── build-release-conda.yml

/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 |
--------------------------------------------------------------------------------
/transwithai.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice/HEAD/transwithai.ico
--------------------------------------------------------------------------------
/运行(CPU).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cpu" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cuda" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU,低显存模式).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cuda" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU)(输出到当前文件夹).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --output_dir="输出" --device="cuda" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU,高显存加速模式).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | echo ========================================
6 | echo GPU批处理加速模式 (Batch Inference)
7 | echo 自动检测最佳批处理大小以提高速度
8 | echo 需要更多显存 (建议8GB+)
9 | echo ========================================
10 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cuda" --enable_batching --max_batch_size=8 %*
11 | pause
--------------------------------------------------------------------------------
/infer.py:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Standalone inference script with custom VAD injection 4 | This can be run directly from the project root without installation 5 | """ 6 | 7 | import sys 8 | import os 9 | 10 | # Add src to path for local development 11 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) 12 | 13 | from faster_whisper_transwithai_chickenrice.infer import main 14 | 15 | if __name__ == '__main__': 16 | if getattr(sys, 'frozen', False): 17 | os.chdir(os.path.dirname(sys.executable)) 18 | else: 19 | os.chdir(os.path.dirname(__file__)) 20 | main() -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | faster_whisper_transwithai_chickenrice - Custom VAD injection for faster_whisper 3 | """ 4 | 5 | from .injection import ( 6 | inject_vad, 7 | uninject_vad, 8 | VadInjectionContext, 9 | with_vad_injection, 10 | auto_inject_vad, 11 | VadOptionsCompat, 12 | is_injection_active, 13 | ) 14 | from .vad_manager import VadModelManager, WhisperVadModel 15 | 16 | __version__ = "0.1.0" 17 | 18 | __all__ = [ 19 | "inject_vad", 20 | "uninject_vad", 21 | "VadInjectionContext", 22 | "with_vad_injection", 23 | "auto_inject_vad", 24 | "VadOptionsCompat", 25 | "is_injection_active", 26 | "VadModelManager", 27 | "WhisperVadModel", 28 | ] -------------------------------------------------------------------------------- /environment-cuda128.yml: -------------------------------------------------------------------------------- 1 | # Conda environment for CUDA 12.8 2 | name: faster-whisper-cu128 3 | channels: 4 | - conda-forge 5 | - defaults 6 | 7 | dependencies: 8 | # Python version 9 | - python=3.10 10 | 11 | # Core dependencies 12 | - librosa>=0.10.0 13 | - ffmpeg>=8.0 14 | - pip 15 | 16 | # CUDA 12.8 toolkit with cuDNN 9 17 | - cuda-runtime=12.8.* 18 | - cudnn=9.10.* 19 | 20 | # Pip dependencies (some packages not available in conda) 21 | - pip: 22 | # CTranslate2 for CUDA 12 23 | - ctranslate2>=4.5.0 24 | 25 | # faster-whisper and related 26 | - faster-whisper>=1.0.0 27 | 28 | # Other ML dependencies 29 | - transformers>=4.30.0 30 | 31 | # Utilities 32 | - pyjson5>=1.6.0 33 | - markupsafe==2.1.5 34 | - backports.functools-lru-cache # Fix for PyInstaller ModuleNotFoundError 35 | 36 | # Build tools 37 | - pyinstaller>=6.0.0 38 | - setuptools>=65.0.0 39 | - wheel>=0.38.0 40 | - build>=0.10.0 41 | - requests>=2.28.0 42 | 43 | # Test dependencies 44 | - pytest>=7.0.0 45 | - pytest-cov>=4.0.0 46 | -------------------------------------------------------------------------------- /environment-cuda122.yml: -------------------------------------------------------------------------------- 1 | # Conda environment for CUDA 12.2 with cuDNN 8 2 | name: faster-whisper-cu122 3 | channels: 4 | - conda-forge 5 | - defaults 6 | 7 | dependencies: 8 | # Python version 9 | - python=3.10 10 | 11 | # Core dependencies 12 | - librosa>=0.10.0 13 | - ffmpeg>=8.0 14 | - pip 15 | 16 | # CUDA 12.2 toolkit with cuDNN 9 17 | - cuda-runtime=12.2.* 18 | - cudnn=9.2.* 19 | 20 | # Pip dependencies (some packages not available in conda) 21 | - pip: 22 | # CTranslate2 for CUDA 12 23 | - ctranslate2>=4.5.0 24 | 25 | # faster-whisper and related 26 | - faster-whisper>=1.0.0 27 | 28 | # Other ML dependencies 29 | - transformers>=4.30.0 30 | 31 | # Utilities 32 | - 
pyjson5>=1.6.0 33 | - markupsafe==2.1.5 34 | - backports.functools-lru-cache # Fix for PyInstaller ModuleNotFoundError 35 | 36 | # Build tools 37 | - pyinstaller>=6.0.0 38 | - setuptools>=65.0.0 39 | - wheel>=0.38.0 40 | - build>=0.10.0 41 | - requests>=2.28.0 42 | 43 | # Test dependencies 44 | - pytest>=7.0.0 45 | - pytest-cov>=4.0.0 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 TransWithAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /environment-cuda118.yml: -------------------------------------------------------------------------------- 1 | # Conda environment for CUDA 11.8 using nvidia channel packages 2 | name: faster-whisper-cu118 3 | channels: 4 | - conda-forge 5 | - defaults 6 | 7 | dependencies: 8 | # Python version 9 | - python=3.10 10 | 11 | # Core dependencies 12 | - librosa>=0.10.0 13 | - ffmpeg<6 # av pip installation issue workaround 14 | - pip 15 | 16 | # CUDA 11.8 toolkit with cuDNN 8 17 | - cudatoolkit=11.8.* 18 | - cudnn=8.* 19 | 20 | # Pip dependencies (some packages not available in conda) 21 | - pip: 22 | # CTranslate2 for CUDA 11, later forced reinstall to 3.24.0 in CI 23 | - ctranslate2 24 | 25 | # onnxruntime compatibility workaround 26 | - numpy==1.26.4 27 | 28 | # faster-whisper and related 29 | - faster-whisper>=1.0.0 30 | 31 | # Other ML dependencies 32 | - transformers>=4.30.0 33 | 34 | # Utilities 35 | - pyjson5>=1.6.0 36 | - markupsafe==2.1.5 37 | - backports.functools-lru-cache # Fix for PyInstaller ModuleNotFoundError 38 | 39 | # Build tools 40 | - pyinstaller>=6.0.0 41 | - setuptools>=65.0.0 42 | - wheel>=0.38.0 43 | - build>=0.10.0 44 | - requests>=2.28.0 45 | 46 | # Test dependencies 47 | - pytest>=7.0.0 48 | - pytest-cov>=4.0.0 49 | -------------------------------------------------------------------------------- /generation_config.json5: -------------------------------------------------------------------------------- 1 | { 2 | // 可以在这里控制各种生成字幕的参数, 下面这个链接里的参数都可以控制 3 | // You can control various subtitle generation parameters here, all parameters in the link below can be controlled 4 | // https://github.com/SYSTRAN/faster-whisper/blob/dea24cbcc6cbef23ff599a63be0bbb647a0b23d6/faster_whisper/transcribe.py#L733 5 | 6 | // VAD 参数设置 (使用改进的 whisper_vad 模型) 7 | // 
VAD parameters (using improved whisper_vad model) 8 | "vad_parameters": { 9 | // VAD检测阈值 (0.3-0.7, 推荐0.5) 10 | // 太大会导致漏翻, 太小可能会导致时间轴不准或文本质量下降(幻听) 11 | // VAD detection threshold (0.3-0.7, recommended 0.5) 12 | // Too high will cause missed translations, too low may cause timeline inaccuracy or text quality degradation (hallucinations) 13 | "threshold": 0.5, 14 | 15 | // 最小语音持续时间 (毫秒) 16 | // Minimum speech duration (milliseconds) 17 | "min_speech_duration_ms": 300, 18 | 19 | // 最小静音持续时间 (毫秒) 20 | // Minimum silence duration (milliseconds) 21 | "min_silence_duration_ms": 100, 22 | 23 | // 语音前后填充时间 (毫秒) 24 | // Speech padding before and after (milliseconds) 25 | "speech_pad_ms": 200, 26 | }, 27 | 28 | // 避免时间轴向前偏移过长的问题 29 | // Avoid excessive forward shift of timeline 30 | "max_initial_timestamp": 30, 31 | 32 | "repetition_penalty": 1.1, 33 | 34 | } -------------------------------------------------------------------------------- /runtime_hook.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Runtime hook for PyInstaller to set environment variables before the application starts. 4 | This resolves OpenMP conflicts when multiple libraries bring their own OpenMP implementations. 5 | """ 6 | 7 | import os 8 | import sys 9 | import multiprocessing 10 | 11 | # Set KMP_DUPLICATE_LIB_OK to allow multiple OpenMP libraries 12 | # This is needed because different packages (numpy, scipy, ctranslate2, onnxruntime) 13 | # may bring different OpenMP implementations (libiomp5md.dll vs mk2iomp5md.dll) 14 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' 15 | 16 | # Suppress transformers advisory warnings 17 | os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' 18 | 19 | # Configure ONNX Runtime to use half of available CPU cores for better performance 20 | # This prevents oversubscription and resource contention 21 | cpu_count = multiprocessing.cpu_count() 22 | optimal_threads = max(1, cpu_count // 2) 23 | 24 | # Set ONNX Runtime environment variables for CPU execution 25 | os.environ['OMP_NUM_THREADS'] = str(optimal_threads) 26 | os.environ['MKL_NUM_THREADS'] = str(optimal_threads) 27 | 28 | print(f"Runtime hook: Set KMP_DUPLICATE_LIB_OK=TRUE to resolve OpenMP conflicts") 29 | print(f"Runtime hook: Set TRANSFORMERS_NO_ADVISORY_WARNINGS=1 to suppress advisory warnings") 30 | print(f"Runtime hook: Configured ONNX Runtime to use {optimal_threads} threads (half of {cpu_count} available CPUs)") -------------------------------------------------------------------------------- /使用说明.txt: -------------------------------------------------------------------------------- 1 | ======================================== 2 | ⚠️ 重要声明 3 | ======================================== 4 | 本软件开源于: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 5 | 开发团队: AI汉化组 (https://t.me/transWithAI) 6 | ======================================== 7 | 8 | 基本用法: 9 | 10 | 将需要转录/翻译的音频或视频文件(或包含它们的文件夹)拖放到相应的批处理文件上运行。 11 | 12 | === 选择运行模式 === 13 | 14 | CPU模式: 15 | - 拖放到 "运行(CPU).bat" - 使用CPU进行处理 16 | 17 | GPU模式(仅限NVIDIA显卡): 18 | - 拖放到 "运行(GPU).bat" - 显存≥6GB时使用 19 | - 拖放到 "运行(GPU,低显存模式).bat" - 显存4GB时使用 20 | - 建议先更新显卡驱动到最新版本 21 | 22 | 视频专用模式: 23 | - 拖放到 "运行(翻译视频)(CPU).bat" - 使用CPU处理视频 24 | - 拖放到 "运行(翻译视频)(GPU).bat" - 使用GPU处理视频 25 | - 拖放到 "运行(翻译视频)(GPU,低显存模式).bat" - 低显存GPU处理视频 26 | 27 | 输出到指定文件夹: 28 | - 拖放到 "运行(GPU)(输出到当前文件夹).bat" - 字幕输出到"输出"文件夹 29 | 30 | === 支持的格式 === 31 | 32 | 音频格式: mp3, wav, flac, m4a, aac, ogg, wma 33 | 视频格式: mp4, mkv, avi, mov, webm, flv, wmv 
34 | 35 | 输出格式: 36 | - LRC (歌词格式,适合音乐播放器) 37 | - SRT (常用字幕格式,适合视频播放器) 38 | - VTT (WebVTT格式,适合网页视频) 39 | 40 | 所有批处理文件默认会生成这三种格式的字幕。如果字幕文件已存在,将自动跳过。 41 | 42 | --- 43 | 44 | 调整参数: 45 | 46 | 1. 基本参数调整: 47 | 编辑批处理文件,找到以 "%cpath%\infer.exe" 开头的行,在其后添加参数。 48 | 49 | 示例(添加覆盖模式): 50 | 添加前: "%cpath%\infer.exe" --device="cuda" %* 51 | 添加后: "%cpath%\infer.exe" --overwrite --device="cuda" %* 52 | 53 | 常用参数: 54 | --overwrite : 覆盖已存在的字幕文件 55 | --output_dir="路径" : 指定输出文件夹(默认输出到源文件所在文件夹) 56 | --audio_suffixes="mp3,wav" : 自定义处理的文件格式 57 | --sub_formats="srt,vtt,lrc" : 自定义输出格式 58 | --log_level="INFO" : 减少控制台输出(默认为DEBUG) 59 | 60 | 2. 生成参数调整(高级): 61 | 编辑 generation_config.json5 文件调整转录参数。 62 | 参数详情见下方相关项目链接。 63 | 64 | 注意:通常不需要调整生成参数。如遇到以下情况可尝试调整: 65 | - 声音过小导致漏翻 66 | - 时间轴对不上 67 | - 出现幻听 68 | 69 | --- 70 | 71 | 故障排除: 72 | 73 | 1. GPU模式无法运行: 74 | - 确认是否为NVIDIA显卡 75 | - 更新显卡驱动到最新版本 76 | - 检查CUDA是否正确安装 77 | 78 | 2. 字幕未生成: 79 | - 检查文件格式是否支持 80 | - 查看控制台是否有错误信息 81 | - 尝试使用 --overwrite 参数重新生成 82 | 83 | 3. 内存不足: 84 | - 使用低显存模式 85 | - 尝试CPU模式 86 | - 处理较小的文件或分段处理 87 | 88 | --- 89 | 90 | 相关项目: 91 | 92 | - Faster Whisper: https://github.com/SYSTRAN/faster-whisper 93 | - 海南鸡模型 (日文转中文优化): https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st 94 | - 音声优化 VAD 模型: https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx 95 | - OpenAI Whisper: https://github.com/openai/whisper 96 | - 参数详情: https://github.com/SYSTRAN/faster-whisper/blob/dea24cbcc6cbef23ff599a63be0bbb647a0b23d6/faster_whisper/transcribe.py#L733 97 | 98 | 致谢: 99 | - 基于 SYSTRAN/faster-whisper 开发 100 | - 使用 chickenrice0721 日文转中文优化模型(5000小时音频数据训练) 101 | - 使用 TransWithAI 音声优化 VAD 模型 (Whisper-Vad-EncDec-ASMR-onnx) 102 | - 感谢某匿名群友的算力和技术支持 103 | -------------------------------------------------------------------------------- /patches/batch-transcribe.patch: -------------------------------------------------------------------------------- 1 | --- a/faster_whisper/transcribe.py 2 | +++ b/faster_whisper/transcribe.py 3 | @@ -219,6 +219,10 @@ 4 | for i, language_token in enumerate(language_tokens): 5 | prompts[i][language_token_index] = language_token 6 | 7 | + max_initial_timestamp_index = int( 8 | + round(options.max_initial_timestamp / self.model.time_precision) 9 | + ) 10 | + 11 | results = self.model.model.generate( 12 | encoder_output, 13 | prompts, 14 | @@ -228,6 +232,7 @@ 15 | max_length=max_length, 16 | suppress_blank=options.suppress_blank, 17 | suppress_tokens=options.suppress_tokens, 18 | + max_initial_timestamp_index=max_initial_timestamp_index, 19 | return_scores=True, 20 | return_no_speech_prob=True, 21 | sampling_temperature=options.temperatures[0], 22 | @@ -280,7 +285,7 @@ 23 | prefix: Optional[str] = None, 24 | suppress_blank: bool = True, 25 | suppress_tokens: Optional[List[int]] = [-1], 26 | - without_timestamps: bool = True, 27 | + without_timestamps: bool = False, 28 | max_initial_timestamp: float = 1.0, 29 | word_timestamps: bool = False, 30 | prepend_punctuations: str = "\"'"¿([{-", 31 | @@ -321,6 +326,7 @@ 32 | suppress_tokens: List of token IDs to suppress. -1 will suppress a default set 33 | of symbols as defined in `tokenizer.non_speech_tokens()`. 34 | without_timestamps: Only sample text tokens. 35 | + max_initial_timestamp: The initial timestamp cannot be later than this. 36 | word_timestamps: Extract word-level timestamps using the cross-attention pattern 37 | and dynamic time warping, and include the timestamps for each word in each segment. 38 | Set as False. 
39 | @@ -363,7 +369,6 @@ 40 | prompt_reset_on_temperature: Resets prompt if temperature is above this value. 41 | Arg has effect only if condition_on_previous_text is True. Set at 0.5 42 | prefix: Optional text to provide as a prefix at the beginning of each window. 43 | - max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0. 44 | hallucination_silence_threshold: Optional[float] 45 | When word_timestamps is True, skip silent periods longer than this threshold 46 | (in seconds) when a possible hallucination is detected. set as None. 47 | @@ -549,7 +554,7 @@ 48 | prompt_reset_on_temperature=0.5, 49 | multilingual=multilingual, 50 | without_timestamps=without_timestamps, 51 | - max_initial_timestamp=0.0, 52 | + max_initial_timestamp=max_initial_timestamp, 53 | ) 54 | 55 | info = TranscriptionInfo( -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎙️ Faster Whisper TransWithAI ChickenRice 2 | 3 | [![GitHub Release](https://img.shields.io/github/v/release/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice)](https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice/releases) 4 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) 5 | 6 | 高性能音视频转录和翻译工具 - 基于 Faster Whisper 和音声优化 VAD 的日文转中文优化版本 7 | 8 | High-performance audio/video transcription and translation tool - Japanese-to-Chinese optimized version based on Faster Whisper and voice-optimized VAD 9 | 10 | ## ⚠️ 重要声明 / Important Notice 11 | 12 | > **本软件为开源软件 / This software is open source** 13 | > 14 | > 🔗 **开源地址 / Repository**: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 15 | > 16 | > 👥 **开发团队 / Development Team**: AI汉化组 (https://t.me/transWithAI) 17 | > 18 | > 本软件完全免费开源 / This software is completely free and open source 19 | 20 | ## 🙏 致谢 / Acknowledgments 21 | 22 | - 🚀 基于 [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) 开发 23 | - 🐔 使用 [chickenrice0721/whisper-large-v2-translate-zh-v0.2-st](https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st) 日文转中文优化模型 24 | - 🔊 使用 [TransWithAI/Whisper-Vad-EncDec-ASMR-onnx](https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx) 音声优化 VAD 模型 25 | - 💪 **感谢某匿名群友的算力和技术支持** 26 | 27 | ## ✨ 功能特性 / Features 28 | 29 | - 🎯 **高精度日文转中文翻译**: 基于5000小时音频数据训练的"海南鸡v2"日文转中文优化模型 30 | - 🚀 **GPU加速**: 支持CUDA 11.8/12.2/12.8,充分利用NVIDIA显卡性能 31 | - 📝 **多格式输出**: 支持SRT、VTT、LRC等多种字幕格式 32 | - 🎬 **音视频支持**: 支持常见音频(mp3/wav/flac等)和视频格式(mp4/mkv/avi等) 33 | - 💾 **智能缓存**: 自动跳过已处理文件,提高批量处理效率 34 | - 🔧 **灵活配置**: 可自定义转录参数,满足不同场景需求 35 | 36 | ## 📦 版本说明 / Package Variants 37 | 38 | ### 基础版 (Base Package) - 约 2.2GB 39 | - ✅ 所有 GPU 依赖项 40 | - ✅ 音声优化 VAD(语音活动检测)模型 41 | - ❌ 不含 Whisper 模型(需自行下载) 42 | 43 | ### 海南鸡版 (ChickenRice Edition) - 约 4.4GB 44 | - ✅ 所有 GPU 依赖项 45 | - ✅ 音声优化 VAD(语音活动检测)模型 46 | - ✅ **"海南鸡v2 5000小时"** 日文转中文优化模型(开箱即用) 47 | 48 | ## 🚀 快速开始 / Quick Start 49 | 50 | ### 1. 选择适合的CUDA版本 / Choose CUDA Version 51 | 52 | 运行 `nvidia-smi` 查看您的CUDA版本: 53 | 54 | | 显卡系列 | 推荐 CUDA 版本 | 55 | |---------|--------------| 56 | | GTX 10/16系列 | CUDA 11.8 | 57 | | RTX 20/30系列 | CUDA 11.8 或 12.2 | 58 | | RTX 40系列 | CUDA 12.2 或 12.8 | 59 | | RTX 50系列 | **必须使用 CUDA 12.8** | 60 | 61 | ### 2. 下载对应版本 / Download 62 | 63 | 从 [Releases](https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice/releases) 页面下载对应版本 64 | 65 | ### 3. 
使用方法 / Usage 66 | 67 | 将音视频文件拖放到相应的批处理文件: 68 | 69 | ```bash 70 | # GPU模式(推荐,显存≥6GB) 71 | 运行(GPU).bat 72 | 73 | # GPU低显存模式(显存4GB) 74 | 运行(GPU,低显存模式).bat 75 | 76 | # CPU模式(无显卡用户) 77 | 运行(CPU).bat 78 | 79 | # 视频专用模式 80 | 运行(翻译视频)(GPU).bat 81 | ``` 82 | 83 | ## 📖 详细文档 / Documentation 84 | 85 | - 📝 [使用说明](使用说明.txt) - 详细的使用指南和参数配置 86 | - 📋 [发行说明](RELEASE_NOTES_CN.md) - 版本更新日志和选择指南 87 | - ⚙️ [生成配置](generation_config.json5) - 转录参数配置文件 88 | 89 | ## 🛠️ 高级配置 / Advanced Configuration 90 | 91 | ### 命令行参数 92 | 93 | 编辑批处理文件,在 `infer.exe` 后添加参数: 94 | 95 | ```batch 96 | # 覆盖已存在的字幕文件 97 | --overwrite 98 | 99 | # 指定输出文件夹 100 | --output_dir="路径" 101 | 102 | # 自定义文件格式 103 | --audio_suffixes="mp3,wav" 104 | --sub_formats="srt,vtt,lrc" 105 | 106 | # 调整日志级别 107 | --log_level="INFO" 108 | ``` 109 | 110 | ### 转录参数调整 111 | 112 | 编辑 `generation_config.json5` 文件调整转录参数。 113 | 114 | 参数详情请参考 [Faster Whisper 文档](https://github.com/SYSTRAN/faster-whisper/blob/dea24cbcc6cbef23ff599a63be0bbb647a0b23d6/faster_whisper/transcribe.py#L733) 115 | 116 | ## 🔗 相关链接 / Links 117 | 118 | - **Faster Whisper**: https://github.com/SYSTRAN/faster-whisper 119 | - **海南鸡模型**: https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st 120 | - **音声优化 VAD 模型**: https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx 121 | - **OpenAI Whisper**: https://github.com/openai/whisper 122 | - **AI汉化组**: https://t.me/transWithAI 123 | 124 | ## 💡 常见问题 / FAQ 125 | 126 | **Q: GPU模式无法运行?** 127 | A: 确认是否为NVIDIA显卡,更新显卡驱动到最新版本 128 | 129 | **Q: 字幕未生成?** 130 | A: 检查文件格式是否支持,查看控制台错误信息,尝试使用 `--overwrite` 参数 131 | 132 | **Q: 内存/显存不足?** 133 | A: 使用低显存模式或切换到CPU模式 134 | 135 | **Q: 如何选择CUDA版本?** 136 | A: 运行 `nvidia-smi` 查看CUDA Version,参考[发行说明](RELEASE_NOTES_CN.md)中的兼容性表 137 | 138 | ## 📞 技术支持 / Support 139 | 140 | 如遇到问题,请: 141 | 1. 查看[使用说明](使用说明.txt)和[发行说明](RELEASE_NOTES_CN.md) 142 | 2. 检查显卡驱动是否为最新版本 143 | 3. 确认选择了正确的CUDA版本 144 | 4. 提交Issue到项目仓库 145 | 146 | ## 📄 许可证 / License 147 | 148 | 本项目采用 MIT 许可证 - 详见 [LICENSE](LICENSE) 文件 149 | 150 | --- 151 | 152 | *本工具基于 Faster Whisper 开发,海南鸡模型经过5000小时音频数据优化训练,专门针对日文转中文翻译场景。* 153 | *由AI汉化组开源维护,永久免费。* 154 | 155 | **再次感谢某匿名群友的算力和技术支持!** 156 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | !project.spec # Keep our custom spec file 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | # (Using JSON-based i18n now, no compiled files) 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | #.idea/ 164 | 165 | ### Python Patch ### 166 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 167 | poetry.toml 168 | 169 | # ruff 170 | .ruff_cache/ 171 | 172 | # LSP config files 173 | pyrightconfig.json 174 | 175 | # End of https://www.toptal.com/developers/gitignore/api/python 176 | 177 | # Model files (downloaded separately) 178 | models/*.bin 179 | models/*.onnx 180 | models/*.pt 181 | models/*.pth 182 | models/*.safetensors 183 | # Keep metadata and config files 184 | !models/*.json 185 | !models/*.json5 186 | !models/*.txt 187 | !models/*.yaml 188 | !models/*.yml 189 | 190 | # UV package manager 191 | .uv/ 192 | uv.lock 193 | 194 | # OS specific 195 | .DS_Store 196 | Thumbs.db 197 | desktop.ini 198 | 199 | models/ -------------------------------------------------------------------------------- /locales/zh-CN/messages.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": { 3 | "name": "Whisper转录增强版", 4 | "description": "基于自定义VAD注入的Whisper转录" 5 | }, 6 | 7 | "args": { 8 | "model_path": "Whisper模型路径", 9 | "device": "运行模型的设备 (cpu, cuda, auto)", 10 | "compute_type": "模型计算类型", 11 | "overwrite": "覆盖现有字幕文件", 12 | "audio_extensions": "要处理的音频文件扩展名列表(逗号分隔)", 13 | "subtitle_formats": "字幕格式列表(逗号分隔):lrc, srt, vtt, txt", 14 | "output_dir": "字幕文件输出目录", 15 | "config_file": "生成配置文件路径", 16 | "log_level": "日志级别", 17 | "vad_threshold": "覆盖VAD阈值", 18 | "min_speech_duration": "覆盖最小语音持续时间(毫秒)", 19 | "min_silence_duration": "覆盖最小静音持续时间(毫秒)", 20 | "speech_padding": "覆盖语音填充时间(毫秒)", 21 | "directories": "要处理的目录或文件" 22 | }, 23 | 24 | "info": { 25 | "output_dir": "输出目录:{output_dir}", 26 | "generation_config": "生成配置:{config}", 27 | "initializing_vad": "正在初始化增强VAD模型...", 28 | "vad_activated": "✓ 增强VAD已激活(阈值={threshold})", 29 | "loading_whisper": "正在加载Whisper模型...", 30 | "model_precision": "模型运行精度:{precision},设备:{device}", 31 | "translating": "正在翻译({current}/{total}):{path}", 32 | "duration": "时长:{duration}", 33 | "duration_filtered": "时长:{original} → {filtered}(检测到 {percent} 语音)", 34 | "writing": "正在写入:{path}", 35 | "vad_deactivated": "VAD注入已停用", 36 | "no_files_found": "未找到要翻译的文件", 37 | "logging_to_file": "日志文件:{path}", 38 | "program_version": "程序版本:{version}", 39 | "python_version": "Python版本:{version}", 40 | "platform": "运行平台:{platform}", 41 | "arguments": "运行参数:{args}", 42 | "auto_detected_device": "自动检测到设备:{device}", 43 | "auto_selected_compute_type": "自动选择计算类型 '{compute_type}',设备:'{device}'" 44 | }, 45 | 46 | "tasks": { 47 | "translation": { 48 | "one": "翻译任务:{count}", 49 | "other": "翻译任务:{count}" 50 | } 51 | }, 52 | 53 | "files": { 54 | "found": { 55 | "one": "找到 {count} 个文件待处理", 56 | "other": "找到 {count} 个文件待处理" 57 | }, 58 | "count": { 59 | "one": "{count} 个文件", 60 | "other": "{count} 个文件" 61 | } 62 | }, 63 | 64 | "warnings": { 65 | "provide_directories": "请提供要翻译的目录", 66 | "drag_files": "请将要翻译的文件或目录拖放到此程序上", 67 | "unknown_format": "未知格式:{format}", 68 | "loaded_vad_config": "已从 {path} 加载VAD配置", 69 | "failed_load_vad": "无法从 {path} 加载VAD元数据:{error}", 70 | "using_default_vad": "使用默认VAD配置", 71 | "vad_file_not_found": "在 {path} 未找到VAD元数据文件", 72 | "compute_types_unavailable": "无法获取设备 {device} 支持的计算类型:{error}", 73 | "no_preferred_compute_type": "未找到首选计算类型,使用默认值 '{default}'" 74 | }, 75 | 76 | "progress": { 77 | "vad": "VAD进度:{current}/{total} 块({percent:0.1f}%)在 {device} 上" 78 | }, 79 | 80 | "debug": { 81 | "processing": "正在处理:{path}", 82 | "file_suffix": "文件后缀:{suffix}", 83 | "valid_suffixes": 
"有效后缀:{suffixes}", 84 | "skipped_suffix": "已跳过 - 后缀 '{suffix}' 不在有效音频格式中", 85 | "subtitle_exists": "字幕已存在:{path}", 86 | "skipped_all_exist": "已跳过 - 所有字幕格式已存在", 87 | "added_task": "为格式添加任务:{formats}", 88 | "scanning": "正在扫描:{path}" 89 | }, 90 | 91 | "time": { 92 | "duration_hours": "{hours}小时{minutes}分{seconds:0.0f}秒", 93 | "duration_minutes": "{minutes}分{seconds:0.1f}秒", 94 | "duration_seconds": "{seconds:0.2f}秒" 95 | }, 96 | 97 | "format": { 98 | "percentage": "{value:0.1f}%" 99 | }, 100 | 101 | "vad": { 102 | "onnx_not_installed": "未安装onnxruntime。请使用以下命令安装:\n pip install onnxruntime # CPU版本\n pip install onnxruntime-gpu # GPU版本", 103 | "transformers_not_installed": "未安装transformers。请使用以下命令安装:\n pip install transformers", 104 | "model_loaded": "ONNX模型已加载:{path}", 105 | "auto_configured": "自动配置ONNX使用{threads}个CPU线程(可用{total}个的一半)", 106 | "device": "设备:{device}", 107 | "providers": "提供器:{providers}", 108 | "chunk_duration": "块时长:{duration}毫秒", 109 | "frame_duration": "帧时长:{duration}毫秒", 110 | "librosa_not_installed": "未安装librosa,假设音频已经是16kHz", 111 | "starting": "在 {device} 上开始VAD处理", 112 | "total_samples": "总音频采样数:{samples}", 113 | "chunk_size": "块大小:{samples} 个采样({duration}毫秒)", 114 | "total_chunks": "待处理总块数:{chunks}", 115 | "processing_chunk": "正在处理第 {current}/{total} 块({percent:0.1f}%)在 {device} 上", 116 | "completed": "VAD处理完成:在 {device} 上处理了 {chunks} 块", 117 | "model_initialized": "WhisperVadModel已用模型初始化:{path}", 118 | "using_device": "使用设备:{device}", 119 | "init_failed": "初始化ONNX模型失败:{error}", 120 | "path_invalid": "未提供ONNX模型路径或路径不存在:{path}", 121 | "not_initialized": "WhisperVadModel:ONNX模型未初始化。请提供有效的ONNX模型路径。", 122 | "speech_segments": { 123 | "one": "使用Whisper VAD找到 {count} 个语音片段", 124 | "other": "使用Whisper VAD找到 {count} 个语音片段" 125 | }, 126 | "registered": "已注册带进度回调的whisper_vad模型", 127 | "model_not_found": "未找到模型 {model_id},使用默认模型", 128 | "feature_extractor_loaded": "从本地文件夹加载 WhisperFeatureExtractor: {path}" 129 | }, 130 | 131 | "injection": { 132 | "already_active": "VAD注入已激活,跳过", 133 | "patched": "已修补 {path}", 134 | "patch_failed": "无法修补 {path}:{error}", 135 | "activated_with_model": "VAD注入已激活,使用模型 '{model_id}'", 136 | "activated": "VAD注入已激活", 137 | "not_active": "VAD注入未激活,无需取消注入", 138 | "stop_error": "停止修补时出错:{error}", 139 | "auto_injected": "已自动注入VAD,使用模型:{model_id}" 140 | }, 141 | 142 | "batch": { 143 | "finding_optimal": "正在寻找最佳批次大小(测试范围:{min_size}-{max_size})...", 144 | "testing_size": "测试批次大小:{size}", 145 | "size_successful": "批次大小 {size} 成功", 146 | "optimal_found": "找到最佳批次大小:{size}", 147 | "oom_error": "批次大小 {size} 因内存不足而失败", 148 | "runtime_error": "批次大小 {size} 失败,错误:{error}", 149 | "reducing_size": "将批次大小从 {old_size} 减小到 {new_size}", 150 | "no_suitable_size": "即使使用最小批次大小 {min_size} 也无法找到合适的批次大小", 151 | "unexpected_error": "测试批次大小 {size} 时出现意外错误:{error}", 152 | "attempting_transcription": "尝试使用批次大小={size}进行转录", 153 | "auto_adjusted": "内存不足后批次大小自动调整为 {size}", 154 | "oom_reducing": "批次大小={old_size}时内存不足,减小到 {new_size} (x0.8)...", 155 | "cannot_run_min": "即使使用批次大小={min_size}也无法运行", 156 | "inference_failed": "即使使用最小批次大小={min_size}也无法运行推理。请考虑减小模型大小或使用CPU。" 157 | } 158 | } -------------------------------------------------------------------------------- /RELEASE_NOTES_CN.md: -------------------------------------------------------------------------------- 1 | # 🎙️ Faster Whisper 转录工具 - 发行说明 2 | 3 | ## ⚠️ 重要声明 4 | 5 | > **本软件为开源软件** 6 | > 7 | > 🔗 **开源地址**: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 8 | > 9 | > 👥 **开发团队**: AI汉化组 (https://t.me/transWithAI) 10 
| 11 | --- 12 | 13 | ## 📦 发行包说明 14 | 15 | 本发行版包含多个变体版本,请根据您的显卡型号选择合适的版本: 16 | 17 | ### 🎯 版本类型说明 18 | 19 | #### 基础版(Base Package) 20 | - **下载大小**:约 2.2GB 21 | - **包含内容**: 22 | - ✅ 所有 GPU 依赖项 23 | - ✅ 音声优化 VAD(语音活动检测)ONNX 模型 24 | - ❌ 不含 Whisper 模型(需自行下载) 25 | - **适用场景**:需要使用自定义模型的用户 26 | 27 | #### 海南鸡版(Chickenrice Edition) 28 | - **下载大小**:约 4.4GB 29 | - **包含内容**: 30 | - ✅ 所有 GPU 依赖项 31 | - ✅ 音声优化 VAD(语音活动检测)ONNX 模型 32 | - ✅ **"海南鸡v2 5000小时"** 日文转中文优化模型 33 | - **适用场景**:开箱即用的日文转中文翻译 34 | - **模型说明**:包含经过5000小时音频数据训练的海南鸡v2版本模型,专门优化日文转中文翻译 35 | 36 | ### 📌 文件命名规则 37 | 38 | | 文件名后缀 | CUDA版本 | 模型类型 | 39 | |-----------|---------|---------| 40 | | `*_cu118.zip` | CUDA 11.8 | 基础版 | 41 | | `*_cu118-chickenrice.zip` | CUDA 11.8 | 海南鸡版 | 42 | | `*_cu122.zip` | CUDA 12.2 | 基础版 | 43 | | `*_cu122-chickenrice.zip` | CUDA 12.2 | 海南鸡版 | 44 | | `*_cu128.zip` | CUDA 12.8 | 基础版 | 45 | | `*_cu128-chickenrice.zip` | CUDA 12.8 | 海南鸡版 | 46 | 47 | --- 48 | 49 | ## 🔍 如何选择正确的 CUDA 版本 50 | 51 | ### 方法一:通过 nvidia-smi 查询 52 | 53 | 1. 打开命令提示符或终端 54 | 2. 输入命令:`nvidia-smi` 55 | 3. 查看输出中的 **Driver Version** 和 **CUDA Version** 56 | 57 | ``` 58 | +-------------------------------------------------------------------------+ 59 | | NVIDIA-SMI 570.00 Driver Version: 570.00 CUDA Version: 12.8| 60 | +-------------------------------------------------------------------------+ 61 | ``` 62 | 63 | ### 方法二:通过显卡型号和驱动版本对照表 64 | 65 | #### 📊 NVIDIA 驱动版本与 CUDA 版本兼容性表 66 | 67 | | CUDA 版本 | 最低驱动要求(Windows) | 最低驱动要求(Linux) | 推荐使用场景 | 68 | |----------|------------------------|---------------------|------------| 69 | | **CUDA 11.8** | ≥452.39 | ≥450.80.02 | 较旧的显卡(GTX 10系列、RTX 20/30系列) | 70 | | **CUDA 12.2** | ≥525.60.13 | ≥525.60.13 | RTX 30/40系列,较新的驱动 | 71 | | **CUDA 12.8** | ≥570.65 | ≥570.26 | RTX 40/50系列,最新驱动 | 72 | 73 | #### 🎮 显卡型号推荐表 74 | 75 | | 显卡系列 | 推荐 CUDA 版本 | 说明 | 76 | |---------|--------------|------| 77 | | GTX 10系列(1060/1070/1080等) | **CUDA 11.8** | 兼容性最好 | 78 | | GTX 16系列(1650/1660等) | **CUDA 11.8** | 兼容性最好 | 79 | | RTX 20系列(2060/2070/2080等) | **CUDA 11.8** 或 **12.2** | 根据驱动版本选择 | 80 | | RTX 30系列(3060/3070/3080/3090等) | **CUDA 12.2** | 推荐使用 | 81 | | RTX 40系列(4060/4070/4080/4090等) | **CUDA 12.2** 或 **12.8** | 最新驱动用12.8 | 82 | | **RTX 50系列(5090/5080/5070等)** | **🔴 必须使用 CUDA 12.8** | ⚠️ 注意:RTX 50系列必须使用CUDA 12.8版本 | 83 | 84 | ### ⚠️ 重要提示 85 | 86 | - **RTX 50系列用户**:由于新架构要求,**必须使用 CUDA 12.8 版本**,驱动版本必须 ≥570.00 87 | - **驱动版本查询**:在 nvidia-smi 中显示的 CUDA Version 是您的驱动**支持的最高**CUDA版本 88 | - **向下兼容**:高版本驱动可以运行低版本CUDA程序(例如:570驱动可以运行CUDA 11.8程序) 89 | - **性能考虑**:使用与驱动匹配的CUDA版本可获得最佳性能 90 | 91 | --- 92 | 93 | ## 📥 模型下载说明 94 | 95 | ### 基础版用户(需自行下载模型) 96 | 97 | 基础版包含VAD模型,但**不包含**Whisper语音识别模型。您需要: 98 | 99 | 1. **从 Hugging Face 下载模型** 100 | - 示例模型地址:https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st 101 | - 这是"海南鸡v2 5000小时"版本的日文转中文优化模型 102 | 103 | 2. **放置模型文件** 104 | ``` 105 | 将下载的模型文件放入: 106 | faster_whisper_transwithai_chickenrice/ 107 | └── models/ 108 | └── [您下载的模型文件夹]/ 109 | ``` 110 | 111 | 3. **其他可用模型** 112 | - OpenAI Whisper官方模型 113 | - 其他社区优化模型 114 | 115 | ### 海南鸡版用户(开箱即用) 116 | 117 | 海南鸡版已包含: 118 | - ✅ 音声优化 VAD 语音活动检测模型 119 | - ✅ "海南鸡v2 5000小时"日文转中文优化版Whisper模型 120 | - ✅ 所有必要的配置文件 121 | 122 | **无需额外下载**,解压后直接运行即可使用! 123 | 124 | --- 125 | 126 | ## 🚀 快速开始指南 127 | 128 | ### 1. 选择版本 129 | 根据上述表格,选择适合您显卡的CUDA版本 130 | 131 | ### 2. 下载对应版本 132 | - 仅转录/翻译:下载基础版 + 自行下载模型 133 | - 日文转中文优化:下载海南鸡版(推荐) 134 | 135 | ### 3. 
解压并运行 136 | ```bash 137 | # GPU模式(推荐) 138 | 将音视频文件拖放到 "运行(GPU).bat" 139 | 140 | # CPU模式(无显卡用户) 141 | 将音视频文件拖放到 "运行(CPU).bat" 142 | 143 | # 低显存模式(4GB显存) 144 | 将音视频文件拖放到 "运行(GPU,低显存模式).bat" 145 | ``` 146 | 147 | --- 148 | 149 | ## 💡 常见问题 150 | 151 | **Q: 我应该选择哪个CUDA版本?** 152 | A: 运行 `nvidia-smi` 查看您的驱动版本,然后对照上表选择。 153 | 154 | **Q: 海南鸡版和基础版有什么区别?** 155 | A: 海南鸡版包含预训练的日文转中文优化模型(5000小时训练),基础版需要自行下载模型。 156 | 157 | **Q: RTX 4090 应该用哪个版本?** 158 | A: 推荐使用 CUDA 12.2 或 12.8 版本,取决于您的驱动版本。 159 | 160 | **Q: 显存不足怎么办?** 161 | A: 使用"低显存模式"批处理文件,或切换到CPU模式。 162 | 163 | --- 164 | 165 | ## 📝 更新日志 166 | 167 | ### v1.4 (2025-11-25) 168 | - 🚀 **批处理推理支持**:新增批处理推理模式(--enable_batching),大幅提升处理速度 169 | - ⚡ **智能批次大小自动检测**:程序启动时自动测试不同批次大小(1-8),找到显存允许的最大批次 170 | - 🎯 **手动批次大小控制**:支持通过 --batch_size 参数手动指定批次大小,跳过自动检测 171 | - 🔧 **运行时自适应调整**:处理过程中如遇到显存不足(OOM),自动降低批次大小(每次减少20%)继续处理 172 | - 📈 **最大批次大小配置**:通过 --max_batch_size 参数控制自动检测的上限(默认8,可根据显存调整) 173 | - 📦 **新增高显存加速模式**:提供 "运行(GPU,高显存加速模式).bat" 专门为8GB+显存用户优化 174 | - 🔨 **修复批处理兼容性**:应用补丁修复faster-whisper批处理的max_initial_timestamp参数传递问题 175 | - 🌐 **批处理日志国际化**:为批处理功能添加完整的中英文本地化消息,便于调试和使用 176 | 177 | **📊 批处理模式说明**: 178 | - **并行处理优势**:批处理模式下,多个音频片段并行转录,每个片段独立处理,不依赖前面片段的结果 179 | - **精度权衡**:批处理可能略微降低转录精度(由于失去了条件生成的上下文信息) 180 | - **特殊场景优化**:在某些场景下批处理反而效果更好,因为避免了条件生成可能带来的错误传播 181 | - 噪声较多的音频:避免噪声片段影响后续转录 182 | - 多说话人场景:减少不同说话人之间的相互干扰 183 | - 长音频文件:防止错误累积效应 184 | 185 | **🎮 使用建议**: 186 | - 8GB+ 显存:使用 "运行(GPU,高显存加速模式).bat",自动检测最优批次大小 187 | - 4-8GB 显存:手动设置较小批次大小,如 --batch_size=2 或 4 188 | - 追求最高精度:使用常规模式(不加 --enable_batching 参数) 189 | - 追求处理速度:启用批处理模式,接受轻微的精度权衡 190 | 191 | ### v1.3 (2025-11-17) 192 | - 🤖 智能计算类型选择:自动检测设备并选择最优计算类型(bfloat16 > float16 > int8 > float32) 193 | - 🔍 增强设备自动检测:改进CUDA可用性检测,支持CUDA_VISIBLE_DEVICES环境变量 194 | - 🔇 抑制警告信息:添加TRANSFORMERS_NO_ADVISORY_WARNINGS环境变量,减少日志噪音 195 | - 🎯 简化批处理文件:移除硬编码的计算类型设置,全部使用自动检测模式 196 | - 📊 改进日志记录:添加自动检测设备和计算类型的详细日志信息 197 | - 🐛 修复日志重复问题:移除重复的根日志处理器,避免日志重复输出 198 | - 🌐 增强国际化支持:为自动检测功能添加完整的中英文本地化消息 199 | 200 | ### v1.2 (2025-11-15) 201 | - ⚡ CPU模式性能优化:添加 int16 计算类型支持,提升CPU处理速度 202 | - 🎮 GPU兼容性改进:强制使用 float16 替代模型精度 bfloat16,提升显卡兼容性 203 | - 🔧 精简批处理文件:合并视频翻译功能到主批处理文件中 204 | - 📝 新增日志记录功能:自动保存运行日志到 latest.log 文件,方便问题反馈 205 | 206 | ### v1.1 (2025-11-14) 207 | - 🌐 离线支持改进:预下载 whisper-base 模型文件,实现完全离线运行 208 | - 📥 自动模型管理:WhisperFeatureExtractor 优先使用本地模型,避免网络超时 209 | - 🔧 优化下载流程:支持从已有模型文件夹复制,减少重复下载 210 | - 🚀 提升稳定性:解决网络不稳定环境下的 HuggingFace 连接超时问题 211 | 212 | ### v1.0 (2025-11-13) 213 | - 🎯 支持多CUDA版本(11.8/12.2/12.8) 214 | - 🚀 优化的日文转中文翻译效果(海南鸡v2版本) 215 | - 🔊 音声优化的VAD语音活动检测 216 | - 💾 改进的缓存机制,加快CI/CD构建速度 217 | - 📦 分离的基础版和完整版,满足不同需求 218 | - 🔧 自动VAD模型下载和管理 219 | 220 | --- 221 | 222 | ## 📞 技术支持 223 | 224 | 如遇到问题,请: 225 | 1. 检查显卡驱动是否为最新版本 226 | 2. 确认选择了正确的CUDA版本 227 | 3. 查看控制台输出的错误信息 228 | 4. 
提交Issue到项目仓库: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 229 | 230 | ### 🔗 官方链接 231 | - **GitHub仓库**: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 232 | - **音声优化 VAD 模型**: https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx 233 | - **Telegram群组**: https://t.me/transWithAI 234 | - **开发团队**: AI汉化组 235 | 236 | --- 237 | 238 | ## 🙏 致谢 239 | 240 | - 🚀 基于 [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) 开发 241 | - 🐔 使用 [chickenrice0721/whisper-large-v2-translate-zh-v0.2-st](https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st) 日文转中文优化模型 242 | - 🔊 使用 [TransWithAI/Whisper-Vad-EncDec-ASMR-onnx](https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx) 音声优化 VAD 模型 243 | - 🎙️ [OpenAI Whisper](https://github.com/openai/whisper) 原始项目 244 | - 💪 **感谢某匿名群友的算力和技术支持** 245 | 246 | --- 247 | 248 | *本工具基于 Faster Whisper 开发,海南鸡模型经过5000小时音频数据优化训练,专门针对日文转中文翻译场景。* 249 | *由AI汉化组开源维护,永久免费。* 250 | -------------------------------------------------------------------------------- /locales/en-US/messages.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": { 3 | "name": "Whisper Transcription Enhanced", 4 | "description": "Whisper transcription with custom VAD injection" 5 | }, 6 | 7 | "args": { 8 | "model_path": "Path to the Whisper model", 9 | "device": "Device to run the model on (cpu, cuda, auto)", 10 | "compute_type": "Compute type for the model", 11 | "overwrite": "Overwrite existing subtitle files", 12 | "audio_extensions": "Comma-separated list of audio file extensions to process", 13 | "subtitle_formats": "Comma-separated list of subtitle formats (lrc, srt, vtt, txt)", 14 | "output_dir": "Output directory for subtitle files", 15 | "config_file": "Path to generation config file", 16 | "log_level": "Logging level", 17 | "vad_threshold": "Override VAD threshold", 18 | "min_speech_duration": "Override minimum speech duration in ms", 19 | "min_silence_duration": "Override minimum silence duration in ms", 20 | "speech_padding": "Override speech padding in ms", 21 | "directories": "Directories or files to process" 22 | }, 23 | 24 | "info": { 25 | "output_dir": "Output directory: {output_dir}", 26 | "generation_config": "Generation config: {config}", 27 | "initializing_vad": "Initializing enhanced VAD model...", 28 | "vad_activated": "✓ Enhanced VAD activated (threshold={threshold})", 29 | "loading_whisper": "Loading Whisper model...", 30 | "model_precision": "Model running with precision: {precision} on device: {device}", 31 | "translating": "Translating ({current}/{total}): {path}", 32 | "duration": "Duration: {duration}", 33 | "duration_filtered": "Duration: {original} → {filtered} ({percent} speech detected)", 34 | "writing": "Writing: {path}", 35 | "vad_deactivated": "VAD injection deactivated", 36 | "no_files_found": "No files found to translate", 37 | "logging_to_file": "Logging to file: {path}", 38 | "program_version": "Program version: {version}", 39 | "python_version": "Python version: {version}", 40 | "platform": "Platform: {platform}", 41 | "arguments": "Arguments: {args}", 42 | "auto_detected_device": "Auto-detected device: {device}", 43 | "auto_selected_compute_type": "Auto-selected compute type '{compute_type}' for device '{device}'" 44 | }, 45 | 46 | "tasks": { 47 | "translation": { 48 | "one": "Translation task: {count}", 49 | "other": "Translation tasks: {count}" 50 | } 51 | }, 52 | 53 | "files": { 54 | "found": { 55 | "one": "Found {count} 
file to process", 56 | "other": "Found {count} files to process" 57 | }, 58 | "count": { 59 | "one": "{count} file", 60 | "other": "{count} files" 61 | } 62 | }, 63 | 64 | "warnings": { 65 | "provide_directories": "Please provide directories to translate", 66 | "drag_files": "Please drag files or directories to translate onto this program", 67 | "unknown_format": "Unknown format: {format}", 68 | "loaded_vad_config": "Loaded VAD configuration from {path}", 69 | "failed_load_vad": "Failed to load VAD metadata from {path}: {error}", 70 | "using_default_vad": "Using default VAD configuration", 71 | "vad_file_not_found": "VAD metadata file not found at {path}", 72 | "compute_types_unavailable": "Could not get supported compute types for {device}: {error}", 73 | "no_preferred_compute_type": "No preferred compute type found, using default '{default}'" 74 | }, 75 | 76 | "progress": { 77 | "vad": "VAD Progress: {current}/{total} chunks ({percent:0.1f}%) on {device}" 78 | }, 79 | 80 | "debug": { 81 | "processing": "Processing: {path}", 82 | "file_suffix": "File suffix: {suffix}", 83 | "valid_suffixes": "Valid suffixes: {suffixes}", 84 | "skipped_suffix": "Skipped - suffix '{suffix}' not in valid audio formats", 85 | "subtitle_exists": "Subtitle already exists: {path}", 86 | "skipped_all_exist": "Skipped - all subtitle formats already exist", 87 | "added_task": "Added task for formats: {formats}", 88 | "scanning": "Scanning: {path}" 89 | }, 90 | 91 | "time": { 92 | "duration_hours": "{hours}h {minutes}m {seconds:0.0f}s", 93 | "duration_minutes": "{minutes}m {seconds:0.1f}s", 94 | "duration_seconds": "{seconds:0.2f}s" 95 | }, 96 | 97 | "format": { 98 | "percentage": "{value:0.1f}%" 99 | }, 100 | 101 | "vad": { 102 | "onnx_not_installed": "onnxruntime not installed. Install with:\n pip install onnxruntime # For CPU\n pip install onnxruntime-gpu # For GPU", 103 | "transformers_not_installed": "transformers not installed. Install with:\n pip install transformers", 104 | "model_loaded": "ONNX Model loaded: {path}", 105 | "auto_configured": "Auto-configured ONNX to use {threads} CPU threads (half of {total} available)", 106 | "device": "Device: {device}", 107 | "providers": "Providers: {providers}", 108 | "chunk_duration": "Chunk duration: {duration}ms", 109 | "frame_duration": "Frame duration: {duration}ms", 110 | "librosa_not_installed": "librosa not installed, assuming audio is already at 16kHz", 111 | "starting": "Starting VAD processing on {device}", 112 | "total_samples": "Total audio samples: {samples}", 113 | "chunk_size": "Chunk size: {samples} samples ({duration}ms)", 114 | "total_chunks": "Total chunks to process: {chunks}", 115 | "processing_chunk": "Processing chunk {current}/{total} ({percent:0.1f}%) on {device}", 116 | "completed": "VAD processing completed: {chunks} chunks processed on {device}", 117 | "model_initialized": "WhisperVadModel initialized with model: {path}", 118 | "using_device": "Using device: {device}", 119 | "init_failed": "Failed to initialize ONNX model: {error}", 120 | "path_invalid": "ONNX model path not provided or doesn't exist: {path}", 121 | "not_initialized": "WhisperVadModel: ONNX model not initialized. 
Please provide a valid ONNX model path.", 122 | "speech_segments": { 123 | "one": "Found {count} speech segment using Whisper VAD", 124 | "other": "Found {count} speech segments using Whisper VAD" 125 | }, 126 | "registered": "Registered whisper_vad model with progress callback", 127 | "model_not_found": "Model {model_id} not found, using default", 128 | "feature_extractor_loaded": "Loaded WhisperFeatureExtractor from local folder: {path}" 129 | }, 130 | 131 | "injection": { 132 | "already_active": "VAD injection already active, skipping", 133 | "patched": "Patched {path}", 134 | "patch_failed": "Could not patch {path}: {error}", 135 | "activated_with_model": "VAD injection activated with model '{model_id}'", 136 | "activated": "VAD injection activated", 137 | "not_active": "VAD injection not active, nothing to uninject", 138 | "stop_error": "Error stopping patch: {error}", 139 | "auto_injected": "Auto-injected VAD with model: {model_id}" 140 | }, 141 | 142 | "batch": { 143 | "finding_optimal": "Finding optimal batch size (testing range: {min_size}-{max_size})...", 144 | "testing_size": "Testing batch size: {size}", 145 | "size_successful": "Batch size {size} successful", 146 | "optimal_found": "Optimal batch size found: {size}", 147 | "oom_error": "Batch size {size} failed with out of memory error", 148 | "runtime_error": "Batch size {size} failed with error: {error}", 149 | "reducing_size": "Reducing batch size from {old_size} to {new_size}", 150 | "no_suitable_size": "Cannot find suitable batch size even with minimum size {min_size}", 151 | "unexpected_error": "Unexpected error testing batch size {size}: {error}", 152 | "attempting_transcription": "Attempting transcription with batch_size={size}", 153 | "auto_adjusted": "Batch size auto-adjusted to {size} after OOM", 154 | "oom_reducing": "OOM with batch_size={old_size}, reducing to {new_size} (x0.8)...", 155 | "cannot_run_min": "Cannot run even with batch_size={min_size}", 156 | "inference_failed": "Unable to run inference even with minimum batch_size={min_size}. Consider reducing model size or using CPU." 157 | } 158 | } -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/injection.py: -------------------------------------------------------------------------------- 1 | """ 2 | VAD Injection System - Redirects faster_whisper VAD calls to custom implementations 3 | Provides transparent switching between custom VAD models 4 | """ 5 | 6 | import unittest.mock as mock 7 | from typing import List, Dict, Any, Optional, Callable 8 | import logging 9 | import numpy as np 10 | from dataclasses import dataclass 11 | 12 | from .vad_manager import VadModelManager, VadConfig 13 | 14 | # Import modern i18n module for translations 15 | from . 
import i18n_modern as i18n 16 | 17 | # Convenience imports 18 | _ = i18n._ 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | # Global flag to track if injection is active 23 | _injection_active = False 24 | _active_patches = [] 25 | _global_config = None 26 | _global_progress_callback = None 27 | 28 | 29 | @dataclass 30 | class VadOptionsCompat: 31 | """Mock VadOptions class that mimics faster_whisper.vad.VadOptions""" 32 | threshold: float = 0.5 33 | neg_threshold: Optional[float] = None 34 | min_speech_duration_ms: int = 0 35 | max_speech_duration_s: float = float('inf') 36 | min_silence_duration_ms: int = 2000 37 | speech_pad_ms: int = 400 38 | 39 | def __post_init__(self): 40 | """Compatibility with the original VadOptions""" 41 | pass 42 | 43 | 44 | def set_global_config(config: VadConfig): 45 | """Set the global configuration for VAD injection""" 46 | global _global_config 47 | _global_config = config 48 | 49 | 50 | def get_global_config() -> VadConfig: 51 | """Get the global configuration, creating default if needed""" 52 | global _global_config 53 | if _global_config is None: 54 | _global_config = VadConfig() 55 | return _global_config 56 | 57 | 58 | def get_speech_timestamps_injected( 59 | audio: np.ndarray, 60 | vad_options: Any = None, 61 | sampling_rate: int = 16000, 62 | **kwargs 63 | ) -> List[Dict[str, Any]]: 64 | """ 65 | Injected implementation of get_speech_timestamps that uses our VAD model manager. 66 | 67 | This function is injected in place of faster_whisper.vad.get_speech_timestamps 68 | to transparently use custom VAD models. 69 | """ 70 | # Get configuration 71 | config = get_global_config() 72 | 73 | # Check if a specific model was requested via kwargs 74 | model_id = kwargs.get('vad_model_id', config.default_model) 75 | 76 | # Check if a progress callback was provided (from kwargs or global) 77 | progress_callback = kwargs.get('progress_callback', None) or _global_progress_callback 78 | 79 | # Create manager (this uses cached instances internally) 80 | manager = VadModelManager(config=config, ttl=config.ttl, progress_callback=progress_callback) 81 | 82 | # Extract options from vad_options (works with both real and mock VadOptions) 83 | if vad_options is not None: 84 | options_dict = { 85 | 'threshold': getattr(vad_options, 'threshold', config.threshold), 86 | 'neg_threshold': getattr(vad_options, 'neg_threshold', config.neg_threshold), 87 | 'min_speech_duration_ms': getattr(vad_options, 'min_speech_duration_ms', config.min_speech_duration_ms), 88 | 'max_speech_duration_s': getattr(vad_options, 'max_speech_duration_s', config.max_speech_duration_s), 89 | 'min_silence_duration_ms': getattr(vad_options, 'min_silence_duration_ms', config.min_silence_duration_ms), 90 | 'speech_pad_ms': getattr(vad_options, 'speech_pad_ms', config.speech_pad_ms), 91 | } 92 | else: 93 | # Use defaults from config 94 | options_dict = { 95 | 'threshold': config.threshold, 96 | 'neg_threshold': config.neg_threshold, 97 | 'min_speech_duration_ms': config.min_speech_duration_ms, 98 | 'max_speech_duration_s': config.max_speech_duration_s, 99 | 'min_silence_duration_ms': config.min_silence_duration_ms, 100 | 'speech_pad_ms': config.speech_pad_ms, 101 | } 102 | 103 | # Remove vad_model_id and progress_callback from kwargs to avoid passing them to the actual VAD 104 | kwargs_copy = kwargs.copy() 105 | kwargs_copy.pop('vad_model_id', None) 106 | kwargs_copy.pop('progress_callback', None) 107 | 108 | # Merge options_dict with remaining kwargs 109 | final_kwargs = {**options_dict, 
**kwargs_copy} 110 | 111 | # Get speech timestamps using the model manager 112 | return manager.get_speech_timestamps( 113 | model_id=model_id, 114 | audio=audio, 115 | sampling_rate=sampling_rate, 116 | **final_kwargs 117 | ) 118 | 119 | 120 | def get_vad_patches(model_id: Optional[str] = None) -> Dict[str, mock.Mock]: 121 | """ 122 | Get all VAD-related patches for the codebase. 123 | 124 | Args: 125 | model_id: Optional model ID to force (e.g., "whisper_vad") 126 | 127 | Returns: 128 | Dictionary of patch paths to mock objects 129 | """ 130 | # Create wrapper functions that include model_id if specified 131 | if model_id: 132 | def get_timestamps_wrapper(audio, vad_options=None, sampling_rate=16000, **kwargs): 133 | kwargs['vad_model_id'] = model_id 134 | return get_speech_timestamps_injected(audio, vad_options, sampling_rate, **kwargs) 135 | else: 136 | get_timestamps_wrapper = get_speech_timestamps_injected 137 | 138 | patches = { 139 | # Core VAD module patches 140 | 'faster_whisper.vad.VadOptions': mock.Mock(side_effect=VadOptionsCompat), 141 | 'faster_whisper.vad.get_speech_timestamps': mock.Mock(side_effect=get_timestamps_wrapper), 142 | 143 | # Alternative import location (used in transcribe module) 144 | 'faster_whisper.transcribe.get_speech_timestamps': mock.Mock(side_effect=get_timestamps_wrapper), 145 | 146 | # Patch for VadOptions in transcribe module 147 | 'faster_whisper.transcribe.VadOptions': mock.Mock(side_effect=VadOptionsCompat), 148 | 149 | # You can add more patches here for specific modules if needed 150 | # For example, if you have modules that directly import from faster_whisper: 151 | # 'your_module.VadOptions': mock.Mock(side_effect=VadOptionsCompat), 152 | # 'your_module.get_speech_timestamps': mock.Mock(side_effect=get_timestamps_wrapper), 153 | } 154 | 155 | return patches 156 | 157 | 158 | def inject_vad(model_id: Optional[str] = None, config: Optional[VadConfig] = None, progress_callback: Optional[Callable] = None) -> None: 159 | """ 160 | Inject VAD implementation to redirect faster_whisper calls. 161 | 162 | Args: 163 | model_id: Optional model ID to force (e.g., "whisper_vad") 164 | If None, uses the configured default model. 165 | config: Optional VadConfig to use for injection 166 | progress_callback: Optional progress callback for VAD processing 167 | """ 168 | global _injection_active, _active_patches, _global_progress_callback 169 | 170 | if _injection_active: 171 | logger.warning(_("injection.already_active")) 172 | return 173 | 174 | # Store progress callback globally 175 | _global_progress_callback = progress_callback 176 | 177 | # Set config if provided 178 | if config: 179 | set_global_config(config) 180 | 181 | patches_dict = get_vad_patches(model_id) 182 | 183 | for path, mock_obj in patches_dict.items(): 184 | try: 185 | patch = mock.patch(path, mock_obj) 186 | patch.start() 187 | _active_patches.append(patch) 188 | logger.debug(_("injection.patched", path=path)) 189 | except Exception as e: 190 | logger.debug(_("injection.patch_failed", path=path, error=e)) 191 | 192 | _injection_active = True 193 | if model_id: 194 | logger.info(_("injection.activated_with_model", model_id=model_id)) 195 | else: 196 | logger.info(_("injection.activated")) 197 | 198 | 199 | def uninject_vad() -> None: 200 | """ 201 | Remove VAD injection and restore original faster_whisper behavior. 
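
    If injection is not currently active, a warning is logged and the call is a
    no-op; otherwise every active patch is stopped and the stored progress
    callback is cleared.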
202 | """ 203 | global _injection_active, _active_patches, _global_progress_callback 204 | 205 | if not _injection_active: 206 | logger.warning(_("injection.not_active")) 207 | return 208 | 209 | for patch in _active_patches: 210 | try: 211 | patch.stop() 212 | except Exception as e: 213 | logger.warning(_("injection.stop_error", error=e)) 214 | 215 | _active_patches.clear() 216 | _injection_active = False 217 | _global_progress_callback = None # Clear the progress callback 218 | logger.info(_("info.vad_deactivated")) 219 | 220 | 221 | class VadInjectionContext: 222 | """ 223 | Context manager for VAD injection. 224 | 225 | Usage: 226 | with VadInjectionContext(model_id="whisper_vad"): 227 | # Code that uses faster_whisper VAD will now use whisper VAD 228 | from faster_whisper.vad import get_speech_timestamps 229 | timestamps = get_speech_timestamps(audio, vad_options) 230 | """ 231 | 232 | def __init__(self, model_id: Optional[str] = None, config: Optional[VadConfig] = None): 233 | self.model_id = model_id 234 | self.config = config 235 | self.was_active = False 236 | 237 | def __enter__(self): 238 | global _injection_active 239 | self.was_active = _injection_active 240 | if self.was_active: 241 | uninject_vad() 242 | inject_vad(self.model_id, self.config) 243 | return self 244 | 245 | def __exit__(self, exc_type, exc_val, exc_tb): 246 | uninject_vad() 247 | if self.was_active: 248 | inject_vad() # Restore previous injection 249 | 250 | 251 | def auto_inject_vad(config: Optional[VadConfig] = None) -> None: 252 | """ 253 | Automatically inject VAD based on configuration. 254 | This should be called during application startup. 255 | 256 | Args: 257 | config: Optional VadConfig to use 258 | """ 259 | if config is None: 260 | config = get_global_config() 261 | else: 262 | set_global_config(config) 263 | 264 | # Check if we should inject based on configuration 265 | if config.auto_inject: 266 | model_id = config.default_model 267 | inject_vad(model_id, config) 268 | logger.info(_("injection.auto_injected", model_id=model_id)) 269 | 270 | 271 | def with_vad_injection(model_id: Optional[str] = None, config: Optional[VadConfig] = None): 272 | """ 273 | Decorator to use VAD injection for a specific function. 
274 | 275 | Usage: 276 | @with_vad_injection(model_id="whisper_vad") 277 | def my_function(): 278 | # This function will use whisper VAD 279 | from faster_whisper.vad import get_speech_timestamps 280 | return get_speech_timestamps(audio, vad_options) 281 | """ 282 | def decorator(func): 283 | def wrapper(*args, **kwargs): 284 | with VadInjectionContext(model_id, config): 285 | return func(*args, **kwargs) 286 | return wrapper 287 | return decorator 288 | 289 | 290 | def is_injection_active() -> bool: 291 | """Check if VAD injection is currently active""" 292 | return _injection_active -------------------------------------------------------------------------------- /project.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | import sys 3 | import os 4 | from PyInstaller.utils.hooks import collect_all, collect_data_files, collect_submodules 5 | from pathlib import Path 6 | import glob 7 | 8 | block_cipher = None 9 | 10 | # Collect all data and binaries from critical packages 11 | datas = [] 12 | binaries = [] 13 | hiddenimports = [] 14 | 15 | # Function to detect conda environment and CUDA version 16 | def get_conda_cuda_libs(): 17 | """Detect and collect CUDA/cuDNN libraries from the active conda environment""" 18 | cuda_binaries = [] 19 | 20 | # Get the conda environment path 21 | conda_prefix = os.environ.get('CONDA_PREFIX', sys.prefix) 22 | print(f"Conda environment detected: {conda_prefix}") 23 | 24 | # Detect CUDA version from environment path or libraries 25 | cuda_version = None 26 | if 'cu118' in conda_prefix or 'cuda118' in conda_prefix: 27 | cuda_version = '11.8' 28 | elif 'cu122' in conda_prefix or 'cuda122' in conda_prefix: 29 | cuda_version = '12.2' 30 | elif 'cu128' in conda_prefix or 'cuda128' in conda_prefix: 31 | cuda_version = '12.8' 32 | else: 33 | # Try to detect from cudart version 34 | cudart_files = glob.glob(os.path.join(conda_prefix, 'lib', 'libcudart.so.*')) 35 | if cudart_files: 36 | cudart_file = os.path.basename(cudart_files[0]) 37 | if '11.8' in cudart_file: 38 | cuda_version = '11.8' 39 | elif '12.2' in cudart_file: 40 | cuda_version = '12.2' 41 | elif '12.8' in cudart_file: 42 | cuda_version = '12.8' 43 | 44 | print(f"Detected CUDA version: {cuda_version}") 45 | 46 | # Library paths to check - Windows uses different paths than Linux 47 | if sys.platform == 'win32': 48 | lib_dirs = [ 49 | os.path.join(conda_prefix, 'Library', 'bin'), # Primary location for Windows DLLs 50 | os.path.join(conda_prefix, 'bin'), # Alternative location 51 | os.path.join(conda_prefix, 'DLLs'), # Python DLLs location 52 | ] 53 | 54 | # Also check Python site-packages for ONNX Runtime libraries 55 | import site 56 | site_packages = site.getsitepackages() 57 | for sp in site_packages: 58 | if conda_prefix in sp: 59 | onnx_capi_path = os.path.join(sp, 'onnxruntime', 'capi') 60 | if os.path.exists(onnx_capi_path): 61 | lib_dirs.append(onnx_capi_path) 62 | print(f" Added ONNX Runtime path: {onnx_capi_path}") 63 | 64 | # Windows CUDA library patterns with version numbers 65 | cuda_libs_patterns = [ 66 | # CUDA Runtime 67 | 'cudart64_*.dll', 68 | 'cudart32_*.dll', # 32-bit variant if exists 69 | # cuBLAS 70 | 'cublas64_*.dll', 71 | 'cublasLt64_*.dll', 72 | # cuDNN libraries - critical for deep learning 73 | 'cudnn64_*.dll', 74 | 'cudnn_ops_infer64_*.dll', 75 | 'cudnn_ops_train64_*.dll', 76 | 'cudnn_cnn_infer64_*.dll', 77 | 'cudnn_cnn_train64_*.dll', 78 | 'cudnn_adv_infer64_*.dll', 79 | 
'cudnn_adv_train64_*.dll', 80 | # For newer cuDNN versions (9.x) 81 | 'cudnn*.dll', 82 | # cuFFT 83 | 'cufft64_*.dll', 84 | 'cufftw64_*.dll', 85 | # cuRAND 86 | 'curand64_*.dll', 87 | # cuSPARSE 88 | 'cusparse64_*.dll', 89 | # cuSOLVER 90 | 'cusolver64_*.dll', 91 | 'cusolverMg64_*.dll', 92 | # NVRTC 93 | 'nvrtc64_*.dll', 94 | 'nvrtc-builtins64_*.dll', 95 | # NVIDIA Tools Extension 96 | 'nvToolsExt64_*.dll', 97 | # Additional potential libraries 98 | 'nppc64_*.dll', 99 | 'nppif64_*.dll', 100 | 'npps64_*.dll', 101 | # ONNX Runtime GPU dependencies (important!) 102 | 'onnxruntime_providers_cuda.dll', 103 | 'onnxruntime_providers_tensorrt.dll', 104 | 'onnxruntime_providers_shared.dll', 105 | # Python binding for ONNX Runtime 106 | 'onnxruntime_pybind11_state*.pyd', 107 | ] 108 | else: 109 | # Linux/Unix library paths 110 | lib_dirs = [ 111 | os.path.join(conda_prefix, 'lib'), 112 | os.path.join(conda_prefix, 'lib', 'stubs'), 113 | ] 114 | 115 | # Also check Python site-packages for ONNX Runtime libraries 116 | # This is crucial for finding libonnxruntime_providers_cuda.so, etc. 117 | import site 118 | # Try to get the site-packages directory in the conda environment 119 | python_version = f"python{sys.version_info.major}.{sys.version_info.minor}" 120 | site_packages_paths = [ 121 | os.path.join(conda_prefix, 'lib', python_version, 'site-packages'), 122 | os.path.join(conda_prefix, 'lib', 'python3.10', 'site-packages'), # Fallback for CI 123 | os.path.join(conda_prefix, 'lib', 'python3.11', 'site-packages'), # Alternative version 124 | ] 125 | 126 | for sp_path in site_packages_paths: 127 | onnx_capi_path = os.path.join(sp_path, 'onnxruntime', 'capi') 128 | if os.path.exists(onnx_capi_path): 129 | lib_dirs.append(onnx_capi_path) 130 | print(f" Added ONNX Runtime path: {onnx_capi_path}") 131 | break 132 | 133 | # Linux CUDA library patterns 134 | cuda_libs_patterns = [ 135 | 'libcudart.so*', 136 | 'libcublas.so*', 137 | 'libcublasLt.so*', 138 | 'libcudnn*.so*', 139 | 'libcufft.so*', 140 | 'libcufftw.so*', 141 | 'libcurand.so*', 142 | 'libcusparse.so*', 143 | 'libcusolver.so*', 144 | 'libnvrtc.so*', 145 | 'libnvToolsExt.so*', 146 | # ONNX Runtime GPU dependencies 147 | 'libonnxruntime_providers_cuda.so*', 148 | 'libonnxruntime_providers_tensorrt.so*', 149 | 'libonnxruntime_providers_shared.so*', 150 | # Also check without 'lib' prefix (for files in capi directory) 151 | 'onnxruntime_providers_cuda.so*', 152 | 'onnxruntime_providers_tensorrt.so*', 153 | 'onnxruntime_providers_shared.so*', 154 | # Python extension module 155 | 'onnxruntime_pybind11_state*.so', 156 | ] 157 | 158 | # Collect all matching libraries 159 | for lib_dir in lib_dirs: 160 | if not os.path.exists(lib_dir): 161 | continue 162 | 163 | for pattern in cuda_libs_patterns: 164 | for lib_file in glob.glob(os.path.join(lib_dir, pattern)): 165 | if os.path.isfile(lib_file) and not os.path.islink(lib_file): 166 | # Add to binaries list with destination directory 167 | dest_dir = '.' 
168 | if 'stubs' in lib_file: 169 | dest_dir = 'stubs' 170 | cuda_binaries.append((lib_file, dest_dir)) 171 | print(f" Including CUDA library: {os.path.basename(lib_file)}") 172 | 173 | 174 | return cuda_binaries 175 | 176 | # Collect CUDA/cuDNN libraries 177 | cuda_binaries = get_conda_cuda_libs() 178 | binaries += cuda_binaries 179 | 180 | # Collect CTranslate2 (the actual inference engine for faster-whisper) 181 | try: 182 | ctranslate2_datas, ctranslate2_binaries, ctranslate2_hiddenimports = collect_all('ctranslate2') 183 | datas += ctranslate2_datas 184 | binaries += ctranslate2_binaries 185 | hiddenimports += ctranslate2_hiddenimports 186 | except: 187 | pass 188 | 189 | # Collect faster-whisper 190 | faster_whisper_datas, faster_whisper_binaries, faster_whisper_hiddenimports = collect_all('faster_whisper') 191 | datas += faster_whisper_datas 192 | binaries += faster_whisper_binaries 193 | hiddenimports += faster_whisper_hiddenimports 194 | 195 | # Collect transformers (needed for tokenizers) 196 | transformers_datas, transformers_binaries, transformers_hiddenimports = collect_all('transformers') 197 | datas += transformers_datas 198 | binaries += transformers_binaries 199 | hiddenimports += transformers_hiddenimports 200 | 201 | # Collect onnxruntime for VAD model 202 | # Note: The Python module is always 'onnxruntime' regardless of whether 203 | # you installed onnxruntime-gpu or onnxruntime via pip 204 | onnx_collected = False 205 | onnx_package = 'onnxruntime' # Module name is always 'onnxruntime' 206 | try: 207 | onnx_datas, onnx_binaries, onnx_hiddenimports = collect_all(onnx_package) 208 | datas += onnx_datas 209 | binaries += onnx_binaries 210 | hiddenimports += onnx_hiddenimports 211 | print(f"Collected {onnx_package} successfully") 212 | onnx_collected = True 213 | 214 | # Explicitly add ONNX Runtime capi libraries if not already included 215 | try: 216 | import importlib.util 217 | spec = importlib.util.find_spec(onnx_package) 218 | if spec and spec.origin: 219 | onnx_path = os.path.dirname(spec.origin) 220 | capi_path = os.path.join(onnx_path, 'capi') 221 | 222 | if os.path.exists(capi_path): 223 | print(f" Found ONNX Runtime capi directory: {capi_path}") 224 | for file in os.listdir(capi_path): 225 | if file.endswith(('.so', '.dll', '.pyd', '.dylib')): 226 | src = os.path.join(capi_path, file) 227 | # Add to root directory of the bundle 228 | binaries.append((src, '.')) 229 | print(f" Added capi library: {file}") 230 | except Exception as e: 231 | print(f" Warning: Could not collect capi libraries: {e}") 232 | 233 | except Exception as e: 234 | print(f"Could not collect {onnx_package}: {e}") 235 | onnx_collected = False 236 | 237 | if not onnx_collected: 238 | print("WARNING: Could not collect any ONNX Runtime package") 239 | 240 | # Collect librosa for audio processing 241 | librosa_datas, librosa_binaries, librosa_hiddenimports = collect_all('librosa') 242 | datas += librosa_datas 243 | binaries += librosa_binaries 244 | hiddenimports += librosa_hiddenimports 245 | 246 | # Add numpy 247 | numpy_datas, numpy_binaries, numpy_hiddenimports = collect_all('numpy') 248 | datas += numpy_datas 249 | binaries += numpy_binaries 250 | hiddenimports += numpy_hiddenimports 251 | 252 | # Add other necessary packages 253 | for package in ['pyjson5', 'scipy', 'soundfile', 'audioread', 'resampy', 'numba', 'av', 'tokenizers']: 254 | try: 255 | pkg_datas, pkg_binaries, pkg_hiddenimports = collect_all(package) 256 | datas += pkg_datas 257 | binaries += pkg_binaries 258 | hiddenimports += 
pkg_hiddenimports 259 | except: 260 | pass 261 | 262 | # Collect setuptools and pkg_resources data to fix missing modules 263 | try: 264 | from PyInstaller.utils.hooks import collect_data_files 265 | setuptools_datas = collect_data_files('setuptools') 266 | datas += setuptools_datas 267 | pkg_resources_datas = collect_data_files('pkg_resources') 268 | datas += pkg_resources_datas 269 | except: 270 | pass 271 | 272 | # Explicitly collect backports module to fix ModuleNotFoundError 273 | try: 274 | backports_datas, backports_binaries, backports_hiddenimports = collect_all('backports') 275 | datas += backports_datas 276 | binaries += backports_binaries 277 | hiddenimports += backports_hiddenimports 278 | print("Collected backports module successfully") 279 | except Exception as e: 280 | print(f"Could not collect backports module: {e}") 281 | # Try alternative collection method 282 | try: 283 | import backports 284 | import os 285 | backports_path = os.path.dirname(backports.__file__) 286 | datas.append((backports_path, 'backports')) 287 | print(f"Added backports from path: {backports_path}") 288 | except: 289 | print("Warning: backports module not found - may need to be installed") 290 | 291 | # Add hidden imports for modules that might not be detected automatically 292 | hiddenimports += [ 293 | 'ctranslate2', 294 | 'transformers.models', 295 | 'transformers.models.whisper', 296 | 'transformers.tokenization_utils', 297 | 'transformers.tokenization_utils_base', 298 | 'tokenizers', 299 | 'tokenizers.implementations', 300 | 'tokenizers.models', 301 | 'tokenizers.pre_tokenizers', 302 | 'tokenizers.processors', 303 | 'onnxruntime.capi', 304 | 'onnxruntime.capi._pybind_state', 305 | 'onnxruntime.capi.onnxruntime_providers_cuda', # Important for GPU 306 | 'onnxruntime.capi.onnxruntime_providers_tensorrt', # TensorRT if available 307 | 'librosa.core', 308 | 'librosa.feature', 309 | 'scipy.special._ufuncs_cxx', 310 | 'scipy.linalg._fblas', 311 | 'scipy.linalg._flapack', 312 | 'scipy.linalg._cythonized_array_utils', 313 | 'scipy.linalg._solve_toeplitz', 314 | 'scipy.linalg._matfuncs_sqrtm_triu', 315 | 'scipy.linalg._decomp_lu_cython', 316 | 'scipy.linalg._matfuncs_expm', 317 | 'scipy.linalg.cython_blas', 318 | 'scipy.linalg.cython_lapack', 319 | 'numba.core', 320 | 'numba.cuda', 321 | 'av.audio', 322 | 'av.container', 323 | 'av.stream', 324 | 'pkg_resources.extern', 325 | 'pkg_resources._vendor', 326 | 'packaging', 327 | 'packaging.version', 328 | 'packaging.specifiers', 329 | 'packaging.requirements', 330 | 'backports', # Fix for ModuleNotFoundError 331 | 'backports.functools_lru_cache', # Common backports module 332 | 'setuptools._vendor.jaraco', # Include jaraco modules 333 | 'setuptools._vendor.jaraco.text', 334 | 'setuptools._vendor.jaraco.context', 335 | 'setuptools._vendor.jaraco.functools', 336 | 'code', # For interactive console with --console option 337 | 'readline', # For better console experience (if available) 338 | 'rlcompleter', # For tab completion in console 339 | ] 340 | 341 | # Add project data files 342 | # Note: models directory is excluded and handled separately by CI 343 | datas += [ 344 | ('src/faster_whisper_transwithai_chickenrice', 'faster_whisper_transwithai_chickenrice'), 345 | ('locales', 'locales'), # Include the locales directory with translations 346 | ] 347 | 348 | a = Analysis( 349 | ['infer.py'], 350 | pathex=[], 351 | binaries=binaries, 352 | datas=datas, 353 | hiddenimports=hiddenimports, 354 | hookspath=[], # PyInstaller hooks contrib should be auto-detected 355 
| hooksconfig={}, 356 | runtime_hooks=['runtime_hook.py'], # Add runtime hook to set KMP_DUPLICATE_LIB_OK 357 | excludes=[ 358 | 'matplotlib', 359 | 'tkinter', 360 | 'PyQt5', 361 | 'PyQt6', 362 | 'PySide2', 363 | 'PySide6', 364 | 'notebook', 365 | 'jupyter', 366 | 'IPython', 367 | 'pytest', 368 | ], 369 | win_no_prefer_redirects=False, 370 | win_private_assemblies=False, 371 | cipher=block_cipher, 372 | noarchive=False, 373 | ) 374 | 375 | pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) 376 | 377 | exe = EXE( 378 | pyz, 379 | a.scripts, 380 | [], 381 | exclude_binaries=True, 382 | name='infer', 383 | debug=False, 384 | bootloader_ignore_signals=False, 385 | strip=False, 386 | upx=False, 387 | console=True, 388 | disable_windowed_traceback=False, 389 | argv_emulation=False, 390 | target_arch=None, 391 | codesign_identity=None, 392 | entitlements_file=None, 393 | icon='transwithai.ico' if os.path.exists('transwithai.ico') else None, 394 | ) 395 | 396 | coll = COLLECT( 397 | exe, 398 | a.binaries, 399 | a.zipfiles, 400 | a.datas, 401 | strip=False, 402 | upx=False, 403 | upx_exclude=[], 404 | name='faster_whisper_transwithai_chickenrice', 405 | ) -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/i18n_modern.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ultra-modern, dependency-free i18n module using JSON. 3 | 4 | This is a lightweight, modern internationalization solution that: 5 | - Uses JSON files (human-readable, easy to edit) 6 | - No external dependencies 7 | - Supports nested keys with dot notation 8 | - Interpolation with {variable} syntax 9 | - Pluralization support 10 | - Lazy loading 11 | - Type hints for better IDE support 12 | """ 13 | 14 | import os 15 | import sys 16 | import json 17 | import locale 18 | import re 19 | from pathlib import Path 20 | from typing import Dict, Any, Optional, List, Union 21 | from functools import lru_cache 22 | from dataclasses import dataclass 23 | from enum import Enum 24 | 25 | 26 | class PluralForm(Enum): 27 | """Plural forms for different languages.""" 28 | ZERO = "zero" 29 | ONE = "one" 30 | TWO = "two" 31 | FEW = "few" 32 | MANY = "many" 33 | OTHER = "other" 34 | 35 | 36 | @dataclass 37 | class LocaleInfo: 38 | """Information about a locale.""" 39 | code: str 40 | language: str 41 | region: Optional[str] = None 42 | script: Optional[str] = None 43 | 44 | @property 45 | def language_code(self) -> str: 46 | """Get just the language part.""" 47 | return self.language 48 | 49 | @property 50 | def full_code(self) -> str: 51 | """Get full locale code.""" 52 | parts = [self.language] 53 | if self.script: 54 | parts.append(self.script) 55 | if self.region: 56 | parts.append(self.region) 57 | return '-'.join(parts) 58 | 59 | 60 | class PluralRules: 61 | """Simplified plural rules for common languages.""" 62 | 63 | @staticmethod 64 | def get_plural_form(locale_code: str, count: Union[int, float]) -> PluralForm: 65 | """ 66 | Get the appropriate plural form for a count in a given locale. 67 | 68 | This is a simplified version of CLDR plural rules. 69 | """ 70 | lang = locale_code.split('-')[0].lower() 71 | n = abs(count) 72 | 73 | # Languages with single form (Chinese, Japanese, Korean, Thai, etc.) 
74 | if lang in ['zh', 'ja', 'ko', 'th', 'vi', 'id', 'ms']: 75 | return PluralForm.OTHER 76 | 77 | # English and Germanic languages 78 | if lang in ['en', 'de', 'nl', 'sv', 'da', 'no']: 79 | return PluralForm.ONE if n == 1 else PluralForm.OTHER 80 | 81 | # French, Portuguese, Spanish, Italian 82 | if lang in ['fr', 'pt', 'es', 'it']: 83 | if n == 0: 84 | return PluralForm.ZERO if lang == 'fr' else PluralForm.OTHER 85 | elif n == 1: 86 | return PluralForm.ONE 87 | else: 88 | return PluralForm.OTHER 89 | 90 | # Russian and Slavic languages (simplified) 91 | if lang in ['ru', 'uk', 'pl', 'cs', 'sk']: 92 | if n == 1: 93 | return PluralForm.ONE 94 | elif 2 <= n <= 4: 95 | return PluralForm.FEW 96 | else: 97 | return PluralForm.OTHER 98 | 99 | # Arabic (simplified) 100 | if lang == 'ar': 101 | if n == 0: 102 | return PluralForm.ZERO 103 | elif n == 1: 104 | return PluralForm.ONE 105 | elif n == 2: 106 | return PluralForm.TWO 107 | elif 3 <= n <= 10: 108 | return PluralForm.FEW 109 | elif 11 <= n <= 99: 110 | return PluralForm.MANY 111 | else: 112 | return PluralForm.OTHER 113 | 114 | # Default 115 | return PluralForm.OTHER 116 | 117 | 118 | class ModernI18n: 119 | """ 120 | Modern, lightweight i18n implementation using JSON. 121 | 122 | Features: 123 | - JSON-based translations (human-readable) 124 | - Nested key support with dot notation 125 | - Variable interpolation with {var} syntax 126 | - Smart pluralization 127 | - Locale auto-detection 128 | - Fallback chains 129 | - No external dependencies 130 | """ 131 | 132 | def __init__(self, 133 | locales_dir: Optional[Union[str, Path]] = None, 134 | default_locale: str = 'zh-CN', 135 | fallback_locale: str = 'en-US'): 136 | """ 137 | Initialize the i18n system. 138 | 139 | Args: 140 | locales_dir: Directory containing JSON translation files 141 | default_locale: Default locale to use 142 | fallback_locale: Fallback locale for missing translations 143 | """ 144 | self.locales_dir = Path(locales_dir or self._find_locales_dir()) 145 | self.default_locale = default_locale 146 | self.fallback_locale = fallback_locale 147 | self._translations: Dict[str, Dict[str, Any]] = {} 148 | self._current_locale: Optional[str] = None 149 | 150 | # Auto-detect and set locale 151 | detected = self._detect_locale() 152 | self.set_locale(detected) 153 | 154 | def _find_locales_dir(self) -> Path: 155 | """Find the locales directory.""" 156 | # Check if running from PyInstaller bundle 157 | if getattr(sys, 'frozen', False): 158 | # Running from executable 159 | # sys._MEIPASS is the temporary folder where PyInstaller extracts files 160 | base_path = Path(sys._MEIPASS) 161 | possible_paths = [ 162 | base_path / 'locales', 163 | Path(sys.executable).parent / 'locales', 164 | ] 165 | else: 166 | # Running from source 167 | possible_paths = [ 168 | Path(__file__).parent.parent.parent / 'locales', 169 | Path(__file__).parent / 'locales', 170 | Path.cwd() / 'locales', 171 | ] 172 | 173 | for path in possible_paths: 174 | if path.exists() and path.is_dir(): 175 | return path 176 | 177 | # Create default 178 | default_path = Path(__file__).parent.parent.parent / 'locales' 179 | default_path.mkdir(parents=True, exist_ok=True) 180 | return default_path 181 | 182 | def _detect_locale(self) -> str: 183 | """Auto-detect user's preferred locale.""" 184 | # Environment variables 185 | for env_var in ['LANGUAGE', 'LANG', 'LC_ALL', 'LC_MESSAGES']: 186 | if lang := os.environ.get(env_var): 187 | return self._normalize_locale(lang.split(':')[0].split('.')[0]) 188 | 189 | # System locale 
190 | try: 191 | system_locale, _ = locale.getdefaultlocale() 192 | if system_locale: 193 | return self._normalize_locale(system_locale) 194 | except: 195 | pass 196 | 197 | # Windows-specific 198 | if sys.platform == 'win32': 199 | try: 200 | import ctypes 201 | lang_id = ctypes.windll.kernel32.GetUserDefaultUILanguage() 202 | locale_map = { 203 | 0x0804: 'zh-CN', 204 | 0x0404: 'zh-TW', 205 | 0x0409: 'en-US', 206 | 0x0411: 'ja-JP', 207 | 0x0412: 'ko-KR', 208 | } 209 | if lang_id in locale_map: 210 | return locale_map[lang_id] 211 | except: 212 | pass 213 | 214 | return self.default_locale 215 | 216 | def _normalize_locale(self, locale_code: str) -> str: 217 | """Normalize locale code to standard format.""" 218 | if not locale_code: 219 | return self.default_locale 220 | 221 | # Replace underscores 222 | locale_code = locale_code.replace('_', '-') 223 | 224 | # Add default region if needed 225 | if '-' not in locale_code: 226 | defaults = { 227 | 'zh': 'zh-CN', 228 | 'en': 'en-US', 229 | 'ja': 'ja-JP', 230 | 'ko': 'ko-KR', 231 | 'es': 'es-ES', 232 | 'fr': 'fr-FR', 233 | 'de': 'de-DE', 234 | 'it': 'it-IT', 235 | 'pt': 'pt-BR', 236 | 'ru': 'ru-RU', 237 | } 238 | locale_code = defaults.get(locale_code.lower(), locale_code) 239 | 240 | return locale_code 241 | 242 | @lru_cache(maxsize=10) 243 | def _load_translations(self, locale_code: str) -> Dict[str, Any]: 244 | """Load translations for a locale (cached).""" 245 | translations = {} 246 | 247 | # Try JSON file 248 | json_path = self.locales_dir / locale_code / 'messages.json' 249 | if json_path.exists(): 250 | try: 251 | with open(json_path, 'r', encoding='utf-8') as f: 252 | translations = json.load(f) 253 | except Exception as e: 254 | print(f"Warning: Failed to load {json_path}: {e}", file=sys.stderr) 255 | 256 | return translations 257 | 258 | def set_locale(self, locale_code: str): 259 | """Set the current locale.""" 260 | self._current_locale = self._normalize_locale(locale_code) 261 | # Pre-load translations 262 | self._translations[self._current_locale] = self._load_translations(self._current_locale) 263 | if self.fallback_locale != self._current_locale: 264 | self._translations[self.fallback_locale] = self._load_translations(self.fallback_locale) 265 | 266 | def _get_nested_value(self, data: Dict[str, Any], key: str) -> Any: 267 | """Get value from nested dict using dot notation.""" 268 | keys = key.split('.') 269 | value = data 270 | 271 | for k in keys: 272 | if isinstance(value, dict): 273 | value = value.get(k) 274 | if value is None: 275 | return None 276 | else: 277 | return None 278 | 279 | return value 280 | 281 | def _interpolate(self, template: str, variables: Dict[str, Any]) -> str: 282 | """Interpolate variables in template string.""" 283 | if not isinstance(template, str): 284 | return str(template) 285 | 286 | # Match {variable_name} or {variable_name:format} 287 | pattern = r'\{(\w+)(?::([^}]+))?\}' 288 | 289 | def replacer(match): 290 | var_name = match.group(1) 291 | format_spec = match.group(2) 292 | 293 | if var_name not in variables: 294 | return match.group(0) # Keep original if variable not found 295 | 296 | value = variables[var_name] 297 | 298 | # Apply format if specified 299 | if format_spec: 300 | try: 301 | if format_spec.endswith('f'): 302 | # Float formatting like {value:0.2f} 303 | decimals = int(format_spec[:-1].split('.')[-1]) if '.' 
in format_spec else 0 304 | return f"{float(value):.{decimals}f}" 305 | elif format_spec.isdigit(): 306 | # Padding like {value:5} 307 | return str(value).zfill(int(format_spec)) 308 | except: 309 | pass 310 | 311 | return str(value) 312 | 313 | return re.sub(pattern, replacer, template) 314 | 315 | def get(self, key: str, **variables) -> str: 316 | """ 317 | Get a translated string. 318 | 319 | Args: 320 | key: Translation key (supports dot notation) 321 | **variables: Variables for interpolation 322 | 323 | Returns: 324 | Translated and interpolated string 325 | """ 326 | # Handle pluralization 327 | if 'count' in variables: 328 | plural_key = self._get_plural_key(key, variables['count']) 329 | result = self._get_translation(plural_key) 330 | if result is not None and result != plural_key: 331 | return self._interpolate(result, variables) 332 | 333 | # Regular translation 334 | result = self._get_translation(key) 335 | 336 | # Fallback to key if not found 337 | if result is None: 338 | result = key 339 | 340 | # Interpolate variables 341 | if variables: 342 | result = self._interpolate(result, variables) 343 | 344 | return result 345 | 346 | def _get_plural_key(self, base_key: str, count: Union[int, float]) -> str: 347 | """Get the plural form key.""" 348 | plural_form = PluralRules.get_plural_form(self._current_locale, count) 349 | return f"{base_key}.{plural_form.value}" 350 | 351 | def _get_translation(self, key: str) -> Optional[str]: 352 | """Get translation from current or fallback locale.""" 353 | # Try current locale 354 | if self._current_locale in self._translations: 355 | value = self._get_nested_value(self._translations[self._current_locale], key) 356 | if value is not None: 357 | return value 358 | 359 | # Try fallback locale 360 | if self.fallback_locale in self._translations: 361 | value = self._get_nested_value(self._translations[self.fallback_locale], key) 362 | if value is not None: 363 | return value 364 | 365 | return None 366 | 367 | def format_duration(self, seconds: float) -> str: 368 | """Format duration in a localized way.""" 369 | hours = int(seconds // 3600) 370 | minutes = int((seconds % 3600) // 60) 371 | secs = seconds % 60 372 | 373 | if hours > 0: 374 | return self.get('time.duration_hours', hours=hours, minutes=minutes, seconds=secs) 375 | elif minutes > 0: 376 | return self.get('time.duration_minutes', minutes=minutes, seconds=secs) 377 | else: 378 | return self.get('time.duration_seconds', seconds=secs) 379 | 380 | def format_percentage(self, value: float, decimals: int = 1) -> str: 381 | """Format percentage in a localized way.""" 382 | return self.get('format.percentage', value=value * 100, decimals=decimals) 383 | 384 | def format_file_count(self, count: int) -> str: 385 | """Format file count with proper pluralization.""" 386 | return self.get('files.count', count=count) 387 | 388 | @property 389 | def current_locale(self) -> str: 390 | """Get current locale.""" 391 | return self._current_locale 392 | 393 | @property 394 | def available_locales(self) -> List[str]: 395 | """Get list of available locales.""" 396 | locales = [] 397 | if self.locales_dir.exists(): 398 | for path in self.locales_dir.iterdir(): 399 | if path.is_dir() and (path / 'messages.json').exists(): 400 | locales.append(path.name) 401 | return sorted(locales) 402 | 403 | def has_key(self, key: str) -> bool: 404 | """Check if a translation key exists.""" 405 | return self._get_translation(key) is not None 406 | 407 | def get_all_keys(self) -> List[str]: 408 | """Get all available 
translation keys.""" 409 | keys = set() 410 | 411 | def extract_keys(data: Dict[str, Any], prefix: str = ''): 412 | for key, value in data.items(): 413 | full_key = f"{prefix}.{key}" if prefix else key 414 | if isinstance(value, dict): 415 | extract_keys(value, full_key) 416 | else: 417 | keys.add(full_key) 418 | 419 | for locale_code in [self._current_locale, self.fallback_locale]: 420 | if locale_code in self._translations: 421 | extract_keys(self._translations[locale_code]) 422 | 423 | return sorted(keys) 424 | 425 | 426 | # Global instance 427 | _i18n: Optional[ModernI18n] = None 428 | 429 | 430 | def init(locales_dir: Optional[Union[str, Path]] = None, 431 | default_locale: str = 'zh-CN', 432 | fallback_locale: str = 'en-US') -> ModernI18n: 433 | """Initialize the global i18n instance.""" 434 | global _i18n 435 | _i18n = ModernI18n(locales_dir, default_locale, fallback_locale) 436 | return _i18n 437 | 438 | 439 | def get_i18n() -> ModernI18n: 440 | """Get the global i18n instance.""" 441 | global _i18n 442 | if _i18n is None: 443 | _i18n = init() 444 | return _i18n 445 | 446 | 447 | # Convenience functions 448 | def _(key: str, **variables) -> str: 449 | """Get translated string.""" 450 | return get_i18n().get(key, **variables) 451 | 452 | 453 | def set_locale(locale_code: str): 454 | """Set current locale.""" 455 | get_i18n().set_locale(locale_code) 456 | 457 | 458 | def get_locale() -> str: 459 | """Get current locale.""" 460 | return get_i18n().current_locale 461 | 462 | 463 | def available_locales() -> List[str]: 464 | """Get available locales.""" 465 | return get_i18n().available_locales 466 | 467 | 468 | # Format helpers 469 | format_duration = lambda s: get_i18n().format_duration(s) 470 | format_percentage = lambda v, d=1: get_i18n().format_percentage(v, d) 471 | format_file_count = lambda c: get_i18n().format_file_count(c) 472 | 473 | # Auto-initialize 474 | init() -------------------------------------------------------------------------------- /download_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Download required model files from Hugging Face repositories 4 | ONNX VAD model is always downloaded, additional models can be specified via HuggingFace repo path 5 | """ 6 | 7 | import os 8 | import sys 9 | import json 10 | import shutil 11 | import argparse 12 | import requests 13 | from pathlib import Path 14 | from typing import List, Dict, Any, Optional 15 | from urllib.parse import urljoin 16 | 17 | # Detect if the environment supports Unicode/emoji 18 | def can_use_unicode(): 19 | """Check if the current environment supports Unicode output""" 20 | # If we're in a CI environment, be conservative and use ASCII 21 | if os.environ.get('CI') or os.environ.get('GITHUB_ACTIONS'): 22 | # CI environments often have encoding issues, especially on Windows 23 | return False 24 | 25 | # Check if UTF-8 is explicitly set 26 | if os.environ.get('PYTHONIOENCODING', '').lower().startswith('utf'): 27 | return True 28 | 29 | if sys.platform == 'win32': 30 | # Windows console often doesn't support Unicode well 31 | # Try to enable UTF-8 on Windows 32 | try: 33 | import codecs 34 | # Test if we can encode an emoji 35 | test_emoji = "✓" 36 | test_emoji.encode(sys.stdout.encoding or 'utf-8') 37 | return True 38 | except (UnicodeEncodeError, LookupError): 39 | return False 40 | 41 | # On other platforms (Linux, Mac), usually Unicode works 42 | return True 43 | 44 | # Define symbols based on Unicode support 45 | 
USE_UNICODE = can_use_unicode() 46 | 47 | if USE_UNICODE: 48 | # Unicode/emoji symbols 49 | CHECKMARK = "✓" 50 | CROSS = "✗" 51 | DOWNLOAD = "⬇" 52 | PACKAGE = "📦" 53 | SEARCH = "🔍" 54 | SUCCESS = "✅" 55 | ERROR = "❌" 56 | WARNING = "⚠" 57 | else: 58 | # ASCII fallback symbols 59 | CHECKMARK = "[OK]" 60 | CROSS = "[X]" 61 | DOWNLOAD = "[DOWNLOADING]" 62 | PACKAGE = "[PACKAGE]" 63 | SEARCH = "[SEARCH]" 64 | SUCCESS = "[SUCCESS]" 65 | ERROR = "[ERROR]" 66 | WARNING = "[WARNING]" 67 | 68 | # Force UTF-8 encoding on stdout/stderr if possible 69 | if sys.platform == 'win32' and not USE_UNICODE: 70 | # On Windows CI, try to set UTF-8 mode 71 | try: 72 | import io 73 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') 74 | sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') 75 | except: 76 | pass # If this fails, we'll use ASCII symbols anyway 77 | 78 | def download_file(url: str, dest_path: Path, session: requests.Session = None) -> bool: 79 | """Download a file with progress indicator""" 80 | if session is None: 81 | session = requests.Session() 82 | 83 | try: 84 | # Create parent directory if needed 85 | dest_path.parent.mkdir(parents=True, exist_ok=True) 86 | 87 | # Skip if file already exists 88 | if dest_path.exists(): 89 | print(f" {CHECKMARK} {dest_path.name} already exists") 90 | return True 91 | 92 | print(f" {DOWNLOAD} Downloading {dest_path.name}...", end=" ") 93 | 94 | response = session.get(url, stream=True, timeout=30) 95 | response.raise_for_status() 96 | 97 | # Get file size 98 | total_size = int(response.headers.get('content-length', 0)) 99 | 100 | # Download with progress 101 | downloaded = 0 102 | last_reported_progress = -1 103 | with open(dest_path, 'wb') as f: 104 | for chunk in response.iter_content(chunk_size=8192): 105 | if chunk: 106 | f.write(chunk) 107 | downloaded += len(chunk) 108 | if total_size > 0: 109 | progress = downloaded / total_size * 100 110 | # Only update display every 10% 111 | progress_milestone = int(progress // 10) * 10 112 | if progress_milestone > last_reported_progress: 113 | print(f"\r {DOWNLOAD} Downloading {dest_path.name}... 
{progress_milestone}%", end="") 114 | last_reported_progress = progress_milestone 115 | 116 | print(f"\r {CHECKMARK} Downloaded {dest_path.name} ({downloaded / (1024*1024):.1f} MB)") 117 | return True 118 | 119 | except Exception as e: 120 | print(f"\r {CROSS} Failed to download {dest_path.name}: {e}") 121 | if dest_path.exists(): 122 | dest_path.unlink() 123 | return False 124 | 125 | def get_hf_api_files(repo_id: str) -> List[str]: 126 | """Get list of files from Hugging Face repo using API""" 127 | api_url = f"https://huggingface.co/api/models/{repo_id}/tree/main" 128 | 129 | try: 130 | response = requests.get(api_url, timeout=10) 131 | response.raise_for_status() 132 | files = response.json() 133 | return [f['path'] for f in files if f['type'] == 'file'] 134 | except Exception as e: 135 | print(f"Warning: Could not fetch file list from API: {e}") 136 | return [] 137 | 138 | def download_hf_model(repo_id: str, target_dir: Optional[str] = None): 139 | """Download model files from any HuggingFace repository""" 140 | base_url = f"https://huggingface.co/{repo_id}/resolve/main/" 141 | 142 | # Determine target directory 143 | if target_dir: 144 | models_dir = Path("models") / target_dir 145 | else: 146 | # Use repository name as default subdirectory 147 | repo_name = repo_id.split('/')[-1] 148 | models_dir = Path("models") / repo_name 149 | 150 | print(f"\n{PACKAGE} Downloading model from {repo_id}") 151 | print(f" Target directory: {models_dir}") 152 | 153 | # Essential file extensions to download for transformer/whisper models 154 | essential_extensions = ['.json', '.bin', '.txt', '.onnx', '.safetensors', '.model'] 155 | 156 | # Try to get full file list from API 157 | api_files = get_hf_api_files(repo_id) 158 | if api_files: 159 | # Filter for essential files 160 | files_to_download = [f for f in api_files if any( 161 | f.endswith(ext) for ext in essential_extensions 162 | )] 163 | print(f" Found {len(files_to_download)} files in repository") 164 | else: 165 | # If API fails, try common file names 166 | files_to_download = [ 167 | "config.json", 168 | "model.bin", 169 | "pytorch_model.bin", 170 | "model.safetensors", 171 | "preprocessor_config.json", 172 | "tokenizer.json", 173 | "tokenizer_config.json", 174 | "vocabulary.json", 175 | "vocab.json", 176 | "special_tokens_map.json", 177 | "merges.txt", 178 | ] 179 | print(f" Using common file list (API unavailable)") 180 | 181 | session = requests.Session() 182 | session.headers.update({ 183 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 184 | }) 185 | 186 | success_count = 0 187 | for filename in files_to_download: 188 | url = urljoin(base_url, filename) 189 | dest_path = models_dir / filename 190 | if download_file(url, dest_path, session): 191 | success_count += 1 192 | 193 | print(f" {CHECKMARK} Downloaded {success_count}/{len(files_to_download)} files") 194 | return success_count > 0 195 | 196 | def download_vad_model(): 197 | """Download VAD ONNX model files (always required)""" 198 | repo_id = "TransWithAI/Whisper-Vad-EncDec-ASMR-onnx" 199 | base_url = f"https://huggingface.co/{repo_id}/resolve/main/" 200 | models_dir = Path("models") 201 | 202 | print(f"\n{PACKAGE} Downloading VAD ONNX model from {repo_id}") 203 | 204 | # Files to download (renamed to match existing structure) 205 | files = [ 206 | ("model.onnx", "whisper_vad.onnx"), # Download as model.onnx, save as whisper_vad.onnx 207 | ("model_metadata.json", "whisper_vad_metadata.json"), # Download as model_metadata.json, save as 
whisper_vad_metadata.json 208 | ] 209 | 210 | session = requests.Session() 211 | session.headers.update({ 212 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 213 | }) 214 | 215 | success_count = 0 216 | for source_name, dest_name in files: 217 | url = urljoin(base_url, source_name) 218 | dest_path = models_dir / dest_name 219 | if download_file(url, dest_path, session): 220 | success_count += 1 221 | 222 | print(f" {CHECKMARK} Downloaded {success_count}/{len(files)} files") 223 | return success_count == len(files) 224 | 225 | def download_whisper_base_for_feature_extractor(): 226 | """Download whisper-base model files specifically for feature extractor (offline usage)""" 227 | repo_id = "openai/whisper-base" 228 | models_dir = Path("models") / "whisper-base" 229 | base_url = f"https://huggingface.co/{repo_id}/resolve/main/" 230 | 231 | print(f"\n{PACKAGE} Downloading whisper-base for feature extractor (offline usage)") 232 | 233 | # Check if files already exist from main models folder 234 | existing_models_dir = Path("models") 235 | if existing_models_dir.exists(): 236 | # Files we can copy from existing models folder if available 237 | files_to_copy = [ 238 | "preprocessor_config.json", 239 | "config.json", 240 | "tokenizer.json", 241 | "vocab.json", 242 | ] 243 | 244 | copied = 0 245 | models_dir.mkdir(parents=True, exist_ok=True) 246 | for filename in files_to_copy: 247 | src = existing_models_dir / filename 248 | dest = models_dir / filename 249 | if src.exists() and not dest.exists(): 250 | shutil.copy2(src, dest) 251 | print(f" {CHECKMARK} Copied {filename} from existing models folder") 252 | copied += 1 253 | elif dest.exists(): 254 | print(f" {CHECKMARK} {filename} already exists") 255 | copied += 1 256 | 257 | if copied >= 2: # At minimum we need preprocessor_config.json and config.json 258 | print(f" {CHECKMARK} Used existing files for whisper-base") 259 | return True 260 | 261 | # Download ONLY the specific files needed for feature extractor 262 | # We don't need model weights (.bin, .safetensors) for feature extraction 263 | required_files = [ 264 | "preprocessor_config.json", # Required for feature extractor 265 | "config.json", # Required for configuration 266 | "tokenizer.json", # Optional but useful for tokenization 267 | "vocab.json", # Optional but useful for vocabulary 268 | ] 269 | 270 | models_dir.mkdir(parents=True, exist_ok=True) 271 | print(f" Downloading feature extractor files from {repo_id}...") 272 | 273 | session = requests.Session() 274 | session.headers.update({ 275 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 276 | }) 277 | 278 | success_count = 0 279 | for filename in required_files: 280 | url = urljoin(base_url, filename) 281 | dest_path = models_dir / filename 282 | if download_file(url, dest_path, session): 283 | success_count += 1 284 | 285 | print(f" {CHECKMARK} Downloaded {success_count}/{len(required_files)} feature extractor files") 286 | return success_count >= 2 # At minimum we need the two required files 287 | 288 | def verify_whisper_base_feature_extractor(): 289 | """Verify that whisper-base feature extractor files exist""" 290 | models_dir = Path("models") / "whisper-base" 291 | 292 | required_files = [ 293 | ("preprocessor_config.json", "Feature extractor config"), 294 | ("config.json", "Model configuration"), 295 | ] 296 | 297 | optional_files = [ 298 | ("tokenizer.json", "Tokenizer"), 299 | ("vocab.json", "Vocabulary"), 300 | ] 301 | 302 | if not models_dir.exists(): 303 | 
return False 304 | 305 | print(f"\n{SEARCH} Verifying whisper-base feature extractor files...") 306 | all_required_present = True 307 | 308 | for filename, description in required_files: 309 | filepath = models_dir / filename 310 | if filepath.exists(): 311 | size_kb = filepath.stat().st_size / 1024 312 | print(f" {CHECKMARK} {filename} ({size_kb:.1f} KB)") 313 | else: 314 | print(f" {CROSS} {filename} missing - {description}") 315 | all_required_present = False 316 | 317 | for filename, description in optional_files: 318 | filepath = models_dir / filename 319 | if filepath.exists(): 320 | size_kb = filepath.stat().st_size / 1024 321 | print(f" {CHECKMARK} {filename} ({size_kb:.1f} KB) - optional") 322 | 323 | return all_required_present 324 | 325 | def verify_vad_model(): 326 | """Verify that required VAD model files exist""" 327 | models_dir = Path("models") 328 | 329 | required_files = [ 330 | ("whisper_vad.onnx", "VAD ONNX model"), 331 | ("whisper_vad_metadata.json", "VAD metadata"), 332 | ] 333 | 334 | print(f"\n{SEARCH} Verifying VAD model files...") 335 | all_present = True 336 | 337 | for filename, description in required_files: 338 | filepath = models_dir / filename 339 | if filepath.exists(): 340 | size_mb = filepath.stat().st_size / (1024 * 1024) 341 | print(f" {CHECKMARK} {filename} ({size_mb:.1f} MB)") 342 | else: 343 | print(f" {CROSS} {filename} missing - {description}") 344 | all_present = False 345 | 346 | return all_present 347 | 348 | def verify_hf_model(repo_id: str, target_dir: Optional[str] = None): 349 | """Verify that HuggingFace model files exist""" 350 | if target_dir: 351 | models_dir = Path("models") / target_dir 352 | else: 353 | repo_name = repo_id.split('/')[-1] 354 | models_dir = Path("models") / repo_name 355 | 356 | if not models_dir.exists(): 357 | print(f"\n{WARNING} Model directory {models_dir} does not exist") 358 | return False 359 | 360 | print(f"\n{SEARCH} Verifying model files in {models_dir}...") 361 | 362 | # Check for common model files 363 | common_files = ["config.json", "model.bin", "pytorch_model.bin", "model.safetensors", "model.onnx"] 364 | found_files = [] 365 | 366 | for file in models_dir.iterdir(): 367 | if file.is_file(): 368 | size_mb = file.stat().st_size / (1024 * 1024) 369 | print(f" {CHECKMARK} {file.name} ({size_mb:.1f} MB)") 370 | found_files.append(file.name) 371 | 372 | # Check if at least one model file exists 373 | has_model = any(f in found_files for f in common_files) 374 | 375 | if not has_model and found_files: 376 | print(f" {WARNING} Warning: No common model files found, but other files exist") 377 | elif not found_files: 378 | print(f" {CROSS} No files found in model directory") 379 | return False 380 | 381 | return True 382 | 383 | def main(): 384 | """Main download function""" 385 | parser = argparse.ArgumentParser( 386 | description="Model Downloader for Faster Whisper Custom VAD", 387 | formatter_class=argparse.RawDescriptionHelpFormatter, 388 | epilog=""" 389 | Examples: 390 | %(prog)s 391 | # Download VAD model and whisper-base (both required for offline usage) 392 | 393 | %(prog)s --skip-whisper-base 394 | # Download only VAD model, skip whisper-base (not recommended) 395 | 396 | %(prog)s --hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2 397 | # Download VAD, whisper-base, and Chickenrice Whisper model 398 | 399 | %(prog)s --hf-model openai/whisper-large-v3 --target-dir whisper-v3 400 | # Download VAD, whisper-base, and Whisper v3 to specific directory 401 | 402 | %(prog)s --force --hf-model 
myusername/my-custom-model 403 | # Force re-download everything including VAD, whisper-base, and custom model 404 | """ 405 | ) 406 | 407 | parser.add_argument( 408 | '--hf-model', 409 | type=str, 410 | help='HuggingFace repository path to download (e.g., "openai/whisper-large-v3")' 411 | ) 412 | 413 | parser.add_argument( 414 | '--target-dir', 415 | type=str, 416 | help='Target subdirectory name in models/ for the HuggingFace model (defaults to repo name)' 417 | ) 418 | 419 | parser.add_argument( 420 | '--force', 421 | action='store_true', 422 | help='Force re-download even if models already exist' 423 | ) 424 | 425 | parser.add_argument( 426 | '--skip-vad', 427 | action='store_true', 428 | help='Skip downloading VAD model (not recommended, for testing only)' 429 | ) 430 | 431 | parser.add_argument( 432 | '--skip-whisper-base', 433 | action='store_true', 434 | help='Skip downloading whisper-base model for feature extractor (not recommended)' 435 | ) 436 | 437 | args = parser.parse_args() 438 | 439 | print("=" * 60) 440 | print("Model Downloader for Faster Whisper Custom VAD") 441 | print("=" * 60) 442 | 443 | models_dir = Path("models") 444 | models_dir.mkdir(exist_ok=True) 445 | 446 | # Check if VAD model already exists 447 | if not args.force and not args.skip_vad and verify_vad_model(): 448 | print(f"\n{CHECKMARK} VAD model files already present") 449 | vad_exists = True 450 | else: 451 | vad_exists = False 452 | 453 | # Check if whisper-base feature extractor already exists 454 | whisper_base_exists = False 455 | if not args.skip_whisper_base and not args.force: 456 | if verify_whisper_base_feature_extractor(): 457 | print(f"\n{CHECKMARK} Whisper-base feature extractor files already present") 458 | whisper_base_exists = True 459 | 460 | # Check if HF model already exists (if specified) 461 | hf_exists = False 462 | if args.hf_model and not args.force: 463 | if verify_hf_model(args.hf_model, args.target_dir): 464 | print(f"\n{CHECKMARK} Model {args.hf_model} already present") 465 | hf_exists = True 466 | 467 | # If everything exists and no force flag, ask user 468 | all_exists = vad_exists and (not args.hf_model or hf_exists) and (args.skip_whisper_base or whisper_base_exists) 469 | if all_exists and not args.force: 470 | response = input("\nAll required models are present. Re-download? 
(y/N): ").strip().lower() 471 | if response != 'y': 472 | print("Skipping download.") 473 | return 0 474 | 475 | # Download models 476 | success = True 477 | 478 | # Always download VAD model (unless explicitly skipped) 479 | if not args.skip_vad: 480 | if not download_vad_model(): 481 | print(f"{WARNING} Error: VAD model is required and could not be downloaded") 482 | success = False 483 | else: 484 | print(f"\n{WARNING} Skipping VAD model download (not recommended)") 485 | 486 | # Download whisper-base feature extractor (unless explicitly skipped) 487 | if not args.skip_whisper_base: 488 | if not download_whisper_base_for_feature_extractor(): 489 | print(f"{WARNING} Warning: Whisper-base feature extractor could not be downloaded completely") 490 | # Don't fail completely if feature extractor download has issues 491 | else: 492 | print(f"\n{WARNING} Skipping whisper-base download (not recommended for offline usage)") 493 | 494 | # Download HuggingFace model if specified 495 | if args.hf_model: 496 | if not download_hf_model(args.hf_model, args.target_dir): 497 | print(f"{WARNING} Warning: Model {args.hf_model} could not be downloaded completely") 498 | # Don't fail completely if HF model download has issues 499 | 500 | # Final verification 501 | print("\n" + "=" * 60) 502 | 503 | # Verify VAD model 504 | if not args.skip_vad: 505 | if verify_vad_model(): 506 | print(f"\n{SUCCESS} VAD model downloaded successfully!") 507 | else: 508 | print(f"\n{ERROR} Critical: VAD model is missing. Cannot proceed without it.") 509 | return 1 510 | 511 | # Verify whisper-base feature extractor (unless skipped) 512 | if not args.skip_whisper_base: 513 | if verify_whisper_base_feature_extractor(): 514 | print(f"\n{SUCCESS} Whisper-base feature extractor downloaded successfully!") 515 | else: 516 | print(f"\n{WARNING} Warning: Some whisper-base feature extractor files may be missing.") 517 | 518 | # Verify HF model if specified 519 | if args.hf_model: 520 | if verify_hf_model(args.hf_model, args.target_dir): 521 | print(f"\n{SUCCESS} Model {args.hf_model} downloaded successfully!") 522 | else: 523 | print(f"\n{WARNING} Warning: Some files from {args.hf_model} may be missing.") 524 | 525 | return 0 526 | 527 | if __name__ == "__main__": 528 | try: 529 | sys.exit(main()) 530 | except KeyboardInterrupt: 531 | print("\n\nDownload cancelled by user.") 532 | sys.exit(1) 533 | except Exception as e: 534 | print(f"\n{ERROR} Error: {e}") 535 | sys.exit(1) -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/vad_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | VAD Model Manager - Manages different VAD model implementations 3 | """ 4 | 5 | import json 6 | import logging 7 | import os 8 | import warnings 9 | from pathlib import Path 10 | from typing import List, Dict, Any, Optional, Protocol, Callable 11 | import numpy as np 12 | from dataclasses import dataclass 13 | 14 | # Import modern i18n module for translations 15 | from . 
import i18n_modern as i18n 16 | 17 | # Convenience imports 18 | _ = i18n._ 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | @dataclass 24 | class VadConfig: 25 | """Configuration for VAD models""" 26 | default_model: str = "whisper_vad" 27 | auto_inject: bool = False 28 | ttl: int = 3600 # Cache TTL in seconds 29 | 30 | # VAD parameters 31 | threshold: float = 0.5 32 | neg_threshold: Optional[float] = None 33 | min_speech_duration_ms: int = 250 34 | max_speech_duration_s: float = float('inf') 35 | min_silence_duration_ms: int = 2000 36 | speech_pad_ms: int = 400 37 | 38 | # ONNX-specific parameters 39 | onnx_model_path: Optional[str] = None 40 | onnx_metadata_path: Optional[str] = None 41 | whisper_model_name: str = "openai/whisper-base" 42 | frame_duration_ms: int = 20 43 | chunk_duration_ms: int = 30000 44 | force_cpu: bool = False 45 | num_threads: int = 1 46 | 47 | 48 | class VadModel(Protocol): 49 | """Protocol for VAD models""" 50 | 51 | def get_speech_timestamps( 52 | self, 53 | audio: np.ndarray, 54 | sampling_rate: int = 16000, 55 | **kwargs 56 | ) -> List[Dict[str, Any]]: 57 | """Get speech timestamps from audio""" 58 | ... 59 | 60 | 61 | class WhisperVADOnnxWrapper: 62 | """ONNX wrapper for Whisper-based VAD model following Silero's architecture.""" 63 | 64 | def __init__( 65 | self, 66 | model_path: str, 67 | metadata_path: Optional[str] = None, 68 | force_cpu: bool = False, 69 | num_threads: int = 1, 70 | progress_callback: Optional[Callable[[int, int, str], None]] = None, 71 | ): 72 | """Initialize ONNX model wrapper. 73 | 74 | Args: 75 | model_path: Path to ONNX model file 76 | metadata_path: Path to metadata JSON file (optional) 77 | force_cpu: Force CPU execution even if GPU is available 78 | num_threads: Number of CPU threads for inference 79 | progress_callback: Optional callback for progress tracking (chunk_idx, total_chunks, device) 80 | """ 81 | try: 82 | import onnxruntime as ort 83 | except ImportError: 84 | raise ImportError(_("vad.onnx_not_installed")) 85 | 86 | try: 87 | from transformers import WhisperFeatureExtractor 88 | except ImportError: 89 | raise ImportError(_("vad.transformers_not_installed")) 90 | 91 | self.model_path = model_path 92 | self.progress_callback = progress_callback 93 | self.device = "CPU" # Will be updated based on actual provider 94 | 95 | # Load metadata 96 | if metadata_path is None: 97 | metadata_path = model_path.replace('.onnx', '_metadata.json') 98 | 99 | if os.path.exists(metadata_path): 100 | with open(metadata_path, 'r') as f: 101 | self.metadata = json.load(f) 102 | else: 103 | warnings.warn("No metadata file found. 
Using default values.") 104 | self.metadata = { 105 | 'whisper_model_name': 'openai/whisper-base', 106 | 'frame_duration_ms': 20, 107 | 'total_duration_ms': 30000, 108 | } 109 | 110 | # Initialize feature extractor - try local folder first for offline usage 111 | local_whisper_base_path = Path("models/whisper-base") 112 | if local_whisper_base_path.exists() and (local_whisper_base_path / "preprocessor_config.json").exists(): 113 | # Load from local folder for offline usage 114 | try: 115 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained( 116 | str(local_whisper_base_path) 117 | ) 118 | logger.info(_("vad.feature_extractor_loaded", path=local_whisper_base_path)) 119 | except Exception as e: 120 | warnings.warn(f"Failed to load from local folder, trying online: {e}") 121 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained( 122 | self.metadata['whisper_model_name'] 123 | ) 124 | else: 125 | # Try to load from HuggingFace (requires internet) 126 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained( 127 | self.metadata['whisper_model_name'] 128 | ) 129 | 130 | # Set up ONNX Runtime session 131 | opts = ort.SessionOptions() 132 | 133 | # Determine execution provider first 134 | providers = ['CPUExecutionProvider'] 135 | use_gpu = not force_cpu and 'CUDAExecutionProvider' in ort.get_available_providers() 136 | 137 | if use_gpu: 138 | providers.insert(0, 'CUDAExecutionProvider') 139 | self.device = "GPU (CUDA)" 140 | # For GPU, use the provided num_threads or default 141 | opts.inter_op_num_threads = num_threads 142 | opts.intra_op_num_threads = num_threads 143 | else: 144 | self.device = "CPU" 145 | # For CPU, use half of available processors if num_threads is default (1) 146 | import multiprocessing 147 | if num_threads == 1: 148 | # Use half of CPU count for optimal performance 149 | optimal_threads = max(1, multiprocessing.cpu_count() // 2) 150 | opts.inter_op_num_threads = optimal_threads 151 | opts.intra_op_num_threads = optimal_threads 152 | logger.info(_("vad.auto_configured", threads=optimal_threads, 153 | total=multiprocessing.cpu_count())) 154 | else: 155 | # Use user-specified thread count 156 | opts.inter_op_num_threads = num_threads 157 | opts.intra_op_num_threads = num_threads 158 | 159 | self.session = ort.InferenceSession(model_path, providers=providers, sess_options=opts) 160 | 161 | # Get input/output info 162 | self.input_name = self.session.get_inputs()[0].name 163 | self.output_names = [out.name for out in self.session.get_outputs()] 164 | 165 | # Model parameters 166 | self.sample_rate = 16000 # Whisper uses 16kHz 167 | self.frame_duration_ms = self.metadata.get('frame_duration_ms', 20) 168 | self.chunk_duration_ms = self.metadata.get('total_duration_ms', 30000) 169 | self.chunk_samples = int(self.chunk_duration_ms * self.sample_rate / 1000) 170 | self.frames_per_chunk = int(self.chunk_duration_ms / self.frame_duration_ms) 171 | 172 | # Initialize state 173 | self.reset_states() 174 | 175 | logger.info(_("vad.model_loaded", path=model_path)) 176 | logger.info(_("vad.device", device=self.device)) 177 | logger.info(_("vad.providers", providers=providers)) 178 | logger.info(_("vad.chunk_duration", duration=self.chunk_duration_ms)) 179 | logger.info(_("vad.frame_duration", duration=self.frame_duration_ms)) 180 | 181 | def reset_states(self): 182 | """Reset internal states for new audio stream.""" 183 | self._context = None 184 | self._last_chunk = None 185 | 186 | def _validate_input(self, audio: np.ndarray, sr: int) -> np.ndarray: 
187 | """Validate and preprocess input audio. 188 | 189 | Args: 190 | audio: Input audio array 191 | sr: Sample rate 192 | 193 | Returns: 194 | Preprocessed audio at 16kHz 195 | """ 196 | if audio.ndim > 1: 197 | # Convert to mono if multi-channel 198 | audio = audio.mean(axis=0 if audio.shape[0] > audio.shape[1] else 1) 199 | 200 | # Resample if needed 201 | if sr != self.sample_rate: 202 | try: 203 | import librosa 204 | audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate) 205 | except ImportError: 206 | logger.warning(_("vad.librosa_not_installed")) 207 | # Basic downsampling if librosa not available 208 | if sr > self.sample_rate: 209 | # Simple downsampling 210 | ratio = sr // self.sample_rate 211 | audio = audio[::ratio] 212 | 213 | return audio 214 | 215 | def __call__(self, audio_chunk: np.ndarray, sr: int = 16000) -> np.ndarray: 216 | """Process a single audio chunk. 217 | 218 | Args: 219 | audio_chunk: Audio chunk to process 220 | sr: Sample rate 221 | 222 | Returns: 223 | Frame-level speech probabilities 224 | """ 225 | # Validate input 226 | audio_chunk = self._validate_input(audio_chunk, sr) 227 | 228 | # Ensure chunk is correct size 229 | if len(audio_chunk) < self.chunk_samples: 230 | audio_chunk = np.pad( 231 | audio_chunk, 232 | (0, self.chunk_samples - len(audio_chunk)), 233 | mode='constant' 234 | ) 235 | elif len(audio_chunk) > self.chunk_samples: 236 | audio_chunk = audio_chunk[:self.chunk_samples] 237 | 238 | # Extract features 239 | inputs = self.feature_extractor( 240 | audio_chunk, 241 | sampling_rate=self.sample_rate, 242 | return_tensors="np" 243 | ) 244 | 245 | # Run inference 246 | outputs = self.session.run( 247 | self.output_names, 248 | {self.input_name: inputs.input_features} 249 | ) 250 | 251 | # Apply sigmoid to get probabilities 252 | frame_logits = outputs[0][0] # Remove batch dimension 253 | frame_probs = 1 / (1 + np.exp(-frame_logits)) 254 | 255 | return frame_probs 256 | 257 | def audio_forward(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray: 258 | """Process full audio file in chunks (Silero-style). 
259 | 260 | Args: 261 | audio: Full audio array 262 | sr: Sample rate 263 | 264 | Returns: 265 | Concatenated frame probabilities for entire audio 266 | """ 267 | audio = self._validate_input(audio, sr) 268 | self.reset_states() 269 | 270 | all_probs = [] 271 | 272 | # Calculate total number of chunks 273 | total_chunks = (len(audio) + self.chunk_samples - 1) // self.chunk_samples 274 | 275 | # Log initial processing info 276 | logger.info(_("vad.starting", device=self.device)) 277 | logger.info(_("vad.total_samples", samples=len(audio))) 278 | logger.info(_("vad.chunk_size", samples=self.chunk_samples, duration=self.chunk_duration_ms)) 279 | logger.info(_("vad.total_chunks", chunks=total_chunks)) 280 | 281 | # Process in chunks 282 | for chunk_idx, i in enumerate(range(0, len(audio), self.chunk_samples)): 283 | chunk = audio[i:i + self.chunk_samples] 284 | 285 | # Pad last chunk if needed 286 | if len(chunk) < self.chunk_samples: 287 | chunk = np.pad(chunk, (0, self.chunk_samples - len(chunk)), mode='constant') 288 | 289 | # Report progress 290 | if self.progress_callback: 291 | self.progress_callback(chunk_idx + 1, total_chunks, self.device) 292 | 293 | # Log chunk progress 294 | progress_pct = ((chunk_idx + 1) / total_chunks) * 100 295 | logger.debug(_("vad.processing_chunk", current=chunk_idx + 1, total=total_chunks, 296 | percent=progress_pct, device=self.device)) 297 | 298 | # Get predictions for chunk 299 | chunk_probs = self.__call__(chunk, self.sample_rate) 300 | all_probs.append(chunk_probs) 301 | 302 | logger.info(_("vad.completed", chunks=total_chunks, device=self.device)) 303 | 304 | # Concatenate all probabilities 305 | if all_probs: 306 | return np.concatenate(all_probs) 307 | return np.array([]) 308 | 309 | 310 | def get_speech_timestamps_onnx( 311 | audio: np.ndarray, 312 | model, 313 | threshold: float = 0.5, 314 | sampling_rate: int = 16000, 315 | min_speech_duration_ms: int = 250, 316 | max_speech_duration_s: float = float('inf'), 317 | min_silence_duration_ms: int = 100, 318 | speech_pad_ms: int = 30, 319 | return_seconds: bool = True, 320 | neg_threshold: Optional[float] = None, 321 | progress_tracking_callback: Optional[Callable[[float], None]] = None, 322 | ) -> List[Dict[str, float]]: 323 | """Extract speech timestamps from audio using Silero-style processing. 
324 | 325 | This function implements Silero VAD's approach with: 326 | - Dual threshold (positive and negative) for hysteresis 327 | - Proper segment padding 328 | - Minimum duration filtering 329 | - Maximum duration handling with intelligent splitting 330 | 331 | Args: 332 | audio: Input audio array 333 | model: VAD model (WhisperVADOnnxWrapper instance) 334 | threshold: Speech threshold (default: 0.5) 335 | sampling_rate: Audio sample rate 336 | min_speech_duration_ms: Minimum speech segment duration 337 | max_speech_duration_s: Maximum speech segment duration 338 | min_silence_duration_ms: Minimum silence to split segments 339 | speech_pad_ms: Padding to add to speech segments 340 | return_seconds: Return times in seconds vs samples 341 | neg_threshold: Negative threshold for hysteresis (default: threshold - 0.15) 342 | progress_tracking_callback: Progress callback function 343 | 344 | Returns: 345 | List of speech segments with start/end times 346 | """ 347 | # Audio should already be numpy array 348 | 349 | # Validate audio (convert to mono by averaging over the channel axis, i.e. the smaller dimension) 350 | if audio.ndim > 1: 351 | audio = audio.mean(axis=0 if audio.shape[0] < audio.shape[1] else 1) 352 | 353 | # Get frame probabilities for entire audio 354 | model.reset_states() 355 | speech_probs = model.audio_forward(audio, sampling_rate) 356 | 357 | # Calculate frame parameters 358 | frame_duration_ms = model.frame_duration_ms 359 | frame_samples = int(sampling_rate * frame_duration_ms / 1000) 360 | 361 | # Convert durations to frames 362 | min_speech_frames = int(min_speech_duration_ms / frame_duration_ms) 363 | min_silence_frames = int(min_silence_duration_ms / frame_duration_ms) 364 | speech_pad_frames = int(speech_pad_ms / frame_duration_ms) 365 | max_speech_frames = int(max_speech_duration_s * 1000 / frame_duration_ms) if max_speech_duration_s != float('inf') else len(speech_probs) 366 | 367 | # Set negative threshold for hysteresis 368 | if neg_threshold is None: 369 | neg_threshold = max(threshold - 0.15, 0.01) 370 | 371 | # Track speech segments 372 | triggered = False 373 | speeches = [] 374 | current_speech = {} 375 | current_probs = [] # Track probabilities for current segment 376 | temp_end = 0 377 | 378 | # Process each frame 379 | for i, speech_prob in enumerate(speech_probs): 380 | # Report progress 381 | if progress_tracking_callback: 382 | progress = (i + 1) / len(speech_probs) * 100 383 | progress_tracking_callback(progress) 384 | 385 | # Track probabilities for current segment 386 | if triggered: 387 | current_probs.append(float(speech_prob)) 388 | 389 | # Speech onset detection 390 | if speech_prob >= threshold and not triggered: 391 | triggered = True 392 | current_speech['start'] = i 393 | current_probs = [float(speech_prob)] # Start tracking probabilities 394 | continue 395 | 396 | # Check for maximum speech duration 397 | if triggered and 'start' in current_speech: 398 | duration = i - current_speech['start'] 399 | if duration > max_speech_frames: 400 | # Force end segment at max duration 401 | current_speech['end'] = current_speech['start'] + max_speech_frames 402 | # Calculate probability statistics for segment 403 | if current_probs: 404 | current_speech['probability'] = np.mean(current_probs) 405 | speeches.append(current_speech) 406 | current_speech = {} 407 | current_probs = [] 408 | triggered = False 409 | temp_end = 0 410 | continue 411 | 412 | # Speech offset detection with hysteresis 413 | if speech_prob < neg_threshold and triggered: 414 | if not temp_end: 415 | temp_end = i 416 | 417 | # Check if silence is long enough
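# Illustrative arithmetic for this gate (values assumed from the VadConfig defaults
# earlier in this module, not measured from any particular file): threshold=0.5 gives
# neg_threshold = 0.5 - 0.15 = 0.35, and with 20 ms frames min_silence_duration_ms=2000
# maps to min_silence_frames = 2000 / 20 = 100, so roughly two seconds of probabilities
# below 0.35 must elapse after temp_end before the segment is closed below.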
418 | if i - temp_end >= min_silence_frames: 419 | # End current speech segment 420 | current_speech['end'] = temp_end 421 | 422 | # Check minimum duration 423 | if current_speech['end'] - current_speech['start'] >= min_speech_frames: 424 | # Calculate probability statistics for segment 425 | if current_probs: 426 | current_speech['probability'] = np.mean(current_probs[:temp_end - current_speech['start']]) 427 | speeches.append(current_speech) 428 | 429 | current_speech = {} 430 | current_probs = [] 431 | triggered = False 432 | temp_end = 0 433 | 434 | # Reset temp_end if speech resumes 435 | elif speech_prob >= threshold and temp_end: 436 | temp_end = 0 437 | 438 | # Handle speech that continues to the end 439 | if triggered and 'start' in current_speech: 440 | current_speech['end'] = len(speech_probs) 441 | if current_speech['end'] - current_speech['start'] >= min_speech_frames: 442 | # Calculate probability statistics for segment 443 | if current_probs: 444 | current_speech['probability'] = np.mean(current_probs) 445 | speeches.append(current_speech) 446 | 447 | # Apply padding to segments 448 | for i, speech in enumerate(speeches): 449 | # Add padding 450 | if i == 0: 451 | speech['start'] = max(0, speech['start'] - speech_pad_frames) 452 | else: 453 | speech['start'] = max(speeches[i-1]['end'], speech['start'] - speech_pad_frames) 454 | 455 | if i < len(speeches) - 1: 456 | speech['end'] = min(speeches[i+1]['start'], speech['end'] + speech_pad_frames) 457 | else: 458 | speech['end'] = min(len(speech_probs), speech['end'] + speech_pad_frames) 459 | 460 | # Convert to time units or sample indices based on return_seconds 461 | for speech in speeches: 462 | if return_seconds: 463 | # Convert frame indices to seconds 464 | speech['start'] = speech['start'] * frame_duration_ms / 1000 465 | speech['end'] = speech['end'] * frame_duration_ms / 1000 466 | else: 467 | # Convert frame indices to sample indices 468 | speech['start'] = int(speech['start'] * frame_samples) 469 | speech['end'] = int(speech['end'] * frame_samples) 470 | 471 | return speeches 472 | 473 | 474 | class WhisperVadModel: 475 | """ 476 | Whisper-based VAD model implementation using ONNX. 477 | Uses a Whisper model exported to ONNX for voice activity detection. 
478 | """ 479 | 480 | def __init__(self, config: Optional[VadConfig] = None, progress_callback: Optional[Callable[[int, int, str], None]] = None): 481 | self.config = config or VadConfig() 482 | self.wrapper = None 483 | self.progress_callback = progress_callback 484 | 485 | # Initialize ONNX model if path provided 486 | if self.config.onnx_model_path and os.path.exists(self.config.onnx_model_path): 487 | try: 488 | self.wrapper = WhisperVADOnnxWrapper( 489 | model_path=self.config.onnx_model_path, 490 | metadata_path=self.config.onnx_metadata_path, 491 | force_cpu=self.config.force_cpu, 492 | num_threads=self.config.num_threads, 493 | progress_callback=progress_callback, 494 | ) 495 | logger.info(_("vad.model_initialized", path=self.config.onnx_model_path)) 496 | if self.wrapper.device: 497 | logger.info(_("vad.using_device", device=self.wrapper.device)) 498 | except Exception as e: 499 | logger.error(_("vad.init_failed", error=e)) 500 | else: 501 | logger.warning(_("vad.path_invalid", path=self.config.onnx_model_path)) 502 | 503 | def get_speech_timestamps( 504 | self, 505 | audio: np.ndarray, 506 | sampling_rate: int = 16000, 507 | threshold: float = None, 508 | min_speech_duration_ms: int = None, 509 | max_speech_duration_s: float = None, 510 | min_silence_duration_ms: int = None, 511 | speech_pad_ms: int = None, 512 | neg_threshold: float = None, 513 | **kwargs 514 | ) -> List[Dict[str, Any]]: 515 | """ 516 | Get speech timestamps using Whisper VAD. 517 | """ 518 | if self.wrapper is None: 519 | logger.error(_("vad.not_initialized")) 520 | return [] 521 | 522 | # Use provided parameters or defaults from config 523 | threshold = threshold if threshold is not None else self.config.threshold 524 | neg_threshold = neg_threshold if neg_threshold is not None else self.config.neg_threshold 525 | min_speech_duration_ms = min_speech_duration_ms if min_speech_duration_ms is not None else self.config.min_speech_duration_ms 526 | max_speech_duration_s = max_speech_duration_s if max_speech_duration_s is not None else self.config.max_speech_duration_s 527 | min_silence_duration_ms = min_silence_duration_ms if min_silence_duration_ms is not None else self.config.min_silence_duration_ms 528 | speech_pad_ms = speech_pad_ms if speech_pad_ms is not None else self.config.speech_pad_ms 529 | 530 | # Use ONNX model for speech detection 531 | # Return sample indices (not seconds) for compatibility with faster_whisper 532 | segments = get_speech_timestamps_onnx( 533 | audio=audio, 534 | model=self.wrapper, 535 | threshold=threshold, 536 | sampling_rate=sampling_rate, 537 | min_speech_duration_ms=min_speech_duration_ms, 538 | max_speech_duration_s=max_speech_duration_s, 539 | min_silence_duration_ms=min_silence_duration_ms, 540 | speech_pad_ms=speech_pad_ms, 541 | return_seconds=False, # faster_whisper expects sample indices 542 | neg_threshold=neg_threshold, 543 | ) 544 | 545 | logger.debug(_("vad.speech_segments", count=len(segments))) 546 | return segments 547 | 548 | def get_device(self) -> str: 549 | """Get the device being used for VAD processing.""" 550 | if self.wrapper: 551 | return self.wrapper.device 552 | return "Not initialized" 553 | 554 | 555 | class VadModelManager: 556 | """ 557 | Manages different VAD model implementations. 558 | Provides a unified interface for VAD operations. 
559 | """ 560 | 561 | def __init__(self, config: Optional[VadConfig] = None, ttl: int = 3600, progress_callback: Optional[Callable[[int, int, str], None]] = None): 562 | self.config = config or VadConfig() 563 | self.ttl = ttl 564 | self.progress_callback = progress_callback 565 | self._models: Dict[str, VadModel] = {} # Instance variable, not class variable 566 | self._config = config 567 | 568 | # Register available models 569 | self._register_models() 570 | 571 | def _register_models(self): 572 | """Register available VAD models""" 573 | # Always recreate the model with the current progress callback 574 | self._models["whisper_vad"] = WhisperVadModel(self.config, progress_callback=self.progress_callback) 575 | logger.debug(_("vad.registered")) 576 | 577 | def get_model(self, model_id: str) -> VadModel: 578 | """Get a VAD model by ID""" 579 | if model_id not in self._models: 580 | logger.warning(_("vad.model_not_found", model_id=model_id)) 581 | model_id = self.config.default_model 582 | 583 | return self._models.get(model_id, self._models["whisper_vad"]) 584 | 585 | def get_speech_timestamps( 586 | self, 587 | model_id: str, 588 | audio: np.ndarray, 589 | sampling_rate: int = 16000, 590 | **kwargs 591 | ) -> List[Dict[str, Any]]: 592 | """ 593 | Get speech timestamps using specified model. 594 | 595 | Args: 596 | model_id: ID of the VAD model to use 597 | audio: Audio array 598 | sampling_rate: Sample rate of audio 599 | **kwargs: Additional parameters for the VAD model 600 | 601 | Returns: 602 | List of speech segments with start, end, and probability 603 | """ 604 | model = self.get_model(model_id) 605 | return model.get_speech_timestamps(audio, sampling_rate, **kwargs) 606 | 607 | @classmethod 608 | def get_available_models(cls) -> List[str]: 609 | """Get list of available VAD models""" 610 | # Since models are now instance variables, return the known model types 611 | return ["whisper_vad"] 612 | 613 | def get_device(self, model_id: str = None) -> str: 614 | """Get the device being used for VAD processing.""" 615 | if model_id is None: 616 | model_id = self.config.default_model 617 | model = self.get_model(model_id) 618 | if hasattr(model, 'get_device'): 619 | return model.get_device() 620 | return "Unknown" -------------------------------------------------------------------------------- /.github/workflows/build-release-conda.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release with Conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | pull_request: 8 | branches: [ main ] 9 | workflow_dispatch: 10 | inputs: 11 | release_version: 12 | description: 'Release version (e.g., v1.0.0)' 13 | required: false 14 | type: string 15 | include_chickenrice: 16 | description: 'Include Chickenrice model in releases' 17 | required: false 18 | type: boolean 19 | default: false 20 | 21 | jobs: 22 | build-windows: 23 | name: Build Windows - CUDA ${{ matrix.cuda }} ${{ matrix.model_variant }} 24 | runs-on: windows-latest 25 | defaults: 26 | run: 27 | shell: bash -el {0} # Important for conda activation on Windows 28 | strategy: 29 | matrix: 30 | include: 31 | # CUDA 11.8 versions 32 | - cuda: "11.8" 33 | env_file: "environment-cuda118.yml" 34 | env_name: "faster-whisper-cu118" 35 | artifact_suffix: "cu118" 36 | model_variant: "base" 37 | hf_model: "" 38 | - cuda: "11.8" 39 | env_file: "environment-cuda118.yml" 40 | env_name: "faster-whisper-cu118" 41 | artifact_suffix: "cu118-chickenrice" 42 | model_variant: "chickenrice" 43 | hf_model: 
"--hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2" 44 | # CUDA 12.2 versions 45 | - cuda: "12.2" 46 | env_file: "environment-cuda122.yml" 47 | env_name: "faster-whisper-cu122" 48 | artifact_suffix: "cu122" 49 | model_variant: "base" 50 | hf_model: "" 51 | - cuda: "12.2" 52 | env_file: "environment-cuda122.yml" 53 | env_name: "faster-whisper-cu122" 54 | artifact_suffix: "cu122-chickenrice" 55 | model_variant: "chickenrice" 56 | hf_model: "--hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2" 57 | # CUDA 12.8 versions 58 | - cuda: "12.8" 59 | env_file: "environment-cuda128.yml" 60 | env_name: "faster-whisper-cu128" 61 | artifact_suffix: "cu128" 62 | model_variant: "base" 63 | hf_model: "" 64 | - cuda: "12.8" 65 | env_file: "environment-cuda128.yml" 66 | env_name: "faster-whisper-cu128" 67 | artifact_suffix: "cu128-chickenrice" 68 | model_variant: "chickenrice" 69 | hf_model: "--hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2" 70 | 71 | steps: 72 | - name: Checkout code 73 | uses: actions/checkout@v4 74 | 75 | - name: Configure stdout buffering 76 | run: | 77 | # Enable line buffering for better performance with large outputs 78 | echo "Configuring buffering for improved CI performance..." 79 | echo "PYTHONUNBUFFERED=1" >> $GITHUB_ENV 80 | # For shell commands, we'll use stdbuf where needed 81 | echo "Buffering configuration complete." 82 | 83 | - name: Cache conda packages 84 | uses: actions/cache@v4 85 | id: conda-cache 86 | env: 87 | CACHE_NUMBER: 1 # Increment to invalidate cache 88 | with: 89 | path: | 90 | ~/conda_pkgs_dir 91 | key: ${{ runner.os }}-conda-pkgs-${{ matrix.env_name }}-${{ matrix.cuda }}-${{ env.CACHE_NUMBER }}-${{ hashFiles(matrix.env_file) }} 92 | restore-keys: | 93 | ${{ runner.os }}-conda-pkgs-${{ matrix.env_name }}-${{ matrix.cuda }}-${{ env.CACHE_NUMBER }}- 94 | ${{ runner.os }}-conda-pkgs-${{ matrix.env_name }}-${{ matrix.cuda }}- 95 | 96 | 97 | - name: Setup Miniforge 98 | uses: conda-incubator/setup-miniconda@v3 99 | with: 100 | miniforge-version: latest 101 | auto-update-conda: true 102 | environment-file: ${{ matrix.env_file }} 103 | activate-environment: ${{ matrix.env_name }} 104 | show-channel-urls: true 105 | use-only-tar-bz2: true 106 | use-mamba: true # Use mamba for faster dependency resolution 107 | # Add conda-pkgs-dir to use cached packages 108 | pkgs-dirs: ~/conda_pkgs_dir 109 | python-version: "3.10" 110 | 111 | - name: Force reinstall ctranslate2 for CUDA 11.8 112 | if: matrix.cuda == '11.8' 113 | run: | 114 | echo "Force reinstalling ctranslate2==3.24.0 for CUDA 11.8 compatibility..." 115 | pip install --force-reinstall ctranslate2==3.24.0 numpy==1.26.4 116 | echo "ctranslate2 reinstalled successfully" 117 | python -c "import ctranslate2; print(f'CTranslate2 version: {ctranslate2.__version__}')" 118 | 119 | - name: Fix onnxruntime CPU/GPU conflict 120 | run: | 121 | echo "Removing onnxruntime CPU version to avoid conflicts..." 122 | pip uninstall onnxruntime -y || true 123 | echo "" 124 | echo "Installing appropriate onnxruntime-gpu version for CUDA ${{ matrix.cuda }}..." 125 | if [ "${{ matrix.cuda }}" = "11.8" ]; then 126 | echo "Installing onnxruntime-gpu==1.18.0 for CUDA 11.8..." 127 | pip install onnxruntime-gpu==1.18.0 128 | elif [ "${{ matrix.cuda }}" = "12.2" ] || [ "${{ matrix.cuda }}" = "12.8" ]; then 129 | echo "Installing onnxruntime-gpu==1.20.2 for CUDA ${{ matrix.cuda }}..." 
130 | pip install onnxruntime-gpu==1.20.2 131 | else 132 | echo "Installing onnxruntime-gpu>=1.17.0 for CUDA ${{ matrix.cuda }}..." 133 | pip install "onnxruntime-gpu>=1.17.0" 134 | fi 135 | echo "" 136 | echo "Verifying onnxruntime-gpu installation..." 137 | python -c "import onnxruntime as ort; print(f'ONNX Runtime version: {ort.__version__}'); print(f'Available providers: {ort.get_available_providers()}')" || echo "Note: GPU providers won't show on GitHub runners (no GPU)" 138 | 139 | - name: Apply batch transcribe patch to faster-whisper 140 | run: | 141 | echo "Applying batch transcribe patch to faster-whisper package..." 142 | # Find the faster-whisper package installation directory 143 | FASTER_WHISPER_PATH=$(python -c "import faster_whisper; import os; print(os.path.dirname(faster_whisper.__file__))") 144 | echo "faster-whisper package location: $FASTER_WHISPER_PATH" 145 | 146 | # Verify the transcribe.py file exists 147 | if [ -f "$FASTER_WHISPER_PATH/transcribe.py" ]; then 148 | echo "Found transcribe.py at: $FASTER_WHISPER_PATH/transcribe.py" 149 | 150 | # Apply the batch transcribe patch 151 | echo "Applying batch-transcribe.patch..." 152 | patch -p1 -d "$FASTER_WHISPER_PATH/.." < patches/batch-transcribe.patch 153 | 154 | # Verify patch was applied 155 | if [ $? -eq 0 ]; then 156 | echo "Batch transcribe patch applied successfully!" 157 | 158 | # Display the patched sections for verification 159 | echo "" 160 | echo "Verifying batch transcribe patch was applied correctly..." 161 | echo "Checking for max_initial_timestamp_index calculation:" 162 | grep -A 2 -B 2 "max_initial_timestamp_index = int" "$FASTER_WHISPER_PATH/transcribe.py" || echo "Pattern not found" 163 | 164 | echo "" 165 | echo "Checking for without_timestamps default value:" 166 | grep "without_timestamps: bool = False" "$FASTER_WHISPER_PATH/transcribe.py" || echo "Pattern not found" 167 | 168 | echo "" 169 | echo "Checking for max_initial_timestamp parameter pass-through:" 170 | grep "max_initial_timestamp=max_initial_timestamp" "$FASTER_WHISPER_PATH/transcribe.py" || echo "Pattern not found" 171 | else 172 | echo "WARNING: Batch transcribe patch failed to apply, but continuing build..." 173 | echo "This might be because the patch is already applied or the file has changed." 174 | fi 175 | else 176 | echo "ERROR: Could not find transcribe.py at expected location!" 177 | echo "Build will continue without batch transcribe patch." 
178 | fi 179 | 180 | - name: Report conda cache status 181 | run: | 182 | echo "Conda packages cache hit: ${{ steps.conda-cache.outputs.cache-hit }}" 183 | if [ "${{ steps.conda-cache.outputs.cache-hit }}" = "true" ]; then 184 | echo "Package cache was restored, installation should be faster" 185 | else 186 | echo "Package cache miss, downloading packages" 187 | fi 188 | echo "" 189 | echo "Conda environment location:" 190 | conda info --envs 191 | 192 | - name: Cache HuggingFace and whisper-base models 193 | uses: actions/cache@v4 194 | with: 195 | # Cache HuggingFace model subdirectories and whisper-base, not VAD models 196 | # The chickenrice model goes into models/whisper-large-v2-translate-zh-v0.2-st-ct2/ 197 | # The whisper-base goes into models/whisper-base/ 198 | path: | 199 | models/*/ 200 | !models/*.onnx 201 | !models/*.json 202 | key: hf-model-${{ matrix.model_variant }}-${{ hashFiles('download_models.py') }}-${{ matrix.hf_model }}-whisper-base 203 | restore-keys: | 204 | hf-model-${{ matrix.model_variant }}-${{ hashFiles('download_models.py') }}-${{ matrix.hf_model }}- 205 | hf-model-${{ matrix.model_variant }}-${{ hashFiles('download_models.py') }}- 206 | 207 | - name: Display environment info 208 | run: | 209 | conda info 210 | conda list 211 | python --version 212 | python -c "import ctranslate2; print(f'CTranslate2 version: {ctranslate2.__version__}')" 213 | echo "Note: CUDA availability check skipped (no GPU on GitHub runners)" 214 | 215 | - name: Check cached models 216 | run: | 217 | echo "Checking for cached models..." 218 | if [ -d "models" ]; then 219 | echo "Models directory exists:" 220 | # Use buffered find instead of ls for better performance 221 | find models -maxdepth 1 -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null | head -20 222 | echo "" 223 | echo "Model subdirectories (HuggingFace models and whisper-base, cached):" 224 | # Pre-calculate all sizes at once 225 | du -sh models/*/ 2>/dev/null | while read size dir; do 226 | echo " - $(basename "$dir"): $size" 227 | done 228 | echo "" 229 | echo "Root model files (VAD models, not cached):" 230 | find models -maxdepth 1 \( -name "*.onnx" -o -name "*.json" \) -printf "%s %p\n" 2>/dev/null | \ 231 | awk '{size=$1; $1=""; printf " %s (%s)\n", $0, size}' || echo " No VAD model files yet" 232 | else 233 | echo "Models directory does not exist yet" 234 | fi 235 | 236 | - name: Download models 237 | run: | 238 | python download_models.py ${{ matrix.hf_model }} 239 | continue-on-error: false 240 | 241 | - name: Verify downloaded models 242 | run: | 243 | echo "Model files after download:" 244 | echo "" 245 | echo "VAD models (not cached, always re-downloaded):" 246 | find models -maxdepth 1 \( -name "*.onnx" -o -name "*.json" \) -printf " %p (%s bytes)\n" 2>/dev/null || echo " No VAD model files found" 247 | echo "" 248 | echo "Whisper-base feature extractor files:" 249 | if [ -d "models/whisper-base" ]; then 250 | find models/whisper-base -type f -printf " %p (%s bytes)\n" 2>/dev/null | head -10 251 | else 252 | echo " Not yet downloaded" 253 | fi 254 | echo "" 255 | if [ "${{ matrix.hf_model }}" != "" ]; then 256 | echo "HuggingFace models (cached):" 257 | # Pre-calculate all directory sizes 258 | du -sh models/*/ 2>/dev/null > /tmp/model_sizes.txt 259 | for dir in models/*/; do 260 | if [ -d "$dir" ]; then 261 | echo " Directory: $(basename "$dir")" 262 | find "$dir" -maxdepth 1 -type f -printf " %f (%s bytes)\n" 2>/dev/null | head -10 263 | size=$(grep "$dir" /tmp/model_sizes.txt | cut -f1) 264 | echo " 
Total size: $size" 265 | echo "" 266 | fi 267 | done 268 | else 269 | echo "No HuggingFace models (base variant)" 270 | fi 271 | 272 | 273 | - name: Build with PyInstaller 274 | run: | 275 | echo "Using conda environment: $CONDA_DEFAULT_ENV" 276 | echo "Python path: $(which python)" 277 | echo "PyInstaller version:" 278 | python -m pip show pyinstaller 279 | export PYTHONPATH="${PYTHONPATH}:${PWD}/src" 280 | 281 | python build_windows.py 282 | 283 | - name: Copy models to distribution 284 | run: | 285 | echo "Copying models to distribution directory..." 286 | if [ -d "models" ]; then 287 | echo "Found models directory" 288 | echo "Contents of models directory:" 289 | find models -maxdepth 1 -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null 290 | echo "" 291 | 292 | # Create models directory in dist 293 | mkdir -p dist/faster_whisper_transwithai_chickenrice/models 294 | 295 | # Copy VAD model files (always included) 296 | echo "Copying VAD models..." 297 | cp models/*.onnx dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 298 | cp models/*.json dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 299 | 300 | # Copy whisper-base for feature extractor (always included for offline usage) 301 | echo "Copying whisper-base for feature extractor..." 302 | if [ -d "models/whisper-base" ]; then 303 | echo " Found whisper-base directory, copying..." 304 | cp -r models/whisper-base dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 305 | echo " Whisper-base copied for offline feature extractor support" 306 | else 307 | echo " WARNING: whisper-base directory not found" 308 | fi 309 | 310 | # Copy HuggingFace models if this is a chickenrice variant 311 | if [ "${{ matrix.model_variant }}" = "chickenrice" ]; then 312 | echo "Copying Chickenrice model..." 313 | for dir in models/*/; do 314 | if [ -d "$dir" ]; then 315 | model_name=$(basename "$dir") 316 | # Skip whisper-base as we already copied it 317 | if [ "$model_name" != "whisper-base" ]; then 318 | echo " Copying model contents from: $model_name" 319 | # Copy the contents of the model directory, not the directory itself 320 | cp -r "$dir"* dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 321 | # Also copy hidden files if any exist 322 | cp -r "$dir".* dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 323 | fi 324 | fi 325 | done 326 | fi 327 | 328 | echo "" 329 | echo "Models in distribution:" 330 | find dist/faster_whisper_transwithai_chickenrice/models -maxdepth 1 -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null 331 | echo "" 332 | echo "Total distribution size:" 333 | du -sh dist/faster_whisper_transwithai_chickenrice/ 334 | else 335 | echo "WARNING: Models directory not found!" 336 | fi 337 | 338 | - name: Copy batch files, configuration, and documentation 339 | run: | 340 | echo "Copying batch files, configuration, and documentation to distribution..." 
341 | 342 | # Copy usage instructions 343 | if [ -f "使用说明.txt" ]; then 344 | cp "使用说明.txt" dist/faster_whisper_transwithai_chickenrice/ 345 | echo "Copied: 使用说明.txt" 346 | fi 347 | 348 | # Copy release notes 349 | if [ -f "RELEASE_NOTES_CN.md" ]; then 350 | cp "RELEASE_NOTES_CN.md" dist/faster_whisper_transwithai_chickenrice/ 351 | echo "Copied: RELEASE_NOTES_CN.md" 352 | fi 353 | 354 | # Copy generation config to root directory (for easy user editing) 355 | if [ -f "generation_config.json5" ]; then 356 | cp "generation_config.json5" dist/faster_whisper_transwithai_chickenrice/ 357 | echo "Copied: generation_config.json5 to root directory" 358 | fi 359 | 360 | # Copy all batch files 361 | for bat_file in *.bat; do 362 | if [ -f "$bat_file" ]; then 363 | # Skip any build-related batch files 364 | if [[ "$bat_file" != *"build"* ]] && [[ "$bat_file" == "运行"* ]]; then 365 | cp "$bat_file" dist/faster_whisper_transwithai_chickenrice/ 366 | echo "Copied: $bat_file" 367 | fi 368 | fi 369 | done 370 | 371 | echo "" 372 | echo "Distribution contents:" 373 | find dist/faster_whisper_transwithai_chickenrice -maxdepth 1 \( -name "*.bat" -o -name "*.txt" -o -name "*.md" -o -name "*.json5" \) -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null || echo "No batch/text/config files found" 374 | 375 | - name: Test executable (CPU mode) 376 | shell: cmd /C CALL {0} 377 | run: | 378 | cd dist\faster_whisper_transwithai_chickenrice 379 | infer.exe --help 380 | 381 | - name: Upload artifact 382 | uses: actions/upload-artifact@v4 383 | with: 384 | name: faster_whisper_transwithai_windows_${{ matrix.artifact_suffix }} 385 | path: dist/faster_whisper_transwithai_chickenrice/ 386 | retention-days: 30 387 | 388 | - name: List artifact directory structure 389 | run: | 390 | echo "========================================================================" 391 | echo "📦 ARTIFACT DIRECTORY STRUCTURE" 392 | echo "========================================================================" 393 | echo "Build variant: ${{ matrix.artifact_suffix }}" 394 | echo "CUDA version: ${{ matrix.cuda }}" 395 | echo "Model variant: ${{ matrix.model_variant }}" 396 | echo "------------------------------------------------------------------------" 397 | 398 | # Simple directory tree (depth limited to 3) 399 | echo "Directory structure:" 400 | find dist/faster_whisper_transwithai_chickenrice -type d -maxdepth 3 | \ 401 | sed 's|[^/]*/| |g' | \ 402 | sed 's|^ |dist/|' 403 | 404 | echo "" 405 | echo "Total artifact size: $(du -sh dist/faster_whisper_transwithai_chickenrice/ | cut -f1)" 406 | echo "========================================================================" 407 | 408 | # Create the initial GitHub release 409 | create-release: 410 | name: Create GitHub Release 411 | needs: [build-windows] 412 | runs-on: ubuntu-latest 413 | if: startsWith(github.ref, 'refs/tags/v') 414 | permissions: 415 | contents: write 416 | outputs: 417 | release_created: ${{ steps.create.outputs.release_created }} 418 | 419 | steps: 420 | - name: Checkout code for release notes 421 | uses: actions/checkout@v4 422 | with: 423 | sparse-checkout: | 424 | RELEASE_NOTES_CN.md 425 | sparse-checkout-cone-mode: false 426 | 427 | - name: Read release body 428 | id: read_body 429 | run: | 430 | if [ -f "RELEASE_NOTES_CN.md" ]; then 431 | echo 'body<<EOF' >> $GITHUB_OUTPUT 432 | cat RELEASE_NOTES_CN.md >> $GITHUB_OUTPUT 433 | echo 'EOF' >> $GITHUB_OUTPUT 434 | else 435 | echo 'body=Release created by GitHub Actions' >> $GITHUB_OUTPUT 436 | fi 437 | 438 | - name: Create empty
placeholder file for initial release 439 | run: | 440 | echo "This release contains large binary files. Files are being uploaded..." > placeholder.txt 441 | 442 | - name: Create Release with placeholder 443 | id: create 444 | uses: ading2210/gh-large-releases@v1 445 | with: 446 | repository: ${{ github.repository }} 447 | tag_name: ${{ github.ref }} 448 | name: ${{ github.ref_name }} 449 | body: ${{ steps.read_body.outputs.body }} 450 | draft: false 451 | prerelease: false 452 | files: placeholder.txt 453 | token: ${{ secrets.GITHUB_TOKEN }} 454 | 455 | - name: Set output 456 | run: echo "release_created=true" >> $GITHUB_OUTPUT 457 | 458 | # Parallel upload jobs - each handles one artifact 459 | upload-cu118: 460 | name: Upload CUDA 11.8 Base 461 | needs: [create-release] 462 | runs-on: ubuntu-latest 463 | if: startsWith(github.ref, 'refs/tags/v') 464 | permissions: 465 | contents: write 466 | 467 | steps: 468 | - name: Download artifact 469 | uses: actions/download-artifact@v6 470 | with: 471 | name: faster_whisper_transwithai_windows_cu118 472 | path: artifact/ 473 | 474 | - name: Create archive with optimized compression 475 | run: | 476 | cd artifact 477 | echo "Creating archive for CUDA 11.8 base variant..." 478 | # Using compression level 5 for faster builds (was level 9) 479 | # Level 5 provides good balance between speed and compression ratio 480 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu118.zip . 481 | cd .. 482 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu118.zip | awk '{print $5}')" 483 | 484 | - name: Upload to release with large file support 485 | uses: ading2210/gh-large-releases@v1 486 | with: 487 | repository: ${{ github.repository }} 488 | tag_name: ${{ github.ref }} 489 | files: faster_whisper_transwithai_windows_cu118.zip 490 | token: ${{ secrets.GITHUB_TOKEN }} 491 | 492 | upload-cu118-chickenrice: 493 | name: Upload CUDA 11.8 Chickenrice 494 | needs: [create-release] 495 | runs-on: ubuntu-latest 496 | if: startsWith(github.ref, 'refs/tags/v') 497 | permissions: 498 | contents: write 499 | 500 | steps: 501 | - name: Download artifact 502 | uses: actions/download-artifact@v6 503 | with: 504 | name: faster_whisper_transwithai_windows_cu118-chickenrice 505 | path: artifact/ 506 | 507 | - name: Create archive with optimized compression 508 | run: | 509 | cd artifact 510 | echo "Creating archive for CUDA 11.8 chickenrice variant..." 511 | # Using compression level 5 for faster builds (was level 9) 512 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu118-chickenrice.zip . 513 | cd .. 514 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu118-chickenrice.zip | awk '{print $5}')" 515 | 516 | - name: Upload to release with large file support 517 | uses: ading2210/gh-large-releases@v1 518 | with: 519 | repository: ${{ github.repository }} 520 | tag_name: ${{ github.ref }} 521 | files: faster_whisper_transwithai_windows_cu118-chickenrice.zip 522 | token: ${{ secrets.GITHUB_TOKEN }} 523 | 524 | upload-cu122: 525 | name: Upload CUDA 12.2 Base 526 | needs: [create-release] 527 | runs-on: ubuntu-latest 528 | if: startsWith(github.ref, 'refs/tags/v') 529 | permissions: 530 | contents: write 531 | 532 | steps: 533 | - name: Download artifact 534 | uses: actions/download-artifact@v6 535 | with: 536 | name: faster_whisper_transwithai_windows_cu122 537 | path: artifact/ 538 | 539 | - name: Create archive with optimized compression 540 | run: | 541 | cd artifact 542 | echo "Creating archive for CUDA 12.2 base variant..." 
543 | # Using compression level 5 for faster builds (was level 9) 544 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu122.zip . 545 | cd .. 546 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu122.zip | awk '{print $5}')" 547 | 548 | - name: Upload to release with large file support 549 | uses: ading2210/gh-large-releases@v1 550 | with: 551 | repository: ${{ github.repository }} 552 | tag_name: ${{ github.ref }} 553 | files: faster_whisper_transwithai_windows_cu122.zip 554 | token: ${{ secrets.GITHUB_TOKEN }} 555 | 556 | upload-cu122-chickenrice: 557 | name: Upload CUDA 12.2 Chickenrice 558 | needs: [create-release] 559 | runs-on: ubuntu-latest 560 | if: startsWith(github.ref, 'refs/tags/v') 561 | permissions: 562 | contents: write 563 | 564 | steps: 565 | - name: Download artifact 566 | uses: actions/download-artifact@v6 567 | with: 568 | name: faster_whisper_transwithai_windows_cu122-chickenrice 569 | path: artifact/ 570 | 571 | - name: Create archive with optimized compression 572 | run: | 573 | cd artifact 574 | echo "Creating archive for CUDA 12.2 chickenrice variant..." 575 | # Using compression level 5 for faster builds (was level 9) 576 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu122-chickenrice.zip . 577 | cd .. 578 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu122-chickenrice.zip | awk '{print $5}')" 579 | 580 | - name: Upload to release with large file support 581 | uses: ading2210/gh-large-releases@v1 582 | with: 583 | repository: ${{ github.repository }} 584 | tag_name: ${{ github.ref }} 585 | files: faster_whisper_transwithai_windows_cu122-chickenrice.zip 586 | token: ${{ secrets.GITHUB_TOKEN }} 587 | 588 | upload-cu128: 589 | name: Upload CUDA 12.8 Base 590 | needs: [create-release] 591 | runs-on: ubuntu-latest 592 | if: startsWith(github.ref, 'refs/tags/v') 593 | permissions: 594 | contents: write 595 | 596 | steps: 597 | - name: Download artifact 598 | uses: actions/download-artifact@v6 599 | with: 600 | name: faster_whisper_transwithai_windows_cu128 601 | path: artifact/ 602 | 603 | - name: Create archive with optimized compression 604 | run: | 605 | cd artifact 606 | echo "Creating archive for CUDA 12.8 base variant..." 607 | # Using compression level 5 for faster builds (was level 9) 608 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu128.zip . 609 | cd .. 610 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu128.zip | awk '{print $5}')" 611 | 612 | - name: Upload to release with large file support 613 | uses: ading2210/gh-large-releases@v1 614 | with: 615 | repository: ${{ github.repository }} 616 | tag_name: ${{ github.ref }} 617 | files: faster_whisper_transwithai_windows_cu128.zip 618 | token: ${{ secrets.GITHUB_TOKEN }} 619 | 620 | upload-cu128-chickenrice: 621 | name: Upload CUDA 12.8 Chickenrice 622 | needs: [create-release] 623 | runs-on: ubuntu-latest 624 | if: startsWith(github.ref, 'refs/tags/v') 625 | permissions: 626 | contents: write 627 | 628 | steps: 629 | - name: Download artifact 630 | uses: actions/download-artifact@v6 631 | with: 632 | name: faster_whisper_transwithai_windows_cu128-chickenrice 633 | path: artifact/ 634 | 635 | - name: Create archive with optimized compression 636 | run: | 637 | cd artifact 638 | echo "Creating archive for CUDA 12.8 chickenrice variant..." 639 | # Using compression level 5 for faster builds (was level 9) 640 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu128-chickenrice.zip . 641 | cd .. 
642 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu128-chickenrice.zip | awk '{print $5}')" 643 | 644 | - name: Upload to release with large file support 645 | uses: ading2210/gh-large-releases@v1 646 | with: 647 | repository: ${{ github.repository }} 648 | tag_name: ${{ github.ref }} 649 | files: faster_whisper_transwithai_windows_cu128-chickenrice.zip 650 | token: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/infer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Inference script with custom VAD injection support 4 | """ 5 | 6 | import argparse 7 | import sys 8 | import logging 9 | import os 10 | import json 11 | import code 12 | import platform 13 | import subprocess 14 | import traceback 15 | from dataclasses import dataclass 16 | from pathlib import Path 17 | from collections import ChainMap 18 | from typing import Optional, Dict, Any 19 | 20 | import pyjson5 21 | from faster_whisper import WhisperModel, BatchedInferencePipeline 22 | import ctranslate2 23 | 24 | # Import our VAD injection system 25 | from . import inject_vad, uninject_vad, VadOptionsCompat 26 | from .vad_manager import VadConfig 27 | 28 | # Import modern i18n module for translations 29 | from . import i18n_modern as i18n 30 | 31 | # Convenience imports 32 | _ = i18n._ 33 | format_duration = i18n.format_duration 34 | format_percentage = i18n.format_percentage 35 | 36 | 37 | def parse_arguments(): 38 | parser = argparse.ArgumentParser(description=_("app.description")) 39 | parser.add_argument('--model_name_or_path', type=str, default="models", 40 | help=_("args.model_path")) 41 | parser.add_argument('--device', type=str, default='auto', 42 | help=_("args.device")) 43 | parser.add_argument('--compute_type', type=str, default='auto', 44 | help=_("args.compute_type")) 45 | parser.add_argument('--overwrite', action='store_true', default=False, 46 | help=_("args.overwrite")) 47 | parser.add_argument('--audio_suffixes', type=str, default="wav,flac,mp3", 48 | help=_("args.audio_extensions")) 49 | parser.add_argument('--sub_formats', type=str, default="lrc,vtt", 50 | help=_("args.subtitle_formats")) 51 | parser.add_argument('--output_dir', type=str, default=None, 52 | help=_("args.output_dir")) 53 | parser.add_argument('--generation_config', type=str, default="generation_config.json5", 54 | help=_("args.config_file")) 55 | parser.add_argument('--log_level', type=str, default="DEBUG", 56 | help=_("args.log_level")) 57 | 58 | # VAD parameter overrides (whisper_vad is always used) 59 | parser.add_argument('--vad_threshold', type=float, default=None, 60 | help=_("args.vad_threshold")) 61 | parser.add_argument('--vad_min_speech_duration_ms', type=int, default=None, 62 | help=_("args.min_speech_duration")) 63 | parser.add_argument('--vad_min_silence_duration_ms', type=int, default=None, 64 | help=_("args.min_silence_duration")) 65 | parser.add_argument('--vad_speech_pad_ms', type=int, default=None, 66 | help=_("args.speech_padding")) 67 | 68 | # Debug option for interactive console 69 | parser.add_argument('--console', action='store_true', 70 | help="Launch interactive Python console for debugging") 71 | 72 | # Batch inference options 73 | parser.add_argument('--enable_batching', action='store_true', 74 | help="Enable batched inference for faster processing (requires more VRAM)") 75 | parser.add_argument('--batch_size', type=int, 
default=None, 76 | help="Batch size for batched inference (auto-detect if not specified)") 77 | parser.add_argument('--max_batch_size', type=int, default=8, 78 | help="Maximum batch size to try when auto-detecting (default: 8)") 79 | 80 | parser.add_argument('base_dirs', nargs=argparse.REMAINDER, 81 | help=_("args.directories")) 82 | return parser.parse_args() 83 | 84 | 85 | def select_best_compute_type(device: str) -> str: 86 | """ 87 | Automatically select the best compute type based on device and available types. 88 | 89 | Preference order: 90 | - bfloat16 > float16 > int8 types > float32 91 | - Prefer int8 over float32 for better memory usage 92 | 93 | Args: 94 | device: The device to use ('cpu', 'cuda', or 'auto') 95 | 96 | Returns: 97 | The best available compute type for the device 98 | """ 99 | # Determine the actual device if 'auto' is specified 100 | actual_device = device 101 | if device == 'auto': 102 | # Check if CUDA devices are actually available 103 | # First check CUDA_VISIBLE_DEVICES environment variable 104 | import os 105 | cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', None) 106 | 107 | if cuda_visible == '': 108 | # Empty string means CUDA is explicitly disabled 109 | actual_device = 'cpu' 110 | elif cuda_visible == '-1': 111 | # -1 also means CUDA is disabled 112 | actual_device = 'cpu' 113 | else: 114 | # Try to check if CUDA is actually available by attempting to get its compute types 115 | # and checking if we can actually use it 116 | try: 117 | # Try to get CUDA compute types 118 | cuda_types = ctranslate2.get_supported_compute_types('cuda') 119 | # Also check if we can import and use faster_whisper with CUDA 120 | # This is a more reliable check 121 | from faster_whisper import WhisperModel 122 | # Try to get default device - if CUDA not available, this should fail 123 | # Note: We're not actually loading a model, just checking device availability 124 | if cuda_visible is not None: 125 | # CUDA_VISIBLE_DEVICES is set to specific devices 126 | # Make sure at least one device is visible 127 | visible_devices = [d.strip() for d in cuda_visible.split(',') if d.strip()] 128 | if not visible_devices: 129 | actual_device = 'cpu' 130 | else: 131 | actual_device = 'cuda' 132 | else: 133 | # CUDA_VISIBLE_DEVICES not set, CUDA should be available if drivers installed 134 | actual_device = 'cuda' 135 | except Exception as e: 136 | # If we can't get CUDA types or import fails, fall back to CPU 137 | actual_device = 'cpu' 138 | logger.info(_("info.auto_detected_device").format(device=actual_device)) 139 | 140 | # Get supported compute types for the device 141 | try: 142 | supported_types = ctranslate2.get_supported_compute_types(actual_device) 143 | except Exception as e: 144 | logger.warning(_("warnings.compute_types_unavailable").format(device=actual_device, error=e)) 145 | # Fallback to safe default 146 | return 'int8' if actual_device == 'cpu' else 'float16' 147 | 148 | # Define preference order 149 | # Prefer bfloat16 > float16 > int8 types > float32 150 | preference_order = [ 151 | 'bfloat16', 152 | 'float16', 153 | 'int16', # For CPU 154 | 'int8_bfloat16', 155 | 'int8_float16', 156 | 'int8_float32', 157 | 'int8', 158 | 'float32' # Least preferred due to memory usage 159 | ] 160 | 161 | # Select the best available type based on preference 162 | for compute_type in preference_order: 163 | if compute_type in supported_types: 164 | logger.info(_("info.auto_selected_compute_type").format(compute_type=compute_type, device=actual_device)) 165 | return compute_type 166 
| 167 | # If nothing matched (shouldn't happen), use a safe default 168 | default = 'int8' if actual_device == 'cpu' else 'float16' 169 | logger.warning(_("warnings.no_preferred_compute_type").format(default=default)) 170 | return default 171 | 172 | 173 | @dataclass 174 | class Segment: 175 | start: int # ms 176 | end: int # ms 177 | text: str 178 | 179 | 180 | def merge_segments(segments: list[Segment]) -> list[Segment]: 181 | segments.sort(key=lambda s: s.start) 182 | merged: list[Segment] = [] 183 | i = 0 184 | while i < len(segments): 185 | if segments[i].text.strip() == '': 186 | i += 1 187 | continue 188 | start, end, text = segments[i].start, segments[i].end, segments[i].text 189 | j = i + 1 190 | while j < len(segments): 191 | if segments[j].text.startswith(text): 192 | end, text = segments[j].end, segments[j].text 193 | j += 1 194 | continue 195 | break 196 | k = j 197 | while k < len(segments): 198 | if segments[k].text.strip() == '': 199 | break 200 | if text.endswith(segments[k].text): 201 | end = segments[k].end 202 | k += 1 203 | continue 204 | break 205 | merged.append(Segment(start=start, end=end, text=text)) 206 | i = j 207 | return merged 208 | 209 | 210 | class SubWriter: 211 | @classmethod 212 | def txt(cls, segments: list[Segment], path: str): 213 | lines = [] 214 | for idx, segment in enumerate(segments): 215 | lines.append(f"{segment.text}\n") 216 | with open(path, "w", encoding="utf-8") as f: 217 | f.writelines(lines) 218 | 219 | @classmethod 220 | def lrc(cls, segments: list[Segment], path: str): 221 | lines = [] 222 | for idx, segment in enumerate(segments): 223 | start_ts = cls.lrc_timestamp(segment.start) 224 | end_es = cls.lrc_timestamp(segment.end) 225 | lines.append(f"[{start_ts}]{segment.text}\n") 226 | if idx != len(segments) - 1: 227 | next_start = segments[idx + 1].start 228 | if next_start is not None and end_es == cls.lrc_timestamp(next_start): 229 | continue 230 | lines.append(f"[{end_es}]\n") 231 | with open(path, "w", encoding="utf-8") as f: 232 | f.writelines(lines) 233 | 234 | @staticmethod 235 | def lrc_timestamp(ms: int) -> str: 236 | m = ms // 60_000 237 | ms = ms - m * 60_000 238 | s = ms // 1_000 239 | ms = ms - s * 1_000 240 | ms = ms // 10 241 | return f"{m:02d}:{s:02d}.{ms:02d}" 242 | 243 | @classmethod 244 | def vtt(cls, segments: list[Segment], path: str): 245 | lines = ["WEBVTT\n\n"] # The WebVTT file signature must be the upper-case string "WEBVTT" 246 | for idx, segment in enumerate(segments): 247 | lines.append(f"{idx + 1}\n") 248 | lines.append(f"{cls.vtt_timestamp(segment.start)} --> {cls.vtt_timestamp(segment.end)}\n") 249 | lines.append(f"{segment.text}\n\n") 250 | with open(path, "w", encoding="utf-8") as f: 251 | f.writelines(lines) 252 | 253 | @classmethod 254 | def vtt_timestamp(cls, ms: int): 255 | return cls._timestamp(ms, '.') 256 | 257 | @classmethod 258 | def srt(cls, segments: list[Segment], path: str): 259 | lines = [] 260 | for idx, segment in enumerate(segments): 261 | lines.append(f"{idx + 1}\n") 262 | lines.append(f"{cls.srt_timestamp(segment.start)} --> {cls.srt_timestamp(segment.end)}\n") 263 | lines.append(f"{segment.text}\n\n") 264 | with open(path, "w", encoding="utf-8") as f: 265 | f.writelines(lines) 266 | 267 | @classmethod 268 | def srt_timestamp(cls, ms: int): 269 | return cls._timestamp(ms, ',') 270 | 271 | @classmethod 272 | def _timestamp(cls, ms: int, delim: str): 273 | h = ms // 3600_000 274 | ms -= h * 3600_000 275 | m = ms // 60_000 276 | ms -= m * 60_000 277 | s = ms // 1_000 278 | ms -= s * 1_000 279 | return ( 280 | f"{h:02d}:{m:02d}:{s:02d}{delim}{ms:03d}" 281
| ) 282 | 283 | 284 | @dataclass 285 | class InferenceTask: 286 | audio_path: str 287 | sub_prefix: str 288 | sub_formats: list[str] 289 | 290 | 291 | logger = logging.getLogger(__name__) 292 | log_handler = logging.StreamHandler() 293 | log_handler.setFormatter(logging.Formatter('%(message)s')) 294 | logger.addHandler(log_handler) 295 | 296 | 297 | class Inference: 298 | sub_writers = {"lrc": SubWriter.lrc, "srt": SubWriter.srt, "vtt": SubWriter.vtt, "txt": SubWriter.txt} 299 | 300 | def __init__(self, args): 301 | self.args = args 302 | self.model_name_or_path = args.model_name_or_path 303 | self.device = args.device 304 | # Auto-select compute type if 'auto' or 'default' is specified 305 | if args.compute_type in ['auto', 'default']: 306 | self.compute_type = select_best_compute_type(self.device) 307 | else: 308 | self.compute_type = args.compute_type 309 | 310 | # Batch inference settings 311 | self.enable_batching = args.enable_batching 312 | self.batch_size = args.batch_size if args.batch_size else 0 313 | self.max_batch_size = args.max_batch_size 314 | 315 | self.overwrite = args.overwrite 316 | self.output_dir = args.output_dir 317 | if self.output_dir: 318 | if not os.path.isabs(self.output_dir): 319 | self.output_dir = os.path.join(os.getcwd(), self.output_dir) 320 | logger.info(_("info.output_dir", output_dir=self.output_dir)) 321 | self.audio_suffixes = {k: True for k in args.audio_suffixes.split(',')} 322 | self.sub_formats = [] 323 | for k in args.sub_formats.split(','): 324 | if k not in self.sub_writers: 325 | raise ValueError(_("warnings.unknown_format", format=k)) 326 | self.sub_formats.append(k) 327 | 328 | # Load generation config 329 | self.generation_config = self._load_generation_config(args) 330 | 331 | # Setup VAD injection if requested 332 | self._setup_vad_injection(args) 333 | 334 | logger.info(_("info.generation_config", config=self.generation_config)) 335 | 336 | def _load_generation_config(self, args) -> Dict[str, Any]: 337 | """Load and process generation configuration""" 338 | # Default config 339 | config = { 340 | "language": "ja", 341 | "task": "translate", 342 | "vad_filter": True, 343 | } 344 | 345 | 346 | # Load from file if exists 347 | if os.path.exists(args.generation_config): 348 | with open(args.generation_config, "r", encoding='utf-8') as f: 349 | file_config = pyjson5.decode_io(f) 350 | config = dict(**ChainMap(file_config, config)) 351 | 352 | # Process VAD parameters from config file 353 | if "vad_parameters" in config: 354 | vad_params = config.pop("vad_parameters") 355 | 356 | # Convert to VadOptions format 357 | vad_options = {} 358 | 359 | # Map common parameters 360 | if "threshold" in vad_params: 361 | vad_options["threshold"] = vad_params["threshold"] 362 | if "neg_threshold" in vad_params: 363 | vad_options["neg_threshold"] = vad_params["neg_threshold"] 364 | if "min_speech_duration_ms" in vad_params: 365 | vad_options["min_speech_duration_ms"] = vad_params["min_speech_duration_ms"] 366 | if "max_speech_duration_s" in vad_params: 367 | vad_options["max_speech_duration_s"] = vad_params["max_speech_duration_s"] 368 | if "min_silence_duration_ms" in vad_params: 369 | vad_options["min_silence_duration_ms"] = vad_params["min_silence_duration_ms"] 370 | if "speech_pad_ms" in vad_params: 371 | vad_options["speech_pad_ms"] = vad_params["speech_pad_ms"] 372 | 373 | config["vad_parameters"] = vad_options 374 | 375 | # Override with command line arguments 376 | if args.vad_threshold is not None: 377 | if "vad_parameters" not in config: 378 | 
config["vad_parameters"] = {} 379 | config["vad_parameters"]["threshold"] = args.vad_threshold 380 | 381 | if args.vad_min_speech_duration_ms is not None: 382 | if "vad_parameters" not in config: 383 | config["vad_parameters"] = {} 384 | config["vad_parameters"]["min_speech_duration_ms"] = args.vad_min_speech_duration_ms 385 | 386 | if args.vad_min_silence_duration_ms is not None: 387 | if "vad_parameters" not in config: 388 | config["vad_parameters"] = {} 389 | config["vad_parameters"]["min_silence_duration_ms"] = args.vad_min_silence_duration_ms 390 | 391 | if args.vad_speech_pad_ms is not None: 392 | if "vad_parameters" not in config: 393 | config["vad_parameters"] = {} 394 | config["vad_parameters"]["speech_pad_ms"] = args.vad_speech_pad_ms 395 | 396 | return config 397 | 398 | def _vad_progress_callback(self, chunk_idx, total_chunks, device): 399 | """Progress callback for VAD processing.""" 400 | progress_pct = (chunk_idx / total_chunks) * 100 401 | # Use carriage return to update the same line 402 | print("\r " + _("progress.vad", current=chunk_idx, total=total_chunks, 403 | percent=progress_pct, device=device), end="", flush=True) 404 | if chunk_idx == total_chunks: 405 | print() # New line when done 406 | 407 | def _setup_vad_injection(self, args): 408 | """Setup whisper_vad injection - always enforced""" 409 | # Always use whisper_vad model 410 | vad_model = "whisper_vad" 411 | 412 | logger.info(_("info.initializing_vad")) 413 | 414 | # Create VAD config with progress callback 415 | vad_config = VadConfig(default_model=vad_model) 416 | 417 | # Apply VAD parameters from generation config 418 | if "vad_parameters" in self.generation_config: 419 | vad_params = self.generation_config["vad_parameters"] 420 | if "threshold" in vad_params: 421 | vad_config.threshold = vad_params["threshold"] 422 | if "neg_threshold" in vad_params: 423 | vad_config.neg_threshold = vad_params["neg_threshold"] 424 | if "min_speech_duration_ms" in vad_params: 425 | vad_config.min_speech_duration_ms = vad_params["min_speech_duration_ms"] 426 | if "max_speech_duration_s" in vad_params: 427 | vad_config.max_speech_duration_s = vad_params["max_speech_duration_s"] 428 | if "min_silence_duration_ms" in vad_params: 429 | vad_config.min_silence_duration_ms = vad_params["min_silence_duration_ms"] 430 | if "speech_pad_ms" in vad_params: 431 | vad_config.speech_pad_ms = vad_params["speech_pad_ms"] 432 | 433 | # Load ONNX VAD configuration from metadata 434 | vad_metadata_path = "models/whisper_vad_metadata.json" 435 | vad_config.onnx_model_path = "models/whisper_vad.onnx" 436 | vad_config.onnx_metadata_path = vad_metadata_path 437 | 438 | # Read model configuration from metadata JSON if it exists 439 | if os.path.exists(vad_metadata_path): 440 | try: 441 | with open(vad_metadata_path, 'r') as f: 442 | metadata = json.load(f) 443 | 444 | # Load model configuration from metadata 445 | vad_config.whisper_model_name = metadata.get("whisper_model_name", "openai/whisper-base") 446 | vad_config.frame_duration_ms = metadata.get("frame_duration_ms", 20) 447 | vad_config.chunk_duration_ms = metadata.get("total_duration_ms", 30000) 448 | 449 | logger.info(_("warnings.loaded_vad_config", path=vad_metadata_path)) 450 | except Exception as e: 451 | logger.warning(_("warnings.failed_load_vad", path=vad_metadata_path, error=e)) 452 | logger.warning(_("warnings.using_default_vad")) 453 | # Fallback to defaults 454 | vad_config.whisper_model_name = "openai/whisper-base" 455 | vad_config.frame_duration_ms = 20 456 | 
vad_config.chunk_duration_ms = 30000 457 | else: 458 | # Use defaults if metadata file doesn't exist 459 | logger.warning(_("warnings.vad_file_not_found", path=vad_metadata_path)) 460 | logger.warning(_("warnings.using_default_vad")) 461 | vad_config.whisper_model_name = "openai/whisper-base" 462 | vad_config.frame_duration_ms = 20 463 | vad_config.chunk_duration_ms = 30000 464 | 465 | # Hardcoded runtime configuration 466 | vad_config.force_cpu = False 467 | vad_config.num_threads = 8 468 | 469 | # Inject VAD with progress callback 470 | inject_vad(model_id=vad_model, config=vad_config, progress_callback=self._vad_progress_callback) 471 | self.vad_injected = True 472 | logger.info(_("info.vad_activated", threshold=vad_config.threshold)) 473 | 474 | def generates(self, base_dirs): 475 | if len(base_dirs) == 0: 476 | logger.warning(_("warnings.provide_directories")) 477 | return 478 | 479 | tasks = self._scan(base_dirs) 480 | if len(tasks) == 0: 481 | logger.info(_("info.no_files_found")) 482 | return 483 | 484 | logger.info(_("tasks.translation", count=len(tasks))) 485 | logger.info(_("info.loading_whisper")) 486 | 487 | try: 488 | model = WhisperModel(self.model_name_or_path, device=self.device, compute_type=self.compute_type) 489 | logger.info(_("info.model_precision").format(precision=self.compute_type, device=self.device)) 490 | 491 | # Setup batched inference if enabled 492 | batched_model = None 493 | batch_size_to_use = self.batch_size 494 | 495 | if self.enable_batching: 496 | try: 497 | batched_model = BatchedInferencePipeline(model=model) 498 | 499 | # Auto-detect batch size if not specified 500 | if batch_size_to_use == 0 and len(tasks) > 0: 501 | # Use the first audio file as sample for testing 502 | batch_size_to_use = self._find_executable_batch_size( 503 | model, 504 | tasks[0].audio_path, 505 | min_batch_size=1, 506 | max_batch_size=self.max_batch_size 507 | ) 508 | 509 | if batch_size_to_use == 0: 510 | logger.warning("Could not find suitable batch size. Falling back to non-batched mode.") 511 | batched_model = None 512 | 513 | if batched_model and batch_size_to_use > 0: 514 | logger.info(f"Using batched inference with batch size: {batch_size_to_use}") 515 | 516 | except Exception as e: 517 | logger.warning(f"Failed to setup batched inference: {str(e)}. Falling back to non-batched mode.") 518 | batched_model = None 519 | 520 | for i, task in enumerate(tasks): 521 | logger.info(_("info.translating", current=i + 1, total=len(tasks), path=task.audio_path)) 522 | 523 | # Use batched or regular inference 524 | if batched_model and batch_size_to_use > 0: 525 | # Use auto-retry with batch size reduction on OOM 526 | # This mimics HuggingFace Accelerate's find_executable_batch_size behavior 527 | try: 528 | _segments, info, actual_batch_size = self._transcribe_with_auto_batch_size( 529 | batched_model, 530 | task.audio_path, 531 | starting_batch_size=batch_size_to_use 532 | ) 533 | # Update batch_size_to_use if it was auto-adjusted 534 | if actual_batch_size < batch_size_to_use: 535 | logger.info(f"Batch size auto-adjusted from {batch_size_to_use} to {actual_batch_size}") 536 | batch_size_to_use = actual_batch_size 537 | except Exception as e: 538 | logger.warning(f"Batched inference failed: {str(e)}. 
Falling back to non-batched mode.") 539 | # Fallback to non-batched 540 | _segments, info = model.transcribe( 541 | task.audio_path, 542 | **self.generation_config, 543 | ) 544 | else: 545 | _segments, info = model.transcribe( 546 | task.audio_path, 547 | **self.generation_config, 548 | ) 549 | 550 | if info.duration == info.duration_after_vad or info.duration_after_vad == 0: 551 | logger.info(_("info.duration", duration=format_duration(info.duration))) 552 | else: 553 | rate = info.duration_after_vad / info.duration 554 | logger.info(_("info.duration_filtered", 555 | original=format_duration(info.duration), 556 | filtered=format_duration(info.duration_after_vad), 557 | percent=format_percentage(rate))) 558 | 559 | segments = [] 560 | for _segment in _segments: 561 | segment = Segment( 562 | start=int(_segment.start*1_000), 563 | end=int(_segment.end*1_000), 564 | text=_segment.text.strip(), 565 | ) 566 | segments.append(segment) 567 | logger.debug(f"[{SubWriter.lrc_timestamp(segment.start)} --> " 568 | f"{SubWriter.lrc_timestamp(segment.end)}] {segment.text}") 569 | 570 | segments = merge_segments(segments) 571 | os.makedirs(os.path.dirname(task.sub_prefix), exist_ok=True) 572 | for sub_suffix in task.sub_formats: 573 | sub_path = f"{task.sub_prefix}.{sub_suffix}" 574 | logger.info(_("info.writing", path=sub_path)) 575 | self.sub_writers[sub_suffix](segments, sub_path) 576 | 577 | finally: 578 | # Clean up VAD injection 579 | if self.vad_injected: 580 | uninject_vad() 581 | logger.info(_("info.vad_deactivated")) 582 | 583 | def _find_executable_batch_size(self, model, sample_audio_path, min_batch_size=1, max_batch_size=64): 584 | """ 585 | Find the maximum executable batch size for batched inference. 586 | Starts from max_batch_size and works down exponentially on OOM. 587 | 588 | Args: 589 | model: WhisperModel instance 590 | sample_audio_path: Path to a sample audio file for testing 591 | min_batch_size: Minimum batch size to try 592 | max_batch_size: Maximum batch size to try 593 | 594 | Returns: 595 | Optimal batch size that fits in memory 596 | """ 597 | if not self.enable_batching: 598 | return 0 599 | 600 | logger.info(_("batch.finding_optimal", min_size=min_batch_size, max_size=max_batch_size)) 601 | 602 | # Start from max and work down on failure (like HuggingFace Accelerate) 603 | current_batch_size = max_batch_size 604 | 605 | while current_batch_size >= min_batch_size: 606 | try: 607 | logger.info(_("batch.testing_size", size=current_batch_size)) 608 | 609 | # Try to create batched pipeline with this batch size 610 | batched_model = BatchedInferencePipeline(model=model) 611 | 612 | # Test transcription with this batch size 613 | # Note: batch_size is passed separately to BatchedInferencePipeline.transcribe() 614 | # It's NOT part of generation_config 615 | segments, info = batched_model.transcribe( 616 | sample_audio_path, 617 | batch_size=current_batch_size, # batch_size is a separate parameter 618 | **self.generation_config # generation_config doesn't include batch_size 619 | ) 620 | 621 | # Force evaluation by converting to list 622 | list(segments) 623 | 624 | # Success! 
This batch size works 625 | logger.info(_("batch.size_successful", size=current_batch_size)) 626 | logger.info(_("batch.optimal_found", size=current_batch_size)) 627 | return current_batch_size 628 | 629 | except RuntimeError as e: 630 | # If OOM, reduce batch size exponentially 631 | error_msg = str(e) 632 | if "out of memory" in error_msg.lower() or "oom" in error_msg.lower(): 633 | logger.warning(_("batch.oom_error", size=current_batch_size)) 634 | else: 635 | logger.warning(_("batch.runtime_error", size=current_batch_size, error=error_msg)) 636 | 637 | # Reduce batch size by half (exponential backoff) 638 | new_batch_size = current_batch_size // 2 639 | 640 | # Ensure we reduce by at least 1 641 | if new_batch_size == current_batch_size: 642 | new_batch_size = current_batch_size - 1 643 | 644 | if new_batch_size < min_batch_size: 645 | logger.error(_("batch.no_suitable_size", min_size=min_batch_size)) 646 | return 0 647 | 648 | logger.info(_("batch.reducing_size", old_size=current_batch_size, new_size=new_batch_size)) 649 | current_batch_size = new_batch_size 650 | 651 | except Exception as e: 652 | logger.warning(_("batch.unexpected_error", size=current_batch_size, error=str(e))) 653 | 654 | # Reduce batch size by half on unexpected errors too 655 | new_batch_size = current_batch_size // 2 656 | if new_batch_size < min_batch_size: 657 | return 0 658 | current_batch_size = new_batch_size 659 | 660 | # Should not reach here 661 | logger.error(_("batch.no_suitable_size", min_size=min_batch_size)) 662 | return 0 663 | 664 | def _transcribe_with_auto_batch_size(self, batched_model, audio_path, starting_batch_size=None): 665 | """ 666 | Transcribe with automatic batch size reduction on OOM. 667 | Similar to HuggingFace Accelerate's find_executable_batch_size decorator. 668 | 669 | This function automatically retries with smaller batch sizes if OOM occurs, 670 | implementing the same behavior as Accelerate's find_executable_batch_size. 671 | 672 | Args: 673 | batched_model: BatchedInferencePipeline instance 674 | audio_path: Path to audio file 675 | starting_batch_size: Initial batch size to try (uses self.batch_size if not specified) 676 | 677 | Returns: 678 | Tuple of (segments, info, actual_batch_size_used) 679 | """ 680 | batch_size = starting_batch_size or self.batch_size or 32 681 | min_batch_size = 1 682 | 683 | while batch_size >= min_batch_size: 684 | try: 685 | logger.debug(_("batch.attempting_transcription", size=batch_size)) 686 | 687 | # Try transcription with current batch size 688 | segments, info = batched_model.transcribe( 689 | audio_path, 690 | batch_size=batch_size, 691 | **self.generation_config 692 | ) 693 | 694 | # Success! 
Return results with the batch size that worked 695 | if batch_size < (starting_batch_size or self.batch_size or 32): 696 | logger.info(_("batch.auto_adjusted", size=batch_size)) 697 | 698 | return segments, info, batch_size 699 | 700 | except RuntimeError as e: 701 | if "out of memory" in str(e).lower() or "oom" in str(e).lower(): 702 | # Reduce batch size by 0.8 (20% reduction, similar to Accelerate's 0.9 but more aggressive) 703 | new_batch_size = int(batch_size * 0.8) 704 | 705 | # Ensure we reduce by at least 1 706 | if new_batch_size == batch_size: 707 | new_batch_size = batch_size - 1 708 | 709 | logger.warning(_("batch.oom_reducing", old_size=batch_size, new_size=new_batch_size)) 710 | 711 | batch_size = new_batch_size 712 | 713 | if batch_size < min_batch_size: 714 | logger.error(_("batch.cannot_run_min", min_size=min_batch_size)) 715 | raise RuntimeError(_("batch.inference_failed", min_size=min_batch_size)) from e 716 | else: 717 | # Not an OOM error, re-raise 718 | raise 719 | 720 | # Should not reach here 721 | raise RuntimeError("Failed to find executable batch size") 722 | 723 | def _scan(self, base_dirs) -> list[InferenceTask]: 724 | tasks: list[InferenceTask] = [] 725 | 726 | def process(base_path, audio_path): 727 | nonlocal tasks 728 | p = Path(audio_path) 729 | suffix = p.suffix.lower().lstrip('.') 730 | 731 | logger.debug(_("debug.processing", path=audio_path)) 732 | logger.debug(_("debug.file_suffix", suffix=suffix)) 733 | logger.debug(_("debug.valid_suffixes", suffixes=self.audio_suffixes)) 734 | 735 | if suffix not in self.audio_suffixes: 736 | logger.debug(_("debug.skipped_suffix", suffix=suffix)) 737 | return 738 | 739 | rel_path = p.relative_to(base_path) 740 | abs_path = Path(os.path.join(self.output_dir or base_path, rel_path)) 741 | sub_formats = [] 742 | 743 | for suffix in self.sub_formats: 744 | sub_path = abs_path.parent / f"{abs_path.stem}.{suffix}" 745 | if sub_path.exists() and not self.overwrite: 746 | logger.debug(_("debug.subtitle_exists", path=sub_path)) 747 | continue 748 | sub_formats.append(suffix) 749 | 750 | if len(sub_formats) == 0: 751 | logger.debug(_("debug.skipped_all_exist")) 752 | return 753 | 754 | logger.debug(_("debug.added_task", formats=sub_formats)) 755 | tasks.append(InferenceTask(audio_path, str(abs_path.parent / abs_path.stem), sub_formats)) 756 | 757 | for base_dir in base_dirs: 758 | # Expand user home directory 759 | base_dir = os.path.expanduser(base_dir) 760 | logger.debug(_("debug.scanning", path=base_dir)) 761 | 762 | parent_dir = os.path.dirname(base_dir) 763 | if os.path.isdir(base_dir): 764 | for root, dirs, files in os.walk(base_dir, topdown=True): 765 | for file in files: 766 | process(parent_dir, os.path.join(root, file)) 767 | else: 768 | process(parent_dir, base_dir) 769 | 770 | logger.info(_("files.found", count=len(tasks))) 771 | return tasks 772 | 773 | 774 | def diagnose_environment(): 775 | """Run comprehensive environment diagnostics for debugging""" 776 | print("=" * 60) 777 | print("ENVIRONMENT DIAGNOSTICS") 778 | print("=" * 60) 779 | 780 | # System info 781 | print("\n1. System Information:") 782 | print(f" Platform: {platform.system()}") 783 | print(f" Architecture: {platform.machine()}") 784 | print(f" Python: {sys.version}") 785 | print(f" Executable: {sys.executable}") 786 | print(f" Frozen: {getattr(sys, 'frozen', False)}") 787 | 788 | if getattr(sys, 'frozen', False): 789 | print(f" Bundle Dir: {getattr(sys, '_MEIPASS', 'Unknown')}") 790 | 791 | # CUDA environment 792 | print("\n2. 
CUDA Environment Variables:") 793 | cuda_vars = ['CUDA_HOME', 'CUDA_PATH', 'CUDA_ROOT', 'CUDNN_HOME', 'LD_LIBRARY_PATH', 'PATH'] 794 | for var in cuda_vars: 795 | value = os.environ.get(var, 'Not set') 796 | if var == 'PATH' and value != 'Not set': 797 | # Just show cuda-related paths 798 | cuda_paths = [p for p in value.split(os.pathsep) if 'cuda' in p.lower() or 'nvidia' in p.lower()] 799 | value = os.pathsep.join(cuda_paths) if cuda_paths else 'No CUDA paths in PATH' 800 | print(f" {var}: {value}") 801 | 802 | # Check for nvidia-smi 803 | print("\n3. NVIDIA GPU Detection:") 804 | try: 805 | result = subprocess.run(['nvidia-smi', '--query-gpu=name,driver_version,cuda_version', '--format=csv,noheader'], 806 | capture_output=True, text=True, timeout=5) 807 | if result.returncode == 0: 808 | print(f" GPU Info: {result.stdout.strip()}") 809 | else: 810 | print(" nvidia-smi failed") 811 | except FileNotFoundError: 812 | print(" nvidia-smi not found in PATH") 813 | except Exception as e: 814 | print(f" Error: {e}") 815 | 816 | 817 | def check_onnxruntime_detailed(): 818 | """Detailed ONNX Runtime check for debugging""" 819 | print("\n" + "=" * 60) 820 | print("ONNX RUNTIME DIAGNOSTICS") 821 | print("=" * 60) 822 | 823 | try: 824 | import onnxruntime as ort 825 | print(f"\n✓ onnxruntime imported successfully") 826 | print(f" Version: {ort.__version__}") 827 | print(f" Location: {ort.__file__}") 828 | 829 | # Check available providers 830 | providers = ort.get_available_providers() 831 | print(f"\n Available providers: {providers}") 832 | 833 | # Check for GPU support 834 | has_cuda = 'CUDAExecutionProvider' in providers 835 | has_tensorrt = 'TensorrtExecutionProvider' in providers 836 | has_directml = 'DmlExecutionProvider' in providers 837 | 838 | print(f"\n GPU Support:") 839 | print(f" CUDA: {'✓ Available' if has_cuda else '✗ Not Available'}") 840 | print(f" TensorRT: {'✓ Available' if has_tensorrt else '✗ Not Available'}") 841 | print(f" DirectML: {'✓ Available' if has_directml else '✗ Not Available'}") 842 | 843 | if not has_cuda and sys.platform != 'darwin': 844 | print("\n ⚠️ CUDA not available. This might be because:") 845 | print(" 1. onnxruntime (CPU) is installed instead of onnxruntime-gpu") 846 | print(" 2. CUDA libraries are missing or not in PATH") 847 | print(" 3. Incompatible CUDA/cuDNN versions") 848 | 849 | # Check bundled libraries if frozen 850 | if getattr(sys, 'frozen', False): 851 | bundle_dir = getattr(sys, '_MEIPASS', '') 852 | print(f"\n Checking bundled libraries in: {bundle_dir}") 853 | 854 | cuda_libs = [] 855 | onnx_libs = [] 856 | 857 | try: 858 | for root, dirs, files in os.walk(bundle_dir): 859 | for file in files: 860 | if any(x in file.lower() for x in ['cuda', 'cudnn', 'cublas', 'cufft']): 861 | cuda_libs.append(file) 862 | elif 'onnx' in file.lower(): 863 | onnx_libs.append(file) 864 | 865 | if cuda_libs: 866 | print(f"\n Found {len(cuda_libs)} CUDA-related libraries:") 867 | for lib in cuda_libs[:10]: 868 | print(f" - {lib}") 869 | if len(cuda_libs) > 10: 870 | print(f" ... and {len(cuda_libs) - 10} more") 871 | else: 872 | print("\n ⚠️ No CUDA libraries found in bundle") 873 | except Exception as e: 874 | print(f" Error scanning bundle: {e}") 875 | 876 | return True 877 | 878 | except ImportError as e: 879 | print(f"\n✗ Failed to import onnxruntime: {e}") 880 | print("\nSuggestions:") 881 | print(" 1. Install onnxruntime-gpu for GPU support") 882 | print(" 2. 
Check if package is bundled correctly in PyInstaller") 883 | return False 884 | except Exception as e: 885 | print(f"\n✗ Error during ONNX Runtime check: {e}") 886 | traceback.print_exc() 887 | return False 888 | 889 | 890 | def test_vad_initialization(): 891 | """Test VAD model initialization for debugging""" 892 | print("\n" + "=" * 60) 893 | print("VAD MODEL TEST") 894 | print("=" * 60) 895 | 896 | try: 897 | from .vad_manager import WhisperVADOnnxWrapper, VadModelManager 898 | print("✓ VAD modules imported successfully") 899 | 900 | # Check for model files 901 | model_paths = [ 902 | 'models/whisper_vad.onnx', 903 | 'models/vad/whisper_vad.onnx', 904 | os.path.join(os.path.dirname(sys.executable), 'models', 'whisper_vad.onnx'), 905 | ] 906 | 907 | # If frozen, also check in bundle directory 908 | if getattr(sys, 'frozen', False): 909 | bundle_dir = getattr(sys, '_MEIPASS', '') 910 | model_paths.extend([ 911 | os.path.join(bundle_dir, 'models', 'whisper_vad.onnx'), 912 | os.path.join(bundle_dir, 'whisper_vad.onnx'), 913 | ]) 914 | 915 | model_path = None 916 | print("\nSearching for VAD model:") 917 | for path in model_paths: 918 | exists = os.path.exists(path) 919 | print(f" {path}: {'Found' if exists else 'Not found'}") 920 | if exists and model_path is None: 921 | model_path = path 922 | 923 | if model_path: 924 | print(f"\n✓ Using model: {model_path}") 925 | 926 | # Try to initialize 927 | print("\nTesting VAD initialization (GPU if available):") 928 | try: 929 | wrapper = WhisperVADOnnxWrapper( 930 | model_path=model_path, 931 | force_cpu=False, 932 | num_threads=1 933 | ) 934 | print(f" ✓ Device: {wrapper.device}") 935 | print(f" ✓ Providers: {wrapper.session.get_providers()}") 936 | except Exception as e: 937 | print(f" ✗ Error: {e}") 938 | 939 | # Test with forced CPU for comparison 940 | print("\nTesting VAD initialization (Force CPU):") 941 | try: 942 | wrapper_cpu = WhisperVADOnnxWrapper( 943 | model_path=model_path, 944 | force_cpu=True, 945 | num_threads=1 946 | ) 947 | print(f" ✓ Device: {wrapper_cpu.device}") 948 | except Exception as e: 949 | print(f" ✗ Error: {e}") 950 | else: 951 | print("\n✗ No VAD model file found") 952 | print(" Download the model using download_models.py") 953 | 954 | except ImportError as e: 955 | print(f"✗ Failed to import VAD modules: {e}") 956 | except Exception as e: 957 | print(f"✗ Error during VAD test: {e}") 958 | traceback.print_exc() 959 | 960 | 961 | def launch_debug_console(): 962 | """Launch interactive Python console for debugging""" 963 | print("\n" + "=" * 60) 964 | print("INTERACTIVE DEBUG CONSOLE") 965 | print("=" * 60) 966 | print("\nYou now have access to an interactive Python console.") 967 | print("\nAvailable commands:") 968 | print(" diagnose() - Run environment diagnostics") 969 | print(" check_onnx() - Check ONNX Runtime status") 970 | print(" test_vad() - Test VAD initialization") 971 | print(" import X - Try importing any module") 972 | print(" exit() or Ctrl+D - Exit console and continue") 973 | print("\nUseful variables:") 974 | print(" sys.path - Python module search paths") 975 | print(" os.environ - Environment variables") 976 | print(" sys.frozen - Check if running from PyInstaller") 977 | print("=" * 60 + "\n") 978 | 979 | # Create namespace with useful functions 980 | namespace = { 981 | 'diagnose': diagnose_environment, 982 | 'check_onnx': check_onnxruntime_detailed, 983 | 'test_vad': test_vad_initialization, 984 | 'sys': sys, 985 | 'os': os, 986 | 'platform': platform, 987 | } 988 | 989 | # Launch interactive 
console 990 | code.InteractiveConsole(locals=namespace).interact(banner="") 991 | 992 | 993 | def main(): 994 | """Main entry point for the script""" 995 | if getattr(sys, 'frozen', False): 996 | os.chdir(os.path.dirname(sys.executable)) 997 | else: 998 | # When run as a module, don't change directory 999 | pass 1000 | 1001 | args = parse_arguments() 1002 | 1003 | # Display open-source notice 1004 | print("=" * 70) 1005 | print("⚠️ 重要声明 / IMPORTANT NOTICE") 1006 | print("=" * 70) 1007 | print("本软件开源于: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice") 1008 | print("开发团队: AI汉化组 (https://t.me/transWithAI)") 1009 | print("任何第三方非免费下载均为智商税") 1010 | print("=" * 70) 1011 | print() 1012 | 1013 | # Check if console mode requested 1014 | if args.console: 1015 | # Run diagnostics first 1016 | diagnose_environment() 1017 | check_onnxruntime_detailed() 1018 | test_vad_initialization() 1019 | 1020 | # Launch interactive console 1021 | launch_debug_console() 1022 | 1023 | # After console exits, ask if user wants to continue with normal operation 1024 | print("\nDebug console exited.") 1025 | try: 1026 | response = input("Continue with normal inference? (y/n): ").strip().lower() 1027 | if response != 'y': 1028 | print("Exiting...") 1029 | sys.exit(0) 1030 | except (KeyboardInterrupt, EOFError): 1031 | print("\nExiting...") 1032 | sys.exit(0) 1033 | 1034 | # Normal operation 1035 | logger.setLevel(args.log_level) 1036 | 1037 | # Add file logging to latest.log in current working directory 1038 | # This helps users report issues by providing a log file 1039 | log_file_path = os.path.join(os.getcwd(), 'latest.log') 1040 | file_handler = logging.FileHandler(log_file_path, mode='w', encoding='utf-8') 1041 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) 1042 | file_handler.setLevel(args.log_level) 1043 | 1044 | # Add file handler to the module logger 1045 | logger.addHandler(file_handler) 1046 | 1047 | logger.info(_("info.logging_to_file").format(path=log_file_path)) 1048 | logger.info(_("info.program_version").format(version="v1.3")) 1049 | logger.info(_("info.python_version").format(version=sys.version)) 1050 | logger.info(_("info.platform").format(platform=platform.platform())) 1051 | logger.info(_("info.arguments").format(args=vars(args))) 1052 | 1053 | if len(args.base_dirs) == 0: 1054 | logger.warning(_("warnings.drag_files")) 1055 | sys.exit(1) 1056 | 1057 | inference = Inference(args) 1058 | inference.generates(args.base_dirs) 1059 | sys.exit(0) 1060 | 1061 | 1062 | if __name__ == '__main__': 1063 | # When run directly as a script 1064 | import os 1065 | os.chdir(os.path.dirname(__file__)) 1066 | main() 1067 | --------------------------------------------------------------------------------
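For reference, a minimal sketch of the OOM-driven retry pattern that Inference._transcribe_with_auto_batch_size and _find_executable_batch_size implement above, condensed to the faster_whisper calls already used in infer.py (WhisperModel, BatchedInferencePipeline, and batch_size passed as its own transcribe() argument). The model name "large-v2", the file sample.wav, and all numeric values are illustrative placeholders, not this project's defaults:

# Sketch only: auto-retrying batched transcription on out-of-memory errors.
# Placeholders: model name, audio path, batch sizes, VAD thresholds.
from faster_whisper import WhisperModel, BatchedInferencePipeline

generation_config = {
    "language": "ja",
    "task": "translate",
    "vad_filter": True,
    # Same keys that _load_generation_config() forwards as vad_parameters.
    "vad_parameters": {"threshold": 0.5, "min_silence_duration_ms": 500},
}

def transcribe_with_retry(audio_path, starting_batch_size=8, min_batch_size=1):
    model = WhisperModel("large-v2", device="cuda", compute_type="float16")
    batched = BatchedInferencePipeline(model=model)
    batch_size = starting_batch_size
    while batch_size >= min_batch_size:
        try:
            # batch_size is a separate transcribe() argument; it is never
            # part of generation_config.
            segments, info = batched.transcribe(
                audio_path, batch_size=batch_size, **generation_config
            )
            return list(segments), info, batch_size  # list() forces the lazy generator
        except RuntimeError as exc:
            if "out of memory" not in str(exc).lower():
                raise
            # Shrink to roughly 80% of the current size (and by at least 1), then retry.
            batch_size = min(batch_size - 1, int(batch_size * 0.8))
    raise RuntimeError("No executable batch size found")

if __name__ == "__main__":
    segments, info, used_batch_size = transcribe_with_retry("sample.wav")
    print(used_batch_size, info.duration, len(segments))

Forcing the segment generator with list() inside the try block matters: transcription is evaluated lazily, so an out-of-memory error may only surface while segments are being consumed.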
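On the VAD side, _setup_vad_injection() reads models/whisper_vad_metadata.json and falls back to openai/whisper-base, 20 ms frames, and 30000 ms chunks when that file is missing or unreadable. The sketch below only illustrates the metadata schema implied by those reads, using the documented fallback values; what download_models.py actually ships may differ:

# Sketch: write a metadata file in the shape _setup_vad_injection() expects.
# Keys and values mirror the fallback defaults in the code above.
import json
import os

metadata = {
    "whisper_model_name": "openai/whisper-base",
    "frame_duration_ms": 20,
    "total_duration_ms": 30000,  # read into vad_config.chunk_duration_ms
}

os.makedirs("models", exist_ok=True)
with open("models/whisper_vad_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)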