├── .python-version
├── transwithai.ico
├── 运行(CPU).bat
├── 运行(GPU).bat
├── 运行(GPU,低显存模式).bat
├── 运行(GPU)(输出到当前文件夹).bat
├── 运行(GPU,高显存加速模式).bat
├── infer.py
├── src
│   └── faster_whisper_transwithai_chickenrice
│       ├── __init__.py
│       ├── injection.py
│       ├── i18n_modern.py
│       ├── vad_manager.py
│       └── infer.py
├── environment-cuda128.yml
├── environment-cuda122.yml
├── LICENSE
├── environment-cuda118.yml
├── generation_config.json5
├── runtime_hook.py
├── 使用说明.txt
├── patches
│   └── batch-transcribe.patch
├── README.md
├── .gitignore
├── locales
│   ├── zh-CN
│   │   └── messages.json
│   └── en-US
│       └── messages.json
├── RELEASE_NOTES_CN.md
├── project.spec
├── download_models.py
└── .github
    └── workflows
        └── build-release-conda.yml

/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 |
--------------------------------------------------------------------------------
/transwithai.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice/HEAD/transwithai.ico
--------------------------------------------------------------------------------
/运行(CPU).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cpu" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cuda" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU,低显存模式).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cuda" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU)(输出到当前文件夹).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --output_dir="输出" --device="cuda" %*
6 | pause
7 |
--------------------------------------------------------------------------------
/运行(GPU,高显存加速模式).bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 | set cpath=%~dp0
4 | set cpath=%cpath:~0,-1%
5 | echo ========================================
6 | echo GPU批处理加速模式 (Batch Inference)
7 | echo 自动检测最佳批处理大小以提高速度
8 | echo 需要更多显存 (建议8GB+)
9 | echo ========================================
10 | "%cpath%\infer.exe" --audio_suffixes="mp3,wav,flac,m4a,aac,ogg,wma,mp4,mkv,avi,mov,webm,flv,wmv" --sub_formats="srt,vtt,lrc" --device="cuda" --enable_batching --max_batch_size=8 %*
11 | pause
--------------------------------------------------------------------------------
/infer.py:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Standalone inference script with custom VAD injection 4 | This can be run directly from the project root without installation 5 | """ 6 | 7 | import sys 8 | import os 9 | 10 | # Add src to path for local development 11 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) 12 | 13 | from faster_whisper_transwithai_chickenrice.infer import main 14 | 15 | if __name__ == '__main__': 16 | if getattr(sys, 'frozen', False): 17 | os.chdir(os.path.dirname(sys.executable)) 18 | else: 19 | os.chdir(os.path.dirname(__file__)) 20 | main() -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | faster_whisper_transwithai_chickenrice - Custom VAD injection for faster_whisper 3 | """ 4 | 5 | from .injection import ( 6 | inject_vad, 7 | uninject_vad, 8 | VadInjectionContext, 9 | with_vad_injection, 10 | auto_inject_vad, 11 | VadOptionsCompat, 12 | is_injection_active, 13 | ) 14 | from .vad_manager import VadModelManager, WhisperVadModel 15 | 16 | __version__ = "0.1.0" 17 | 18 | __all__ = [ 19 | "inject_vad", 20 | "uninject_vad", 21 | "VadInjectionContext", 22 | "with_vad_injection", 23 | "auto_inject_vad", 24 | "VadOptionsCompat", 25 | "is_injection_active", 26 | "VadModelManager", 27 | "WhisperVadModel", 28 | ] -------------------------------------------------------------------------------- /environment-cuda128.yml: -------------------------------------------------------------------------------- 1 | # Conda environment for CUDA 12.8 2 | name: faster-whisper-cu128 3 | channels: 4 | - conda-forge 5 | - defaults 6 | 7 | dependencies: 8 | # Python version 9 | - python=3.10 10 | 11 | # Core dependencies 12 | - librosa>=0.10.0 13 | - ffmpeg>=8.0 14 | - pip 15 | 16 | # CUDA 12.8 toolkit with cuDNN 9 17 | - cuda-runtime=12.8.* 18 | - cudnn=9.10.* 19 | 20 | # Pip dependencies (some packages not available in conda) 21 | - pip: 22 | # CTranslate2 for CUDA 12 23 | - ctranslate2>=4.5.0 24 | 25 | # faster-whisper and related 26 | - faster-whisper>=1.0.0 27 | 28 | # Other ML dependencies 29 | - transformers>=4.30.0 30 | 31 | # Utilities 32 | - pyjson5>=1.6.0 33 | - markupsafe==2.1.5 34 | - backports.functools-lru-cache # Fix for PyInstaller ModuleNotFoundError 35 | 36 | # Build tools 37 | - pyinstaller>=6.0.0 38 | - setuptools>=65.0.0 39 | - wheel>=0.38.0 40 | - build>=0.10.0 41 | - requests>=2.28.0 42 | 43 | # Test dependencies 44 | - pytest>=7.0.0 45 | - pytest-cov>=4.0.0 46 | -------------------------------------------------------------------------------- /environment-cuda122.yml: -------------------------------------------------------------------------------- 1 | # Conda environment for CUDA 12.2 with cuDNN 8 2 | name: faster-whisper-cu122 3 | channels: 4 | - conda-forge 5 | - defaults 6 | 7 | dependencies: 8 | # Python version 9 | - python=3.10 10 | 11 | # Core dependencies 12 | - librosa>=0.10.0 13 | - ffmpeg>=8.0 14 | - pip 15 | 16 | # CUDA 12.2 toolkit with cuDNN 9 17 | - cuda-runtime=12.2.* 18 | - cudnn=9.2.* 19 | 20 | # Pip dependencies (some packages not available in conda) 21 | - pip: 22 | # CTranslate2 for CUDA 12 23 | - ctranslate2>=4.5.0 24 | 25 | # faster-whisper and related 26 | - faster-whisper>=1.0.0 27 | 28 | # Other ML dependencies 29 | - transformers>=4.30.0 30 | 31 | # Utilities 32 | - 
pyjson5>=1.6.0 33 | - markupsafe==2.1.5 34 | - backports.functools-lru-cache # Fix for PyInstaller ModuleNotFoundError 35 | 36 | # Build tools 37 | - pyinstaller>=6.0.0 38 | - setuptools>=65.0.0 39 | - wheel>=0.38.0 40 | - build>=0.10.0 41 | - requests>=2.28.0 42 | 43 | # Test dependencies 44 | - pytest>=7.0.0 45 | - pytest-cov>=4.0.0 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 TransWithAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /environment-cuda118.yml: -------------------------------------------------------------------------------- 1 | # Conda environment for CUDA 11.8 using nvidia channel packages 2 | name: faster-whisper-cu118 3 | channels: 4 | - conda-forge 5 | - defaults 6 | 7 | dependencies: 8 | # Python version 9 | - python=3.10 10 | 11 | # Core dependencies 12 | - librosa>=0.10.0 13 | - ffmpeg<6 # av pip installation issue workaround 14 | - pip 15 | 16 | # CUDA 11.8 toolkit with cuDNN 8 17 | - cudatoolkit=11.8.* 18 | - cudnn=8.* 19 | 20 | # Pip dependencies (some packages not available in conda) 21 | - pip: 22 | # CTranslate2 for CUDA 11, later forced reinstall to 3.24.0 in CI 23 | - ctranslate2 24 | 25 | # onnxruntime compatibility workaround 26 | - numpy==1.26.4 27 | 28 | # faster-whisper and related 29 | - faster-whisper>=1.0.0 30 | 31 | # Other ML dependencies 32 | - transformers>=4.30.0 33 | 34 | # Utilities 35 | - pyjson5>=1.6.0 36 | - markupsafe==2.1.5 37 | - backports.functools-lru-cache # Fix for PyInstaller ModuleNotFoundError 38 | 39 | # Build tools 40 | - pyinstaller>=6.0.0 41 | - setuptools>=65.0.0 42 | - wheel>=0.38.0 43 | - build>=0.10.0 44 | - requests>=2.28.0 45 | 46 | # Test dependencies 47 | - pytest>=7.0.0 48 | - pytest-cov>=4.0.0 49 | -------------------------------------------------------------------------------- /generation_config.json5: -------------------------------------------------------------------------------- 1 | { 2 | // 可以在这里控制各种生成字幕的参数, 下面这个链接里的参数都可以控制 3 | // You can control various subtitle generation parameters here, all parameters in the link below can be controlled 4 | // https://github.com/SYSTRAN/faster-whisper/blob/dea24cbcc6cbef23ff599a63be0bbb647a0b23d6/faster_whisper/transcribe.py#L733 5 | 6 | // VAD 参数设置 (使用改进的 whisper_vad 模型) 7 | // 
VAD parameters (using improved whisper_vad model) 8 | "vad_parameters": { 9 | // VAD检测阈值 (0.3-0.7, 推荐0.5) 10 | // 太大会导致漏翻, 太小可能会导致时间轴不准或文本质量下降(幻听) 11 | // VAD detection threshold (0.3-0.7, recommended 0.5) 12 | // Too high will cause missed translations, too low may cause timeline inaccuracy or text quality degradation (hallucinations) 13 | "threshold": 0.5, 14 | 15 | // 最小语音持续时间 (毫秒) 16 | // Minimum speech duration (milliseconds) 17 | "min_speech_duration_ms": 300, 18 | 19 | // 最小静音持续时间 (毫秒) 20 | // Minimum silence duration (milliseconds) 21 | "min_silence_duration_ms": 100, 22 | 23 | // 语音前后填充时间 (毫秒) 24 | // Speech padding before and after (milliseconds) 25 | "speech_pad_ms": 200, 26 | }, 27 | 28 | // 避免时间轴向前偏移过长的问题 29 | // Avoid excessive forward shift of timeline 30 | "max_initial_timestamp": 30, 31 | 32 | "repetition_penalty": 1.1, 33 | 34 | } -------------------------------------------------------------------------------- /runtime_hook.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Runtime hook for PyInstaller to set environment variables before the application starts. 4 | This resolves OpenMP conflicts when multiple libraries bring their own OpenMP implementations. 5 | """ 6 | 7 | import os 8 | import sys 9 | import multiprocessing 10 | 11 | # Set KMP_DUPLICATE_LIB_OK to allow multiple OpenMP libraries 12 | # This is needed because different packages (numpy, scipy, ctranslate2, onnxruntime) 13 | # may bring different OpenMP implementations (libiomp5md.dll vs mk2iomp5md.dll) 14 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' 15 | 16 | # Suppress transformers advisory warnings 17 | os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' 18 | 19 | # Configure ONNX Runtime to use half of available CPU cores for better performance 20 | # This prevents oversubscription and resource contention 21 | cpu_count = multiprocessing.cpu_count() 22 | optimal_threads = max(1, cpu_count // 2) 23 | 24 | # Set ONNX Runtime environment variables for CPU execution 25 | os.environ['OMP_NUM_THREADS'] = str(optimal_threads) 26 | os.environ['MKL_NUM_THREADS'] = str(optimal_threads) 27 | 28 | print(f"Runtime hook: Set KMP_DUPLICATE_LIB_OK=TRUE to resolve OpenMP conflicts") 29 | print(f"Runtime hook: Set TRANSFORMERS_NO_ADVISORY_WARNINGS=1 to suppress advisory warnings") 30 | print(f"Runtime hook: Configured ONNX Runtime to use {optimal_threads} threads (half of {cpu_count} available CPUs)") -------------------------------------------------------------------------------- /使用说明.txt: -------------------------------------------------------------------------------- 1 | ======================================== 2 | ⚠️ 重要声明 3 | ======================================== 4 | 本软件开源于: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 5 | 开发团队: AI汉化组 (https://t.me/transWithAI) 6 | ======================================== 7 | 8 | 基本用法: 9 | 10 | 将需要转录/翻译的音频或视频文件(或包含它们的文件夹)拖放到相应的批处理文件上运行。 11 | 12 | === 选择运行模式 === 13 | 14 | CPU模式: 15 | - 拖放到 "运行(CPU).bat" - 使用CPU进行处理 16 | 17 | GPU模式(仅限NVIDIA显卡): 18 | - 拖放到 "运行(GPU).bat" - 显存≥6GB时使用 19 | - 拖放到 "运行(GPU,低显存模式).bat" - 显存4GB时使用 20 | - 建议先更新显卡驱动到最新版本 21 | 22 | 视频专用模式: 23 | - 拖放到 "运行(翻译视频)(CPU).bat" - 使用CPU处理视频 24 | - 拖放到 "运行(翻译视频)(GPU).bat" - 使用GPU处理视频 25 | - 拖放到 "运行(翻译视频)(GPU,低显存模式).bat" - 低显存GPU处理视频 26 | 27 | 输出到指定文件夹: 28 | - 拖放到 "运行(GPU)(输出到当前文件夹).bat" - 字幕输出到"输出"文件夹 29 | 30 | === 支持的格式 === 31 | 32 | 音频格式: mp3, wav, flac, m4a, aac, ogg, wma 33 | 视频格式: mp4, mkv, avi, mov, webm, flv, wmv 
34 | 35 | 输出格式: 36 | - LRC (歌词格式,适合音乐播放器) 37 | - SRT (常用字幕格式,适合视频播放器) 38 | - VTT (WebVTT格式,适合网页视频) 39 | 40 | 所有批处理文件默认会生成这三种格式的字幕。如果字幕文件已存在,将自动跳过。 41 | 42 | --- 43 | 44 | 调整参数: 45 | 46 | 1. 基本参数调整: 47 | 编辑批处理文件,找到以 "%cpath%\infer.exe" 开头的行,在其后添加参数。 48 | 49 | 示例(添加覆盖模式): 50 | 添加前: "%cpath%\infer.exe" --device="cuda" %* 51 | 添加后: "%cpath%\infer.exe" --overwrite --device="cuda" %* 52 | 53 | 常用参数: 54 | --overwrite : 覆盖已存在的字幕文件 55 | --output_dir="路径" : 指定输出文件夹(默认输出到源文件所在文件夹) 56 | --audio_suffixes="mp3,wav" : 自定义处理的文件格式 57 | --sub_formats="srt,vtt,lrc" : 自定义输出格式 58 | --log_level="INFO" : 减少控制台输出(默认为DEBUG) 59 | 60 | 2. 生成参数调整(高级): 61 | 编辑 generation_config.json5 文件调整转录参数。 62 | 参数详情见下方相关项目链接。 63 | 64 | 注意:通常不需要调整生成参数。如遇到以下情况可尝试调整: 65 | - 声音过小导致漏翻 66 | - 时间轴对不上 67 | - 出现幻听 68 | 69 | --- 70 | 71 | 故障排除: 72 | 73 | 1. GPU模式无法运行: 74 | - 确认是否为NVIDIA显卡 75 | - 更新显卡驱动到最新版本 76 | - 检查CUDA是否正确安装 77 | 78 | 2. 字幕未生成: 79 | - 检查文件格式是否支持 80 | - 查看控制台是否有错误信息 81 | - 尝试使用 --overwrite 参数重新生成 82 | 83 | 3. 内存不足: 84 | - 使用低显存模式 85 | - 尝试CPU模式 86 | - 处理较小的文件或分段处理 87 | 88 | --- 89 | 90 | 相关项目: 91 | 92 | - Faster Whisper: https://github.com/SYSTRAN/faster-whisper 93 | - 海南鸡模型 (日文转中文优化): https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st 94 | - 音声优化 VAD 模型: https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx 95 | - OpenAI Whisper: https://github.com/openai/whisper 96 | - 参数详情: https://github.com/SYSTRAN/faster-whisper/blob/dea24cbcc6cbef23ff599a63be0bbb647a0b23d6/faster_whisper/transcribe.py#L733 97 | 98 | 致谢: 99 | - 基于 SYSTRAN/faster-whisper 开发 100 | - 使用 chickenrice0721 日文转中文优化模型(5000小时音频数据训练) 101 | - 使用 TransWithAI 音声优化 VAD 模型 (Whisper-Vad-EncDec-ASMR-onnx) 102 | - 感谢某匿名群友的算力和技术支持 103 | -------------------------------------------------------------------------------- /patches/batch-transcribe.patch: -------------------------------------------------------------------------------- 1 | --- a/faster_whisper/transcribe.py 2 | +++ b/faster_whisper/transcribe.py 3 | @@ -219,6 +219,10 @@ 4 | for i, language_token in enumerate(language_tokens): 5 | prompts[i][language_token_index] = language_token 6 | 7 | + max_initial_timestamp_index = int( 8 | + round(options.max_initial_timestamp / self.model.time_precision) 9 | + ) 10 | + 11 | results = self.model.model.generate( 12 | encoder_output, 13 | prompts, 14 | @@ -228,6 +232,7 @@ 15 | max_length=max_length, 16 | suppress_blank=options.suppress_blank, 17 | suppress_tokens=options.suppress_tokens, 18 | + max_initial_timestamp_index=max_initial_timestamp_index, 19 | return_scores=True, 20 | return_no_speech_prob=True, 21 | sampling_temperature=options.temperatures[0], 22 | @@ -280,7 +285,7 @@ 23 | prefix: Optional[str] = None, 24 | suppress_blank: bool = True, 25 | suppress_tokens: Optional[List[int]] = [-1], 26 | - without_timestamps: bool = True, 27 | + without_timestamps: bool = False, 28 | max_initial_timestamp: float = 1.0, 29 | word_timestamps: bool = False, 30 | prepend_punctuations: str = "\"'"¿([{-", 31 | @@ -321,6 +326,7 @@ 32 | suppress_tokens: List of token IDs to suppress. -1 will suppress a default set 33 | of symbols as defined in `tokenizer.non_speech_tokens()`. 34 | without_timestamps: Only sample text tokens. 35 | + max_initial_timestamp: The initial timestamp cannot be later than this. 36 | word_timestamps: Extract word-level timestamps using the cross-attention pattern 37 | and dynamic time warping, and include the timestamps for each word in each segment. 38 | Set as False. 
39 | @@ -363,7 +369,6 @@ 40 | prompt_reset_on_temperature: Resets prompt if temperature is above this value. 41 | Arg has effect only if condition_on_previous_text is True. Set at 0.5 42 | prefix: Optional text to provide as a prefix at the beginning of each window. 43 | - max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0. 44 | hallucination_silence_threshold: Optional[float] 45 | When word_timestamps is True, skip silent periods longer than this threshold 46 | (in seconds) when a possible hallucination is detected. set as None. 47 | @@ -549,7 +554,7 @@ 48 | prompt_reset_on_temperature=0.5, 49 | multilingual=multilingual, 50 | without_timestamps=without_timestamps, 51 | - max_initial_timestamp=0.0, 52 | + max_initial_timestamp=max_initial_timestamp, 53 | ) 54 | 55 | info = TranscriptionInfo( -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎙️ Faster Whisper TransWithAI ChickenRice 2 | 3 | [![GitHub Release](https://img.shields.io/github/v/release/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice)](https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice/releases) 4 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) 5 | 6 | 高性能音视频转录和翻译工具 - 基于 Faster Whisper 和音声优化 VAD 的日文转中文优化版本 7 | 8 | High-performance audio/video transcription and translation tool - Japanese-to-Chinese optimized version based on Faster Whisper and voice-optimized VAD 9 | 10 | ## ⚠️ 重要声明 / Important Notice 11 | 12 | > **本软件为开源软件 / This software is open source** 13 | > 14 | > 🔗 **开源地址 / Repository**: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 15 | > 16 | > 👥 **开发团队 / Development Team**: AI汉化组 (https://t.me/transWithAI) 17 | > 18 | > 本软件完全免费开源 / This software is completely free and open source 19 | 20 | ## 🙏 致谢 / Acknowledgments 21 | 22 | - 🚀 基于 [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) 开发 23 | - 🐔 使用 [chickenrice0721/whisper-large-v2-translate-zh-v0.2-st](https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st) 日文转中文优化模型 24 | - 🔊 使用 [TransWithAI/Whisper-Vad-EncDec-ASMR-onnx](https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx) 音声优化 VAD 模型 25 | - 💪 **感谢某匿名群友的算力和技术支持** 26 | 27 | ## ✨ 功能特性 / Features 28 | 29 | - 🎯 **高精度日文转中文翻译**: 基于5000小时音频数据训练的"海南鸡v2"日文转中文优化模型 30 | - 🚀 **GPU加速**: 支持CUDA 11.8/12.2/12.8,充分利用NVIDIA显卡性能 31 | - 📝 **多格式输出**: 支持SRT、VTT、LRC等多种字幕格式 32 | - 🎬 **音视频支持**: 支持常见音频(mp3/wav/flac等)和视频格式(mp4/mkv/avi等) 33 | - 💾 **智能缓存**: 自动跳过已处理文件,提高批量处理效率 34 | - 🔧 **灵活配置**: 可自定义转录参数,满足不同场景需求 35 | 36 | ## 📦 版本说明 / Package Variants 37 | 38 | ### 基础版 (Base Package) - 约 2.2GB 39 | - ✅ 所有 GPU 依赖项 40 | - ✅ 音声优化 VAD(语音活动检测)模型 41 | - ❌ 不含 Whisper 模型(需自行下载) 42 | 43 | ### 海南鸡版 (ChickenRice Edition) - 约 4.4GB 44 | - ✅ 所有 GPU 依赖项 45 | - ✅ 音声优化 VAD(语音活动检测)模型 46 | - ✅ **"海南鸡v2 5000小时"** 日文转中文优化模型(开箱即用) 47 | 48 | ## 🚀 快速开始 / Quick Start 49 | 50 | ### 1. 选择适合的CUDA版本 / Choose CUDA Version 51 | 52 | 运行 `nvidia-smi` 查看您的CUDA版本: 53 | 54 | | 显卡系列 | 推荐 CUDA 版本 | 55 | |---------|--------------| 56 | | GTX 10/16系列 | CUDA 11.8 | 57 | | RTX 20/30系列 | CUDA 11.8 或 12.2 | 58 | | RTX 40系列 | CUDA 12.2 或 12.8 | 59 | | RTX 50系列 | **必须使用 CUDA 12.8** | 60 | 61 | ### 2. 下载对应版本 / Download 62 | 63 | 从 [Releases](https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice/releases) 页面下载对应版本 64 | 65 | ### 3. 
使用方法 / Usage 66 | 67 | 将音视频文件拖放到相应的批处理文件: 68 | 69 | ```bash 70 | # GPU模式(推荐,显存≥6GB) 71 | 运行(GPU).bat 72 | 73 | # GPU低显存模式(显存4GB) 74 | 运行(GPU,低显存模式).bat 75 | 76 | # CPU模式(无显卡用户) 77 | 运行(CPU).bat 78 | 79 | # 视频专用模式 80 | 运行(翻译视频)(GPU).bat 81 | ``` 82 | 83 | ## 📖 详细文档 / Documentation 84 | 85 | - 📝 [使用说明](使用说明.txt) - 详细的使用指南和参数配置 86 | - 📋 [发行说明](RELEASE_NOTES_CN.md) - 版本更新日志和选择指南 87 | - ⚙️ [生成配置](generation_config.json5) - 转录参数配置文件 88 | 89 | ## 🛠️ 高级配置 / Advanced Configuration 90 | 91 | ### 命令行参数 92 | 93 | 编辑批处理文件,在 `infer.exe` 后添加参数: 94 | 95 | ```batch 96 | # 覆盖已存在的字幕文件 97 | --overwrite 98 | 99 | # 指定输出文件夹 100 | --output_dir="路径" 101 | 102 | # 自定义文件格式 103 | --audio_suffixes="mp3,wav" 104 | --sub_formats="srt,vtt,lrc" 105 | 106 | # 调整日志级别 107 | --log_level="INFO" 108 | ``` 109 | 110 | ### 转录参数调整 111 | 112 | 编辑 `generation_config.json5` 文件调整转录参数。 113 | 114 | 参数详情请参考 [Faster Whisper 文档](https://github.com/SYSTRAN/faster-whisper/blob/dea24cbcc6cbef23ff599a63be0bbb647a0b23d6/faster_whisper/transcribe.py#L733) 115 | 116 | ## 🔗 相关链接 / Links 117 | 118 | - **Faster Whisper**: https://github.com/SYSTRAN/faster-whisper 119 | - **海南鸡模型**: https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st 120 | - **音声优化 VAD 模型**: https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx 121 | - **OpenAI Whisper**: https://github.com/openai/whisper 122 | - **AI汉化组**: https://t.me/transWithAI 123 | 124 | ## 💡 常见问题 / FAQ 125 | 126 | **Q: GPU模式无法运行?** 127 | A: 确认是否为NVIDIA显卡,更新显卡驱动到最新版本 128 | 129 | **Q: 字幕未生成?** 130 | A: 检查文件格式是否支持,查看控制台错误信息,尝试使用 `--overwrite` 参数 131 | 132 | **Q: 内存/显存不足?** 133 | A: 使用低显存模式或切换到CPU模式 134 | 135 | **Q: 如何选择CUDA版本?** 136 | A: 运行 `nvidia-smi` 查看CUDA Version,参考[发行说明](RELEASE_NOTES_CN.md)中的兼容性表 137 | 138 | ## 📞 技术支持 / Support 139 | 140 | 如遇到问题,请: 141 | 1. 查看[使用说明](使用说明.txt)和[发行说明](RELEASE_NOTES_CN.md) 142 | 2. 检查显卡驱动是否为最新版本 143 | 3. 确认选择了正确的CUDA版本 144 | 4. 提交Issue到项目仓库 145 | 146 | ## 📄 许可证 / License 147 | 148 | 本项目采用 MIT 许可证 - 详见 [LICENSE](LICENSE) 文件 149 | 150 | --- 151 | 152 | *本工具基于 Faster Whisper 开发,海南鸡模型经过5000小时音频数据优化训练,专门针对日文转中文翻译场景。* 153 | *由AI汉化组开源维护,永久免费。* 154 | 155 | **再次感谢某匿名群友的算力和技术支持!** 156 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | !project.spec # Keep our custom spec file 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | # (Using JSON-based i18n now, no compiled files) 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | #.idea/ 164 | 165 | ### Python Patch ### 166 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 167 | poetry.toml 168 | 169 | # ruff 170 | .ruff_cache/ 171 | 172 | # LSP config files 173 | pyrightconfig.json 174 | 175 | # End of https://www.toptal.com/developers/gitignore/api/python 176 | 177 | # Model files (downloaded separately) 178 | models/*.bin 179 | models/*.onnx 180 | models/*.pt 181 | models/*.pth 182 | models/*.safetensors 183 | # Keep metadata and config files 184 | !models/*.json 185 | !models/*.json5 186 | !models/*.txt 187 | !models/*.yaml 188 | !models/*.yml 189 | 190 | # UV package manager 191 | .uv/ 192 | uv.lock 193 | 194 | # OS specific 195 | .DS_Store 196 | Thumbs.db 197 | desktop.ini 198 | 199 | models/ -------------------------------------------------------------------------------- /locales/zh-CN/messages.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": { 3 | "name": "Whisper转录增强版", 4 | "description": "基于自定义VAD注入的Whisper转录" 5 | }, 6 | 7 | "args": { 8 | "model_path": "Whisper模型路径", 9 | "device": "运行模型的设备 (cpu, cuda, auto)", 10 | "compute_type": "模型计算类型", 11 | "overwrite": "覆盖现有字幕文件", 12 | "audio_extensions": "要处理的音频文件扩展名列表(逗号分隔)", 13 | "subtitle_formats": "字幕格式列表(逗号分隔):lrc, srt, vtt, txt", 14 | "output_dir": "字幕文件输出目录", 15 | "config_file": "生成配置文件路径", 16 | "log_level": "日志级别", 17 | "vad_threshold": "覆盖VAD阈值", 18 | "min_speech_duration": "覆盖最小语音持续时间(毫秒)", 19 | "min_silence_duration": "覆盖最小静音持续时间(毫秒)", 20 | "speech_padding": "覆盖语音填充时间(毫秒)", 21 | "directories": "要处理的目录或文件" 22 | }, 23 | 24 | "info": { 25 | "output_dir": "输出目录:{output_dir}", 26 | "generation_config": "生成配置:{config}", 27 | "initializing_vad": "正在初始化增强VAD模型...", 28 | "vad_activated": "✓ 增强VAD已激活(阈值={threshold})", 29 | "loading_whisper": "正在加载Whisper模型...", 30 | "model_precision": "模型运行精度:{precision},设备:{device}", 31 | "translating": "正在翻译({current}/{total}):{path}", 32 | "duration": "时长:{duration}", 33 | "duration_filtered": "时长:{original} → {filtered}(检测到 {percent} 语音)", 34 | "writing": "正在写入:{path}", 35 | "vad_deactivated": "VAD注入已停用", 36 | "no_files_found": "未找到要翻译的文件", 37 | "logging_to_file": "日志文件:{path}", 38 | "program_version": "程序版本:{version}", 39 | "python_version": "Python版本:{version}", 40 | "platform": "运行平台:{platform}", 41 | "arguments": "运行参数:{args}", 42 | "auto_detected_device": "自动检测到设备:{device}", 43 | "auto_selected_compute_type": "自动选择计算类型 '{compute_type}',设备:'{device}'" 44 | }, 45 | 46 | "tasks": { 47 | "translation": { 48 | "one": "翻译任务:{count}", 49 | "other": "翻译任务:{count}" 50 | } 51 | }, 52 | 53 | "files": { 54 | "found": { 55 | "one": "找到 {count} 个文件待处理", 56 | "other": "找到 {count} 个文件待处理" 57 | }, 58 | "count": { 59 | "one": "{count} 个文件", 60 | "other": "{count} 个文件" 61 | } 62 | }, 63 | 64 | "warnings": { 65 | "provide_directories": "请提供要翻译的目录", 66 | "drag_files": "请将要翻译的文件或目录拖放到此程序上", 67 | "unknown_format": "未知格式:{format}", 68 | "loaded_vad_config": "已从 {path} 加载VAD配置", 69 | "failed_load_vad": "无法从 {path} 加载VAD元数据:{error}", 70 | "using_default_vad": "使用默认VAD配置", 71 | "vad_file_not_found": "在 {path} 未找到VAD元数据文件", 72 | "compute_types_unavailable": "无法获取设备 {device} 支持的计算类型:{error}", 73 | "no_preferred_compute_type": "未找到首选计算类型,使用默认值 '{default}'" 74 | }, 75 | 76 | "progress": { 77 | "vad": "VAD进度:{current}/{total} 块({percent:0.1f}%)在 {device} 上" 78 | }, 79 | 80 | "debug": { 81 | "processing": "正在处理:{path}", 82 | "file_suffix": "文件后缀:{suffix}", 83 | "valid_suffixes": 
"有效后缀:{suffixes}", 84 | "skipped_suffix": "已跳过 - 后缀 '{suffix}' 不在有效音频格式中", 85 | "subtitle_exists": "字幕已存在:{path}", 86 | "skipped_all_exist": "已跳过 - 所有字幕格式已存在", 87 | "added_task": "为格式添加任务:{formats}", 88 | "scanning": "正在扫描:{path}" 89 | }, 90 | 91 | "time": { 92 | "duration_hours": "{hours}小时{minutes}分{seconds:0.0f}秒", 93 | "duration_minutes": "{minutes}分{seconds:0.1f}秒", 94 | "duration_seconds": "{seconds:0.2f}秒" 95 | }, 96 | 97 | "format": { 98 | "percentage": "{value:0.1f}%" 99 | }, 100 | 101 | "vad": { 102 | "onnx_not_installed": "未安装onnxruntime。请使用以下命令安装:\n pip install onnxruntime # CPU版本\n pip install onnxruntime-gpu # GPU版本", 103 | "transformers_not_installed": "未安装transformers。请使用以下命令安装:\n pip install transformers", 104 | "model_loaded": "ONNX模型已加载:{path}", 105 | "auto_configured": "自动配置ONNX使用{threads}个CPU线程(可用{total}个的一半)", 106 | "device": "设备:{device}", 107 | "providers": "提供器:{providers}", 108 | "chunk_duration": "块时长:{duration}毫秒", 109 | "frame_duration": "帧时长:{duration}毫秒", 110 | "librosa_not_installed": "未安装librosa,假设音频已经是16kHz", 111 | "starting": "在 {device} 上开始VAD处理", 112 | "total_samples": "总音频采样数:{samples}", 113 | "chunk_size": "块大小:{samples} 个采样({duration}毫秒)", 114 | "total_chunks": "待处理总块数:{chunks}", 115 | "processing_chunk": "正在处理第 {current}/{total} 块({percent:0.1f}%)在 {device} 上", 116 | "completed": "VAD处理完成:在 {device} 上处理了 {chunks} 块", 117 | "model_initialized": "WhisperVadModel已用模型初始化:{path}", 118 | "using_device": "使用设备:{device}", 119 | "init_failed": "初始化ONNX模型失败:{error}", 120 | "path_invalid": "未提供ONNX模型路径或路径不存在:{path}", 121 | "not_initialized": "WhisperVadModel:ONNX模型未初始化。请提供有效的ONNX模型路径。", 122 | "speech_segments": { 123 | "one": "使用Whisper VAD找到 {count} 个语音片段", 124 | "other": "使用Whisper VAD找到 {count} 个语音片段" 125 | }, 126 | "registered": "已注册带进度回调的whisper_vad模型", 127 | "model_not_found": "未找到模型 {model_id},使用默认模型", 128 | "feature_extractor_loaded": "从本地文件夹加载 WhisperFeatureExtractor: {path}" 129 | }, 130 | 131 | "injection": { 132 | "already_active": "VAD注入已激活,跳过", 133 | "patched": "已修补 {path}", 134 | "patch_failed": "无法修补 {path}:{error}", 135 | "activated_with_model": "VAD注入已激活,使用模型 '{model_id}'", 136 | "activated": "VAD注入已激活", 137 | "not_active": "VAD注入未激活,无需取消注入", 138 | "stop_error": "停止修补时出错:{error}", 139 | "auto_injected": "已自动注入VAD,使用模型:{model_id}" 140 | }, 141 | 142 | "batch": { 143 | "finding_optimal": "正在寻找最佳批次大小(测试范围:{min_size}-{max_size})...", 144 | "testing_size": "测试批次大小:{size}", 145 | "size_successful": "批次大小 {size} 成功", 146 | "optimal_found": "找到最佳批次大小:{size}", 147 | "oom_error": "批次大小 {size} 因内存不足而失败", 148 | "runtime_error": "批次大小 {size} 失败,错误:{error}", 149 | "reducing_size": "将批次大小从 {old_size} 减小到 {new_size}", 150 | "no_suitable_size": "即使使用最小批次大小 {min_size} 也无法找到合适的批次大小", 151 | "unexpected_error": "测试批次大小 {size} 时出现意外错误:{error}", 152 | "attempting_transcription": "尝试使用批次大小={size}进行转录", 153 | "auto_adjusted": "内存不足后批次大小自动调整为 {size}", 154 | "oom_reducing": "批次大小={old_size}时内存不足,减小到 {new_size} (x0.8)...", 155 | "cannot_run_min": "即使使用批次大小={min_size}也无法运行", 156 | "inference_failed": "即使使用最小批次大小={min_size}也无法运行推理。请考虑减小模型大小或使用CPU。" 157 | } 158 | } -------------------------------------------------------------------------------- /RELEASE_NOTES_CN.md: -------------------------------------------------------------------------------- 1 | # 🎙️ Faster Whisper 转录工具 - 发行说明 2 | 3 | ## ⚠️ 重要声明 4 | 5 | > **本软件为开源软件** 6 | > 7 | > 🔗 **开源地址**: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 8 | > 9 | > 👥 **开发团队**: AI汉化组 (https://t.me/transWithAI) 10 
| 11 | --- 12 | 13 | ## 📦 发行包说明 14 | 15 | 本发行版包含多个变体版本,请根据您的显卡型号选择合适的版本: 16 | 17 | ### 🎯 版本类型说明 18 | 19 | #### 基础版(Base Package) 20 | - **下载大小**:约 2.2GB 21 | - **包含内容**: 22 | - ✅ 所有 GPU 依赖项 23 | - ✅ 音声优化 VAD(语音活动检测)ONNX 模型 24 | - ❌ 不含 Whisper 模型(需自行下载) 25 | - **适用场景**:需要使用自定义模型的用户 26 | 27 | #### 海南鸡版(Chickenrice Edition) 28 | - **下载大小**:约 4.4GB 29 | - **包含内容**: 30 | - ✅ 所有 GPU 依赖项 31 | - ✅ 音声优化 VAD(语音活动检测)ONNX 模型 32 | - ✅ **"海南鸡v2 5000小时"** 日文转中文优化模型 33 | - **适用场景**:开箱即用的日文转中文翻译 34 | - **模型说明**:包含经过5000小时音频数据训练的海南鸡v2版本模型,专门优化日文转中文翻译 35 | 36 | ### 📌 文件命名规则 37 | 38 | | 文件名后缀 | CUDA版本 | 模型类型 | 39 | |-----------|---------|---------| 40 | | `*_cu118.zip` | CUDA 11.8 | 基础版 | 41 | | `*_cu118-chickenrice.zip` | CUDA 11.8 | 海南鸡版 | 42 | | `*_cu122.zip` | CUDA 12.2 | 基础版 | 43 | | `*_cu122-chickenrice.zip` | CUDA 12.2 | 海南鸡版 | 44 | | `*_cu128.zip` | CUDA 12.8 | 基础版 | 45 | | `*_cu128-chickenrice.zip` | CUDA 12.8 | 海南鸡版 | 46 | 47 | --- 48 | 49 | ## 🔍 如何选择正确的 CUDA 版本 50 | 51 | ### 方法一:通过 nvidia-smi 查询 52 | 53 | 1. 打开命令提示符或终端 54 | 2. 输入命令:`nvidia-smi` 55 | 3. 查看输出中的 **Driver Version** 和 **CUDA Version** 56 | 57 | ``` 58 | +-------------------------------------------------------------------------+ 59 | | NVIDIA-SMI 570.00 Driver Version: 570.00 CUDA Version: 12.8| 60 | +-------------------------------------------------------------------------+ 61 | ``` 62 | 63 | ### 方法二:通过显卡型号和驱动版本对照表 64 | 65 | #### 📊 NVIDIA 驱动版本与 CUDA 版本兼容性表 66 | 67 | | CUDA 版本 | 最低驱动要求(Windows) | 最低驱动要求(Linux) | 推荐使用场景 | 68 | |----------|------------------------|---------------------|------------| 69 | | **CUDA 11.8** | ≥452.39 | ≥450.80.02 | 较旧的显卡(GTX 10系列、RTX 20/30系列) | 70 | | **CUDA 12.2** | ≥525.60.13 | ≥525.60.13 | RTX 30/40系列,较新的驱动 | 71 | | **CUDA 12.8** | ≥570.65 | ≥570.26 | RTX 40/50系列,最新驱动 | 72 | 73 | #### 🎮 显卡型号推荐表 74 | 75 | | 显卡系列 | 推荐 CUDA 版本 | 说明 | 76 | |---------|--------------|------| 77 | | GTX 10系列(1060/1070/1080等) | **CUDA 11.8** | 兼容性最好 | 78 | | GTX 16系列(1650/1660等) | **CUDA 11.8** | 兼容性最好 | 79 | | RTX 20系列(2060/2070/2080等) | **CUDA 11.8** 或 **12.2** | 根据驱动版本选择 | 80 | | RTX 30系列(3060/3070/3080/3090等) | **CUDA 12.2** | 推荐使用 | 81 | | RTX 40系列(4060/4070/4080/4090等) | **CUDA 12.2** 或 **12.8** | 最新驱动用12.8 | 82 | | **RTX 50系列(5090/5080/5070等)** | **🔴 必须使用 CUDA 12.8** | ⚠️ 注意:RTX 50系列必须使用CUDA 12.8版本 | 83 | 84 | ### ⚠️ 重要提示 85 | 86 | - **RTX 50系列用户**:由于新架构要求,**必须使用 CUDA 12.8 版本**,驱动版本必须 ≥570.00 87 | - **驱动版本查询**:在 nvidia-smi 中显示的 CUDA Version 是您的驱动**支持的最高**CUDA版本 88 | - **向下兼容**:高版本驱动可以运行低版本CUDA程序(例如:570驱动可以运行CUDA 11.8程序) 89 | - **性能考虑**:使用与驱动匹配的CUDA版本可获得最佳性能 90 | 91 | --- 92 | 93 | ## 📥 模型下载说明 94 | 95 | ### 基础版用户(需自行下载模型) 96 | 97 | 基础版包含VAD模型,但**不包含**Whisper语音识别模型。您需要: 98 | 99 | 1. **从 Hugging Face 下载模型** 100 | - 示例模型地址:https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st 101 | - 这是"海南鸡v2 5000小时"版本的日文转中文优化模型 102 | 103 | 2. **放置模型文件** 104 | ``` 105 | 将下载的模型文件放入: 106 | faster_whisper_transwithai_chickenrice/ 107 | └── models/ 108 | └── [您下载的模型文件夹]/ 109 | ``` 110 | 111 | 3. **其他可用模型** 112 | - OpenAI Whisper官方模型 113 | - 其他社区优化模型 114 | 115 | ### 海南鸡版用户(开箱即用) 116 | 117 | 海南鸡版已包含: 118 | - ✅ 音声优化 VAD 语音活动检测模型 119 | - ✅ "海南鸡v2 5000小时"日文转中文优化版Whisper模型 120 | - ✅ 所有必要的配置文件 121 | 122 | **无需额外下载**,解压后直接运行即可使用! 123 | 124 | --- 125 | 126 | ## 🚀 快速开始指南 127 | 128 | ### 1. 选择版本 129 | 根据上述表格,选择适合您显卡的CUDA版本 130 | 131 | ### 2. 下载对应版本 132 | - 仅转录/翻译:下载基础版 + 自行下载模型 133 | - 日文转中文优化:下载海南鸡版(推荐) 134 | 135 | ### 3. 
解压并运行 136 | ```bash 137 | # GPU模式(推荐) 138 | 将音视频文件拖放到 "运行(GPU).bat" 139 | 140 | # CPU模式(无显卡用户) 141 | 将音视频文件拖放到 "运行(CPU).bat" 142 | 143 | # 低显存模式(4GB显存) 144 | 将音视频文件拖放到 "运行(GPU,低显存模式).bat" 145 | ``` 146 | 147 | --- 148 | 149 | ## 💡 常见问题 150 | 151 | **Q: 我应该选择哪个CUDA版本?** 152 | A: 运行 `nvidia-smi` 查看您的驱动版本,然后对照上表选择。 153 | 154 | **Q: 海南鸡版和基础版有什么区别?** 155 | A: 海南鸡版包含预训练的日文转中文优化模型(5000小时训练),基础版需要自行下载模型。 156 | 157 | **Q: RTX 4090 应该用哪个版本?** 158 | A: 推荐使用 CUDA 12.2 或 12.8 版本,取决于您的驱动版本。 159 | 160 | **Q: 显存不足怎么办?** 161 | A: 使用"低显存模式"批处理文件,或切换到CPU模式。 162 | 163 | --- 164 | 165 | ## 📝 更新日志 166 | 167 | ### v1.4 (2025-11-25) 168 | - 🚀 **批处理推理支持**:新增批处理推理模式(--enable_batching),大幅提升处理速度 169 | - ⚡ **智能批次大小自动检测**:程序启动时自动测试不同批次大小(1-8),找到显存允许的最大批次 170 | - 🎯 **手动批次大小控制**:支持通过 --batch_size 参数手动指定批次大小,跳过自动检测 171 | - 🔧 **运行时自适应调整**:处理过程中如遇到显存不足(OOM),自动降低批次大小(每次减少20%)继续处理 172 | - 📈 **最大批次大小配置**:通过 --max_batch_size 参数控制自动检测的上限(默认8,可根据显存调整) 173 | - 📦 **新增高显存加速模式**:提供 "运行(GPU,高显存加速模式).bat" 专门为8GB+显存用户优化 174 | - 🔨 **修复批处理兼容性**:应用补丁修复faster-whisper批处理的max_initial_timestamp参数传递问题 175 | - 🌐 **批处理日志国际化**:为批处理功能添加完整的中英文本地化消息,便于调试和使用 176 | 177 | **📊 批处理模式说明**: 178 | - **并行处理优势**:批处理模式下,多个音频片段并行转录,每个片段独立处理,不依赖前面片段的结果 179 | - **精度权衡**:批处理可能略微降低转录精度(由于失去了条件生成的上下文信息) 180 | - **特殊场景优化**:在某些场景下批处理反而效果更好,因为避免了条件生成可能带来的错误传播 181 | - 噪声较多的音频:避免噪声片段影响后续转录 182 | - 多说话人场景:减少不同说话人之间的相互干扰 183 | - 长音频文件:防止错误累积效应 184 | 185 | **🎮 使用建议**: 186 | - 8GB+ 显存:使用 "运行(GPU,高显存加速模式).bat",自动检测最优批次大小 187 | - 4-8GB 显存:手动设置较小批次大小,如 --batch_size=2 或 4 188 | - 追求最高精度:使用常规模式(不加 --enable_batching 参数) 189 | - 追求处理速度:启用批处理模式,接受轻微的精度权衡 190 | 191 | ### v1.3 (2025-11-17) 192 | - 🤖 智能计算类型选择:自动检测设备并选择最优计算类型(bfloat16 > float16 > int8 > float32) 193 | - 🔍 增强设备自动检测:改进CUDA可用性检测,支持CUDA_VISIBLE_DEVICES环境变量 194 | - 🔇 抑制警告信息:添加TRANSFORMERS_NO_ADVISORY_WARNINGS环境变量,减少日志噪音 195 | - 🎯 简化批处理文件:移除硬编码的计算类型设置,全部使用自动检测模式 196 | - 📊 改进日志记录:添加自动检测设备和计算类型的详细日志信息 197 | - 🐛 修复日志重复问题:移除重复的根日志处理器,避免日志重复输出 198 | - 🌐 增强国际化支持:为自动检测功能添加完整的中英文本地化消息 199 | 200 | ### v1.2 (2025-11-15) 201 | - ⚡ CPU模式性能优化:添加 int16 计算类型支持,提升CPU处理速度 202 | - 🎮 GPU兼容性改进:强制使用 float16 替代模型精度 bfloat16,提升显卡兼容性 203 | - 🔧 精简批处理文件:合并视频翻译功能到主批处理文件中 204 | - 📝 新增日志记录功能:自动保存运行日志到 latest.log 文件,方便问题反馈 205 | 206 | ### v1.1 (2025-11-14) 207 | - 🌐 离线支持改进:预下载 whisper-base 模型文件,实现完全离线运行 208 | - 📥 自动模型管理:WhisperFeatureExtractor 优先使用本地模型,避免网络超时 209 | - 🔧 优化下载流程:支持从已有模型文件夹复制,减少重复下载 210 | - 🚀 提升稳定性:解决网络不稳定环境下的 HuggingFace 连接超时问题 211 | 212 | ### v1.0 (2025-11-13) 213 | - 🎯 支持多CUDA版本(11.8/12.2/12.8) 214 | - 🚀 优化的日文转中文翻译效果(海南鸡v2版本) 215 | - 🔊 音声优化的VAD语音活动检测 216 | - 💾 改进的缓存机制,加快CI/CD构建速度 217 | - 📦 分离的基础版和完整版,满足不同需求 218 | - 🔧 自动VAD模型下载和管理 219 | 220 | --- 221 | 222 | ## 📞 技术支持 223 | 224 | 如遇到问题,请: 225 | 1. 检查显卡驱动是否为最新版本 226 | 2. 确认选择了正确的CUDA版本 227 | 3. 查看控制台输出的错误信息 228 | 4. 
提交Issue到项目仓库: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 229 | 230 | ### 🔗 官方链接 231 | - **GitHub仓库**: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice 232 | - **音声优化 VAD 模型**: https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx 233 | - **Telegram群组**: https://t.me/transWithAI 234 | - **开发团队**: AI汉化组 235 | 236 | --- 237 | 238 | ## 🙏 致谢 239 | 240 | - 🚀 基于 [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) 开发 241 | - 🐔 使用 [chickenrice0721/whisper-large-v2-translate-zh-v0.2-st](https://huggingface.co/chickenrice0721/whisper-large-v2-translate-zh-v0.2-st) 日文转中文优化模型 242 | - 🔊 使用 [TransWithAI/Whisper-Vad-EncDec-ASMR-onnx](https://huggingface.co/TransWithAI/Whisper-Vad-EncDec-ASMR-onnx) 音声优化 VAD 模型 243 | - 🎙️ [OpenAI Whisper](https://github.com/openai/whisper) 原始项目 244 | - 💪 **感谢某匿名群友的算力和技术支持** 245 | 246 | --- 247 | 248 | *本工具基于 Faster Whisper 开发,海南鸡模型经过5000小时音频数据优化训练,专门针对日文转中文翻译场景。* 249 | *由AI汉化组开源维护,永久免费。* 250 | -------------------------------------------------------------------------------- /locales/en-US/messages.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": { 3 | "name": "Whisper Transcription Enhanced", 4 | "description": "Whisper transcription with custom VAD injection" 5 | }, 6 | 7 | "args": { 8 | "model_path": "Path to the Whisper model", 9 | "device": "Device to run the model on (cpu, cuda, auto)", 10 | "compute_type": "Compute type for the model", 11 | "overwrite": "Overwrite existing subtitle files", 12 | "audio_extensions": "Comma-separated list of audio file extensions to process", 13 | "subtitle_formats": "Comma-separated list of subtitle formats (lrc, srt, vtt, txt)", 14 | "output_dir": "Output directory for subtitle files", 15 | "config_file": "Path to generation config file", 16 | "log_level": "Logging level", 17 | "vad_threshold": "Override VAD threshold", 18 | "min_speech_duration": "Override minimum speech duration in ms", 19 | "min_silence_duration": "Override minimum silence duration in ms", 20 | "speech_padding": "Override speech padding in ms", 21 | "directories": "Directories or files to process" 22 | }, 23 | 24 | "info": { 25 | "output_dir": "Output directory: {output_dir}", 26 | "generation_config": "Generation config: {config}", 27 | "initializing_vad": "Initializing enhanced VAD model...", 28 | "vad_activated": "✓ Enhanced VAD activated (threshold={threshold})", 29 | "loading_whisper": "Loading Whisper model...", 30 | "model_precision": "Model running with precision: {precision} on device: {device}", 31 | "translating": "Translating ({current}/{total}): {path}", 32 | "duration": "Duration: {duration}", 33 | "duration_filtered": "Duration: {original} → {filtered} ({percent} speech detected)", 34 | "writing": "Writing: {path}", 35 | "vad_deactivated": "VAD injection deactivated", 36 | "no_files_found": "No files found to translate", 37 | "logging_to_file": "Logging to file: {path}", 38 | "program_version": "Program version: {version}", 39 | "python_version": "Python version: {version}", 40 | "platform": "Platform: {platform}", 41 | "arguments": "Arguments: {args}", 42 | "auto_detected_device": "Auto-detected device: {device}", 43 | "auto_selected_compute_type": "Auto-selected compute type '{compute_type}' for device '{device}'" 44 | }, 45 | 46 | "tasks": { 47 | "translation": { 48 | "one": "Translation task: {count}", 49 | "other": "Translation tasks: {count}" 50 | } 51 | }, 52 | 53 | "files": { 54 | "found": { 55 | "one": "Found {count} 
file to process", 56 | "other": "Found {count} files to process" 57 | }, 58 | "count": { 59 | "one": "{count} file", 60 | "other": "{count} files" 61 | } 62 | }, 63 | 64 | "warnings": { 65 | "provide_directories": "Please provide directories to translate", 66 | "drag_files": "Please drag files or directories to translate onto this program", 67 | "unknown_format": "Unknown format: {format}", 68 | "loaded_vad_config": "Loaded VAD configuration from {path}", 69 | "failed_load_vad": "Failed to load VAD metadata from {path}: {error}", 70 | "using_default_vad": "Using default VAD configuration", 71 | "vad_file_not_found": "VAD metadata file not found at {path}", 72 | "compute_types_unavailable": "Could not get supported compute types for {device}: {error}", 73 | "no_preferred_compute_type": "No preferred compute type found, using default '{default}'" 74 | }, 75 | 76 | "progress": { 77 | "vad": "VAD Progress: {current}/{total} chunks ({percent:0.1f}%) on {device}" 78 | }, 79 | 80 | "debug": { 81 | "processing": "Processing: {path}", 82 | "file_suffix": "File suffix: {suffix}", 83 | "valid_suffixes": "Valid suffixes: {suffixes}", 84 | "skipped_suffix": "Skipped - suffix '{suffix}' not in valid audio formats", 85 | "subtitle_exists": "Subtitle already exists: {path}", 86 | "skipped_all_exist": "Skipped - all subtitle formats already exist", 87 | "added_task": "Added task for formats: {formats}", 88 | "scanning": "Scanning: {path}" 89 | }, 90 | 91 | "time": { 92 | "duration_hours": "{hours}h {minutes}m {seconds:0.0f}s", 93 | "duration_minutes": "{minutes}m {seconds:0.1f}s", 94 | "duration_seconds": "{seconds:0.2f}s" 95 | }, 96 | 97 | "format": { 98 | "percentage": "{value:0.1f}%" 99 | }, 100 | 101 | "vad": { 102 | "onnx_not_installed": "onnxruntime not installed. Install with:\n pip install onnxruntime # For CPU\n pip install onnxruntime-gpu # For GPU", 103 | "transformers_not_installed": "transformers not installed. Install with:\n pip install transformers", 104 | "model_loaded": "ONNX Model loaded: {path}", 105 | "auto_configured": "Auto-configured ONNX to use {threads} CPU threads (half of {total} available)", 106 | "device": "Device: {device}", 107 | "providers": "Providers: {providers}", 108 | "chunk_duration": "Chunk duration: {duration}ms", 109 | "frame_duration": "Frame duration: {duration}ms", 110 | "librosa_not_installed": "librosa not installed, assuming audio is already at 16kHz", 111 | "starting": "Starting VAD processing on {device}", 112 | "total_samples": "Total audio samples: {samples}", 113 | "chunk_size": "Chunk size: {samples} samples ({duration}ms)", 114 | "total_chunks": "Total chunks to process: {chunks}", 115 | "processing_chunk": "Processing chunk {current}/{total} ({percent:0.1f}%) on {device}", 116 | "completed": "VAD processing completed: {chunks} chunks processed on {device}", 117 | "model_initialized": "WhisperVadModel initialized with model: {path}", 118 | "using_device": "Using device: {device}", 119 | "init_failed": "Failed to initialize ONNX model: {error}", 120 | "path_invalid": "ONNX model path not provided or doesn't exist: {path}", 121 | "not_initialized": "WhisperVadModel: ONNX model not initialized. 
Please provide a valid ONNX model path.", 122 | "speech_segments": { 123 | "one": "Found {count} speech segment using Whisper VAD", 124 | "other": "Found {count} speech segments using Whisper VAD" 125 | }, 126 | "registered": "Registered whisper_vad model with progress callback", 127 | "model_not_found": "Model {model_id} not found, using default", 128 | "feature_extractor_loaded": "Loaded WhisperFeatureExtractor from local folder: {path}" 129 | }, 130 | 131 | "injection": { 132 | "already_active": "VAD injection already active, skipping", 133 | "patched": "Patched {path}", 134 | "patch_failed": "Could not patch {path}: {error}", 135 | "activated_with_model": "VAD injection activated with model '{model_id}'", 136 | "activated": "VAD injection activated", 137 | "not_active": "VAD injection not active, nothing to uninject", 138 | "stop_error": "Error stopping patch: {error}", 139 | "auto_injected": "Auto-injected VAD with model: {model_id}" 140 | }, 141 | 142 | "batch": { 143 | "finding_optimal": "Finding optimal batch size (testing range: {min_size}-{max_size})...", 144 | "testing_size": "Testing batch size: {size}", 145 | "size_successful": "Batch size {size} successful", 146 | "optimal_found": "Optimal batch size found: {size}", 147 | "oom_error": "Batch size {size} failed with out of memory error", 148 | "runtime_error": "Batch size {size} failed with error: {error}", 149 | "reducing_size": "Reducing batch size from {old_size} to {new_size}", 150 | "no_suitable_size": "Cannot find suitable batch size even with minimum size {min_size}", 151 | "unexpected_error": "Unexpected error testing batch size {size}: {error}", 152 | "attempting_transcription": "Attempting transcription with batch_size={size}", 153 | "auto_adjusted": "Batch size auto-adjusted to {size} after OOM", 154 | "oom_reducing": "OOM with batch_size={old_size}, reducing to {new_size} (x0.8)...", 155 | "cannot_run_min": "Cannot run even with batch_size={min_size}", 156 | "inference_failed": "Unable to run inference even with minimum batch_size={min_size}. Consider reducing model size or using CPU." 157 | } 158 | } -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/injection.py: -------------------------------------------------------------------------------- 1 | """ 2 | VAD Injection System - Redirects faster_whisper VAD calls to custom implementations 3 | Provides transparent switching between custom VAD models 4 | """ 5 | 6 | import unittest.mock as mock 7 | from typing import List, Dict, Any, Optional, Callable 8 | import logging 9 | import numpy as np 10 | from dataclasses import dataclass 11 | 12 | from .vad_manager import VadModelManager, VadConfig 13 | 14 | # Import modern i18n module for translations 15 | from . 
import i18n_modern as i18n 16 | 17 | # Convenience imports 18 | _ = i18n._ 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | # Global flag to track if injection is active 23 | _injection_active = False 24 | _active_patches = [] 25 | _global_config = None 26 | _global_progress_callback = None 27 | 28 | 29 | @dataclass 30 | class VadOptionsCompat: 31 | """Mock VadOptions class that mimics faster_whisper.vad.VadOptions""" 32 | threshold: float = 0.5 33 | neg_threshold: Optional[float] = None 34 | min_speech_duration_ms: int = 0 35 | max_speech_duration_s: float = float('inf') 36 | min_silence_duration_ms: int = 2000 37 | speech_pad_ms: int = 400 38 | 39 | def __post_init__(self): 40 | """Compatibility with the original VadOptions""" 41 | pass 42 | 43 | 44 | def set_global_config(config: VadConfig): 45 | """Set the global configuration for VAD injection""" 46 | global _global_config 47 | _global_config = config 48 | 49 | 50 | def get_global_config() -> VadConfig: 51 | """Get the global configuration, creating default if needed""" 52 | global _global_config 53 | if _global_config is None: 54 | _global_config = VadConfig() 55 | return _global_config 56 | 57 | 58 | def get_speech_timestamps_injected( 59 | audio: np.ndarray, 60 | vad_options: Any = None, 61 | sampling_rate: int = 16000, 62 | **kwargs 63 | ) -> List[Dict[str, Any]]: 64 | """ 65 | Injected implementation of get_speech_timestamps that uses our VAD model manager. 66 | 67 | This function is injected in place of faster_whisper.vad.get_speech_timestamps 68 | to transparently use custom VAD models. 69 | """ 70 | # Get configuration 71 | config = get_global_config() 72 | 73 | # Check if a specific model was requested via kwargs 74 | model_id = kwargs.get('vad_model_id', config.default_model) 75 | 76 | # Check if a progress callback was provided (from kwargs or global) 77 | progress_callback = kwargs.get('progress_callback', None) or _global_progress_callback 78 | 79 | # Create manager (this uses cached instances internally) 80 | manager = VadModelManager(config=config, ttl=config.ttl, progress_callback=progress_callback) 81 | 82 | # Extract options from vad_options (works with both real and mock VadOptions) 83 | if vad_options is not None: 84 | options_dict = { 85 | 'threshold': getattr(vad_options, 'threshold', config.threshold), 86 | 'neg_threshold': getattr(vad_options, 'neg_threshold', config.neg_threshold), 87 | 'min_speech_duration_ms': getattr(vad_options, 'min_speech_duration_ms', config.min_speech_duration_ms), 88 | 'max_speech_duration_s': getattr(vad_options, 'max_speech_duration_s', config.max_speech_duration_s), 89 | 'min_silence_duration_ms': getattr(vad_options, 'min_silence_duration_ms', config.min_silence_duration_ms), 90 | 'speech_pad_ms': getattr(vad_options, 'speech_pad_ms', config.speech_pad_ms), 91 | } 92 | else: 93 | # Use defaults from config 94 | options_dict = { 95 | 'threshold': config.threshold, 96 | 'neg_threshold': config.neg_threshold, 97 | 'min_speech_duration_ms': config.min_speech_duration_ms, 98 | 'max_speech_duration_s': config.max_speech_duration_s, 99 | 'min_silence_duration_ms': config.min_silence_duration_ms, 100 | 'speech_pad_ms': config.speech_pad_ms, 101 | } 102 | 103 | # Remove vad_model_id and progress_callback from kwargs to avoid passing them to the actual VAD 104 | kwargs_copy = kwargs.copy() 105 | kwargs_copy.pop('vad_model_id', None) 106 | kwargs_copy.pop('progress_callback', None) 107 | 108 | # Merge options_dict with remaining kwargs 109 | final_kwargs = {**options_dict, 
**kwargs_copy} 110 | 111 | # Get speech timestamps using the model manager 112 | return manager.get_speech_timestamps( 113 | model_id=model_id, 114 | audio=audio, 115 | sampling_rate=sampling_rate, 116 | **final_kwargs 117 | ) 118 | 119 | 120 | def get_vad_patches(model_id: Optional[str] = None) -> Dict[str, mock.Mock]: 121 | """ 122 | Get all VAD-related patches for the codebase. 123 | 124 | Args: 125 | model_id: Optional model ID to force (e.g., "whisper_vad") 126 | 127 | Returns: 128 | Dictionary of patch paths to mock objects 129 | """ 130 | # Create wrapper functions that include model_id if specified 131 | if model_id: 132 | def get_timestamps_wrapper(audio, vad_options=None, sampling_rate=16000, **kwargs): 133 | kwargs['vad_model_id'] = model_id 134 | return get_speech_timestamps_injected(audio, vad_options, sampling_rate, **kwargs) 135 | else: 136 | get_timestamps_wrapper = get_speech_timestamps_injected 137 | 138 | patches = { 139 | # Core VAD module patches 140 | 'faster_whisper.vad.VadOptions': mock.Mock(side_effect=VadOptionsCompat), 141 | 'faster_whisper.vad.get_speech_timestamps': mock.Mock(side_effect=get_timestamps_wrapper), 142 | 143 | # Alternative import location (used in transcribe module) 144 | 'faster_whisper.transcribe.get_speech_timestamps': mock.Mock(side_effect=get_timestamps_wrapper), 145 | 146 | # Patch for VadOptions in transcribe module 147 | 'faster_whisper.transcribe.VadOptions': mock.Mock(side_effect=VadOptionsCompat), 148 | 149 | # You can add more patches here for specific modules if needed 150 | # For example, if you have modules that directly import from faster_whisper: 151 | # 'your_module.VadOptions': mock.Mock(side_effect=VadOptionsCompat), 152 | # 'your_module.get_speech_timestamps': mock.Mock(side_effect=get_timestamps_wrapper), 153 | } 154 | 155 | return patches 156 | 157 | 158 | def inject_vad(model_id: Optional[str] = None, config: Optional[VadConfig] = None, progress_callback: Optional[Callable] = None) -> None: 159 | """ 160 | Inject VAD implementation to redirect faster_whisper calls. 161 | 162 | Args: 163 | model_id: Optional model ID to force (e.g., "whisper_vad") 164 | If None, uses the configured default model. 165 | config: Optional VadConfig to use for injection 166 | progress_callback: Optional progress callback for VAD processing 167 | """ 168 | global _injection_active, _active_patches, _global_progress_callback 169 | 170 | if _injection_active: 171 | logger.warning(_("injection.already_active")) 172 | return 173 | 174 | # Store progress callback globally 175 | _global_progress_callback = progress_callback 176 | 177 | # Set config if provided 178 | if config: 179 | set_global_config(config) 180 | 181 | patches_dict = get_vad_patches(model_id) 182 | 183 | for path, mock_obj in patches_dict.items(): 184 | try: 185 | patch = mock.patch(path, mock_obj) 186 | patch.start() 187 | _active_patches.append(patch) 188 | logger.debug(_("injection.patched", path=path)) 189 | except Exception as e: 190 | logger.debug(_("injection.patch_failed", path=path, error=e)) 191 | 192 | _injection_active = True 193 | if model_id: 194 | logger.info(_("injection.activated_with_model", model_id=model_id)) 195 | else: 196 | logger.info(_("injection.activated")) 197 | 198 | 199 | def uninject_vad() -> None: 200 | """ 201 | Remove VAD injection and restore original faster_whisper behavior. 
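
    If injection is not currently active, a warning is logged and the call is a
    no-op; otherwise every active patch is stopped and the stored progress
    callback is cleared.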
202 | """ 203 | global _injection_active, _active_patches, _global_progress_callback 204 | 205 | if not _injection_active: 206 | logger.warning(_("injection.not_active")) 207 | return 208 | 209 | for patch in _active_patches: 210 | try: 211 | patch.stop() 212 | except Exception as e: 213 | logger.warning(_("injection.stop_error", error=e)) 214 | 215 | _active_patches.clear() 216 | _injection_active = False 217 | _global_progress_callback = None # Clear the progress callback 218 | logger.info(_("info.vad_deactivated")) 219 | 220 | 221 | class VadInjectionContext: 222 | """ 223 | Context manager for VAD injection. 224 | 225 | Usage: 226 | with VadInjectionContext(model_id="whisper_vad"): 227 | # Code that uses faster_whisper VAD will now use whisper VAD 228 | from faster_whisper.vad import get_speech_timestamps 229 | timestamps = get_speech_timestamps(audio, vad_options) 230 | """ 231 | 232 | def __init__(self, model_id: Optional[str] = None, config: Optional[VadConfig] = None): 233 | self.model_id = model_id 234 | self.config = config 235 | self.was_active = False 236 | 237 | def __enter__(self): 238 | global _injection_active 239 | self.was_active = _injection_active 240 | if self.was_active: 241 | uninject_vad() 242 | inject_vad(self.model_id, self.config) 243 | return self 244 | 245 | def __exit__(self, exc_type, exc_val, exc_tb): 246 | uninject_vad() 247 | if self.was_active: 248 | inject_vad() # Restore previous injection 249 | 250 | 251 | def auto_inject_vad(config: Optional[VadConfig] = None) -> None: 252 | """ 253 | Automatically inject VAD based on configuration. 254 | This should be called during application startup. 255 | 256 | Args: 257 | config: Optional VadConfig to use 258 | """ 259 | if config is None: 260 | config = get_global_config() 261 | else: 262 | set_global_config(config) 263 | 264 | # Check if we should inject based on configuration 265 | if config.auto_inject: 266 | model_id = config.default_model 267 | inject_vad(model_id, config) 268 | logger.info(_("injection.auto_injected", model_id=model_id)) 269 | 270 | 271 | def with_vad_injection(model_id: Optional[str] = None, config: Optional[VadConfig] = None): 272 | """ 273 | Decorator to use VAD injection for a specific function. 
274 | 275 | Usage: 276 | @with_vad_injection(model_id="whisper_vad") 277 | def my_function(): 278 | # This function will use whisper VAD 279 | from faster_whisper.vad import get_speech_timestamps 280 | return get_speech_timestamps(audio, vad_options) 281 | """ 282 | def decorator(func): 283 | def wrapper(*args, **kwargs): 284 | with VadInjectionContext(model_id, config): 285 | return func(*args, **kwargs) 286 | return wrapper 287 | return decorator 288 | 289 | 290 | def is_injection_active() -> bool: 291 | """Check if VAD injection is currently active""" 292 | return _injection_active -------------------------------------------------------------------------------- /project.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | import sys 3 | import os 4 | from PyInstaller.utils.hooks import collect_all, collect_data_files, collect_submodules 5 | from pathlib import Path 6 | import glob 7 | 8 | block_cipher = None 9 | 10 | # Collect all data and binaries from critical packages 11 | datas = [] 12 | binaries = [] 13 | hiddenimports = [] 14 | 15 | # Function to detect conda environment and CUDA version 16 | def get_conda_cuda_libs(): 17 | """Detect and collect CUDA/cuDNN libraries from the active conda environment""" 18 | cuda_binaries = [] 19 | 20 | # Get the conda environment path 21 | conda_prefix = os.environ.get('CONDA_PREFIX', sys.prefix) 22 | print(f"Conda environment detected: {conda_prefix}") 23 | 24 | # Detect CUDA version from environment path or libraries 25 | cuda_version = None 26 | if 'cu118' in conda_prefix or 'cuda118' in conda_prefix: 27 | cuda_version = '11.8' 28 | elif 'cu122' in conda_prefix or 'cuda122' in conda_prefix: 29 | cuda_version = '12.2' 30 | elif 'cu128' in conda_prefix or 'cuda128' in conda_prefix: 31 | cuda_version = '12.8' 32 | else: 33 | # Try to detect from cudart version 34 | cudart_files = glob.glob(os.path.join(conda_prefix, 'lib', 'libcudart.so.*')) 35 | if cudart_files: 36 | cudart_file = os.path.basename(cudart_files[0]) 37 | if '11.8' in cudart_file: 38 | cuda_version = '11.8' 39 | elif '12.2' in cudart_file: 40 | cuda_version = '12.2' 41 | elif '12.8' in cudart_file: 42 | cuda_version = '12.8' 43 | 44 | print(f"Detected CUDA version: {cuda_version}") 45 | 46 | # Library paths to check - Windows uses different paths than Linux 47 | if sys.platform == 'win32': 48 | lib_dirs = [ 49 | os.path.join(conda_prefix, 'Library', 'bin'), # Primary location for Windows DLLs 50 | os.path.join(conda_prefix, 'bin'), # Alternative location 51 | os.path.join(conda_prefix, 'DLLs'), # Python DLLs location 52 | ] 53 | 54 | # Also check Python site-packages for ONNX Runtime libraries 55 | import site 56 | site_packages = site.getsitepackages() 57 | for sp in site_packages: 58 | if conda_prefix in sp: 59 | onnx_capi_path = os.path.join(sp, 'onnxruntime', 'capi') 60 | if os.path.exists(onnx_capi_path): 61 | lib_dirs.append(onnx_capi_path) 62 | print(f" Added ONNX Runtime path: {onnx_capi_path}") 63 | 64 | # Windows CUDA library patterns with version numbers 65 | cuda_libs_patterns = [ 66 | # CUDA Runtime 67 | 'cudart64_*.dll', 68 | 'cudart32_*.dll', # 32-bit variant if exists 69 | # cuBLAS 70 | 'cublas64_*.dll', 71 | 'cublasLt64_*.dll', 72 | # cuDNN libraries - critical for deep learning 73 | 'cudnn64_*.dll', 74 | 'cudnn_ops_infer64_*.dll', 75 | 'cudnn_ops_train64_*.dll', 76 | 'cudnn_cnn_infer64_*.dll', 77 | 'cudnn_cnn_train64_*.dll', 78 | 'cudnn_adv_infer64_*.dll', 79 | 
'cudnn_adv_train64_*.dll', 80 | # For newer cuDNN versions (9.x) 81 | 'cudnn*.dll', 82 | # cuFFT 83 | 'cufft64_*.dll', 84 | 'cufftw64_*.dll', 85 | # cuRAND 86 | 'curand64_*.dll', 87 | # cuSPARSE 88 | 'cusparse64_*.dll', 89 | # cuSOLVER 90 | 'cusolver64_*.dll', 91 | 'cusolverMg64_*.dll', 92 | # NVRTC 93 | 'nvrtc64_*.dll', 94 | 'nvrtc-builtins64_*.dll', 95 | # NVIDIA Tools Extension 96 | 'nvToolsExt64_*.dll', 97 | # Additional potential libraries 98 | 'nppc64_*.dll', 99 | 'nppif64_*.dll', 100 | 'npps64_*.dll', 101 | # ONNX Runtime GPU dependencies (important!) 102 | 'onnxruntime_providers_cuda.dll', 103 | 'onnxruntime_providers_tensorrt.dll', 104 | 'onnxruntime_providers_shared.dll', 105 | # Python binding for ONNX Runtime 106 | 'onnxruntime_pybind11_state*.pyd', 107 | ] 108 | else: 109 | # Linux/Unix library paths 110 | lib_dirs = [ 111 | os.path.join(conda_prefix, 'lib'), 112 | os.path.join(conda_prefix, 'lib', 'stubs'), 113 | ] 114 | 115 | # Also check Python site-packages for ONNX Runtime libraries 116 | # This is crucial for finding libonnxruntime_providers_cuda.so, etc. 117 | import site 118 | # Try to get the site-packages directory in the conda environment 119 | python_version = f"python{sys.version_info.major}.{sys.version_info.minor}" 120 | site_packages_paths = [ 121 | os.path.join(conda_prefix, 'lib', python_version, 'site-packages'), 122 | os.path.join(conda_prefix, 'lib', 'python3.10', 'site-packages'), # Fallback for CI 123 | os.path.join(conda_prefix, 'lib', 'python3.11', 'site-packages'), # Alternative version 124 | ] 125 | 126 | for sp_path in site_packages_paths: 127 | onnx_capi_path = os.path.join(sp_path, 'onnxruntime', 'capi') 128 | if os.path.exists(onnx_capi_path): 129 | lib_dirs.append(onnx_capi_path) 130 | print(f" Added ONNX Runtime path: {onnx_capi_path}") 131 | break 132 | 133 | # Linux CUDA library patterns 134 | cuda_libs_patterns = [ 135 | 'libcudart.so*', 136 | 'libcublas.so*', 137 | 'libcublasLt.so*', 138 | 'libcudnn*.so*', 139 | 'libcufft.so*', 140 | 'libcufftw.so*', 141 | 'libcurand.so*', 142 | 'libcusparse.so*', 143 | 'libcusolver.so*', 144 | 'libnvrtc.so*', 145 | 'libnvToolsExt.so*', 146 | # ONNX Runtime GPU dependencies 147 | 'libonnxruntime_providers_cuda.so*', 148 | 'libonnxruntime_providers_tensorrt.so*', 149 | 'libonnxruntime_providers_shared.so*', 150 | # Also check without 'lib' prefix (for files in capi directory) 151 | 'onnxruntime_providers_cuda.so*', 152 | 'onnxruntime_providers_tensorrt.so*', 153 | 'onnxruntime_providers_shared.so*', 154 | # Python extension module 155 | 'onnxruntime_pybind11_state*.so', 156 | ] 157 | 158 | # Collect all matching libraries 159 | for lib_dir in lib_dirs: 160 | if not os.path.exists(lib_dir): 161 | continue 162 | 163 | for pattern in cuda_libs_patterns: 164 | for lib_file in glob.glob(os.path.join(lib_dir, pattern)): 165 | if os.path.isfile(lib_file) and not os.path.islink(lib_file): 166 | # Add to binaries list with destination directory 167 | dest_dir = '.' 
168 | if 'stubs' in lib_file: 169 | dest_dir = 'stubs' 170 | cuda_binaries.append((lib_file, dest_dir)) 171 | print(f" Including CUDA library: {os.path.basename(lib_file)}") 172 | 173 | 174 | return cuda_binaries 175 | 176 | # Collect CUDA/cuDNN libraries 177 | cuda_binaries = get_conda_cuda_libs() 178 | binaries += cuda_binaries 179 | 180 | # Collect CTranslate2 (the actual inference engine for faster-whisper) 181 | try: 182 | ctranslate2_datas, ctranslate2_binaries, ctranslate2_hiddenimports = collect_all('ctranslate2') 183 | datas += ctranslate2_datas 184 | binaries += ctranslate2_binaries 185 | hiddenimports += ctranslate2_hiddenimports 186 | except: 187 | pass 188 | 189 | # Collect faster-whisper 190 | faster_whisper_datas, faster_whisper_binaries, faster_whisper_hiddenimports = collect_all('faster_whisper') 191 | datas += faster_whisper_datas 192 | binaries += faster_whisper_binaries 193 | hiddenimports += faster_whisper_hiddenimports 194 | 195 | # Collect transformers (needed for tokenizers) 196 | transformers_datas, transformers_binaries, transformers_hiddenimports = collect_all('transformers') 197 | datas += transformers_datas 198 | binaries += transformers_binaries 199 | hiddenimports += transformers_hiddenimports 200 | 201 | # Collect onnxruntime for VAD model 202 | # Note: The Python module is always 'onnxruntime' regardless of whether 203 | # you installed onnxruntime-gpu or onnxruntime via pip 204 | onnx_collected = False 205 | onnx_package = 'onnxruntime' # Module name is always 'onnxruntime' 206 | try: 207 | onnx_datas, onnx_binaries, onnx_hiddenimports = collect_all(onnx_package) 208 | datas += onnx_datas 209 | binaries += onnx_binaries 210 | hiddenimports += onnx_hiddenimports 211 | print(f"Collected {onnx_package} successfully") 212 | onnx_collected = True 213 | 214 | # Explicitly add ONNX Runtime capi libraries if not already included 215 | try: 216 | import importlib.util 217 | spec = importlib.util.find_spec(onnx_package) 218 | if spec and spec.origin: 219 | onnx_path = os.path.dirname(spec.origin) 220 | capi_path = os.path.join(onnx_path, 'capi') 221 | 222 | if os.path.exists(capi_path): 223 | print(f" Found ONNX Runtime capi directory: {capi_path}") 224 | for file in os.listdir(capi_path): 225 | if file.endswith(('.so', '.dll', '.pyd', '.dylib')): 226 | src = os.path.join(capi_path, file) 227 | # Add to root directory of the bundle 228 | binaries.append((src, '.')) 229 | print(f" Added capi library: {file}") 230 | except Exception as e: 231 | print(f" Warning: Could not collect capi libraries: {e}") 232 | 233 | except Exception as e: 234 | print(f"Could not collect {onnx_package}: {e}") 235 | onnx_collected = False 236 | 237 | if not onnx_collected: 238 | print("WARNING: Could not collect any ONNX Runtime package") 239 | 240 | # Collect librosa for audio processing 241 | librosa_datas, librosa_binaries, librosa_hiddenimports = collect_all('librosa') 242 | datas += librosa_datas 243 | binaries += librosa_binaries 244 | hiddenimports += librosa_hiddenimports 245 | 246 | # Add numpy 247 | numpy_datas, numpy_binaries, numpy_hiddenimports = collect_all('numpy') 248 | datas += numpy_datas 249 | binaries += numpy_binaries 250 | hiddenimports += numpy_hiddenimports 251 | 252 | # Add other necessary packages 253 | for package in ['pyjson5', 'scipy', 'soundfile', 'audioread', 'resampy', 'numba', 'av', 'tokenizers']: 254 | try: 255 | pkg_datas, pkg_binaries, pkg_hiddenimports = collect_all(package) 256 | datas += pkg_datas 257 | binaries += pkg_binaries 258 | hiddenimports += 
pkg_hiddenimports 259 | except: 260 | pass 261 | 262 | # Collect setuptools and pkg_resources data to fix missing modules 263 | try: 264 | from PyInstaller.utils.hooks import collect_data_files 265 | setuptools_datas = collect_data_files('setuptools') 266 | datas += setuptools_datas 267 | pkg_resources_datas = collect_data_files('pkg_resources') 268 | datas += pkg_resources_datas 269 | except: 270 | pass 271 | 272 | # Explicitly collect backports module to fix ModuleNotFoundError 273 | try: 274 | backports_datas, backports_binaries, backports_hiddenimports = collect_all('backports') 275 | datas += backports_datas 276 | binaries += backports_binaries 277 | hiddenimports += backports_hiddenimports 278 | print("Collected backports module successfully") 279 | except Exception as e: 280 | print(f"Could not collect backports module: {e}") 281 | # Try alternative collection method 282 | try: 283 | import backports 284 | import os 285 | backports_path = os.path.dirname(backports.__file__) 286 | datas.append((backports_path, 'backports')) 287 | print(f"Added backports from path: {backports_path}") 288 | except: 289 | print("Warning: backports module not found - may need to be installed") 290 | 291 | # Add hidden imports for modules that might not be detected automatically 292 | hiddenimports += [ 293 | 'ctranslate2', 294 | 'transformers.models', 295 | 'transformers.models.whisper', 296 | 'transformers.tokenization_utils', 297 | 'transformers.tokenization_utils_base', 298 | 'tokenizers', 299 | 'tokenizers.implementations', 300 | 'tokenizers.models', 301 | 'tokenizers.pre_tokenizers', 302 | 'tokenizers.processors', 303 | 'onnxruntime.capi', 304 | 'onnxruntime.capi._pybind_state', 305 | 'onnxruntime.capi.onnxruntime_providers_cuda', # Important for GPU 306 | 'onnxruntime.capi.onnxruntime_providers_tensorrt', # TensorRT if available 307 | 'librosa.core', 308 | 'librosa.feature', 309 | 'scipy.special._ufuncs_cxx', 310 | 'scipy.linalg._fblas', 311 | 'scipy.linalg._flapack', 312 | 'scipy.linalg._cythonized_array_utils', 313 | 'scipy.linalg._solve_toeplitz', 314 | 'scipy.linalg._matfuncs_sqrtm_triu', 315 | 'scipy.linalg._decomp_lu_cython', 316 | 'scipy.linalg._matfuncs_expm', 317 | 'scipy.linalg.cython_blas', 318 | 'scipy.linalg.cython_lapack', 319 | 'numba.core', 320 | 'numba.cuda', 321 | 'av.audio', 322 | 'av.container', 323 | 'av.stream', 324 | 'pkg_resources.extern', 325 | 'pkg_resources._vendor', 326 | 'packaging', 327 | 'packaging.version', 328 | 'packaging.specifiers', 329 | 'packaging.requirements', 330 | 'backports', # Fix for ModuleNotFoundError 331 | 'backports.functools_lru_cache', # Common backports module 332 | 'setuptools._vendor.jaraco', # Include jaraco modules 333 | 'setuptools._vendor.jaraco.text', 334 | 'setuptools._vendor.jaraco.context', 335 | 'setuptools._vendor.jaraco.functools', 336 | 'code', # For interactive console with --console option 337 | 'readline', # For better console experience (if available) 338 | 'rlcompleter', # For tab completion in console 339 | ] 340 | 341 | # Add project data files 342 | # Note: models directory is excluded and handled separately by CI 343 | datas += [ 344 | ('src/faster_whisper_transwithai_chickenrice', 'faster_whisper_transwithai_chickenrice'), 345 | ('locales', 'locales'), # Include the locales directory with translations 346 | ] 347 | 348 | a = Analysis( 349 | ['infer.py'], 350 | pathex=[], 351 | binaries=binaries, 352 | datas=datas, 353 | hiddenimports=hiddenimports, 354 | hookspath=[], # PyInstaller hooks contrib should be auto-detected 355 
| hooksconfig={}, 356 | runtime_hooks=['runtime_hook.py'], # Add runtime hook to set KMP_DUPLICATE_LIB_OK 357 | excludes=[ 358 | 'matplotlib', 359 | 'tkinter', 360 | 'PyQt5', 361 | 'PyQt6', 362 | 'PySide2', 363 | 'PySide6', 364 | 'notebook', 365 | 'jupyter', 366 | 'IPython', 367 | 'pytest', 368 | ], 369 | win_no_prefer_redirects=False, 370 | win_private_assemblies=False, 371 | cipher=block_cipher, 372 | noarchive=False, 373 | ) 374 | 375 | pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) 376 | 377 | exe = EXE( 378 | pyz, 379 | a.scripts, 380 | [], 381 | exclude_binaries=True, 382 | name='infer', 383 | debug=False, 384 | bootloader_ignore_signals=False, 385 | strip=False, 386 | upx=False, 387 | console=True, 388 | disable_windowed_traceback=False, 389 | argv_emulation=False, 390 | target_arch=None, 391 | codesign_identity=None, 392 | entitlements_file=None, 393 | icon='transwithai.ico' if os.path.exists('transwithai.ico') else None, 394 | ) 395 | 396 | coll = COLLECT( 397 | exe, 398 | a.binaries, 399 | a.zipfiles, 400 | a.datas, 401 | strip=False, 402 | upx=False, 403 | upx_exclude=[], 404 | name='faster_whisper_transwithai_chickenrice', 405 | ) -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/i18n_modern.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ultra-modern, dependency-free i18n module using JSON. 3 | 4 | This is a lightweight, modern internationalization solution that: 5 | - Uses JSON files (human-readable, easy to edit) 6 | - No external dependencies 7 | - Supports nested keys with dot notation 8 | - Interpolation with {variable} syntax 9 | - Pluralization support 10 | - Lazy loading 11 | - Type hints for better IDE support 12 | """ 13 | 14 | import os 15 | import sys 16 | import json 17 | import locale 18 | import re 19 | from pathlib import Path 20 | from typing import Dict, Any, Optional, List, Union 21 | from functools import lru_cache 22 | from dataclasses import dataclass 23 | from enum import Enum 24 | 25 | 26 | class PluralForm(Enum): 27 | """Plural forms for different languages.""" 28 | ZERO = "zero" 29 | ONE = "one" 30 | TWO = "two" 31 | FEW = "few" 32 | MANY = "many" 33 | OTHER = "other" 34 | 35 | 36 | @dataclass 37 | class LocaleInfo: 38 | """Information about a locale.""" 39 | code: str 40 | language: str 41 | region: Optional[str] = None 42 | script: Optional[str] = None 43 | 44 | @property 45 | def language_code(self) -> str: 46 | """Get just the language part.""" 47 | return self.language 48 | 49 | @property 50 | def full_code(self) -> str: 51 | """Get full locale code.""" 52 | parts = [self.language] 53 | if self.script: 54 | parts.append(self.script) 55 | if self.region: 56 | parts.append(self.region) 57 | return '-'.join(parts) 58 | 59 | 60 | class PluralRules: 61 | """Simplified plural rules for common languages.""" 62 | 63 | @staticmethod 64 | def get_plural_form(locale_code: str, count: Union[int, float]) -> PluralForm: 65 | """ 66 | Get the appropriate plural form for a count in a given locale. 67 | 68 | This is a simplified version of CLDR plural rules. 69 | """ 70 | lang = locale_code.split('-')[0].lower() 71 | n = abs(count) 72 | 73 | # Languages with single form (Chinese, Japanese, Korean, Thai, etc.) 
74 | if lang in ['zh', 'ja', 'ko', 'th', 'vi', 'id', 'ms']: 75 | return PluralForm.OTHER 76 | 77 | # English and Germanic languages 78 | if lang in ['en', 'de', 'nl', 'sv', 'da', 'no']: 79 | return PluralForm.ONE if n == 1 else PluralForm.OTHER 80 | 81 | # French, Portuguese, Spanish, Italian 82 | if lang in ['fr', 'pt', 'es', 'it']: 83 | if n == 0: 84 | return PluralForm.ZERO if lang == 'fr' else PluralForm.OTHER 85 | elif n == 1: 86 | return PluralForm.ONE 87 | else: 88 | return PluralForm.OTHER 89 | 90 | # Russian and Slavic languages (simplified) 91 | if lang in ['ru', 'uk', 'pl', 'cs', 'sk']: 92 | if n == 1: 93 | return PluralForm.ONE 94 | elif 2 <= n <= 4: 95 | return PluralForm.FEW 96 | else: 97 | return PluralForm.OTHER 98 | 99 | # Arabic (simplified) 100 | if lang == 'ar': 101 | if n == 0: 102 | return PluralForm.ZERO 103 | elif n == 1: 104 | return PluralForm.ONE 105 | elif n == 2: 106 | return PluralForm.TWO 107 | elif 3 <= n <= 10: 108 | return PluralForm.FEW 109 | elif 11 <= n <= 99: 110 | return PluralForm.MANY 111 | else: 112 | return PluralForm.OTHER 113 | 114 | # Default 115 | return PluralForm.OTHER 116 | 117 | 118 | class ModernI18n: 119 | """ 120 | Modern, lightweight i18n implementation using JSON. 121 | 122 | Features: 123 | - JSON-based translations (human-readable) 124 | - Nested key support with dot notation 125 | - Variable interpolation with {var} syntax 126 | - Smart pluralization 127 | - Locale auto-detection 128 | - Fallback chains 129 | - No external dependencies 130 | """ 131 | 132 | def __init__(self, 133 | locales_dir: Optional[Union[str, Path]] = None, 134 | default_locale: str = 'zh-CN', 135 | fallback_locale: str = 'en-US'): 136 | """ 137 | Initialize the i18n system. 138 | 139 | Args: 140 | locales_dir: Directory containing JSON translation files 141 | default_locale: Default locale to use 142 | fallback_locale: Fallback locale for missing translations 143 | """ 144 | self.locales_dir = Path(locales_dir or self._find_locales_dir()) 145 | self.default_locale = default_locale 146 | self.fallback_locale = fallback_locale 147 | self._translations: Dict[str, Dict[str, Any]] = {} 148 | self._current_locale: Optional[str] = None 149 | 150 | # Auto-detect and set locale 151 | detected = self._detect_locale() 152 | self.set_locale(detected) 153 | 154 | def _find_locales_dir(self) -> Path: 155 | """Find the locales directory.""" 156 | # Check if running from PyInstaller bundle 157 | if getattr(sys, 'frozen', False): 158 | # Running from executable 159 | # sys._MEIPASS is the temporary folder where PyInstaller extracts files 160 | base_path = Path(sys._MEIPASS) 161 | possible_paths = [ 162 | base_path / 'locales', 163 | Path(sys.executable).parent / 'locales', 164 | ] 165 | else: 166 | # Running from source 167 | possible_paths = [ 168 | Path(__file__).parent.parent.parent / 'locales', 169 | Path(__file__).parent / 'locales', 170 | Path.cwd() / 'locales', 171 | ] 172 | 173 | for path in possible_paths: 174 | if path.exists() and path.is_dir(): 175 | return path 176 | 177 | # Create default 178 | default_path = Path(__file__).parent.parent.parent / 'locales' 179 | default_path.mkdir(parents=True, exist_ok=True) 180 | return default_path 181 | 182 | def _detect_locale(self) -> str: 183 | """Auto-detect user's preferred locale.""" 184 | # Environment variables 185 | for env_var in ['LANGUAGE', 'LANG', 'LC_ALL', 'LC_MESSAGES']: 186 | if lang := os.environ.get(env_var): 187 | return self._normalize_locale(lang.split(':')[0].split('.')[0]) 188 | 189 | # System locale 
190 | try: 191 | system_locale, _ = locale.getdefaultlocale() 192 | if system_locale: 193 | return self._normalize_locale(system_locale) 194 | except: 195 | pass 196 | 197 | # Windows-specific 198 | if sys.platform == 'win32': 199 | try: 200 | import ctypes 201 | lang_id = ctypes.windll.kernel32.GetUserDefaultUILanguage() 202 | locale_map = { 203 | 0x0804: 'zh-CN', 204 | 0x0404: 'zh-TW', 205 | 0x0409: 'en-US', 206 | 0x0411: 'ja-JP', 207 | 0x0412: 'ko-KR', 208 | } 209 | if lang_id in locale_map: 210 | return locale_map[lang_id] 211 | except: 212 | pass 213 | 214 | return self.default_locale 215 | 216 | def _normalize_locale(self, locale_code: str) -> str: 217 | """Normalize locale code to standard format.""" 218 | if not locale_code: 219 | return self.default_locale 220 | 221 | # Replace underscores 222 | locale_code = locale_code.replace('_', '-') 223 | 224 | # Add default region if needed 225 | if '-' not in locale_code: 226 | defaults = { 227 | 'zh': 'zh-CN', 228 | 'en': 'en-US', 229 | 'ja': 'ja-JP', 230 | 'ko': 'ko-KR', 231 | 'es': 'es-ES', 232 | 'fr': 'fr-FR', 233 | 'de': 'de-DE', 234 | 'it': 'it-IT', 235 | 'pt': 'pt-BR', 236 | 'ru': 'ru-RU', 237 | } 238 | locale_code = defaults.get(locale_code.lower(), locale_code) 239 | 240 | return locale_code 241 | 242 | @lru_cache(maxsize=10) 243 | def _load_translations(self, locale_code: str) -> Dict[str, Any]: 244 | """Load translations for a locale (cached).""" 245 | translations = {} 246 | 247 | # Try JSON file 248 | json_path = self.locales_dir / locale_code / 'messages.json' 249 | if json_path.exists(): 250 | try: 251 | with open(json_path, 'r', encoding='utf-8') as f: 252 | translations = json.load(f) 253 | except Exception as e: 254 | print(f"Warning: Failed to load {json_path}: {e}", file=sys.stderr) 255 | 256 | return translations 257 | 258 | def set_locale(self, locale_code: str): 259 | """Set the current locale.""" 260 | self._current_locale = self._normalize_locale(locale_code) 261 | # Pre-load translations 262 | self._translations[self._current_locale] = self._load_translations(self._current_locale) 263 | if self.fallback_locale != self._current_locale: 264 | self._translations[self.fallback_locale] = self._load_translations(self.fallback_locale) 265 | 266 | def _get_nested_value(self, data: Dict[str, Any], key: str) -> Any: 267 | """Get value from nested dict using dot notation.""" 268 | keys = key.split('.') 269 | value = data 270 | 271 | for k in keys: 272 | if isinstance(value, dict): 273 | value = value.get(k) 274 | if value is None: 275 | return None 276 | else: 277 | return None 278 | 279 | return value 280 | 281 | def _interpolate(self, template: str, variables: Dict[str, Any]) -> str: 282 | """Interpolate variables in template string.""" 283 | if not isinstance(template, str): 284 | return str(template) 285 | 286 | # Match {variable_name} or {variable_name:format} 287 | pattern = r'\{(\w+)(?::([^}]+))?\}' 288 | 289 | def replacer(match): 290 | var_name = match.group(1) 291 | format_spec = match.group(2) 292 | 293 | if var_name not in variables: 294 | return match.group(0) # Keep original if variable not found 295 | 296 | value = variables[var_name] 297 | 298 | # Apply format if specified 299 | if format_spec: 300 | try: 301 | if format_spec.endswith('f'): 302 | # Float formatting like {value:0.2f} 303 | decimals = int(format_spec[:-1].split('.')[-1]) if '.' 
in format_spec else 0 304 | return f"{float(value):.{decimals}f}" 305 | elif format_spec.isdigit(): 306 | # Padding like {value:5} 307 | return str(value).zfill(int(format_spec)) 308 | except: 309 | pass 310 | 311 | return str(value) 312 | 313 | return re.sub(pattern, replacer, template) 314 | 315 | def get(self, key: str, **variables) -> str: 316 | """ 317 | Get a translated string. 318 | 319 | Args: 320 | key: Translation key (supports dot notation) 321 | **variables: Variables for interpolation 322 | 323 | Returns: 324 | Translated and interpolated string 325 | """ 326 | # Handle pluralization 327 | if 'count' in variables: 328 | plural_key = self._get_plural_key(key, variables['count']) 329 | result = self._get_translation(plural_key) 330 | if result is not None and result != plural_key: 331 | return self._interpolate(result, variables) 332 | 333 | # Regular translation 334 | result = self._get_translation(key) 335 | 336 | # Fallback to key if not found 337 | if result is None: 338 | result = key 339 | 340 | # Interpolate variables 341 | if variables: 342 | result = self._interpolate(result, variables) 343 | 344 | return result 345 | 346 | def _get_plural_key(self, base_key: str, count: Union[int, float]) -> str: 347 | """Get the plural form key.""" 348 | plural_form = PluralRules.get_plural_form(self._current_locale, count) 349 | return f"{base_key}.{plural_form.value}" 350 | 351 | def _get_translation(self, key: str) -> Optional[str]: 352 | """Get translation from current or fallback locale.""" 353 | # Try current locale 354 | if self._current_locale in self._translations: 355 | value = self._get_nested_value(self._translations[self._current_locale], key) 356 | if value is not None: 357 | return value 358 | 359 | # Try fallback locale 360 | if self.fallback_locale in self._translations: 361 | value = self._get_nested_value(self._translations[self.fallback_locale], key) 362 | if value is not None: 363 | return value 364 | 365 | return None 366 | 367 | def format_duration(self, seconds: float) -> str: 368 | """Format duration in a localized way.""" 369 | hours = int(seconds // 3600) 370 | minutes = int((seconds % 3600) // 60) 371 | secs = seconds % 60 372 | 373 | if hours > 0: 374 | return self.get('time.duration_hours', hours=hours, minutes=minutes, seconds=secs) 375 | elif minutes > 0: 376 | return self.get('time.duration_minutes', minutes=minutes, seconds=secs) 377 | else: 378 | return self.get('time.duration_seconds', seconds=secs) 379 | 380 | def format_percentage(self, value: float, decimals: int = 1) -> str: 381 | """Format percentage in a localized way.""" 382 | return self.get('format.percentage', value=value * 100, decimals=decimals) 383 | 384 | def format_file_count(self, count: int) -> str: 385 | """Format file count with proper pluralization.""" 386 | return self.get('files.count', count=count) 387 | 388 | @property 389 | def current_locale(self) -> str: 390 | """Get current locale.""" 391 | return self._current_locale 392 | 393 | @property 394 | def available_locales(self) -> List[str]: 395 | """Get list of available locales.""" 396 | locales = [] 397 | if self.locales_dir.exists(): 398 | for path in self.locales_dir.iterdir(): 399 | if path.is_dir() and (path / 'messages.json').exists(): 400 | locales.append(path.name) 401 | return sorted(locales) 402 | 403 | def has_key(self, key: str) -> bool: 404 | """Check if a translation key exists.""" 405 | return self._get_translation(key) is not None 406 | 407 | def get_all_keys(self) -> List[str]: 408 | """Get all available 
translation keys.""" 409 | keys = set() 410 | 411 | def extract_keys(data: Dict[str, Any], prefix: str = ''): 412 | for key, value in data.items(): 413 | full_key = f"{prefix}.{key}" if prefix else key 414 | if isinstance(value, dict): 415 | extract_keys(value, full_key) 416 | else: 417 | keys.add(full_key) 418 | 419 | for locale_code in [self._current_locale, self.fallback_locale]: 420 | if locale_code in self._translations: 421 | extract_keys(self._translations[locale_code]) 422 | 423 | return sorted(keys) 424 | 425 | 426 | # Global instance 427 | _i18n: Optional[ModernI18n] = None 428 | 429 | 430 | def init(locales_dir: Optional[Union[str, Path]] = None, 431 | default_locale: str = 'zh-CN', 432 | fallback_locale: str = 'en-US') -> ModernI18n: 433 | """Initialize the global i18n instance.""" 434 | global _i18n 435 | _i18n = ModernI18n(locales_dir, default_locale, fallback_locale) 436 | return _i18n 437 | 438 | 439 | def get_i18n() -> ModernI18n: 440 | """Get the global i18n instance.""" 441 | global _i18n 442 | if _i18n is None: 443 | _i18n = init() 444 | return _i18n 445 | 446 | 447 | # Convenience functions 448 | def _(key: str, **variables) -> str: 449 | """Get translated string.""" 450 | return get_i18n().get(key, **variables) 451 | 452 | 453 | def set_locale(locale_code: str): 454 | """Set current locale.""" 455 | get_i18n().set_locale(locale_code) 456 | 457 | 458 | def get_locale() -> str: 459 | """Get current locale.""" 460 | return get_i18n().current_locale 461 | 462 | 463 | def available_locales() -> List[str]: 464 | """Get available locales.""" 465 | return get_i18n().available_locales 466 | 467 | 468 | # Format helpers 469 | format_duration = lambda s: get_i18n().format_duration(s) 470 | format_percentage = lambda v, d=1: get_i18n().format_percentage(v, d) 471 | format_file_count = lambda c: get_i18n().format_file_count(c) 472 | 473 | # Auto-initialize 474 | init() -------------------------------------------------------------------------------- /download_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Download required model files from Hugging Face repositories 4 | ONNX VAD model is always downloaded, additional models can be specified via HuggingFace repo path 5 | """ 6 | 7 | import os 8 | import sys 9 | import json 10 | import shutil 11 | import argparse 12 | import requests 13 | from pathlib import Path 14 | from typing import List, Dict, Any, Optional 15 | from urllib.parse import urljoin 16 | 17 | # Detect if the environment supports Unicode/emoji 18 | def can_use_unicode(): 19 | """Check if the current environment supports Unicode output""" 20 | # If we're in a CI environment, be conservative and use ASCII 21 | if os.environ.get('CI') or os.environ.get('GITHUB_ACTIONS'): 22 | # CI environments often have encoding issues, especially on Windows 23 | return False 24 | 25 | # Check if UTF-8 is explicitly set 26 | if os.environ.get('PYTHONIOENCODING', '').lower().startswith('utf'): 27 | return True 28 | 29 | if sys.platform == 'win32': 30 | # Windows console often doesn't support Unicode well 31 | # Try to enable UTF-8 on Windows 32 | try: 33 | import codecs 34 | # Test if we can encode an emoji 35 | test_emoji = "✓" 36 | test_emoji.encode(sys.stdout.encoding or 'utf-8') 37 | return True 38 | except (UnicodeEncodeError, LookupError): 39 | return False 40 | 41 | # On other platforms (Linux, Mac), usually Unicode works 42 | return True 43 | 44 | # Define symbols based on Unicode support 45 | 
USE_UNICODE = can_use_unicode() 46 | 47 | if USE_UNICODE: 48 | # Unicode/emoji symbols 49 | CHECKMARK = "✓" 50 | CROSS = "✗" 51 | DOWNLOAD = "⬇" 52 | PACKAGE = "📦" 53 | SEARCH = "🔍" 54 | SUCCESS = "✅" 55 | ERROR = "❌" 56 | WARNING = "⚠" 57 | else: 58 | # ASCII fallback symbols 59 | CHECKMARK = "[OK]" 60 | CROSS = "[X]" 61 | DOWNLOAD = "[DOWNLOADING]" 62 | PACKAGE = "[PACKAGE]" 63 | SEARCH = "[SEARCH]" 64 | SUCCESS = "[SUCCESS]" 65 | ERROR = "[ERROR]" 66 | WARNING = "[WARNING]" 67 | 68 | # Force UTF-8 encoding on stdout/stderr if possible 69 | if sys.platform == 'win32' and not USE_UNICODE: 70 | # On Windows CI, try to set UTF-8 mode 71 | try: 72 | import io 73 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') 74 | sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') 75 | except: 76 | pass # If this fails, we'll use ASCII symbols anyway 77 | 78 | def download_file(url: str, dest_path: Path, session: requests.Session = None) -> bool: 79 | """Download a file with progress indicator""" 80 | if session is None: 81 | session = requests.Session() 82 | 83 | try: 84 | # Create parent directory if needed 85 | dest_path.parent.mkdir(parents=True, exist_ok=True) 86 | 87 | # Skip if file already exists 88 | if dest_path.exists(): 89 | print(f" {CHECKMARK} {dest_path.name} already exists") 90 | return True 91 | 92 | print(f" {DOWNLOAD} Downloading {dest_path.name}...", end=" ") 93 | 94 | response = session.get(url, stream=True, timeout=30) 95 | response.raise_for_status() 96 | 97 | # Get file size 98 | total_size = int(response.headers.get('content-length', 0)) 99 | 100 | # Download with progress 101 | downloaded = 0 102 | last_reported_progress = -1 103 | with open(dest_path, 'wb') as f: 104 | for chunk in response.iter_content(chunk_size=8192): 105 | if chunk: 106 | f.write(chunk) 107 | downloaded += len(chunk) 108 | if total_size > 0: 109 | progress = downloaded / total_size * 100 110 | # Only update display every 10% 111 | progress_milestone = int(progress // 10) * 10 112 | if progress_milestone > last_reported_progress: 113 | print(f"\r {DOWNLOAD} Downloading {dest_path.name}... 
{progress_milestone}%", end="") 114 | last_reported_progress = progress_milestone 115 | 116 | print(f"\r {CHECKMARK} Downloaded {dest_path.name} ({downloaded / (1024*1024):.1f} MB)") 117 | return True 118 | 119 | except Exception as e: 120 | print(f"\r {CROSS} Failed to download {dest_path.name}: {e}") 121 | if dest_path.exists(): 122 | dest_path.unlink() 123 | return False 124 | 125 | def get_hf_api_files(repo_id: str) -> List[str]: 126 | """Get list of files from Hugging Face repo using API""" 127 | api_url = f"https://huggingface.co/api/models/{repo_id}/tree/main" 128 | 129 | try: 130 | response = requests.get(api_url, timeout=10) 131 | response.raise_for_status() 132 | files = response.json() 133 | return [f['path'] for f in files if f['type'] == 'file'] 134 | except Exception as e: 135 | print(f"Warning: Could not fetch file list from API: {e}") 136 | return [] 137 | 138 | def download_hf_model(repo_id: str, target_dir: Optional[str] = None): 139 | """Download model files from any HuggingFace repository""" 140 | base_url = f"https://huggingface.co/{repo_id}/resolve/main/" 141 | 142 | # Determine target directory 143 | if target_dir: 144 | models_dir = Path("models") / target_dir 145 | else: 146 | # Use repository name as default subdirectory 147 | repo_name = repo_id.split('/')[-1] 148 | models_dir = Path("models") / repo_name 149 | 150 | print(f"\n{PACKAGE} Downloading model from {repo_id}") 151 | print(f" Target directory: {models_dir}") 152 | 153 | # Essential file extensions to download for transformer/whisper models 154 | essential_extensions = ['.json', '.bin', '.txt', '.onnx', '.safetensors', '.model'] 155 | 156 | # Try to get full file list from API 157 | api_files = get_hf_api_files(repo_id) 158 | if api_files: 159 | # Filter for essential files 160 | files_to_download = [f for f in api_files if any( 161 | f.endswith(ext) for ext in essential_extensions 162 | )] 163 | print(f" Found {len(files_to_download)} files in repository") 164 | else: 165 | # If API fails, try common file names 166 | files_to_download = [ 167 | "config.json", 168 | "model.bin", 169 | "pytorch_model.bin", 170 | "model.safetensors", 171 | "preprocessor_config.json", 172 | "tokenizer.json", 173 | "tokenizer_config.json", 174 | "vocabulary.json", 175 | "vocab.json", 176 | "special_tokens_map.json", 177 | "merges.txt", 178 | ] 179 | print(f" Using common file list (API unavailable)") 180 | 181 | session = requests.Session() 182 | session.headers.update({ 183 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 184 | }) 185 | 186 | success_count = 0 187 | for filename in files_to_download: 188 | url = urljoin(base_url, filename) 189 | dest_path = models_dir / filename 190 | if download_file(url, dest_path, session): 191 | success_count += 1 192 | 193 | print(f" {CHECKMARK} Downloaded {success_count}/{len(files_to_download)} files") 194 | return success_count > 0 195 | 196 | def download_vad_model(): 197 | """Download VAD ONNX model files (always required)""" 198 | repo_id = "TransWithAI/Whisper-Vad-EncDec-ASMR-onnx" 199 | base_url = f"https://huggingface.co/{repo_id}/resolve/main/" 200 | models_dir = Path("models") 201 | 202 | print(f"\n{PACKAGE} Downloading VAD ONNX model from {repo_id}") 203 | 204 | # Files to download (renamed to match existing structure) 205 | files = [ 206 | ("model.onnx", "whisper_vad.onnx"), # Download as model.onnx, save as whisper_vad.onnx 207 | ("model_metadata.json", "whisper_vad_metadata.json"), # Download as model_metadata.json, save as 
whisper_vad_metadata.json 208 | ] 209 | 210 | session = requests.Session() 211 | session.headers.update({ 212 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 213 | }) 214 | 215 | success_count = 0 216 | for source_name, dest_name in files: 217 | url = urljoin(base_url, source_name) 218 | dest_path = models_dir / dest_name 219 | if download_file(url, dest_path, session): 220 | success_count += 1 221 | 222 | print(f" {CHECKMARK} Downloaded {success_count}/{len(files)} files") 223 | return success_count == len(files) 224 | 225 | def download_whisper_base_for_feature_extractor(): 226 | """Download whisper-base model files specifically for feature extractor (offline usage)""" 227 | repo_id = "openai/whisper-base" 228 | models_dir = Path("models") / "whisper-base" 229 | base_url = f"https://huggingface.co/{repo_id}/resolve/main/" 230 | 231 | print(f"\n{PACKAGE} Downloading whisper-base for feature extractor (offline usage)") 232 | 233 | # Check if files already exist from main models folder 234 | existing_models_dir = Path("models") 235 | if existing_models_dir.exists(): 236 | # Files we can copy from existing models folder if available 237 | files_to_copy = [ 238 | "preprocessor_config.json", 239 | "config.json", 240 | "tokenizer.json", 241 | "vocab.json", 242 | ] 243 | 244 | copied = 0 245 | models_dir.mkdir(parents=True, exist_ok=True) 246 | for filename in files_to_copy: 247 | src = existing_models_dir / filename 248 | dest = models_dir / filename 249 | if src.exists() and not dest.exists(): 250 | shutil.copy2(src, dest) 251 | print(f" {CHECKMARK} Copied {filename} from existing models folder") 252 | copied += 1 253 | elif dest.exists(): 254 | print(f" {CHECKMARK} {filename} already exists") 255 | copied += 1 256 | 257 | if copied >= 2: # At minimum we need preprocessor_config.json and config.json 258 | print(f" {CHECKMARK} Used existing files for whisper-base") 259 | return True 260 | 261 | # Download ONLY the specific files needed for feature extractor 262 | # We don't need model weights (.bin, .safetensors) for feature extraction 263 | required_files = [ 264 | "preprocessor_config.json", # Required for feature extractor 265 | "config.json", # Required for configuration 266 | "tokenizer.json", # Optional but useful for tokenization 267 | "vocab.json", # Optional but useful for vocabulary 268 | ] 269 | 270 | models_dir.mkdir(parents=True, exist_ok=True) 271 | print(f" Downloading feature extractor files from {repo_id}...") 272 | 273 | session = requests.Session() 274 | session.headers.update({ 275 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 276 | }) 277 | 278 | success_count = 0 279 | for filename in required_files: 280 | url = urljoin(base_url, filename) 281 | dest_path = models_dir / filename 282 | if download_file(url, dest_path, session): 283 | success_count += 1 284 | 285 | print(f" {CHECKMARK} Downloaded {success_count}/{len(required_files)} feature extractor files") 286 | return success_count >= 2 # At minimum we need the two required files 287 | 288 | def verify_whisper_base_feature_extractor(): 289 | """Verify that whisper-base feature extractor files exist""" 290 | models_dir = Path("models") / "whisper-base" 291 | 292 | required_files = [ 293 | ("preprocessor_config.json", "Feature extractor config"), 294 | ("config.json", "Model configuration"), 295 | ] 296 | 297 | optional_files = [ 298 | ("tokenizer.json", "Tokenizer"), 299 | ("vocab.json", "Vocabulary"), 300 | ] 301 | 302 | if not models_dir.exists(): 303 | 
return False 304 | 305 | print(f"\n{SEARCH} Verifying whisper-base feature extractor files...") 306 | all_required_present = True 307 | 308 | for filename, description in required_files: 309 | filepath = models_dir / filename 310 | if filepath.exists(): 311 | size_kb = filepath.stat().st_size / 1024 312 | print(f" {CHECKMARK} {filename} ({size_kb:.1f} KB)") 313 | else: 314 | print(f" {CROSS} {filename} missing - {description}") 315 | all_required_present = False 316 | 317 | for filename, description in optional_files: 318 | filepath = models_dir / filename 319 | if filepath.exists(): 320 | size_kb = filepath.stat().st_size / 1024 321 | print(f" {CHECKMARK} {filename} ({size_kb:.1f} KB) - optional") 322 | 323 | return all_required_present 324 | 325 | def verify_vad_model(): 326 | """Verify that required VAD model files exist""" 327 | models_dir = Path("models") 328 | 329 | required_files = [ 330 | ("whisper_vad.onnx", "VAD ONNX model"), 331 | ("whisper_vad_metadata.json", "VAD metadata"), 332 | ] 333 | 334 | print(f"\n{SEARCH} Verifying VAD model files...") 335 | all_present = True 336 | 337 | for filename, description in required_files: 338 | filepath = models_dir / filename 339 | if filepath.exists(): 340 | size_mb = filepath.stat().st_size / (1024 * 1024) 341 | print(f" {CHECKMARK} {filename} ({size_mb:.1f} MB)") 342 | else: 343 | print(f" {CROSS} {filename} missing - {description}") 344 | all_present = False 345 | 346 | return all_present 347 | 348 | def verify_hf_model(repo_id: str, target_dir: Optional[str] = None): 349 | """Verify that HuggingFace model files exist""" 350 | if target_dir: 351 | models_dir = Path("models") / target_dir 352 | else: 353 | repo_name = repo_id.split('/')[-1] 354 | models_dir = Path("models") / repo_name 355 | 356 | if not models_dir.exists(): 357 | print(f"\n{WARNING} Model directory {models_dir} does not exist") 358 | return False 359 | 360 | print(f"\n{SEARCH} Verifying model files in {models_dir}...") 361 | 362 | # Check for common model files 363 | common_files = ["config.json", "model.bin", "pytorch_model.bin", "model.safetensors", "model.onnx"] 364 | found_files = [] 365 | 366 | for file in models_dir.iterdir(): 367 | if file.is_file(): 368 | size_mb = file.stat().st_size / (1024 * 1024) 369 | print(f" {CHECKMARK} {file.name} ({size_mb:.1f} MB)") 370 | found_files.append(file.name) 371 | 372 | # Check if at least one model file exists 373 | has_model = any(f in found_files for f in common_files) 374 | 375 | if not has_model and found_files: 376 | print(f" {WARNING} Warning: No common model files found, but other files exist") 377 | elif not found_files: 378 | print(f" {CROSS} No files found in model directory") 379 | return False 380 | 381 | return True 382 | 383 | def main(): 384 | """Main download function""" 385 | parser = argparse.ArgumentParser( 386 | description="Model Downloader for Faster Whisper Custom VAD", 387 | formatter_class=argparse.RawDescriptionHelpFormatter, 388 | epilog=""" 389 | Examples: 390 | %(prog)s 391 | # Download VAD model and whisper-base (both required for offline usage) 392 | 393 | %(prog)s --skip-whisper-base 394 | # Download only VAD model, skip whisper-base (not recommended) 395 | 396 | %(prog)s --hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2 397 | # Download VAD, whisper-base, and Chickenrice Whisper model 398 | 399 | %(prog)s --hf-model openai/whisper-large-v3 --target-dir whisper-v3 400 | # Download VAD, whisper-base, and Whisper v3 to specific directory 401 | 402 | %(prog)s --force --hf-model 
myusername/my-custom-model 403 | # Force re-download everything including VAD, whisper-base, and custom model 404 | """ 405 | ) 406 | 407 | parser.add_argument( 408 | '--hf-model', 409 | type=str, 410 | help='HuggingFace repository path to download (e.g., "openai/whisper-large-v3")' 411 | ) 412 | 413 | parser.add_argument( 414 | '--target-dir', 415 | type=str, 416 | help='Target subdirectory name in models/ for the HuggingFace model (defaults to repo name)' 417 | ) 418 | 419 | parser.add_argument( 420 | '--force', 421 | action='store_true', 422 | help='Force re-download even if models already exist' 423 | ) 424 | 425 | parser.add_argument( 426 | '--skip-vad', 427 | action='store_true', 428 | help='Skip downloading VAD model (not recommended, for testing only)' 429 | ) 430 | 431 | parser.add_argument( 432 | '--skip-whisper-base', 433 | action='store_true', 434 | help='Skip downloading whisper-base model for feature extractor (not recommended)' 435 | ) 436 | 437 | args = parser.parse_args() 438 | 439 | print("=" * 60) 440 | print("Model Downloader for Faster Whisper Custom VAD") 441 | print("=" * 60) 442 | 443 | models_dir = Path("models") 444 | models_dir.mkdir(exist_ok=True) 445 | 446 | # Check if VAD model already exists 447 | if not args.force and not args.skip_vad and verify_vad_model(): 448 | print(f"\n{CHECKMARK} VAD model files already present") 449 | vad_exists = True 450 | else: 451 | vad_exists = False 452 | 453 | # Check if whisper-base feature extractor already exists 454 | whisper_base_exists = False 455 | if not args.skip_whisper_base and not args.force: 456 | if verify_whisper_base_feature_extractor(): 457 | print(f"\n{CHECKMARK} Whisper-base feature extractor files already present") 458 | whisper_base_exists = True 459 | 460 | # Check if HF model already exists (if specified) 461 | hf_exists = False 462 | if args.hf_model and not args.force: 463 | if verify_hf_model(args.hf_model, args.target_dir): 464 | print(f"\n{CHECKMARK} Model {args.hf_model} already present") 465 | hf_exists = True 466 | 467 | # If everything exists and no force flag, ask user 468 | all_exists = vad_exists and (not args.hf_model or hf_exists) and (args.skip_whisper_base or whisper_base_exists) 469 | if all_exists and not args.force: 470 | response = input("\nAll required models are present. Re-download? 
(y/N): ").strip().lower() 471 | if response != 'y': 472 | print("Skipping download.") 473 | return 0 474 | 475 | # Download models 476 | success = True 477 | 478 | # Always download VAD model (unless explicitly skipped) 479 | if not args.skip_vad: 480 | if not download_vad_model(): 481 | print(f"{WARNING} Error: VAD model is required and could not be downloaded") 482 | success = False 483 | else: 484 | print(f"\n{WARNING} Skipping VAD model download (not recommended)") 485 | 486 | # Download whisper-base feature extractor (unless explicitly skipped) 487 | if not args.skip_whisper_base: 488 | if not download_whisper_base_for_feature_extractor(): 489 | print(f"{WARNING} Warning: Whisper-base feature extractor could not be downloaded completely") 490 | # Don't fail completely if feature extractor download has issues 491 | else: 492 | print(f"\n{WARNING} Skipping whisper-base download (not recommended for offline usage)") 493 | 494 | # Download HuggingFace model if specified 495 | if args.hf_model: 496 | if not download_hf_model(args.hf_model, args.target_dir): 497 | print(f"{WARNING} Warning: Model {args.hf_model} could not be downloaded completely") 498 | # Don't fail completely if HF model download has issues 499 | 500 | # Final verification 501 | print("\n" + "=" * 60) 502 | 503 | # Verify VAD model 504 | if not args.skip_vad: 505 | if verify_vad_model(): 506 | print(f"\n{SUCCESS} VAD model downloaded successfully!") 507 | else: 508 | print(f"\n{ERROR} Critical: VAD model is missing. Cannot proceed without it.") 509 | return 1 510 | 511 | # Verify whisper-base feature extractor (unless skipped) 512 | if not args.skip_whisper_base: 513 | if verify_whisper_base_feature_extractor(): 514 | print(f"\n{SUCCESS} Whisper-base feature extractor downloaded successfully!") 515 | else: 516 | print(f"\n{WARNING} Warning: Some whisper-base feature extractor files may be missing.") 517 | 518 | # Verify HF model if specified 519 | if args.hf_model: 520 | if verify_hf_model(args.hf_model, args.target_dir): 521 | print(f"\n{SUCCESS} Model {args.hf_model} downloaded successfully!") 522 | else: 523 | print(f"\n{WARNING} Warning: Some files from {args.hf_model} may be missing.") 524 | 525 | return 0 526 | 527 | if __name__ == "__main__": 528 | try: 529 | sys.exit(main()) 530 | except KeyboardInterrupt: 531 | print("\n\nDownload cancelled by user.") 532 | sys.exit(1) 533 | except Exception as e: 534 | print(f"\n{ERROR} Error: {e}") 535 | sys.exit(1) -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/vad_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | VAD Model Manager - Manages different VAD model implementations 3 | """ 4 | 5 | import json 6 | import logging 7 | import os 8 | import warnings 9 | from pathlib import Path 10 | from typing import List, Dict, Any, Optional, Protocol, Callable 11 | import numpy as np 12 | from dataclasses import dataclass 13 | 14 | # Import modern i18n module for translations 15 | from . 
import i18n_modern as i18n 16 | 17 | # Convenience imports 18 | _ = i18n._ 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | @dataclass 24 | class VadConfig: 25 | """Configuration for VAD models""" 26 | default_model: str = "whisper_vad" 27 | auto_inject: bool = False 28 | ttl: int = 3600 # Cache TTL in seconds 29 | 30 | # VAD parameters 31 | threshold: float = 0.5 32 | neg_threshold: Optional[float] = None 33 | min_speech_duration_ms: int = 250 34 | max_speech_duration_s: float = float('inf') 35 | min_silence_duration_ms: int = 2000 36 | speech_pad_ms: int = 400 37 | 38 | # ONNX-specific parameters 39 | onnx_model_path: Optional[str] = None 40 | onnx_metadata_path: Optional[str] = None 41 | whisper_model_name: str = "openai/whisper-base" 42 | frame_duration_ms: int = 20 43 | chunk_duration_ms: int = 30000 44 | force_cpu: bool = False 45 | num_threads: int = 1 46 | 47 | 48 | class VadModel(Protocol): 49 | """Protocol for VAD models""" 50 | 51 | def get_speech_timestamps( 52 | self, 53 | audio: np.ndarray, 54 | sampling_rate: int = 16000, 55 | **kwargs 56 | ) -> List[Dict[str, Any]]: 57 | """Get speech timestamps from audio""" 58 | ... 59 | 60 | 61 | class WhisperVADOnnxWrapper: 62 | """ONNX wrapper for Whisper-based VAD model following Silero's architecture.""" 63 | 64 | def __init__( 65 | self, 66 | model_path: str, 67 | metadata_path: Optional[str] = None, 68 | force_cpu: bool = False, 69 | num_threads: int = 1, 70 | progress_callback: Optional[Callable[[int, int, str], None]] = None, 71 | ): 72 | """Initialize ONNX model wrapper. 73 | 74 | Args: 75 | model_path: Path to ONNX model file 76 | metadata_path: Path to metadata JSON file (optional) 77 | force_cpu: Force CPU execution even if GPU is available 78 | num_threads: Number of CPU threads for inference 79 | progress_callback: Optional callback for progress tracking (chunk_idx, total_chunks, device) 80 | """ 81 | try: 82 | import onnxruntime as ort 83 | except ImportError: 84 | raise ImportError(_("vad.onnx_not_installed")) 85 | 86 | try: 87 | from transformers import WhisperFeatureExtractor 88 | except ImportError: 89 | raise ImportError(_("vad.transformers_not_installed")) 90 | 91 | self.model_path = model_path 92 | self.progress_callback = progress_callback 93 | self.device = "CPU" # Will be updated based on actual provider 94 | 95 | # Load metadata 96 | if metadata_path is None: 97 | metadata_path = model_path.replace('.onnx', '_metadata.json') 98 | 99 | if os.path.exists(metadata_path): 100 | with open(metadata_path, 'r') as f: 101 | self.metadata = json.load(f) 102 | else: 103 | warnings.warn("No metadata file found. 
Using default values.") 104 | self.metadata = { 105 | 'whisper_model_name': 'openai/whisper-base', 106 | 'frame_duration_ms': 20, 107 | 'total_duration_ms': 30000, 108 | } 109 | 110 | # Initialize feature extractor - try local folder first for offline usage 111 | local_whisper_base_path = Path("models/whisper-base") 112 | if local_whisper_base_path.exists() and (local_whisper_base_path / "preprocessor_config.json").exists(): 113 | # Load from local folder for offline usage 114 | try: 115 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained( 116 | str(local_whisper_base_path) 117 | ) 118 | logger.info(_("vad.feature_extractor_loaded", path=local_whisper_base_path)) 119 | except Exception as e: 120 | warnings.warn(f"Failed to load from local folder, trying online: {e}") 121 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained( 122 | self.metadata['whisper_model_name'] 123 | ) 124 | else: 125 | # Try to load from HuggingFace (requires internet) 126 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained( 127 | self.metadata['whisper_model_name'] 128 | ) 129 | 130 | # Set up ONNX Runtime session 131 | opts = ort.SessionOptions() 132 | 133 | # Determine execution provider first 134 | providers = ['CPUExecutionProvider'] 135 | use_gpu = not force_cpu and 'CUDAExecutionProvider' in ort.get_available_providers() 136 | 137 | if use_gpu: 138 | providers.insert(0, 'CUDAExecutionProvider') 139 | self.device = "GPU (CUDA)" 140 | # For GPU, use the provided num_threads or default 141 | opts.inter_op_num_threads = num_threads 142 | opts.intra_op_num_threads = num_threads 143 | else: 144 | self.device = "CPU" 145 | # For CPU, use half of available processors if num_threads is default (1) 146 | import multiprocessing 147 | if num_threads == 1: 148 | # Use half of CPU count for optimal performance 149 | optimal_threads = max(1, multiprocessing.cpu_count() // 2) 150 | opts.inter_op_num_threads = optimal_threads 151 | opts.intra_op_num_threads = optimal_threads 152 | logger.info(_("vad.auto_configured", threads=optimal_threads, 153 | total=multiprocessing.cpu_count())) 154 | else: 155 | # Use user-specified thread count 156 | opts.inter_op_num_threads = num_threads 157 | opts.intra_op_num_threads = num_threads 158 | 159 | self.session = ort.InferenceSession(model_path, providers=providers, sess_options=opts) 160 | 161 | # Get input/output info 162 | self.input_name = self.session.get_inputs()[0].name 163 | self.output_names = [out.name for out in self.session.get_outputs()] 164 | 165 | # Model parameters 166 | self.sample_rate = 16000 # Whisper uses 16kHz 167 | self.frame_duration_ms = self.metadata.get('frame_duration_ms', 20) 168 | self.chunk_duration_ms = self.metadata.get('total_duration_ms', 30000) 169 | self.chunk_samples = int(self.chunk_duration_ms * self.sample_rate / 1000) 170 | self.frames_per_chunk = int(self.chunk_duration_ms / self.frame_duration_ms) 171 | 172 | # Initialize state 173 | self.reset_states() 174 | 175 | logger.info(_("vad.model_loaded", path=model_path)) 176 | logger.info(_("vad.device", device=self.device)) 177 | logger.info(_("vad.providers", providers=providers)) 178 | logger.info(_("vad.chunk_duration", duration=self.chunk_duration_ms)) 179 | logger.info(_("vad.frame_duration", duration=self.frame_duration_ms)) 180 | 181 | def reset_states(self): 182 | """Reset internal states for new audio stream.""" 183 | self._context = None 184 | self._last_chunk = None 185 | 186 | def _validate_input(self, audio: np.ndarray, sr: int) -> np.ndarray: 
187 | """Validate and preprocess input audio. 188 | 189 | Args: 190 | audio: Input audio array 191 | sr: Sample rate 192 | 193 | Returns: 194 | Preprocessed audio at 16kHz 195 | """ 196 | if audio.ndim > 1: 197 | # Convert to mono if multi-channel 198 | audio = audio.mean(axis=0 if audio.shape[0] > audio.shape[1] else 1) 199 | 200 | # Resample if needed 201 | if sr != self.sample_rate: 202 | try: 203 | import librosa 204 | audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate) 205 | except ImportError: 206 | logger.warning(_("vad.librosa_not_installed")) 207 | # Basic downsampling if librosa not available 208 | if sr > self.sample_rate: 209 | # Simple downsampling 210 | ratio = sr // self.sample_rate 211 | audio = audio[::ratio] 212 | 213 | return audio 214 | 215 | def __call__(self, audio_chunk: np.ndarray, sr: int = 16000) -> np.ndarray: 216 | """Process a single audio chunk. 217 | 218 | Args: 219 | audio_chunk: Audio chunk to process 220 | sr: Sample rate 221 | 222 | Returns: 223 | Frame-level speech probabilities 224 | """ 225 | # Validate input 226 | audio_chunk = self._validate_input(audio_chunk, sr) 227 | 228 | # Ensure chunk is correct size 229 | if len(audio_chunk) < self.chunk_samples: 230 | audio_chunk = np.pad( 231 | audio_chunk, 232 | (0, self.chunk_samples - len(audio_chunk)), 233 | mode='constant' 234 | ) 235 | elif len(audio_chunk) > self.chunk_samples: 236 | audio_chunk = audio_chunk[:self.chunk_samples] 237 | 238 | # Extract features 239 | inputs = self.feature_extractor( 240 | audio_chunk, 241 | sampling_rate=self.sample_rate, 242 | return_tensors="np" 243 | ) 244 | 245 | # Run inference 246 | outputs = self.session.run( 247 | self.output_names, 248 | {self.input_name: inputs.input_features} 249 | ) 250 | 251 | # Apply sigmoid to get probabilities 252 | frame_logits = outputs[0][0] # Remove batch dimension 253 | frame_probs = 1 / (1 + np.exp(-frame_logits)) 254 | 255 | return frame_probs 256 | 257 | def audio_forward(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray: 258 | """Process full audio file in chunks (Silero-style). 
259 | 260 | Args: 261 | audio: Full audio array 262 | sr: Sample rate 263 | 264 | Returns: 265 | Concatenated frame probabilities for entire audio 266 | """ 267 | audio = self._validate_input(audio, sr) 268 | self.reset_states() 269 | 270 | all_probs = [] 271 | 272 | # Calculate total number of chunks 273 | total_chunks = (len(audio) + self.chunk_samples - 1) // self.chunk_samples 274 | 275 | # Log initial processing info 276 | logger.info(_("vad.starting", device=self.device)) 277 | logger.info(_("vad.total_samples", samples=len(audio))) 278 | logger.info(_("vad.chunk_size", samples=self.chunk_samples, duration=self.chunk_duration_ms)) 279 | logger.info(_("vad.total_chunks", chunks=total_chunks)) 280 | 281 | # Process in chunks 282 | for chunk_idx, i in enumerate(range(0, len(audio), self.chunk_samples)): 283 | chunk = audio[i:i + self.chunk_samples] 284 | 285 | # Pad last chunk if needed 286 | if len(chunk) < self.chunk_samples: 287 | chunk = np.pad(chunk, (0, self.chunk_samples - len(chunk)), mode='constant') 288 | 289 | # Report progress 290 | if self.progress_callback: 291 | self.progress_callback(chunk_idx + 1, total_chunks, self.device) 292 | 293 | # Log chunk progress 294 | progress_pct = ((chunk_idx + 1) / total_chunks) * 100 295 | logger.debug(_("vad.processing_chunk", current=chunk_idx + 1, total=total_chunks, 296 | percent=progress_pct, device=self.device)) 297 | 298 | # Get predictions for chunk 299 | chunk_probs = self.__call__(chunk, self.sample_rate) 300 | all_probs.append(chunk_probs) 301 | 302 | logger.info(_("vad.completed", chunks=total_chunks, device=self.device)) 303 | 304 | # Concatenate all probabilities 305 | if all_probs: 306 | return np.concatenate(all_probs) 307 | return np.array([]) 308 | 309 | 310 | def get_speech_timestamps_onnx( 311 | audio: np.ndarray, 312 | model, 313 | threshold: float = 0.5, 314 | sampling_rate: int = 16000, 315 | min_speech_duration_ms: int = 250, 316 | max_speech_duration_s: float = float('inf'), 317 | min_silence_duration_ms: int = 100, 318 | speech_pad_ms: int = 30, 319 | return_seconds: bool = True, 320 | neg_threshold: Optional[float] = None, 321 | progress_tracking_callback: Optional[Callable[[float], None]] = None, 322 | ) -> List[Dict[str, float]]: 323 | """Extract speech timestamps from audio using Silero-style processing. 
324 | 325 | This function implements Silero VAD's approach with: 326 | - Dual threshold (positive and negative) for hysteresis 327 | - Proper segment padding 328 | - Minimum duration filtering 329 | - Maximum duration handling with intelligent splitting 330 | 331 | Args: 332 | audio: Input audio array 333 | model: VAD model (WhisperVADOnnxWrapper instance) 334 | threshold: Speech threshold (default: 0.5) 335 | sampling_rate: Audio sample rate 336 | min_speech_duration_ms: Minimum speech segment duration 337 | max_speech_duration_s: Maximum speech segment duration 338 | min_silence_duration_ms: Minimum silence to split segments 339 | speech_pad_ms: Padding to add to speech segments 340 | return_seconds: Return times in seconds vs samples 341 | neg_threshold: Negative threshold for hysteresis (default: threshold - 0.15) 342 | progress_tracking_callback: Progress callback function 343 | 344 | Returns: 345 | List of speech segments with start/end times 346 | """ 347 | # Audio should already be numpy array 348 | 349 | # Validate audio (convert to mono by averaging over the channel axis, i.e. the smaller dimension) 350 | if audio.ndim > 1: 351 | audio = audio.mean(axis=0 if audio.shape[0] < audio.shape[1] else 1) 352 | 353 | # Get frame probabilities for entire audio 354 | model.reset_states() 355 | speech_probs = model.audio_forward(audio, sampling_rate) 356 | 357 | # Calculate frame parameters 358 | frame_duration_ms = model.frame_duration_ms 359 | frame_samples = int(sampling_rate * frame_duration_ms / 1000) 360 | 361 | # Convert durations to frames 362 | min_speech_frames = int(min_speech_duration_ms / frame_duration_ms) 363 | min_silence_frames = int(min_silence_duration_ms / frame_duration_ms) 364 | speech_pad_frames = int(speech_pad_ms / frame_duration_ms) 365 | max_speech_frames = int(max_speech_duration_s * 1000 / frame_duration_ms) if max_speech_duration_s != float('inf') else len(speech_probs) 366 | 367 | # Set negative threshold for hysteresis 368 | if neg_threshold is None: 369 | neg_threshold = max(threshold - 0.15, 0.01) 370 | 371 | # Track speech segments 372 | triggered = False 373 | speeches = [] 374 | current_speech = {} 375 | current_probs = [] # Track probabilities for current segment 376 | temp_end = 0 377 | 378 | # Process each frame 379 | for i, speech_prob in enumerate(speech_probs): 380 | # Report progress 381 | if progress_tracking_callback: 382 | progress = (i + 1) / len(speech_probs) * 100 383 | progress_tracking_callback(progress) 384 | 385 | # Track probabilities for current segment 386 | if triggered: 387 | current_probs.append(float(speech_prob)) 388 | 389 | # Speech onset detection 390 | if speech_prob >= threshold and not triggered: 391 | triggered = True 392 | current_speech['start'] = i 393 | current_probs = [float(speech_prob)] # Start tracking probabilities 394 | continue 395 | 396 | # Check for maximum speech duration 397 | if triggered and 'start' in current_speech: 398 | duration = i - current_speech['start'] 399 | if duration > max_speech_frames: 400 | # Force end segment at max duration 401 | current_speech['end'] = current_speech['start'] + max_speech_frames 402 | # Calculate probability statistics for segment 403 | if current_probs: 404 | current_speech['probability'] = np.mean(current_probs) 405 | speeches.append(current_speech) 406 | current_speech = {} 407 | current_probs = [] 408 | triggered = False 409 | temp_end = 0 410 | continue 411 | 412 | # Speech offset detection with hysteresis 413 | if speech_prob < neg_threshold and triggered: 414 | if not temp_end: 415 | temp_end = i 416 | 417 | # Check if silence is long enough
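# Illustrative arithmetic for this gate (values assumed from the VadConfig defaults
# earlier in this module, not measured from any particular file): threshold=0.5 gives
# neg_threshold = 0.5 - 0.15 = 0.35, and with 20 ms frames min_silence_duration_ms=2000
# maps to min_silence_frames = 2000 / 20 = 100, so roughly two seconds of probabilities
# below 0.35 must elapse after temp_end before the segment is closed below.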
418 | if i - temp_end >= min_silence_frames: 419 | # End current speech segment 420 | current_speech['end'] = temp_end 421 | 422 | # Check minimum duration 423 | if current_speech['end'] - current_speech['start'] >= min_speech_frames: 424 | # Calculate probability statistics for segment 425 | if current_probs: 426 | current_speech['probability'] = np.mean(current_probs[:temp_end - current_speech['start']]) 427 | speeches.append(current_speech) 428 | 429 | current_speech = {} 430 | current_probs = [] 431 | triggered = False 432 | temp_end = 0 433 | 434 | # Reset temp_end if speech resumes 435 | elif speech_prob >= threshold and temp_end: 436 | temp_end = 0 437 | 438 | # Handle speech that continues to the end 439 | if triggered and 'start' in current_speech: 440 | current_speech['end'] = len(speech_probs) 441 | if current_speech['end'] - current_speech['start'] >= min_speech_frames: 442 | # Calculate probability statistics for segment 443 | if current_probs: 444 | current_speech['probability'] = np.mean(current_probs) 445 | speeches.append(current_speech) 446 | 447 | # Apply padding to segments 448 | for i, speech in enumerate(speeches): 449 | # Add padding 450 | if i == 0: 451 | speech['start'] = max(0, speech['start'] - speech_pad_frames) 452 | else: 453 | speech['start'] = max(speeches[i-1]['end'], speech['start'] - speech_pad_frames) 454 | 455 | if i < len(speeches) - 1: 456 | speech['end'] = min(speeches[i+1]['start'], speech['end'] + speech_pad_frames) 457 | else: 458 | speech['end'] = min(len(speech_probs), speech['end'] + speech_pad_frames) 459 | 460 | # Convert to time units or sample indices based on return_seconds 461 | for speech in speeches: 462 | if return_seconds: 463 | # Convert frame indices to seconds 464 | speech['start'] = speech['start'] * frame_duration_ms / 1000 465 | speech['end'] = speech['end'] * frame_duration_ms / 1000 466 | else: 467 | # Convert frame indices to sample indices 468 | speech['start'] = int(speech['start'] * frame_samples) 469 | speech['end'] = int(speech['end'] * frame_samples) 470 | 471 | return speeches 472 | 473 | 474 | class WhisperVadModel: 475 | """ 476 | Whisper-based VAD model implementation using ONNX. 477 | Uses a Whisper model exported to ONNX for voice activity detection. 
478 | """ 479 | 480 | def __init__(self, config: Optional[VadConfig] = None, progress_callback: Optional[Callable[[int, int, str], None]] = None): 481 | self.config = config or VadConfig() 482 | self.wrapper = None 483 | self.progress_callback = progress_callback 484 | 485 | # Initialize ONNX model if path provided 486 | if self.config.onnx_model_path and os.path.exists(self.config.onnx_model_path): 487 | try: 488 | self.wrapper = WhisperVADOnnxWrapper( 489 | model_path=self.config.onnx_model_path, 490 | metadata_path=self.config.onnx_metadata_path, 491 | force_cpu=self.config.force_cpu, 492 | num_threads=self.config.num_threads, 493 | progress_callback=progress_callback, 494 | ) 495 | logger.info(_("vad.model_initialized", path=self.config.onnx_model_path)) 496 | if self.wrapper.device: 497 | logger.info(_("vad.using_device", device=self.wrapper.device)) 498 | except Exception as e: 499 | logger.error(_("vad.init_failed", error=e)) 500 | else: 501 | logger.warning(_("vad.path_invalid", path=self.config.onnx_model_path)) 502 | 503 | def get_speech_timestamps( 504 | self, 505 | audio: np.ndarray, 506 | sampling_rate: int = 16000, 507 | threshold: float = None, 508 | min_speech_duration_ms: int = None, 509 | max_speech_duration_s: float = None, 510 | min_silence_duration_ms: int = None, 511 | speech_pad_ms: int = None, 512 | neg_threshold: float = None, 513 | **kwargs 514 | ) -> List[Dict[str, Any]]: 515 | """ 516 | Get speech timestamps using Whisper VAD. 517 | """ 518 | if self.wrapper is None: 519 | logger.error(_("vad.not_initialized")) 520 | return [] 521 | 522 | # Use provided parameters or defaults from config 523 | threshold = threshold if threshold is not None else self.config.threshold 524 | neg_threshold = neg_threshold if neg_threshold is not None else self.config.neg_threshold 525 | min_speech_duration_ms = min_speech_duration_ms if min_speech_duration_ms is not None else self.config.min_speech_duration_ms 526 | max_speech_duration_s = max_speech_duration_s if max_speech_duration_s is not None else self.config.max_speech_duration_s 527 | min_silence_duration_ms = min_silence_duration_ms if min_silence_duration_ms is not None else self.config.min_silence_duration_ms 528 | speech_pad_ms = speech_pad_ms if speech_pad_ms is not None else self.config.speech_pad_ms 529 | 530 | # Use ONNX model for speech detection 531 | # Return sample indices (not seconds) for compatibility with faster_whisper 532 | segments = get_speech_timestamps_onnx( 533 | audio=audio, 534 | model=self.wrapper, 535 | threshold=threshold, 536 | sampling_rate=sampling_rate, 537 | min_speech_duration_ms=min_speech_duration_ms, 538 | max_speech_duration_s=max_speech_duration_s, 539 | min_silence_duration_ms=min_silence_duration_ms, 540 | speech_pad_ms=speech_pad_ms, 541 | return_seconds=False, # faster_whisper expects sample indices 542 | neg_threshold=neg_threshold, 543 | ) 544 | 545 | logger.debug(_("vad.speech_segments", count=len(segments))) 546 | return segments 547 | 548 | def get_device(self) -> str: 549 | """Get the device being used for VAD processing.""" 550 | if self.wrapper: 551 | return self.wrapper.device 552 | return "Not initialized" 553 | 554 | 555 | class VadModelManager: 556 | """ 557 | Manages different VAD model implementations. 558 | Provides a unified interface for VAD operations. 
559 | """ 560 | 561 | def __init__(self, config: Optional[VadConfig] = None, ttl: int = 3600, progress_callback: Optional[Callable[[int, int, str], None]] = None): 562 | self.config = config or VadConfig() 563 | self.ttl = ttl 564 | self.progress_callback = progress_callback 565 | self._models: Dict[str, VadModel] = {} # Instance variable, not class variable 566 | self._config = config 567 | 568 | # Register available models 569 | self._register_models() 570 | 571 | def _register_models(self): 572 | """Register available VAD models""" 573 | # Always recreate the model with the current progress callback 574 | self._models["whisper_vad"] = WhisperVadModel(self.config, progress_callback=self.progress_callback) 575 | logger.debug(_("vad.registered")) 576 | 577 | def get_model(self, model_id: str) -> VadModel: 578 | """Get a VAD model by ID""" 579 | if model_id not in self._models: 580 | logger.warning(_("vad.model_not_found", model_id=model_id)) 581 | model_id = self.config.default_model 582 | 583 | return self._models.get(model_id, self._models["whisper_vad"]) 584 | 585 | def get_speech_timestamps( 586 | self, 587 | model_id: str, 588 | audio: np.ndarray, 589 | sampling_rate: int = 16000, 590 | **kwargs 591 | ) -> List[Dict[str, Any]]: 592 | """ 593 | Get speech timestamps using specified model. 594 | 595 | Args: 596 | model_id: ID of the VAD model to use 597 | audio: Audio array 598 | sampling_rate: Sample rate of audio 599 | **kwargs: Additional parameters for the VAD model 600 | 601 | Returns: 602 | List of speech segments with start, end, and probability 603 | """ 604 | model = self.get_model(model_id) 605 | return model.get_speech_timestamps(audio, sampling_rate, **kwargs) 606 | 607 | @classmethod 608 | def get_available_models(cls) -> List[str]: 609 | """Get list of available VAD models""" 610 | # Since models are now instance variables, return the known model types 611 | return ["whisper_vad"] 612 | 613 | def get_device(self, model_id: str = None) -> str: 614 | """Get the device being used for VAD processing.""" 615 | if model_id is None: 616 | model_id = self.config.default_model 617 | model = self.get_model(model_id) 618 | if hasattr(model, 'get_device'): 619 | return model.get_device() 620 | return "Unknown" -------------------------------------------------------------------------------- /.github/workflows/build-release-conda.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release with Conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | pull_request: 8 | branches: [ main ] 9 | workflow_dispatch: 10 | inputs: 11 | release_version: 12 | description: 'Release version (e.g., v1.0.0)' 13 | required: false 14 | type: string 15 | include_chickenrice: 16 | description: 'Include Chickenrice model in releases' 17 | required: false 18 | type: boolean 19 | default: false 20 | 21 | jobs: 22 | build-windows: 23 | name: Build Windows - CUDA ${{ matrix.cuda }} ${{ matrix.model_variant }} 24 | runs-on: windows-latest 25 | defaults: 26 | run: 27 | shell: bash -el {0} # Important for conda activation on Windows 28 | strategy: 29 | matrix: 30 | include: 31 | # CUDA 11.8 versions 32 | - cuda: "11.8" 33 | env_file: "environment-cuda118.yml" 34 | env_name: "faster-whisper-cu118" 35 | artifact_suffix: "cu118" 36 | model_variant: "base" 37 | hf_model: "" 38 | - cuda: "11.8" 39 | env_file: "environment-cuda118.yml" 40 | env_name: "faster-whisper-cu118" 41 | artifact_suffix: "cu118-chickenrice" 42 | model_variant: "chickenrice" 43 | hf_model: 
"--hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2" 44 | # CUDA 12.2 versions 45 | - cuda: "12.2" 46 | env_file: "environment-cuda122.yml" 47 | env_name: "faster-whisper-cu122" 48 | artifact_suffix: "cu122" 49 | model_variant: "base" 50 | hf_model: "" 51 | - cuda: "12.2" 52 | env_file: "environment-cuda122.yml" 53 | env_name: "faster-whisper-cu122" 54 | artifact_suffix: "cu122-chickenrice" 55 | model_variant: "chickenrice" 56 | hf_model: "--hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2" 57 | # CUDA 12.8 versions 58 | - cuda: "12.8" 59 | env_file: "environment-cuda128.yml" 60 | env_name: "faster-whisper-cu128" 61 | artifact_suffix: "cu128" 62 | model_variant: "base" 63 | hf_model: "" 64 | - cuda: "12.8" 65 | env_file: "environment-cuda128.yml" 66 | env_name: "faster-whisper-cu128" 67 | artifact_suffix: "cu128-chickenrice" 68 | model_variant: "chickenrice" 69 | hf_model: "--hf-model chickenrice0721/whisper-large-v2-translate-zh-v0.2-st-ct2" 70 | 71 | steps: 72 | - name: Checkout code 73 | uses: actions/checkout@v4 74 | 75 | - name: Configure stdout buffering 76 | run: | 77 | # Enable line buffering for better performance with large outputs 78 | echo "Configuring buffering for improved CI performance..." 79 | echo "PYTHONUNBUFFERED=1" >> $GITHUB_ENV 80 | # For shell commands, we'll use stdbuf where needed 81 | echo "Buffering configuration complete." 82 | 83 | - name: Cache conda packages 84 | uses: actions/cache@v4 85 | id: conda-cache 86 | env: 87 | CACHE_NUMBER: 1 # Increment to invalidate cache 88 | with: 89 | path: | 90 | ~/conda_pkgs_dir 91 | key: ${{ runner.os }}-conda-pkgs-${{ matrix.env_name }}-${{ matrix.cuda }}-${{ env.CACHE_NUMBER }}-${{ hashFiles(matrix.env_file) }} 92 | restore-keys: | 93 | ${{ runner.os }}-conda-pkgs-${{ matrix.env_name }}-${{ matrix.cuda }}-${{ env.CACHE_NUMBER }}- 94 | ${{ runner.os }}-conda-pkgs-${{ matrix.env_name }}-${{ matrix.cuda }}- 95 | 96 | 97 | - name: Setup Miniforge 98 | uses: conda-incubator/setup-miniconda@v3 99 | with: 100 | miniforge-version: latest 101 | auto-update-conda: true 102 | environment-file: ${{ matrix.env_file }} 103 | activate-environment: ${{ matrix.env_name }} 104 | show-channel-urls: true 105 | use-only-tar-bz2: true 106 | use-mamba: true # Use mamba for faster dependency resolution 107 | # Add conda-pkgs-dir to use cached packages 108 | pkgs-dirs: ~/conda_pkgs_dir 109 | python-version: "3.10" 110 | 111 | - name: Force reinstall ctranslate2 for CUDA 11.8 112 | if: matrix.cuda == '11.8' 113 | run: | 114 | echo "Force reinstalling ctranslate2==3.24.0 for CUDA 11.8 compatibility..." 115 | pip install --force-reinstall ctranslate2==3.24.0 numpy==1.26.4 116 | echo "ctranslate2 reinstalled successfully" 117 | python -c "import ctranslate2; print(f'CTranslate2 version: {ctranslate2.__version__}')" 118 | 119 | - name: Fix onnxruntime CPU/GPU conflict 120 | run: | 121 | echo "Removing onnxruntime CPU version to avoid conflicts..." 122 | pip uninstall onnxruntime -y || true 123 | echo "" 124 | echo "Installing appropriate onnxruntime-gpu version for CUDA ${{ matrix.cuda }}..." 125 | if [ "${{ matrix.cuda }}" = "11.8" ]; then 126 | echo "Installing onnxruntime-gpu==1.18.0 for CUDA 11.8..." 127 | pip install onnxruntime-gpu==1.18.0 128 | elif [ "${{ matrix.cuda }}" = "12.2" ] || [ "${{ matrix.cuda }}" = "12.8" ]; then 129 | echo "Installing onnxruntime-gpu==1.20.2 for CUDA ${{ matrix.cuda }}..." 
130 | pip install onnxruntime-gpu==1.20.2 131 | else 132 | echo "Installing onnxruntime-gpu>=1.17.0 for CUDA ${{ matrix.cuda }}..." 133 | pip install "onnxruntime-gpu>=1.17.0" 134 | fi 135 | echo "" 136 | echo "Verifying onnxruntime-gpu installation..." 137 | python -c "import onnxruntime as ort; print(f'ONNX Runtime version: {ort.__version__}'); print(f'Available providers: {ort.get_available_providers()}')" || echo "Note: GPU providers won't show on GitHub runners (no GPU)" 138 | 139 | - name: Apply batch transcribe patch to faster-whisper 140 | run: | 141 | echo "Applying batch transcribe patch to faster-whisper package..." 142 | # Find the faster-whisper package installation directory 143 | FASTER_WHISPER_PATH=$(python -c "import faster_whisper; import os; print(os.path.dirname(faster_whisper.__file__))") 144 | echo "faster-whisper package location: $FASTER_WHISPER_PATH" 145 | 146 | # Verify the transcribe.py file exists 147 | if [ -f "$FASTER_WHISPER_PATH/transcribe.py" ]; then 148 | echo "Found transcribe.py at: $FASTER_WHISPER_PATH/transcribe.py" 149 | 150 | # Apply the batch transcribe patch 151 | echo "Applying batch-transcribe.patch..." 152 | patch -p1 -d "$FASTER_WHISPER_PATH/.." < patches/batch-transcribe.patch 153 | 154 | # Verify patch was applied 155 | if [ $? -eq 0 ]; then 156 | echo "Batch transcribe patch applied successfully!" 157 | 158 | # Display the patched sections for verification 159 | echo "" 160 | echo "Verifying batch transcribe patch was applied correctly..." 161 | echo "Checking for max_initial_timestamp_index calculation:" 162 | grep -A 2 -B 2 "max_initial_timestamp_index = int" "$FASTER_WHISPER_PATH/transcribe.py" || echo "Pattern not found" 163 | 164 | echo "" 165 | echo "Checking for without_timestamps default value:" 166 | grep "without_timestamps: bool = False" "$FASTER_WHISPER_PATH/transcribe.py" || echo "Pattern not found" 167 | 168 | echo "" 169 | echo "Checking for max_initial_timestamp parameter pass-through:" 170 | grep "max_initial_timestamp=max_initial_timestamp" "$FASTER_WHISPER_PATH/transcribe.py" || echo "Pattern not found" 171 | else 172 | echo "WARNING: Batch transcribe patch failed to apply, but continuing build..." 173 | echo "This might be because the patch is already applied or the file has changed." 174 | fi 175 | else 176 | echo "ERROR: Could not find transcribe.py at expected location!" 177 | echo "Build will continue without batch transcribe patch." 
178 | fi 179 | 180 | - name: Report conda cache status 181 | run: | 182 | echo "Conda packages cache hit: ${{ steps.conda-cache.outputs.cache-hit }}" 183 | if [ "${{ steps.conda-cache.outputs.cache-hit }}" = "true" ]; then 184 | echo "Package cache was restored, installation should be faster" 185 | else 186 | echo "Package cache miss, downloading packages" 187 | fi 188 | echo "" 189 | echo "Conda environment location:" 190 | conda info --envs 191 | 192 | - name: Cache HuggingFace and whisper-base models 193 | uses: actions/cache@v4 194 | with: 195 | # Cache HuggingFace model subdirectories and whisper-base, not VAD models 196 | # The chickenrice model goes into models/whisper-large-v2-translate-zh-v0.2-st-ct2/ 197 | # The whisper-base goes into models/whisper-base/ 198 | path: | 199 | models/*/ 200 | !models/*.onnx 201 | !models/*.json 202 | key: hf-model-${{ matrix.model_variant }}-${{ hashFiles('download_models.py') }}-${{ matrix.hf_model }}-whisper-base 203 | restore-keys: | 204 | hf-model-${{ matrix.model_variant }}-${{ hashFiles('download_models.py') }}-${{ matrix.hf_model }}- 205 | hf-model-${{ matrix.model_variant }}-${{ hashFiles('download_models.py') }}- 206 | 207 | - name: Display environment info 208 | run: | 209 | conda info 210 | conda list 211 | python --version 212 | python -c "import ctranslate2; print(f'CTranslate2 version: {ctranslate2.__version__}')" 213 | echo "Note: CUDA availability check skipped (no GPU on GitHub runners)" 214 | 215 | - name: Check cached models 216 | run: | 217 | echo "Checking for cached models..." 218 | if [ -d "models" ]; then 219 | echo "Models directory exists:" 220 | # Use buffered find instead of ls for better performance 221 | find models -maxdepth 1 -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null | head -20 222 | echo "" 223 | echo "Model subdirectories (HuggingFace models and whisper-base, cached):" 224 | # Pre-calculate all sizes at once 225 | du -sh models/*/ 2>/dev/null | while read size dir; do 226 | echo " - $(basename "$dir"): $size" 227 | done 228 | echo "" 229 | echo "Root model files (VAD models, not cached):" 230 | find models -maxdepth 1 \( -name "*.onnx" -o -name "*.json" \) -printf "%s %p\n" 2>/dev/null | \ 231 | awk '{size=$1; $1=""; printf " %s (%s)\n", $0, size}' || echo " No VAD model files yet" 232 | else 233 | echo "Models directory does not exist yet" 234 | fi 235 | 236 | - name: Download models 237 | run: | 238 | python download_models.py ${{ matrix.hf_model }} 239 | continue-on-error: false 240 | 241 | - name: Verify downloaded models 242 | run: | 243 | echo "Model files after download:" 244 | echo "" 245 | echo "VAD models (not cached, always re-downloaded):" 246 | find models -maxdepth 1 \( -name "*.onnx" -o -name "*.json" \) -printf " %p (%s bytes)\n" 2>/dev/null || echo " No VAD model files found" 247 | echo "" 248 | echo "Whisper-base feature extractor files:" 249 | if [ -d "models/whisper-base" ]; then 250 | find models/whisper-base -type f -printf " %p (%s bytes)\n" 2>/dev/null | head -10 251 | else 252 | echo " Not yet downloaded" 253 | fi 254 | echo "" 255 | if [ "${{ matrix.hf_model }}" != "" ]; then 256 | echo "HuggingFace models (cached):" 257 | # Pre-calculate all directory sizes 258 | du -sh models/*/ 2>/dev/null > /tmp/model_sizes.txt 259 | for dir in models/*/; do 260 | if [ -d "$dir" ]; then 261 | echo " Directory: $(basename "$dir")" 262 | find "$dir" -maxdepth 1 -type f -printf " %f (%s bytes)\n" 2>/dev/null | head -10 263 | size=$(grep "$dir" /tmp/model_sizes.txt | cut -f1) 264 | echo " 
Total size: $size" 265 | echo "" 266 | fi 267 | done 268 | else 269 | echo "No HuggingFace models (base variant)" 270 | fi 271 | 272 | 273 | - name: Build with PyInstaller 274 | run: | 275 | echo "Using conda environment: $CONDA_DEFAULT_ENV" 276 | echo "Python path: $(which python)" 277 | echo "PyInstaller version:" 278 | python -m pip show pyinstaller 279 | export PYTHONPATH="${PYTHONPATH}:${PWD}/src" 280 | 281 | python build_windows.py 282 | 283 | - name: Copy models to distribution 284 | run: | 285 | echo "Copying models to distribution directory..." 286 | if [ -d "models" ]; then 287 | echo "Found models directory" 288 | echo "Contents of models directory:" 289 | find models -maxdepth 1 -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null 290 | echo "" 291 | 292 | # Create models directory in dist 293 | mkdir -p dist/faster_whisper_transwithai_chickenrice/models 294 | 295 | # Copy VAD model files (always included) 296 | echo "Copying VAD models..." 297 | cp models/*.onnx dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 298 | cp models/*.json dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 299 | 300 | # Copy whisper-base for feature extractor (always included for offline usage) 301 | echo "Copying whisper-base for feature extractor..." 302 | if [ -d "models/whisper-base" ]; then 303 | echo " Found whisper-base directory, copying..." 304 | cp -r models/whisper-base dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 305 | echo " Whisper-base copied for offline feature extractor support" 306 | else 307 | echo " WARNING: whisper-base directory not found" 308 | fi 309 | 310 | # Copy HuggingFace models if this is a chickenrice variant 311 | if [ "${{ matrix.model_variant }}" = "chickenrice" ]; then 312 | echo "Copying Chickenrice model..." 313 | for dir in models/*/; do 314 | if [ -d "$dir" ]; then 315 | model_name=$(basename "$dir") 316 | # Skip whisper-base as we already copied it 317 | if [ "$model_name" != "whisper-base" ]; then 318 | echo " Copying model contents from: $model_name" 319 | # Copy the contents of the model directory, not the directory itself 320 | cp -r "$dir"* dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 321 | # Also copy hidden files if any exist 322 | cp -r "$dir".* dist/faster_whisper_transwithai_chickenrice/models/ 2>/dev/null || true 323 | fi 324 | fi 325 | done 326 | fi 327 | 328 | echo "" 329 | echo "Models in distribution:" 330 | find dist/faster_whisper_transwithai_chickenrice/models -maxdepth 1 -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null 331 | echo "" 332 | echo "Total distribution size:" 333 | du -sh dist/faster_whisper_transwithai_chickenrice/ 334 | else 335 | echo "WARNING: Models directory not found!" 336 | fi 337 | 338 | - name: Copy batch files, configuration, and documentation 339 | run: | 340 | echo "Copying batch files, configuration, and documentation to distribution..." 
341 | 342 | # Copy usage instructions 343 | if [ -f "使用说明.txt" ]; then 344 | cp "使用说明.txt" dist/faster_whisper_transwithai_chickenrice/ 345 | echo "Copied: 使用说明.txt" 346 | fi 347 | 348 | # Copy release notes 349 | if [ -f "RELEASE_NOTES_CN.md" ]; then 350 | cp "RELEASE_NOTES_CN.md" dist/faster_whisper_transwithai_chickenrice/ 351 | echo "Copied: RELEASE_NOTES_CN.md" 352 | fi 353 | 354 | # Copy generation config to root directory (for easy user editing) 355 | if [ -f "generation_config.json5" ]; then 356 | cp "generation_config.json5" dist/faster_whisper_transwithai_chickenrice/ 357 | echo "Copied: generation_config.json5 to root directory" 358 | fi 359 | 360 | # Copy all batch files 361 | for bat_file in *.bat; do 362 | if [ -f "$bat_file" ]; then 363 | # Skip any build-related batch files 364 | if [[ "$bat_file" != *"build"* ]] && [[ "$bat_file" == "运行"* ]]; then 365 | cp "$bat_file" dist/faster_whisper_transwithai_chickenrice/ 366 | echo "Copied: $bat_file" 367 | fi 368 | fi 369 | done 370 | 371 | echo "" 372 | echo "Distribution contents:" 373 | find dist/faster_whisper_transwithai_chickenrice -maxdepth 1 \( -name "*.bat" -o -name "*.txt" -o -name "*.md" -o -name "*.json5" \) -printf "%M %u %g %s %TY-%Tm-%Td %TH:%TM %p\n" 2>/dev/null || echo "No batch/text/config files found" 374 | 375 | - name: Test executable (CPU mode) 376 | shell: cmd /C CALL {0} 377 | run: | 378 | cd dist\faster_whisper_transwithai_chickenrice 379 | infer.exe --help 380 | 381 | - name: Upload artifact 382 | uses: actions/upload-artifact@v4 383 | with: 384 | name: faster_whisper_transwithai_windows_${{ matrix.artifact_suffix }} 385 | path: dist/faster_whisper_transwithai_chickenrice/ 386 | retention-days: 30 387 | 388 | - name: List artifact directory structure 389 | run: | 390 | echo "========================================================================" 391 | echo "📦 ARTIFACT DIRECTORY STRUCTURE" 392 | echo "========================================================================" 393 | echo "Build variant: ${{ matrix.artifact_suffix }}" 394 | echo "CUDA version: ${{ matrix.cuda }}" 395 | echo "Model variant: ${{ matrix.model_variant }}" 396 | echo "------------------------------------------------------------------------" 397 | 398 | # Simple directory tree (depth limited to 3) 399 | echo "Directory structure:" 400 | find dist/faster_whisper_transwithai_chickenrice -type d -maxdepth 3 | \ 401 | sed 's|[^/]*/| |g' | \ 402 | sed 's|^ |dist/|' 403 | 404 | echo "" 405 | echo "Total artifact size: $(du -sh dist/faster_whisper_transwithai_chickenrice/ | cut -f1)" 406 | echo "========================================================================" 407 | 408 | # Create the initial GitHub release 409 | create-release: 410 | name: Create GitHub Release 411 | needs: [build-windows] 412 | runs-on: ubuntu-latest 413 | if: startsWith(github.ref, 'refs/tags/v') 414 | permissions: 415 | contents: write 416 | outputs: 417 | release_created: ${{ steps.create.outputs.release_created }} 418 | 419 | steps: 420 | - name: Checkout code for release notes 421 | uses: actions/checkout@v4 422 | with: 423 | sparse-checkout: | 424 | RELEASE_NOTES_CN.md 425 | sparse-checkout-cone-mode: false 426 | 427 | - name: Read release body 428 | id: read_body 429 | run: | 430 | if [ -f "RELEASE_NOTES_CN.md" ]; then 431 | echo 'body<<EOF' >> $GITHUB_OUTPUT 432 | cat RELEASE_NOTES_CN.md >> $GITHUB_OUTPUT 433 | echo 'EOF' >> $GITHUB_OUTPUT 434 | else 435 | echo 'body=Release created by GitHub Actions' >> $GITHUB_OUTPUT 436 | fi 437 | 438 | - name: Create empty
placeholder file for initial release 439 | run: | 440 | echo "This release contains large binary files. Files are being uploaded..." > placeholder.txt 441 | 442 | - name: Create Release with placeholder 443 | id: create 444 | uses: ading2210/gh-large-releases@v1 445 | with: 446 | repository: ${{ github.repository }} 447 | tag_name: ${{ github.ref }} 448 | name: ${{ github.ref_name }} 449 | body: ${{ steps.read_body.outputs.body }} 450 | draft: false 451 | prerelease: false 452 | files: placeholder.txt 453 | token: ${{ secrets.GITHUB_TOKEN }} 454 | 455 | - name: Set output 456 | run: echo "release_created=true" >> $GITHUB_OUTPUT 457 | 458 | # Parallel upload jobs - each handles one artifact 459 | upload-cu118: 460 | name: Upload CUDA 11.8 Base 461 | needs: [create-release] 462 | runs-on: ubuntu-latest 463 | if: startsWith(github.ref, 'refs/tags/v') 464 | permissions: 465 | contents: write 466 | 467 | steps: 468 | - name: Download artifact 469 | uses: actions/download-artifact@v6 470 | with: 471 | name: faster_whisper_transwithai_windows_cu118 472 | path: artifact/ 473 | 474 | - name: Create archive with optimized compression 475 | run: | 476 | cd artifact 477 | echo "Creating archive for CUDA 11.8 base variant..." 478 | # Using compression level 5 for faster builds (was level 9) 479 | # Level 5 provides good balance between speed and compression ratio 480 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu118.zip . 481 | cd .. 482 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu118.zip | awk '{print $5}')" 483 | 484 | - name: Upload to release with large file support 485 | uses: ading2210/gh-large-releases@v1 486 | with: 487 | repository: ${{ github.repository }} 488 | tag_name: ${{ github.ref }} 489 | files: faster_whisper_transwithai_windows_cu118.zip 490 | token: ${{ secrets.GITHUB_TOKEN }} 491 | 492 | upload-cu118-chickenrice: 493 | name: Upload CUDA 11.8 Chickenrice 494 | needs: [create-release] 495 | runs-on: ubuntu-latest 496 | if: startsWith(github.ref, 'refs/tags/v') 497 | permissions: 498 | contents: write 499 | 500 | steps: 501 | - name: Download artifact 502 | uses: actions/download-artifact@v6 503 | with: 504 | name: faster_whisper_transwithai_windows_cu118-chickenrice 505 | path: artifact/ 506 | 507 | - name: Create archive with optimized compression 508 | run: | 509 | cd artifact 510 | echo "Creating archive for CUDA 11.8 chickenrice variant..." 511 | # Using compression level 5 for faster builds (was level 9) 512 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu118-chickenrice.zip . 513 | cd .. 514 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu118-chickenrice.zip | awk '{print $5}')" 515 | 516 | - name: Upload to release with large file support 517 | uses: ading2210/gh-large-releases@v1 518 | with: 519 | repository: ${{ github.repository }} 520 | tag_name: ${{ github.ref }} 521 | files: faster_whisper_transwithai_windows_cu118-chickenrice.zip 522 | token: ${{ secrets.GITHUB_TOKEN }} 523 | 524 | upload-cu122: 525 | name: Upload CUDA 12.2 Base 526 | needs: [create-release] 527 | runs-on: ubuntu-latest 528 | if: startsWith(github.ref, 'refs/tags/v') 529 | permissions: 530 | contents: write 531 | 532 | steps: 533 | - name: Download artifact 534 | uses: actions/download-artifact@v6 535 | with: 536 | name: faster_whisper_transwithai_windows_cu122 537 | path: artifact/ 538 | 539 | - name: Create archive with optimized compression 540 | run: | 541 | cd artifact 542 | echo "Creating archive for CUDA 12.2 base variant..." 
543 | # Using compression level 5 for faster builds (was level 9) 544 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu122.zip . 545 | cd .. 546 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu122.zip | awk '{print $5}')" 547 | 548 | - name: Upload to release with large file support 549 | uses: ading2210/gh-large-releases@v1 550 | with: 551 | repository: ${{ github.repository }} 552 | tag_name: ${{ github.ref }} 553 | files: faster_whisper_transwithai_windows_cu122.zip 554 | token: ${{ secrets.GITHUB_TOKEN }} 555 | 556 | upload-cu122-chickenrice: 557 | name: Upload CUDA 12.2 Chickenrice 558 | needs: [create-release] 559 | runs-on: ubuntu-latest 560 | if: startsWith(github.ref, 'refs/tags/v') 561 | permissions: 562 | contents: write 563 | 564 | steps: 565 | - name: Download artifact 566 | uses: actions/download-artifact@v6 567 | with: 568 | name: faster_whisper_transwithai_windows_cu122-chickenrice 569 | path: artifact/ 570 | 571 | - name: Create archive with optimized compression 572 | run: | 573 | cd artifact 574 | echo "Creating archive for CUDA 12.2 chickenrice variant..." 575 | # Using compression level 5 for faster builds (was level 9) 576 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu122-chickenrice.zip . 577 | cd .. 578 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu122-chickenrice.zip | awk '{print $5}')" 579 | 580 | - name: Upload to release with large file support 581 | uses: ading2210/gh-large-releases@v1 582 | with: 583 | repository: ${{ github.repository }} 584 | tag_name: ${{ github.ref }} 585 | files: faster_whisper_transwithai_windows_cu122-chickenrice.zip 586 | token: ${{ secrets.GITHUB_TOKEN }} 587 | 588 | upload-cu128: 589 | name: Upload CUDA 12.8 Base 590 | needs: [create-release] 591 | runs-on: ubuntu-latest 592 | if: startsWith(github.ref, 'refs/tags/v') 593 | permissions: 594 | contents: write 595 | 596 | steps: 597 | - name: Download artifact 598 | uses: actions/download-artifact@v6 599 | with: 600 | name: faster_whisper_transwithai_windows_cu128 601 | path: artifact/ 602 | 603 | - name: Create archive with optimized compression 604 | run: | 605 | cd artifact 606 | echo "Creating archive for CUDA 12.8 base variant..." 607 | # Using compression level 5 for faster builds (was level 9) 608 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu128.zip . 609 | cd .. 610 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu128.zip | awk '{print $5}')" 611 | 612 | - name: Upload to release with large file support 613 | uses: ading2210/gh-large-releases@v1 614 | with: 615 | repository: ${{ github.repository }} 616 | tag_name: ${{ github.ref }} 617 | files: faster_whisper_transwithai_windows_cu128.zip 618 | token: ${{ secrets.GITHUB_TOKEN }} 619 | 620 | upload-cu128-chickenrice: 621 | name: Upload CUDA 12.8 Chickenrice 622 | needs: [create-release] 623 | runs-on: ubuntu-latest 624 | if: startsWith(github.ref, 'refs/tags/v') 625 | permissions: 626 | contents: write 627 | 628 | steps: 629 | - name: Download artifact 630 | uses: actions/download-artifact@v6 631 | with: 632 | name: faster_whisper_transwithai_windows_cu128-chickenrice 633 | path: artifact/ 634 | 635 | - name: Create archive with optimized compression 636 | run: | 637 | cd artifact 638 | echo "Creating archive for CUDA 12.8 chickenrice variant..." 639 | # Using compression level 5 for faster builds (was level 9) 640 | zip -5 -r -q ../faster_whisper_transwithai_windows_cu128-chickenrice.zip . 641 | cd .. 
642 | echo "Archive created: $(ls -lh faster_whisper_transwithai_windows_cu128-chickenrice.zip | awk '{print $5}')" 643 | 644 | - name: Upload to release with large file support 645 | uses: ading2210/gh-large-releases@v1 646 | with: 647 | repository: ${{ github.repository }} 648 | tag_name: ${{ github.ref }} 649 | files: faster_whisper_transwithai_windows_cu128-chickenrice.zip 650 | token: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /src/faster_whisper_transwithai_chickenrice/infer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Inference script with custom VAD injection support 4 | """ 5 | 6 | import argparse 7 | import sys 8 | import logging 9 | import os 10 | import json 11 | import code 12 | import platform 13 | import subprocess 14 | import traceback 15 | from dataclasses import dataclass 16 | from pathlib import Path 17 | from collections import ChainMap 18 | from typing import Optional, Dict, Any 19 | 20 | import pyjson5 21 | from faster_whisper import WhisperModel, BatchedInferencePipeline 22 | import ctranslate2 23 | 24 | # Import our VAD injection system 25 | from . import inject_vad, uninject_vad, VadOptionsCompat 26 | from .vad_manager import VadConfig 27 | 28 | # Import modern i18n module for translations 29 | from . import i18n_modern as i18n 30 | 31 | # Convenience imports 32 | _ = i18n._ 33 | format_duration = i18n.format_duration 34 | format_percentage = i18n.format_percentage 35 | 36 | 37 | def parse_arguments(): 38 | parser = argparse.ArgumentParser(description=_("app.description")) 39 | parser.add_argument('--model_name_or_path', type=str, default="models", 40 | help=_("args.model_path")) 41 | parser.add_argument('--device', type=str, default='auto', 42 | help=_("args.device")) 43 | parser.add_argument('--compute_type', type=str, default='auto', 44 | help=_("args.compute_type")) 45 | parser.add_argument('--overwrite', action='store_true', default=False, 46 | help=_("args.overwrite")) 47 | parser.add_argument('--audio_suffixes', type=str, default="wav,flac,mp3", 48 | help=_("args.audio_extensions")) 49 | parser.add_argument('--sub_formats', type=str, default="lrc,vtt", 50 | help=_("args.subtitle_formats")) 51 | parser.add_argument('--output_dir', type=str, default=None, 52 | help=_("args.output_dir")) 53 | parser.add_argument('--generation_config', type=str, default="generation_config.json5", 54 | help=_("args.config_file")) 55 | parser.add_argument('--log_level', type=str, default="DEBUG", 56 | help=_("args.log_level")) 57 | 58 | # VAD parameter overrides (whisper_vad is always used) 59 | parser.add_argument('--vad_threshold', type=float, default=None, 60 | help=_("args.vad_threshold")) 61 | parser.add_argument('--vad_min_speech_duration_ms', type=int, default=None, 62 | help=_("args.min_speech_duration")) 63 | parser.add_argument('--vad_min_silence_duration_ms', type=int, default=None, 64 | help=_("args.min_silence_duration")) 65 | parser.add_argument('--vad_speech_pad_ms', type=int, default=None, 66 | help=_("args.speech_padding")) 67 | 68 | # Debug option for interactive console 69 | parser.add_argument('--console', action='store_true', 70 | help="Launch interactive Python console for debugging") 71 | 72 | # Batch inference options 73 | parser.add_argument('--enable_batching', action='store_true', 74 | help="Enable batched inference for faster processing (requires more VRAM)") 75 | parser.add_argument('--batch_size', type=int, 
default=None, 76 | help="Batch size for batched inference (auto-detect if not specified)") 77 | parser.add_argument('--max_batch_size', type=int, default=8, 78 | help="Maximum batch size to try when auto-detecting (default: 8)") 79 | 80 | parser.add_argument('base_dirs', nargs=argparse.REMAINDER, 81 | help=_("args.directories")) 82 | return parser.parse_args() 83 | 84 | 85 | def select_best_compute_type(device: str) -> str: 86 | """ 87 | Automatically select the best compute type based on device and available types. 88 | 89 | Preference order: 90 | - bfloat16 > float16 > int8 types > float32 91 | - Prefer int8 over float32 for better memory usage 92 | 93 | Args: 94 | device: The device to use ('cpu', 'cuda', or 'auto') 95 | 96 | Returns: 97 | The best available compute type for the device 98 | """ 99 | # Determine the actual device if 'auto' is specified 100 | actual_device = device 101 | if device == 'auto': 102 | # Check if CUDA devices are actually available 103 | # First check CUDA_VISIBLE_DEVICES environment variable 104 | import os 105 | cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', None) 106 | 107 | if cuda_visible == '': 108 | # Empty string means CUDA is explicitly disabled 109 | actual_device = 'cpu' 110 | elif cuda_visible == '-1': 111 | # -1 also means CUDA is disabled 112 | actual_device = 'cpu' 113 | else: 114 | # Try to check if CUDA is actually available by attempting to get its compute types 115 | # and checking if we can actually use it 116 | try: 117 | # Try to get CUDA compute types 118 | cuda_types = ctranslate2.get_supported_compute_types('cuda') 119 | # Also check if we can import and use faster_whisper with CUDA 120 | # This is a more reliable check 121 | from faster_whisper import WhisperModel 122 | # Try to get default device - if CUDA not available, this should fail 123 | # Note: We're not actually loading a model, just checking device availability 124 | if cuda_visible is not None: 125 | # CUDA_VISIBLE_DEVICES is set to specific devices 126 | # Make sure at least one device is visible 127 | visible_devices = [d.strip() for d in cuda_visible.split(',') if d.strip()] 128 | if not visible_devices: 129 | actual_device = 'cpu' 130 | else: 131 | actual_device = 'cuda' 132 | else: 133 | # CUDA_VISIBLE_DEVICES not set, CUDA should be available if drivers installed 134 | actual_device = 'cuda' 135 | except Exception as e: 136 | # If we can't get CUDA types or import fails, fall back to CPU 137 | actual_device = 'cpu' 138 | logger.info(_("info.auto_detected_device").format(device=actual_device)) 139 | 140 | # Get supported compute types for the device 141 | try: 142 | supported_types = ctranslate2.get_supported_compute_types(actual_device) 143 | except Exception as e: 144 | logger.warning(_("warnings.compute_types_unavailable").format(device=actual_device, error=e)) 145 | # Fallback to safe default 146 | return 'int8' if actual_device == 'cpu' else 'float16' 147 | 148 | # Define preference order 149 | # Prefer bfloat16 > float16 > int8 types > float32 150 | preference_order = [ 151 | 'bfloat16', 152 | 'float16', 153 | 'int16', # For CPU 154 | 'int8_bfloat16', 155 | 'int8_float16', 156 | 'int8_float32', 157 | 'int8', 158 | 'float32' # Least preferred due to memory usage 159 | ] 160 | 161 | # Select the best available type based on preference 162 | for compute_type in preference_order: 163 | if compute_type in supported_types: 164 | logger.info(_("info.auto_selected_compute_type").format(compute_type=compute_type, device=actual_device)) 165 | return compute_type 166 
| 167 | # If nothing matched (shouldn't happen), use a safe default 168 | default = 'int8' if actual_device == 'cpu' else 'float16' 169 | logger.warning(_("warnings.no_preferred_compute_type").format(default=default)) 170 | return default 171 | 172 | 173 | @dataclass 174 | class Segment: 175 | start: int # ms 176 | end: int # ms 177 | text: str 178 | 179 | 180 | def merge_segments(segments: list[Segment]) -> list[Segment]: 181 | segments.sort(key=lambda s: s.start) 182 | merged: list[Segment] = [] 183 | i = 0 184 | while i < len(segments): 185 | if segments[i].text.strip() == '': 186 | i += 1 187 | continue 188 | start, end, text = segments[i].start, segments[i].end, segments[i].text 189 | j = i + 1 190 | while j < len(segments): 191 | if segments[j].text.startswith(text): 192 | end, text = segments[j].end, segments[j].text 193 | j += 1 194 | continue 195 | break 196 | k = j 197 | while k < len(segments): 198 | if segments[k].text.strip() == '': 199 | break 200 | if text.endswith(segments[k].text): 201 | end = segments[k].end 202 | k += 1 203 | continue 204 | break 205 | merged.append(Segment(start=start, end=end, text=text)) 206 | i = j 207 | return merged 208 | 209 | 210 | class SubWriter: 211 | @classmethod 212 | def txt(cls, segments: list[Segment], path: str): 213 | lines = [] 214 | for idx, segment in enumerate(segments): 215 | lines.append(f"{segment.text}\n") 216 | with open(path, "w", encoding="utf-8") as f: 217 | f.writelines(lines) 218 | 219 | @classmethod 220 | def lrc(cls, segments: list[Segment], path: str): 221 | lines = [] 222 | for idx, segment in enumerate(segments): 223 | start_ts = cls.lrc_timestamp(segment.start) 224 | end_es = cls.lrc_timestamp(segment.end) 225 | lines.append(f"[{start_ts}]{segment.text}\n") 226 | if idx != len(segments) - 1: 227 | next_start = segments[idx + 1].start 228 | if next_start is not None and end_es == cls.lrc_timestamp(next_start): 229 | continue 230 | lines.append(f"[{end_es}]\n") 231 | with open(path, "w", encoding="utf-8") as f: 232 | f.writelines(lines) 233 | 234 | @staticmethod 235 | def lrc_timestamp(ms: int) -> str: 236 | m = ms // 60_000 237 | ms = ms - m * 60_000 238 | s = ms // 1_000 239 | ms = ms - s * 1_000 240 | ms = ms // 10 241 | return f"{m:02d}:{s:02d}.{ms:02d}" 242 | 243 | @classmethod 244 | def vtt(cls, segments: list[Segment], path: str): 245 | lines = ["WEBVTT\n\n"] # The WebVTT file signature must be the upper-case string "WEBVTT" 246 | for idx, segment in enumerate(segments): 247 | lines.append(f"{idx + 1}\n") 248 | lines.append(f"{cls.vtt_timestamp(segment.start)} --> {cls.vtt_timestamp(segment.end)}\n") 249 | lines.append(f"{segment.text}\n\n") 250 | with open(path, "w", encoding="utf-8") as f: 251 | f.writelines(lines) 252 | 253 | @classmethod 254 | def vtt_timestamp(cls, ms: int): 255 | return cls._timestamp(ms, '.') 256 | 257 | @classmethod 258 | def srt(cls, segments: list[Segment], path: str): 259 | lines = [] 260 | for idx, segment in enumerate(segments): 261 | lines.append(f"{idx + 1}\n") 262 | lines.append(f"{cls.srt_timestamp(segment.start)} --> {cls.srt_timestamp(segment.end)}\n") 263 | lines.append(f"{segment.text}\n\n") 264 | with open(path, "w", encoding="utf-8") as f: 265 | f.writelines(lines) 266 | 267 | @classmethod 268 | def srt_timestamp(cls, ms: int): 269 | return cls._timestamp(ms, ',') 270 | 271 | @classmethod 272 | def _timestamp(cls, ms: int, delim: str): 273 | h = ms // 3600_000 274 | ms -= h * 3600_000 275 | m = ms // 60_000 276 | ms -= m * 60_000 277 | s = ms // 1_000 278 | ms -= s * 1_000 279 | return ( 280 | f"{h:02d}:{m:02d}:{s:02d}{delim}{ms:03d}" 281
| ) 282 | 283 | 284 | @dataclass 285 | class InferenceTask: 286 | audio_path: str 287 | sub_prefix: str 288 | sub_formats: list[str] 289 | 290 | 291 | logger = logging.getLogger(__name__) 292 | log_handler = logging.StreamHandler() 293 | log_handler.setFormatter(logging.Formatter('%(message)s')) 294 | logger.addHandler(log_handler) 295 | 296 | 297 | class Inference: 298 | sub_writers = {"lrc": SubWriter.lrc, "srt": SubWriter.srt, "vtt": SubWriter.vtt, "txt": SubWriter.txt} 299 | 300 | def __init__(self, args): 301 | self.args = args 302 | self.model_name_or_path = args.model_name_or_path 303 | self.device = args.device 304 | # Auto-select compute type if 'auto' or 'default' is specified 305 | if args.compute_type in ['auto', 'default']: 306 | self.compute_type = select_best_compute_type(self.device) 307 | else: 308 | self.compute_type = args.compute_type 309 | 310 | # Batch inference settings 311 | self.enable_batching = args.enable_batching 312 | self.batch_size = args.batch_size if args.batch_size else 0 313 | self.max_batch_size = args.max_batch_size 314 | 315 | self.overwrite = args.overwrite 316 | self.output_dir = args.output_dir 317 | if self.output_dir: 318 | if not os.path.isabs(self.output_dir): 319 | self.output_dir = os.path.join(os.getcwd(), self.output_dir) 320 | logger.info(_("info.output_dir", output_dir=self.output_dir)) 321 | self.audio_suffixes = {k: True for k in args.audio_suffixes.split(',')} 322 | self.sub_formats = [] 323 | for k in args.sub_formats.split(','): 324 | if k not in self.sub_writers: 325 | raise ValueError(_("warnings.unknown_format", format=k)) 326 | self.sub_formats.append(k) 327 | 328 | # Load generation config 329 | self.generation_config = self._load_generation_config(args) 330 | 331 | # Setup VAD injection if requested 332 | self._setup_vad_injection(args) 333 | 334 | logger.info(_("info.generation_config", config=self.generation_config)) 335 | 336 | def _load_generation_config(self, args) -> Dict[str, Any]: 337 | """Load and process generation configuration""" 338 | # Default config 339 | config = { 340 | "language": "ja", 341 | "task": "translate", 342 | "vad_filter": True, 343 | } 344 | 345 | 346 | # Load from file if exists 347 | if os.path.exists(args.generation_config): 348 | with open(args.generation_config, "r", encoding='utf-8') as f: 349 | file_config = pyjson5.decode_io(f) 350 | config = dict(**ChainMap(file_config, config)) 351 | 352 | # Process VAD parameters from config file 353 | if "vad_parameters" in config: 354 | vad_params = config.pop("vad_parameters") 355 | 356 | # Convert to VadOptions format 357 | vad_options = {} 358 | 359 | # Map common parameters 360 | if "threshold" in vad_params: 361 | vad_options["threshold"] = vad_params["threshold"] 362 | if "neg_threshold" in vad_params: 363 | vad_options["neg_threshold"] = vad_params["neg_threshold"] 364 | if "min_speech_duration_ms" in vad_params: 365 | vad_options["min_speech_duration_ms"] = vad_params["min_speech_duration_ms"] 366 | if "max_speech_duration_s" in vad_params: 367 | vad_options["max_speech_duration_s"] = vad_params["max_speech_duration_s"] 368 | if "min_silence_duration_ms" in vad_params: 369 | vad_options["min_silence_duration_ms"] = vad_params["min_silence_duration_ms"] 370 | if "speech_pad_ms" in vad_params: 371 | vad_options["speech_pad_ms"] = vad_params["speech_pad_ms"] 372 | 373 | config["vad_parameters"] = vad_options 374 | 375 | # Override with command line arguments 376 | if args.vad_threshold is not None: 377 | if "vad_parameters" not in config: 378 | 
config["vad_parameters"] = {} 379 | config["vad_parameters"]["threshold"] = args.vad_threshold 380 | 381 | if args.vad_min_speech_duration_ms is not None: 382 | if "vad_parameters" not in config: 383 | config["vad_parameters"] = {} 384 | config["vad_parameters"]["min_speech_duration_ms"] = args.vad_min_speech_duration_ms 385 | 386 | if args.vad_min_silence_duration_ms is not None: 387 | if "vad_parameters" not in config: 388 | config["vad_parameters"] = {} 389 | config["vad_parameters"]["min_silence_duration_ms"] = args.vad_min_silence_duration_ms 390 | 391 | if args.vad_speech_pad_ms is not None: 392 | if "vad_parameters" not in config: 393 | config["vad_parameters"] = {} 394 | config["vad_parameters"]["speech_pad_ms"] = args.vad_speech_pad_ms 395 | 396 | return config 397 | 398 | def _vad_progress_callback(self, chunk_idx, total_chunks, device): 399 | """Progress callback for VAD processing.""" 400 | progress_pct = (chunk_idx / total_chunks) * 100 401 | # Use carriage return to update the same line 402 | print("\r " + _("progress.vad", current=chunk_idx, total=total_chunks, 403 | percent=progress_pct, device=device), end="", flush=True) 404 | if chunk_idx == total_chunks: 405 | print() # New line when done 406 | 407 | def _setup_vad_injection(self, args): 408 | """Setup whisper_vad injection - always enforced""" 409 | # Always use whisper_vad model 410 | vad_model = "whisper_vad" 411 | 412 | logger.info(_("info.initializing_vad")) 413 | 414 | # Create VAD config with progress callback 415 | vad_config = VadConfig(default_model=vad_model) 416 | 417 | # Apply VAD parameters from generation config 418 | if "vad_parameters" in self.generation_config: 419 | vad_params = self.generation_config["vad_parameters"] 420 | if "threshold" in vad_params: 421 | vad_config.threshold = vad_params["threshold"] 422 | if "neg_threshold" in vad_params: 423 | vad_config.neg_threshold = vad_params["neg_threshold"] 424 | if "min_speech_duration_ms" in vad_params: 425 | vad_config.min_speech_duration_ms = vad_params["min_speech_duration_ms"] 426 | if "max_speech_duration_s" in vad_params: 427 | vad_config.max_speech_duration_s = vad_params["max_speech_duration_s"] 428 | if "min_silence_duration_ms" in vad_params: 429 | vad_config.min_silence_duration_ms = vad_params["min_silence_duration_ms"] 430 | if "speech_pad_ms" in vad_params: 431 | vad_config.speech_pad_ms = vad_params["speech_pad_ms"] 432 | 433 | # Load ONNX VAD configuration from metadata 434 | vad_metadata_path = "models/whisper_vad_metadata.json" 435 | vad_config.onnx_model_path = "models/whisper_vad.onnx" 436 | vad_config.onnx_metadata_path = vad_metadata_path 437 | 438 | # Read model configuration from metadata JSON if it exists 439 | if os.path.exists(vad_metadata_path): 440 | try: 441 | with open(vad_metadata_path, 'r') as f: 442 | metadata = json.load(f) 443 | 444 | # Load model configuration from metadata 445 | vad_config.whisper_model_name = metadata.get("whisper_model_name", "openai/whisper-base") 446 | vad_config.frame_duration_ms = metadata.get("frame_duration_ms", 20) 447 | vad_config.chunk_duration_ms = metadata.get("total_duration_ms", 30000) 448 | 449 | logger.info(_("warnings.loaded_vad_config", path=vad_metadata_path)) 450 | except Exception as e: 451 | logger.warning(_("warnings.failed_load_vad", path=vad_metadata_path, error=e)) 452 | logger.warning(_("warnings.using_default_vad")) 453 | # Fallback to defaults 454 | vad_config.whisper_model_name = "openai/whisper-base" 455 | vad_config.frame_duration_ms = 20 456 | 
vad_config.chunk_duration_ms = 30000 457 | else: 458 | # Use defaults if metadata file doesn't exist 459 | logger.warning(_("warnings.vad_file_not_found", path=vad_metadata_path)) 460 | logger.warning(_("warnings.using_default_vad")) 461 | vad_config.whisper_model_name = "openai/whisper-base" 462 | vad_config.frame_duration_ms = 20 463 | vad_config.chunk_duration_ms = 30000 464 | 465 | # Hardcoded runtime configuration 466 | vad_config.force_cpu = False 467 | vad_config.num_threads = 8 468 | 469 | # Inject VAD with progress callback 470 | inject_vad(model_id=vad_model, config=vad_config, progress_callback=self._vad_progress_callback) 471 | self.vad_injected = True 472 | logger.info(_("info.vad_activated", threshold=vad_config.threshold)) 473 | 474 | def generates(self, base_dirs): 475 | if len(base_dirs) == 0: 476 | logger.warning(_("warnings.provide_directories")) 477 | return 478 | 479 | tasks = self._scan(base_dirs) 480 | if len(tasks) == 0: 481 | logger.info(_("info.no_files_found")) 482 | return 483 | 484 | logger.info(_("tasks.translation", count=len(tasks))) 485 | logger.info(_("info.loading_whisper")) 486 | 487 | try: 488 | model = WhisperModel(self.model_name_or_path, device=self.device, compute_type=self.compute_type) 489 | logger.info(_("info.model_precision").format(precision=self.compute_type, device=self.device)) 490 | 491 | # Setup batched inference if enabled 492 | batched_model = None 493 | batch_size_to_use = self.batch_size 494 | 495 | if self.enable_batching: 496 | try: 497 | batched_model = BatchedInferencePipeline(model=model) 498 | 499 | # Auto-detect batch size if not specified 500 | if batch_size_to_use == 0 and len(tasks) > 0: 501 | # Use the first audio file as sample for testing 502 | batch_size_to_use = self._find_executable_batch_size( 503 | model, 504 | tasks[0].audio_path, 505 | min_batch_size=1, 506 | max_batch_size=self.max_batch_size 507 | ) 508 | 509 | if batch_size_to_use == 0: 510 | logger.warning("Could not find suitable batch size. Falling back to non-batched mode.") 511 | batched_model = None 512 | 513 | if batched_model and batch_size_to_use > 0: 514 | logger.info(f"Using batched inference with batch size: {batch_size_to_use}") 515 | 516 | except Exception as e: 517 | logger.warning(f"Failed to setup batched inference: {str(e)}. Falling back to non-batched mode.") 518 | batched_model = None 519 | 520 | for i, task in enumerate(tasks): 521 | logger.info(_("info.translating", current=i + 1, total=len(tasks), path=task.audio_path)) 522 | 523 | # Use batched or regular inference 524 | if batched_model and batch_size_to_use > 0: 525 | # Use auto-retry with batch size reduction on OOM 526 | # This mimics HuggingFace Accelerate's find_executable_batch_size behavior 527 | try: 528 | _segments, info, actual_batch_size = self._transcribe_with_auto_batch_size( 529 | batched_model, 530 | task.audio_path, 531 | starting_batch_size=batch_size_to_use 532 | ) 533 | # Update batch_size_to_use if it was auto-adjusted 534 | if actual_batch_size < batch_size_to_use: 535 | logger.info(f"Batch size auto-adjusted from {batch_size_to_use} to {actual_batch_size}") 536 | batch_size_to_use = actual_batch_size 537 | except Exception as e: 538 | logger.warning(f"Batched inference failed: {str(e)}. 
Falling back to non-batched mode.") 539 | # Fallback to non-batched 540 | _segments, info = model.transcribe( 541 | task.audio_path, 542 | **self.generation_config, 543 | ) 544 | else: 545 | _segments, info = model.transcribe( 546 | task.audio_path, 547 | **self.generation_config, 548 | ) 549 | 550 | if info.duration == info.duration_after_vad or info.duration_after_vad == 0: 551 | logger.info(_("info.duration", duration=format_duration(info.duration))) 552 | else: 553 | rate = info.duration_after_vad / info.duration 554 | logger.info(_("info.duration_filtered", 555 | original=format_duration(info.duration), 556 | filtered=format_duration(info.duration_after_vad), 557 | percent=format_percentage(rate))) 558 | 559 | segments = [] 560 | for _segment in _segments: 561 | segment = Segment( 562 | start=int(_segment.start*1_000), 563 | end=int(_segment.end*1_000), 564 | text=_segment.text.strip(), 565 | ) 566 | segments.append(segment) 567 | logger.debug(f"[{SubWriter.lrc_timestamp(segment.start)} --> " 568 | f"{SubWriter.lrc_timestamp(segment.end)}] {segment.text}") 569 | 570 | segments = merge_segments(segments) 571 | os.makedirs(os.path.dirname(task.sub_prefix), exist_ok=True) 572 | for sub_suffix in task.sub_formats: 573 | sub_path = f"{task.sub_prefix}.{sub_suffix}" 574 | logger.info(_("info.writing", path=sub_path)) 575 | self.sub_writers[sub_suffix](segments, sub_path) 576 | 577 | finally: 578 | # Clean up VAD injection 579 | if self.vad_injected: 580 | uninject_vad() 581 | logger.info(_("info.vad_deactivated")) 582 | 583 | def _find_executable_batch_size(self, model, sample_audio_path, min_batch_size=1, max_batch_size=64): 584 | """ 585 | Find the maximum executable batch size for batched inference. 586 | Starts from max_batch_size and works down exponentially on OOM. 587 | 588 | Args: 589 | model: WhisperModel instance 590 | sample_audio_path: Path to a sample audio file for testing 591 | min_batch_size: Minimum batch size to try 592 | max_batch_size: Maximum batch size to try 593 | 594 | Returns: 595 | Optimal batch size that fits in memory 596 | """ 597 | if not self.enable_batching: 598 | return 0 599 | 600 | logger.info(_("batch.finding_optimal", min_size=min_batch_size, max_size=max_batch_size)) 601 | 602 | # Start from max and work down on failure (like HuggingFace Accelerate) 603 | current_batch_size = max_batch_size 604 | 605 | while current_batch_size >= min_batch_size: 606 | try: 607 | logger.info(_("batch.testing_size", size=current_batch_size)) 608 | 609 | # Try to create batched pipeline with this batch size 610 | batched_model = BatchedInferencePipeline(model=model) 611 | 612 | # Test transcription with this batch size 613 | # Note: batch_size is passed separately to BatchedInferencePipeline.transcribe() 614 | # It's NOT part of generation_config 615 | segments, info = batched_model.transcribe( 616 | sample_audio_path, 617 | batch_size=current_batch_size, # batch_size is a separate parameter 618 | **self.generation_config # generation_config doesn't include batch_size 619 | ) 620 | 621 | # Force evaluation by converting to list 622 | list(segments) 623 | 624 | # Success! 
This batch size works 625 | logger.info(_("batch.size_successful", size=current_batch_size)) 626 | logger.info(_("batch.optimal_found", size=current_batch_size)) 627 | return current_batch_size 628 | 629 | except RuntimeError as e: 630 | # If OOM, reduce batch size exponentially 631 | error_msg = str(e) 632 | if "out of memory" in error_msg.lower() or "oom" in error_msg.lower(): 633 | logger.warning(_("batch.oom_error", size=current_batch_size)) 634 | else: 635 | logger.warning(_("batch.runtime_error", size=current_batch_size, error=error_msg)) 636 | 637 | # Reduce batch size by half (exponential backoff) 638 | new_batch_size = current_batch_size // 2 639 | 640 | # Ensure we reduce by at least 1 641 | if new_batch_size == current_batch_size: 642 | new_batch_size = current_batch_size - 1 643 | 644 | if new_batch_size < min_batch_size: 645 | logger.error(_("batch.no_suitable_size", min_size=min_batch_size)) 646 | return 0 647 | 648 | logger.info(_("batch.reducing_size", old_size=current_batch_size, new_size=new_batch_size)) 649 | current_batch_size = new_batch_size 650 | 651 | except Exception as e: 652 | logger.warning(_("batch.unexpected_error", size=current_batch_size, error=str(e))) 653 | 654 | # Reduce batch size by half on unexpected errors too 655 | new_batch_size = current_batch_size // 2 656 | if new_batch_size < min_batch_size: 657 | return 0 658 | current_batch_size = new_batch_size 659 | 660 | # Should not reach here 661 | logger.error(_("batch.no_suitable_size", min_size=min_batch_size)) 662 | return 0 663 | 664 | def _transcribe_with_auto_batch_size(self, batched_model, audio_path, starting_batch_size=None): 665 | """ 666 | Transcribe with automatic batch size reduction on OOM. 667 | Similar to HuggingFace Accelerate's find_executable_batch_size decorator. 668 | 669 | This function automatically retries with smaller batch sizes if OOM occurs, 670 | implementing the same behavior as Accelerate's find_executable_batch_size. 671 | 672 | Args: 673 | batched_model: BatchedInferencePipeline instance 674 | audio_path: Path to audio file 675 | starting_batch_size: Initial batch size to try (uses self.batch_size if not specified) 676 | 677 | Returns: 678 | Tuple of (segments, info, actual_batch_size_used) 679 | """ 680 | batch_size = starting_batch_size or self.batch_size or 32 681 | min_batch_size = 1 682 | 683 | while batch_size >= min_batch_size: 684 | try: 685 | logger.debug(_("batch.attempting_transcription", size=batch_size)) 686 | 687 | # Try transcription with current batch size 688 | segments, info = batched_model.transcribe( 689 | audio_path, 690 | batch_size=batch_size, 691 | **self.generation_config 692 | ) 693 | 694 | # Success! 
Return results with the batch size that worked 695 | if batch_size < (starting_batch_size or self.batch_size or 32): 696 | logger.info(_("batch.auto_adjusted", size=batch_size)) 697 | 698 | return segments, info, batch_size 699 | 700 | except RuntimeError as e: 701 | if "out of memory" in str(e).lower() or "oom" in str(e).lower(): 702 | # Reduce batch size by 0.8 (20% reduction, similar to Accelerate's 0.9 but more aggressive) 703 | new_batch_size = int(batch_size * 0.8) 704 | 705 | # Ensure we reduce by at least 1 706 | if new_batch_size == batch_size: 707 | new_batch_size = batch_size - 1 708 | 709 | logger.warning(_("batch.oom_reducing", old_size=batch_size, new_size=new_batch_size)) 710 | 711 | batch_size = new_batch_size 712 | 713 | if batch_size < min_batch_size: 714 | logger.error(_("batch.cannot_run_min", min_size=min_batch_size)) 715 | raise RuntimeError(_("batch.inference_failed", min_size=min_batch_size)) from e 716 | else: 717 | # Not an OOM error, re-raise 718 | raise 719 | 720 | # Should not reach here 721 | raise RuntimeError("Failed to find executable batch size") 722 | 723 | def _scan(self, base_dirs) -> list[InferenceTask]: 724 | tasks: list[InferenceTask] = [] 725 | 726 | def process(base_path, audio_path): 727 | nonlocal tasks 728 | p = Path(audio_path) 729 | suffix = p.suffix.lower().lstrip('.') 730 | 731 | logger.debug(_("debug.processing", path=audio_path)) 732 | logger.debug(_("debug.file_suffix", suffix=suffix)) 733 | logger.debug(_("debug.valid_suffixes", suffixes=self.audio_suffixes)) 734 | 735 | if suffix not in self.audio_suffixes: 736 | logger.debug(_("debug.skipped_suffix", suffix=suffix)) 737 | return 738 | 739 | rel_path = p.relative_to(base_path) 740 | abs_path = Path(os.path.join(self.output_dir or base_path, rel_path)) 741 | sub_formats = [] 742 | 743 | for suffix in self.sub_formats: 744 | sub_path = abs_path.parent / f"{abs_path.stem}.{suffix}" 745 | if sub_path.exists() and not self.overwrite: 746 | logger.debug(_("debug.subtitle_exists", path=sub_path)) 747 | continue 748 | sub_formats.append(suffix) 749 | 750 | if len(sub_formats) == 0: 751 | logger.debug(_("debug.skipped_all_exist")) 752 | return 753 | 754 | logger.debug(_("debug.added_task", formats=sub_formats)) 755 | tasks.append(InferenceTask(audio_path, str(abs_path.parent / abs_path.stem), sub_formats)) 756 | 757 | for base_dir in base_dirs: 758 | # Expand user home directory 759 | base_dir = os.path.expanduser(base_dir) 760 | logger.debug(_("debug.scanning", path=base_dir)) 761 | 762 | parent_dir = os.path.dirname(base_dir) 763 | if os.path.isdir(base_dir): 764 | for root, dirs, files in os.walk(base_dir, topdown=True): 765 | for file in files: 766 | process(parent_dir, os.path.join(root, file)) 767 | else: 768 | process(parent_dir, base_dir) 769 | 770 | logger.info(_("files.found", count=len(tasks))) 771 | return tasks 772 | 773 | 774 | def diagnose_environment(): 775 | """Run comprehensive environment diagnostics for debugging""" 776 | print("=" * 60) 777 | print("ENVIRONMENT DIAGNOSTICS") 778 | print("=" * 60) 779 | 780 | # System info 781 | print("\n1. System Information:") 782 | print(f" Platform: {platform.system()}") 783 | print(f" Architecture: {platform.machine()}") 784 | print(f" Python: {sys.version}") 785 | print(f" Executable: {sys.executable}") 786 | print(f" Frozen: {getattr(sys, 'frozen', False)}") 787 | 788 | if getattr(sys, 'frozen', False): 789 | print(f" Bundle Dir: {getattr(sys, '_MEIPASS', 'Unknown')}") 790 | 791 | # CUDA environment 792 | print("\n2. 
CUDA Environment Variables:") 793 | cuda_vars = ['CUDA_HOME', 'CUDA_PATH', 'CUDA_ROOT', 'CUDNN_HOME', 'LD_LIBRARY_PATH', 'PATH'] 794 | for var in cuda_vars: 795 | value = os.environ.get(var, 'Not set') 796 | if var == 'PATH' and value != 'Not set': 797 | # Just show cuda-related paths 798 | cuda_paths = [p for p in value.split(os.pathsep) if 'cuda' in p.lower() or 'nvidia' in p.lower()] 799 | value = os.pathsep.join(cuda_paths) if cuda_paths else 'No CUDA paths in PATH' 800 | print(f" {var}: {value}") 801 | 802 | # Check for nvidia-smi 803 | print("\n3. NVIDIA GPU Detection:") 804 | try: 805 | result = subprocess.run(['nvidia-smi', '--query-gpu=name,driver_version,cuda_version', '--format=csv,noheader'], 806 | capture_output=True, text=True, timeout=5) 807 | if result.returncode == 0: 808 | print(f" GPU Info: {result.stdout.strip()}") 809 | else: 810 | print(" nvidia-smi failed") 811 | except FileNotFoundError: 812 | print(" nvidia-smi not found in PATH") 813 | except Exception as e: 814 | print(f" Error: {e}") 815 | 816 | 817 | def check_onnxruntime_detailed(): 818 | """Detailed ONNX Runtime check for debugging""" 819 | print("\n" + "=" * 60) 820 | print("ONNX RUNTIME DIAGNOSTICS") 821 | print("=" * 60) 822 | 823 | try: 824 | import onnxruntime as ort 825 | print(f"\n✓ onnxruntime imported successfully") 826 | print(f" Version: {ort.__version__}") 827 | print(f" Location: {ort.__file__}") 828 | 829 | # Check available providers 830 | providers = ort.get_available_providers() 831 | print(f"\n Available providers: {providers}") 832 | 833 | # Check for GPU support 834 | has_cuda = 'CUDAExecutionProvider' in providers 835 | has_tensorrt = 'TensorrtExecutionProvider' in providers 836 | has_directml = 'DmlExecutionProvider' in providers 837 | 838 | print(f"\n GPU Support:") 839 | print(f" CUDA: {'✓ Available' if has_cuda else '✗ Not Available'}") 840 | print(f" TensorRT: {'✓ Available' if has_tensorrt else '✗ Not Available'}") 841 | print(f" DirectML: {'✓ Available' if has_directml else '✗ Not Available'}") 842 | 843 | if not has_cuda and sys.platform != 'darwin': 844 | print("\n ⚠️ CUDA not available. This might be because:") 845 | print(" 1. onnxruntime (CPU) is installed instead of onnxruntime-gpu") 846 | print(" 2. CUDA libraries are missing or not in PATH") 847 | print(" 3. Incompatible CUDA/cuDNN versions") 848 | 849 | # Check bundled libraries if frozen 850 | if getattr(sys, 'frozen', False): 851 | bundle_dir = getattr(sys, '_MEIPASS', '') 852 | print(f"\n Checking bundled libraries in: {bundle_dir}") 853 | 854 | cuda_libs = [] 855 | onnx_libs = [] 856 | 857 | try: 858 | for root, dirs, files in os.walk(bundle_dir): 859 | for file in files: 860 | if any(x in file.lower() for x in ['cuda', 'cudnn', 'cublas', 'cufft']): 861 | cuda_libs.append(file) 862 | elif 'onnx' in file.lower(): 863 | onnx_libs.append(file) 864 | 865 | if cuda_libs: 866 | print(f"\n Found {len(cuda_libs)} CUDA-related libraries:") 867 | for lib in cuda_libs[:10]: 868 | print(f" - {lib}") 869 | if len(cuda_libs) > 10: 870 | print(f" ... and {len(cuda_libs) - 10} more") 871 | else: 872 | print("\n ⚠️ No CUDA libraries found in bundle") 873 | except Exception as e: 874 | print(f" Error scanning bundle: {e}") 875 | 876 | return True 877 | 878 | except ImportError as e: 879 | print(f"\n✗ Failed to import onnxruntime: {e}") 880 | print("\nSuggestions:") 881 | print(" 1. Install onnxruntime-gpu for GPU support") 882 | print(" 2. 
Check if package is bundled correctly in PyInstaller") 883 | return False 884 | except Exception as e: 885 | print(f"\n✗ Error during ONNX Runtime check: {e}") 886 | traceback.print_exc() 887 | return False 888 | 889 | 890 | def test_vad_initialization(): 891 | """Test VAD model initialization for debugging""" 892 | print("\n" + "=" * 60) 893 | print("VAD MODEL TEST") 894 | print("=" * 60) 895 | 896 | try: 897 | from .vad_manager import WhisperVADOnnxWrapper, VadModelManager 898 | print("✓ VAD modules imported successfully") 899 | 900 | # Check for model files 901 | model_paths = [ 902 | 'models/whisper_vad.onnx', 903 | 'models/vad/whisper_vad.onnx', 904 | os.path.join(os.path.dirname(sys.executable), 'models', 'whisper_vad.onnx'), 905 | ] 906 | 907 | # If frozen, also check in bundle directory 908 | if getattr(sys, 'frozen', False): 909 | bundle_dir = getattr(sys, '_MEIPASS', '') 910 | model_paths.extend([ 911 | os.path.join(bundle_dir, 'models', 'whisper_vad.onnx'), 912 | os.path.join(bundle_dir, 'whisper_vad.onnx'), 913 | ]) 914 | 915 | model_path = None 916 | print("\nSearching for VAD model:") 917 | for path in model_paths: 918 | exists = os.path.exists(path) 919 | print(f" {path}: {'Found' if exists else 'Not found'}") 920 | if exists and model_path is None: 921 | model_path = path 922 | 923 | if model_path: 924 | print(f"\n✓ Using model: {model_path}") 925 | 926 | # Try to initialize 927 | print("\nTesting VAD initialization (GPU if available):") 928 | try: 929 | wrapper = WhisperVADOnnxWrapper( 930 | model_path=model_path, 931 | force_cpu=False, 932 | num_threads=1 933 | ) 934 | print(f" ✓ Device: {wrapper.device}") 935 | print(f" ✓ Providers: {wrapper.session.get_providers()}") 936 | except Exception as e: 937 | print(f" ✗ Error: {e}") 938 | 939 | # Test with forced CPU for comparison 940 | print("\nTesting VAD initialization (Force CPU):") 941 | try: 942 | wrapper_cpu = WhisperVADOnnxWrapper( 943 | model_path=model_path, 944 | force_cpu=True, 945 | num_threads=1 946 | ) 947 | print(f" ✓ Device: {wrapper_cpu.device}") 948 | except Exception as e: 949 | print(f" ✗ Error: {e}") 950 | else: 951 | print("\n✗ No VAD model file found") 952 | print(" Download the model using download_models.py") 953 | 954 | except ImportError as e: 955 | print(f"✗ Failed to import VAD modules: {e}") 956 | except Exception as e: 957 | print(f"✗ Error during VAD test: {e}") 958 | traceback.print_exc() 959 | 960 | 961 | def launch_debug_console(): 962 | """Launch interactive Python console for debugging""" 963 | print("\n" + "=" * 60) 964 | print("INTERACTIVE DEBUG CONSOLE") 965 | print("=" * 60) 966 | print("\nYou now have access to an interactive Python console.") 967 | print("\nAvailable commands:") 968 | print(" diagnose() - Run environment diagnostics") 969 | print(" check_onnx() - Check ONNX Runtime status") 970 | print(" test_vad() - Test VAD initialization") 971 | print(" import X - Try importing any module") 972 | print(" exit() or Ctrl+D - Exit console and continue") 973 | print("\nUseful variables:") 974 | print(" sys.path - Python module search paths") 975 | print(" os.environ - Environment variables") 976 | print(" sys.frozen - Check if running from PyInstaller") 977 | print("=" * 60 + "\n") 978 | 979 | # Create namespace with useful functions 980 | namespace = { 981 | 'diagnose': diagnose_environment, 982 | 'check_onnx': check_onnxruntime_detailed, 983 | 'test_vad': test_vad_initialization, 984 | 'sys': sys, 985 | 'os': os, 986 | 'platform': platform, 987 | } 988 | 989 | # Launch interactive 
console 990 | code.InteractiveConsole(locals=namespace).interact(banner="") 991 | 992 | 993 | def main(): 994 | """Main entry point for the script""" 995 | if getattr(sys, 'frozen', False): 996 | os.chdir(os.path.dirname(sys.executable)) 997 | else: 998 | # When run as a module, don't change directory 999 | pass 1000 | 1001 | args = parse_arguments() 1002 | 1003 | # Display open-source notice 1004 | print("=" * 70) 1005 | print("⚠️ 重要声明 / IMPORTANT NOTICE") 1006 | print("=" * 70) 1007 | print("本软件开源于: https://github.com/TransWithAI/Faster-Whisper-TransWithAI-ChickenRice") 1008 | print("开发团队: AI汉化组 (https://t.me/transWithAI)") 1009 | print("任何第三方非免费下载均为智商税") 1010 | print("=" * 70) 1011 | print() 1012 | 1013 | # Check if console mode requested 1014 | if args.console: 1015 | # Run diagnostics first 1016 | diagnose_environment() 1017 | check_onnxruntime_detailed() 1018 | test_vad_initialization() 1019 | 1020 | # Launch interactive console 1021 | launch_debug_console() 1022 | 1023 | # After console exits, ask if user wants to continue with normal operation 1024 | print("\nDebug console exited.") 1025 | try: 1026 | response = input("Continue with normal inference? (y/n): ").strip().lower() 1027 | if response != 'y': 1028 | print("Exiting...") 1029 | sys.exit(0) 1030 | except (KeyboardInterrupt, EOFError): 1031 | print("\nExiting...") 1032 | sys.exit(0) 1033 | 1034 | # Normal operation 1035 | logger.setLevel(args.log_level) 1036 | 1037 | # Add file logging to latest.log in current working directory 1038 | # This helps users report issues by providing a log file 1039 | log_file_path = os.path.join(os.getcwd(), 'latest.log') 1040 | file_handler = logging.FileHandler(log_file_path, mode='w', encoding='utf-8') 1041 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) 1042 | file_handler.setLevel(args.log_level) 1043 | 1044 | # Add file handler to the module logger 1045 | logger.addHandler(file_handler) 1046 | 1047 | logger.info(_("info.logging_to_file").format(path=log_file_path)) 1048 | logger.info(_("info.program_version").format(version="v1.3")) 1049 | logger.info(_("info.python_version").format(version=sys.version)) 1050 | logger.info(_("info.platform").format(platform=platform.platform())) 1051 | logger.info(_("info.arguments").format(args=vars(args))) 1052 | 1053 | if len(args.base_dirs) == 0: 1054 | logger.warning(_("warnings.drag_files")) 1055 | sys.exit(1) 1056 | 1057 | inference = Inference(args) 1058 | inference.generates(args.base_dirs) 1059 | sys.exit(0) 1060 | 1061 | 1062 | if __name__ == '__main__': 1063 | # When run directly as a script 1064 | import os 1065 | os.chdir(os.path.dirname(__file__)) 1066 | main() 1067 | --------------------------------------------------------------------------------
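For reference, a minimal sketch of the OOM-driven retry pattern that Inference._transcribe_with_auto_batch_size and _find_executable_batch_size implement above, condensed to the faster_whisper calls already used in infer.py (WhisperModel, BatchedInferencePipeline, and batch_size passed as its own transcribe() argument). The model name "large-v2", the file sample.wav, and all numeric values are illustrative placeholders, not this project's defaults:

# Sketch only: auto-retrying batched transcription on out-of-memory errors.
# Placeholders: model name, audio path, batch sizes, VAD thresholds.
from faster_whisper import WhisperModel, BatchedInferencePipeline

generation_config = {
    "language": "ja",
    "task": "translate",
    "vad_filter": True,
    # Same keys that _load_generation_config() forwards as vad_parameters.
    "vad_parameters": {"threshold": 0.5, "min_silence_duration_ms": 500},
}

def transcribe_with_retry(audio_path, starting_batch_size=8, min_batch_size=1):
    model = WhisperModel("large-v2", device="cuda", compute_type="float16")
    batched = BatchedInferencePipeline(model=model)
    batch_size = starting_batch_size
    while batch_size >= min_batch_size:
        try:
            # batch_size is a separate transcribe() argument; it is never
            # part of generation_config.
            segments, info = batched.transcribe(
                audio_path, batch_size=batch_size, **generation_config
            )
            return list(segments), info, batch_size  # list() forces the lazy generator
        except RuntimeError as exc:
            if "out of memory" not in str(exc).lower():
                raise
            # Shrink to roughly 80% of the current size (and by at least 1), then retry.
            batch_size = min(batch_size - 1, int(batch_size * 0.8))
    raise RuntimeError("No executable batch size found")

if __name__ == "__main__":
    segments, info, used_batch_size = transcribe_with_retry("sample.wav")
    print(used_batch_size, info.duration, len(segments))

Forcing the segment generator with list() inside the try block matters: transcription is evaluated lazily, so an out-of-memory error may only surface while segments are being consumed.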
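On the VAD side, _setup_vad_injection() reads models/whisper_vad_metadata.json and falls back to openai/whisper-base, 20 ms frames, and 30000 ms chunks when that file is missing or unreadable. The sketch below only illustrates the metadata schema implied by those reads, using the documented fallback values; what download_models.py actually ships may differ:

# Sketch: write a metadata file in the shape _setup_vad_injection() expects.
# Keys and values mirror the fallback defaults in the code above.
import json
import os

metadata = {
    "whisper_model_name": "openai/whisper-base",
    "frame_duration_ms": 20,
    "total_duration_ms": 30000,  # read into vad_config.chunk_duration_ms
}

os.makedirs("models", exist_ok=True)
with open("models/whisper_vad_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)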