├── README.md ├── requirements.txt └── transcription.py /README.md: -------------------------------------------------------------------------------- 1 | # Insanely Fast Transcription 2 | 3 | This tool provides an easy-to-use interface for transcribing audio from YouTube videos or local audio files using the Insanely Fast Whisper model. It leverages the power of GPU acceleration to provide quick and accurate transcriptions. 4 | 5 | ## Features 6 | 7 | - Download audio from YouTube videos 8 | - Transcribe local audio files 9 | - Utilize GPU acceleration for faster processing 10 | - Support for both Mac (MPS) and NVIDIA (CUDA) GPUs 11 | 12 | ## Requirements 13 | 14 | - Python 3.7+ 15 | - pipx (for installing Insanely Fast Whisper) 16 | - FFmpeg (for audio processing) 17 | 18 | ## Installation 19 | 20 | 1. Clone this repository: 21 | ``` 22 | git clone https://github.com/doriandarko/insanely-fast-whisper-tool.git 23 | cd insanely-fast-whisper-tool 24 | ``` 25 | 26 | 2. Install the required Python packages: 27 | ``` 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | 3. Install Insanely Fast Whisper: 32 | ``` 33 | pipx install insanely-fast-whisper==0.0.15 --force --pip-args="--ignore-requires-python" 34 | ``` 35 | 36 | ## Usage 37 | 38 | Run the script using Python: 39 | ``` 40 | python3 transcription.py 41 | ``` 42 | Follow the prompts to either download a YouTube video or specify a local audio file for transcription. 43 | 44 | ## Mac vs. NVIDIA GPU Usage 45 | 46 | ### Mac with Apple Silicon (M1/M2) 47 | 48 | The script is configured to use the MPS (Metal Performance Shaders) backend on Mac. It uses the following settings: 49 | 50 | - `--device-id mps` 51 | - `--batch-size 4` 52 | 53 | These settings are optimized for Mac devices to avoid out-of-memory issues. 54 | 55 | ### NVIDIA GPUs 56 | 57 | For systems with NVIDIA GPUs, you should modify the `transcribe_audio` function in `nemain.py`: 58 | 59 | 1. Change `--device-id mps` to `--device-id 0` (or the appropriate GPU index) 60 | 2. You can increase `--batch-size` to 24 or higher, depending on your GPU's memory 61 | 62 | ## Notes 63 | 64 | - The Insanely Fast Whisper model used is "openai/whisper-large-v3" 65 | - Transcriptions are saved in the "youtube_transcript" folder 66 | - Downloaded audio files are saved in the "youtube_audio" folder 67 | 68 | ## Troubleshooting 69 | 70 | If you encounter any issues, please ensure that: 71 | - FFmpeg is installed and accessible in your system PATH 72 | - You have the latest version of Insanely Fast Whisper installed 73 | - Your GPU drivers are up to date 74 | 75 | For Mac users, if you face memory issues, try reducing the batch size further. 76 | 77 | ## Acknowledgements 78 | 79 | This tool uses the Insanely Fast Whisper project, which is powered by 🤗 Transformers, Optimum & flash-attn. Special thanks to the OpenAI Whisper team and the Hugging Face Transformers team. 80 | 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | yt-dlp 2 | torch 3 | transformers 4 | optimum 5 | accelerate 6 | -------------------------------------------------------------------------------- /transcription.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import yt_dlp 4 | import re 5 | import glob 6 | import json 7 | 8 | def sanitize_filename(filename): 9 | return re.sub(r'[\\/*?:"<>|]', "", filename) 10 | 11 | def download_youtube_audio(url, output_folder): 12 | ydl_opts = { 13 | 'format': 'bestaudio/best', 14 | 'postprocessors': [{ 15 | 'key': 'FFmpegExtractAudio', 16 | 'preferredcodec': 'wav', 17 | 'preferredquality': '192', 18 | }], 19 | 'outtmpl': os.path.join(output_folder, '%(title)s.%(ext)s'), 20 | } 21 | 22 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 23 | info = ydl.extract_info(url, download=True) 24 | title = sanitize_filename(info['title']) 25 | return find_audio_file(output_folder, title) 26 | 27 | def find_audio_file(folder, title): 28 | pattern = os.path.join(folder, f"{title}.*") 29 | files = glob.glob(pattern) 30 | return files[0] if files else None 31 | 32 | def transcribe_audio(audio_file): 33 | # Set the environment variable to enable CPU fallback 34 | os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' 35 | 36 | command = [ 37 | "insanely-fast-whisper", 38 | "--file-name", audio_file, 39 | "--device-id", "mps", 40 | "--model-name", "openai/whisper-large-v3", 41 | "--batch-size", "4", 42 | "--timestamp", "word" 43 | ] 44 | result = subprocess.run(command, capture_output=True, text=True) 45 | if result.returncode != 0: 46 | print("Error during transcription:") 47 | print(result.stderr) 48 | return None 49 | 50 | # Parse the output manually 51 | output_lines = result.stdout.strip().split('\n') 52 | transcription_text = ' '.join(output_lines[1:]) # Skip the first line which is usually a progress bar 53 | 54 | return { 55 | 'text': transcription_text, 56 | 'raw_output': result.stdout 57 | } 58 | 59 | 60 | def main(): 61 | choice = input("Enter '1' for YouTube URL or '2' for local audio file path: ") 62 | 63 | if choice == '1': 64 | youtube_url = input("Enter the YouTube URL: ") 65 | audio_folder = "youtube_audio" 66 | os.makedirs(audio_folder, exist_ok=True) 67 | print("Downloading and converting YouTube video...") 68 | audio_file = download_youtube_audio(youtube_url, audio_folder) 69 | elif choice == '2': 70 | audio_file = input("Enter the path to the local audio file: ") 71 | else: 72 | print("Invalid choice. Exiting.") 73 | return 74 | 75 | if audio_file is None: 76 | print("Error: Could not find the audio file.") 77 | return 78 | 79 | transcript_folder = "youtube_transcript" 80 | os.makedirs(transcript_folder, exist_ok=True) 81 | 82 | print(f"Audio file to be transcribed: {os.path.abspath(audio_file)}") 83 | 84 | if not os.path.exists(audio_file): 85 | print(f"Error: The audio file '{audio_file}' does not exist.") 86 | return 87 | 88 | print("Transcribing audio...") 89 | transcription = transcribe_audio(audio_file) 90 | 91 | if transcription is None: 92 | print("Transcription failed.") 93 | return 94 | 95 | print("Transcription complete. Results:") 96 | if isinstance(transcription, dict) and 'text' in transcription: 97 | print(transcription['text']) 98 | 99 | video_name = os.path.splitext(os.path.basename(audio_file))[0] 100 | transcript_file = os.path.join(transcript_folder, f"{video_name}.txt") 101 | with open(transcript_file, "w", encoding="utf-8") as f: 102 | f.write(transcription['text']) 103 | print(f"Transcription saved to {transcript_file}") 104 | 105 | # Save the raw output 106 | raw_output_file = os.path.join(transcript_folder, f"{video_name}_raw_output.txt") 107 | with open(raw_output_file, "w", encoding="utf-8") as f: 108 | f.write(transcription['raw_output']) 109 | print(f"Raw transcription output saved to {raw_output_file}") 110 | else: 111 | print("Unexpected transcription format:") 112 | print(transcription) 113 | 114 | if __name__ == "__main__": 115 | main() 116 | --------------------------------------------------------------------------------