├── .gitignore
├── requirements.txt
├── media
│   ├── image1.png
│   ├── image2.png
│   └── image3.png
├── main.py
├── LICENSE
├── transcriber.py
├── single_shot_fixer.py
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
examples
.idea
__pycache__
.env
venv
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
openai==1.51.0
python-dotenv==1.0.1
--------------------------------------------------------------------------------
/media/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orcaman/improving_whisper_transcriptions_with_gpt4o/HEAD/media/image1.png
--------------------------------------------------------------------------------
/media/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orcaman/improving_whisper_transcriptions_with_gpt4o/HEAD/media/image2.png
--------------------------------------------------------------------------------
/media/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orcaman/improving_whisper_transcriptions_with_gpt4o/HEAD/media/image3.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os
import sys
from transcriber import VideoTranscriber
import single_shot_fixer
import logging
from dotenv import load_dotenv

load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def main():
    if len(sys.argv) != 2:
        logging.error("Usage: python main.py <video_file_path>")
        sys.exit(1)

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable is not set")

    video_path = sys.argv[1]
    transcriber = VideoTranscriber(video_path)
    transcription, transcription_path = transcriber.process()

    logging.info("Fixing transcription...")
    improved = single_shot_fixer.fix(file_path=transcription_path, api_key=api_key)
    logging.info("Improved transcription:")
    logging.info(improved)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2024 Or Hiltch

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/transcriber.py:
--------------------------------------------------------------------------------
import os
import subprocess
from pathlib import Path
from typing import Tuple
import logging

from openai import OpenAI

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class VideoTranscriber:
    def __init__(self, video_path: str):
        self.video_path = Path(video_path)
        self.audio_path = self.video_path.with_suffix('.mp3')
        self.whisper_subs_path = self.video_path.parent / "whisper_subs"
        self.transcription_path = self.whisper_subs_path / "transcription.txt"

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable is not set")
        self.client = OpenAI(api_key=api_key)

    def extract_audio(self) -> None:
        """Extract audio from video file if it doesn't already exist."""
        if self.audio_path.exists():
            logging.info('Audio already extracted')
            return

        command = [
            'ffmpeg',
            '-i', str(self.video_path),
            '-q:a', '0',
            '-map', 'a',
            str(self.audio_path)
        ]

        try:
            subprocess.run(command, check=True, capture_output=True, text=True)
            logging.info(f"Audio extracted successfully: {self.audio_path}")
        except subprocess.CalledProcessError as e:
            logging.error(f"Error extracting audio: {e}")
            logging.error(f"FFMPEG stderr: {e.stderr}")
            raise

    def transcribe(self) -> Tuple[str, Path]:
        """Transcribe audio file if transcription doesn't already exist."""
        if self.transcription_path.exists():
            logging.info('Transcription already exists')
            return self.transcription_path.read_text(), self.transcription_path

        if not self.audio_path.exists():
            logging.warning("Audio file not found. Extracting audio first.")
            self.extract_audio()

        try:
            with self.audio_path.open("rb") as audio_file:
                transcription = self.client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="text"
                )

            self.whisper_subs_path.mkdir(parents=True, exist_ok=True)
            self.transcription_path.write_text(transcription)
            logging.info(f"Transcription saved to: {self.transcription_path}")

            return transcription, self.transcription_path
        except Exception as e:
            logging.error(f"Error during transcription: {str(e)}")
            raise

    def process(self) -> Tuple[str, Path]:
        """Extract audio and transcribe in one step."""
        self.extract_audio()
        return self.transcribe()
--------------------------------------------------------------------------------
/single_shot_fixer.py:
--------------------------------------------------------------------------------
import json
from openai import OpenAI
from pathlib import Path
import logging
from typing import Dict, Any

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class TranscriptionImprover:
    def __init__(self, api_key: str):
        self.client = OpenAI(api_key=api_key)
        self.model = "gpt-4o"

    def suggest_improvements(self, file_path: Path) -> Dict[str, Any]:
        try:
            original_transcription = self._read_file(file_path)
            video_topic = self._get_video_topic(original_transcription)
            suggestions = self._get_suggestions(original_transcription, video_topic)
            return suggestions
        except Exception as e:
            logging.error(f"Error in suggest_improvements: {str(e)}")
            raise

    def _read_file(self, file_path: Path) -> str:
        try:
            with file_path.open('r') as file:
                return file.read()
        except IOError as e:
            logging.error(f"Error reading file {file_path}: {str(e)}")
            raise

    def _get_video_topic(self, transcription: str) -> str:
        prompt = f"The following text is a transcription of a video. What is the video about?\nTranscription:\n{transcription}"
        response = self._create_chat_completion(prompt)
        return response.choices[0].message.content.strip()

    def _get_suggestions(self, transcription: str, video_topic: str) -> Dict[str, Any]:
        prompt = f"""The following text is a transcription of a video. The video is about "{video_topic}".
Based on this information, suggest corrections to possible mistakes in the transcription.

Generate JSON of suggestions in the following format:

{{
    "word_to_replace_1": "suggested_word_1",
    "word_to_replace_2": "suggested_word_2",
    ...
}}

Transcription:
{transcription}
"""
        response = self._create_chat_completion(prompt, response_format={"type": "json_object"})
        return json.loads(response.choices[0].message.content.strip())

    def _create_chat_completion(self, prompt: str, response_format: Dict[str, str] = None) -> Any:
        messages = [
            {"role": "system", "content": "You are an AI assistant that improves video transcriptions."},
            {"role": "user", "content": prompt}
        ]
        kwargs = {
            "model": self.model,
            "messages": messages,
            "n": 1,
            "temperature": 0.5,
        }
        if response_format:
            kwargs["response_format"] = response_format
        return self.client.chat.completions.create(**kwargs)


def save_suggestions(file_path: Path, suggestions: Dict[str, Any]) -> Path:
    output_path = file_path.with_suffix('.suggestions.json')
    with output_path.open('w') as file:
        json.dump(suggestions, file, indent=2)
    return output_path


def save_improved(file_path: Path, improved_transcription: str) -> Path:
    output_path = file_path.with_suffix('.improved.txt')
    with output_path.open('w') as file:
        file.write(improved_transcription)
    return output_path


def find_and_replace_suggestions(file_path: Path, suggestions: Dict[str, str]) -> str:
    with file_path.open('r') as file:
        original_transcription = file.read()
    for word_to_replace, suggested_word in suggestions.items():
        original_transcription = original_transcription.replace(word_to_replace, suggested_word)
    return original_transcription


def fix(file_path: Path, api_key: str) -> str:
    try:
        improver = TranscriptionImprover(api_key)
        suggestions = improver.suggest_improvements(file_path)
        save_suggestions(file_path, suggestions)
        improved_transcription = find_and_replace_suggestions(file_path, suggestions)
        save_improved(file_path, improved_transcription)
        return improved_transcription
    except Exception as e:
        logging.error(f"Error in fix function: {str(e)}")
        raise
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Improving Whisper Transcriptions with GPT-4o

I was watching the [latest news episode from Whisky.com](https://www.youtube.com/watch?v=rYUOnaPfigg) (where fine spirits meet ™) on YouTube the other day, and noticed that the transcription was really off.

I'm not sure which transcriber YouTube uses to generate its closed captions, but it makes a bunch of mistakes, some of which are obviously related to the whisky domain, while others are general transcription mistakes.

With OpenAI's Whisper transcriber, the results are significantly better, but domain-related errors are still common. The transcriber is missing important context.

Here are a couple of examples:

## Example 1

![Example 1 Image](media/image1.png)

**Expected**: "two **single malts**" (single malt is a common whisky term)

**YouTube transcriber**: "two single molds"
**OpenAI Whisper**: "two single moulds"

## Example 2

![Example 2 Image](media/image2.png)

**Expected**: "the Distel Group and they had **Bunnahabhain**, **Deanston** and Tobermory in their group"

**YouTube transcriber**: "the distel group and they uh uh had buah haban deanston and toomore in their group"
**OpenAI Whisper**: "the Distel Group and they had Bunnehaben, Diensten and Tobermory in their group"

So while it looks like YouTube could improve its captions by using a model similar to Whisper (perhaps using a smaller, weaker model is a conscious decision on their end, given their scale), there is still much room for improvement on top of Whisper's results as well.

Unfortunately, the transcription cannot be handed over as-is to something like GPT, as the model would change the original text so that it no longer matches the video (even with prompt engineering, I could not get the model to leave some pieces of the original text unchanged).

However, we can solve this issue almost entirely by implementing the following pattern to provide the missing context:

![Process Diagram](media/image3.png)

Once the audio has been extracted from the video and transcribed with Whisper, we can feed the transcription to an LLM like GPT-4o to extract the topic.

For example, for the above-mentioned episode, the topic extracted from the transcription was: "The video is a news update from whisky.com, covering various developments and releases in the whisky industry as of September 23, 2024. It includes announcements about new whisky collections, anniversaries, and special editions from well-known distilleries such as Macallan, Glenmorangie, and Johnny Walker, among others. The video also discusses acquisitions in the whisky industry, environmentally friendly packaging innovations, and upcoming live whisky tastings. The news spans regions including Scotland, Ireland, the United States, and Germany, highlighting both new products and industry events."

With this context in hand, we can make another call to the LLM, this time including the dynamically generated context about the video. Using OpenAI's JSON response mode, we can ask the LLM to generate a JSON object of suggested replacements based on the original text and the newly learned context.

For example, for the above episode, the JSON looks like this:

```json
{
  "McAllen": "Macallan",
  "moulds": "malts",
  "Bunnehaben": "Bunnahabhain",
  "Diensten": "Deanston",
  "Cape Vinn": "Capevin",
  "Kings Bar & Distillery": "Kingsbarns Distillery",
  "Kings Bar's Coal Town": "Kingsbarns Coal Town",
  "Oktimo": "Octomore",
  "Eila": "Islay",
  "Tom & Tal": "Tomintoul",
  "Schippers Riegel": "Schipper's Riegel",
  "Pungent": "Puncheon",
  "cream sherry": "Cream Sherry",
  "chinkampin": "Chinquapin",
  "Sluers": "Slyrs",
  "Aluvalia": "Aluwalia"
}
```

The whisky geeks amongst you will notice that the suggestions are really spot on for the whisky domain.
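
To make the two LLM calls concrete, here is a minimal sketch of the pattern. It mirrors what `single_shot_fixer.py` in this repo does, with the system prompt, file handling, and error handling stripped out; the helper name and the prompt wording here are illustrative, not the repo's exact prompts:

```python
import json

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def suggest_corrections(transcription: str) -> dict:
    # Call 1: extract the topic of the video from the raw transcription.
    topic = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": "The following text is a transcription of a video. "
                       f"What is the video about?\n\n{transcription}",
        }],
    ).choices[0].message.content

    # Call 2: feed the topic back in as context and ask for corrections.
    # JSON mode constrains the reply to a parseable JSON object.
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[{
            "role": "user",
            "content": f'This transcription is from a video about "{topic}". '
                       "Suggest corrections to likely transcription mistakes "
                       "as a JSON object mapping each misheard word to its "
                       f"replacement.\n\nTranscription:\n{transcription}",
        }],
    )
    return json.loads(response.choices[0].message.content)
```

Because the second call runs in JSON mode, the reply reliably parses, and the resulting dictionary can be applied mechanically to the original text.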

Finally, we can simply run a find & replace over the original text using the suggested keywords, ending up with a far more accurate transcription!

## Implementation

This project is a Python-based implementation of the above: it transcribes video files using OpenAI's Whisper model and improves the resulting transcription using OpenAI's GPT-4o model.

### Features

- Extracts audio from video files (FFmpeg required)
- Transcribes the audio using OpenAI's Whisper model
- Improves the transcription using GPT-4o
- Generates suggestions for corrections as JSON
- Applies the corrections to the original transcription

### Prerequisites

- Python 3.8+
- FFmpeg (for audio extraction)
- OpenAI API key

### Installation

1. Clone this repository:

```
git clone https://github.com/orcaman/improving_whisper_transcriptions_with_gpt4o.git
cd improving_whisper_transcriptions_with_gpt4o
```

2. Install the required packages:

```
pip install -r requirements.txt
```

You also need FFmpeg installed to extract the audio from a video file.

3. Set up your OpenAI API key:

Create a `.env` file in the project root and add your API key:

```
OPENAI_API_KEY=your_api_key_here
```

### Usage

Run the main script with the path to your video file:

```
python main.py <video_file_path>
```

The script will:

1. Extract audio from the video
2. Transcribe the audio
3. Improve the transcription
4. Save the improved transcription

### File Structure

- `main.py`: The main script that orchestrates the transcription and improvement process
- `transcriber.py`: Contains the `VideoTranscriber` class for audio extraction and transcription
- `single_shot_fixer.py`: Contains the `TranscriptionImprover` class and related functions for improving transcriptions
- `requirements.txt`: Lists the required Python packages

### Output

The script generates the following files (the audio file is saved next to the video, and the text files go into a `whisper_subs` directory alongside it):

- `<video_name>.mp3`: Extracted audio file
- `whisper_subs/transcription.txt`: Original transcription
- `whisper_subs/transcription.suggestions.json`: Suggested improvements
- `whisper_subs/transcription.improved.txt`: Improved transcription

### Error Handling

The script includes error handling and logging. Check the console output for error messages, or the log file if you have configured logging to a file.

### Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

### License

The code in this repo is licensed under MIT. The media in this repo is compiled from screenshots of the Whisky.com YouTube channel, is used for educational purposes only, and is the property of Whisky.com.

### Acknowledgements

This project uses the following open-source libraries and APIs:

- OpenAI's GPT-4o and Whisper models
- python-dotenv
- FFmpeg (indirectly)
- [Whisky.com video from YouTube](https://www.youtube.com/watch?v=rYUOnaPfigg)
--------------------------------------------------------------------------------