├── .gitignore
├── requirements.txt
├── media
│   ├── image1.png
│   ├── image2.png
│   └── image3.png
├── main.py
├── LICENSE
├── transcriber.py
├── single_shot_fixer.py
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
examples
.idea
__pycache__
.env
venv
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
openai==1.51.0
python-dotenv==1.0.1
--------------------------------------------------------------------------------
/media/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orcaman/improving_whisper_transcriptions_with_gpt4o/HEAD/media/image1.png
--------------------------------------------------------------------------------
/media/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orcaman/improving_whisper_transcriptions_with_gpt4o/HEAD/media/image2.png
--------------------------------------------------------------------------------
/media/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orcaman/improving_whisper_transcriptions_with_gpt4o/HEAD/media/image3.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os
import sys
from transcriber import VideoTranscriber
import single_shot_fixer
import logging
from dotenv import load_dotenv

load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def main():
    if len(sys.argv) != 2:
        logging.error("Usage: python main.py <video_file_path>")
        sys.exit(1)

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable is not set")

    video_path = sys.argv[1]
    transcriber = VideoTranscriber(video_path)
    transcription, transcription_path = transcriber.process()

    logging.info("Fixing transcription...")
    improved = single_shot_fixer.fix(file_path=transcription_path, api_key=api_key)
    logging.info("Improved transcription:")
    logging.info(improved)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2024 Or Hiltch

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/transcriber.py:
--------------------------------------------------------------------------------
import os
import subprocess
from pathlib import Path
from typing import Tuple
import logging

from openai import OpenAI

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class VideoTranscriber:
    def __init__(self, video_path: str):
        self.video_path = Path(video_path)
        self.audio_path = self.video_path.with_suffix('.mp3')
        self.whisper_subs_path = self.video_path.parent / "whisper_subs"
        self.transcription_path = self.whisper_subs_path / "transcription.txt"

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable is not set")
        self.client = OpenAI(api_key=api_key)

    def extract_audio(self) -> None:
        """Extract audio from video file if it doesn't already exist."""
        if self.audio_path.exists():
            logging.info('Audio already extracted')
            return

        command = [
            'ffmpeg',
            '-i', str(self.video_path),
            '-q:a', '0',
            '-map', 'a',
            str(self.audio_path)
        ]

        try:
            subprocess.run(command, check=True, capture_output=True, text=True)
            logging.info(f"Audio extracted successfully: {self.audio_path}")
        except subprocess.CalledProcessError as e:
            logging.error(f"Error extracting audio: {e}")
            logging.error(f"FFMPEG stderr: {e.stderr}")
            raise

    def transcribe(self) -> Tuple[str, Path]:
        """Transcribe audio file if transcription doesn't already exist."""
        if self.transcription_path.exists():
            logging.info('Transcription already exists')
            return self.transcription_path.read_text(), self.transcription_path

        if not self.audio_path.exists():
            logging.warning("Audio file not found. Extracting audio first.")
            self.extract_audio()

        try:
            with self.audio_path.open("rb") as audio_file:
                transcription = self.client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="text"
                )

            self.whisper_subs_path.mkdir(parents=True, exist_ok=True)
            self.transcription_path.write_text(transcription)
            logging.info(f"Transcription saved to: {self.transcription_path}")

            return transcription, self.transcription_path
        except Exception as e:
            logging.error(f"Error during transcription: {str(e)}")
            raise

    def process(self) -> Tuple[str, Path]:
        """Extract audio and transcribe in one step."""
        self.extract_audio()
        return self.transcribe()
--------------------------------------------------------------------------------
/single_shot_fixer.py:
--------------------------------------------------------------------------------
import json
from openai import OpenAI
from pathlib import Path
import logging
from typing import Dict, Any

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class TranscriptionImprover:
    def __init__(self, api_key: str):
        self.client = OpenAI(api_key=api_key)
        self.model = "gpt-4o"

    def suggest_improvements(self, file_path: Path) -> Dict[str, Any]:
        try:
            original_transcription = self._read_file(file_path)
            video_topic = self._get_video_topic(original_transcription)
            suggestions = self._get_suggestions(original_transcription, video_topic)
            return suggestions
        except Exception as e:
            logging.error(f"Error in suggest_improvements: {str(e)}")
            raise

    def _read_file(self, file_path: Path) -> str:
        try:
            with file_path.open('r') as file:
                return file.read()
        except IOError as e:
            logging.error(f"Error reading file {file_path}: {str(e)}")
            raise

    def _get_video_topic(self, transcription: str) -> str:
        prompt = f"The following text is a transcription of a video. What is the video about?\nTranscription:\n{transcription}"
        response = self._create_chat_completion(prompt)
        return response.choices[0].message.content.strip()

    def _get_suggestions(self, transcription: str, video_topic: str) -> Dict[str, Any]:
        prompt = f"""The following text is a transcription of a video. The video is about "{video_topic}".
Based on this information, suggest corrections to possible mistakes in the transcription.

Generate JSON of suggestions in the following format:

{{
    "word_to_replace_1": "suggested_word_1",
    "word_to_replace_2": "suggested_word_2",
    ...
}}

Transcription:
{transcription}
"""
        response = self._create_chat_completion(prompt, response_format={"type": "json_object"})
        return json.loads(response.choices[0].message.content.strip())

    def _create_chat_completion(self, prompt: str, response_format: Dict[str, str] = None) -> Any:
        messages = [
            {"role": "system", "content": "You are an AI assistant that improves video transcriptions."},
            {"role": "user", "content": prompt}
        ]
        kwargs = {
            "model": self.model,
            "messages": messages,
            "n": 1,
            "temperature": 0.5,
        }
        if response_format:
            kwargs["response_format"] = response_format
        return self.client.chat.completions.create(**kwargs)


def save_suggestions(file_path: Path, suggestions: Dict[str, Any]) -> Path:
    output_path = file_path.with_suffix('.suggestions.json')
    with output_path.open('w') as file:
        json.dump(suggestions, file, indent=2)
    return output_path


def save_improved(file_path: Path, improved_transcription: str) -> Path:
    output_path = file_path.with_suffix('.improved.txt')
    with output_path.open('w') as file:
        file.write(improved_transcription)
    return output_path


def find_and_replace_suggestions(file_path: Path, suggestions: Dict[str, str]) -> str:
    with file_path.open('r') as file:
        original_transcription = file.read()
    for word_to_replace, suggested_word in suggestions.items():
        original_transcription = original_transcription.replace(word_to_replace, suggested_word)
    return original_transcription


def fix(file_path: Path, api_key: str) -> str:
    try:
        improver = TranscriptionImprover(api_key)
        suggestions = improver.suggest_improvements(file_path)
        save_suggestions(file_path, suggestions)
        improved_transcription = find_and_replace_suggestions(file_path, suggestions)
        save_improved(file_path, improved_transcription)
        return improved_transcription
    except Exception as e:
        logging.error(f"Error in fix function: {str(e)}")
        raise
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Improving Whisper Transcriptions with GPT-4o

I was watching the [latest news episode from Whisky.com](https://www.youtube.com/watch?v=rYUOnaPfigg) (where fine spirits meet ™) on YouTube the other day, and noticed that the transcription was really off.

I'm not sure which transcriber YouTube uses to generate its closed captions, but it makes a bunch of mistakes, some of which are obviously related to the whisky domain, while others are general transcription mistakes.

With OpenAI's Whisper transcriber, the results are significantly better, but domain-related errors are still common. The transcriber is missing important context.

Here are a couple of examples:

## Example 1

![Example 1 Image](media/image1.png)

**Expected**: "two **single malts**" (single malt is a common whisky term)

**YouTube transcriber**: "two single molds"
**OpenAI Whisper**: "two single moulds"

## Example 2

![Example 2 Image](media/image2.png)

**Expected**: "the Distel Group and they had **Bunnahabhain**, **Deanston** and Tobermory in their group"

**YouTube transcriber**: "the distel group and they uh uh had buah haban deanston and toomore in their group"
**OpenAI Whisper**: "the Distel Group and they had Bunnehaben, Diensten and Tobermory in their group"

So while it looks like YouTube could improve its captions by using a model similar to Whisper (perhaps using a smaller, weaker model is a conscious decision on their end, given their scale), there is still much room for improvement on top of Whisper's results as well.

Unfortunately, the transcription cannot be handed over as-is to something like GPT, as the model would change the original text so that it no longer matches the video (even with prompt engineering, I could not get the model to leave some pieces of the original text unchanged).

However, we can solve this issue almost entirely by implementing the following pattern to provide the missing context:

![Process Diagram](media/image3.png)

Once the audio has been extracted from the video and transcribed with Whisper, we can feed the transcription to an LLM like GPT-4o to extract the topic.

For example, for the above-mentioned episode, the topic extracted from the transcription was: "The video is a news update from whisky.com, covering various developments and releases in the whisky industry as of September 23, 2024. It includes announcements about new whisky collections, anniversaries, and special editions from well-known distilleries such as Macallan, Glenmorangie, and Johnny Walker, among others. The video also discusses acquisitions in the whisky industry, environmentally friendly packaging innovations, and upcoming live whisky tastings. The news spans regions including Scotland, Ireland, the United States, and Germany, highlighting both new products and industry events."

With this context in hand, we can make another call to the LLM, this time including the dynamically generated context about the video. Using OpenAI's JSON response mode, we can ask the LLM to generate a JSON object of suggested replacements based on the original text and the newly learned context.

For example, for the above episode, the JSON looks like this:

```json
{
  "McAllen": "Macallan",
  "moulds": "malts",
  "Bunnehaben": "Bunnahabhain",
  "Diensten": "Deanston",
  "Cape Vinn": "Capevin",
  "Kings Bar & Distillery": "Kingsbarns Distillery",
  "Kings Bar's Coal Town": "Kingsbarns Coal Town",
  "Oktimo": "Octomore",
  "Eila": "Islay",
  "Tom & Tal": "Tomintoul",
  "Schippers Riegel": "Schipper's Riegel",
  "Pungent": "Puncheon",
  "cream sherry": "Cream Sherry",
  "chinkampin": "Chinquapin",
  "Sluers": "Slyrs",
  "Aluvalia": "Aluwalia"
}
```

The whisky geeks amongst you will notice that the suggestions are really spot on for the whisky domain.
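
To make the two LLM calls concrete, here is a minimal sketch of the pattern. It mirrors what `single_shot_fixer.py` in this repo does, with the system prompt, file handling, and error handling stripped out; the helper name and the prompt wording here are illustrative, not the repo's exact prompts:

```python
import json

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def suggest_corrections(transcription: str) -> dict:
    # Call 1: extract the topic of the video from the raw transcription.
    topic = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": "The following text is a transcription of a video. "
                       f"What is the video about?\n\n{transcription}",
        }],
    ).choices[0].message.content

    # Call 2: feed the topic back in as context and ask for corrections.
    # JSON mode constrains the reply to a parseable JSON object.
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[{
            "role": "user",
            "content": f'This transcription is from a video about "{topic}". '
                       "Suggest corrections to likely transcription mistakes "
                       "as a JSON object mapping each misheard word to its "
                       f"replacement.\n\nTranscription:\n{transcription}",
        }],
    )
    return json.loads(response.choices[0].message.content)
```

Because the second call runs in JSON mode, the reply reliably parses, and the resulting dictionary can be applied mechanically to the original text.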

Finally, we can simply run a find & replace over the original text using the suggested keywords, ending up with a far more accurate transcription!

## Implementation

This project is a Python-based implementation of the above: it transcribes video files using OpenAI's Whisper model and improves the resulting transcription using OpenAI's GPT-4o model.

### Features

- Extracts audio from video files (FFmpeg required)
- Transcribes the audio using OpenAI's Whisper model
- Improves the transcription using GPT-4o
- Generates suggestions for corrections as JSON
- Applies the corrections to the original transcription

### Prerequisites

- Python 3.8+
- FFmpeg (for audio extraction)
- OpenAI API key

### Installation

1. Clone this repository:

```
git clone https://github.com/orcaman/improving_whisper_transcriptions_with_gpt4o.git
cd improving_whisper_transcriptions_with_gpt4o
```

2. Install the required packages:

```
pip install -r requirements.txt
```

You also need FFmpeg installed to extract the audio from a video file.

3. Set up your OpenAI API key:

Create a `.env` file in the project root and add your API key:

```
OPENAI_API_KEY=your_api_key_here
```

### Usage

Run the main script with the path to your video file:

```
python main.py <video_file_path>
```

The script will:

1. Extract audio from the video
2. Transcribe the audio
3. Improve the transcription
4. Save the improved transcription

### File Structure

- `main.py`: The main script that orchestrates the transcription and improvement process
- `transcriber.py`: Contains the `VideoTranscriber` class for audio extraction and transcription
- `single_shot_fixer.py`: Contains the `TranscriptionImprover` class and related functions for improving transcriptions
- `requirements.txt`: Lists the required Python packages

### Output

The script generates the following files (the audio file is saved next to the video, and the text files go into a `whisper_subs` directory alongside it):

- `<video_name>.mp3`: Extracted audio file
- `whisper_subs/transcription.txt`: Original transcription
- `whisper_subs/transcription.suggestions.json`: Suggested improvements
- `whisper_subs/transcription.improved.txt`: Improved transcription

### Error Handling

The script includes error handling and logging. Check the console output for error messages, or the log file if you have configured logging to a file.

### Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

### License

The code in this repo is licensed under MIT. The media in this repo is compiled from screenshots of the Whisky.com YouTube channel, is used for educational purposes only, and is the property of Whisky.com.

### Acknowledgements

This project uses the following open-source libraries and APIs:

- OpenAI's GPT-4o and Whisper models
- python-dotenv
- FFmpeg (indirectly)
- [Whisky.com video from YouTube](https://www.youtube.com/watch?v=rYUOnaPfigg)
--------------------------------------------------------------------------------