├── .env.example ├── .gitignore ├── LICENSE ├── index.py ├── main.ipynb ├── readme.md └── requirements.txt /.env.example: -------------------------------------------------------------------------------- 1 | YOUTUBE_API_KEY= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Rohit Das 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | import re 5 | import math 6 | import warnings 7 | import subprocess 8 | import platform 9 | 10 | import nltk 11 | import googleapiclient.discovery 12 | import googleapiclient.errors 13 | from deepmultilingualpunctuation import PunctuationModel 14 | from youtube_transcript_api import YouTubeTranscriptApi 15 | 16 | logging.basicConfig(level=logging.INFO, force=True) 17 | # stop any warnings 18 | warnings.filterwarnings("ignore") 19 | 20 | 21 | def open_file(filename): 22 | # Open the file using the default application 23 | logging.info(f'Opening \'{filename}\'...') 24 | try: 25 | if platform.system() == "Darwin": # macOS 26 | subprocess.call(('open', filename)) 27 | elif platform.system() == "Windows": # Windows 28 | os.startfile(filename) 29 | else: # linux variants 30 | subprocess.call(('xdg-open', filename)) 31 | except Exception as e: 32 | logging.error(f'Error: {e}') 33 | 34 | 35 | def clean_for_filename(title): 36 | # Define a regular expression to keep only alphanumeric characters, spaces, dots, hyphens, and various parentheses 37 | cleaned_title = re.sub(r'[^\w\s\.\-\(\)\[\]]', '', title) 38 | 39 | # Remove leading and trailing spaces 40 | return cleaned_title.strip() 41 | 42 | 43 | def remove_tags(text): 44 | # Remove any text inside [] like [music] 45 | updated_text = re.sub(r'\[.*?\]', '', text) 46 | return updated_text 47 | 48 | 49 | def remove_period_after_hashes(text): 50 | # Remove . after # or ##, considering newline characters 51 | return re.sub(r'(#\.|##\.)', lambda match: match.group(1)[:-1], text) 52 | 53 | 54 | def remove_escape_sequences(text): 55 | # Some old videos contain escape sequences like \n in their subtitle 56 | # Remove \n, \r\n, \t, \b, \r 57 | return re.sub(r'\\[nrtb]|\\r\n', '', text) 58 | 59 | 60 | def remove_double_greater_than(text): 61 | # Replace occurrences of ">>" with an empty string 62 | cleaned_text = re.sub(r'>>', '', text) 63 | return cleaned_text 64 | 65 | 66 | def add_punctuation(text, punctuation_model): 67 | if punctuation_model != "": 68 | model = PunctuationModel(model=punctuation_model) 69 | else: 70 | model = PunctuationModel() 71 | 72 | punctuated_text = model.restore_punctuation(text) 73 | return punctuated_text 74 | 75 | 76 | def capitalize_sentences(sentences): 77 | # Capitalize the first letter of each sentence in a batch 78 | capitalized_sentences = [sentence[0].upper() + sentence[1:] 79 | for sentence in sentences] 80 | return capitalized_sentences 81 | 82 | 83 | def parse_youtube_url(url): 84 | video_id_match = re.search( 85 | r'(?:youtube\.com\/.*?[?&]v=|youtu\.be\/)([^"&?\/\s]{11})', url) 86 | if video_id_match: 87 | return video_id_match.group(1) 88 | else: 89 | raise ValueError('Invalid YouTube URL') 90 | 91 | 92 | def parse_chapters(description): 93 | lines = description.split("\n") 94 | regex = re.compile(r"(\d{0,2}:?\d{1,2}:\d{2})") 95 | chapters = [] 96 | 97 | for line in lines: 98 | matches = regex.findall(line) 99 | if matches: 100 | ts = matches[0] 101 | title = line.replace(ts, "").strip() 102 | 103 | # Check if the title contains another timestamp and remove it 104 | title = re.sub(r'\d{0,2}:?\d{1,2}:\d{2}', '', title).strip().strip( 105 | '-').strip().strip('-').strip() 106 | 107 | chapters.append({ 108 | "timestamp": ts, 109 | "title": title, 110 | }) 111 | 112 | return chapters 113 
| 114 | 115 | def get_transcript(video_id, language, video_info, verbose=True): 116 | transcript_list = YouTubeTranscriptApi.get_transcript( 117 | video_id, languages=[language]) 118 | 119 | if video_info["title"] != "": 120 | transcript = f'# {video_info["title"]}\n\n' 121 | else: 122 | transcript = '' 123 | current_chapter_index = 0 124 | chapters = video_info["chapters"] 125 | logging.info(f"""Transcript List Length: { 126 | len(transcript_list)}, Chapter Length: {len(chapters)}""") 127 | 128 | for i, line in enumerate(transcript_list): 129 | # Floor and convert to integer 130 | start_time = int(math.floor(line['start'])) 131 | 132 | # Check if current_chapter_index is within the valid range 133 | if 0 <= current_chapter_index < len(chapters): 134 | chapter_time = chapters[current_chapter_index]['timestamp'] 135 | 136 | try: 137 | # Extract start time from the chapter timestamp 138 | chapter_start = chapter_time.strip() 139 | chapter_start_seconds = sum( 140 | int(x) * 60 ** i for i, x in enumerate(reversed(chapter_start.split(':')))) 141 | chapters[current_chapter_index]["title"] = chapters[current_chapter_index]["title"].strip() 142 | buffer_time = 2 143 | 144 | if start_time >= chapter_start_seconds - buffer_time: 145 | # If the start time is within the buffer time, add the chapter title 146 | transcript += f'\n\n## {chapters[current_chapter_index]["title"]}\n\n' 147 | current_chapter_index += 1 148 | except Exception as e: 149 | logging.error( 150 | f"Error processing chapter timestamp: {chapter_time}") 151 | logging.error(f"Error details: {e}") 152 | 153 | line['text'] = remove_tags(line['text']) 154 | line['text'] = remove_escape_sequences(line['text']) 155 | line['text'] = remove_double_greater_than(line['text']) 156 | if line['text']: 157 | transcript += line['text'].strip() + ' ' 158 | 159 | # Log progress information 160 | if verbose and i % 100 == 0: # Adjust the log frequency as needed 161 | logging.info(f"Processed {i} lines out of {len(transcript_list)}") 162 | 163 | return transcript 164 | 165 | 166 | def process_and_save_transcript(video_id, video_info, language, generate_punctuated, output_dir, filename, verbose, punctuation_model): 167 | try: 168 | logging.info('Getting transcript...') 169 | raw_transcript = get_transcript( 170 | video_id, language, video_info, verbose) 171 | 172 | if generate_punctuated: 173 | logging.info('Generating punctuated transcript...') 174 | with_punctuation = add_punctuation( 175 | raw_transcript, punctuation_model) 176 | with_punctuation = remove_period_after_hashes(with_punctuation) 177 | logging.info('Capitalizing sentences...') 178 | sentences = nltk.sent_tokenize(with_punctuation) 179 | else: 180 | sentences = nltk.sent_tokenize(raw_transcript) 181 | 182 | # Capitalize sentences without batching 183 | capitalized_sentences = capitalize_sentences(sentences) 184 | 185 | double_linesep = os.linesep + os.linesep 186 | capitalized_transcript = double_linesep.join(capitalized_sentences) 187 | output_path = os.path.join(output_dir, f'{filename}.md') 188 | 189 | logging.info(f'Saving transcript to {output_path}...') 190 | with open(output_path, 'w', encoding='utf-8') as f: 191 | f.write(capitalized_transcript) 192 | 193 | # set log level to info to print the output path 194 | logging.getLogger().setLevel(logging.INFO) 195 | if generate_punctuated: 196 | logging.info(f'Punctuated transcript saved to \'{output_path}\'') 197 | else: 198 | logging.info(f'Raw transcript saved to \'{output_path}\'') 199 | 200 | except Exception as e: 201 | 
logging.error(f'Error: {e}') 202 | 203 | 204 | def getVideoInfo(video_id): 205 | try: 206 | # Set up Google API credentials using API key 207 | api_key = os.environ.get('YOUTUBE_API_KEY') 208 | if api_key is None: 209 | raise Exception( 210 | "No API key found, please set the YOUTUBE_API_KEY environment variable. \n Example: export YOUTUBE_API_KEY=your_api_key" 211 | ) 212 | logging.info('Getting video info...') 213 | youtube = googleapiclient.discovery.build( 214 | "youtube", "v3", developerKey=api_key) 215 | request = youtube.videos().list(part="id,snippet", 216 | id=video_id 217 | ) 218 | response = request.execute() 219 | title = response['items'][0]['snippet']['title'] 220 | description = response['items'][0]['snippet']['description'] 221 | data = {"title": title, "chapters": parse_chapters(description)} 222 | return data 223 | except Exception as e: 224 | logging.error(f'Error: {e}') 225 | return {"title": "", "chapters": []} 226 | 227 | 228 | def main(): 229 | parser = argparse.ArgumentParser( 230 | description='Process YouTube video transcript and save it.') 231 | parser.add_argument('url', type=str, help='YouTube video URL') 232 | parser.add_argument('-l', '--language', type=str, default='en', 233 | help='Language for the transcript (default: en)') 234 | parser.add_argument('-p', '--punctuated', action='store_true', 235 | help='Generate punctuated transcript (default: False)') 236 | parser.add_argument('-o', '--output_dir', type=str, default='.', 237 | help='Output directory for saving the transcript (default: .)') 238 | parser.add_argument('-f', '--filename', type=str, default='', 239 | help='Filename for saving the transcript (default: Video Title or Video Id)') 240 | parser.add_argument('-m', '--punctuation_model', type=str, default='', 241 | help='Path to the punctuation model (default: None)') 242 | parser.add_argument('-a', '--auto-open', action='store_true', 243 | help='Automatically open the generated file in the default application (default: False)') 244 | parser.add_argument('-v', '--verbose', action='store_true', 245 | help='Print verbose output (default: False)') 246 | 247 | args = parser.parse_args() 248 | 249 | # Install NLTK punkt if not already installed 250 | try: 251 | nltk.data.find('tokenizers/punkt') 252 | except LookupError: 253 | logging.error('NLTK punkt not found.') 254 | logging.info('Downloading punkt...') 255 | try: 256 | nltk.download('punkt') 257 | except Exception as e: 258 | logging.error(f'Error: {e}') 259 | 260 | # Check if the Errno 60 error is thrown and suggest using a proxy/vpn 261 | if 'Errno 60' in str(e): 262 | logging.error( 263 | 'Error downloading punkt. Try using a proxy or a VPN.') 264 | else: 265 | logging.error('Error downloading punkt. 
Exiting.') 266 | exit(1) 267 | 268 | # if verbose is false, set logging level to error 269 | if not args.verbose: 270 | logging.getLogger().setLevel(logging.ERROR) 271 | 272 | video_id = parse_youtube_url(args.url) 273 | video_info = getVideoInfo(video_id) 274 | filename = args.filename or clean_for_filename( 275 | video_info["title"]) or clean_for_filename(video_id) 276 | 277 | process_and_save_transcript(video_id, video_info, args.language, args.punctuated, 278 | args.output_dir, filename, args.verbose, args.punctuation_model) 279 | 280 | if args.auto_open: 281 | output_path = os.path.join(args.output_dir, f'{filename}.md') 282 | open_file(output_path) 283 | 284 | 285 | if __name__ == "__main__": 286 | main() 287 | -------------------------------------------------------------------------------- /main.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "mount_file_id": "15KTDpG-Cy2JIQo_r4uFYGOYv3cuuySLE", 9 | "authorship_tag": "ABX9TyOOFv7bxULf3jxYdyCciRs+", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "view-in-github", 26 | "colab_type": "text" 27 | }, 28 | "source": [ 29 | "<a href=\"https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "source": [ 35 | "# YouTube Transcript Generator\n", 36 | "[![Open in Colab](https://img.shields.io/badge/Open_in_Colab-555?style=for-the-badge&logo=googlecolab&labelColor=gray&color=purple)](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb)\n", 37 | "![GitHub License](https://img.shields.io/github/license/therohitdas/Youtube-Transcript-Generator?style=for-the-badge&color=blue) ![GitHub Repo stars](https://img.shields.io/github/stars/therohitdas/Youtube-Transcript-Generator?style=for-the-badge&logo=github)\n", 38 | "\n", 39 | "## Overview 🌐\n", 40 | "\n", 41 | "The YouTube Transcript Generator is a powerful tool designed to streamline the process of extracting and processing transcripts from YouTube videos. Whether you're looking to transcribe lectures, interviews, or any other video content, this project provides a convenient solution.\n", 42 | "\n", 43 | "### How It Can Help 🚀\n", 44 | "\n", 45 | "This tool is particularly useful for:\n", 46 | "- **Note Taking:** Quickly convert YouTube videos into text format for easy note-taking.\n", 47 | "- **Content Analysis:** Analyze and derive insights from video content by converting it into text data.\n", 48 | "- **Chat Bot Training:** Use the generated transcripts to train chatbots, such as ChatGPT, for natural language understanding.\n", 49 | "- **Archiving:** Create a textual archive of valuable information from YouTube videos. This can be particularly useful for archiving interviews, tutorials, or any content you'd like to reference later without the need to re-watch the video.\n", 50 | "- **Personal Knowledge Base:** Build a personal knowledge base by extracting and processing transcripts from YouTube videos. This can aid in consolidating information on diverse topics in a readable and accessible format.\n", 51 | "- **Accessibility Improvement:** Enhance accessibility for individuals who prefer or require text-based content. 
The tool can be used to generate transcripts with added punctuation, improving the overall readability of the content.\n", 52 | "\n", 53 | "## Features 🛠️\n", 54 | "\n", 55 | "- **Transcription:** Obtain raw transcripts from YouTube videos.\n", 56 | "- **Punctuation:** Enhance transcripts by adding punctuation using [deep multilingual punctuation models](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large).\n", 57 | "- **Chapter Detection:** Identify and separate chapters in the video based on provided timestamps.\n", 58 | "- **User-friendly:** Easy-to-use script with customizable parameters.\n", 59 | "\n", 60 | "## Environment Variables 🌐\n", 61 | "\n", 62 | "- `YOUTUBE_API_KEY`: Set up your Google API key for video information retrieval. You will need to create a project in Google Cloud and enable the YouTube Data API v3. This is optional; without it, chapter headings will not be added.\n", 63 | "\n", 64 | "## Runtime\n", 65 | "Please go to `Runtime > Change runtime type > Select T4 GPU`.\n", 66 | "This will ensure the best performance. Without a GPU, punctuation is very slow and can take several minutes.\n", 67 | "\n", 68 | "## Script Parameters 📜\n", 69 | "```python\n", 70 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID' # youtu.be link works too\n", 71 | "language = 'en'\n", 72 | "punctuated = True # Default False, takes significantly more time when enabled on CPU, use the T4 GPU runtime in Google Colab.\n", 73 | "output_dir = '.' # add /content/drive/MyDrive/ to save output to your Google Drive\n", 74 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n", 75 | "punctuation_model = '' # More info down below\n", 76 | "verbose = True # To get logs\n", 77 | "```\n", 78 | "`language`: the language code for the transcript. If both a manually created and an automatically generated transcript are available in the requested language, the manually created one is preferred.\n", 79 | "\n", 80 | "`punctuation_model` values can be found at https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages\n", 81 | "\n", 82 | "## Support 💬\n", 83 | "\n", 84 | "For any issues or feature requests, please [create an issue](https://github.com/therohitdas/Youtube-Transcript-Generator/issues).\n", 85 | "\n", 86 | "## Acknowledgments 🙌\n", 87 | "\n", 88 | "This script utilizes the [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api) and [fullstop-punctuation-multilang-large](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large) libraries. Special thanks to their contributors.\n", 89 | "\n", 90 | "Feel free to adapt and use the script based on your requirements. Enjoy the convenience of YouTube transcript processing!\n", 91 | "\n", 92 | "## Connect with me 📧\n", 93 | "The best way to connect is to email me [namaste@theRohitDas.com](mailto:namaste@therohitdas.com)\n", 94 | "- [x/therohitdas](https://x.com/therohitdas)\n", 95 | "- [GitHub/therohitdas](https://github.com/therohitdas)\n", 96 | "\n", 97 | "🚀 Happy transcribing!" 
98 | ], 99 | "metadata": { 100 | "id": "UMDjo6KMV590" 101 | } 102 | }, 103 | { 104 | "cell_type": "code", 105 | "source": [ 106 | "!pip install youtube-transcript-api deepmultilingualpunctuation nltk google-api-python-client" 107 | ], 108 | "metadata": { 109 | "id": "HjaKQBJeT2d7" 110 | }, 111 | "execution_count": null, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "source": [ 117 | "**Example Usage:**\n", 118 | "```python\n", 119 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID' # youtu.be link works too\n", 120 | "language = 'en'\n", 121 | "punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in google collab.\n", 122 | "output_dir = '.' # add /content/drive/MyDrive/ to save content in You Google Drive\n", 123 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n", 124 | "punctuation_model = '' # More info down below\n", 125 | "verbose = True # To get logs\n", 126 | "```\n", 127 | "`language` use the language code to get the video. By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated.\n", 128 | "\n", 129 | "`punctuation_model` values can be found at https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages\n", 130 | "\n", 131 | "After filling the cell below, press `CMD+F9` / `CTRL+F9` to run all cells." 132 | ], 133 | "metadata": { 134 | "id": "U5fmwoG6UFDd" 135 | } 136 | }, 137 | { 138 | "cell_type": "code", 139 | "source": [ 140 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID'\n", 141 | "language = 'en'\n", 142 | "punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in google collab.\n", 143 | "output_dir = '.' 
# add /content/drive/MyDrive/ to save content in You Google Drive, In the cell below, Uncomment the mount line\n", 144 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n", 145 | "punctuation_model = ''\n", 146 | "verbose = True" 147 | ], 148 | "metadata": { 149 | "id": "5CT6UxWtUYOn" 150 | }, 151 | "execution_count": null, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "source": [ 157 | "# Run this if you want to mount and store generated files in google drive.\n", 158 | "from google.colab import drive\n", 159 | "\n", 160 | "# Uncomment this:\n", 161 | "# drive.mount(\"/content/drive\")" 162 | ], 163 | "metadata": { 164 | "id": "7MBjbAlC8a3c" 165 | }, 166 | "execution_count": null, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "source": [ 172 | "import os\n", 173 | "import logging\n", 174 | "import re\n", 175 | "import math\n", 176 | "import nltk\n", 177 | "import youtube_transcript_api\n", 178 | "from deepmultilingualpunctuation import PunctuationModel\n", 179 | "import googleapiclient.discovery\n", 180 | "import googleapiclient.errors\n", 181 | "\n", 182 | "from google.colab import userdata\n", 183 | "import warnings" 184 | ], 185 | "metadata": { 186 | "id": "CCqYukC-T5EN" 187 | }, 188 | "execution_count": null, 189 | "outputs": [] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "source": [ 194 | "try:\n", 195 | " nltk.data.find('tokenizers/punkt')\n", 196 | "except LookupError:\n", 197 | " nltk.download('punkt')\n", 198 | "\n", 199 | "logging.basicConfig(level=logging.INFO, force=True)\n", 200 | "warnings.filterwarnings('ignore')" 201 | ], 202 | "metadata": { 203 | "id": "vPjOXOkseYTt" 204 | }, 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "source": [ 211 | "def clean_for_filename(title):\n", 212 | " # Define a regular expression to keep only alphanumeric characters, spaces, dots, hyphens, and various parentheses\n", 213 | " cleaned_title = re.sub(r'[^\\w\\s\\.\\-\\(\\)\\[\\]]', '', title)\n", 214 | "\n", 215 | " # Remove leading and trailing spaces\n", 216 | " return cleaned_title.strip()\n", 217 | "\n", 218 | "def remove_music_tags(text):\n", 219 | " # Remove [Music] or [music]\n", 220 | " updated_text = re.sub(r'\\[music\\]', '', text, flags=re.IGNORECASE)\n", 221 | " return updated_text\n", 222 | "\n", 223 | "def remove_period_after_hashes(text):\n", 224 | " # Remove . 
after # or ##, considering newline characters\n", 225 | " return re.sub(r'(#\\.|##\\.)', lambda match: match.group(1)[:-1], text)\n", 226 | "\n", 227 | "def remove_escape_sequences(text):\n", 228 | " # Some old videos contain escape sequences like \\n in their subtitle\n", 229 | " # Remove \\n, \\r\\n, \\t, \\b, \\r\n", 230 | " return re.sub(r'\\\\[nrtb]|\\\\r\\n', '', text)\n", 231 | "\n", 232 | "def remove_double_greater_than(text):\n", 233 | " # Replace occurrences of \">>\" with an empty string\n", 234 | " cleaned_text = re.sub(r'>>', '', text)\n", 235 | " return cleaned_text\n", 236 | "\n", 237 | "def add_punctuation(text, punctuation_model):\n", 238 | " if punctuation_model != \"\":\n", 239 | " model = PunctuationModel(model=punctuation_model)\n", 240 | " else:\n", 241 | " model = PunctuationModel()\n", 242 | " punctuated_text = model.restore_punctuation(text)\n", 243 | " return punctuated_text\n", 244 | "\n", 245 | "def capitalize_sentences(sentences):\n", 246 | " # Capitalize the first letter of each sentence in a batch\n", 247 | " capitalized_sentences = [sentence[0].upper() + sentence[1:] for sentence in sentences]\n", 248 | " return capitalized_sentences\n", 249 | "\n", 250 | "def parse_youtube_url(url):\n", 251 | " video_id_match = re.search(r'(?:youtube\\.com\\/.*?[?&]v=|youtu\\.be\\/)([^\"&?\\/\\s]{11})', url)\n", 252 | " if video_id_match:\n", 253 | " return video_id_match.group(1)\n", 254 | " else:\n", 255 | " raise ValueError('Invalid YouTube URL')\n", 256 | "\n", 257 | "def parse_chapters(description):\n", 258 | " lines = description.split(\"\\n\")\n", 259 | " regex = re.compile(r\"(\\d{0,2}:?\\d{1,2}:\\d{2})\")\n", 260 | " chapters = []\n", 261 | "\n", 262 | " for line in lines:\n", 263 | " matches = regex.findall(line)\n", 264 | " if matches:\n", 265 | " ts = matches[0]\n", 266 | " title = line.replace(ts, \"\").strip()\n", 267 | "\n", 268 | " # Check if the title contains another timestamp and remove it\n", 269 | " title = re.sub(r'\\d{0,2}:?\\d{1,2}:\\d{2}', '', title).strip().strip('-').strip().strip('-').strip()\n", 270 | "\n", 271 | " chapters.append({\n", 272 | " \"timestamp\": ts,\n", 273 | " \"title\": title,\n", 274 | " })\n", 275 | "\n", 276 | " return chapters\n", 277 | "\n", 278 | "def get_transcript(video_id, language, video_info, verbose=True):\n", 279 | " transcript_list = youtube_transcript_api.YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n", 280 | "\n", 281 | " if video_info[\"title\"] != \"\":\n", 282 | " transcript = f'# {video_info[\"title\"]}\\n\\n'\n", 283 | "\n", 284 | " current_chapter_index = 0\n", 285 | " chapters = video_info[\"chapters\"]\n", 286 | " logging.info(f\"Transcript_List Length: {len(transcript_list)}, Chapter Length: {len(chapters)}\")\n", 287 | "\n", 288 | " for i, line in enumerate(transcript_list):\n", 289 | " start_time = int(math.floor(line['start'])) # Floor and convert to integer\n", 290 | "\n", 291 | " # Check if current_chapter_index is within the valid range\n", 292 | " if 0 <= current_chapter_index < len(chapters):\n", 293 | " chapter_time = chapters[current_chapter_index]['timestamp']\n", 294 | "\n", 295 | " try:\n", 296 | " # Extract start time from the chapter timestamp\n", 297 | " chapter_start = chapter_time.strip()\n", 298 | " chapter_start_seconds = sum(int(x) * 60 ** i for i, x in enumerate(reversed(chapter_start.split(':'))))\n", 299 | " chapters[current_chapter_index][\"title\"] = chapters[current_chapter_index][\"title\"].strip()\n", 300 | " buffer_time = 2\n", 301 | "\n", 302 | " if 
start_time >= chapter_start_seconds - buffer_time:\n", 303 | " logging.info(f'\\n\\n## {chapters[current_chapter_index][\"title\"]}\\n')\n", 304 | " current_chapter_index += 1\n", 305 | " except Exception as e:\n", 306 | " logging.error(f\"Error processing chapter timestamp: {chapter_time}\")\n", 307 | " logging.error(f\"Error details: {e}\")\n", 308 | "\n", 309 | " line['text'] = remove_music_tags(line['text'])\n", 310 | " line['text'] = remove_escape_sequences(line['text'])\n", 311 | " line['text'] = remove_double_greater_than(line['text'])\n", 312 | " if line['text']:\n", 313 | " transcript += line['text'].strip() + ' '\n", 314 | "\n", 315 | " # Log progress information\n", 316 | " if verbose and i % 100 == 0: # Adjust the log frequency as needed\n", 317 | " logging.info(f\"Processed {i} lines out of {len(transcript_list)}\")\n", 318 | "\n", 319 | " return transcript\n", 320 | "\n", 321 | "def process_and_save_transcript(video_id, video_info, language, generate_punctuated, output_dir, filename, verbose, punctuation_model):\n", 322 | " try:\n", 323 | " raw_transcript = get_transcript(video_id, language, video_info, verbose)\n", 324 | " logging.info(\"Raw Transcript Length: %d\", len(raw_transcript))\n", 325 | "\n", 326 | " if generate_punctuated:\n", 327 | " with_punctuation = add_punctuation(raw_transcript, punctuation_model)\n", 328 | " with_punctuation = remove_period_after_hashes(with_punctuation)\n", 329 | " logging.info(\"Punctuation Char Length: %d\", len(with_punctuation))\n", 330 | " sentences = nltk.sent_tokenize(with_punctuation)\n", 331 | " logging.info(\"Sentences to process, (punctuated): %d\", len(sentences))\n", 332 | " else:\n", 333 | " sentences = nltk.sent_tokenize(raw_transcript)\n", 334 | " logging.info(\"Sentences to process, (raw): %d\", len(sentences))\n", 335 | "\n", 336 | " # Capitalize sentences without batching\n", 337 | " capitalized_sentences = capitalize_sentences(sentences)\n", 338 | "\n", 339 | " double_linesep = os.linesep + os.linesep\n", 340 | " capitalized_transcript = double_linesep.join(capitalized_sentences)\n", 341 | " output_path = os.path.join(output_dir, f'{filename}.md')\n", 342 | "\n", 343 | " with open(output_path, 'w', encoding='utf-8') as f:\n", 344 | " f.write(capitalized_transcript)\n", 345 | "\n", 346 | " if generate_punctuated:\n", 347 | " logging.info(f'Punctuated transcript saved to {output_path}')\n", 348 | " else:\n", 349 | " logging.info(f'Raw transcript saved to {output_path}')\n", 350 | "\n", 351 | " except Exception as e:\n", 352 | " logging.error(f'Error: {e}')\n", 353 | "\n", 354 | "def getVideoInfo (video_id):\n", 355 | " try:\n", 356 | " # Set up Google API credentials using API key\n", 357 | " api_key = userdata.get('YOUTUBE_API_KEY') # Replace with your actual API key\n", 358 | " youtube = googleapiclient.discovery.build(\"youtube\", \"v3\", developerKey=api_key)\n", 359 | " request = youtube.videos().list(part=\"id,snippet\",\n", 360 | " id = video_id\n", 361 | " )\n", 362 | " response = request.execute()\n", 363 | " title = response['items'][0]['snippet']['title']\n", 364 | " description = response['items'][0]['snippet']['description']\n", 365 | " data = {\"title\" : title, \"chapters\" : parse_chapters(description)}\n", 366 | " return data\n", 367 | " except Exception as e:\n", 368 | " logging.error(f'Error: {e}')\n", 369 | " return {\"title\": \"\", \"chapters\": []}" 370 | ], 371 | "metadata": { 372 | "id": "oasPyMVQoi7u" 373 | }, 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": 
"code", 379 | "source": [ 380 | "video_id = parse_youtube_url(url)\n", 381 | "video_info = getVideoInfo(video_id)\n", 382 | "filename = filename = filename or clean_for_filename(video_info[\"title\"]) or clean_for_filename(video_id)" 383 | ], 384 | "metadata": { 385 | "id": "c-M0h6sCmHK1" 386 | }, 387 | "execution_count": null, 388 | "outputs": [] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "source": [ 393 | "process_and_save_transcript(video_id, video_info, language, punctuated, output_dir, filename, verbose, punctuation_model)" 394 | ], 395 | "metadata": { 396 | "id": "CJgLX_DhcPsS" 397 | }, 398 | "execution_count": null, 399 | "outputs": [] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "source": [ 404 | "# Download the Generated File\n", 405 | "from google.colab import files\n", 406 | "files.download(os.path.join(output_dir, f'{filename}.md'))" 407 | ], 408 | "metadata": { 409 | "id": "w9xpxQPTmalR" 410 | }, 411 | "execution_count": null, 412 | "outputs": [] 413 | } 414 | ] 415 | } -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # YouTube Transcript Generator 2 | 3 | [![Open in Colab](https://img.shields.io/badge/Open_in_Colab-555?style=for-the-badge&logo=googlecolab&labelColor=gray&color=purple)](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb) 4 | ![GitHub License](https://img.shields.io/github/license/therohitdas/Youtube-Transcript-Generator?style=for-the-badge&color=blue) ![GitHub Repo stars](https://img.shields.io/github/stars/therohitdas/Youtube-Transcript-Generator?style=for-the-badge&logo=github) 5 | [![CodeFactor](https://www.codefactor.io/repository/github/therohitdas/youtube-transcript-generator/badge?style=for-the-badge)](https://www.codefactor.io/repository/github/therohitdas/youtube-transcript-generator) 6 | 7 | ## Overview 🌐 8 | 9 | The YouTube Transcript Generator is a powerful tool designed to streamline the process of extracting and processing transcripts from YouTube videos. Whether you're looking to transcribe lectures, interviews, or any other video content, this project provides a convenient solution. 10 | 11 | ### How It Can Help 🚀 12 | 13 | This tool is particularly useful for: 14 | 15 | - **Note Taking:** Quickly convert YouTube videos into text format for easy note-taking. 16 | - **Content Analysis:** Analyze and derive insights from video content by converting it into text data. 17 | - **Chat Bot Training:** Use the generated transcripts to train chat bots, such as ChatGPT, for natural language understanding. 18 | - **Archiving:** Create a textual archive of valuable information from YouTube videos. This can be particularly useful for archiving interviews, tutorials, or any content you'd like to reference later without the need to re-watch the video. 19 | - **Personal Knowledge Base:** Build a personal knowledge base by extracting and processing transcripts from YouTube videos. This can aid in consolidating information on diverse topics in a readable and accessible format. 20 | - **Accessibility Improvement:** Enhance accessibility for individuals who prefer or require text-based content. The tool can be used to generate transcripts with added punctuation, improving the overall readability of the content. 21 | 22 | ## Features 🛠️ 23 | 24 | - **Transcription:** Obtain raw transcripts from YouTube videos. 
25 | - **Punctuation:** Enhance transcripts by adding punctuation using [deep multilingual punctuation models](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large). 26 | - **Chapter Detection:** Identify and separate chapters in the video based on provided timestamps. 27 | - **User-friendly:** Easy-to-use script with customizable parameters. 28 | 29 | ## Environment Variables 🌐 30 | 31 | - `YOUTUBE_API_KEY`: Set up your Google API key for video information retrieval. You will need to create a project in Google Cloud and enable the YouTube Data API v3. This is optional; without it, chapter headings will not be added. 32 | 33 | ## Script Parameters 📜 34 | 35 | When running locally, you can pass these parameters to the script: 36 | 37 | ### Positional Argument: 38 | 39 | - `url`: YouTube video URL 40 | 41 | ### Optional Arguments: 42 | 43 | - `-h, --help`: Show the help message and exit 44 | - `-l LANGUAGE, --language LANGUAGE`: Language for the transcript (default: en) 45 | - `-p, --punctuated`: Generate punctuated transcript (default: False) 46 | - `-a, --auto-open`: Automatically open the transcript in the default app (default: False) 47 | - `-o OUTPUT_DIR, --output_dir OUTPUT_DIR`: Output directory for saving the transcript (default: current directory) 48 | - `-f FILENAME, --filename FILENAME`: Filename for saving the transcript (default: Video Title or Video Id) 49 | - `-m PUNCTUATION_MODEL, --punctuation_model PUNCTUATION_MODEL`: Path to the punctuation model (default: None) 50 | - `-v, --verbose`: Print verbose output (default: False) 51 | 52 | ## Run in Google Colab 🚀 53 | 54 | To run this project in Google Colab, follow these steps: 55 | 56 | 1. Open the [Google Colab Notebook](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb). 57 | 2. Add your Google Cloud project API key to the Secrets tab under the key `YOUTUBE_API_KEY`, and toggle notebook access on. 58 | 3. Go to Runtime > Change runtime type and select the T4 GPU. On CPU, a punctuated transcript takes several minutes to generate (around 1 minute per 10 minutes of video). 59 | 4. Change the values in the second cell to include your URL and other options. 60 | 5. Press CTRL+F9 or CMD+F9 to run the notebook. 61 | 62 | ## Run Locally 💻 63 | 64 | I do not recommend running locally, as it downloads model weights and other dependencies totalling over 6 GB. But if you want to, do this: 65 | 66 | 1. Clone the repository: `git clone https://github.com/therohitdas/Youtube-Transcript-Generator.git && cd Youtube-Transcript-Generator` 67 | 2. Create a virtual environment: `python -m venv venv` 68 | 3. Activate the virtual environment: `source venv/bin/activate` (Linux/macOS) or `venv\Scripts\activate` (Windows) 69 | 4. Install dependencies: `pip install -r requirements.txt` 70 | 5. Set the `YOUTUBE_API_KEY` environment variable (optional). You can either create a `.env` file (see the note just after the Support section) or export it in your shell. 71 | 6. Run the script: `python index.py <video_url>`, or `python index.py -h` for the help menu. 72 | 73 | ## Support 🤝 74 | 75 | For any issues or feature requests, please [create an issue](https://github.com/therohitdas/Youtube-Transcript-Generator/issues). 
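A note on the `.env` file mentioned in **Run Locally** step 5: `index.py` reads `YOUTUBE_API_KEY` straight from the process environment (`os.environ`), and nothing in `requirements.txt` loads a `.env` file for you. If you prefer the `.env` route, one option is a tiny wrapper built on `python-dotenv` — an extra install that is not part of this repo, and the wrapper filename below is just a placeholder:

```python
# run_with_env.py — hypothetical helper, not included in this repository.
# Extra dependency: pip install python-dotenv
from dotenv import load_dotenv

load_dotenv()  # copy KEY=value pairs from ./.env into os.environ

from index import main  # imported after the environment is populated

if __name__ == "__main__":
    # CLI arguments pass through unchanged, e.g.:
    #   python run_with_env.py "https://www.youtube.com/watch?v=VIDEO_ID" -p
    main()
```

Exporting the variable directly (`export YOUTUBE_API_KEY=your_api_key`, as suggested by the script's own error message) works just as well and needs no extra package.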
76 | 77 | ## Example 📋 78 | 79 | Here's an example of how to run the script with various options: 80 | 81 | ### Basic Usage 82 | 83 | ```bash 84 | python index.py https://www.youtube.com/watch?v=VIDEO_ID 85 | ``` 86 | 87 | ### Specify the Language 88 | 89 | ```bash 90 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -l fr 91 | ``` 92 | 93 | ### Generate a Raw Transcript 94 | 95 | ```bash 96 | python index.py https://www.youtube.com/watch?v=VIDEO_ID 97 | ``` 98 | 99 | ### Generate a Punctuated Transcript 100 | 101 | ```bash 102 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -p 103 | ``` 104 | 105 | ### Specify the Output Directory 106 | 107 | ```bash 108 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -o /path/to/output 109 | ``` 110 | 111 | ### Specify a Custom Filename 112 | 113 | ```bash 114 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -f custom_filename 115 | ``` 116 | 117 | ### Enable Verbose Mode 118 | 119 | ```bash 120 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -v 121 | ``` 122 | 123 | ### Specify a Punctuation Model 124 | 125 | ```bash 126 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -m author/model_name 127 | ``` 128 | 129 | Punctuation model name can be taken from [here](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages). 130 | 131 | Make sure to replace `https://www.youtube.com/watch?v=VIDEO_ID` with the actual URL of the YouTube video you want to process. 132 | 133 | Feel free to copy and paste these examples into your terminal. 134 | 135 | ## Acknowledgments 🙌 136 | 137 | This script utilizes the [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api) and [fullstop-punctuation-multilang-large](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large) libraries. Special thanks to their contributors. 138 | 139 | Feel free to adapt and use the script based on your requirements. Enjoy the convenience of YouTube transcript processing! 140 | 141 | ## Connect with me 📧 142 | 143 | The best way to connect is to email me [namaste@theRohitDas.com](mailto:namaste@therohitdas.com) 144 | 145 | - [x/therohitdas](https://x.com/therohitdas) 146 | - [GitHub/therohitdas](https://github.com/therohitdas) 147 | 148 | 🚀 Happy transcribing! 
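## Bonus: Use It from Python 🐍

The CLI is a thin wrapper; the functions in `index.py` can also be imported from your own code. Below is a minimal sketch, assuming it runs from the repository root with the packages from `requirements.txt` installed and (optionally) `YOUTUBE_API_KEY` exported so chapter headings can be fetched:

```python
import nltk

from index import (
    parse_youtube_url,
    getVideoInfo,
    clean_for_filename,
    process_and_save_transcript,
)

# Sentence splitting uses NLTK's punkt tokenizer; fetch it once if missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

url = "https://www.youtube.com/watch?v=VIDEO_ID"  # replace VIDEO_ID with a real 11-character video id
video_id = parse_youtube_url(url)
video_info = getVideoInfo(video_id)  # falls back to an empty title and no chapters without an API key
filename = clean_for_filename(video_info["title"]) or video_id

process_and_save_transcript(
    video_id,
    video_info,
    language="en",
    generate_punctuated=False,  # True also runs the punctuation model (slow without a GPU)
    output_dir=".",
    filename=filename,
    verbose=True,
    punctuation_model="",       # "" selects the default model when punctuation is enabled
)
# The transcript is written to ./<filename>.md
```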
149 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==5.3.2 2 | certifi==2023.11.17 3 | charset-normalizer==3.3.2 4 | click==8.1.7 5 | deepmultilingualpunctuation==1.0.1 6 | filelock==3.13.1 7 | fsspec==2023.12.1 8 | google-api-core==2.15.0 9 | google-api-python-client==2.110.0 10 | google-auth==2.25.2 11 | google-auth-httplib2==0.1.1 12 | google-auth-oauthlib==1.1.0 13 | googleapis-common-protos==1.62.0 14 | httplib2==0.22.0 15 | huggingface-hub==0.19.4 16 | idna==3.6 17 | Jinja2==3.1.2 18 | joblib==1.3.2 19 | MarkupSafe==2.1.3 20 | mpmath==1.3.0 21 | networkx==3.2.1 22 | nltk==3.8.1 23 | numpy==1.26.2 24 | oauthlib==3.2.2 25 | packaging==23.2 26 | protobuf==4.25.1 27 | pyasn1==0.5.1 28 | pyasn1-modules==0.3.0 29 | pyparsing==3.1.1 30 | PyYAML==6.0.1 31 | regex==2023.10.3 32 | requests==2.31.0 33 | requests-oauthlib==1.3.1 34 | rsa==4.9 35 | safetensors==0.4.1 36 | sympy==1.12 37 | tokenizers==0.15.0 38 | torch==2.1.1 39 | tqdm==4.66.1 40 | transformers==4.36.0 41 | typing_extensions==4.9.0 42 | uritemplate==4.1.1 43 | urllib3==2.1.0 44 | youtube-transcript-api==0.6.1 45 | --------------------------------------------------------------------------------