├── .env.example
├── .gitignore
├── LICENSE
├── index.py
├── main.ipynb
├── readme.md
└── requirements.txt
/.env.example:
--------------------------------------------------------------------------------
1 | YOUTUBE_API_KEY=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Rohit Das
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import logging
4 | import re
5 | import math
6 | import warnings
7 | import subprocess
8 | import platform
9 |
10 | import nltk
11 | import googleapiclient.discovery
12 | import googleapiclient.errors
13 | from deepmultilingualpunctuation import PunctuationModel
14 | from youtube_transcript_api import YouTubeTranscriptApi
15 |
# Root logger prints INFO and above; force=True replaces any handlers that
# imported libraries may have already installed.
logging.basicConfig(level=logging.INFO, force=True)
# stop any warnings
warnings.filterwarnings("ignore")
19 |
20 |
def open_file(filename):
    """Open *filename* with the platform's default application.

    Uses ``open`` on macOS, ``os.startfile`` on Windows and ``xdg-open``
    elsewhere. Failures are logged, never raised, since auto-opening is a
    convenience feature.
    """
    # Bug fix: the log line previously printed a literal placeholder
    # instead of the actual file being opened.
    logging.info(f'Opening \'{filename}\'...')
    try:
        if platform.system() == "Darwin":  # macOS
            subprocess.call(('open', filename))
        elif platform.system() == "Windows":  # Windows
            os.startfile(filename)
        else:  # linux variants
            subprocess.call(('xdg-open', filename))
    except Exception as e:
        logging.error(f'Error: {e}')
33 |
34 |
def clean_for_filename(title):
    """Return *title* reduced to filename-safe characters, trimmed.

    Keeps word characters, whitespace, dots, hyphens, parentheses and
    square brackets; everything else is dropped.
    """
    safe = re.sub(r'[^\w\s\.\-\(\)\[\]]', '', title)
    return safe.strip()
41 |
42 |
def remove_tags(text):
    """Strip bracketed annotations such as "[music]" from *text*."""
    return re.sub(r'\[.*?\]', '', text)
47 |
48 |
def remove_period_after_hashes(text):
    """Drop the stray period the punctuation model appends to markdown
    heading markers ("#." / "##." become "#" / "##")."""
    def _strip_trailing_dot(match):
        # The captured group ends in '.', so chop the final character.
        return match.group(1)[:-1]

    return re.sub(r'(#\.|##\.)', _strip_trailing_dot, text)
52 |
53 |
def remove_escape_sequences(text):
    """Delete literal escape sequences (the two characters ``\\n``,
    ``\\t``, ``\\b``, ``\\r``) that appear verbatim in some older
    subtitle tracks. Real newlines in *text* are left untouched."""
    cleaned = re.sub(r'\\[nrtb]|\\r\n', '', text)
    return cleaned
58 |
59 |
def remove_double_greater_than(text):
    """Delete speaker-change markers (">>") from subtitle text."""
    # str.replace is equivalent to the regex substitution here: both scan
    # left-to-right over non-overlapping occurrences.
    return text.replace('>>', '')
64 |
65 |
def add_punctuation(text, punctuation_model):
    """Restore punctuation in *text* with deepmultilingualpunctuation.

    An empty *punctuation_model* selects the library's default model;
    otherwise the named model is loaded.
    """
    if punctuation_model != "":
        model = PunctuationModel(model=punctuation_model)
    else:
        model = PunctuationModel()
    return model.restore_punctuation(text)
74 |
75 |
def capitalize_sentences(sentences):
    """Capitalize the first letter of each sentence.

    Bug fix: the original indexed ``sentence[0]`` unconditionally, which
    raised IndexError on an empty string; empty sentences now pass
    through unchanged.
    """
    return [s[0].upper() + s[1:] if s else s for s in sentences]
81 |
82 |
def parse_youtube_url(url):
    """Extract the 11-character video id from a youtube.com / youtu.be URL.

    Raises:
        ValueError: when no video id can be found in *url*.
    """
    match = re.search(
        r'(?:youtube\.com\/.*?[?&]v=|youtu\.be\/)([^"&?\/\s]{11})', url)
    if not match:
        raise ValueError('Invalid YouTube URL')
    return match.group(1)
90 |
91 |
def parse_chapters(description):
    """Scan a video description for timestamp lines and return chapters.

    Each line containing an ``(H)H:MM:SS`` or ``M:SS``-style timestamp
    yields one ``{"timestamp": ..., "title": ...}`` dict, in order of
    appearance. The title is the line with every timestamp and stray
    ``-`` separators stripped away.
    """
    timestamp_re = re.compile(r"(\d{0,2}:?\d{1,2}:\d{2})")
    chapters = []

    for raw_line in description.split("\n"):
        found = timestamp_re.findall(raw_line)
        if not found:
            continue

        stamp = found[0]
        heading = raw_line.replace(stamp, "").strip()
        # A line may carry a second timestamp (e.g. a range); drop it too,
        # then peel off surrounding dashes/whitespace.
        heading = re.sub(r'\d{0,2}:?\d{1,2}:\d{2}', '', heading).strip().strip(
            '-').strip().strip('-').strip()

        chapters.append({
            "timestamp": stamp,
            "title": heading,
        })

    return chapters
113 |
114 |
def get_transcript(video_id, language, video_info, verbose=True):
    """Download and assemble a markdown transcript for *video_id*.

    Fetches the transcript in *language* via youtube-transcript-api, then
    walks the caption lines in order: chapter headings from
    ``video_info["chapters"]`` are inserted as ``##`` markdown headings
    when playback reaches their timestamp, and each caption text is
    cleaned (bracketed tags, literal escape sequences, ">>" markers
    removed) before being appended.

    Args:
        video_id: YouTube video id (11 characters).
        language: transcript language code, e.g. "en".
        video_info: dict with "title" (str) and "chapters"
            (list of {"timestamp", "title"} dicts, assumed sorted by
            timestamp — TODO confirm against parse_chapters callers).
        verbose: log a progress line every 100 caption entries.

    Returns:
        The transcript as a single markdown string; the video title
        becomes a ``#`` heading when known.
    """
    transcript_list = YouTubeTranscriptApi.get_transcript(
        video_id, languages=[language])

    if video_info["title"] != "":
        transcript = f'# {video_info["title"]}\n\n'
    else:
        transcript = ''
    current_chapter_index = 0
    chapters = video_info["chapters"]
    # NOTE: the triple-quoted f-string spans two source lines, so the log
    # message contains an embedded newline and indentation.
    logging.info(f"""Transcript List Length: {
        len(transcript_list)}, Chapter Length: {len(chapters)}""")

    for i, line in enumerate(transcript_list):
        # Floor and convert to integer
        start_time = int(math.floor(line['start']))

        # Check if current_chapter_index is within the valid range
        if 0 <= current_chapter_index < len(chapters):
            chapter_time = chapters[current_chapter_index]['timestamp']

            try:
                # Extract start time from the chapter timestamp
                chapter_start = chapter_time.strip()
                # Convert "H:MM:SS" / "MM:SS" to seconds: rightmost field
                # is seconds, each field left of it is worth 60x more.
                chapter_start_seconds = sum(
                    int(x) * 60 ** i for i, x in enumerate(reversed(chapter_start.split(':'))))
                chapters[current_chapter_index]["title"] = chapters[current_chapter_index]["title"].strip()
                # Allow headings to appear up to 2 s early, since caption
                # start times rarely align exactly with chapter stamps.
                buffer_time = 2

                if start_time >= chapter_start_seconds - buffer_time:
                    # If the start time is within the buffer time, add the chapter title
                    transcript += f'\n\n## {chapters[current_chapter_index]["title"]}\n\n'
                    current_chapter_index += 1
            except Exception as e:
                # A malformed timestamp only costs us that heading; keep going.
                logging.error(
                    f"Error processing chapter timestamp: {chapter_time}")
                logging.error(f"Error details: {e}")

        # Clean the caption text in place before appending.
        line['text'] = remove_tags(line['text'])
        line['text'] = remove_escape_sequences(line['text'])
        line['text'] = remove_double_greater_than(line['text'])
        if line['text']:
            transcript += line['text'].strip() + ' '

        # Log progress information
        if verbose and i % 100 == 0:  # Adjust the log frequency as needed
            logging.info(f"Processed {i} lines out of {len(transcript_list)}")

    return transcript
164 |
165 |
def process_and_save_transcript(video_id, video_info, language, generate_punctuated, output_dir, filename, verbose, punctuation_model):
    """Fetch, optionally punctuate, capitalize, and save a transcript.

    Writes the result to ``<output_dir>/<filename>.md`` with one sentence
    per paragraph. All errors are logged rather than raised so the CLI
    exits cleanly.

    Args:
        video_id: YouTube video id.
        video_info: dict with "title" and "chapters" (see getVideoInfo).
        language: transcript language code.
        generate_punctuated: run the punctuation model over the raw text.
        output_dir: directory to write the markdown file into.
        filename: basename (without extension) for the output file.
        verbose: forwarded to get_transcript for progress logging.
        punctuation_model: optional model name for add_punctuation.
    """
    try:
        logging.info('Getting transcript...')
        raw_transcript = get_transcript(
            video_id, language, video_info, verbose)

        if generate_punctuated:
            logging.info('Generating punctuated transcript...')
            with_punctuation = add_punctuation(
                raw_transcript, punctuation_model)
            # The model can insert a '.' right after markdown heading
            # markers; strip those so headings render correctly.
            with_punctuation = remove_period_after_hashes(with_punctuation)
            logging.info('Capitalizing sentences...')
            sentences = nltk.sent_tokenize(with_punctuation)
        else:
            sentences = nltk.sent_tokenize(raw_transcript)

        # Capitalize sentences without batching
        capitalized_sentences = capitalize_sentences(sentences)

        # Blank line between sentences for readability.
        double_linesep = os.linesep + os.linesep
        capitalized_transcript = double_linesep.join(capitalized_sentences)
        # Bug fix: the path previously used a hard-coded '(unknown).md'
        # placeholder, leaving the `filename` parameter unused.
        output_path = os.path.join(output_dir, f'{filename}.md')

        logging.info(f'Saving transcript to {output_path}...')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(capitalized_transcript)

        # set log level to info to print the output path
        logging.getLogger().setLevel(logging.INFO)
        if generate_punctuated:
            logging.info(f'Punctuated transcript saved to \'{output_path}\'')
        else:
            logging.info(f'Raw transcript saved to \'{output_path}\'')

    except Exception as e:
        logging.error(f'Error: {e}')
202 |
203 |
def getVideoInfo(video_id):
    """Fetch title and chapter list for *video_id* via the YouTube Data API.

    Reads the API key from the YOUTUBE_API_KEY environment variable. On
    any failure (missing key, network, empty response) the error is
    logged and ``{"title": "", "chapters": []}`` is returned so callers
    can proceed without chapter headings.
    """
    try:
        # Set up Google API credentials using API key
        api_key = os.environ.get('YOUTUBE_API_KEY')
        if api_key is None:
            raise Exception(
                "No API key found, please set the YOUTUBE_API_KEY environment variable. \n Example: export YOUTUBE_API_KEY=your_api_key"
            )
        logging.info('Getting video info...')
        client = googleapiclient.discovery.build(
            "youtube", "v3", developerKey=api_key)
        response = client.videos().list(part="id,snippet",
                                        id=video_id
                                        ).execute()
        snippet = response['items'][0]['snippet']
        return {"title": snippet['title'],
                "chapters": parse_chapters(snippet['description'])}
    except Exception as e:
        logging.error(f'Error: {e}')
        return {"title": "", "chapters": []}
226 |
227 |
def main():
    """CLI entry point: parse arguments, fetch video info, save transcript."""
    parser = argparse.ArgumentParser(
        description='Process YouTube video transcript and save it.')
    parser.add_argument('url', type=str, help='YouTube video URL')
    parser.add_argument('-l', '--language', type=str, default='en',
                        help='Language for the transcript (default: en)')
    parser.add_argument('-p', '--punctuated', action='store_true',
                        help='Generate punctuated transcript (default: False)')
    parser.add_argument('-o', '--output_dir', type=str, default='.',
                        help='Output directory for saving the transcript (default: .)')
    parser.add_argument('-f', '--filename', type=str, default='',
                        help='Filename for saving the transcript (default: Video Title or Video Id)')
    parser.add_argument('-m', '--punctuation_model', type=str, default='',
                        help='Path to the punctuation model (default: None)')
    parser.add_argument('-a', '--auto-open', action='store_true',
                        help='Automatically open the generated file in the default application (default: False)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print verbose output (default: False)')

    args = parser.parse_args()

    # Install NLTK punkt if not already installed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        logging.error('NLTK punkt not found.')
        logging.info('Downloading punkt...')
        try:
            nltk.download('punkt')
        except Exception as e:
            logging.error(f'Error: {e}')

            # Errno 60 (connection timed out) usually means the NLTK host
            # is unreachable; suggest a proxy/VPN instead of exiting.
            if 'Errno 60' in str(e):
                logging.error(
                    'Error downloading punkt. Try using a proxy or a VPN.')
            else:
                logging.error('Error downloading punkt. Exiting.')
                exit(1)

    # if verbose is false, set logging level to error
    # Bug fix: this previously set INFO, which made --verbose a no-op and
    # contradicted the later "set log level to info" reset in
    # process_and_save_transcript.
    if not args.verbose:
        logging.getLogger().setLevel(logging.ERROR)

    video_id = parse_youtube_url(args.url)
    video_info = getVideoInfo(video_id)
    filename = args.filename or clean_for_filename(
        video_info["title"]) or clean_for_filename(video_id)

    process_and_save_transcript(video_id, video_info, args.language, args.punctuated,
                                args.output_dir, filename, args.verbose, args.punctuation_model)

    if args.auto_open:
        # Bug fix: the path previously used a hard-coded '(unknown).md'
        # placeholder, which never matched the file actually written.
        output_path = os.path.join(args.output_dir, f'{filename}.md')
        open_file(output_path)


if __name__ == "__main__":
    main()
287 |
--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4",
8 | "mount_file_id": "15KTDpG-Cy2JIQo_r4uFYGOYv3cuuySLE",
9 | "authorship_tag": "ABX9TyOOFv7bxULf3jxYdyCciRs+",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | },
19 | "accelerator": "GPU"
20 | },
21 | "cells": [
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "id": "view-in-github",
26 | "colab_type": "text"
27 | },
28 | "source": [
"<a href=\"https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "source": [
35 | "# YouTube Transcript Generator\n",
36 | "[](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb)\n",
37 | " \n",
38 | "\n",
39 | "## Overview 🌐\n",
40 | "\n",
41 | "The YouTube Transcript Generator is a powerful tool designed to streamline the process of extracting and processing transcripts from YouTube videos. Whether you're looking to transcribe lectures, interviews, or any other video content, this project provides a convenient solution.\n",
42 | "\n",
43 | "### How It Can Help 🚀\n",
44 | "\n",
45 | "This tool is particularly useful for:\n",
46 | "- **Note Taking:** Quickly convert YouTube videos into text format for easy note-taking.\n",
47 | "- **Content Analysis:** Analyze and derive insights from video content by converting it into text data.\n",
48 | "- **Chat Bot Training:** Use the generated transcripts to train chatbots, such as ChatGPT, for natural language understanding.\n",
49 | "- **Archiving:** Create a textual archive of valuable information from YouTube videos. This can be particularly useful for archiving interviews, tutorials, or any content you'd like to reference later without the need to re-watch the video.\n",
50 | "- **Personal Knowledge Base:** Build a personal knowledge base by extracting and processing transcripts from YouTube videos. This can aid in consolidating information on diverse topics in a readable and accessible format.\n",
51 | "- **Accessibility Improvement:** Enhance accessibility for individuals who prefer or require text-based content. The tool can be used to generate transcripts with added punctuation, improving the overall readability of the content.\n",
52 | "\n",
53 | "## Features 🛠️\n",
54 | "\n",
55 | "- **Transcription:** Obtain raw transcripts from YouTube videos.\n",
56 | "- **Punctuation:** Enhance transcripts by adding punctuation using [deep multilingual punctuation models](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large).\n",
57 | "- **Chapter Detection:** Identify and separate chapters in the video based on provided timestamps.\n",
58 | "- **User-friendly:** Easy-to-use script with customizable parameters.\n",
59 | "\n",
60 | "## Environment Variables 🌐\n",
61 | "\n",
62 | "- `YOUTUBE_API_KEY`: Set up your Google API key for video information retrieval. You will need to create a Project in the google cloud for this and enable the YouTube v3 API. This is optional, if you don't add it, the chapters will not be added.\n",
63 | "\n",
64 | "## Runtime\n",
65 | "Please go to `Runtime > Change runtime type > Select T4 GPU`\n",
66 | "This will ensure best performance. Without a gpu, the punctuation will be very slow and can take minutes.\n",
67 | "\n",
68 | "## Script Parameters 📜\n",
69 | "```python\n",
70 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID' # youtu.be link works too\n",
71 | "language = 'en'\n",
"punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in Google Colab.\n",
"output_dir = '.' # add /content/drive/MyDrive/ to save content in your Google Drive\n",
74 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n",
75 | "punctuation_model = '' # More info down below\n",
76 | "verbose = True # To get logs\n",
77 | "```\n",
78 | "`language` use the language code to get the video. By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated.\n",
79 | "\n",
80 | "`punctuation_model` values can be found at https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages\n",
81 | "\n",
82 | "## Support 💬\n",
83 | "\n",
84 | "For any issues or feature requests, please [create an issue](https://github.com/therohitdas/Youtube-Transcript-Generator/issues).\n",
85 | "\n",
86 | "## Acknowledgments 🙌\n",
87 | "\n",
88 | "This script utilizes the [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api) and [fullstop-punctuation-multilang-large](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large) libraries. Special thanks to their contributors.\n",
89 | "\n",
90 | "Feel free to adapt and use the script based on your requirements. Enjoy the convenience of YouTube transcript processing!\n",
91 | "\n",
92 | "## Connect with me 📧\n",
93 | "The best way to connect is to email me [namaste@theRohitDas.com](mailto:namaste@therohitdas.com)\n",
94 | "- [x/therohitdas](https://x.com/therohitdas)\n",
95 | "- [GitHub/therohitdas](https://github.com/therohitdas)\n",
96 | "\n",
97 | "🚀 Happy transcribing!"
98 | ],
99 | "metadata": {
100 | "id": "UMDjo6KMV590"
101 | }
102 | },
103 | {
104 | "cell_type": "code",
105 | "source": [
106 | "!pip install youtube-transcript-api deepmultilingualpunctuation nltk google-api-python-client"
107 | ],
108 | "metadata": {
109 | "id": "HjaKQBJeT2d7"
110 | },
111 | "execution_count": null,
112 | "outputs": []
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "source": [
117 | "**Example Usage:**\n",
118 | "```python\n",
119 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID' # youtu.be link works too\n",
120 | "language = 'en'\n",
"punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in Google Colab.\n",
"output_dir = '.' # add /content/drive/MyDrive/ to save content in your Google Drive\n",
123 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n",
124 | "punctuation_model = '' # More info down below\n",
125 | "verbose = True # To get logs\n",
126 | "```\n",
127 | "`language` use the language code to get the video. By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated.\n",
128 | "\n",
129 | "`punctuation_model` values can be found at https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages\n",
130 | "\n",
131 | "After filling the cell below, press `CMD+F9` / `CTRL+F9` to run all cells."
132 | ],
133 | "metadata": {
134 | "id": "U5fmwoG6UFDd"
135 | }
136 | },
137 | {
138 | "cell_type": "code",
139 | "source": [
140 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID'\n",
141 | "language = 'en'\n",
"punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in Google Colab.\n",
"output_dir = '.' # add /content/drive/MyDrive/ to save content in your Google Drive; in the cell below, uncomment the mount line\n",
144 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n",
145 | "punctuation_model = ''\n",
146 | "verbose = True"
147 | ],
148 | "metadata": {
149 | "id": "5CT6UxWtUYOn"
150 | },
151 | "execution_count": null,
152 | "outputs": []
153 | },
154 | {
155 | "cell_type": "code",
156 | "source": [
157 | "# Run this if you want to mount and store generated files in google drive.\n",
158 | "from google.colab import drive\n",
159 | "\n",
160 | "# Uncomment this:\n",
161 | "# drive.mount(\"/content/drive\")"
162 | ],
163 | "metadata": {
164 | "id": "7MBjbAlC8a3c"
165 | },
166 | "execution_count": null,
167 | "outputs": []
168 | },
169 | {
170 | "cell_type": "code",
171 | "source": [
172 | "import os\n",
173 | "import logging\n",
174 | "import re\n",
175 | "import math\n",
176 | "import nltk\n",
177 | "import youtube_transcript_api\n",
178 | "from deepmultilingualpunctuation import PunctuationModel\n",
179 | "import googleapiclient.discovery\n",
180 | "import googleapiclient.errors\n",
181 | "\n",
182 | "from google.colab import userdata\n",
183 | "import warnings"
184 | ],
185 | "metadata": {
186 | "id": "CCqYukC-T5EN"
187 | },
188 | "execution_count": null,
189 | "outputs": []
190 | },
191 | {
192 | "cell_type": "code",
193 | "source": [
194 | "try:\n",
195 | " nltk.data.find('tokenizers/punkt')\n",
196 | "except LookupError:\n",
197 | " nltk.download('punkt')\n",
198 | "\n",
199 | "logging.basicConfig(level=logging.INFO, force=True)\n",
200 | "warnings.filterwarnings('ignore')"
201 | ],
202 | "metadata": {
203 | "id": "vPjOXOkseYTt"
204 | },
205 | "execution_count": null,
206 | "outputs": []
207 | },
208 | {
209 | "cell_type": "code",
210 | "source": [
211 | "def clean_for_filename(title):\n",
212 | " # Define a regular expression to keep only alphanumeric characters, spaces, dots, hyphens, and various parentheses\n",
213 | " cleaned_title = re.sub(r'[^\\w\\s\\.\\-\\(\\)\\[\\]]', '', title)\n",
214 | "\n",
215 | " # Remove leading and trailing spaces\n",
216 | " return cleaned_title.strip()\n",
217 | "\n",
218 | "def remove_music_tags(text):\n",
219 | " # Remove [Music] or [music]\n",
220 | " updated_text = re.sub(r'\\[music\\]', '', text, flags=re.IGNORECASE)\n",
221 | " return updated_text\n",
222 | "\n",
223 | "def remove_period_after_hashes(text):\n",
224 | " # Remove . after # or ##, considering newline characters\n",
225 | " return re.sub(r'(#\\.|##\\.)', lambda match: match.group(1)[:-1], text)\n",
226 | "\n",
227 | "def remove_escape_sequences(text):\n",
228 | " # Some old videos contain escape sequences like \\n in their subtitle\n",
229 | " # Remove \\n, \\r\\n, \\t, \\b, \\r\n",
230 | " return re.sub(r'\\\\[nrtb]|\\\\r\\n', '', text)\n",
231 | "\n",
232 | "def remove_double_greater_than(text):\n",
233 | " # Replace occurrences of \">>\" with an empty string\n",
234 | " cleaned_text = re.sub(r'>>', '', text)\n",
235 | " return cleaned_text\n",
236 | "\n",
237 | "def add_punctuation(text, punctuation_model):\n",
238 | " if punctuation_model != \"\":\n",
239 | " model = PunctuationModel(model=punctuation_model)\n",
240 | " else:\n",
241 | " model = PunctuationModel()\n",
242 | " punctuated_text = model.restore_punctuation(text)\n",
243 | " return punctuated_text\n",
244 | "\n",
245 | "def capitalize_sentences(sentences):\n",
246 | " # Capitalize the first letter of each sentence in a batch\n",
247 | " capitalized_sentences = [sentence[0].upper() + sentence[1:] for sentence in sentences]\n",
248 | " return capitalized_sentences\n",
249 | "\n",
250 | "def parse_youtube_url(url):\n",
251 | " video_id_match = re.search(r'(?:youtube\\.com\\/.*?[?&]v=|youtu\\.be\\/)([^\"&?\\/\\s]{11})', url)\n",
252 | " if video_id_match:\n",
253 | " return video_id_match.group(1)\n",
254 | " else:\n",
255 | " raise ValueError('Invalid YouTube URL')\n",
256 | "\n",
257 | "def parse_chapters(description):\n",
258 | " lines = description.split(\"\\n\")\n",
259 | " regex = re.compile(r\"(\\d{0,2}:?\\d{1,2}:\\d{2})\")\n",
260 | " chapters = []\n",
261 | "\n",
262 | " for line in lines:\n",
263 | " matches = regex.findall(line)\n",
264 | " if matches:\n",
265 | " ts = matches[0]\n",
266 | " title = line.replace(ts, \"\").strip()\n",
267 | "\n",
268 | " # Check if the title contains another timestamp and remove it\n",
269 | " title = re.sub(r'\\d{0,2}:?\\d{1,2}:\\d{2}', '', title).strip().strip('-').strip().strip('-').strip()\n",
270 | "\n",
271 | " chapters.append({\n",
272 | " \"timestamp\": ts,\n",
273 | " \"title\": title,\n",
274 | " })\n",
275 | "\n",
276 | " return chapters\n",
277 | "\n",
278 | "def get_transcript(video_id, language, video_info, verbose=True):\n",
279 | " transcript_list = youtube_transcript_api.YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n",
280 | "\n",
281 | " if video_info[\"title\"] != \"\":\n",
282 | " transcript = f'# {video_info[\"title\"]}\\n\\n'\n",
283 | "\n",
284 | " current_chapter_index = 0\n",
285 | " chapters = video_info[\"chapters\"]\n",
286 | " logging.info(f\"Transcript_List Length: {len(transcript_list)}, Chapter Length: {len(chapters)}\")\n",
287 | "\n",
288 | " for i, line in enumerate(transcript_list):\n",
289 | " start_time = int(math.floor(line['start'])) # Floor and convert to integer\n",
290 | "\n",
291 | " # Check if current_chapter_index is within the valid range\n",
292 | " if 0 <= current_chapter_index < len(chapters):\n",
293 | " chapter_time = chapters[current_chapter_index]['timestamp']\n",
294 | "\n",
295 | " try:\n",
296 | " # Extract start time from the chapter timestamp\n",
297 | " chapter_start = chapter_time.strip()\n",
298 | " chapter_start_seconds = sum(int(x) * 60 ** i for i, x in enumerate(reversed(chapter_start.split(':'))))\n",
299 | " chapters[current_chapter_index][\"title\"] = chapters[current_chapter_index][\"title\"].strip()\n",
300 | " buffer_time = 2\n",
301 | "\n",
302 | " if start_time >= chapter_start_seconds - buffer_time:\n",
303 | " logging.info(f'\\n\\n## {chapters[current_chapter_index][\"title\"]}\\n')\n",
304 | " current_chapter_index += 1\n",
305 | " except Exception as e:\n",
306 | " logging.error(f\"Error processing chapter timestamp: {chapter_time}\")\n",
307 | " logging.error(f\"Error details: {e}\")\n",
308 | "\n",
309 | " line['text'] = remove_music_tags(line['text'])\n",
310 | " line['text'] = remove_escape_sequences(line['text'])\n",
311 | " line['text'] = remove_double_greater_than(line['text'])\n",
312 | " if line['text']:\n",
313 | " transcript += line['text'].strip() + ' '\n",
314 | "\n",
315 | " # Log progress information\n",
316 | " if verbose and i % 100 == 0: # Adjust the log frequency as needed\n",
317 | " logging.info(f\"Processed {i} lines out of {len(transcript_list)}\")\n",
318 | "\n",
319 | " return transcript\n",
320 | "\n",
321 | "def process_and_save_transcript(video_id, video_info, language, generate_punctuated, output_dir, filename, verbose, punctuation_model):\n",
322 | " try:\n",
323 | " raw_transcript = get_transcript(video_id, language, video_info, verbose)\n",
324 | " logging.info(\"Raw Transcript Length: %d\", len(raw_transcript))\n",
325 | "\n",
326 | " if generate_punctuated:\n",
327 | " with_punctuation = add_punctuation(raw_transcript, punctuation_model)\n",
328 | " with_punctuation = remove_period_after_hashes(with_punctuation)\n",
329 | " logging.info(\"Punctuation Char Length: %d\", len(with_punctuation))\n",
330 | " sentences = nltk.sent_tokenize(with_punctuation)\n",
331 | " logging.info(\"Sentences to process, (punctuated): %d\", len(sentences))\n",
332 | " else:\n",
333 | " sentences = nltk.sent_tokenize(raw_transcript)\n",
334 | " logging.info(\"Sentences to process, (raw): %d\", len(sentences))\n",
335 | "\n",
336 | " # Capitalize sentences without batching\n",
337 | " capitalized_sentences = capitalize_sentences(sentences)\n",
338 | "\n",
339 | " double_linesep = os.linesep + os.linesep\n",
340 | " capitalized_transcript = double_linesep.join(capitalized_sentences)\n",
"    output_path = os.path.join(output_dir, f'{filename}.md')\n",
342 | "\n",
343 | " with open(output_path, 'w', encoding='utf-8') as f:\n",
344 | " f.write(capitalized_transcript)\n",
345 | "\n",
346 | " if generate_punctuated:\n",
347 | " logging.info(f'Punctuated transcript saved to {output_path}')\n",
348 | " else:\n",
349 | " logging.info(f'Raw transcript saved to {output_path}')\n",
350 | "\n",
351 | " except Exception as e:\n",
352 | " logging.error(f'Error: {e}')\n",
353 | "\n",
354 | "def getVideoInfo (video_id):\n",
355 | " try:\n",
356 | " # Set up Google API credentials using API key\n",
357 | " api_key = userdata.get('YOUTUBE_API_KEY') # Replace with your actual API key\n",
358 | " youtube = googleapiclient.discovery.build(\"youtube\", \"v3\", developerKey=api_key)\n",
359 | " request = youtube.videos().list(part=\"id,snippet\",\n",
360 | " id = video_id\n",
361 | " )\n",
362 | " response = request.execute()\n",
363 | " title = response['items'][0]['snippet']['title']\n",
364 | " description = response['items'][0]['snippet']['description']\n",
365 | " data = {\"title\" : title, \"chapters\" : parse_chapters(description)}\n",
366 | " return data\n",
367 | " except Exception as e:\n",
368 | " logging.error(f'Error: {e}')\n",
369 | " return {\"title\": \"\", \"chapters\": []}"
370 | ],
371 | "metadata": {
372 | "id": "oasPyMVQoi7u"
373 | },
374 | "execution_count": null,
375 | "outputs": []
376 | },
377 | {
378 | "cell_type": "code",
379 | "source": [
380 | "video_id = parse_youtube_url(url)\n",
381 | "video_info = getVideoInfo(video_id)\n",
382 | "filename = filename or clean_for_filename(video_info[\"title\"]) or clean_for_filename(video_id)"
383 | ],
384 | "metadata": {
385 | "id": "c-M0h6sCmHK1"
386 | },
387 | "execution_count": null,
388 | "outputs": []
389 | },
390 | {
391 | "cell_type": "code",
392 | "source": [
393 | "process_and_save_transcript(video_id, video_info, language, punctuated, output_dir, filename, verbose, punctuation_model)"
394 | ],
395 | "metadata": {
396 | "id": "CJgLX_DhcPsS"
397 | },
398 | "execution_count": null,
399 | "outputs": []
400 | },
401 | {
402 | "cell_type": "code",
403 | "source": [
404 | "# Download the Generated File\n",
405 | "from google.colab import files\n",
406 | "files.download(os.path.join(output_dir, f'{filename}.md'))"
407 | ],
408 | "metadata": {
409 | "id": "w9xpxQPTmalR"
410 | },
411 | "execution_count": null,
412 | "outputs": []
413 | }
414 | ]
415 | }
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # YouTube Transcript Generator
2 |
3 | [](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb)
4 |  
5 | [](https://www.codefactor.io/repository/github/therohitdas/youtube-transcript-generator)
6 |
7 | ## Overview 🌐
8 |
9 | The YouTube Transcript Generator is a powerful tool designed to streamline the process of extracting and processing transcripts from YouTube videos. Whether you're looking to transcribe lectures, interviews, or any other video content, this project provides a convenient solution.
10 |
11 | ### How It Can Help 🚀
12 |
13 | This tool is particularly useful for:
14 |
15 | - **Note Taking:** Quickly convert YouTube videos into text format for easy note-taking.
16 | - **Content Analysis:** Analyze and derive insights from video content by converting it into text data.
17 | - **Chat Bot Training:** Use the generated transcripts to train chat bots, such as ChatGPT, for natural language understanding.
18 | - **Archiving:** Create a textual archive of valuable information from YouTube videos. This can be particularly useful for archiving interviews, tutorials, or any content you'd like to reference later without the need to re-watch the video.
19 | - **Personal Knowledge Base:** Build a personal knowledge base by extracting and processing transcripts from YouTube videos. This can aid in consolidating information on diverse topics in a readable and accessible format.
20 | - **Accessibility Improvement:** Enhance accessibility for individuals who prefer or require text-based content. The tool can be used to generate transcripts with added punctuation, improving the overall readability of the content.
21 |
22 | ## Features 🛠️
23 |
24 | - **Transcription:** Obtain raw transcripts from YouTube videos.
25 | - **Punctuation:** Enhance transcripts by adding punctuation using [deep multilingual punctuation models](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large).
26 | - **Chapter Detection:** Identify and separate chapters in the video based on provided timestamps.
27 | - **User-friendly:** Easy-to-use script with customizable parameters.
28 |
29 | ## Environment Variables 🌐
30 |
31 | - `YOUTUBE_API_KEY`: Set up your Google API key for video information retrieval. You will need to create a Project in the Google Cloud for this and enable the YouTube v3 API. This is optional, if you don't add it, the chapters will not be added.
32 |
33 | ## Script Parameters 📜
34 |
35 | When running the script locally, you can pass these parameters to the script:
36 |
37 | ### Positional Argument:
38 |
39 | - `url`: YouTube video URL
40 |
41 | ### Optional Arguments:
42 |
43 | - `-h, --help`: Show the help message and exit
44 | - `-l LANGUAGE, --language LANGUAGE`: Language for the transcript (default: en)
45 | - `-p, --punctuated`: Generate punctuated transcript (default: False)
46 | - `-a, --auto-open`: Automatically open the transcript in the default app (default: False)
47 | - `-o OUTPUT_DIR, --output_dir OUTPUT_DIR`: Output directory for saving the transcript (default: current directory)
48 | - `-f FILENAME, --filename FILENAME`: Filename for saving the transcript (default: Video Title or Video Id)
49 | - `-m PUNCTUATION_MODEL, --punctuation_model PUNCTUATION_MODEL`: Path to the punctuation model (default: None)
50 | - `-v, --verbose`: Print verbose output (default: False)
51 |
52 | ## Run in Google Colab 🚀
53 |
54 | To run this project in Google Colab, follow these steps:
55 |
56 | 1. Open the [Google Colab Notebook](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb).
57 | 2. Add Google's Project API key to the secrets tab under this key: `YOUTUBE_API_KEY` and toggle notebook access to on.
58 | 3. Go to Runtime > Change Runtime Type and select the T4 GPU type. If you use a CPU, generating the punctuated transcript will take several minutes (around 1 minute per 10-minute video).
59 | 4. Change the values in the second cell to include your URL etc.
60 | 5. Press CTRL+F9 or CMD+F9 to run the notebook.
61 |
62 | ## Run Locally 💻
63 |
64 | I do not recommend running locally as it will download tensors and other stuff which are over 6gb. But if you want you can do this:
65 |
66 | 1. Clone the repository: `git clone https://github.com/therohitdas/Youtube-Transcript-Generator.git && cd Youtube-Transcript-Generator`
67 | 2. Create a virtual environment: `python -m venv venv`
68 | 3. Activate the virtual environment: `source venv/bin/activate` (Linux/MacOS) or `venv\Scripts\activate` (Windows)
69 | 4. Install dependencies: `pip install -r requirements.txt`
70 | 5. Set up the required environment variables: `YOUTUBE_API_KEY` (optional). You can either create a `.env` file or set them directly in your system environment.
71 | 6. Run the script: `python index.py <video_url>` or `python index.py -h` for the help menu.
72 |
73 | ## Support 🤝
74 |
75 | For any issues or feature requests, please [create an issue](https://github.com/therohitdas/Youtube-Transcript-Generator/issues).
76 |
77 | ## Example 📋
78 |
79 | Here's an example of how to run the script with various options:
80 |
81 | ### Basic Usage
82 |
83 | ```bash
84 | python index.py https://www.youtube.com/watch?v=VIDEO_ID
85 | ```
86 |
87 | ### Specify the Language
88 |
89 | ```bash
90 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -l fr
91 | ```
92 |
93 | ### Generate a Raw Transcript
94 |
95 | ```bash
96 | python index.py https://www.youtube.com/watch?v=VIDEO_ID
97 | ```
98 |
99 | ### Generate a Punctuated Transcript
100 |
101 | ```bash
102 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -p
103 | ```
104 |
105 | ### Specify the Output Directory
106 |
107 | ```bash
108 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -o /path/to/output
109 | ```
110 |
111 | ### Specify a Custom Filename
112 |
113 | ```bash
114 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -f custom_filename
115 | ```
116 |
117 | ### Enable Verbose Mode
118 |
119 | ```bash
120 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -v
121 | ```
122 |
123 | ### Specify a Punctuation Model
124 |
125 | ```bash
126 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -m author/model_name
127 | ```
128 |
129 | Punctuation model name can be taken from [here](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages).
130 |
131 | Make sure to replace `https://www.youtube.com/watch?v=VIDEO_ID` with the actual URL of the YouTube video you want to process.
132 |
133 | Feel free to copy and paste these examples into your terminal.
134 |
135 | ## Acknowledgments 🙌
136 |
137 | This script utilizes the [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api) and [fullstop-punctuation-multilang-large](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large) libraries. Special thanks to their contributors.
138 |
139 | Feel free to adapt and use the script based on your requirements. Enjoy the convenience of YouTube transcript processing!
140 |
141 | ## Connect with me 📧
142 |
143 | The best way to connect is to email me [namaste@theRohitDas.com](mailto:namaste@therohitdas.com)
144 |
145 | - [x/therohitdas](https://x.com/therohitdas)
146 | - [GitHub/therohitdas](https://github.com/therohitdas)
147 |
148 | 🚀 Happy transcribing!
149 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cachetools==5.3.2
2 | certifi==2023.11.17
3 | charset-normalizer==3.3.2
4 | click==8.1.7
5 | deepmultilingualpunctuation==1.0.1
6 | filelock==3.13.1
7 | fsspec==2023.12.1
8 | google-api-core==2.15.0
9 | google-api-python-client==2.110.0
10 | google-auth==2.25.2
11 | google-auth-httplib2==0.1.1
12 | google-auth-oauthlib==1.1.0
13 | googleapis-common-protos==1.62.0
14 | httplib2==0.22.0
15 | huggingface-hub==0.19.4
16 | idna==3.6
17 | Jinja2==3.1.2
18 | joblib==1.3.2
19 | MarkupSafe==2.1.3
20 | mpmath==1.3.0
21 | networkx==3.2.1
22 | nltk==3.8.1
23 | numpy==1.26.2
24 | oauthlib==3.2.2
25 | packaging==23.2
26 | protobuf==4.25.1
27 | pyasn1==0.5.1
28 | pyasn1-modules==0.3.0
29 | pyparsing==3.1.1
30 | PyYAML==6.0.1
31 | regex==2023.10.3
32 | requests==2.31.0
33 | requests-oauthlib==1.3.1
34 | rsa==4.9
35 | safetensors==0.4.1
36 | sympy==1.12
37 | tokenizers==0.15.0
38 | torch==2.1.1
39 | tqdm==4.66.1
40 | transformers==4.36.0
41 | typing_extensions==4.9.0
42 | uritemplate==4.1.1
43 | urllib3==2.1.0
44 | youtube-transcript-api==0.6.1
45 |
--------------------------------------------------------------------------------