├── .env.example
├── .gitignore
├── LICENSE
├── index.py
├── main.ipynb
├── readme.md
└── requirements.txt
/.env.example:
--------------------------------------------------------------------------------
1 | YOUTUBE_API_KEY=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Rohit Das
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import logging
4 | import re
5 | import math
6 | import warnings
7 | import subprocess
8 | import platform
9 |
10 | import nltk
11 | import googleapiclient.discovery
12 | import googleapiclient.errors
13 | from deepmultilingualpunctuation import PunctuationModel
14 | from youtube_transcript_api import YouTubeTranscriptApi
15 |
# Root logger prints INFO and above; force=True replaces any handlers that
# imported libraries may have already installed.
logging.basicConfig(level=logging.INFO, force=True)
# stop any warnings
warnings.filterwarnings("ignore")
19 |
20 |
def open_file(filename):
    """Open *filename* with the platform's default application.

    Uses ``open`` on macOS, ``os.startfile`` on Windows and ``xdg-open``
    elsewhere. Failures are logged, never raised, since auto-opening is a
    convenience feature.
    """
    # Bug fix: the log line previously printed a literal placeholder
    # instead of the actual file being opened.
    logging.info(f'Opening \'{filename}\'...')
    try:
        if platform.system() == "Darwin":  # macOS
            subprocess.call(('open', filename))
        elif platform.system() == "Windows":  # Windows
            os.startfile(filename)
        else:  # linux variants
            subprocess.call(('xdg-open', filename))
    except Exception as e:
        logging.error(f'Error: {e}')
33 |
34 |
def clean_for_filename(title):
    """Return *title* reduced to filename-safe characters, trimmed.

    Keeps word characters, whitespace, dots, hyphens, parentheses and
    square brackets; everything else is dropped.
    """
    safe = re.sub(r'[^\w\s\.\-\(\)\[\]]', '', title)
    return safe.strip()
41 |
42 |
def remove_tags(text):
    """Strip bracketed annotations such as "[music]" from *text*."""
    return re.sub(r'\[.*?\]', '', text)
47 |
48 |
def remove_period_after_hashes(text):
    """Drop the stray period the punctuation model appends to markdown
    heading markers ("#." / "##." become "#" / "##")."""
    def _strip_trailing_dot(match):
        # The captured group ends in '.', so chop the final character.
        return match.group(1)[:-1]

    return re.sub(r'(#\.|##\.)', _strip_trailing_dot, text)
52 |
53 |
def remove_escape_sequences(text):
    """Delete literal escape sequences (the two characters ``\\n``,
    ``\\t``, ``\\b``, ``\\r``) that appear verbatim in some older
    subtitle tracks. Real newlines in *text* are left untouched."""
    cleaned = re.sub(r'\\[nrtb]|\\r\n', '', text)
    return cleaned
58 |
59 |
def remove_double_greater_than(text):
    """Delete speaker-change markers (">>") from subtitle text."""
    # str.replace is equivalent to the regex substitution here: both scan
    # left-to-right over non-overlapping occurrences.
    return text.replace('>>', '')
64 |
65 |
def add_punctuation(text, punctuation_model):
    """Restore punctuation in *text* with deepmultilingualpunctuation.

    An empty *punctuation_model* selects the library's default model;
    otherwise the named model is loaded.
    """
    if punctuation_model != "":
        model = PunctuationModel(model=punctuation_model)
    else:
        model = PunctuationModel()
    return model.restore_punctuation(text)
74 |
75 |
def capitalize_sentences(sentences):
    """Capitalize the first letter of each sentence.

    Bug fix: the original indexed ``sentence[0]`` unconditionally, which
    raised IndexError on an empty string; empty sentences now pass
    through unchanged.
    """
    return [s[0].upper() + s[1:] if s else s for s in sentences]
81 |
82 |
def parse_youtube_url(url):
    """Extract the 11-character video id from a youtube.com / youtu.be URL.

    Raises:
        ValueError: when no video id can be found in *url*.
    """
    match = re.search(
        r'(?:youtube\.com\/.*?[?&]v=|youtu\.be\/)([^"&?\/\s]{11})', url)
    if not match:
        raise ValueError('Invalid YouTube URL')
    return match.group(1)
90 |
91 |
def parse_chapters(description):
    """Scan a video description for timestamp lines and return chapters.

    Each line containing an ``(H)H:MM:SS`` or ``M:SS``-style timestamp
    yields one ``{"timestamp": ..., "title": ...}`` dict, in order of
    appearance. The title is the line with every timestamp and stray
    ``-`` separators stripped away.
    """
    timestamp_re = re.compile(r"(\d{0,2}:?\d{1,2}:\d{2})")
    chapters = []

    for raw_line in description.split("\n"):
        found = timestamp_re.findall(raw_line)
        if not found:
            continue

        stamp = found[0]
        heading = raw_line.replace(stamp, "").strip()
        # A line may carry a second timestamp (e.g. a range); drop it too,
        # then peel off surrounding dashes/whitespace.
        heading = re.sub(r'\d{0,2}:?\d{1,2}:\d{2}', '', heading).strip().strip(
            '-').strip().strip('-').strip()

        chapters.append({
            "timestamp": stamp,
            "title": heading,
        })

    return chapters
113 |
114 |
def get_transcript(video_id, language, video_info, verbose=True):
    """Download and assemble a markdown transcript for *video_id*.

    Fetches the transcript in *language* via youtube-transcript-api, then
    walks the caption lines in order: chapter headings from
    ``video_info["chapters"]`` are inserted as ``##`` markdown headings
    when playback reaches their timestamp, and each caption text is
    cleaned (bracketed tags, literal escape sequences, ">>" markers
    removed) before being appended.

    Args:
        video_id: YouTube video id (11 characters).
        language: transcript language code, e.g. "en".
        video_info: dict with "title" (str) and "chapters"
            (list of {"timestamp", "title"} dicts, assumed sorted by
            timestamp — TODO confirm against parse_chapters callers).
        verbose: log a progress line every 100 caption entries.

    Returns:
        The transcript as a single markdown string; the video title
        becomes a ``#`` heading when known.
    """
    transcript_list = YouTubeTranscriptApi.get_transcript(
        video_id, languages=[language])

    if video_info["title"] != "":
        transcript = f'# {video_info["title"]}\n\n'
    else:
        transcript = ''
    current_chapter_index = 0
    chapters = video_info["chapters"]
    # NOTE: the triple-quoted f-string spans two source lines, so the log
    # message contains an embedded newline and indentation.
    logging.info(f"""Transcript List Length: {
        len(transcript_list)}, Chapter Length: {len(chapters)}""")

    for i, line in enumerate(transcript_list):
        # Floor and convert to integer
        start_time = int(math.floor(line['start']))

        # Check if current_chapter_index is within the valid range
        if 0 <= current_chapter_index < len(chapters):
            chapter_time = chapters[current_chapter_index]['timestamp']

            try:
                # Extract start time from the chapter timestamp
                chapter_start = chapter_time.strip()
                # Convert "H:MM:SS" / "MM:SS" to seconds: rightmost field
                # is seconds, each field left of it is worth 60x more.
                chapter_start_seconds = sum(
                    int(x) * 60 ** i for i, x in enumerate(reversed(chapter_start.split(':'))))
                chapters[current_chapter_index]["title"] = chapters[current_chapter_index]["title"].strip()
                # Allow headings to appear up to 2 s early, since caption
                # start times rarely align exactly with chapter stamps.
                buffer_time = 2

                if start_time >= chapter_start_seconds - buffer_time:
                    # If the start time is within the buffer time, add the chapter title
                    transcript += f'\n\n## {chapters[current_chapter_index]["title"]}\n\n'
                    current_chapter_index += 1
            except Exception as e:
                # A malformed timestamp only costs us that heading; keep going.
                logging.error(
                    f"Error processing chapter timestamp: {chapter_time}")
                logging.error(f"Error details: {e}")

        # Clean the caption text in place before appending.
        line['text'] = remove_tags(line['text'])
        line['text'] = remove_escape_sequences(line['text'])
        line['text'] = remove_double_greater_than(line['text'])
        if line['text']:
            transcript += line['text'].strip() + ' '

        # Log progress information
        if verbose and i % 100 == 0:  # Adjust the log frequency as needed
            logging.info(f"Processed {i} lines out of {len(transcript_list)}")

    return transcript
164 |
165 |
def process_and_save_transcript(video_id, video_info, language, generate_punctuated, output_dir, filename, verbose, punctuation_model):
    """Fetch, optionally punctuate, capitalize, and save a transcript.

    Writes the result to ``<output_dir>/<filename>.md`` with one sentence
    per paragraph. All errors are logged rather than raised so the CLI
    exits cleanly.

    Args:
        video_id: YouTube video id.
        video_info: dict with "title" and "chapters" (see getVideoInfo).
        language: transcript language code.
        generate_punctuated: run the punctuation model over the raw text.
        output_dir: directory to write the markdown file into.
        filename: basename (without extension) for the output file.
        verbose: forwarded to get_transcript for progress logging.
        punctuation_model: optional model name for add_punctuation.
    """
    try:
        logging.info('Getting transcript...')
        raw_transcript = get_transcript(
            video_id, language, video_info, verbose)

        if generate_punctuated:
            logging.info('Generating punctuated transcript...')
            with_punctuation = add_punctuation(
                raw_transcript, punctuation_model)
            # The model can insert a '.' right after markdown heading
            # markers; strip those so headings render correctly.
            with_punctuation = remove_period_after_hashes(with_punctuation)
            logging.info('Capitalizing sentences...')
            sentences = nltk.sent_tokenize(with_punctuation)
        else:
            sentences = nltk.sent_tokenize(raw_transcript)

        # Capitalize sentences without batching
        capitalized_sentences = capitalize_sentences(sentences)

        # Blank line between sentences for readability.
        double_linesep = os.linesep + os.linesep
        capitalized_transcript = double_linesep.join(capitalized_sentences)
        # Bug fix: the path previously used a hard-coded '(unknown).md'
        # placeholder, leaving the `filename` parameter unused.
        output_path = os.path.join(output_dir, f'{filename}.md')

        logging.info(f'Saving transcript to {output_path}...')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(capitalized_transcript)

        # set log level to info to print the output path
        logging.getLogger().setLevel(logging.INFO)
        if generate_punctuated:
            logging.info(f'Punctuated transcript saved to \'{output_path}\'')
        else:
            logging.info(f'Raw transcript saved to \'{output_path}\'')

    except Exception as e:
        logging.error(f'Error: {e}')
202 |
203 |
def getVideoInfo(video_id):
    """Fetch title and chapter list for *video_id* via the YouTube Data API.

    Reads the API key from the YOUTUBE_API_KEY environment variable. On
    any failure (missing key, network, empty response) the error is
    logged and ``{"title": "", "chapters": []}`` is returned so callers
    can proceed without chapter headings.
    """
    try:
        # Set up Google API credentials using API key
        api_key = os.environ.get('YOUTUBE_API_KEY')
        if api_key is None:
            raise Exception(
                "No API key found, please set the YOUTUBE_API_KEY environment variable. \n Example: export YOUTUBE_API_KEY=your_api_key"
            )
        logging.info('Getting video info...')
        client = googleapiclient.discovery.build(
            "youtube", "v3", developerKey=api_key)
        response = client.videos().list(part="id,snippet",
                                        id=video_id
                                        ).execute()
        snippet = response['items'][0]['snippet']
        return {"title": snippet['title'],
                "chapters": parse_chapters(snippet['description'])}
    except Exception as e:
        logging.error(f'Error: {e}')
        return {"title": "", "chapters": []}
226 |
227 |
def main():
    """CLI entry point: parse arguments, fetch video info, save transcript."""
    parser = argparse.ArgumentParser(
        description='Process YouTube video transcript and save it.')
    parser.add_argument('url', type=str, help='YouTube video URL')
    parser.add_argument('-l', '--language', type=str, default='en',
                        help='Language for the transcript (default: en)')
    parser.add_argument('-p', '--punctuated', action='store_true',
                        help='Generate punctuated transcript (default: False)')
    parser.add_argument('-o', '--output_dir', type=str, default='.',
                        help='Output directory for saving the transcript (default: .)')
    parser.add_argument('-f', '--filename', type=str, default='',
                        help='Filename for saving the transcript (default: Video Title or Video Id)')
    parser.add_argument('-m', '--punctuation_model', type=str, default='',
                        help='Path to the punctuation model (default: None)')
    parser.add_argument('-a', '--auto-open', action='store_true',
                        help='Automatically open the generated file in the default application (default: False)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print verbose output (default: False)')

    args = parser.parse_args()

    # Install NLTK punkt if not already installed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        logging.error('NLTK punkt not found.')
        logging.info('Downloading punkt...')
        try:
            nltk.download('punkt')
        except Exception as e:
            logging.error(f'Error: {e}')

            # Errno 60 (connection timed out) usually means the NLTK host
            # is unreachable; suggest a proxy/VPN instead of exiting.
            if 'Errno 60' in str(e):
                logging.error(
                    'Error downloading punkt. Try using a proxy or a VPN.')
            else:
                logging.error('Error downloading punkt. Exiting.')
                exit(1)

    # if verbose is false, set logging level to error
    # Bug fix: this previously set INFO, which made --verbose a no-op and
    # contradicted the later "set log level to info" reset in
    # process_and_save_transcript.
    if not args.verbose:
        logging.getLogger().setLevel(logging.ERROR)

    video_id = parse_youtube_url(args.url)
    video_info = getVideoInfo(video_id)
    filename = args.filename or clean_for_filename(
        video_info["title"]) or clean_for_filename(video_id)

    process_and_save_transcript(video_id, video_info, args.language, args.punctuated,
                                args.output_dir, filename, args.verbose, args.punctuation_model)

    if args.auto_open:
        # Bug fix: the path previously used a hard-coded '(unknown).md'
        # placeholder, which never matched the file actually written.
        output_path = os.path.join(args.output_dir, f'{filename}.md')
        open_file(output_path)


if __name__ == "__main__":
    main()
287 |
--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4",
8 | "mount_file_id": "15KTDpG-Cy2JIQo_r4uFYGOYv3cuuySLE",
9 | "authorship_tag": "ABX9TyOOFv7bxULf3jxYdyCciRs+",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | },
19 | "accelerator": "GPU"
20 | },
21 | "cells": [
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "id": "view-in-github",
26 | "colab_type": "text"
27 | },
28 | "source": [
"<a href=\"https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "source": [
35 | "# YouTube Transcript Generator\n",
36 | "[](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb)\n",
37 | " \n",
38 | "\n",
39 | "## Overview 🌐\n",
40 | "\n",
41 | "The YouTube Transcript Generator is a powerful tool designed to streamline the process of extracting and processing transcripts from YouTube videos. Whether you're looking to transcribe lectures, interviews, or any other video content, this project provides a convenient solution.\n",
42 | "\n",
43 | "### How It Can Help 🚀\n",
44 | "\n",
45 | "This tool is particularly useful for:\n",
46 | "- **Note Taking:** Quickly convert YouTube videos into text format for easy note-taking.\n",
47 | "- **Content Analysis:** Analyze and derive insights from video content by converting it into text data.\n",
48 | "- **Chat Bot Training:** Use the generated transcripts to train chatbots, such as ChatGPT, for natural language understanding.\n",
49 | "- **Archiving:** Create a textual archive of valuable information from YouTube videos. This can be particularly useful for archiving interviews, tutorials, or any content you'd like to reference later without the need to re-watch the video.\n",
50 | "- **Personal Knowledge Base:** Build a personal knowledge base by extracting and processing transcripts from YouTube videos. This can aid in consolidating information on diverse topics in a readable and accessible format.\n",
51 | "- **Accessibility Improvement:** Enhance accessibility for individuals who prefer or require text-based content. The tool can be used to generate transcripts with added punctuation, improving the overall readability of the content.\n",
52 | "\n",
53 | "## Features 🛠️\n",
54 | "\n",
55 | "- **Transcription:** Obtain raw transcripts from YouTube videos.\n",
56 | "- **Punctuation:** Enhance transcripts by adding punctuation using [deep multilingual punctuation models](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large).\n",
57 | "- **Chapter Detection:** Identify and separate chapters in the video based on provided timestamps.\n",
58 | "- **User-friendly:** Easy-to-use script with customizable parameters.\n",
59 | "\n",
60 | "## Environment Variables 🌐\n",
61 | "\n",
62 | "- `YOUTUBE_API_KEY`: Set up your Google API key for video information retrieval. You will need to create a Project in the google cloud for this and enable the YouTube v3 API. This is optional, if you don't add it, the chapters will not be added.\n",
63 | "\n",
64 | "## Runtime\n",
65 | "Please go to `Runtime > Change runtime type > Select T4 GPU`\n",
66 | "This will ensure best performance. Without a gpu, the punctuation will be very slow and can take minutes.\n",
67 | "\n",
68 | "## Script Parameters 📜\n",
69 | "```python\n",
70 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID' # youtu.be link works too\n",
71 | "language = 'en'\n",
"punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in Google Colab.\n",
"output_dir = '.' # add /content/drive/MyDrive/ to save content in your Google Drive\n",
74 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n",
75 | "punctuation_model = '' # More info down below\n",
76 | "verbose = True # To get logs\n",
77 | "```\n",
78 | "`language` use the language code to get the video. By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated.\n",
79 | "\n",
80 | "`punctuation_model` values can be found at https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages\n",
81 | "\n",
82 | "## Support 💬\n",
83 | "\n",
84 | "For any issues or feature requests, please [create an issue](https://github.com/therohitdas/Youtube-Transcript-Generator/issues).\n",
85 | "\n",
86 | "## Acknowledgments 🙌\n",
87 | "\n",
88 | "This script utilizes the [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api) and [fullstop-punctuation-multilang-large](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large) libraries. Special thanks to their contributors.\n",
89 | "\n",
90 | "Feel free to adapt and use the script based on your requirements. Enjoy the convenience of YouTube transcript processing!\n",
91 | "\n",
92 | "## Connect with me 📧\n",
93 | "The best way to connect is to email me [namaste@theRohitDas.com](mailto:namaste@therohitdas.com)\n",
94 | "- [x/therohitdas](https://x.com/therohitdas)\n",
95 | "- [GitHub/therohitdas](https://github.com/therohitdas)\n",
96 | "\n",
97 | "🚀 Happy transcribing!"
98 | ],
99 | "metadata": {
100 | "id": "UMDjo6KMV590"
101 | }
102 | },
103 | {
104 | "cell_type": "code",
105 | "source": [
106 | "!pip install youtube-transcript-api deepmultilingualpunctuation nltk google-api-python-client"
107 | ],
108 | "metadata": {
109 | "id": "HjaKQBJeT2d7"
110 | },
111 | "execution_count": null,
112 | "outputs": []
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "source": [
117 | "**Example Usage:**\n",
118 | "```python\n",
119 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID' # youtu.be link works too\n",
120 | "language = 'en'\n",
"punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in Google Colab.\n",
"output_dir = '.' # add /content/drive/MyDrive/ to save content in your Google Drive\n",
123 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n",
124 | "punctuation_model = '' # More info down below\n",
125 | "verbose = True # To get logs\n",
126 | "```\n",
127 | "`language` use the language code to get the video. By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated.\n",
128 | "\n",
129 | "`punctuation_model` values can be found at https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages\n",
130 | "\n",
131 | "After filling the cell below, press `CMD+F9` / `CTRL+F9` to run all cells."
132 | ],
133 | "metadata": {
134 | "id": "U5fmwoG6UFDd"
135 | }
136 | },
137 | {
138 | "cell_type": "code",
139 | "source": [
140 | "url = 'https://www.youtube.com/watch?v=YOUR_VIDEO_ID'\n",
141 | "language = 'en'\n",
"punctuated = True # Default False, takes significantly more time when enabled on CPU, use T4 GPU type in Google Colab.\n",
"output_dir = '.' # add /content/drive/MyDrive/ to save content in your Google Drive; in the cell below, uncomment the mount line\n",
144 | "filename = \"\" # Leave empty for default filename: Video Title or Video Id\n",
145 | "punctuation_model = ''\n",
146 | "verbose = True"
147 | ],
148 | "metadata": {
149 | "id": "5CT6UxWtUYOn"
150 | },
151 | "execution_count": null,
152 | "outputs": []
153 | },
154 | {
155 | "cell_type": "code",
156 | "source": [
157 | "# Run this if you want to mount and store generated files in google drive.\n",
158 | "from google.colab import drive\n",
159 | "\n",
160 | "# Uncomment this:\n",
161 | "# drive.mount(\"/content/drive\")"
162 | ],
163 | "metadata": {
164 | "id": "7MBjbAlC8a3c"
165 | },
166 | "execution_count": null,
167 | "outputs": []
168 | },
169 | {
170 | "cell_type": "code",
171 | "source": [
172 | "import os\n",
173 | "import logging\n",
174 | "import re\n",
175 | "import math\n",
176 | "import nltk\n",
177 | "import youtube_transcript_api\n",
178 | "from deepmultilingualpunctuation import PunctuationModel\n",
179 | "import googleapiclient.discovery\n",
180 | "import googleapiclient.errors\n",
181 | "\n",
182 | "from google.colab import userdata\n",
183 | "import warnings"
184 | ],
185 | "metadata": {
186 | "id": "CCqYukC-T5EN"
187 | },
188 | "execution_count": null,
189 | "outputs": []
190 | },
191 | {
192 | "cell_type": "code",
193 | "source": [
194 | "try:\n",
195 | " nltk.data.find('tokenizers/punkt')\n",
196 | "except LookupError:\n",
197 | " nltk.download('punkt')\n",
198 | "\n",
199 | "logging.basicConfig(level=logging.INFO, force=True)\n",
200 | "warnings.filterwarnings('ignore')"
201 | ],
202 | "metadata": {
203 | "id": "vPjOXOkseYTt"
204 | },
205 | "execution_count": null,
206 | "outputs": []
207 | },
208 | {
209 | "cell_type": "code",
210 | "source": [
211 | "def clean_for_filename(title):\n",
212 | " # Define a regular expression to keep only alphanumeric characters, spaces, dots, hyphens, and various parentheses\n",
213 | " cleaned_title = re.sub(r'[^\\w\\s\\.\\-\\(\\)\\[\\]]', '', title)\n",
214 | "\n",
215 | " # Remove leading and trailing spaces\n",
216 | " return cleaned_title.strip()\n",
217 | "\n",
218 | "def remove_music_tags(text):\n",
219 | " # Remove [Music] or [music]\n",
220 | " updated_text = re.sub(r'\\[music\\]', '', text, flags=re.IGNORECASE)\n",
221 | " return updated_text\n",
222 | "\n",
223 | "def remove_period_after_hashes(text):\n",
224 | " # Remove . after # or ##, considering newline characters\n",
225 | " return re.sub(r'(#\\.|##\\.)', lambda match: match.group(1)[:-1], text)\n",
226 | "\n",
227 | "def remove_escape_sequences(text):\n",
228 | " # Some old videos contain escape sequences like \\n in their subtitle\n",
229 | " # Remove \\n, \\r\\n, \\t, \\b, \\r\n",
230 | " return re.sub(r'\\\\[nrtb]|\\\\r\\n', '', text)\n",
231 | "\n",
232 | "def remove_double_greater_than(text):\n",
233 | " # Replace occurrences of \">>\" with an empty string\n",
234 | " cleaned_text = re.sub(r'>>', '', text)\n",
235 | " return cleaned_text\n",
236 | "\n",
237 | "def add_punctuation(text, punctuation_model):\n",
238 | " if punctuation_model != \"\":\n",
239 | " model = PunctuationModel(model=punctuation_model)\n",
240 | " else:\n",
241 | " model = PunctuationModel()\n",
242 | " punctuated_text = model.restore_punctuation(text)\n",
243 | " return punctuated_text\n",
244 | "\n",
245 | "def capitalize_sentences(sentences):\n",
246 | " # Capitalize the first letter of each sentence in a batch\n",
247 | " capitalized_sentences = [sentence[0].upper() + sentence[1:] for sentence in sentences]\n",
248 | " return capitalized_sentences\n",
249 | "\n",
250 | "def parse_youtube_url(url):\n",
251 | " video_id_match = re.search(r'(?:youtube\\.com\\/.*?[?&]v=|youtu\\.be\\/)([^\"&?\\/\\s]{11})', url)\n",
252 | " if video_id_match:\n",
253 | " return video_id_match.group(1)\n",
254 | " else:\n",
255 | " raise ValueError('Invalid YouTube URL')\n",
256 | "\n",
257 | "def parse_chapters(description):\n",
258 | " lines = description.split(\"\\n\")\n",
259 | " regex = re.compile(r\"(\\d{0,2}:?\\d{1,2}:\\d{2})\")\n",
260 | " chapters = []\n",
261 | "\n",
262 | " for line in lines:\n",
263 | " matches = regex.findall(line)\n",
264 | " if matches:\n",
265 | " ts = matches[0]\n",
266 | " title = line.replace(ts, \"\").strip()\n",
267 | "\n",
268 | " # Check if the title contains another timestamp and remove it\n",
269 | " title = re.sub(r'\\d{0,2}:?\\d{1,2}:\\d{2}', '', title).strip().strip('-').strip().strip('-').strip()\n",
270 | "\n",
271 | " chapters.append({\n",
272 | " \"timestamp\": ts,\n",
273 | " \"title\": title,\n",
274 | " })\n",
275 | "\n",
276 | " return chapters\n",
277 | "\n",
278 | "def get_transcript(video_id, language, video_info, verbose=True):\n",
279 | " transcript_list = youtube_transcript_api.YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n",
280 | "\n",
281 | " if video_info[\"title\"] != \"\":\n",
282 | " transcript = f'# {video_info[\"title\"]}\\n\\n'\n",
283 | "\n",
284 | " current_chapter_index = 0\n",
285 | " chapters = video_info[\"chapters\"]\n",
286 | " logging.info(f\"Transcript_List Length: {len(transcript_list)}, Chapter Length: {len(chapters)}\")\n",
287 | "\n",
288 | " for i, line in enumerate(transcript_list):\n",
289 | " start_time = int(math.floor(line['start'])) # Floor and convert to integer\n",
290 | "\n",
291 | " # Check if current_chapter_index is within the valid range\n",
292 | " if 0 <= current_chapter_index < len(chapters):\n",
293 | " chapter_time = chapters[current_chapter_index]['timestamp']\n",
294 | "\n",
295 | " try:\n",
296 | " # Extract start time from the chapter timestamp\n",
297 | " chapter_start = chapter_time.strip()\n",
298 | " chapter_start_seconds = sum(int(x) * 60 ** i for i, x in enumerate(reversed(chapter_start.split(':'))))\n",
299 | " chapters[current_chapter_index][\"title\"] = chapters[current_chapter_index][\"title\"].strip()\n",
300 | " buffer_time = 2\n",
301 | "\n",
302 | " if start_time >= chapter_start_seconds - buffer_time:\n",
303 | " logging.info(f'\\n\\n## {chapters[current_chapter_index][\"title\"]}\\n')\n",
304 | " current_chapter_index += 1\n",
305 | " except Exception as e:\n",
306 | " logging.error(f\"Error processing chapter timestamp: {chapter_time}\")\n",
307 | " logging.error(f\"Error details: {e}\")\n",
308 | "\n",
309 | " line['text'] = remove_music_tags(line['text'])\n",
310 | " line['text'] = remove_escape_sequences(line['text'])\n",
311 | " line['text'] = remove_double_greater_than(line['text'])\n",
312 | " if line['text']:\n",
313 | " transcript += line['text'].strip() + ' '\n",
314 | "\n",
315 | " # Log progress information\n",
316 | " if verbose and i % 100 == 0: # Adjust the log frequency as needed\n",
317 | " logging.info(f\"Processed {i} lines out of {len(transcript_list)}\")\n",
318 | "\n",
319 | " return transcript\n",
320 | "\n",
321 | "def process_and_save_transcript(video_id, video_info, language, generate_punctuated, output_dir, filename, verbose, punctuation_model):\n",
322 | " try:\n",
323 | " raw_transcript = get_transcript(video_id, language, video_info, verbose)\n",
324 | " logging.info(\"Raw Transcript Length: %d\", len(raw_transcript))\n",
325 | "\n",
326 | " if generate_punctuated:\n",
327 | " with_punctuation = add_punctuation(raw_transcript, punctuation_model)\n",
328 | " with_punctuation = remove_period_after_hashes(with_punctuation)\n",
329 | " logging.info(\"Punctuation Char Length: %d\", len(with_punctuation))\n",
330 | " sentences = nltk.sent_tokenize(with_punctuation)\n",
331 | " logging.info(\"Sentences to process, (punctuated): %d\", len(sentences))\n",
332 | " else:\n",
333 | " sentences = nltk.sent_tokenize(raw_transcript)\n",
334 | " logging.info(\"Sentences to process, (raw): %d\", len(sentences))\n",
335 | "\n",
336 | " # Capitalize sentences without batching\n",
337 | " capitalized_sentences = capitalize_sentences(sentences)\n",
338 | "\n",
339 | " double_linesep = os.linesep + os.linesep\n",
340 | " capitalized_transcript = double_linesep.join(capitalized_sentences)\n",
"    output_path = os.path.join(output_dir, f'{filename}.md')\n",
342 | "\n",
343 | " with open(output_path, 'w', encoding='utf-8') as f:\n",
344 | " f.write(capitalized_transcript)\n",
345 | "\n",
346 | " if generate_punctuated:\n",
347 | " logging.info(f'Punctuated transcript saved to {output_path}')\n",
348 | " else:\n",
349 | " logging.info(f'Raw transcript saved to {output_path}')\n",
350 | "\n",
351 | " except Exception as e:\n",
352 | " logging.error(f'Error: {e}')\n",
353 | "\n",
354 | "def getVideoInfo (video_id):\n",
355 | " try:\n",
356 | " # Set up Google API credentials using API key\n",
357 | " api_key = userdata.get('YOUTUBE_API_KEY') # Replace with your actual API key\n",
358 | " youtube = googleapiclient.discovery.build(\"youtube\", \"v3\", developerKey=api_key)\n",
359 | " request = youtube.videos().list(part=\"id,snippet\",\n",
360 | " id = video_id\n",
361 | " )\n",
362 | " response = request.execute()\n",
363 | " title = response['items'][0]['snippet']['title']\n",
364 | " description = response['items'][0]['snippet']['description']\n",
365 | " data = {\"title\" : title, \"chapters\" : parse_chapters(description)}\n",
366 | " return data\n",
367 | " except Exception as e:\n",
368 | " logging.error(f'Error: {e}')\n",
369 | " return {\"title\": \"\", \"chapters\": []}"
370 | ],
371 | "metadata": {
372 | "id": "oasPyMVQoi7u"
373 | },
374 | "execution_count": null,
375 | "outputs": []
376 | },
377 | {
378 | "cell_type": "code",
379 | "source": [
380 | "video_id = parse_youtube_url(url)\n",
381 | "video_info = getVideoInfo(video_id)\n",
382 | "filename = filename or clean_for_filename(video_info[\"title\"]) or clean_for_filename(video_id)"
383 | ],
384 | "metadata": {
385 | "id": "c-M0h6sCmHK1"
386 | },
387 | "execution_count": null,
388 | "outputs": []
389 | },
390 | {
391 | "cell_type": "code",
392 | "source": [
393 | "process_and_save_transcript(video_id, video_info, language, punctuated, output_dir, filename, verbose, punctuation_model)"
394 | ],
395 | "metadata": {
396 | "id": "CJgLX_DhcPsS"
397 | },
398 | "execution_count": null,
399 | "outputs": []
400 | },
401 | {
402 | "cell_type": "code",
403 | "source": [
404 | "# Download the Generated File\n",
405 | "from google.colab import files\n",
406 | "files.download(os.path.join(output_dir, f'{filename}.md'))"
407 | ],
408 | "metadata": {
409 | "id": "w9xpxQPTmalR"
410 | },
411 | "execution_count": null,
412 | "outputs": []
413 | }
414 | ]
415 | }
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # YouTube Transcript Generator
2 |
3 | [](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb)
4 |  
5 | [](https://www.codefactor.io/repository/github/therohitdas/youtube-transcript-generator)
6 |
7 | ## Overview 🌐
8 |
9 | The YouTube Transcript Generator is a powerful tool designed to streamline the process of extracting and processing transcripts from YouTube videos. Whether you're looking to transcribe lectures, interviews, or any other video content, this project provides a convenient solution.
10 |
11 | ### How It Can Help 🚀
12 |
13 | This tool is particularly useful for:
14 |
15 | - **Note Taking:** Quickly convert YouTube videos into text format for easy note-taking.
16 | - **Content Analysis:** Analyze and derive insights from video content by converting it into text data.
17 | - **Chat Bot Training:** Use the generated transcripts to train chat bots, such as ChatGPT, for natural language understanding.
18 | - **Archiving:** Create a textual archive of valuable information from YouTube videos. This can be particularly useful for archiving interviews, tutorials, or any content you'd like to reference later without the need to re-watch the video.
19 | - **Personal Knowledge Base:** Build a personal knowledge base by extracting and processing transcripts from YouTube videos. This can aid in consolidating information on diverse topics in a readable and accessible format.
20 | - **Accessibility Improvement:** Enhance accessibility for individuals who prefer or require text-based content. The tool can be used to generate transcripts with added punctuation, improving the overall readability of the content.
21 |
22 | ## Features 🛠️
23 |
24 | - **Transcription:** Obtain raw transcripts from YouTube videos.
25 | - **Punctuation:** Enhance transcripts by adding punctuation using [deep multilingual punctuation models](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large).
26 | - **Chapter Detection:** Identify and separate chapters in the video based on provided timestamps.
27 | - **User-friendly:** Easy-to-use script with customizable parameters.
28 |
29 | ## Environment Variables 🌐
30 |
31 | - `YOUTUBE_API_KEY`: Set up your Google API key for video information retrieval. You will need to create a Project in the Google Cloud for this and enable the YouTube v3 API. This is optional, if you don't add it, the chapters will not be added.
32 |
33 | ## Script Parameters 📜
34 |
35 | When running the script locally, you can pass these parameters to the script:
36 |
37 | ### Positional Argument:
38 |
39 | - `url`: YouTube video URL
40 |
41 | ### Optional Arguments:
42 |
43 | - `-h, --help`: Show the help message and exit
44 | - `-l LANGUAGE, --language LANGUAGE`: Language for the transcript (default: en)
45 | - `-p, --punctuated`: Generate punctuated transcript (default: False)
46 | - `-a, --auto-open`: Automatically open the transcript in the default app (default: False)
47 | - `-o OUTPUT_DIR, --output_dir OUTPUT_DIR`: Output directory for saving the transcript (default: current directory)
48 | - `-f FILENAME, --filename FILENAME`: Filename for saving the transcript (default: Video Title or Video Id)
49 | - `-m PUNCTUATION_MODEL, --punctuation_model PUNCTUATION_MODEL`: Path to the punctuation model (default: None)
50 | - `-v, --verbose`: Print verbose output (default: False)
51 |
52 | ## Run in Google Colab 🚀
53 |
54 | To run this project in Google Colab, follow these steps:
55 |
56 | 1. Open the [Google Colab Notebook](https://colab.research.google.com/github/therohitdas/Youtube-Transcript-Generator/blob/main/main.ipynb).
57 | 2. Add Google's Project API key to the secrets tab under this key: `YOUTUBE_API_KEY` and toggle notebook access to on.
58 | 3. Go to Runtime > Change Runtime Type and select the T4 GPU type. If you use a CPU, generating the punctuated transcript will take several minutes (around 1 minute per 10-minute video).
59 | 4. Change the values in the second cell to include your URL etc.
60 | 5. Press CTRL+F9 or CMD+F9 to run the notebook.
61 |
62 | ## Run Locally 💻
63 |
64 | I do not recommend running locally as it will download tensors and other stuff which are over 6gb. But if you want you can do this:
65 |
66 | 1. Clone the repository: `git clone https://github.com/therohitdas/Youtube-Transcript-Generator.git && cd Youtube-Transcript-Generator`
67 | 2. Create a virtual environment: `python -m venv venv`
68 | 3. Activate the virtual environment: `source venv/bin/activate` (Linux/MacOS) or `venv\Scripts\activate` (Windows)
69 | 4. Install dependencies: `pip install -r requirements.txt`
70 | 5. Set up the required environment variables: `YOUTUBE_API_KEY` (optional). You can either create a `.env` file or set them directly in your system environment.
71 | 6. Run the script: `python index.py <video_url>` or `python index.py -h` for the help menu.
72 |
73 | ## Support 🤝
74 |
75 | For any issues or feature requests, please [create an issue](https://github.com/therohitdas/Youtube-Transcript-Generator/issues).
76 |
77 | ## Example 📋
78 |
79 | Here's an example of how to run the script with various options:
80 |
81 | ### Basic Usage
82 |
83 | ```bash
84 | python index.py https://www.youtube.com/watch?v=VIDEO_ID
85 | ```
86 |
87 | ### Specify the Language
88 |
89 | ```bash
90 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -l fr
91 | ```
92 |
93 | ### Generate a Raw Transcript
94 |
95 | ```bash
96 | python index.py https://www.youtube.com/watch?v=VIDEO_ID
97 | ```
98 |
99 | ### Generate a Punctuated Transcript
100 |
101 | ```bash
102 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -p
103 | ```
104 |
105 | ### Specify the Output Directory
106 |
107 | ```bash
108 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -o /path/to/output
109 | ```
110 |
111 | ### Specify a Custom Filename
112 |
113 | ```bash
114 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -f custom_filename
115 | ```
116 |
117 | ### Enable Verbose Mode
118 |
119 | ```bash
120 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -v
121 | ```
122 |
123 | ### Specify a Punctuation Model
124 |
125 | ```bash
126 | python index.py https://www.youtube.com/watch?v=VIDEO_ID -m author/model_name
127 | ```
128 |
129 | Punctuation model name can be taken from [here](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large#languages).
130 |
131 | Make sure to replace `https://www.youtube.com/watch?v=VIDEO_ID` with the actual URL of the YouTube video you want to process.
132 |
133 | Feel free to copy and paste these examples into your terminal.
134 |
135 | ## Acknowledgments 🙌
136 |
137 | This script utilizes the [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api) and [fullstop-punctuation-multilang-large](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large) libraries. Special thanks to their contributors.
138 |
139 | Feel free to adapt and use the script based on your requirements. Enjoy the convenience of YouTube transcript processing!
140 |
141 | ## Connect with me 📧
142 |
143 | The best way to connect is to email me [namaste@theRohitDas.com](mailto:namaste@therohitdas.com)
144 |
145 | - [x/therohitdas](https://x.com/therohitdas)
146 | - [GitHub/therohitdas](https://github.com/therohitdas)
147 |
148 | 🚀 Happy transcribing!
149 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cachetools==5.3.2
2 | certifi==2023.11.17
3 | charset-normalizer==3.3.2
4 | click==8.1.7
5 | deepmultilingualpunctuation==1.0.1
6 | filelock==3.13.1
7 | fsspec==2023.12.1
8 | google-api-core==2.15.0
9 | google-api-python-client==2.110.0
10 | google-auth==2.25.2
11 | google-auth-httplib2==0.1.1
12 | google-auth-oauthlib==1.1.0
13 | googleapis-common-protos==1.62.0
14 | httplib2==0.22.0
15 | huggingface-hub==0.19.4
16 | idna==3.6
17 | Jinja2==3.1.2
18 | joblib==1.3.2
19 | MarkupSafe==2.1.3
20 | mpmath==1.3.0
21 | networkx==3.2.1
22 | nltk==3.8.1
23 | numpy==1.26.2
24 | oauthlib==3.2.2
25 | packaging==23.2
26 | protobuf==4.25.1
27 | pyasn1==0.5.1
28 | pyasn1-modules==0.3.0
29 | pyparsing==3.1.1
30 | PyYAML==6.0.1
31 | regex==2023.10.3
32 | requests==2.31.0
33 | requests-oauthlib==1.3.1
34 | rsa==4.9
35 | safetensors==0.4.1
36 | sympy==1.12
37 | tokenizers==0.15.0
38 | torch==2.1.1
39 | tqdm==4.66.1
40 | transformers==4.36.0
41 | typing_extensions==4.9.0
42 | uritemplate==4.1.1
43 | urllib3==2.1.0
44 | youtube-transcript-api==0.6.1
45 |
--------------------------------------------------------------------------------