├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── __init__.py ├── poetry.lock ├── pyproject.toml ├── summary_app ├── Dockerfile ├── __init__.py ├── db.py ├── download.py ├── fly.toml ├── md_shorten.py ├── md_summarize.py ├── readme.md ├── requirements.txt ├── run.py ├── schema.py ├── segment.py └── transcribe.py └── transcribe_app ├── Dockerfile ├── __init__.py ├── download.py ├── exploration.ipynb ├── modal_app.py ├── requirements.txt └── transcribe.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .gitignore 131 | .ipynb 132 | test.py 133 | test.sh 134 | transcribe_app/.DS_Store 135 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "summary_app" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.analysis.typeCheckingMode": "basic", 8 | "python.linting.lintOnSave": true, 9 | "editor.codeActionsOnSave": { 10 | "source.organizeImports": true 11 | } 12 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jason Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YouTube Summarizer 2 | This FastAPI YouTube summary app relies on two key techniques to improve speed and efficiency: 3 | 4 | 1. The app takes advantage of YouTube video transcripts, which can be easily retrieved via the YouTube Transcript API. To tokenize the video text, the app uses regular expressions as a tokenizer, instead of more complex natural language processing tools like spaCy or NLTK. The text is then divided into batches of N_BATCH tokens and the summarization is performed in parallel, with the results combined at the end (MAP-REDUCE). 6 | 7 | 2. The summarized batches could be combined into a single batch before summarizing, but that would delay the first summary token, so each batch's summary is streamed out as soon as it is ready (see the sketch below).
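The sketch below illustrates the map-reduce idea. It is only a sketch: the names (`batch_tokens`, `map_reduce_summary`) and the `summarize` callable are hypothetical stand-ins, not the app's actual API.

```python
import asyncio
import re


def batch_tokens(text: str, n_batch: int = 9000):
    """Tokenize with a regex and group tokens into batches of roughly n_batch characters."""
    batch, size = [], 0
    for token in re.findall(r"\S+", text):
        batch.append(token)
        size += len(token) + 1  # +1 for the joining space
        if size >= n_batch:
            yield " ".join(batch)
            batch, size = [], 0
    if batch:
        yield " ".join(batch)


async def map_reduce_summary(text: str, summarize) -> str:
    # MAP: summarize each batch concurrently (summarize is an async callable);
    # REDUCE: join the partial summaries at the end.
    parts = await asyncio.gather(*(summarize(b) for b in batch_tokens(text)))
    return "\n\n".join(parts)
```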
8 | 9 | # Improving Whisper Transcriptions for Faster Summarization 10 | One possible future improvement to the app (which would let us support any audio or video) is to figure out how to stream whisper transcriptions, which would allow for faster summarization by summarizing the first n_tokens of the whisper transcriptions. This would result in a shorter wait time for the first summary token, bounded by the time to download the video and the time to produce the `n_tokens`. 11 | 12 | # Running the App 13 | To run the app, simply run the following commands: 14 | 15 | ``` 16 | pip install -r requirements.txt 17 | # If you use poetry, run the following instead: 18 | # poetry install 19 | ``` 20 | 21 | Then, navigate to the `summary_app` directory and run the following command: 22 | 23 | ``` 24 | uvicorn run:app --reload 25 | ``` 26 | 27 | # Calling the Streaming Endpoints 28 | You can call the streaming endpoints using the following curl commands. 29 | 30 | For regular streaming of summaries: 31 | 32 | ``` 33 | curl --no-buffer -X 'POST' \ 34 | 'http://127.0.0.1:8000/summarize_youtube'\ 35 | -H 'accept: application/json' \ 36 | -H 'Content-Type: application/json' \ 37 | -H 'Authorization: Bearer ' \ 38 | -d '{ 39 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY" 40 | }' 41 | ``` 42 | 43 | For streaming of transcripts: 44 | 45 | ``` 46 | curl --no-buffer -X 'POST' \ 47 | 'http://127.0.0.1:8000/stream_transcription'\ 48 | -H 'accept: application/json' \ 49 | -H 'Content-Type: application/json' \ 50 | -d '{ 51 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY", 52 | "model": "base" 53 | }' 54 | ``` 55 | 56 | Enabling SSE works on both summary and stream endpoints: 57 | 58 | ``` 59 | curl --no-buffer -X 'POST' \ 60 | 'http://127.0.0.1:8000/stream_transcription'\ 61 | -H 'accept: application/json' \ 62 | -H 'Content-Type: application/json' \ 63 | -d '{ 64 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY", 65 | "use_sse": true 66 | }' 67 | ``` 68 | 69 | # fly.io Deployment (avoid transcription since it's CPU-only) 70 | 71 | We also have a deployment on fly.io if anyone just wants to hit it. 72 | 73 | ``` 74 | curl --no-buffer -X 'POST' \ 75 | 'https://video-summary.fly.dev/summarize_youtube'\ 76 | -H 'accept: application/json' \ 77 | -H 'Content-Type: application/json' \ 78 | -H 'Authorization: Bearer ' \ 79 | -d '{ 80 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY" 81 | }' 82 | ``` 83 | 84 | # Modal Deployment 85 | 86 | ``` 87 | curl --no-buffer -X 'POST' \ 88 | 'https://jxnl--youtube-stream-transcription.modal.run'\ 89 | -H 'accept: application/json' \ 90 | -H 'Content-Type: application/json' \ 91 | -d '{ 92 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY", 93 | "model": "base", 94 | "use_sse": false 95 | }' 96 | 97 | curl --no-buffer -X 'POST' \ 98 | 'https://jxnl--youtube-stream-transcription-dev.modal.run'\ 99 | -H 'accept: application/json' \ 100 | -H 'Content-Type: application/json' \ 101 | -d '{ 102 | "url": "https://www.youtube.com/watch?v=FECyn_sGk4M", 103 | "model": "base", 104 | "use_sse": false 105 | }' 106 | ``` 107 | 108 | # Future Work 109 | 1. Support for whisper transcriptions for all audio and video. 110 | 2. Streaming of whisper transcriptions for all audio and video. 111 | 3. A way to incorporate timestamps into the summary (e.g. by including `[t=12s]` tokens?). In SSE, a possible implementation could be to return `{data: data, is_time: bool}`.
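# Calling the API from Python

If you would rather use Python than curl, a streaming response can be consumed like this. This is a sketch that assumes the server is running locally and the `requests` package is installed; replace the bearer token placeholder with your own OpenAI API key:

```python
import requests

resp = requests.post(
    "http://127.0.0.1:8000/summarize_youtube",
    json={"url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY"},
    headers={"Authorization": "Bearer <OPENAI_API_KEY>"},  # placeholder token
    stream=True,  # ask requests not to buffer the whole body
)
resp.raise_for_status()
for chunk in resp.iter_content(chunk_size=None):
    # print summary tokens as they arrive
    print(chunk.decode("utf-8"), end="", flush=True)
```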
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jxnl/youtubechapters-backend/c1590960453ab40615deba43cc1c16e978aa4f0e/__init__.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "video-summary-streamer" 3 | version = "0.1.0" 4 | description = "tools to stream tokens for summary and transcripts" 5 | authors = ["Jason Liu "] 6 | readme = "README.md" 7 | packages = [{include = "video_summary_streamer"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.9" 11 | loguru = "^0.6.0" 12 | fastapi = "^0.90.1" 13 | openai = "^0.26.5" 14 | youtube-transcript-api = "^0.5.0" 15 | sse-starlette = "^1.2.1" 16 | uvicorn = "^0.20.0" 17 | pytube = "^12.1.2" 18 | openai-whisper = "^20230124" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | jupyter = "^1.0.0" 23 | black = "^23.1.0" 24 | modal-client = "^0.45.1132" 25 | 26 | [build-system] 27 | requires = ["poetry-core"] 28 | build-backend = "poetry.core.masonry.api" 29 | -------------------------------------------------------------------------------- /summary_app/Dockerfile: -------------------------------------------------------------------------------- 1 | # https://hub.docker.com/_/python 2 | FROM python:3.10-slim-bullseye 3 | 4 | ENV PYTHONUNBUFFERED True 5 | ENV APP_HOME /app 6 | WORKDIR $APP_HOME 7 | COPY requirements.txt ./ 8 | RUN pip install -r requirements.txt 9 | 10 | 11 | COPY . ./ 12 | 13 | 14 | CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "8080"] 15 | -------------------------------------------------------------------------------- /summary_app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jxnl/youtubechapters-backend/c1590960453ab40615deba43cc1c16e978aa4f0e/summary_app/__init__.py -------------------------------------------------------------------------------- /summary_app/db.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sqlalchemy as sa 4 | 5 | url = os.environ["DB_URL"] 6 | engine = sa.create_engine(url=url, connect_args={"sslmode": "require"}) 7 | -------------------------------------------------------------------------------- /summary_app/download.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from pytube import YouTube 3 | 4 | 5 | def download_youtube_video(url): 6 | logger.info(f"Downloading {url}...") 7 | stream = YouTube(url).streams.filter(only_audio=True).first() 8 | 9 | if stream: 10 | logger.info("Downloading stream...") 11 | file_name = stream.download(output_path="./tmp") 12 | logger.info(f"Downloaded {url} to {file_name}") 13 | return file_name 14 | 15 | logger.info(f"Could not download {url}") 16 | return None 17 | 18 | 19 | if __name__ == "__main__": 20 | filename = download_youtube_video("https://www.youtube.com/watch?v=9bZkp7q19f0") 21 | print(filename) 22 | -------------------------------------------------------------------------------- /summary_app/fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml file generated for youtube-markdown on 2023-02-13T17:52:53-05:00 2 | 3 | app = 
"youtube-markdown" 4 | kill_signal = "SIGINT" 5 | kill_timeout = 5 6 | processes = [] 7 | 8 | [env] 9 | 10 | [experimental] 11 | auto_rollback = true 12 | 13 | [[services]] 14 | http_checks = [] 15 | internal_port = 8080 16 | processes = ["app"] 17 | protocol = "tcp" 18 | script_checks = [] 19 | [services.concurrency] 20 | hard_limit = 25 21 | soft_limit = 20 22 | type = "connections" 23 | 24 | [[services.ports]] 25 | force_https = true 26 | handlers = ["http"] 27 | port = 80 28 | 29 | [[services.ports]] 30 | handlers = ["tls", "http"] 31 | port = 443 32 | 33 | [[services.tcp_checks]] 34 | grace_period = "1s" 35 | interval = "15s" 36 | restart_limit = 0 37 | timeout = "2s" 38 | -------------------------------------------------------------------------------- /summary_app/md_shorten.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncGenerator, Optional 3 | 4 | import openai 5 | 6 | PROMPT = """ 7 | You are a professional note taker tasked with shortening and organizing a study guide. 8 | Your markdown file should be structured in a clear and concise manner that makes use of timestamps, when available, to help others study the transcript. 9 | 10 | To format your markdown file, follow this structure: 11 | 12 | # [HH:MM:SS](https://youtu.be/video_id?t=XXs) Descriptive Title 13 | 14 | Summary: ... 15 | 16 | Use bullet points to provide a informative description of key points and insights. 17 | 18 | # [HH:MM:SS](https://youtu.be/video_id?t=XXs) Descriptive Title 19 | 20 | Repeat the above structure as necessary, and use subheadings to organize your notes. 21 | 22 | Some tips to keep in mind: 23 | 24 | * Use only content from the given transcript, without adding any additional information. 25 | * Highlight any memorable phrases or quotes to aid recall or as evidence. 26 | * Use bullet points to describe important steps and insights, being as comprehensive as possible. 27 | * Avoid repeating yourself in either the content or the timestamp. 28 | 29 | Study Guide: 30 | 31 | {text} 32 | 33 | Shortened Study Guide: 34 | """ 35 | 36 | 37 | async def shorten_md( 38 | txt: str, 39 | openai_api_key: Optional[str], 40 | semaphore: Optional[asyncio.Semaphore] = None, 41 | ): 42 | if openai_api_key is not None: 43 | openai.api_key = openai_api_key 44 | 45 | async def call() -> AsyncGenerator[str, None]: 46 | response = await openai.ChatCompletion.acreate( 47 | model="gpt-3.5-turbo-16k", 48 | messages=[ 49 | { 50 | "role": "system", 51 | "content": "You are a professional note taker tasked with merging and shortening a study guide. 
Your markdown file should be structured in a clear and concise manner that makes use of timestamps, when available, to help others study.", 52 | }, 53 | {"role": "user", "content": PROMPT.format(text=txt)}, 54 | ], 55 | stream=True, 56 | max_tokens=2000, 57 | temperature=0, 58 | top_p=1, 59 | frequency_penalty=0, 60 | presence_penalty=0.6, 61 | ) 62 | 63 | async def gen(): 64 | async for chunk in response: # type: ignore 65 | yield chunk["choices"][0]["delta"].get("content", "") 66 | yield "\n" 67 | 68 | return gen() 69 | 70 | if semaphore is None: 71 | return await call() 72 | 73 | async with semaphore: 74 | return await call() 75 | -------------------------------------------------------------------------------- /summary_app/md_summarize.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncGenerator, Optional 3 | 4 | import openai 5 | 6 | PROMPT = """ 7 | Summarize the transcript in a clear and concise manner that makes use of timestamps, when available, to help others study the transcript. Chapters should be of meaningful length and not too short. Respond in the same language as the transcript if it is not English. 8 | 9 | To format your markdown file, follow this structure: 10 | 11 | # [HH:MM:SS](https://youtu.be/video_id?t=XXs) Descriptive Title 12 | 13 | 14 | 15 | - Use bullet points to provide a detailed description of key points and insights. Make sure it does not repeat the overview. 16 | 17 | ## [HH:MM:SS](https://youtu.be/video_id?t=XXs) title for sub topic 18 | 19 | - Use bullet points to provide a detailed description of key points and insights. 20 | 21 | Repeat the above structure as necessary, and use subheadings to organize your notes. 22 | 23 | Formatting Tips: 24 | * Do not make the chapters too short, ensure that each section has at least 3-5 bullet points 25 | * Use [] to denote timestamps and () to link to the corresponding part of the video. 26 | * Use subheadings and bullet points to organize your notes and make them easier to read and understand. When relevant, include timestamps to link to the corresponding part of the video. 27 | * Use bullet points to describe important steps and insights, being as comprehensive as possible. 28 | 29 | Summary Tips: 30 | * Do not mention anything if it's only playing music, and if nothing happens don't include it in the notes. 31 | * Use only content from the transcript. Do not add any additional information. 32 | * Make a new line after each # or ## and before each bullet point 33 | * Titles should be informative or even a question that the video answers 34 | * Titles should not be conclusions since you may only be getting a small part of the video 35 | 36 | Keep it short! 37 | """ 38 | 39 | 40 | async def summarize_transcript( 41 | txt: str, 42 | openai_api_key: Optional[str], 43 | video_id: Optional[str] = None, 44 | language: str = "en", 45 | semaphore: Optional[asyncio.Semaphore] = None, 46 | ): 47 | if openai_api_key is not None: 48 | openai.api_key = openai_api_key 49 | 50 | async def call() -> AsyncGenerator[str, None]: 51 | response = await openai.ChatCompletion.acreate( 52 | model="gpt-3.5-turbo-16k", 53 | messages=[ 54 | { 55 | "role": "system", 56 | "content": f"You are a professional note taker tasked with creating a comprehensive and informative markdown file from a given transcript. Your markdown file should be structured in a clear and concise manner that makes use of timestamps, when available, to help others study the transcript. 
Notes must be written in markdown format, in the language whose code is `{language}`.", 57 | }, 58 | { 59 | "role": "user", 60 | "content": f"I have added a feature that forces you to respond only in `locale={language}` and markdown format while creating the notes.", 61 | }, 62 | { 63 | "role": "assistant", 64 | "content": f"Understood, thank you. From now on I will only respond with `locale={language}`.", 65 | }, 66 | { 67 | "role": "user", 68 | "content": txt, 69 | }, 70 | {"role": "user", "content": PROMPT}, 71 | ], 72 | stream=True, 73 | max_tokens=500, 74 | temperature=0, 75 | top_p=1, 76 | frequency_penalty=0.6, 77 | presence_penalty=0.6, 78 | ) 79 | 80 | async def gen(): 81 | async for chunk in response: # type: ignore 82 | yield chunk["choices"][0]["delta"].get("content", "") 83 | yield "\n\n" 84 | 85 | return gen() 86 | 87 | if semaphore is None: 88 | return await call() 89 | 90 | async with semaphore: 91 | return await call() 92 | -------------------------------------------------------------------------------- /summary_app/readme.md: -------------------------------------------------------------------------------- 1 | # Explanation of YouTube Summarization 2 | 3 | # Description of the summary API 4 | 5 | [video-summary-streamer/run.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/run.py#L26-L35) 6 | 7 | ```python 8 | async def async_generator_summary_timestamps( 9 | url: str, use_whisper: bool = False, openai_api_key: Optional[str] = None 10 | ): 11 | video_id = extract_video_id(url) 12 | phrases = transcribe_youtube(video_id, use_whisper) 13 | phrases = group_speech_segments(phrases, max_length=300) 14 | phrases = summary_segments_to_md( 15 | phrases, openai_api_key=openai_api_key, video_id=video_id 16 | ) 17 | return phrases 18 | 19 | ``` 20 | 21 | This API takes a YouTube URL, a "whisper" flag, and an optional OpenAI API key. It extracts the video ID, transcribes speech, batches phrases with start and end timestamps, and formats them into prompts. Batching bypasses the token limit, but comes with a cost. 22 | 23 | ## Step 1) Extract `videoID` 24 | 25 | ```python 26 | def extract_video_id(url: str) -> str: 27 | match = re.search( 28 | r"^(?:https?:\/\/)?(?:www\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))((\w|-){11})(?:\S+)?$", 29 | url, 30 | ) 31 | if match: 32 | return match.group(1) 33 | else: 34 | raise ValueError("Invalid youtube url") 35 | ``` 36 | 37 | Most videos are referenced by a `videoID`, which can be extracted using the regex above. The regex supports both `youtube.com` and `youtu.be`, which often shows up as a shortened URL. It's nice to support both since I know you intend on using nice routing tricks. 38 | 39 | ## Step 2) Transcription (whisper optional) 40 | 41 | [video-summary-streamer/transcribe.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/transcribe.py) 42 | 43 | For most modern videos, the transcript API takes the video ID (the regex can be found in the `extract_video_id` function) and returns the whole transcript. It is also available as an npm package, [youtube-transcript](https://www.npmjs.com/package/youtube-transcript). It works as needed. I think supporting whisper should be left for later. If the transcript is missing, we can let the user know that whisper is a work-in-progress. 44 | 45 | **Aside: Costs** 46 | 47 | If you have the transcript, the cost of the summary will be roughly: 
48 | 49 | ``` 50 | # batch_size is set to ~9000 tokens 51 | # max result tokens is about ~1000 52 | transcript_tokens = len(transcript) // 4 53 | n_batches = transcript_tokens / batch_size 54 | prompt_tokens = 400 * n_batches 55 | result_tokens = max_tokens * n_batches # upper bound 56 | 57 | tokens = transcript_tokens + prompt_tokens + result_tokens 58 | price_max = tokens / 1000 * 0.02 # assuming ~$0.02 per 1K tokens 59 | 60 | ``` 61 | 62 | If you make the default transcripts an async generator (or stream), it'll leave room to support whisper as an async generator (or stream) later on. It's helpful to have that `from_whisper` boolean attribute so we can change behavior later on in the code. 63 | 64 | ## Step 3) Merging phrases into larger phrases 65 | 66 | [video-summary-streamer/segment.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/segment.py#L33-L66) 67 | 68 | The function `group_speech_segments` takes the resulting transcript and groups the segments into chunks of a maximum length (default: 300). It iterates over an AsyncGenerator of Segment objects (this generator is what lets us support whisper later) and merges adjacent segments that meet certain conditions. Specifically, if there is a pause of roughly 0.1 seconds or more between them, or if the length of the combined segment exceeds the maximum length, the accumulated text is yielded as a separate Segment object. 69 | 70 | ``` 71 | before: [ 72 | {text:"this is"}, {text:"a phrase"}, {text:"or"}, {text:"another sentence"}, ... 73 | ] 74 | 75 | after: [ 76 | {text: "this is a phrase or another sentence"}, ... 77 | ] 78 | 79 | ``` 80 | 81 | There are a lot of parameters, but generally each transcript segment is very short. We want to merge them to identify where phrases start and end. This information can help us get more precise timestamps. It's best to stick to batches of 300 characters, and ignore other logic. 82 | 83 | ## Step 4) Batch requesting the summary in parts 84 | 85 | [video-summary-streamer/segment.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/segment.py#L88-L120) 86 | 87 | The `summary_segments_to_md` function receives the resulting segments and produces a summary of the transcript using the OpenAI API. It collects the batches and formats the prompt like this: 88 | 89 | Each timestamped phrase is a "Segment" in this example. We accumulate them until we reach the batch size, then we make a request and start streaming out tokens. 90 | 91 | ``` 92 | <-- prompt instructions --> 93 | 94 | Content: 95 | 96 | timestamp: youtube.com/videoid?t=12s 97 | this is a phrase or another sentence, this is a phrase or another sentence 98 | this is a phrase or another sentence 99 | 100 | timestamp: youtube.com/videoid?t=13s 101 | this is a phrase or another sentence, this is a phrase or another sentence 102 | this is a phrase or another sentence 103 | 104 | <-- repeats until we hit the token limit --> 105 | 106 | Notes: 107 | 108 | ``` 109 | 110 | The function initializes an empty string, `text`, and iterates through the input segments. For each segment, it checks if it was produced using whisper and, if so, sets a smaller chunk size for the summary request so the first tokens stream out sooner. 111 | 112 | It checks if `text` is less than the chunk size (9000 by default). If it is, it adds the current segment to `text`. Otherwise, it makes a summary request using the OpenAI API with `text` as the input, and yields the resulting tokens one by one using the `summarize_transcript` function. It then starts a fresh `text` buffer seeded with the current segment, and increments a counter for the number of calls. Rate limiting can happen here to limit the maximum cost per video. Additionally, a 'Stop Generation' button can be added to save compute. The function continues this process until all segments have been processed. 113 | 114 | ## Step 5) Summarization 115 | 116 | Nothing special here, just the prompt: -------------------------------------------------------------------------------- /summary_app/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.90.1 ; python_version >= "3.9" and python_version < "4.0" 2 | loguru==0.6.0 ; python_version >= "3.9" and python_version < "4.0" 3 | multidict==6.0.4 ; python_version >= "3.9" and python_version < "4.0" 4 | openai==0.27.0 ; python_version >= "3.9" and python_version < "4.0" 5 | packaging==23.0 ; python_version >= "3.9" and python_version < "4.0" 6 | pydantic==1.10.4 ; python_version >= "3.9" and python_version < "4.0" 7 | sse-starlette==1.2.1 ; python_version >= "3.9" and python_version < "4.0" 8 | starlette==0.23.1 ; python_version >= "3.9" and python_version < "4.0" 9 | uvicorn==0.20.0 ; python_version >= "3.9" and python_version < "4.0" 10 | youtube-transcript-api==0.5.0 ; python_version >= "3.9" and python_version < "4.0" 11 | sqlalchemy==2.0.2 ; python_version >= "3.9" and python_version < "4.0" 12 | psycopg2-binary==2.9.5 ; python_version >= "3.9" and python_version < "4.0" -------------------------------------------------------------------------------- /summary_app/run.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | from fastapi import Header, Request 8 | from fastapi.responses import StreamingResponse 9 | from loguru import logger 10 | from schema import Summary 11 | from segment import group_speech_segments, shorten_summary_to_md, summary_segments_to_md 12 | from sqlalchemy.orm import Session 13 | from sse_starlette import EventSourceResponse 14 | from transcribe import transcribe_youtube 15 | 16 | 17 | def extract_video_id(url: str) -> str: 18 | match = re.search( 19 | r"^(?:https?:\/\/)?(?:www\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))((\w|-){11})(?:\S+)?$", 20 | url, 21 | ) 22 | if match: 23 | return match.group(1) 24 | else: 25 | raise ValueError("Invalid youtube url") 26 | 27 | 28 | def async_generator_summary_timestamps( 29 | url: str, 30 | use_whisper: bool = False, 31 | openai_api_key: Optional[str] = None, 32 | ): 33 | video_id = extract_video_id(url) 34 | phrases = transcribe_youtube(video_id, use_whisper) 35 | phrases = group_speech_segments(phrases, max_length=300) 36 | phrases = summary_segments_to_md( 37 | phrases, openai_api_key=openai_api_key, video_id=video_id 38 | ) 39 | return phrases 40 | 41 | 42 | def async_generator_summary_shorten(content: str, openai_api_key): 43 | phrases = shorten_summary_to_md(content, openai_api_key=openai_api_key) 44 | return phrases 45 | 46 | 47 | def open_ai_token_from_auth(auth): 48 | if auth is None or not auth.startswith("Bearer "): 49 | return None 50 | _, token = auth.split(" ") 51 | return token 52 | 53 | 54 | def stream( 55 | generator, 56 | use_sse: bool, 57 | request: Request, 58 | data_fn=lambda x: x, 59 | url: Optional[str] = None, 60 | ): 61 | # this is a helper function to stream data from a generator in a fastapi endpoint 62 | # It 
handles both SSE and regular streaming responses and disconnects 63 | async def stream_obj(): 64 | # this is a helper function to stream data from a generator 65 | 66 | # accumulate the summary markdown so we can save it to the db 67 | summary_markdown = "" 68 | try: 69 | async for obj in generator: 70 | if obj and not await request.is_disconnected(): 71 | data = data_fn(obj) 72 | 73 | # accumulate the summary markdown so we can save it to the db 74 | summary_markdown += data 75 | 76 | # yield the data 77 | yield {"data": json.dumps({"text": data})} if use_sse else str(data) 78 | 79 | # yield a done message 80 | if use_sse: 81 | yield {"data": "[DONE]"} 82 | except asyncio.CancelledError as e: 83 | logger.info("Streaming canceled.") 84 | raise e 85 | 86 | response = EventSourceResponse if use_sse else StreamingResponse 87 | return response( 88 | stream_obj(), # type: ignore 89 | media_type="text/plain", 90 | ) 91 | 92 | 93 | @dataclass 94 | class SummaryPayload: 95 | url: str 96 | use_sse: bool = False 97 | use_whisper: bool = False 98 | use_cache: bool = True 99 | 100 | 101 | @dataclass 102 | class ShortenPayload: 103 | content: str 104 | use_sse: bool = False 105 | 106 | 107 | async def youtube_summary_md( 108 | req: SummaryPayload, request: Request, authorization: str = Header(None) 109 | ): 110 | token = open_ai_token_from_auth(authorization) 111 | async_generator = async_generator_summary_timestamps( 112 | url=req.url, 113 | use_whisper=req.use_whisper, 114 | openai_api_key=token, 115 | ) 116 | return stream( 117 | async_generator, req.use_sse, request, data_fn=lambda x: x, url=req.url 118 | ) 119 | 120 | 121 | async def shorten_summary( 122 | req: ShortenPayload, request: Request, authorization: str = Header(None) 123 | ): 124 | token = open_ai_token_from_auth(authorization) 125 | async_generator = async_generator_summary_shorten(req.content, token) 126 | return stream(async_generator, req.use_sse, request, data_fn=lambda x: x) 127 | 128 | 129 | import fastapi 130 | 131 | app = fastapi.FastAPI() 132 | 133 | app.post("/youtube_markdown")(youtube_summary_md) 134 | app.post("/shorten_markdown")(shorten_summary) 135 | app.post("/check")(lambda: {"status": "ok"}) # minimal health-check endpoint (assumed behavior) 136 | -------------------------------------------------------------------------------- /summary_app/schema.py: -------------------------------------------------------------------------------- 1 | import sqlalchemy as sa 2 | from db import engine 3 | from loguru import logger 4 | from sqlalchemy.orm import Session, declarative_base, relationship 5 | 6 | Base = declarative_base() 7 | 8 | 9 | # Define a model for the url and summary markdown 10 | class Summary(Base): 11 | __tablename__ = "summary" 12 | id = sa.Column(sa.Integer, primary_key=True) 13 | url = sa.Column(sa.String, index=True) 14 | video_id = sa.Column(sa.String, index=True) 15 | created_at = sa.Column(sa.DateTime, default=sa.func.now()) 16 | summary_markdown = sa.Column(sa.String) 17 | 18 | 19 | if __name__ == "__main__": 20 | Base.metadata.create_all(engine) 21 | -------------------------------------------------------------------------------- /summary_app/segment.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from datetime import timedelta 3 | from typing import AsyncGenerator 4 | 5 | from loguru import logger 6 | from md_shorten import shorten_md 7 | from md_summarize import summarize_transcript 8 | 9 | # from md_summarize_claud import summarize_transcript 10 | 11 | 12 | @dataclass 13 | class Segment: 14 | 
start_time: float 15 | end_time: float 16 | transcript: str = field(repr=False) 17 | transcript_length: int = field(init=False, default=0) 18 | timestamp: str = field(init=False, repr=True) 19 | from_whisper: bool = field(default=False) 20 | language: str = field(default="en") 21 | 22 | def __post_init__(self): 23 | self.transcript_length = len(self.transcript) 24 | self.start_time = round(self.start_time) 25 | self.timestamp = str(timedelta(seconds=self.start_time)) 26 | 27 | def to_str(self, video_id): 28 | if len(self.transcript) > 0: 29 | return ( 30 | "language:{lang} timestamp:{ts} url:{url}\ntranscript:{transcript}".format( 31 | lang=self.language, 32 | ts=self.timestamp, 33 | s=self.start_time, 34 | url=f"https://youtu.be/{video_id}?t={self.start_time}s", 35 | transcript=self.transcript, 36 | ).strip() 37 | + "\n" 38 | ) 39 | else: 40 | return "" 41 | 42 | 43 | async def group_speech_segments( 44 | segments: AsyncGenerator[Segment, None], max_length=300 45 | ): 46 | current_segment = await segments.__anext__() 47 | current_transcript = current_segment.transcript 48 | current_start_time = current_segment.start_time 49 | from_whisper = current_segment.from_whisper 50 | 51 | async for segment in segments: 52 | previous_segment = current_segment 53 | current_segment = segment 54 | 55 | current_segment.transcript = current_segment.transcript.replace("[Music]", "") # assign the result; str.replace does not mutate in place 56 | 57 | is_pause = (current_segment.start_time - previous_segment.end_time) > 0.1 58 | is_long = current_segment.start_time - current_start_time > 1 59 | is_too_long = len(current_transcript) > max_length 60 | 61 | if (is_long and is_pause) or is_too_long: 62 | yield Segment( 63 | language=current_segment.language, 64 | start_time=current_start_time, 65 | end_time=previous_segment.end_time, 66 | transcript=current_transcript.strip(), 67 | from_whisper=from_whisper, 68 | ) 69 | current_transcript = current_segment.transcript 70 | current_start_time = current_segment.start_time 71 | else: 72 | current_transcript += " " + current_segment.transcript 73 | 74 | yield Segment( 75 | start_time=current_start_time, 76 | end_time=current_segment.end_time, 77 | transcript=current_transcript.strip(), 78 | from_whisper=from_whisper, language=current_segment.language, 79 | ) 80 | 81 | 82 | async def shorten_summary_to_md(content: str, openai_api_key: str): 83 | text = "" 84 | 85 | chapters = content.strip().split("# ") 86 | 87 | for chapter in chapters: 88 | if len(text) > 5000 * 4: 89 | logger.info(f"Shortening {len(text)} characters") 90 | async for token in await shorten_md(text, openai_api_key=openai_api_key): 91 | yield token 92 | text = f"\n\n# {chapter}" # seed the next batch with the chapter that triggered the flush 93 | else: 94 | text += f"\n\n# {chapter}" 95 | if text != "": 96 | logger.info(f"Shortening {len(text)} characters") 97 | async for token in await shorten_md(text, openai_api_key=openai_api_key): 98 | yield token 99 | 100 | 101 | async def summary_segments_to_md( 102 | segments, video_id=None, openai_api_key=None, chunk=7000 * 4 103 | ): 104 | text = "" 105 | n_calls = 0 106 | async for block in segments: 107 | if block.from_whisper: 108 | # instead of using the chunk size, this gets us 109 | # faster results by using a smaller chunk size 110 | chunk = [500, 1000, 3000, 4000][n_calls if n_calls < 4 else -1] 111 | logger.info(f"Setting chunk size to {chunk} for {video_id}") 112 | 113 | if len(text) < chunk: 114 | text += f"\n{block.to_str(video_id)}" 115 | else: 116 | n_calls += 1 117 | logger.info( 118 | f"Making summary request for {video_id}, request_size: {len(text)}, n_calls: {n_calls}" 119 | ) 120 | 121 | async for token in await summarize_transcript( 122
| text, 123 | video_id=video_id, 124 | openai_api_key=openai_api_key, 125 | language=block.language, 126 | ): 127 | yield token 128 | text = f"\n{block.to_str(video_id)}" # seed the next batch with the block that triggered the flush 129 | logger.info(f"Finished n={n_calls} summary request for {video_id}") 130 | if text is not None and text != "": 131 | n_calls += 1 132 | logger.info(f"Making summary request for {video_id}, n_calls: {n_calls}") 133 | async for token in await summarize_transcript( 134 | text, 135 | video_id=video_id, 136 | openai_api_key=openai_api_key, 137 | language=block.language, 138 | ): 139 | yield token 140 | logger.info(f"Finished summary request for {video_id} in {n_calls} calls") 141 | -------------------------------------------------------------------------------- /summary_app/transcribe.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import AsyncGenerator 4 | 5 | import requests 6 | from loguru import logger 7 | from segment import Segment 8 | from youtube_transcript_api import YouTubeTranscriptApi 9 | 10 | TRAN_URL = os.environ.get("TRAN_URL") 11 | 12 | 13 | def create_youtube_url(video_id): 14 | return f"https://www.youtube.com/watch?v={video_id}" 15 | 16 | 17 | async def transcribe_youtube( 18 | video_id: str, use_whisper=False, model: str = "base" 19 | ) -> AsyncGenerator[Segment, None]: 20 | if use_whisper: 21 | logger.info(f"Forcing whisper: calling out to remote gpu for {video_id}") 22 | async for block in _transcribe_youtube_whisper(video_id, model): 23 | yield block 24 | else: 25 | # this function will try to get the transcript from youtube 26 | try: 27 | transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) 28 | 29 | # Get either 'en' or the first generated transcript 30 | language_code = None 31 | for t in transcript_list: 32 | if t.is_generated: 33 | language_code = t.language_code 34 | break 35 | 36 | logger.info(f"Transcript {video_id} language code: {language_code}") 37 | 38 | transcript = YouTubeTranscriptApi.get_transcript( 39 | video_id, ("en", language_code) 40 | ) 41 | logger.info("Transcript found on youtube, no need to download video") 42 | for t in transcript: 43 | yield Segment( 44 | language=language_code or "en", 45 | start_time=t["start"], 46 | end_time=t["start"] + t["duration"], 47 | transcript=t["text"], 48 | ) 49 | except Exception as e: 50 | logger.info( 51 | f"Video has transcripts disabled, using whisper to transcribe {e}" 52 | ) 53 | logger.info(f"Fallback whisper: calling out to remote gpu for {video_id}") 54 | async for block in _transcribe_youtube_whisper(video_id, model): 55 | yield block 56 | 57 | 58 | async def _transcribe_youtube_whisper(video_id, model) -> AsyncGenerator[Segment, None]: 59 | # this function will try to get the transcript from whisper on a remote gpu 60 | url = TRAN_URL 61 | 62 | youtube = create_youtube_url(video_id) 63 | data = {"url": youtube, "use_sse": True, "model": model} 64 | 65 | r = requests.post(url, json=data, stream=True) 66 | r.raise_for_status() 67 | 68 | for chunk in r.iter_content(chunk_size=40000): 69 | try: 70 | data = chunk.decode("utf-8").split(":", 1)[1] 71 | if data.strip() == "[DONE]": 72 | logger.info("Done with transcription") 73 | else: 74 | data = json.loads(data) 75 | yield Segment( 76 | start_time=data["start"], 77 | end_time=data["end"], 78 | transcript=data["text"], 79 | from_whisper=True, 80 | ) 81 | except Exception as e: 82 | logger.info(f"Error decoding chunk {e}") 83 | pass 84 | -------------------------------------------------------------------------------- 
/transcribe_app/Dockerfile: -------------------------------------------------------------------------------- 1 | # https://hub.docker.com/_/python 2 | FROM python:3.10-slim-bullseye 3 | 4 | ENV PYTHONUNBUFFERED True 5 | ENV APP_HOME /app 6 | WORKDIR $APP_HOME 7 | COPY requirements.txt ./ 8 | RUN pip install -r requirements.txt 9 | 10 | 11 | COPY . ./ 12 | 13 | 14 | CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "8080"] 15 | -------------------------------------------------------------------------------- /transcribe_app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jxnl/youtubechapters-backend/c1590960453ab40615deba43cc1c16e978aa4f0e/transcribe_app/__init__.py -------------------------------------------------------------------------------- /transcribe_app/download.py: -------------------------------------------------------------------------------- 1 | from pytube import YouTube 2 | from loguru import logger 3 | 4 | 5 | def download_youtube_video(url): 6 | logger.info(f"Downloading {url}...") 7 | # download the audio-only stream to ./tmp; the file must outlive this 8 | # function so the transcriber can read it, so no temporary directory is used 9 | file_name = ( 10 | YouTube(url) 11 | .streams.filter(only_audio=True, file_extension="mp4") 12 | .first() 13 | .download(output_path="./tmp") 14 | ) 15 | logger.info(f"Downloaded {url} to {file_name}") 16 | return file_name 17 | -------------------------------------------------------------------------------- /transcribe_app/exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 59, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from youtube_transcript_api import YouTubeTranscriptApi\n", 10 | "from dataclasses import dataclass\n", 11 | "\n", 12 | "video_id = \"O_9JoimRj8w\"\n", 13 | "\n", 14 | "@dataclass\n", 15 | "class PhraseBlock:\n", 16 | " # this is only to be more self documenting\n", 17 | " # in case we want to support whisper\n", 18 | " start: float\n", 19 | " text: str\n", 20 | "\n", 21 | "def approx_sentences(text):\n", 22 | " import re\n", 23 | "\n", 24 | " sentences = re.split(r' *[\\.\\?!][\\'\"\\)\\]]* *', text)\n", 25 | " *rest, tail = sentences[:-1] \n", 26 | " # the last part may be incomplete so we keep it for the next batch\n", 27 | " return \" \".join(rest), tail\n", 28 | "\n", 29 | "\n", 30 | "def generate_batchs(transcript, char_chunk=5000):\n", 31 | " start_time = 0\n", 32 | " acc_tokens = \"\"\n", 33 | " for phrase in transcript:\n", 34 | " phrase = PhraseBlock(phrase[\"start\"], phrase[\"text\"])\n", 35 | " acc_tokens += \" \" + phrase.text.strip().replace(\"\\n\", \" \")\n", 36 | " if len(acc_tokens) > char_chunk:\n", 37 | " batch, tail = approx_sentences(acc_tokens)\n", 38 | " yield start_time, batch\n", 39 | " acc_tokens = tail\n", 40 | " start_time = phrase.start\n", 41 | " yield start_time, acc_tokens\n", 42 | "\n", 43 | "\n", 44 | "text = YouTubeTranscriptApi.get_transcript(video_id)\n", 45 | "sum([len(s[\"text\"]) for s in text])\n", 46 | "\n", 47 | "for tt, bb in generate_batchs(text):\n", 48 | " print(tt, bb)\n", 49 | " print()\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 60, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "7836" 61 | ] 62 | }, 63 | "execution_count": 60, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [] 69 | }, 70 | { 71 | "cell_type": 
"code", 72 | "execution_count": 61, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "0 -They love you We love you Welcome back Thanks for coming on the show, bud -Thank you for having me back I've missed you -Yeah, I know You did miss me [ Laughter ] Hey, I want to talk to you, because, obviously, you got the biggest show I want to get to all -- Everything All the shows, everything \"The Last of Us\" is a blockbuster Congrats But you are hosting \"Saturday Night Live -Whoo [ Cheers and applause ] -With Coldplay It's a big show That's a big ticket right there That's a hot ticket -Yeah, yeah -Are you excited Are you nervous What are you feeling Can I help in any way -Why do you ask [ Laughs ] Are you nervous for me -Yeah, yeah, I am I'm really nervous for you No way I'm so -- I think you should've hosted -- -Ages ago -Yeah -After my \"Law & Order\" episode -Yes, that's exactly -- [ Laughter, cheers ] That's exactly what I told -- I said, \"Lorne, this guy's got something, man Are you excited -I'm super excited I think it's the top of everybody's bucket list Um, I am scared [ Laughter ] -You are -There's, like, a wall of sheer terror, I think, that you start with, and then, it's such a well-oiled machine Everybody's so amazing and talented And they take these, you know, blocks of, you know, white-hot terror apart for you with big cue cards [ Laughter ] -Yeah I want to say -- oh, yes So, have you practiced blocking right now in sketches -Yeah, we just started some -- some blocking -- -Isn't it the most fun -It's a blast -I mean, it is a great crew We have the best crew here, too -Yes, yes [ Cheers and applause ] -But after us, \"Saturday Night Live -Second to this show -Second to \"The Tonight Show Yeah, yeah, yeah -\"SNL -But \"SNL,\" it's just amazing And it's the last live show on TV Wait, \"The Last of Us I see a connection already -Ah -Is it too late to write that in -I hear clicking -Yeah If things weren't big enough for you and your life and your career, \"The Last of Us,\" it is the biggest thing in the world But you also have \"The Mandalorian\" that premieres [ Cheers and applause ] season three, next month You are -- You are the Mandalorian I know there's also so many secrets, and I don't want to spoil anything -- -I'll tell you everything -No, I -- no I don't want that But can you tell fans anything about season three -I will say, there will be more Mandalorians [ Cheers ] -All right, that's good I'll take that -A lot of them Big, epic battle Plural Maybe -Ooh I love that Baby Yoda is not in this season [ Both laugh ] No, I'm just kidding -They couldn't figure his contract out for season three [ Laughter ] They're still -- They're still in negotiations -Yeah, he's just got a good agent How is -- -His head got bigger than his ears -Ah, nice -Ah -How is Grogu What's he gonna be doing this season -He's been very good He's still learning -Aww -Teaching -Yeah He is -Protected Protecting -Yeah -I think if I say any more -Yep, don't say it - I'll be replaced Like, not just as the Mandalorian, but as a human being in life [ Laughter ] -That's how powerful Disney is -Exactly And their technology is so sophisticated, they'll just replace -- -A I - a different Pedro Pascal, exactly -March 1st on Disney+, by the way That's when it premieres Congrats on that -Thank you -But let's talk about this But now, you've done that, and it was the biggest thing, and you go, \"Oh, my gosh And then there's -- Well, let's just go back 
\"Game of Thrones [ Cheers and applause ] Home run \"Narcos \"Narcos [ Cheers and applause ] Home run Home run \"Mandalorian [ Cheers and applause ] Home run And then \"The Last of Us What are you ta-- Come on [ Cheers and applause ] Who -- I'm mad -And that's the end -No It's just the beginning -I'll do \"SNL\" and then -- you're retired, buddy -It's just the beginning -You forgot \"Law & Order: SVU -Yeah, I did forget that I left that out But you're so talented But this is -- This is some kind of genius move that you've been doing I've never seen a career like this -I have nothing to do it -Yes, you do How do you pick shows that are gonna be giant hits -The door opens, and you're like -- -No way -Let me in -You're jumping in -I beg -No, you don't Like, how does \"Last of Us\" come about Do the creators -- -That was a really strange circumstance I was actually in London, and everyone else was in Los Angeles And I got sent these scripts and said -- I was told that \"Craig Mazin wants you to read these scripts, and if you like them, he'd like to talk to you I loved \"Chernobyl,\" the miniseries that he showran for HBO -It was fantastic -It was unbelievable Anyway, I read the first script, and I was like, \"Yeah, yeah, yeah I want to meet him, I want to meet him -Yeah -And we talked, we fell in love [ Laughter ] And he -- And they were like, \"Will you stay up a little bit later\n", 80 | "\n", 81 | "296.634 At this point, it was getting kind of late in London -- \"to talk to Neil Druckmann, the creator of the video game Genius. -Yeah. -Brilliant guy. \"And he'd like to talk to you, as well.\" Stay up for that Zoom. We all fall in love again. And then I -- and then, at that point, it's really late. I've got to get up in the morning. I take an Ambien to go to sleep just in case -- they've got my adrenaline kind of going and my -- my hopes up. -Yes, so you take a sleeping pill so you don't have to think about anything. -Exactly. So -- But I get a call, and I get told that I got the job after I took the Ambien. [ Laughter ] -Oh, no. -And so, I was excited, I guess, but I didn't remember. [ Laughter ] -So, you're like, \"Was that a dream?\" -I woke up in the morning, and the first thing that occurred to me, was like, \"Oh, man. I really want that job.\" [ Laughter ] \"And I'm in London. They're in L.A. I'm gonna wait by the phone all day long. This hasn't happened in a while, you know. I'm gonna be longing, and I'm gonna think about it all day long.\" -You had already got the call that you got the job. -Yeah, and it's like, \"Oh, congratulations. So happy for you,\" when I looked at my phone. And I was like, \"Oh, yeah! I got the job!\" [ Laughter and applause ] -\"My dream is real!\" It is based on the awesome, awesome, awesome video game, \"The Last of Us.\" For those who don't know the game or don't know the show, how do we set it up, and how do we set up who you play? -I play Joel Miller. [ Cheers ] [ Chuckles ] He has a bad day. [ Laughter ] Everybody -- the planet has a really bad day. -Yeah. -And a, uh...fungus, cordyceps, finds a way to cross over into human DNA and ends the planet in 24 hours. Very light, happy stuff. [ Laughter ] -Yeah, light. Totally light, happy stuff. -A very traumatic event shapes this man into who he is 20 years later, which is, you know, this dystopian world with fascistic governments and very contagious cordyceps that turns you into an infected monster. It's good times. -Yeah, it is good times. 
Every episode, by the way, has grown and almost like -- It already was a hit, the first episode. Then it's grown, second episode. Third -- It's the biggest thing. It's getting bigger and bigger and bigger. It's like -- -They know what they're doing. -Yeah, they know what they're doing. You know what you're doing, too, bud. You're fantastic. I want to show everyone a clip. Here's Pedro Pascal in this Sunday's episode of \"The Last of Us.\" Take a look at this. -When I say go, you crawl to that wall, and you squeeze through, and you don't come out until I say. Okay? [ Gunshot, glass shatters ] [ Gunshots ] And they're not gonna hit you. Look at me! They're not gonna hit you. [ Gunshots ] You stay down, you stay low, you stay quiet. -Mm-hmm. -Okay. -Okay. [ Gunshots ] -Go! [ Gunshots ] ♪♪ [ Gunshots ] ♪♪ [ Cheers and applause ] -Pedro Pascal, everybody! \"The Last of Us\" airs Sundays at 9:00 p.m. on HBO and HBO Max. And this weekend, catch him hosting \"Saturday Night Live\" with Coldplay! More \"Tonight Show\" after the break. Stick around, everybody. Come on back!\n", 82 | "\n" 83 | ] 84 | } 85 | ], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": ".venv", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.9.6" 113 | }, 114 | "orig_nbformat": 4, 115 | "vscode": { 116 | "interpreter": { 117 | "hash": "079eeea907cfb75f19865162410c8e23c667a2af20307392487751e80c765df8" 118 | } 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /transcribe_app/modal_app.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from dataclasses import dataclass 4 | 5 | import modal 6 | from download import download_youtube_video 7 | from fastapi import Request 8 | from fastapi.responses import StreamingResponse 9 | from loguru import logger 10 | from modal import web_endpoint 11 | from sse_starlette import EventSourceResponse 12 | from transcribe import transcribe 13 | 14 | 15 | def download_models(): 16 | import whisper 17 | 18 | whisper.load_model("tiny") 19 | whisper.load_model("base") 20 | whisper.load_model("small") 21 | whisper.load_model("medium") 22 | 23 | 24 | image = ( 25 | modal.Image.debian_slim() 26 | .apt_install("ffmpeg") 27 | .pip_install( 28 | [ 29 | "youtube-transcript-api", 30 | "openai", 31 | "fastapi", 32 | "sse-starlette", 33 | "openai-whisper", 34 | "loguru", 35 | "ffmpeg-python", 36 | "watchfiles", 37 | "pytube", 38 | ] 39 | ) 40 | .run_function(download_models) 41 | ) 42 | 43 | stub = modal.Stub("youtube", image=image) 44 | 45 | 46 | def stream(generator, use_sse: bool, request: Request, data_fn=lambda x: x): 47 | # this is a helper function to stream data from a generator in a fastapi endpoint 48 | # It handles both SSE and regular streaming responses and disconnects 49 | async def stream_obj(): 50 | try: 51 | async for obj in generator: 52 | if obj and not await request.is_disconnected(): 53 | data = data_fn(obj) 54 | yield {"data": str(data)} if 
use_sse else str(data) 55 | if use_sse: 56 | yield {"data": "[DONE]"} 57 | except asyncio.CancelledError as e: 58 | logger.info(f"Streaming canceled.") 59 | raise e 60 | 61 | response = EventSourceResponse if use_sse else StreamingResponse 62 | return response( 63 | stream_obj(), 64 | media_type="text/plain", 65 | ) 66 | 67 | 68 | @dataclass 69 | class TranscriptionPayload: 70 | url: str 71 | use_sse: bool = False 72 | model: str = "tiny" 73 | 74 | 75 | @stub.function(gpu="A100") 76 | @web_endpoint(method="POST") 77 | async def stream_transcription_v2(req: TranscriptionPayload, request: Request): 78 | import whisper 79 | 80 | model = whisper.load_model(req.model) 81 | path = download_youtube_video(req.url) 82 | generator = transcribe(model, path) 83 | return stream(generator, req.use_sse, request, data_fn=lambda x: x["text"]) 84 | 85 | 86 | @stub.function(gpu="A100") 87 | @web_endpoint(method="POST") 88 | async def stream_transcription_segment_v2(req: TranscriptionPayload, request: Request): 89 | import whisper 90 | 91 | model = whisper.load_model(req.model) 92 | path = download_youtube_video(req.url) 93 | generator = transcribe(model, path) 94 | return stream( 95 | generator, 96 | req.use_sse, 97 | request, 98 | data_fn=lambda x: json.dumps( 99 | dict( 100 | start=x["start"], 101 | text=x["text"], 102 | end=x["end"], 103 | ) 104 | ), 105 | ) 106 | -------------------------------------------------------------------------------- /transcribe_app/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.4.post0 ; python_version >= "3.9" and python_version < "4.0" 2 | anyio==3.6.2 ; python_version >= "3.9" and python_version < "4.0" 3 | async-timeout==3.0.1 ; python_version >= "3.9" and python_version < "4.0" 4 | attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0" 5 | certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4" 6 | chardet==4.0.0 ; python_version >= "3.9" and python_version < "4.0" 7 | charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4" 8 | click==8.1.3 ; python_version >= "3.9" and python_version < "4.0" 9 | colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "win32" or python_version >= "3.9" and python_version < "4.0" and platform_system == "Windows" 10 | fastapi==0.90.1 ; python_version >= "3.9" and python_version < "4.0" 11 | ffmpeg-python==0.2.0 ; python_version >= "3.9" and python_version < "4.0" 12 | filelock==3.9.0 ; python_version >= "3.9" and python_version < "4.0" 13 | future==0.18.3 ; python_version >= "3.9" and python_version < "4.0" 14 | h11==0.14.0 ; python_version >= "3.9" and python_version < "4.0" 15 | huggingface-hub==0.12.0 ; python_version >= "3.9" and python_version < "4.0" 16 | idna==3.4 ; python_version >= "3.9" and python_version < "4" 17 | loguru==0.6.0 ; python_version >= "3.9" and python_version < "4.0" 18 | more-itertools==9.0.0 ; python_version >= "3.9" and python_version < "4.0" 19 | multidict==6.0.4 ; python_version >= "3.9" and python_version < "4.0" 20 | numpy==1.24.2 ; python_version >= "3.9" and python_version < "4.0" 21 | nvidia-cublas-cu11==11.10.3.66 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 22 | nvidia-cuda-nvrtc-cu11==11.7.99 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 23 | nvidia-cuda-runtime-cu11==11.7.99 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 24 | 
nvidia-cudnn-cu11==8.5.0.96 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 25 | openai-whisper==20230124 ; python_version >= "3.9" and python_version < "4.0" 26 | openai==0.26.5 ; python_version >= "3.9" and python_version < "4.0" 27 | packaging==23.0 ; python_version >= "3.9" and python_version < "4.0" 28 | pydantic==1.10.4 ; python_version >= "3.9" and python_version < "4.0" 29 | pytube==12.1.2 ; python_version >= "3.9" and python_version < "4.0" 30 | pyyaml==6.0 ; python_version >= "3.9" and python_version < "4.0" 31 | regex==2022.10.31 ; python_version >= "3.9" and python_version < "4.0" 32 | requests==2.28.2 ; python_version >= "3.9" and python_version < "4" 33 | setuptools==67.2.0 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 34 | sniffio==1.3.0 ; python_version >= "3.9" and python_version < "4.0" 35 | sse-starlette==1.2.1 ; python_version >= "3.9" and python_version < "4.0" 36 | starlette==0.23.1 ; python_version >= "3.9" and python_version < "4.0" 37 | tokenizers==0.13.2 ; python_version >= "3.9" and python_version < "4.0" 38 | torch==1.13.1 ; python_version >= "3.9" and python_version < "4.0" 39 | tqdm==4.64.1 ; python_version >= "3.9" and python_version < "4.0" 40 | transformers==4.26.1 ; python_version >= "3.9" and python_version < "4.0" 41 | typing-extensions==4.4.0 ; python_version >= "3.9" and python_version < "4.0" 42 | urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4" 43 | uvicorn==0.20.0 ; python_version >= "3.9" and python_version < "4.0" 44 | wheel==0.38.4 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 45 | win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "win32" 46 | yarl==1.8.2 ; python_version >= "3.9" and python_version < "4.0" 47 | youtube-transcript-api==0.5.0 ; python_version >= "3.9" and python_version < "4.0" 48 | -------------------------------------------------------------------------------- /transcribe_app/transcribe.py: -------------------------------------------------------------------------------- 1 | import whisper 2 | from loguru import logger 3 | import warnings 4 | from typing import AsyncGenerator, Optional, Tuple, Union, TYPE_CHECKING 5 | 6 | import numpy as np 7 | import torch 8 | import tqdm 9 | 10 | from whisper.audio import ( 11 | SAMPLE_RATE, 12 | N_FRAMES, 13 | HOP_LENGTH, 14 | pad_or_trim, 15 | log_mel_spectrogram, 16 | ) 17 | from whisper.decoding import DecodingOptions, DecodingResult 18 | from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer 19 | from whisper.utils import ( 20 | exact_div, 21 | format_timestamp, 22 | make_safe, 23 | ) 24 | if TYPE_CHECKING: from whisper.model import Whisper  # gives the "Whisper" annotation below a real target for type checkers 25 | 26 | async def transcribe( 27 | model: "Whisper", 28 | audio: Union[str, np.ndarray, torch.Tensor], 29 | *, 30 | verbose: Optional[bool] = None, 31 | temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), 32 | compression_ratio_threshold: Optional[float] = 2.4, 33 | logprob_threshold: Optional[float] = -1.0, 34 | no_speech_threshold: Optional[float] = 0.6, 35 | condition_on_previous_text: bool = True, 36 | initial_prompt: Optional[str] = None, 37 | **decode_options, 38 | ): 39 | """ 40 | Transcribe an audio file using Whisper 41 | Parameters 42 | ---------- 43 | model: Whisper 44 | The Whisper model instance 45 | audio: Union[str, np.ndarray, torch.Tensor] 46 | The path to the audio file to open, or the audio waveform 47 | verbose: Optional[bool] 48 | Whether to display the text being decoded to
the console. If True, displays all the details; 49 | if False, displays minimal details; if None, displays nothing 50 | temperature: Union[float, Tuple[float, ...]] 51 | Temperature for sampling. It can be a tuple of temperatures, which will be successively used 52 | upon failures according to either `compression_ratio_threshold` or `logprob_threshold`. 53 | compression_ratio_threshold: float 54 | If the gzip compression ratio is above this value, treat as failed 55 | logprob_threshold: float 56 | If the average log probability over sampled tokens is below this value, treat as failed 57 | no_speech_threshold: float 58 | If the no_speech probability is higher than this value AND the average log probability 59 | over sampled tokens is below `logprob_threshold`, consider the segment as silent 60 | condition_on_previous_text: bool 61 | if True, the previous output of the model is provided as a prompt for the next window; 62 | disabling may make the text inconsistent across windows, but the model becomes less prone to 63 | getting stuck in a failure loop, such as repetition looping or timestamps going out of sync. 64 | decode_options: dict 65 | Keyword arguments to construct `DecodingOptions` instances 66 | Yields 67 | ------ 68 | Segment dictionaries as they are decoded, each containing the segment text ("text"), its timing 69 | ("start", "end"), and decoding statistics, so callers can stream partial transcripts. 70 | """ 71 | dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32 72 | if model.device == torch.device("cpu"): 73 | if torch.cuda.is_available(): 74 | warnings.warn("Performing inference on CPU when CUDA is available") 75 | if dtype == torch.float16: 76 | warnings.warn("FP16 is not supported on CPU; using FP32 instead") 77 | dtype = torch.float32 78 | 79 | if dtype == torch.float32: 80 | decode_options["fp16"] = False 81 | 82 | mel = log_mel_spectrogram(audio) 83 | 84 | if decode_options.get("language", None) is None: 85 | if not model.is_multilingual: 86 | decode_options["language"] = "en" 87 | else: 88 | if verbose: 89 | print( 90 | "Detecting language using up to the first 30 seconds. 
Use `--language` to specify the language" 91 | ) 92 | segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype) 93 | _, probs = model.detect_language(segment) 94 | decode_options["language"] = max(probs, key=probs.get) 95 | if verbose is not None: 96 | print( 97 | f"Detected language: {LANGUAGES[decode_options['language']].title()}" 98 | ) 99 | 100 | language = decode_options["language"] 101 | task = decode_options.get("task", "transcribe") 102 | tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task) 103 | 104 | def decode_with_fallback(segment: torch.Tensor) -> DecodingResult: 105 | temperatures = ( 106 | [temperature] if isinstance(temperature, (int, float)) else temperature 107 | ) 108 | decode_result = None 109 | 110 | for t in temperatures: 111 | kwargs = {**decode_options} 112 | if t > 0: 113 | # disable beam_size and patience when t > 0 114 | kwargs.pop("beam_size", None) 115 | kwargs.pop("patience", None) 116 | else: 117 | # disable best_of when t == 0 118 | kwargs.pop("best_of", None) 119 | 120 | options = DecodingOptions(**kwargs, temperature=t) 121 | decode_result = model.decode(segment, options) 122 | 123 | needs_fallback = False 124 | if ( 125 | compression_ratio_threshold is not None 126 | and decode_result.compression_ratio > compression_ratio_threshold 127 | ): 128 | needs_fallback = True # too repetitive 129 | if ( 130 | logprob_threshold is not None 131 | and decode_result.avg_logprob < logprob_threshold 132 | ): 133 | needs_fallback = True # average log probability is too low 134 | 135 | if not needs_fallback: 136 | break 137 | 138 | return decode_result 139 | 140 | seek = 0 141 | input_stride = exact_div( 142 | N_FRAMES, model.dims.n_audio_ctx 143 | ) # mel frames per output token: 2 144 | time_precision = ( 145 | input_stride * HOP_LENGTH / SAMPLE_RATE 146 | ) # time per output token: 0.02 (seconds) 147 | all_tokens = [] 148 | all_segments = [] 149 | prompt_reset_since = 0 150 | 151 | if initial_prompt is not None: 152 | initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip()) 153 | all_tokens.extend(initial_prompt_tokens) 154 | else: 155 | initial_prompt_tokens = [] 156 | 157 | def add_segment( 158 | *, start: float, end: float, text_tokens: torch.Tensor, result: DecodingResult 159 | ): 160 | text = tokenizer.decode( 161 | [token for token in text_tokens if token < tokenizer.eot] 162 | ) 163 | if len(text.strip()) == 0: # skip empty text output 164 | return 165 | 166 | all_segments.append( 167 | { 168 | "id": len(all_segments), 169 | "seek": seek, 170 | "start": start, 171 | "end": end, 172 | "text": text, 173 | "tokens": text_tokens.tolist(), 174 | "temperature": result.temperature, 175 | "avg_logprob": result.avg_logprob, 176 | "compression_ratio": result.compression_ratio, 177 | "no_speech_prob": result.no_speech_prob, 178 | } 179 | ) 180 | if verbose: 181 | print( 182 | make_safe( 183 | f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}" 184 | ) 185 | ) 186 | 187 | # show the progress bar when verbose is False (otherwise the transcribed text will be printed) 188 | num_frames = mel.shape[-1] 189 | previous_seek_value = seek 190 | 191 | with tqdm.tqdm( 192 | total=num_frames, unit="frames", disable=verbose is not False 193 | ) as pbar: 194 | while seek < num_frames: 195 | timestamp_offset = float(seek * HOP_LENGTH / SAMPLE_RATE) 196 | segment = pad_or_trim(mel[:, seek:], N_FRAMES).to(model.device).to(dtype) 197 | segment_duration = segment.shape[-1] * HOP_LENGTH / SAMPLE_RATE 198 | 199 | 
decode_options["prompt"] = all_tokens[prompt_reset_since:] 200 | result: DecodingResult = decode_with_fallback(segment) 201 | tokens = torch.tensor(result.tokens) 202 | 203 | if no_speech_threshold is not None: 204 | # no voice activity check 205 | should_skip = result.no_speech_prob > no_speech_threshold 206 | if ( 207 | logprob_threshold is not None 208 | and result.avg_logprob > logprob_threshold 209 | ): 210 | # don't skip if the logprob is high enough, despite the no_speech_prob 211 | should_skip = False 212 | 213 | if should_skip: 214 | seek += segment.shape[ 215 | -1 216 | ] # fast-forward to the next segment boundary 217 | continue 218 | 219 | timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin) 220 | consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[ 221 | 0 222 | ].add_(1) 223 | if ( 224 | len(consecutive) > 0 225 | ): # if the output contains two consecutive timestamp tokens 226 | last_slice = 0 227 | for current_slice in consecutive: 228 | sliced_tokens = tokens[last_slice:current_slice] 229 | start_timestamp_position = ( 230 | sliced_tokens[0].item() - tokenizer.timestamp_begin 231 | ) 232 | end_timestamp_position = ( 233 | sliced_tokens[-1].item() - tokenizer.timestamp_begin 234 | ) 235 | add_segment( 236 | start=timestamp_offset 237 | + start_timestamp_position * time_precision, 238 | end=timestamp_offset + end_timestamp_position * time_precision, 239 | text_tokens=sliced_tokens[1:-1], 240 | result=result, 241 | ) 242 | yield all_segments[-1] 243 | 244 | last_slice = current_slice 245 | last_timestamp_position = ( 246 | tokens[last_slice - 1].item() - tokenizer.timestamp_begin 247 | ) 248 | seek += last_timestamp_position * input_stride 249 | all_tokens.extend(tokens[: last_slice + 1].tolist()) 250 | else: 251 | duration = segment_duration 252 | timestamps = tokens[timestamp_tokens.nonzero().flatten()] 253 | if ( 254 | len(timestamps) > 0 255 | and timestamps[-1].item() != tokenizer.timestamp_begin 256 | ): 257 | # no consecutive timestamps but it has a timestamp; use the last one. 258 | # single timestamp at the end means no speech after the last timestamp. 259 | last_timestamp_position = ( 260 | timestamps[-1].item() - tokenizer.timestamp_begin 261 | ) 262 | duration = last_timestamp_position * time_precision 263 | 264 | add_segment( 265 | start=timestamp_offset, 266 | end=timestamp_offset + duration, 267 | text_tokens=tokens, 268 | result=result, 269 | ) 270 | yield all_segments[-1] 271 | 272 | seek += segment.shape[-1] 273 | all_tokens.extend(tokens.tolist()) 274 | 275 | if not condition_on_previous_text or result.temperature > 0.5: 276 | # do not feed the prompt tokens if a high temperature was used 277 | prompt_reset_since = len(all_tokens) 278 | 279 | # update progress bar 280 | pbar.update(min(num_frames, seek) - previous_seek_value) 281 | previous_seek_value = seek 282 | 283 | 284 | import time 285 | 286 | dlt = whisper.load_model("tiny") 287 | 288 | 289 | def whisper_generator(path, model="tiny") -> AsyncGenerator: 290 | # returns a async generator that yields the transcribed text 291 | if model == "tiny": 292 | model = dlt 293 | else: 294 | start = time.time() 295 | model = whisper.load_model(model) 296 | logger.info(f"Loaded model in {time.time() - start} seconds") 297 | async_generator = transcribe(model, path, verbose=True) 298 | return async_generator 299 | --------------------------------------------------------------------------------