├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── __init__.py ├── poetry.lock ├── pyproject.toml ├── summary_app ├── Dockerfile ├── __init__.py ├── db.py ├── download.py ├── fly.toml ├── md_shorten.py ├── md_summarize.py ├── readme.md ├── requirements.txt ├── run.py ├── schema.py ├── segment.py └── transcribe.py └── transcribe_app ├── Dockerfile ├── __init__.py ├── download.py ├── exploration.ipynb ├── modal_app.py ├── requirements.txt └── transcribe.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .gitignore 131 | .ipynb 132 | test.py 133 | test.sh 134 | transcribe_app/.DS_Store 135 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "summary_app" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.analysis.typeCheckingMode": "basic", 8 | "python.linting.lintOnSave": true, 9 | "editor.codeActionsOnSave": { 10 | "source.organizeImports": true 11 | } 12 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jason Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YouTube Summarizer 2 | This FastAPI YouTube summary app relies on two key techniques to improve speed and efficiency: 3 | 4 | 1. The app takes advantage of YouTube video transcripts, which can be easily retrieved via the YouTube Transcript API. To tokenize the video text, the app uses regular expressions as a tokenizer, instead of more complex natural language processing tools like spaCy or NLTK. The text is then divided into batches of N_BATCH tokens and the summarization is performed in parallel, with the results combined at the end (MAP-REDUCE). 6 | 7 | 2. The summarized batches could be combined into a single batch before summarizing, but that would delay the first summary token, so each batch's summary is streamed out as soon as it is ready (see the sketch below).
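The sketch below illustrates the map-reduce idea. It is only a sketch: the names (`batch_tokens`, `map_reduce_summary`) and the `summarize` callable are hypothetical stand-ins, not the app's actual API.

```python
import asyncio
import re


def batch_tokens(text: str, n_batch: int = 9000):
    """Tokenize with a regex and group tokens into batches of roughly n_batch characters."""
    batch, size = [], 0
    for token in re.findall(r"\S+", text):
        batch.append(token)
        size += len(token) + 1  # +1 for the joining space
        if size >= n_batch:
            yield " ".join(batch)
            batch, size = [], 0
    if batch:
        yield " ".join(batch)


async def map_reduce_summary(text: str, summarize) -> str:
    # MAP: summarize each batch concurrently (summarize is an async callable);
    # REDUCE: join the partial summaries at the end.
    parts = await asyncio.gather(*(summarize(b) for b in batch_tokens(text)))
    return "\n\n".join(parts)
```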
8 | 9 | # Improving Whisper Transcriptions for Faster Summarization 10 | One possible future improvement to the app (which would let us support any audio or video) is to figure out how to stream whisper transcriptions, which would allow for faster summarization by summarizing the first n_tokens of the whisper transcriptions. This would result in a shorter wait time for the first summary token, bounded by the time to download the video and the time to produce the `n_tokens`. 11 | 12 | # Running the App 13 | To run the app, simply run the following commands: 14 | 15 | ``` 16 | pip install -r requirements.txt 17 | # If you use poetry, run the following instead: 18 | # poetry install 19 | ``` 20 | 21 | Then, navigate to the `summary_app` directory and run the following command: 22 | 23 | ``` 24 | uvicorn run:app --reload 25 | ``` 26 | 27 | # Calling the Streaming Endpoints 28 | You can call the streaming endpoints using the following curl commands. 29 | 30 | For regular streaming of summaries: 31 | 32 | ``` 33 | curl --no-buffer -X 'POST' \ 34 | 'http://127.0.0.1:8000/summarize_youtube'\ 35 | -H 'accept: application/json' \ 36 | -H 'Content-Type: application/json' \ 37 | -H 'Authorization: Bearer ' \ 38 | -d '{ 39 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY" 40 | }' 41 | ``` 42 | 43 | For streaming of transcripts: 44 | 45 | ``` 46 | curl --no-buffer -X 'POST' \ 47 | 'http://127.0.0.1:8000/stream_transcription'\ 48 | -H 'accept: application/json' \ 49 | -H 'Content-Type: application/json' \ 50 | -d '{ 51 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY", 52 | "model": "base" 53 | }' 54 | ``` 55 | 56 | Enabling SSE works on both summary and stream endpoints: 57 | 58 | ``` 59 | curl --no-buffer -X 'POST' \ 60 | 'http://127.0.0.1:8000/stream_transcription'\ 61 | -H 'accept: application/json' \ 62 | -H 'Content-Type: application/json' \ 63 | -d '{ 64 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY", 65 | "use_sse": true 66 | }' 67 | ``` 68 | 69 | # fly.io Deployment (avoid transcription since it's CPU-only) 70 | 71 | We also have a deployment on fly.io if anyone just wants to hit it. 72 | 73 | ``` 74 | curl --no-buffer -X 'POST' \ 75 | 'https://video-summary.fly.dev/summarize_youtube'\ 76 | -H 'accept: application/json' \ 77 | -H 'Content-Type: application/json' \ 78 | -H 'Authorization: Bearer ' \ 79 | -d '{ 80 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY" 81 | }' 82 | ``` 83 | 84 | # Modal Deployment 85 | 86 | ``` 87 | curl --no-buffer -X 'POST' \ 88 | 'https://jxnl--youtube-stream-transcription.modal.run'\ 89 | -H 'accept: application/json' \ 90 | -H 'Content-Type: application/json' \ 91 | -d '{ 92 | "url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY", 93 | "model": "base", 94 | "use_sse": false 95 | }' 96 | 97 | curl --no-buffer -X 'POST' \ 98 | 'https://jxnl--youtube-stream-transcription-dev.modal.run'\ 99 | -H 'accept: application/json' \ 100 | -H 'Content-Type: application/json' \ 101 | -d '{ 102 | "url": "https://www.youtube.com/watch?v=FECyn_sGk4M", 103 | "model": "base", 104 | "use_sse": false 105 | }' 106 | ``` 107 | 108 | # Future Work 109 | 1. Support for whisper transcriptions for all audio and video. 110 | 2. Streaming of whisper transcriptions for all audio and video. 111 | 3. A way to incorporate timestamps into the summary (e.g. by including `[t=12s]` tokens?). In SSE, a possible implementation could be to return `{data: data, is_time: bool}`.
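# Calling the API from Python

If you would rather use Python than curl, a streaming response can be consumed like this. This is a sketch that assumes the server is running locally and the `requests` package is installed; replace the bearer token placeholder with your own OpenAI API key:

```python
import requests

resp = requests.post(
    "http://127.0.0.1:8000/summarize_youtube",
    json={"url": "https://www.youtube.com/watch?v=9Q9_CQxFUKY"},
    headers={"Authorization": "Bearer <OPENAI_API_KEY>"},  # placeholder token
    stream=True,  # ask requests not to buffer the whole body
)
resp.raise_for_status()
for chunk in resp.iter_content(chunk_size=None):
    # print summary tokens as they arrive
    print(chunk.decode("utf-8"), end="", flush=True)
```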
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jxnl/youtubechapters-backend/c1590960453ab40615deba43cc1c16e978aa4f0e/__init__.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "video-summary-streamer" 3 | version = "0.1.0" 4 | description = "tools to stream tokens for summary and transcripts" 5 | authors = ["Jason Liu "] 6 | readme = "README.md" 7 | packages = [{include = "video_summary_streamer"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.9" 11 | loguru = "^0.6.0" 12 | fastapi = "^0.90.1" 13 | openai = "^0.26.5" 14 | youtube-transcript-api = "^0.5.0" 15 | sse-starlette = "^1.2.1" 16 | uvicorn = "^0.20.0" 17 | pytube = "^12.1.2" 18 | openai-whisper = "^20230124" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | jupyter = "^1.0.0" 23 | black = "^23.1.0" 24 | modal-client = "^0.45.1132" 25 | 26 | [build-system] 27 | requires = ["poetry-core"] 28 | build-backend = "poetry.core.masonry.api" 29 | -------------------------------------------------------------------------------- /summary_app/Dockerfile: -------------------------------------------------------------------------------- 1 | # https://hub.docker.com/_/python 2 | FROM python:3.10-slim-bullseye 3 | 4 | ENV PYTHONUNBUFFERED True 5 | ENV APP_HOME /app 6 | WORKDIR $APP_HOME 7 | COPY requirements.txt ./ 8 | RUN pip install -r requirements.txt 9 | 10 | 11 | COPY . ./ 12 | 13 | 14 | CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "8080"] 15 | -------------------------------------------------------------------------------- /summary_app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jxnl/youtubechapters-backend/c1590960453ab40615deba43cc1c16e978aa4f0e/summary_app/__init__.py -------------------------------------------------------------------------------- /summary_app/db.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sqlalchemy as sa 4 | 5 | url = os.environ["DB_URL"] 6 | engine = sa.create_engine(url=url, connect_args={"sslmode": "require"}) 7 | -------------------------------------------------------------------------------- /summary_app/download.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from pytube import YouTube 3 | 4 | 5 | def download_youtube_video(url): 6 | logger.info(f"Downloading {url}...") 7 | stream = YouTube(url).streams.filter(only_audio=True).first() 8 | 9 | if stream: 10 | logger.info("Downloading stream...") 11 | file_name = stream.download(output_path="./tmp") 12 | logger.info(f"Downloaded {url} to {file_name}") 13 | return file_name 14 | 15 | logger.info(f"Could not download {url}") 16 | return None 17 | 18 | 19 | if __name__ == "__main__": 20 | filename = download_youtube_video("https://www.youtube.com/watch?v=9bZkp7q19f0") 21 | print(filename) 22 | -------------------------------------------------------------------------------- /summary_app/fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml file generated for youtube-markdown on 2023-02-13T17:52:53-05:00 2 | 3 | app = 
"youtube-markdown" 4 | kill_signal = "SIGINT" 5 | kill_timeout = 5 6 | processes = [] 7 | 8 | [env] 9 | 10 | [experimental] 11 | auto_rollback = true 12 | 13 | [[services]] 14 | http_checks = [] 15 | internal_port = 8080 16 | processes = ["app"] 17 | protocol = "tcp" 18 | script_checks = [] 19 | [services.concurrency] 20 | hard_limit = 25 21 | soft_limit = 20 22 | type = "connections" 23 | 24 | [[services.ports]] 25 | force_https = true 26 | handlers = ["http"] 27 | port = 80 28 | 29 | [[services.ports]] 30 | handlers = ["tls", "http"] 31 | port = 443 32 | 33 | [[services.tcp_checks]] 34 | grace_period = "1s" 35 | interval = "15s" 36 | restart_limit = 0 37 | timeout = "2s" 38 | -------------------------------------------------------------------------------- /summary_app/md_shorten.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncGenerator, Optional 3 | 4 | import openai 5 | 6 | PROMPT = """ 7 | You are a professional note taker tasked with shortening and organizing a study guide. 8 | Your markdown file should be structured in a clear and concise manner that makes use of timestamps, when available, to help others study the transcript. 9 | 10 | To format your markdown file, follow this structure: 11 | 12 | # [HH:MM:SS](https://youtu.be/video_id?t=XXs) Descriptive Title 13 | 14 | Summary: ... 15 | 16 | Use bullet points to provide a informative description of key points and insights. 17 | 18 | # [HH:MM:SS](https://youtu.be/video_id?t=XXs) Descriptive Title 19 | 20 | Repeat the above structure as necessary, and use subheadings to organize your notes. 21 | 22 | Some tips to keep in mind: 23 | 24 | * Use only content from the given transcript, without adding any additional information. 25 | * Highlight any memorable phrases or quotes to aid recall or as evidence. 26 | * Use bullet points to describe important steps and insights, being as comprehensive as possible. 27 | * Avoid repeating yourself in either the content or the timestamp. 28 | 29 | Study Guide: 30 | 31 | {text} 32 | 33 | Shortened Study Guide: 34 | """ 35 | 36 | 37 | async def shorten_md( 38 | txt: str, 39 | openai_api_key: Optional[str], 40 | semaphore: Optional[asyncio.Semaphore] = None, 41 | ): 42 | if openai_api_key is not None: 43 | openai.api_key = openai_api_key 44 | 45 | async def call() -> AsyncGenerator[str, None]: 46 | response = await openai.ChatCompletion.acreate( 47 | model="gpt-3.5-turbo-16k", 48 | messages=[ 49 | { 50 | "role": "system", 51 | "content": "You are a professional note taker tasked with merging and shortening a study guide. 
Your markdown file should be structured in a clear and concise manner that makes use of timestamps, when available, to help others study.", 52 | }, 53 | {"role": "user", "content": PROMPT.format(text=txt)}, 54 | ], 55 | stream=True, 56 | max_tokens=2000, 57 | temperature=0, 58 | top_p=1, 59 | frequency_penalty=0, 60 | presence_penalty=0.6, 61 | ) 62 | 63 | async def gen(): 64 | async for chunk in response: # type: ignore 65 | yield chunk["choices"][0]["delta"].get("content", "") 66 | yield "\n" 67 | 68 | return gen() 69 | 70 | if semaphore is None: 71 | return await call() 72 | 73 | async with semaphore: 74 | return await call() 75 | -------------------------------------------------------------------------------- /summary_app/md_summarize.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncGenerator, Optional 3 | 4 | import openai 5 | 6 | PROMPT = """ 7 | Summarize the transcript in a clear and concise manner that makes use of timestamps, when available, to help others study the transcript. Chapters should be of meaningful length and not too short. Respond in the same language as the transcript if it is not English. 8 | 9 | To format your markdown file, follow this structure: 10 | 11 | # [HH:MM:SS](https://youtu.be/video_id?t=XXs) Descriptive Title 12 | 13 | 14 | 15 | - Use bullet points to provide a detailed description of key points and insights. Make sure it does not repeat the overview. 16 | 17 | ## [HH:MM:SS](https://youtu.be/video_id?t=XXs) title for sub topic 18 | 19 | - Use bullet points to provide a detailed description of key points and insights. 20 | 21 | Repeat the above structure as necessary, and use subheadings to organize your notes. 22 | 23 | Formatting Tips: 24 | * Do not make the chapters too short, ensure that each section has at least 3-5 bullet points 25 | * Use [] to denote timestamps and () to link to the corresponding part of the video. 26 | * Use subheadings and bullet points to organize your notes and make them easier to read and understand. When relevant, include timestamps to link to the corresponding part of the video. 27 | * Use bullet points to describe important steps and insights, being as comprehensive as possible. 28 | 29 | Summary Tips: 30 | * Do not mention anything if it's only playing music, and if nothing happens don't include it in the notes. 31 | * Use only content from the transcript. Do not add any additional information. 32 | * Make a new line after each # or ## and before each bullet point 33 | * Titles should be informative or even a question that the video answers 34 | * Titles should not be conclusions since you may only be getting a small part of the video 35 | 36 | Keep it short! 37 | """ 38 | 39 | 40 | async def summarize_transcript( 41 | txt: str, 42 | openai_api_key: Optional[str], 43 | video_id: Optional[str] = None, 44 | language: str = "en", 45 | semaphore: Optional[asyncio.Semaphore] = None, 46 | ): 47 | if openai_api_key is not None: 48 | openai.api_key = openai_api_key 49 | 50 | async def call() -> AsyncGenerator[str, None]: 51 | response = await openai.ChatCompletion.acreate( 52 | model="gpt-3.5-turbo-16k", 53 | messages=[ 54 | { 55 | "role": "system", 56 | "content": f"You are a professional note taker tasked with creating a comprehensive and informative markdown file from a given transcript. Your markdown file should be structured in a clear and concise manner that makes use of timestamps, when available, to help others study the transcript. 
Notes must be written in markdown format, in the language whose code is `{language}`.", 57 | }, 58 | { 59 | "role": "user", 60 | "content": f"I have added a feature that forces you to respond only in `locale={language}` and markdown format while creating the notes.", 61 | }, 62 | { 63 | "role": "assistant", 64 | "content": f"Understood, thank you. From now on I will only respond with `locale={language}`.", 65 | }, 66 | { 67 | "role": "user", 68 | "content": txt, 69 | }, 70 | {"role": "user", "content": PROMPT}, 71 | ], 72 | stream=True, 73 | max_tokens=500, 74 | temperature=0, 75 | top_p=1, 76 | frequency_penalty=0.6, 77 | presence_penalty=0.6, 78 | ) 79 | 80 | async def gen(): 81 | async for chunk in response: # type: ignore 82 | yield chunk["choices"][0]["delta"].get("content", "") 83 | yield "\n\n" 84 | 85 | return gen() 86 | 87 | if semaphore is None: 88 | return await call() 89 | 90 | async with semaphore: 91 | return await call() 92 | -------------------------------------------------------------------------------- /summary_app/readme.md: -------------------------------------------------------------------------------- 1 | # Explanation of YouTube Summarization 2 | 3 | # Description of the summary API 4 | 5 | [video-summary-streamer/run.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/run.py#L26-L35) 6 | 7 | ```python 8 | async def async_generator_summary_timestamps( 9 | url: str, use_whisper: bool = False, openai_api_key: Optional[str] = None 10 | ): 11 | video_id = extract_video_id(url) 12 | phrases = transcribe_youtube(video_id, use_whisper) 13 | phrases = group_speech_segments(phrases, max_length=300) 14 | phrases = summary_segments_to_md( 15 | phrases, openai_api_key=openai_api_key, video_id=video_id 16 | ) 17 | return phrases 18 | 19 | ``` 20 | 21 | This API takes a YouTube URL, a "whisper" flag, and an optional OpenAI API key. It extracts the video ID, transcribes speech, batches phrases with start and end timestamps, and formats them into prompts. Batching bypasses the token limit, but comes with a cost. 22 | 23 | ## Step 1) Extract `videoID` 24 | 25 | ```python 26 | def extract_video_id(url: str) -> str: 27 | match = re.search( 28 | r"^(?:https?:\/\/)?(?:www\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))((\w|-){11})(?:\S+)?$", 29 | url, 30 | ) 31 | if match: 32 | return match.group(1) 33 | else: 34 | raise ValueError("Invalid youtube url") 35 | ``` 36 | 37 | Most videos are referenced by a `videoID`, which can be extracted using the regex above. The regex supports both `youtube.com` and `youtu.be`, which often shows up as a shortened URL. It's nice to support both since I know you intend on using nice routing tricks. 38 | 39 | ## Step 2) Transcription (whisper optional) 40 | 41 | [video-summary-streamer/transcribe.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/transcribe.py) 42 | 43 | For most modern videos, the transcript API takes the video ID (the regex can be found in the `extract_video_id` function) and returns the whole transcript. It is also available as an npm package, [youtube-transcript](https://www.npmjs.com/package/youtube-transcript). It works as needed. I think supporting whisper should be left for later. If the transcript is missing, we can let the user know that whisper is a work-in-progress. 44 | 45 | **Aside: Costs** 46 | 47 | If you have the transcript, the cost of the summary will be roughly: 
48 | 49 | ``` 50 | # batch_size is set to ~9000 tokens 51 | # max result tokens is about ~1000 52 | transcript_tokens = len(transcript) // 4 53 | n_batches = transcript_tokens / batch_size 54 | prompt_tokens = 400 * n_batches 55 | result_tokens = max_tokens * n_batches # upper bound 56 | 57 | tokens = transcript_tokens + prompt_tokens + result_tokens 58 | price_max = tokens / 1000 * 0.02 # assuming ~$0.02 per 1K tokens 59 | 60 | ``` 61 | 62 | If you make the default transcripts an async generator (or stream), it'll leave room to support whisper as an async generator (or stream) later on. It's helpful to have that `from_whisper` boolean attribute so we can change behavior later on in the code. 63 | 64 | ## Step 3) Merging phrases into larger phrases 65 | 66 | [video-summary-streamer/segment.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/segment.py#L33-L66) 67 | 68 | The function `group_speech_segments` takes the resulting transcript and groups the segments into chunks of a maximum length (default: 300). It iterates over an AsyncGenerator of Segment objects (this generator is what lets us support whisper later) and merges adjacent segments that meet certain conditions. Specifically, if there is a pause of roughly 0.1 seconds or more between them, or if the length of the combined segment exceeds the maximum length, the accumulated text is yielded as a separate Segment object. 69 | 70 | ``` 71 | before: [ 72 | {text:"this is"}, {text:"a phrase"}, {text:"or"}, {text:"another sentence"}, ... 73 | ] 74 | 75 | after: [ 76 | {text: "this is a phrase or another sentence"}, ... 77 | ] 78 | 79 | ``` 80 | 81 | There are a lot of parameters, but generally each transcript segment is very short. We want to merge them to identify where phrases start and end. This information can help us get more precise timestamps. It's best to stick to batches of 300 characters, and ignore other logic. 82 | 83 | ## Step 4) Batch requesting the summary in parts 84 | 85 | [video-summary-streamer/segment.py at main · jxnl/video-summary-streamer](https://github.com/jxnl/video-summary-streamer/blob/main/timestamps_app/segment.py#L88-L120) 86 | 87 | The `summary_segments_to_md` function receives the resulting segments and produces a summary of the transcript using the OpenAI API. It collects the batches and formats the prompt like this: 88 | 89 | Each timestamped phrase is a "Segment" in this example. We accumulate them until we reach the batch size, then we make a request and start streaming out tokens. 90 | 91 | ``` 92 | <-- prompt instructions --> 93 | 94 | Content: 95 | 96 | timestamp: youtube.com/videoid?t=12s 97 | this is a phrase or another sentence, this is a phrase or another sentence 98 | this is a phrase or another sentence 99 | 100 | timestamp: youtube.com/videoid?t=13s 101 | this is a phrase or another sentence, this is a phrase or another sentence 102 | this is a phrase or another sentence 103 | 104 | <-- repeats until we hit the token limit --> 105 | 106 | Notes: 107 | 108 | ``` 109 | 110 | The function initializes an empty string, `text`, and iterates through the input segments. For each segment, it checks if it was produced using whisper and, if so, sets a smaller chunk size for the summary request so the first tokens stream out sooner. 111 | 112 | It checks if `text` is less than the chunk size (9000 by default). If it is, it adds the current segment to `text`. Otherwise, it makes a summary request using the OpenAI API with `text` as the input, and yields the resulting tokens one by one using the `summarize_transcript` function. It then starts a fresh `text` buffer seeded with the current segment, and increments a counter for the number of calls. Rate limiting can happen here to limit the maximum cost per video. Additionally, a 'Stop Generation' button can be added to save compute. The function continues this process until all segments have been processed. 113 | 114 | ## Step 5) Summarization 115 | 116 | Nothing special here, just the prompt: -------------------------------------------------------------------------------- /summary_app/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.90.1 ; python_version >= "3.9" and python_version < "4.0" 2 | loguru==0.6.0 ; python_version >= "3.9" and python_version < "4.0" 3 | multidict==6.0.4 ; python_version >= "3.9" and python_version < "4.0" 4 | openai==0.27.0 ; python_version >= "3.9" and python_version < "4.0" 5 | packaging==23.0 ; python_version >= "3.9" and python_version < "4.0" 6 | pydantic==1.10.4 ; python_version >= "3.9" and python_version < "4.0" 7 | sse-starlette==1.2.1 ; python_version >= "3.9" and python_version < "4.0" 8 | starlette==0.23.1 ; python_version >= "3.9" and python_version < "4.0" 9 | uvicorn==0.20.0 ; python_version >= "3.9" and python_version < "4.0" 10 | youtube-transcript-api==0.5.0 ; python_version >= "3.9" and python_version < "4.0" 11 | sqlalchemy==2.0.2 ; python_version >= "3.9" and python_version < "4.0" 12 | psycopg2-binary==2.9.5 ; python_version >= "3.9" and python_version < "4.0" -------------------------------------------------------------------------------- /summary_app/run.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | from fastapi import Header, Request 8 | from fastapi.responses import StreamingResponse 9 | from loguru import logger 10 | from schema import Summary 11 | from segment import group_speech_segments, shorten_summary_to_md, summary_segments_to_md 12 | from sqlalchemy.orm import Session 13 | from sse_starlette import EventSourceResponse 14 | from transcribe import transcribe_youtube 15 | 16 | 17 | def extract_video_id(url: str) -> str: 18 | match = re.search( 19 | r"^(?:https?:\/\/)?(?:www\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))((\w|-){11})(?:\S+)?$", 20 | url, 21 | ) 22 | if match: 23 | return match.group(1) 24 | else: 25 | raise ValueError("Invalid youtube url") 26 | 27 | 28 | def async_generator_summary_timestamps( 29 | url: str, 30 | use_whisper: bool = False, 31 | openai_api_key: Optional[str] = None, 32 | ): 33 | video_id = extract_video_id(url) 34 | phrases = transcribe_youtube(video_id, use_whisper) 35 | phrases = group_speech_segments(phrases, max_length=300) 36 | phrases = summary_segments_to_md( 37 | phrases, openai_api_key=openai_api_key, video_id=video_id 38 | ) 39 | return phrases 40 | 41 | 42 | def async_generator_summary_shorten(content: str, openai_api_key): 43 | phrases = shorten_summary_to_md(content, openai_api_key=openai_api_key) 44 | return phrases 45 | 46 | 47 | def open_ai_token_from_auth(auth): 48 | if auth is None or not auth.startswith("Bearer "): 49 | return None 50 | _, token = auth.split(" ") 51 | return token 52 | 53 | 54 | def stream( 55 | generator, 56 | use_sse: bool, 57 | request: Request, 58 | data_fn=lambda x: x, 59 | url: Optional[str] = None, 60 | ): 61 | # this is a helper function to stream data from a generator in a fastapi endpoint 62 | # It 
handles both SSE and regular streaming responses and disconnects 63 | async def stream_obj(): 64 | # this is a helper function to stream data from a generator 65 | 66 | # accumulate the summary markdown so we can save it to the db 67 | summary_markdown = "" 68 | try: 69 | async for obj in generator: 70 | if obj and not await request.is_disconnected(): 71 | data = data_fn(obj) 72 | 73 | # accumulate the summary markdown so we can save it to the db 74 | summary_markdown += data 75 | 76 | # yield the data 77 | yield {"data": json.dumps({"text": data})} if use_sse else str(data) 78 | 79 | # yield a done message 80 | if use_sse: 81 | yield {"data": "[DONE]"} 82 | except asyncio.CancelledError as e: 83 | logger.info("Streaming canceled.") 84 | raise e 85 | 86 | response = EventSourceResponse if use_sse else StreamingResponse 87 | return response( 88 | stream_obj(), # type: ignore 89 | media_type="text/plain", 90 | ) 91 | 92 | 93 | @dataclass 94 | class SummaryPayload: 95 | url: str 96 | use_sse: bool = False 97 | use_whisper: bool = False 98 | use_cache: bool = True 99 | 100 | 101 | @dataclass 102 | class ShortenPayload: 103 | content: str 104 | use_sse: bool = False 105 | 106 | 107 | async def youtube_summary_md( 108 | req: SummaryPayload, request: Request, authorization: str = Header(None) 109 | ): 110 | token = open_ai_token_from_auth(authorization) 111 | async_generator = async_generator_summary_timestamps( 112 | url=req.url, 113 | use_whisper=req.use_whisper, 114 | openai_api_key=token, 115 | ) 116 | return stream( 117 | async_generator, req.use_sse, request, data_fn=lambda x: x, url=req.url 118 | ) 119 | 120 | 121 | async def shorten_summary( 122 | req: ShortenPayload, request: Request, authorization: str = Header(None) 123 | ): 124 | token = open_ai_token_from_auth(authorization) 125 | async_generator = async_generator_summary_shorten(req.content, token) 126 | return stream(async_generator, req.use_sse, request, data_fn=lambda x: x) 127 | 128 | 129 | import fastapi 130 | 131 | app = fastapi.FastAPI() 132 | 133 | app.post("/youtube_markdown")(youtube_summary_md) 134 | app.post("/shorten_markdown")(shorten_summary) 135 | app.post("/check")(lambda: {"status": "ok"}) # minimal health-check endpoint (assumed behavior) 136 | -------------------------------------------------------------------------------- /summary_app/schema.py: -------------------------------------------------------------------------------- 1 | import sqlalchemy as sa 2 | from db import engine 3 | from loguru import logger 4 | from sqlalchemy.orm import Session, declarative_base, relationship 5 | 6 | Base = declarative_base() 7 | 8 | 9 | # Define a model for the url and summary markdown 10 | class Summary(Base): 11 | __tablename__ = "summary" 12 | id = sa.Column(sa.Integer, primary_key=True) 13 | url = sa.Column(sa.String, index=True) 14 | video_id = sa.Column(sa.String, index=True) 15 | created_at = sa.Column(sa.DateTime, default=sa.func.now()) 16 | summary_markdown = sa.Column(sa.String) 17 | 18 | 19 | if __name__ == "__main__": 20 | Base.metadata.create_all(engine) 21 | -------------------------------------------------------------------------------- /summary_app/segment.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from datetime import timedelta 3 | from typing import AsyncGenerator 4 | 5 | from loguru import logger 6 | from md_shorten import shorten_md 7 | from md_summarize import summarize_transcript 8 | 9 | # from md_summarize_claud import summarize_transcript 10 | 11 | 12 | @dataclass 13 | class Segment: 14 | 
start_time: float 15 | end_time: float 16 | transcript: str = field(repr=False) 17 | transcript_length: int = field(init=False, default=0) 18 | timestamp: str = field(init=False, repr=True) 19 | from_whisper: bool = field(default=False) 20 | language: str = field(default="en") 21 | 22 | def __post_init__(self): 23 | self.transcript_length = len(self.transcript) 24 | self.start_time = round(self.start_time) 25 | self.timestamp = str(timedelta(seconds=self.start_time)) 26 | 27 | def to_str(self, video_id): 28 | if len(self.transcript) > 0: 29 | return ( 30 | "language:{lang} timestamp:{ts} url:{url}\ntranscript:{transcript}".format( 31 | lang=self.language, 32 | ts=self.timestamp, 33 | s=self.start_time, 34 | url=f"https://youtu.be/{video_id}?t={self.start_time}s", 35 | transcript=self.transcript, 36 | ).strip() 37 | + "\n" 38 | ) 39 | else: 40 | return "" 41 | 42 | 43 | async def group_speech_segments( 44 | segments: AsyncGenerator[Segment, None], max_length=300 45 | ): 46 | current_segment = await segments.__anext__() 47 | current_transcript = current_segment.transcript 48 | current_start_time = current_segment.start_time 49 | from_whisper = current_segment.from_whisper 50 | 51 | async for segment in segments: 52 | previous_segment = current_segment 53 | current_segment = segment 54 | 55 | current_segment.transcript = current_segment.transcript.replace("[Music]", "") # assign the result; str.replace does not mutate in place 56 | 57 | is_pause = (current_segment.start_time - previous_segment.end_time) > 0.1 58 | is_long = current_segment.start_time - current_start_time > 1 59 | is_too_long = len(current_transcript) > max_length 60 | 61 | if (is_long and is_pause) or is_too_long: 62 | yield Segment( 63 | language=current_segment.language, 64 | start_time=current_start_time, 65 | end_time=previous_segment.end_time, 66 | transcript=current_transcript.strip(), 67 | from_whisper=from_whisper, 68 | ) 69 | current_transcript = current_segment.transcript 70 | current_start_time = current_segment.start_time 71 | else: 72 | current_transcript += " " + current_segment.transcript 73 | 74 | yield Segment( 75 | start_time=current_start_time, 76 | end_time=current_segment.end_time, 77 | transcript=current_transcript.strip(), 78 | from_whisper=from_whisper, language=current_segment.language, 79 | ) 80 | 81 | 82 | async def shorten_summary_to_md(content: str, openai_api_key: str): 83 | text = "" 84 | 85 | chapters = content.strip().split("# ") 86 | 87 | for chapter in chapters: 88 | if len(text) > 5000 * 4: 89 | logger.info(f"Shortening {len(text)} characters") 90 | async for token in await shorten_md(text, openai_api_key=openai_api_key): 91 | yield token 92 | text = f"\n\n# {chapter}" # seed the next batch with the chapter that triggered the flush 93 | else: 94 | text += f"\n\n# {chapter}" 95 | if text != "": 96 | logger.info(f"Shortening {len(text)} characters") 97 | async for token in await shorten_md(text, openai_api_key=openai_api_key): 98 | yield token 99 | 100 | 101 | async def summary_segments_to_md( 102 | segments, video_id=None, openai_api_key=None, chunk=7000 * 4 103 | ): 104 | text = "" 105 | n_calls = 0 106 | async for block in segments: 107 | if block.from_whisper: 108 | # instead of using the chunk size, this gets us 109 | # faster results by using a smaller chunk size 110 | chunk = [500, 1000, 3000, 4000][n_calls if n_calls < 4 else -1] 111 | logger.info(f"Setting chunk size to {chunk} for {video_id}") 112 | 113 | if len(text) < chunk: 114 | text += f"\n{block.to_str(video_id)}" 115 | else: 116 | n_calls += 1 117 | logger.info( 118 | f"Making summary request for {video_id}, request_size: {len(text)}, n_calls: {n_calls}" 119 | ) 120 | 121 | async for token in await summarize_transcript( 122
| text, 123 | video_id=video_id, 124 | openai_api_key=openai_api_key, 125 | language=block.language, 126 | ): 127 | yield token 128 | text = f"\n{block.to_str(video_id)}" # seed the next batch with the block that triggered the flush 129 | logger.info(f"Finished n={n_calls} summary request for {video_id}") 130 | if text is not None and text != "": 131 | n_calls += 1 132 | logger.info(f"Making summary request for {video_id}, n_calls: {n_calls}") 133 | async for token in await summarize_transcript( 134 | text, 135 | video_id=video_id, 136 | openai_api_key=openai_api_key, 137 | language=block.language, 138 | ): 139 | yield token 140 | logger.info(f"Finished summary request for {video_id} in {n_calls} calls") 141 | -------------------------------------------------------------------------------- /summary_app/transcribe.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import AsyncGenerator 4 | 5 | import requests 6 | from loguru import logger 7 | from segment import Segment 8 | from youtube_transcript_api import YouTubeTranscriptApi 9 | 10 | TRAN_URL = os.environ.get("TRAN_URL") 11 | 12 | 13 | def create_youtube_url(video_id): 14 | return f"https://www.youtube.com/watch?v={video_id}" 15 | 16 | 17 | async def transcribe_youtube( 18 | video_id: str, use_whisper=False, model: str = "base" 19 | ) -> AsyncGenerator[Segment, None]: 20 | if use_whisper: 21 | logger.info(f"Forcing whisper: calling out to remote gpu for {video_id}") 22 | async for block in _transcribe_youtube_whisper(video_id, model): 23 | yield block 24 | else: 25 | # this function will try to get the transcript from youtube 26 | try: 27 | transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) 28 | 29 | # Get either 'en' or the first generated transcript 30 | language_code = None 31 | for t in transcript_list: 32 | if t.is_generated: 33 | language_code = t.language_code 34 | break 35 | 36 | logger.info(f"Transcript {video_id} language code: {language_code}") 37 | 38 | transcript = YouTubeTranscriptApi.get_transcript( 39 | video_id, ("en", language_code) 40 | ) 41 | logger.info("Transcript found on youtube, no need to download video") 42 | for t in transcript: 43 | yield Segment( 44 | language=language_code or "en", 45 | start_time=t["start"], 46 | end_time=t["start"] + t["duration"], 47 | transcript=t["text"], 48 | ) 49 | except Exception as e: 50 | logger.info( 51 | f"Video has transcripts disabled, using whisper to transcribe {e}" 52 | ) 53 | logger.info(f"Fallback whisper: calling out to remote gpu for {video_id}") 54 | async for block in _transcribe_youtube_whisper(video_id, model): 55 | yield block 56 | 57 | 58 | async def _transcribe_youtube_whisper(video_id, model) -> AsyncGenerator[Segment, None]: 59 | # this function will try to get the transcript from whisper on a remote gpu 60 | url = TRAN_URL 61 | 62 | youtube = create_youtube_url(video_id) 63 | data = {"url": youtube, "use_sse": True, "model": model} 64 | 65 | r = requests.post(url, json=data, stream=True) 66 | r.raise_for_status() 67 | 68 | for chunk in r.iter_content(chunk_size=40000): 69 | try: 70 | data = chunk.decode("utf-8").split(":", 1)[1] 71 | if data.strip() == "[DONE]": 72 | logger.info("Done with transcription") 73 | else: 74 | data = json.loads(data) 75 | yield Segment( 76 | start_time=data["start"], 77 | end_time=data["end"], 78 | transcript=data["text"], 79 | from_whisper=True, 80 | ) 81 | except Exception as e: 82 | logger.info(f"Error decoding chunk {e}") 83 | pass 84 | -------------------------------------------------------------------------------- 
/transcribe_app/Dockerfile: -------------------------------------------------------------------------------- 1 | # https://hub.docker.com/_/python 2 | FROM python:3.10-slim-bullseye 3 | 4 | ENV PYTHONUNBUFFERED True 5 | ENV APP_HOME /app 6 | WORKDIR $APP_HOME 7 | COPY requirements.txt ./ 8 | RUN pip install -r requirements.txt 9 | 10 | 11 | COPY . ./ 12 | 13 | 14 | CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "8080"] 15 | -------------------------------------------------------------------------------- /transcribe_app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jxnl/youtubechapters-backend/c1590960453ab40615deba43cc1c16e978aa4f0e/transcribe_app/__init__.py -------------------------------------------------------------------------------- /transcribe_app/download.py: -------------------------------------------------------------------------------- 1 | from pytube import YouTube 2 | from loguru import logger 3 | 4 | 5 | def download_youtube_video(url): 6 | logger.info(f"Downloading {url}...") 7 | # download the audio-only stream to ./tmp; the file must outlive this 8 | # function so the transcriber can read it, so no temporary directory is used 9 | file_name = ( 10 | YouTube(url) 11 | .streams.filter(only_audio=True, file_extension="mp4") 12 | .first() 13 | .download(output_path="./tmp") 14 | ) 15 | logger.info(f"Downloaded {url} to {file_name}") 16 | return file_name 17 | -------------------------------------------------------------------------------- /transcribe_app/exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 59, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from youtube_transcript_api import YouTubeTranscriptApi\n", 10 | "from dataclasses import dataclass\n", 11 | "\n", 12 | "video_id = \"O_9JoimRj8w\"\n", 13 | "\n", 14 | "@dataclass\n", 15 | "class PhraseBlock:\n", 16 | " # this is only to be more self documenting\n", 17 | " # in case we want to support whisper\n", 18 | " start: float\n", 19 | " text: str\n", 20 | "\n", 21 | "def approx_sentences(text):\n", 22 | " import re\n", 23 | "\n", 24 | " sentences = re.split(r' *[\\.\\?!][\\'\"\\)\\]]* *', text)\n", 25 | " *rest, tail = sentences[:-1] \n", 26 | " # the last part may be incomplete so we keep it for the next batch\n", 27 | " return \" \".join(rest), tail\n", 28 | "\n", 29 | "\n", 30 | "def generate_batchs(transcript, char_chunk=5000):\n", 31 | " start_time = 0\n", 32 | " acc_tokens = \"\"\n", 33 | " for phrase in transcript:\n", 34 | " phrase = PhraseBlock(phrase[\"start\"], phrase[\"text\"])\n", 35 | " acc_tokens += \" \" + phrase.text.strip().replace(\"\\n\", \" \")\n", 36 | " if len(acc_tokens) > char_chunk:\n", 37 | " batch, tail = approx_sentences(acc_tokens)\n", 38 | " yield start_time, batch\n", 39 | " acc_tokens = tail\n", 40 | " start_time = phrase.start\n", 41 | " yield start_time, acc_tokens\n", 42 | "\n", 43 | "\n", 44 | "text = YouTubeTranscriptApi.get_transcript(video_id)\n", 45 | "sum([len(s[\"text\"]) for s in text])\n", 46 | "\n", 47 | "for tt, bb in generate_batchs(text):\n", 48 | " print(tt, bb)\n", 49 | " print()\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 60, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "7836" 61 | ] 62 | }, 63 | "execution_count": 60, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [] 69 | }, 70 | { 71 | "cell_type": 
"code", 72 | "execution_count": 61, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "0 -They love you We love you Welcome back Thanks for coming on the show, bud -Thank you for having me back I've missed you -Yeah, I know You did miss me [ Laughter ] Hey, I want to talk to you, because, obviously, you got the biggest show I want to get to all -- Everything All the shows, everything \"The Last of Us\" is a blockbuster Congrats But you are hosting \"Saturday Night Live -Whoo [ Cheers and applause ] -With Coldplay It's a big show That's a big ticket right there That's a hot ticket -Yeah, yeah -Are you excited Are you nervous What are you feeling Can I help in any way -Why do you ask [ Laughs ] Are you nervous for me -Yeah, yeah, I am I'm really nervous for you No way I'm so -- I think you should've hosted -- -Ages ago -Yeah -After my \"Law & Order\" episode -Yes, that's exactly -- [ Laughter, cheers ] That's exactly what I told -- I said, \"Lorne, this guy's got something, man Are you excited -I'm super excited I think it's the top of everybody's bucket list Um, I am scared [ Laughter ] -You are -There's, like, a wall of sheer terror, I think, that you start with, and then, it's such a well-oiled machine Everybody's so amazing and talented And they take these, you know, blocks of, you know, white-hot terror apart for you with big cue cards [ Laughter ] -Yeah I want to say -- oh, yes So, have you practiced blocking right now in sketches -Yeah, we just started some -- some blocking -- -Isn't it the most fun -It's a blast -I mean, it is a great crew We have the best crew here, too -Yes, yes [ Cheers and applause ] -But after us, \"Saturday Night Live -Second to this show -Second to \"The Tonight Show Yeah, yeah, yeah -\"SNL -But \"SNL,\" it's just amazing And it's the last live show on TV Wait, \"The Last of Us I see a connection already -Ah -Is it too late to write that in -I hear clicking -Yeah If things weren't big enough for you and your life and your career, \"The Last of Us,\" it is the biggest thing in the world But you also have \"The Mandalorian\" that premieres [ Cheers and applause ] season three, next month You are -- You are the Mandalorian I know there's also so many secrets, and I don't want to spoil anything -- -I'll tell you everything -No, I -- no I don't want that But can you tell fans anything about season three -I will say, there will be more Mandalorians [ Cheers ] -All right, that's good I'll take that -A lot of them Big, epic battle Plural Maybe -Ooh I love that Baby Yoda is not in this season [ Both laugh ] No, I'm just kidding -They couldn't figure his contract out for season three [ Laughter ] They're still -- They're still in negotiations -Yeah, he's just got a good agent How is -- -His head got bigger than his ears -Ah, nice -Ah -How is Grogu What's he gonna be doing this season -He's been very good He's still learning -Aww -Teaching -Yeah He is -Protected Protecting -Yeah -I think if I say any more -Yep, don't say it - I'll be replaced Like, not just as the Mandalorian, but as a human being in life [ Laughter ] -That's how powerful Disney is -Exactly And their technology is so sophisticated, they'll just replace -- -A I - a different Pedro Pascal, exactly -March 1st on Disney+, by the way That's when it premieres Congrats on that -Thank you -But let's talk about this But now, you've done that, and it was the biggest thing, and you go, \"Oh, my gosh And then there's -- Well, let's just go back 
\"Game of Thrones [ Cheers and applause ] Home run \"Narcos \"Narcos [ Cheers and applause ] Home run Home run \"Mandalorian [ Cheers and applause ] Home run And then \"The Last of Us What are you ta-- Come on [ Cheers and applause ] Who -- I'm mad -And that's the end -No It's just the beginning -I'll do \"SNL\" and then -- you're retired, buddy -It's just the beginning -You forgot \"Law & Order: SVU -Yeah, I did forget that I left that out But you're so talented But this is -- This is some kind of genius move that you've been doing I've never seen a career like this -I have nothing to do it -Yes, you do How do you pick shows that are gonna be giant hits -The door opens, and you're like -- -No way -Let me in -You're jumping in -I beg -No, you don't Like, how does \"Last of Us\" come about Do the creators -- -That was a really strange circumstance I was actually in London, and everyone else was in Los Angeles And I got sent these scripts and said -- I was told that \"Craig Mazin wants you to read these scripts, and if you like them, he'd like to talk to you I loved \"Chernobyl,\" the miniseries that he showran for HBO -It was fantastic -It was unbelievable Anyway, I read the first script, and I was like, \"Yeah, yeah, yeah I want to meet him, I want to meet him -Yeah -And we talked, we fell in love [ Laughter ] And he -- And they were like, \"Will you stay up a little bit later\n", 80 | "\n", 81 | "296.634 At this point, it was getting kind of late in London -- \"to talk to Neil Druckmann, the creator of the video game Genius. -Yeah. -Brilliant guy. \"And he'd like to talk to you, as well.\" Stay up for that Zoom. We all fall in love again. And then I -- and then, at that point, it's really late. I've got to get up in the morning. I take an Ambien to go to sleep just in case -- they've got my adrenaline kind of going and my -- my hopes up. -Yes, so you take a sleeping pill so you don't have to think about anything. -Exactly. So -- But I get a call, and I get told that I got the job after I took the Ambien. [ Laughter ] -Oh, no. -And so, I was excited, I guess, but I didn't remember. [ Laughter ] -So, you're like, \"Was that a dream?\" -I woke up in the morning, and the first thing that occurred to me, was like, \"Oh, man. I really want that job.\" [ Laughter ] \"And I'm in London. They're in L.A. I'm gonna wait by the phone all day long. This hasn't happened in a while, you know. I'm gonna be longing, and I'm gonna think about it all day long.\" -You had already got the call that you got the job. -Yeah, and it's like, \"Oh, congratulations. So happy for you,\" when I looked at my phone. And I was like, \"Oh, yeah! I got the job!\" [ Laughter and applause ] -\"My dream is real!\" It is based on the awesome, awesome, awesome video game, \"The Last of Us.\" For those who don't know the game or don't know the show, how do we set it up, and how do we set up who you play? -I play Joel Miller. [ Cheers ] [ Chuckles ] He has a bad day. [ Laughter ] Everybody -- the planet has a really bad day. -Yeah. -And a, uh...fungus, cordyceps, finds a way to cross over into human DNA and ends the planet in 24 hours. Very light, happy stuff. [ Laughter ] -Yeah, light. Totally light, happy stuff. -A very traumatic event shapes this man into who he is 20 years later, which is, you know, this dystopian world with fascistic governments and very contagious cordyceps that turns you into an infected monster. It's good times. -Yeah, it is good times. 
Every episode, by the way, has grown and almost like -- It already was a hit, the first episode. Then it's grown, second episode. Third -- It's the biggest thing. It's getting bigger and bigger and bigger. It's like -- -They know what they're doing. -Yeah, they know what they're doing. You know what you're doing, too, bud. You're fantastic. I want to show everyone a clip. Here's Pedro Pascal in this Sunday's episode of \"The Last of Us.\" Take a look at this. -When I say go, you crawl to that wall, and you squeeze through, and you don't come out until I say. Okay? [ Gunshot, glass shatters ] [ Gunshots ] And they're not gonna hit you. Look at me! They're not gonna hit you. [ Gunshots ] You stay down, you stay low, you stay quiet. -Mm-hmm. -Okay. -Okay. [ Gunshots ] -Go! [ Gunshots ] ♪♪ [ Gunshots ] ♪♪ [ Cheers and applause ] -Pedro Pascal, everybody! \"The Last of Us\" airs Sundays at 9:00 p.m. on HBO and HBO Max. And this weekend, catch him hosting \"Saturday Night Live\" with Coldplay! More \"Tonight Show\" after the break. Stick around, everybody. Come on back!\n", 82 | "\n" 83 | ] 84 | } 85 | ], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": ".venv", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.9.6" 113 | }, 114 | "orig_nbformat": 4, 115 | "vscode": { 116 | "interpreter": { 117 | "hash": "079eeea907cfb75f19865162410c8e23c667a2af20307392487751e80c765df8" 118 | } 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /transcribe_app/modal_app.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from dataclasses import dataclass 4 | 5 | import modal 6 | from download import download_youtube_video 7 | from fastapi import Request 8 | from fastapi.responses import StreamingResponse 9 | from loguru import logger 10 | from modal import web_endpoint 11 | from sse_starlette import EventSourceResponse 12 | from transcribe import transcribe 13 | 14 | 15 | def download_models(): 16 | import whisper 17 | 18 | whisper.load_model("tiny") 19 | whisper.load_model("base") 20 | whisper.load_model("small") 21 | whisper.load_model("medium") 22 | 23 | 24 | image = ( 25 | modal.Image.debian_slim() 26 | .apt_install("ffmpeg") 27 | .pip_install( 28 | [ 29 | "youtube-transcript-api", 30 | "openai", 31 | "fastapi", 32 | "sse-starlette", 33 | "openai-whisper", 34 | "loguru", 35 | "ffmpeg-python", 36 | "watchfiles", 37 | "pytube", 38 | ] 39 | ) 40 | .run_function(download_models) 41 | ) 42 | 43 | stub = modal.Stub("youtube", image=image) 44 | 45 | 46 | def stream(generator, use_sse: bool, request: Request, data_fn=lambda x: x): 47 | # this is a helper function to stream data from a generator in a fastapi endpoint 48 | # It handles both SSE and regular streaming responses and disconnects 49 | async def stream_obj(): 50 | try: 51 | async for obj in generator: 52 | if obj and not await request.is_disconnected(): 53 | data = data_fn(obj) 54 | yield {"data": str(data)} if 
use_sse else str(data) 55 | if use_sse: 56 | yield {"data": "[DONE]"} 57 | except asyncio.CancelledError as e: 58 | logger.info(f"Streaming canceled.") 59 | raise e 60 | 61 | response = EventSourceResponse if use_sse else StreamingResponse 62 | return response( 63 | stream_obj(), 64 | media_type="text/plain", 65 | ) 66 | 67 | 68 | @dataclass 69 | class TranscriptionPayload: 70 | url: str 71 | use_sse: bool = False 72 | model: str = "tiny" 73 | 74 | 75 | @stub.function(gpu="A100") 76 | @web_endpoint(method="POST") 77 | async def stream_transcription_v2(req: TranscriptionPayload, request: Request): 78 | import whisper 79 | 80 | model = whisper.load_model(req.model) 81 | path = download_youtube_video(req.url) 82 | generator = transcribe(model, path) 83 | return stream(generator, req.use_sse, request, data_fn=lambda x: x["text"]) 84 | 85 | 86 | @stub.function(gpu="A100") 87 | @web_endpoint(method="POST") 88 | async def stream_transcription_segment_v2(req: TranscriptionPayload, request: Request): 89 | import whisper 90 | 91 | model = whisper.load_model(req.model) 92 | path = download_youtube_video(req.url) 93 | generator = transcribe(model, path) 94 | return stream( 95 | generator, 96 | req.use_sse, 97 | request, 98 | data_fn=lambda x: json.dumps( 99 | dict( 100 | start=x["start"], 101 | text=x["text"], 102 | end=x["end"], 103 | ) 104 | ), 105 | ) 106 | -------------------------------------------------------------------------------- /transcribe_app/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.4.post0 ; python_version >= "3.9" and python_version < "4.0" 2 | anyio==3.6.2 ; python_version >= "3.9" and python_version < "4.0" 3 | async-timeout==3.0.1 ; python_version >= "3.9" and python_version < "4.0" 4 | attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0" 5 | certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4" 6 | chardet==4.0.0 ; python_version >= "3.9" and python_version < "4.0" 7 | charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4" 8 | click==8.1.3 ; python_version >= "3.9" and python_version < "4.0" 9 | colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "win32" or python_version >= "3.9" and python_version < "4.0" and platform_system == "Windows" 10 | fastapi==0.90.1 ; python_version >= "3.9" and python_version < "4.0" 11 | ffmpeg-python==0.2.0 ; python_version >= "3.9" and python_version < "4.0" 12 | filelock==3.9.0 ; python_version >= "3.9" and python_version < "4.0" 13 | future==0.18.3 ; python_version >= "3.9" and python_version < "4.0" 14 | h11==0.14.0 ; python_version >= "3.9" and python_version < "4.0" 15 | huggingface-hub==0.12.0 ; python_version >= "3.9" and python_version < "4.0" 16 | idna==3.4 ; python_version >= "3.9" and python_version < "4" 17 | loguru==0.6.0 ; python_version >= "3.9" and python_version < "4.0" 18 | more-itertools==9.0.0 ; python_version >= "3.9" and python_version < "4.0" 19 | multidict==6.0.4 ; python_version >= "3.9" and python_version < "4.0" 20 | numpy==1.24.2 ; python_version >= "3.9" and python_version < "4.0" 21 | nvidia-cublas-cu11==11.10.3.66 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 22 | nvidia-cuda-nvrtc-cu11==11.7.99 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 23 | nvidia-cuda-runtime-cu11==11.7.99 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 24 | 
nvidia-cudnn-cu11==8.5.0.96 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 25 | openai-whisper==20230124 ; python_version >= "3.9" and python_version < "4.0" 26 | openai==0.26.5 ; python_version >= "3.9" and python_version < "4.0" 27 | packaging==23.0 ; python_version >= "3.9" and python_version < "4.0" 28 | pydantic==1.10.4 ; python_version >= "3.9" and python_version < "4.0" 29 | pytube==12.1.2 ; python_version >= "3.9" and python_version < "4.0" 30 | pyyaml==6.0 ; python_version >= "3.9" and python_version < "4.0" 31 | regex==2022.10.31 ; python_version >= "3.9" and python_version < "4.0" 32 | requests==2.28.2 ; python_version >= "3.9" and python_version < "4" 33 | setuptools==67.2.0 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 34 | sniffio==1.3.0 ; python_version >= "3.9" and python_version < "4.0" 35 | sse-starlette==1.2.1 ; python_version >= "3.9" and python_version < "4.0" 36 | starlette==0.23.1 ; python_version >= "3.9" and python_version < "4.0" 37 | tokenizers==0.13.2 ; python_version >= "3.9" and python_version < "4.0" 38 | torch==1.13.1 ; python_version >= "3.9" and python_version < "4.0" 39 | tqdm==4.64.1 ; python_version >= "3.9" and python_version < "4.0" 40 | transformers==4.26.1 ; python_version >= "3.9" and python_version < "4.0" 41 | typing-extensions==4.4.0 ; python_version >= "3.9" and python_version < "4.0" 42 | urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4" 43 | uvicorn==0.20.0 ; python_version >= "3.9" and python_version < "4.0" 44 | wheel==0.38.4 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Linux" 45 | win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "win32" 46 | yarl==1.8.2 ; python_version >= "3.9" and python_version < "4.0" 47 | youtube-transcript-api==0.5.0 ; python_version >= "3.9" and python_version < "4.0" 48 | -------------------------------------------------------------------------------- /transcribe_app/transcribe.py: -------------------------------------------------------------------------------- 1 | import whisper 2 | from loguru import logger 3 | import warnings 4 | from typing import AsyncGenerator, Optional, Tuple, Union, TYPE_CHECKING 5 | 6 | import numpy as np 7 | import torch 8 | import tqdm 9 | 10 | from whisper.audio import ( 11 | SAMPLE_RATE, 12 | N_FRAMES, 13 | HOP_LENGTH, 14 | pad_or_trim, 15 | log_mel_spectrogram, 16 | ) 17 | from whisper.decoding import DecodingOptions, DecodingResult 18 | from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer 19 | from whisper.utils import ( 20 | exact_div, 21 | format_timestamp, 22 | make_safe, 23 | ) 24 | if TYPE_CHECKING: from whisper.model import Whisper  # gives the "Whisper" annotation below a real target for type checkers 25 | 26 | async def transcribe( 27 | model: "Whisper", 28 | audio: Union[str, np.ndarray, torch.Tensor], 29 | *, 30 | verbose: Optional[bool] = None, 31 | temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), 32 | compression_ratio_threshold: Optional[float] = 2.4, 33 | logprob_threshold: Optional[float] = -1.0, 34 | no_speech_threshold: Optional[float] = 0.6, 35 | condition_on_previous_text: bool = True, 36 | initial_prompt: Optional[str] = None, 37 | **decode_options, 38 | ): 39 | """ 40 | Transcribe an audio file using Whisper 41 | Parameters 42 | ---------- 43 | model: Whisper 44 | The Whisper model instance 45 | audio: Union[str, np.ndarray, torch.Tensor] 46 | The path to the audio file to open, or the audio waveform 47 | verbose: Optional[bool] 48 | Whether to display the text being decoded to
the console. If True, displays all the details; 49 | if False, displays minimal details; if None, displays nothing 50 | temperature: Union[float, Tuple[float, ...]] 51 | Temperature for sampling. It can be a tuple of temperatures, which will be successively used 52 | upon failures according to either `compression_ratio_threshold` or `logprob_threshold`. 53 | compression_ratio_threshold: float 54 | If the gzip compression ratio is above this value, treat as failed 55 | logprob_threshold: float 56 | If the average log probability over sampled tokens is below this value, treat as failed 57 | no_speech_threshold: float 58 | If the no_speech probability is higher than this value AND the average log probability 59 | over sampled tokens is below `logprob_threshold`, consider the segment as silent 60 | condition_on_previous_text: bool 61 | if True, the previous output of the model is provided as a prompt for the next window; 62 | disabling may make the text inconsistent across windows, but the model becomes less prone to 63 | getting stuck in a failure loop, such as repetition looping or timestamps going out of sync. 64 | decode_options: dict 65 | Keyword arguments to construct `DecodingOptions` instances 66 | Yields 67 | ------ 68 | Segment dictionaries as they are decoded, each containing the segment text ("text"), its timing 69 | ("start", "end"), and decoding statistics, so callers can stream partial transcripts. 70 | """ 71 | dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32 72 | if model.device == torch.device("cpu"): 73 | if torch.cuda.is_available(): 74 | warnings.warn("Performing inference on CPU when CUDA is available") 75 | if dtype == torch.float16: 76 | warnings.warn("FP16 is not supported on CPU; using FP32 instead") 77 | dtype = torch.float32 78 | 79 | if dtype == torch.float32: 80 | decode_options["fp16"] = False 81 | 82 | mel = log_mel_spectrogram(audio) 83 | 84 | if decode_options.get("language", None) is None: 85 | if not model.is_multilingual: 86 | decode_options["language"] = "en" 87 | else: 88 | if verbose: 89 | print( 90 | "Detecting language using up to the first 30 seconds. 
Use `--language` to specify the language" 91 | ) 92 | segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype) 93 | _, probs = model.detect_language(segment) 94 | decode_options["language"] = max(probs, key=probs.get) 95 | if verbose is not None: 96 | print( 97 | f"Detected language: {LANGUAGES[decode_options['language']].title()}" 98 | ) 99 | 100 | language = decode_options["language"] 101 | task = decode_options.get("task", "transcribe") 102 | tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task) 103 | 104 | def decode_with_fallback(segment: torch.Tensor) -> DecodingResult: 105 | temperatures = ( 106 | [temperature] if isinstance(temperature, (int, float)) else temperature 107 | ) 108 | decode_result = None 109 | 110 | for t in temperatures: 111 | kwargs = {**decode_options} 112 | if t > 0: 113 | # disable beam_size and patience when t > 0 114 | kwargs.pop("beam_size", None) 115 | kwargs.pop("patience", None) 116 | else: 117 | # disable best_of when t == 0 118 | kwargs.pop("best_of", None) 119 | 120 | options = DecodingOptions(**kwargs, temperature=t) 121 | decode_result = model.decode(segment, options) 122 | 123 | needs_fallback = False 124 | if ( 125 | compression_ratio_threshold is not None 126 | and decode_result.compression_ratio > compression_ratio_threshold 127 | ): 128 | needs_fallback = True # too repetitive 129 | if ( 130 | logprob_threshold is not None 131 | and decode_result.avg_logprob < logprob_threshold 132 | ): 133 | needs_fallback = True # average log probability is too low 134 | 135 | if not needs_fallback: 136 | break 137 | 138 | return decode_result 139 | 140 | seek = 0 141 | input_stride = exact_div( 142 | N_FRAMES, model.dims.n_audio_ctx 143 | ) # mel frames per output token: 2 144 | time_precision = ( 145 | input_stride * HOP_LENGTH / SAMPLE_RATE 146 | ) # time per output token: 0.02 (seconds) 147 | all_tokens = [] 148 | all_segments = [] 149 | prompt_reset_since = 0 150 | 151 | if initial_prompt is not None: 152 | initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip()) 153 | all_tokens.extend(initial_prompt_tokens) 154 | else: 155 | initial_prompt_tokens = [] 156 | 157 | def add_segment( 158 | *, start: float, end: float, text_tokens: torch.Tensor, result: DecodingResult 159 | ): 160 | text = tokenizer.decode( 161 | [token for token in text_tokens if token < tokenizer.eot] 162 | ) 163 | if len(text.strip()) == 0: # skip empty text output 164 | return 165 | 166 | all_segments.append( 167 | { 168 | "id": len(all_segments), 169 | "seek": seek, 170 | "start": start, 171 | "end": end, 172 | "text": text, 173 | "tokens": text_tokens.tolist(), 174 | "temperature": result.temperature, 175 | "avg_logprob": result.avg_logprob, 176 | "compression_ratio": result.compression_ratio, 177 | "no_speech_prob": result.no_speech_prob, 178 | } 179 | ) 180 | if verbose: 181 | print( 182 | make_safe( 183 | f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}" 184 | ) 185 | ) 186 | 187 | # show the progress bar when verbose is False (otherwise the transcribed text will be printed) 188 | num_frames = mel.shape[-1] 189 | previous_seek_value = seek 190 | 191 | with tqdm.tqdm( 192 | total=num_frames, unit="frames", disable=verbose is not False 193 | ) as pbar: 194 | while seek < num_frames: 195 | timestamp_offset = float(seek * HOP_LENGTH / SAMPLE_RATE) 196 | segment = pad_or_trim(mel[:, seek:], N_FRAMES).to(model.device).to(dtype) 197 | segment_duration = segment.shape[-1] * HOP_LENGTH / SAMPLE_RATE 198 | 199 | 
decode_options["prompt"] = all_tokens[prompt_reset_since:] 200 | result: DecodingResult = decode_with_fallback(segment) 201 | tokens = torch.tensor(result.tokens) 202 | 203 | if no_speech_threshold is not None: 204 | # no voice activity check 205 | should_skip = result.no_speech_prob > no_speech_threshold 206 | if ( 207 | logprob_threshold is not None 208 | and result.avg_logprob > logprob_threshold 209 | ): 210 | # don't skip if the logprob is high enough, despite the no_speech_prob 211 | should_skip = False 212 | 213 | if should_skip: 214 | seek += segment.shape[ 215 | -1 216 | ] # fast-forward to the next segment boundary 217 | continue 218 | 219 | timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin) 220 | consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[ 221 | 0 222 | ].add_(1) 223 | if ( 224 | len(consecutive) > 0 225 | ): # if the output contains two consecutive timestamp tokens 226 | last_slice = 0 227 | for current_slice in consecutive: 228 | sliced_tokens = tokens[last_slice:current_slice] 229 | start_timestamp_position = ( 230 | sliced_tokens[0].item() - tokenizer.timestamp_begin 231 | ) 232 | end_timestamp_position = ( 233 | sliced_tokens[-1].item() - tokenizer.timestamp_begin 234 | ) 235 | add_segment( 236 | start=timestamp_offset 237 | + start_timestamp_position * time_precision, 238 | end=timestamp_offset + end_timestamp_position * time_precision, 239 | text_tokens=sliced_tokens[1:-1], 240 | result=result, 241 | ) 242 | yield all_segments[-1] 243 | 244 | last_slice = current_slice 245 | last_timestamp_position = ( 246 | tokens[last_slice - 1].item() - tokenizer.timestamp_begin 247 | ) 248 | seek += last_timestamp_position * input_stride 249 | all_tokens.extend(tokens[: last_slice + 1].tolist()) 250 | else: 251 | duration = segment_duration 252 | timestamps = tokens[timestamp_tokens.nonzero().flatten()] 253 | if ( 254 | len(timestamps) > 0 255 | and timestamps[-1].item() != tokenizer.timestamp_begin 256 | ): 257 | # no consecutive timestamps but it has a timestamp; use the last one. 258 | # single timestamp at the end means no speech after the last timestamp. 259 | last_timestamp_position = ( 260 | timestamps[-1].item() - tokenizer.timestamp_begin 261 | ) 262 | duration = last_timestamp_position * time_precision 263 | 264 | add_segment( 265 | start=timestamp_offset, 266 | end=timestamp_offset + duration, 267 | text_tokens=tokens, 268 | result=result, 269 | ) 270 | yield all_segments[-1] 271 | 272 | seek += segment.shape[-1] 273 | all_tokens.extend(tokens.tolist()) 274 | 275 | if not condition_on_previous_text or result.temperature > 0.5: 276 | # do not feed the prompt tokens if a high temperature was used 277 | prompt_reset_since = len(all_tokens) 278 | 279 | # update progress bar 280 | pbar.update(min(num_frames, seek) - previous_seek_value) 281 | previous_seek_value = seek 282 | 283 | 284 | import time 285 | 286 | dlt = whisper.load_model("tiny") 287 | 288 | 289 | def whisper_generator(path, model="tiny") -> AsyncGenerator: 290 | # returns a async generator that yields the transcribed text 291 | if model == "tiny": 292 | model = dlt 293 | else: 294 | start = time.time() 295 | model = whisper.load_model(model) 296 | logger.info(f"Loaded model in {time.time() - start} seconds") 297 | async_generator = transcribe(model, path, verbose=True) 298 | return async_generator 299 | --------------------------------------------------------------------------------