├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── docker-image.yml
├── .paperspace
│   └── app.yaml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── app.py
├── approach.png
├── data
│   ├── README.md
│   └── meanwhile.json
├── inputs
│   ├── audio
│   │   └── .DS_Store
│   ├── saved
│   │   └── .DS_store
│   └── vids
│       └── .DS_store
├── language-breakdown.svg
├── model-card.md
├── notebooks
│   ├── LibriSpeech.ipynb
│   └── Multilingual_ASR.ipynb
├── outputs
│   └── saved
│       └── .DS_store
├── requirements.txt
├── results
│   └── subbed_vids
│       └── .DS_store
├── setup.py
├── spec.yaml
├── templates
│   ├── index.html
│   └── logo.png
├── tests
│   ├── jfk.flac
│   ├── test_audio.py
│   ├── test_normalizer.py
│   ├── test_tokenizer.py
│   └── test_transcribe.py
├── whisper-caption.ipynb
└── whisper
    ├── __init__.py
    ├── __main__.py
    ├── assets
    │   ├── gpt2
    │   │   ├── merges.txt
    │   │   ├── special_tokens_map.json
    │   │   ├── tokenizer_config.json
    │   │   └── vocab.json
    │   ├── mel_filters.npz
    │   └── multilingual
    │       ├── added_tokens.json
    │       ├── merges.txt
    │       ├── special_tokens_map.json
    │       ├── tokenizer_config.json
    │       └── vocab.json
    ├── audio.py
    ├── decoding.py
    ├── model.py
    ├── normalizers
    │   ├── __init__.py
    │   ├── basic.py
    │   ├── english.json
    │   └── english.py
    ├── tokenizer.py
    ├── transcribe.py
    └── utils.py

/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: "github-actions"
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: "daily"
 7 |   - package-ecosystem: "docker"
 8 |     directory: "/"
 9 |     schedule:
10 |       interval: "daily"
11 |   - package-ecosystem: "pip"
12 |     directory: "/api"
13 |     schedule:
14 |       interval: "daily"
15 |   - package-ecosystem: "npm"
16 |     directory: "/web"
17 |     schedule:
18 |       interval: "daily"
19 | 
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
 1 | name: Create and publish a Docker image
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - "main"
 6 |     paths-ignore:
 7 |       - "**.md"
 8 |       - "docker-compose.yml"
 9 |       - "docker-compose.dev.yml"
10 |       - ".github/ISSUE_TEMPLATE/*.yml"
11 |       - ".github/dependabot.yml"
12 |   pull_request:
13 |     branches:
14 |       - "main"
15 |     paths-ignore:
16 |       - "**.md"
17 |       - "docker-compose.yml"
18 |       - "docker-compose.dev.yml"
19 |       - ".github/ISSUE_TEMPLATE/*.yml"
20 |       - ".github/dependabot.yml"
21 |   workflow_dispatch:
22 |   release:
23 |     types: [published, edited]
24 | 
25 | jobs:
26 |   build-and-publish-image:
27 |     runs-on: ubuntu-latest
28 |     environment: 05a580ce02c1b4b40b081cc9f6e028
29 |     steps:
30 |       - name: Checkout
31 |         uses: actions/checkout@v3
32 |       - name: Docker metadata
33 |         id: meta
34 |         uses: docker/metadata-action@v4
35 |         with:
36 |           images: |
37 |             ghcr.io/${{ github.repository }}
38 |           tags: |
39 |             type=raw,value=latest,enable={{is_default_branch}}
40 |             type=ref,event=branch
41 |             type=ref,event=pr
42 |             type=semver,pattern={{version}}
43 |             type=semver,pattern={{major}}
44 |             type=semver,pattern={{major}}.{{minor}}
45 | 
46 |       - name: Set up QEMU
47 |         uses: docker/setup-qemu-action@v2
48 | 
49 |       - name: Set up Docker Buildx
50 |         uses: docker/setup-buildx-action@v2
51 | 
52 |       - name: Login to GitHub Container Registry
53 |         if: github.event_name != 'pull_request'
54 |         uses: docker/login-action@v2
55 |         with:
56 |           registry: ghcr.io
57 |           username: ${{ github.repository_owner }}
58 |           password: ${{ secrets.GITHUB_TOKEN }}
59 | 
60 |       - name: Build and Publish Docker Image
61 |         uses: docker/build-push-action@v4
62 |         id: build-push
63 |         with:
64 |           context: .
65 |           push: ${{ github.event_name != 'pull_request' }}
66 |           cache-from: type=gha
67 |           cache-to: type=gha,mode=max
68 |           platforms: linux/amd64,linux/arm64
69 |           tags: ${{ steps.meta.outputs.tags }}
70 |           labels: ${{ steps.meta.outputs.labels }}
71 | 
72 |       - name: Set container SHA
73 |         id: set-container-sha
74 |         run: |
75 |           echo "CONTAINER_SHA=${{ steps.build-push.outputs.digest }}" >> "$GITHUB_OUTPUT"
76 |           echo "${{ steps.build-push.outputs.digest }}"
77 | 
78 |       - uses: paperspace/deploy-action@v1.2
79 |         name: Deploy to Paperspace
80 |         id: deploy
81 |         env:
82 |           PAPERSPACE_API_KEY: ${{ secrets.PAPERSPACE_API_KEY }}
83 |           CONTAINER_SHA: ${{ steps.set-container-sha.outputs.CONTAINER_SHA }}
84 |         with:
85 |           projectId: pioiimjvdsx
86 |           configPath: /../.paperspace/app.yaml
87 |           image: ghcr.io/${{ github.repository }}:latest@${{ steps.build-push.outputs.digest }}
88 | 
--------------------------------------------------------------------------------
/.paperspace/app.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: latest
 2 | enabled: true
 3 | name: gradient-ai/Whisper-AutoCaption
 4 | image: ghcr.io/gradient-ai/Whisper-AutoCaption:latest
 5 | port: 8008
 6 | healthChecks:
 7 |   readiness:
 8 |     path: /
 9 | resources:
10 |   replicas: 1
11 |   instanceType: P6000
12 |   autoscaling:
13 |     enabled: true
14 |     maxReplicas: 2
15 |     metrics:
16 |       - metric: requestDuration
17 |         summary: average
18 |         value: 0.15
19 |       - metric: cpu
20 |         summary: average
21 |         value: 30
22 |       - metric: memory
23 |         summary: average
24 |         value: 45
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.8-slim-buster
 2 | 
 3 | RUN apt-get update \
 4 |     && apt-get install -y wget \
 5 |     && rm -rf /var/lib/apt/lists/*
 6 | RUN apt-get update && apt-get install -y git
 7 | RUN pip install flask
 8 | RUN pip install Werkzeug
 9 | RUN pip install numpy
10 | RUN pip install torch==1.10.0 -f https://download.pytorch.org/whl/torch_stable.html
11 | RUN pip install tqdm
12 | RUN pip install more-itertools
13 | RUN pip install "transformers>=4.19.0"
14 | RUN pip install opencv-python-headless
15 | RUN pip install ffmpeg-python
16 | RUN apt-get install -y ffmpeg
17 | RUN pip install git+https://github.com/openai/whisper.git
18 | RUN pip install pandas
19 | RUN pip install moviepy --upgrade
20 | RUN apt-get install -y imagemagick
21 | RUN sed -i '88d' /etc/ImageMagick-6/policy.xml  # drop the ImageMagick policy line that blocks MoviePy's TextClip rendering
22 | RUN git clone https://github.com/gradient-ai/Whisper-AutoCaption
23 | WORKDIR Whisper-AutoCaption/
24 | RUN pip install -r requirements.txt
25 | RUN pip install -U yt-dlp
26 | RUN find .paperspace/ -type f > listOfFiles.list
27 | 
28 | EXPOSE 5000
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 OpenAI
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include whisper/assets/*
 2 | include whisper/assets/gpt2/*
 3 | include whisper/assets/multilingual/*
 4 | include whisper/normalizers/english.json
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Whisper Auto Caption
  2 | 
  3 | This repo shows how to translate and automatically caption videos using Whisper and MoviePy.
  4 | 
  5 | Launch this in Paperspace Gradient by clicking the link below.
  6 | 
  7 | ## Launch Notebook
  8 | 
  9 | [Run on Gradient](https://console.paperspace.com/github/gradient-ai/Whisper-AutoCaption/blob/master/whisper-caption.ipynb?machine=Free-GPU)
 10 | 
 11 | ---
 12 | 
 13 | # The `subtitle_video` function
 14 | 
 15 | The `subtitle_video` function can be accessed through the whisper-caption.ipynb notebook. This function uses Whisper and MoviePy to take in a video, extract its audio, convert its speech into text captions, and then add those captions back onto the original video at the correct timestamps.
 16 | 
 17 | `subtitle_video` takes the following parameters (a usage sketch appears just before the deployment spec below):
 18 | 
 19 | ```
 20 | download: bool, whether to download a YouTube video
 21 | url: str, the URL of the YouTube video to download if download is True
 22 | aud_opts: dict, youtube-dl options for the audio file
 23 | vid_opts: dict, youtube-dl options for the video file
 24 | model_type: str, which pretrained model to download. Options are:
 25 |     ['tiny', 'small', 'base', 'medium', 'large', 'tiny.en', 'small.en', 'base.en', 'medium.en']
 26 |     More details about model types can be found in the table in the original repo:
 27 |     https://github.com/openai/whisper#available-models-and-languages
 28 | name: str, name of the directory to store files in within the experiments folder
 29 | audio_file: str, path to the extracted audio file for Whisper
 30 | input_file: str, path to the video file for MoviePy to caption
 31 | output: str, destination of the final output video file
 32 | uploaded_vid: str, path to the uploaded video file if download is False
 33 | ```
 34 | 
 35 | ---
 36 | 
 37 | # The Whisper AutoCaption Flask application
 38 | 
 39 | To deploy Whisper AutoCaption as a Flask web application, go to Gradient Deployments and create a new deployment. Fill in the values shown in the spec below and create the deployment. Once it is running, click the API endpoint URL on the deployment's details page.
 40 | 
 41 | From there, you can directly input any video from your local computer or a YouTube URL.
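
For reference, calling `subtitle_video` directly from the notebook (rather than through the web app) looks roughly like this; every argument value below is illustrative rather than a default enforced by the repo:

```python
# Sketch only: assumes the notebook cell defining subtitle_video has been run.
subtitle_video(
    download=True,                     # pull the video from YouTube
    url="https://www.youtube.com/watch?v=...",
    aud_opts={'format': 'mp3/bestaudio/best', 'outtmpl': 'inputs/audio/audio.mp3'},
    vid_opts={'format': 'mp4/bestvideo/best', 'outtmpl': 'inputs/vids/video.mp4'},
    model_type='medium',               # any name from the model table further below
    name='demo',                       # experiment directory name
    audio_file='inputs/audio/audio.mp3',
    input_file='inputs/vids/video.mp4',
    output='results/subbed_vids/video.mp4',
    uploaded_vid=None,                 # only used when download=False
)
```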
 42 | The core of the deployment spec looks like this:
 43 | ```
 44 | image: paperspace/whisper-autocaption:v1.01
 45 | port: 5000
 46 | resources:
 47 |   replicas: 1
 48 |   instanceType: RTX4000
 49 | ```
 50 | 
 51 | The full spec is as follows:
 52 | 
 53 | ```
 54 | enabled: true
 55 | image: paperspace/whisper-autocaption:v1.01
 56 | port: 5000
 57 | resources:
 58 |   replicas: 1
 59 |   instanceType: RTX4000
 60 |   autoscaling:
 61 |     enabled: true
 62 |     maxReplicas: 5
 63 |     metrics:
 64 |       - metric: requestDuration
 65 |         summary: average
 66 |         value: 0.15
 67 |       - metric: cpu
 68 |         summary: average
 69 |         value: 30
 70 |       - metric: memory
 71 |         summary: average
 72 |         value: 45
 73 | ```
 74 | 
 75 | ---
 76 | 
 77 | Future plans:
 78 | 
 79 | - API version
 80 | 
 81 | ---
 82 | 
 83 | [[Blog]](https://openai.com/blog/whisper)
 84 | [[Paper]](https://cdn.openai.com/papers/whisper.pdf)
 85 | [[Model card]](model-card.md)
 86 | 
 87 | Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.
 88 | 
 89 | ## Approach
 90 | 
 91 | ![Approach](approach.png)
 92 | 
 93 | A Transformer sequence-to-sequence model is trained on various speech processing tasks, including multilingual speech recognition, speech translation, spoken language identification, and voice activity detection. All of these tasks are jointly represented as a sequence of tokens to be predicted by the decoder, allowing a single model to replace many different stages of a traditional speech processing pipeline. The multitask training format uses a set of special tokens that serve as task specifiers or classification targets.
 94 | 
 95 | ## Setup
 96 | 
 97 | We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.7 or later and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. The following command will pull and install the latest commit from this repository, along with its Python dependencies:
 98 | 
 99 |     pip install git+https://github.com/openai/whisper.git
100 | 
101 | It also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:
102 | 
103 | ```bash
104 | # on Ubuntu or Debian
105 | sudo apt update && sudo apt install ffmpeg
106 | 
107 | # on Arch Linux
108 | sudo pacman -S ffmpeg
109 | 
110 | # on MacOS using Homebrew (https://brew.sh/)
111 | brew install ffmpeg
112 | 
113 | # on Windows using Chocolatey (https://chocolatey.org/)
114 | choco install ffmpeg
115 | 
116 | # on Windows using Scoop (https://scoop.sh/)
117 | scoop install ffmpeg
118 | ```
119 | 
120 | You may need [`rust`](https://rust-lang.org) installed as well, in case [tokenizers](https://pypi.org/project/tokenizers/) does not provide a pre-built wheel for your platform. If you see installation errors during the `pip install` command above, please follow the [Getting started page](https://www.rust-lang.org/learn/get-started) to install the Rust development environment. Additionally, you may need to configure the `PATH` environment variable, e.g. `export PATH="$HOME/.cargo/bin:$PATH"`. If the installation fails with `No module named 'setuptools_rust'`, you need to install `setuptools_rust`, e.g. by running:
121 | 
122 | ```bash
123 | pip install setuptools-rust
124 | ```
125 | 
126 | ## Available models and languages
127 | 
128 | There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and relative speed.
129 | 
130 | |  Size  | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
131 | | :----: | :--------: | :----------------: | :----------------: | :-----------: | :------------: |
132 | |  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~32x      |
133 | |  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~16x      |
134 | | small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~6x       |
135 | | medium |   769 M    |    `medium.en`     |      `medium`      |     ~5 GB     |      ~2x       |
136 | | large  |   1550 M   |        N/A         |      `large`       |    ~10 GB     |       1x       |
137 | 
138 | For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
139 | 
140 | Whisper's performance varies widely depending on the language. The figure below shows a WER breakdown by language on the Fleurs dataset, using the `large` model. More WER and BLEU scores corresponding to the other models and datasets can be found in Appendix D of [the paper](https://cdn.openai.com/papers/whisper.pdf).
141 | 
142 | ![WER breakdown by language](language-breakdown.svg)
143 | 
144 | ## Command-line usage
145 | 
146 | The following command will transcribe speech in audio files, using the `medium` model:
147 | 
148 |     whisper audio.flac audio.mp3 audio.wav --model medium
149 | 
150 | The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
151 | 
152 |     whisper japanese.wav --language Japanese
153 | 
154 | Adding `--task translate` will translate the speech into English:
155 | 
156 |     whisper japanese.wav --language Japanese --task translate
157 | 
158 | Run the following to view all available options:
159 | 
160 |     whisper --help
161 | 
162 | See [tokenizer.py](whisper/tokenizer.py) for the list of all available languages.
163 | 
164 | ## Python usage
165 | 
166 | Transcription can also be performed within Python:
167 | 
168 | ```python
169 | import whisper
170 | 
171 | model = whisper.load_model("base")
172 | result = model.transcribe("audio.mp3")
173 | print(result["text"])
174 | ```
175 | 
176 | Internally, the `transcribe()` method reads the entire file and processes the audio with a sliding 30-second window, performing autoregressive sequence-to-sequence predictions on each window.
177 | 
178 | Below is an example usage of `whisper.detect_language()` and `whisper.decode()`, which provide lower-level access to the model.
179 | 
180 | ```python
181 | import whisper
182 | 
183 | model = whisper.load_model("base")
184 | 
185 | # load audio and pad/trim it to fit 30 seconds
186 | audio = whisper.load_audio("audio.mp3")
187 | audio = whisper.pad_or_trim(audio)
188 | 
189 | # make log-Mel spectrogram and move to the same device as the model
190 | mel = whisper.log_mel_spectrogram(audio).to(model.device)
191 | 
192 | # detect the spoken language
193 | _, probs = model.detect_language(mel)
194 | print(f"Detected language: {max(probs, key=probs.get)}")
195 | 
196 | # decode the audio
197 | options = whisper.DecodingOptions()
198 | result = whisper.decode(model, mel, options)
199 | 
200 | # print the recognized text
201 | print(result.text)
202 | ```
203 | 
204 | ## More examples
205 | 
206 | Please use the [🙌 Show and tell](https://github.com/openai/whisper/discussions/categories/show-and-tell) category in Discussions for sharing more example usages of Whisper and third-party extensions such as web demos, integrations with other tools, ports for different platforms, etc.
207 | 
208 | ## License
209 | 
210 | The code and the model weights of Whisper are released under the MIT License. See [LICENSE](LICENSE) for further details.
211 | 
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | from cgitb import text
 3 | from yt_dlp import YoutubeDL
 4 | import yt_dlp
 5 | import whisper
 6 | import pandas as pd
 7 | from moviepy.editor import VideoFileClip
 8 | import moviepy.editor as mp
 9 | from moviepy.editor import *
10 | from moviepy.video.tools.subtitles import SubtitlesClip
11 | import os
12 | 
13 | import cv2
14 | from os import listdir
15 | from os.path import isfile, join
16 | from werkzeug.utils import secure_filename
17 | import shutil
18 | import argparse
19 | import torch
20 | import torchvision.transforms as transforms
21 | from PIL import Image
22 | from flask import Flask, jsonify, request, render_template, redirect, url_for, send_from_directory
23 | 
24 | import sys
25 | 
26 | UPLOAD_FOLDER = 'inputs/vids'
27 | OUTPUT_FOLDER = 'results/subbed_vids'
28 | 
29 | ALLOWED_EXTENSIONS = {'mp4', 'mov', 'webm', 'ts', 'avi', 'y4m', 'mkv'}
30 | 
31 | app = Flask(__name__, static_folder='results')
32 | app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
33 | app.config['OUTPUT_FOLDER'] = OUTPUT_FOLDER
34 | 
35 | 
36 | 
37 | @app.route("/", methods=['GET', 'POST'])
38 | def index():
39 |     return redirect(url_for('upload_file'))
40 | 
41 | def allowed_file(filename):
42 |     return '.' in filename and \
43 |         filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
44 | 
45 | @app.route("/upload", methods=['GET', 'POST'])
46 | def upload_file():
47 |     # print(request.args.get('key', ''))
48 |     source = 'inputs/vids'
49 |     audio = 'inputs/audio'
50 |     out = 'results/subbed_vids/'
51 |     opts_aud = {'format': 'mp3/bestaudio/best', 'keepvideo': True, 'outtmpl': 'inputs/audio/audio.mp3'}
52 |     vid_opts = {'format': 'mp4/bestvideo/best', 'outtmpl': f'{source}/video.mp4'}
53 |     for f in os.listdir(source):
54 |         os.remove(os.path.join(source, f))
55 |     for f in os.listdir(audio):
56 |         os.remove(os.path.join(audio, f))
57 |     for f in os.listdir(out):
58 |         os.remove(os.path.join(out, f))
59 |     try:  # if no YouTube URL was submitted, the download fails and we fall through to the file upload path
60 |         text1 = request.form.values()
61 |         text1 = list(text1)
62 |         with YoutubeDL(vid_opts) as ydl:
63 |             ydl.download(text1)
64 |         with YoutubeDL(opts_aud) as ydl:
65 |             ydl.download(text1)
66 |     except Exception:
67 |         pass
68 | 
69 |     if request.method == 'POST':
70 |         # check if the post request has the file part
71 |         if 'file' not in request.files:
72 |             if 'video.mp4' in os.listdir('inputs/vids/'):
73 |                 return redirect(url_for('main', name='inputs/vids/video.mp4'))
74 |             print('No file part')
75 |             return redirect(request.url)
76 |         file = request.files['file']
77 | 
78 |         # If the user does not select a file, the browser submits an
79 |         # empty file without a filename.
80 |         if file.filename == '':
81 |             print('No selected file')
82 |             return redirect(request.url)
83 |         if file and allowed_file(file.filename):
84 |             filename = secure_filename(file.filename)
85 |             file.save(os.path.join(app.config['UPLOAD_FOLDER'], 'video.mp4'))
86 |             return redirect(url_for('main', name='video.mp4'))
87 |     return '''
88 |     <!doctype html>
89 |     <html>
90 |     <body>
91 |     <form method=post enctype=multipart/form-data>
92 |       <input type=text name=link placeholder="Paste a YouTube URL here">
93 |       <input type=file name=file>
94 |       <input type=submit value=Upload>
95 |     </form>
96 |     </body>
97 |     </html>
98 |     '''
99 | 
--------------------------------------------------------------------------------
/notebooks/LibriSpeech.ipynb:
--------------------------------------------------------------------------------
239 | "      <th></th>\n",
240 | "      <th>hypothesis</th>\n",
241 | "      <th>reference</th>\n",
242 | "    </tr>\n",
---|---|---|
0 | \n", 247 | "He hoped there would be stew for dinner, turni... | \n", 248 | "HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... | \n", 249 | "
1 | \n", 252 | "Stuffered into you, his belly counseled him. | \n", 253 | "STUFF IT INTO YOU HIS BELLY COUNSELLED HIM | \n", 254 | "
2 | \n", 257 | "After early nightfall the yellow lamps would l... | \n", 258 | "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... | \n", 259 | "
3 | \n", 262 | "Hello Bertie, any good in your mind? | \n", 263 | "HELLO BERTIE ANY GOOD IN YOUR MIND | \n", 264 | "
4 | \n", 267 | "Number 10. Fresh Nelly is waiting on you. Good... | \n", 268 | "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... | \n", 269 | "
... | \n", 272 | "... | \n", 273 | "... | \n", 274 | "
2615 | \n", 277 | "Oh, to shoot my soul's full meaning into futur... | \n", 278 | "OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... | \n", 279 | "
2616 | \n", 282 | "Then I, long tried by natural ills, received t... | \n", 283 | "THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... | \n", 284 | "
2617 | \n", 287 | "I love thee freely as men strive for right. I ... | \n", 288 | "I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... | \n", 289 | "
2618 | \n", 292 | "I love thee with the passion put to use, in my... | \n", 293 | "I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... | \n", 294 | "
2619 | \n", 297 | "I love thee with the love I seemed to lose wit... | \n", 298 | "I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... | \n", 299 | "
2620 rows × 2 columns
\n", 303 | "\n", 403 | " | hypothesis | \n", 404 | "reference | \n", 405 | "hypothesis_clean | \n", 406 | "reference_clean | \n", 407 | "
---|---|---|---|---|
0 | \n", 412 | "He hoped there would be stew for dinner, turni... | \n", 413 | "HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... | \n", 414 | "he hoped there would be stew for dinner turnip... | \n", 415 | "he hoped there would be stew for dinner turnip... | \n", 416 | "
1 | \n", 419 | "Stuffered into you, his belly counseled him. | \n", 420 | "STUFF IT INTO YOU HIS BELLY COUNSELLED HIM | \n", 421 | "stuffered into you his belly counseled him | \n", 422 | "stuff it into you his belly counseled him | \n", 423 | "
2 | \n", 426 | "After early nightfall the yellow lamps would l... | \n", 427 | "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... | \n", 428 | "after early nightfall the yellow lamps would l... | \n", 429 | "after early nightfall the yellow lamps would l... | \n", 430 | "
3 | \n", 433 | "Hello Bertie, any good in your mind? | \n", 434 | "HELLO BERTIE ANY GOOD IN YOUR MIND | \n", 435 | "hello bertie any good in your mind | \n", 436 | "hello bertie any good in your mind | \n", 437 | "
4 | \n", 440 | "Number 10. Fresh Nelly is waiting on you. Good... | \n", 441 | "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... | \n", 442 | "number 10 fresh nelly is waiting on you good n... | \n", 443 | "number 10 fresh nelly is waiting on you good n... | \n", 444 | "
... | \n", 447 | "... | \n", 448 | "... | \n", 449 | "... | \n", 450 | "... | \n", 451 | "
2615 | \n", 454 | "Oh, to shoot my soul's full meaning into futur... | \n", 455 | "OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... | \n", 456 | "0 to shoot my soul is full meaning into future... | \n", 457 | "0 to shoot my soul is full meaning into future... | \n", 458 | "
2616 | \n", 461 | "Then I, long tried by natural ills, received t... | \n", 462 | "THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... | \n", 463 | "then i long tried by natural ills received the... | \n", 464 | "then i long tried by natural ills received the... | \n", 465 | "
2617 | \n", 468 | "I love thee freely as men strive for right. I ... | \n", 469 | "I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... | \n", 470 | "i love thee freely as men strive for right i l... | \n", 471 | "i love thee freely as men strive for right i l... | \n", 472 | "
2618 | \n", 475 | "I love thee with the passion put to use, in my... | \n", 476 | "I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... | \n", 477 | "i love thee with the passion put to use in my ... | \n", 478 | "i love thee with the passion put to use in my ... | \n", 479 | "
2619 | \n", 482 | "I love thee with the love I seemed to lose wit... | \n", 483 | "I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... | \n", 484 | "i love thee with the love i seemed to lose wit... | \n", 485 | "i love thee with a love i seemed to lose with ... | \n", 486 | "
2620 rows × 4 columns
\n", 490 | "