├── .github ├── tests.json └── workflows │ ├── CD-docker_dev.yml │ ├── CD-docker_release.yml │ ├── CI-runpod_dep.yml │ └── CI-test_handler.yml ├── .gitignore ├── .runpod ├── hub.json └── tests.json ├── Dockerfile ├── LICENSE ├── README.md ├── builder ├── fetch_models.py └── requirements.txt ├── locustfile.py ├── public ├── banner.jpeg └── banner.png ├── src ├── predict.py ├── rp_handler.py └── rp_schema.py └── test_input.json /.github/tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "hardwareConfig": { 4 | "endpointConfig": { 5 | "gpuIds": "AMPERE_16", 6 | "name": "16GB GPU" 7 | } 8 | }, 9 | "input": { 10 | "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 11 | "model": "base", 12 | "transcription": "plain text", 13 | "translate": false, 14 | "temperature": 0, 15 | "best_of": 5, 16 | "beam_size": 5, 17 | "suppress_tokens": "-1", 18 | "condition_on_previous_text": false, 19 | "temperature_increment_on_fallback": 0.2, 20 | "compression_ratio_threshold": 2.4, 21 | "logprob_threshold": -1, 22 | "no_speech_threshold": 0.6 23 | } 24 | } 25 | ] 26 | -------------------------------------------------------------------------------- /.github/workflows/CD-docker_dev.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | 8 | jobs: 9 | docker: 10 | runs-on: DO 11 | steps: 12 | - name: Set up QEMU 13 | uses: docker/setup-qemu-action@v2 14 | 15 | - name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v2 17 | 18 | - name: Login to Docker Hub 19 | uses: docker/login-action@v2 20 | with: 21 | username: ${{ secrets.DOCKERHUB_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_TOKEN }} 23 | 24 | - name: Build and push 25 | uses: docker/build-push-action@v4 26 | with: 27 | push: true 28 | tags: ${{ vars.DOCKERHUB_REPO }}/${{ vars.DOCKERHUB_IMG }}:dev 29 | - uses: 
actions/checkout@v3 30 | - name: Run Tests 31 | uses: direlines/runpod-test-runner@v1.7 32 | with: 33 | image-tag: ${{ vars.DOCKERHUB_REPO }}/${{ vars.DOCKERHUB_IMG }}:dev 34 | runpod-api-key: ${{ secrets.RUNPOD_API_KEY }} 35 | request-timeout: 600 36 | -------------------------------------------------------------------------------- /.github/workflows/CD-docker_release.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | docker: 9 | runs-on: DO 10 | steps: 11 | - name: Set up QEMU 12 | uses: docker/setup-qemu-action@v2 13 | 14 | - name: Set up Docker Buildx 15 | uses: docker/setup-buildx-action@v2 16 | 17 | - name: Login to Docker Hub 18 | uses: docker/login-action@v2 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_TOKEN }} 22 | 23 | - name: Build and push 24 | uses: docker/build-push-action@v4 25 | with: 26 | push: true 27 | tags: ${{ secrets.DOCKERHUB_REPO }}/${{ secrets.DOCKERHUB_IMG }}:${{ github.event.release.tag_name }} 28 | -------------------------------------------------------------------------------- /.github/workflows/CI-runpod_dep.yml: -------------------------------------------------------------------------------- 1 | name: CI | Update runpod package version 2 | 3 | on: 4 | repository_dispatch: 5 | types: [python-package-release] 6 | 7 | push: 8 | branches: 9 | - "main" 10 | - "master" 11 | 12 | workflow_dispatch: 13 | 14 | jobs: 15 | check_dep: 16 | runs-on: ubuntu-latest 17 | name: Check python requirements file and update 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | 22 | - name: Check for new package version and update 23 | run: | 24 | echo "Fetching the current runpod version from requirements.txt..." 
25 | 26 | # Get current version, allowing for '~=' operator 27 | current_version=$(grep -oP 'runpod~=\K[^ ]+' ./builder/requirements.txt) 28 | echo "Current version: $current_version" 29 | 30 | # Get new version from PyPI 31 | new_version=$(curl -s https://pypi.org/pypi/runpod/json | jq -r .info.version) 32 | echo "NEW_VERSION_ENV=$new_version" >> $GITHUB_ENV 33 | echo "New version: $new_version" 34 | 35 | if [ -z "$new_version" ]; then 36 | echo "ERROR: Failed to fetch the new version from PyPI." 37 | exit 1 38 | fi 39 | 40 | # Extract major and minor from current version (e.g., 1.7) 41 | current_major_minor=$(echo $current_version | cut -d. -f1,2) 42 | new_major_minor=$(echo $new_version | cut -d. -f1,2) 43 | 44 | echo "Current major.minor: $current_major_minor" 45 | echo "New major.minor: $new_major_minor" 46 | 47 | # Check if the new version is within the current major.minor range (e.g., 1.7.x) 48 | if [ "$new_major_minor" = "$current_major_minor" ]; then 49 | echo "No update needed. The new version ($new_version) is within the allowed range (~= $current_major_minor)." 50 | exit 0 51 | fi 52 | 53 | echo "New major/minor detected ($new_major_minor). Updating runpod version..." 54 | 55 | # Update requirements.txt with the new version while keeping '~=' 56 | sed -i "s/runpod~=.*/runpod~=$new_version/" ./builder/requirements.txt 57 | echo "requirements.txt has been updated." 
58 | 59 | - name: Create Pull Request 60 | uses: peter-evans/create-pull-request@v3 61 | with: 62 | token: ${{ secrets.GITHUB_TOKEN }} 63 | commit-message: Update runpod package version 64 | title: Update runpod package version 65 | body: The package version has been updated to ${{ env.NEW_VERSION_ENV }} 66 | branch: runpod-package-update 67 | -------------------------------------------------------------------------------- /.github/workflows/CI-test_handler.yml: -------------------------------------------------------------------------------- 1 | name: CI | Test Worker 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | workflow_dispatch: 13 | 14 | jobs: 15 | initialize_worker: 16 | runs-on: ubuntu-latest 17 | outputs: 18 | id: ${{ steps.extract_id.outputs.runpod_job_id }} 19 | 20 | steps: 21 | - name: Deploy Worker 22 | id: deploy 23 | uses: fjogeleit/http-request-action@v1 24 | with: 25 | url: "https://api.runpod.ai/v2/${{ secrets.RUNPOD_ENDPOINT }}/run" 26 | method: "POST" 27 | customHeaders: '{"Content-Type": "application/json"}' 28 | bearerToken: ${{ secrets.RUNPOD_API_KEY }} 29 | data: '{"input":{"github_pat": "${{ secrets.GH_PAT }}", "github_org":"${{ secrets.GH_ORG }}"}}' 30 | 31 | - name: Extract Job ID 32 | id: extract_id 33 | run: | 34 | ID=$(echo '${{ steps.deploy.outputs.response }}' | jq -r '.id') 35 | echo "::set-output name=runpod_job_id::$ID" 36 | 37 | run_tests: 38 | needs: initialize_worker 39 | runs-on: runpod 40 | 41 | steps: 42 | - uses: actions/checkout@v3 43 | 44 | - name: Set up environment 45 | run: | 46 | rm -f /etc/apt/sources.list.d/*.list 47 | apt-get update -y 48 | apt-get upgrade -y 49 | apt-get install --yes --no-install-recommends sudo ca-certificates git wget curl bash libgl1 libx11-6 software-properties-common ffmpeg build-essential -y 50 | apt-get autoremove -y 51 | apt-get clean -y 52 | rm -rf /var/lib/apt/lists/* 53 | 54 | - name: Set up Python 3.10 & install dependencies 55 
| uses: actions/setup-python@v4 56 | with: 57 | python-version: "3.10.12" 58 | 59 | - name: Install Dependencies 60 | run: | 61 | 62 | python -m pip install --upgrade pip 63 | pip install -r builder/requirements.txt 64 | 65 | - name: Fetch and run models 66 | run: | 67 | python builder/fetch_models.py 68 | 69 | - name: Execute Tests 70 | run: | 71 | python src/rp_handler.py --test_input='{"input": {"audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav"}}' 72 | 73 | terminate_worker: 74 | if: ${{ always() && !success() }} 75 | needs: initialize_worker 76 | runs-on: ubuntu-latest 77 | 78 | steps: 79 | - name: Shutdown Worker 80 | uses: fjogeleit/http-request-action@v1 81 | with: 82 | url: "https://api.runpod.ai/v2/${{ secrets.RUNPOD_ENDPOINT }}/cancel/${{ needs.initialize_worker.outputs.id }}" 83 | method: "POST" 84 | customHeaders: '{"Content-Type": "application/json"}' 85 | bearerToken: ${{ secrets.RUNPOD_API_KEY }} 86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | runpod.toml 162 | -------------------------------------------------------------------------------- /.runpod/hub.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Faster Whisper", 3 | "description": "Process audio files using various Whisper models, with options for transcription formatting, language translation and more.", 4 | "type": "serverless", 5 | "category": "audio", 6 | "iconUrl": "https://dummyimage.com/100x100/0066ff/fff&text=FW", 7 | "config": { 8 | "runsOn": "GPU", 9 | "containerDiskInGb": 20, 10 | "gpuIds": "ADA_24", 11 | "gpuCount": 1, 12 | "allowedCudaVersions": ["12.7", "12.6", "12.5", "12.4", "12.3"] 13 | } 14 | } -------------------------------------------------------------------------------- /.runpod/tests.json: -------------------------------------------------------------------------------- 1 | { 2 | "tests": [ 3 | { 4 | "name": "basic_test", 5 | "input": { 6 | "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 7 | "model": "turbo", 8 | "transcription": "plain text", 9 | "translate": false, 10 | "temperature": 0, 11 | "best_of": 5, 12 | "beam_size": 5, 13 | "suppress_tokens": "-1", 14 | "condition_on_previous_text": false, 15 | "temperature_increment_on_fallback": 0.2, 16 | "compression_ratio_threshold": 2.4, 17 | "logprob_threshold": -1, 18 | "no_speech_threshold": 0.6 19 | }, 20 | "timeout": 10000 21 | } 22 | ], 23 | "config": { 24 | "gpuTypeId": "NVIDIA GeForce RTX 4090", 25 | "gpuCount": 1, 26 | "allowedCudaVersions": [ 27 | "12.7", 28 | "12.6", 29 | "12.5", 30 | "12.4", 31 | "12.3" 32 | ] 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # faster-whisper turbo needs cudnnn >= 9 2 | # see https://github.com/runpod-workers/worker-faster_whisper/pull/44 3 | FROM 
nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04 4 | 5 | # Remove any third-party apt sources to avoid issues with expiring keys. 6 | RUN rm -f /etc/apt/sources.list.d/*.list 7 | 8 | # Set shell and noninteractive environment variables 9 | SHELL ["/bin/bash", "-c"] 10 | ENV DEBIAN_FRONTEND=noninteractive 11 | ENV SHELL=/bin/bash 12 | 13 | # Set working directory 14 | WORKDIR / 15 | 16 | # Update and upgrade the system packages 17 | RUN apt-get update -y && \ 18 | apt-get upgrade -y && \ 19 | apt-get install --yes --no-install-recommends sudo ca-certificates git wget curl bash libgl1 libx11-6 software-properties-common ffmpeg build-essential -y &&\ 20 | apt-get autoremove -y && \ 21 | apt-get clean -y && \ 22 | rm -rf /var/lib/apt/lists/* 23 | 24 | # Install Python 3.10 25 | RUN apt-get update -y && \ 26 | apt-get install python3.10 python3.10-dev python3.10-venv python3-pip -y --no-install-recommends && \ 27 | ln -s /usr/bin/python3.10 /usr/bin/python && \ 28 | rm -f /usr/bin/python3 && \ 29 | ln -s /usr/bin/python3.10 /usr/bin/python3 && \ 30 | apt-get autoremove -y && \ 31 | apt-get clean -y && \ 32 | rm -rf /var/lib/apt/lists/* 33 | 34 | # Install Python dependencies 35 | COPY builder/requirements.txt /requirements.txt 36 | RUN --mount=type=cache,target=/root/.cache/pip \ 37 | pip install --upgrade pip && \ 38 | pip install huggingface_hub[hf_xet] && \ 39 | pip install -r /requirements.txt --no-cache-dir 40 | 41 | # Copy and run script to fetch models 42 | COPY builder/fetch_models.py /fetch_models.py 43 | RUN python /fetch_models.py && \ 44 | rm /fetch_models.py 45 | 46 | # Copy handler and other code 47 | COPY src . 48 | 49 | # test input that will be used when the container runs outside of runpod 50 | COPY test_input.json . 
51 | 52 | # Set default command 53 | CMD python -u /rp_handler.py 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 RunPod 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Faster Whisper Logo](https://5ccaof7hvfzuzf4p.public.blob.vercel-storage.com/banner-pjbGKw0buxbWGhMVC165Gf9qgqWo7I.jpeg) 2 | 3 | [Faster Whisper](https://github.com/guillaumekln/faster-whisper) is designed to process audio files using various Whisper models, with options for transcription formatting, language translation and more. 
4 | 5 | --- 6 | 7 | [![RunPod](https://api.runpod.io/badge/runpod-workers/worker-faster_whisper)](https://www.runpod.io/console/hub/runpod-workers/worker-faster_whisper) 8 | 9 | --- 10 | 11 | ## Models 12 | 13 | - tiny 14 | - base 15 | - small 16 | - medium 17 | - large-v1 18 | - large-v2 19 | - large-v3 20 | - distil-large-v2 21 | - distil-large-v3 22 | - turbo 23 | 24 | ## Input 25 | 26 | | Input | Type | Description | 27 | | ----------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 28 | | `audio` | Path | URL to Audio file | 29 | | `audio_base64` | str | Base64-encoded audio file | 30 | | `model` | str | Choose a Whisper model. Choices: "tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3", "distil-large-v2", "distil-large-v3", "turbo". Default: "base" | 31 | | `transcription` | str | Choose the format for the transcription. Choices: "plain_text", "formatted_text", "srt", "vtt". Default: "plain_text" | 32 | | `translate` | bool | Translate the text to English when set to True. Default: False | 33 | | `translation` | str | Choose the format for the translation. Choices: "plain_text", "formatted_text", "srt", "vtt". Default: "plain_text" | 34 | | `language` | str | Language spoken in the audio, specify None to perform language detection. Default: None | 35 | | `temperature` | float | Temperature to use for sampling. Default: 0 | 36 | | `best_of` | int | Number of candidates when sampling with non-zero temperature. Default: 5 | 37 | | `beam_size` | int | Number of beams in beam search, only applicable when temperature is zero. Default: 5 | 38 | | `patience` | float | Optional patience value to use in beam decoding. Default: None | 39 | | `length_penalty` | float | Optional token length penalty coefficient (alpha). 
Default: None | 40 | | `suppress_tokens` | str | Comma-separated list of token ids to suppress during sampling. Default: "-1" | 41 | | `initial_prompt` | str | Optional text to provide as a prompt for the first window. Default: None | 42 | | `condition_on_previous_text` | bool | If True, provide the previous output of the model as a prompt for the next window. Default: True | 43 | | `temperature_increment_on_fallback` | float | Temperature to increase when falling back when the decoding fails. Default: 0.2 | 44 | | `compression_ratio_threshold` | float | If the gzip compression ratio is higher than this value, treat the decoding as failed. Default: 2.4 | 45 | | `logprob_threshold` | float | If the average log probability is lower than this value, treat the decoding as failed. Default: -1.0 | 46 | | `no_speech_threshold` | float | If the probability of the token is higher than this value, consider the segment as silence. Default: 0.6 | 47 | | `enable_vad` | bool | If True, use the voice activity detection (VAD) to filter out parts of the audio without speech. This step is using the Silero VAD model. Default: False | 48 | | `word_timestamps` | bool | If True, include word timestamps in the output. 
Default: False | 49 | 50 | ### Example 51 | 52 | The following inputs can be used for testing the model: 53 | 54 | ```json 55 | { 56 | "input": { 57 | "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 58 | "model": "turbo" 59 | } 60 | } 61 | ``` 62 | 63 | producing an output like this: 64 | 65 | ```json 66 | { 67 | "segments": [ 68 | { 69 | "id": 1, 70 | "seek": 106, 71 | "start": 0.11, 72 | "end": 3.11, 73 | "text": " Hello and welcome!", 74 | "tokens": [50364, 25, 7, 287, 50514], 75 | "temperature": 0.1, 76 | "avg_logprob": -0.8348079785480325, 77 | "compression_ratio": 0.5789473684210527, 78 | "no_speech_prob": 0.1453857421875 79 | } 80 | ], 81 | "detected_language": "en", 82 | "transcription": "Hello and welcome!", 83 | "translation": null, 84 | "device": "cuda", 85 | "model": "turbo", 86 | "translation_time": 0.3796223163604736 87 | } 88 | ``` 89 | -------------------------------------------------------------------------------- /builder/fetch_models.py: -------------------------------------------------------------------------------- 1 | from faster_whisper.utils import download_model 2 | 3 | model_names = [ 4 | "tiny", 5 | "base", 6 | "small", 7 | "medium", 8 | "large-v1", 9 | "large-v2", 10 | "large-v3", 11 | "distil-large-v2", 12 | "distil-large-v3", 13 | "turbo", 14 | ] 15 | 16 | 17 | def download_model_weights(selected_model): 18 | """ 19 | Download model weights. 
20 | """ 21 | print(f"Downloading {selected_model}...") 22 | download_model(selected_model, cache_dir=None) 23 | print(f"Finished downloading {selected_model}.") 24 | 25 | 26 | # Loop through models sequentially 27 | for model_name in model_names: 28 | download_model_weights(model_name) 29 | 30 | print("Finished downloading all models.") 31 | -------------------------------------------------------------------------------- /builder/requirements.txt: -------------------------------------------------------------------------------- 1 | runpod~=1.7.9 2 | 3 | faster-whisper==1.1.0 -------------------------------------------------------------------------------- /locustfile.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import numpy as np 4 | from locust import HttpUser, task 5 | import base64 6 | from pydub import AudioSegment 7 | 8 | 9 | def generate_random_audio(duration_ms): 10 | # Generate random data 11 | samples = np.random.normal(0, 1, int(44100 * duration_ms / 1000.0)) 12 | 13 | # Convert to int16 array so we can make use of the pydub package 14 | samples = (samples * np.iinfo(np.int16).max).astype(np.int16) 15 | 16 | # Create an audio segment 17 | audio_segment = AudioSegment( 18 | samples.tobytes(), 19 | frame_rate=44100, 20 | sample_width=samples.dtype.itemsize, 21 | channels=1 22 | ) 23 | 24 | # Convert the audio segment to a base64 string 25 | buffer = io.BytesIO() 26 | audio_segment.export(buffer, format="wav") 27 | base64_audio = base64.b64encode(buffer.getvalue()).decode('utf-8') 28 | 29 | return base64_audio 30 | 31 | class ApiUser(HttpUser): 32 | @task 33 | def send_audio_request(self): 34 | headers = { 35 | 'Content-Type': 'application/json', 36 | } 37 | audio_data = generate_random_audio(1000) # 1 second audio 38 | 39 | data = { 40 | "input": { 41 | "audio": audio_data 42 | } 43 | } 44 | 45 | self.client.post("/v2/xxxxx/runsync", json=data, headers=headers) # Replace with your endpoint ID 46 | 
47 | if __name__ == "__main__": 48 | import os 49 | os.system("locust -f locustfile.py") 50 | -------------------------------------------------------------------------------- /public/banner.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runpod-workers/worker-faster_whisper/bd500dc88f3828b4a03252473684059aa1b5b41c/public/banner.jpeg -------------------------------------------------------------------------------- /public/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runpod-workers/worker-faster_whisper/bd500dc88f3828b4a03252473684059aa1b5b41c/public/banner.png -------------------------------------------------------------------------------- /src/predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the Predictor class, which is used to run predictions on the 3 | Whisper model. It is based on the Predictor class from the original Whisper 4 | repository, with some modifications to make it work with the RP platform. 5 | """ 6 | 7 | import gc 8 | import threading 9 | from concurrent.futures import ( 10 | ThreadPoolExecutor, 11 | ) # Still needed for transcribe potentially? 
12 | import numpy as np 13 | 14 | from runpod.serverless.utils import rp_cuda 15 | 16 | from faster_whisper import WhisperModel 17 | from faster_whisper.utils import format_timestamp 18 | 19 | # Define available models (for validation) 20 | AVAILABLE_MODELS = { 21 | "tiny", 22 | "base", 23 | "small", 24 | "medium", 25 | "large-v1", 26 | "large-v2", 27 | "large-v3", 28 | "turbo", 29 | } 30 | 31 | 32 | class Predictor: 33 | """A Predictor class for the Whisper model with lazy loading""" 34 | 35 | def __init__(self): 36 | """Initializes the predictor with no models loaded.""" 37 | self.models = {} 38 | self.model_lock = ( 39 | threading.Lock() 40 | ) # Lock for thread-safe model loading/unloading 41 | 42 | def setup(self): 43 | """No models are pre-loaded. Setup is minimal.""" 44 | pass 45 | 46 | def predict( 47 | self, 48 | audio, 49 | model_name="base", 50 | transcription="plain_text", 51 | translate=False, 52 | translation="plain_text", # Added in a previous PR 53 | language=None, 54 | temperature=0, 55 | best_of=5, 56 | beam_size=5, 57 | patience=1, 58 | length_penalty=None, 59 | suppress_tokens="-1", 60 | initial_prompt=None, 61 | condition_on_previous_text=True, 62 | temperature_increment_on_fallback=0.2, 63 | compression_ratio_threshold=2.4, 64 | logprob_threshold=-1.0, 65 | no_speech_threshold=0.6, 66 | enable_vad=False, 67 | word_timestamps=False, 68 | ): 69 | """ 70 | Run a single prediction on the model, loading/unloading models as needed. 71 | """ 72 | if model_name not in AVAILABLE_MODELS: 73 | raise ValueError( 74 | f"Invalid model name: {model_name}. 
Available models are: {AVAILABLE_MODELS}" 75 | ) 76 | 77 | with self.model_lock: 78 | model = None 79 | if model_name not in self.models: 80 | # Unload existing model if necessary 81 | if self.models: 82 | existing_model_name = list(self.models.keys())[0] 83 | print(f"Unloading model: {existing_model_name}...") 84 | # Remove reference and clear dict 85 | del self.models[existing_model_name] 86 | self.models.clear() 87 | # Hint Python to release memory 88 | gc.collect() 89 | if rp_cuda.is_available(): 90 | # If using PyTorch models, you might call torch.cuda.empty_cache() 91 | # FasterWhisper uses CTranslate2; explicit cache clearing might not be needed 92 | # but gc.collect() is generally helpful. 93 | pass 94 | print(f"Model {existing_model_name} unloaded.") 95 | 96 | # Load the requested model 97 | print(f"Loading model: {model_name}...") 98 | try: 99 | loaded_model = WhisperModel( 100 | model_name, 101 | device="cuda" if rp_cuda.is_available() else "cpu", 102 | compute_type="float16" if rp_cuda.is_available() else "int8", 103 | ) 104 | self.models[model_name] = loaded_model 105 | model = loaded_model 106 | print(f"Model {model_name} loaded successfully.") 107 | except Exception as e: 108 | print(f"Error loading model {model_name}: {e}") 109 | raise ValueError(f"Failed to load model {model_name}: {e}") from e 110 | else: 111 | # Model already loaded 112 | model = self.models[model_name] 113 | print(f"Using already loaded model: {model_name}") 114 | 115 | # Ensure model is loaded before proceeding 116 | if model is None: 117 | raise RuntimeError( 118 | f"Model {model_name} could not be loaded or retrieved." 119 | ) 120 | 121 | # Model is now loaded and ready, proceed with prediction (outside the lock?) 
122 | # Consider if transcribe is thread-safe or if it should also be within the lock 123 | # For now, keeping transcribe outside as it's CPU/GPU bound work 124 | 125 | if temperature_increment_on_fallback is not None: 126 | temperature = tuple( 127 | np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback) 128 | ) 129 | else: 130 | temperature = [temperature] 131 | 132 | # Note: FasterWhisper's transcribe might release the GIL, potentially allowing 133 | # other threads to acquire the model_lock if transcribe is lengthy. 134 | # If issues arise, the lock might need to encompass the transcribe call too. 135 | segments, info = list( 136 | model.transcribe( 137 | str(audio), 138 | language=language, 139 | task="transcribe", 140 | beam_size=beam_size, 141 | best_of=best_of, 142 | patience=patience, 143 | length_penalty=length_penalty, 144 | temperature=temperature, 145 | compression_ratio_threshold=compression_ratio_threshold, 146 | log_prob_threshold=logprob_threshold, 147 | no_speech_threshold=no_speech_threshold, 148 | condition_on_previous_text=condition_on_previous_text, 149 | initial_prompt=initial_prompt, 150 | prefix=None, 151 | suppress_blank=True, 152 | suppress_tokens=[-1], # Might need conversion from string 153 | without_timestamps=False, 154 | max_initial_timestamp=1.0, 155 | word_timestamps=word_timestamps, 156 | vad_filter=enable_vad, 157 | ) 158 | ) 159 | 160 | segments = list(segments) 161 | 162 | # Format transcription 163 | transcription_output = format_segments(transcription, segments) 164 | 165 | # Handle translation if requested 166 | translation_output = None 167 | if translate: 168 | translation_segments, _ = model.transcribe( 169 | str(audio), 170 | task="translate", 171 | temperature=temperature, # Reuse temperature settings for translation 172 | ) 173 | translation_output = format_segments( 174 | translation, list(translation_segments) 175 | ) 176 | 177 | results = { 178 | "segments": serialize_segments(segments), 179 | 
"detected_language": info.language, 180 | "transcription": transcription_output, 181 | "translation": translation_output, 182 | "device": "cuda" if rp_cuda.is_available() else "cpu", 183 | "model": model_name, 184 | } 185 | 186 | if word_timestamps: 187 | word_timestamps_list = [] 188 | for segment in segments: 189 | for word in segment.words: 190 | word_timestamps_list.append( 191 | { 192 | "word": word.word, 193 | "start": word.start, 194 | "end": word.end, 195 | } 196 | ) 197 | results["word_timestamps"] = word_timestamps_list 198 | 199 | return results 200 | 201 | 202 | def serialize_segments(transcript): 203 | """ 204 | Serialize the segments to be returned in the API response. 205 | """ 206 | return [ 207 | { 208 | "id": segment.id, 209 | "seek": segment.seek, 210 | "start": segment.start, 211 | "end": segment.end, 212 | "text": segment.text, 213 | "tokens": segment.tokens, 214 | "temperature": segment.temperature, 215 | "avg_logprob": segment.avg_logprob, 216 | "compression_ratio": segment.compression_ratio, 217 | "no_speech_prob": segment.no_speech_prob, 218 | } 219 | for segment in transcript 220 | ] 221 | 222 | 223 | def format_segments(format_type, segments): 224 | """ 225 | Format the segments to the desired format 226 | """ 227 | 228 | if format_type == "plain_text": 229 | return " ".join([segment.text.lstrip() for segment in segments]) 230 | elif format_type == "formatted_text": 231 | return "\n".join([segment.text.lstrip() for segment in segments]) 232 | elif format_type == "srt": 233 | return write_srt(segments) 234 | elif format_type == "vtt": # Added VTT case 235 | return write_vtt(segments) 236 | else: # Default or unknown format 237 | print(f"Warning: Unknown format '{format_type}', defaulting to plain text.") 238 | return " ".join([segment.text.lstrip() for segment in segments]) 239 | 240 | 241 | def write_vtt(transcript): 242 | """ 243 | Write the transcript in VTT format. 
def write_vtt(transcript):
    """
    Render the transcript as a WebVTT document.

    Fix: a valid WebVTT file must begin with the "WEBVTT" signature line
    followed by a blank line (W3C WebVTT spec); the previous implementation
    emitted only the cues, producing output that VTT parsers reject.
    """
    result = "WEBVTT\n\n"

    for segment in transcript:
        # VTT timestamps use '.' as the decimal marker (format_timestamp default).
        result += f"{format_timestamp(segment.start, always_include_hours=True)} --> {format_timestamp(segment.end, always_include_hours=True)}\n"
        # A literal '-->' inside cue text would be parsed as a timing line; soften it.
        result += f"{segment.text.strip().replace('-->', '->')}\n"
        result += "\n"

    return result


def write_srt(transcript):
    """
    Render the transcript as an SRT document: sequentially numbered cues
    with comma-decimal timestamps.
    """
    result = ""

    for i, segment in enumerate(transcript, start=1):
        result += f"{i}\n"
        result += f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> "
        result += f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n"
        result += f"{segment.text.strip().replace('-->', '->')}\n"
        result += "\n"

    return result
# --------------------------------------------------------------------------------
# /src/rp_handler.py:
# --------------------------------------------------------------------------------
"""
rp_handler.py for runpod worker

rp_debugger:
  - Utility that provides additional debugging information.
    The handler must be called with --rp_debugger flag to enable it.
"""
import base64
import tempfile

from rp_schema import INPUT_VALIDATIONS
from runpod.serverless.utils import download_files_from_urls, rp_cleanup, rp_debugger
from runpod.serverless.utils.rp_validator import validate
import runpod
import predict


# Load the model once at import time so every job reuses the warm instance.
MODEL = predict.Predictor()
MODEL.setup()


def base64_to_tempfile(base64_file: str) -> str:
    '''
    Convert base64 file to tempfile.

    Parameters:
        base64_file (str): Base64-encoded audio payload

    Returns:
        str: Path to tempfile (delete=False — the caller owns cleanup of this file)
    '''
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file.write(base64.b64decode(base64_file))

    return temp_file.name
def base64_to_tempfile(base64_file: str) -> str:
    '''
    Decode a base64 payload into a .wav temp file on disk.

    Parameters:
        base64_file (str): Base64-encoded audio

    Returns:
        str: Filesystem path of the decoded temp file (delete=False,
             so the file survives the context manager)
    '''
    decoded = base64.b64decode(base64_file)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file.write(decoded)
    return temp_file.name


@rp_debugger.FunctionTimer
def run_whisper_job(job):
    '''
    Run inference on the model.

    Parameters:
        job (dict): Input job containing the model parameters

    Returns:
        dict: The prediction result, or {'error': ...} when validation fails
              or the audio input is missing/ambiguous.
    '''
    raw_input = job['input']

    with rp_debugger.LineTimer('validation_step'):
        validation = validate(raw_input, INPUT_VALIDATIONS)

    if 'errors' in validation:
        return {"error": validation['errors']}
    params = validation['validated_input']

    # Exactly one of the two audio sources must be supplied.
    has_url = bool(params.get('audio', False))
    has_b64 = bool(params.get('audio_base64', False))

    if not has_url and not has_b64:
        return {'error': 'Must provide either audio or audio_base64'}

    if has_url and has_b64:
        return {'error': 'Must provide either audio or audio_base64, not both'}

    if has_url:
        with rp_debugger.LineTimer('download_step'):
            audio_input = download_files_from_urls(job['id'], [params['audio']])[0]
    else:
        audio_input = base64_to_tempfile(params['audio_base64'])

    with rp_debugger.LineTimer('prediction_step'):
        whisper_results = MODEL.predict(
            audio=audio_input,
            model_name=params["model"],
            transcription=params["transcription"],
            translation=params["translation"],
            translate=params["translate"],
            language=params["language"],
            temperature=params["temperature"],
            best_of=params["best_of"],
            beam_size=params["beam_size"],
            patience=params["patience"],
            length_penalty=params["length_penalty"],
            suppress_tokens=params.get("suppress_tokens", "-1"),
            initial_prompt=params["initial_prompt"],
            condition_on_previous_text=params["condition_on_previous_text"],
            temperature_increment_on_fallback=params["temperature_increment_on_fallback"],
            compression_ratio_threshold=params["compression_ratio_threshold"],
            logprob_threshold=params["logprob_threshold"],
            no_speech_threshold=params["no_speech_threshold"],
            enable_vad=params["enable_vad"],
            word_timestamps=params["word_timestamps"]
        )

    with rp_debugger.LineTimer('cleanup_step'):
        rp_cleanup.clean(['input_objects'])

    return whisper_results


runpod.serverless.start({"handler": run_whisper_job})
# --------------------------------------------------------------------------------
# /src/rp_schema.py:
# --------------------------------------------------------------------------------
# Validation schema consumed by runpod's rp_validator: every key is optional
# and falls back to its 'default' when absent from the job input.
INPUT_VALIDATIONS = {
    # Audio source: exactly one of these two is expected by the handler.
    'audio': {
        'type': str,
        'required': False,
        'default': None
    },
    'audio_base64': {
        'type': str,
        'required': False,
        'default': None
    },
    'model': {
        'type': str,
        'required': False,
        'default': 'base'
    },
    # Output formats: 'plain_text', 'formatted_text', 'srt' or 'vtt'.
    'transcription': {
        'type': str,
        'required': False,
        'default': 'plain_text'
    },
    'translate': {
        'type': bool,
        'required': False,
        'default': False
    },
    'translation': {
        'type': str,
        'required': False,
        'default': 'plain_text'
    },
    'language': {
        'type': str,
        'required': False,
        'default': None
    },
    # Decoding parameters forwarded to faster-whisper's transcribe().
    'temperature': {
        'type': float,
        'required': False,
        'default': 0
    },
    'best_of': {
        'type': int,
        'required': False,
        'default': 5
    },
    'beam_size': {
        'type': int,
        'required': False,
        'default': 5
    },
    'patience': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'length_penalty': {
        'type': float,
        'required': False,
        'default': 0
    },
    'suppress_tokens': {
        'type': str,
        'required': False,
        'default': '-1'
    },
    'initial_prompt': {
        'type': str,
        'required': False,
        'default': None
    },
    'condition_on_previous_text': {
        'type': bool,
        'required': False,
        'default': True
    },
    'temperature_increment_on_fallback': {
        'type': float,
        'required': False,
        'default': 0.2
    },
    # Fallback/quality thresholds.
    'compression_ratio_threshold': {
        'type': float,
        'required': False,
        'default': 2.4
    },
    'logprob_threshold': {
        'type': float,
        'required': False,
        'default': -1.0
    },
    'no_speech_threshold': {
        'type': float,
        'required': False,
        'default': 0.6
    },
    'enable_vad': {
        'type': bool,
        'required': False,
        'default': False
    },
    'word_timestamps': {
        'type': bool,
        'required': False,
        'default': False
    },
}
'type': float, 59 |             'required': False, 60 |             'default': 0 61 |         }, 62 |         'suppress_tokens': { 63 |             'type': str, 64 |             'required': False, 65 |             'default': '-1' 66 |         }, 67 |         'initial_prompt': { 68 |             'type': str, 69 |             'required': False, 70 |             'default': None 71 |         }, 72 |         'condition_on_previous_text': { 73 |             'type': bool, 74 |             'required': False, 75 |             'default': True 76 |         }, 77 |         'temperature_increment_on_fallback': { 78 |             'type': float, 79 |             'required': False, 80 |             'default': 0.2 81 |         }, 82 |         'compression_ratio_threshold': { 83 |             'type': float, 84 |             'required': False, 85 |             'default': 2.4 86 |         }, 87 |         'logprob_threshold': { 88 |             'type': float, 89 |             'required': False, 90 |             'default': -1.0 91 |         }, 92 |         'no_speech_threshold': { 93 |             'type': float, 94 |             'required': False, 95 |             'default': 0.6 96 |         }, 97 |         'enable_vad': { 98 |             'type': bool, 99 |             'required': False, 100 |             'default': False 101 |         }, 102 |         'word_timestamps': { 103 |             'type': bool, 104 |             'required': False, 105 |             'default': False 106 |         }, 107 | } 108 | -------------------------------------------------------------------------------- /test_input.json: -------------------------------------------------------------------------------- 1 | { 2 |     "input": { 3 |         "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 4 |         "model": "turbo", 5 |         "transcription": "plain_text", 6 |         "translate": false, 7 |         "temperature": 0, 8 |         "best_of": 5, 9 |         "beam_size": 5, 10 |         "suppress_tokens": "-1", 11 |         "condition_on_previous_text": false, 12 |         "temperature_increment_on_fallback": 0.2, 13 |         "compression_ratio_threshold": 2.4, 14 |         "logprob_threshold": -1, 15 |         "no_speech_threshold": 0.6 16 |     } 17 | } 18 | --------------------------------------------------------------------------------