├── .github ├── tests.json └── workflows │ ├── CD-docker_dev.yml │ ├── CD-docker_release.yml │ ├── CI-runpod_dep.yml │ └── CI-test_handler.yml ├── .gitignore ├── .runpod ├── hub.json └── tests.json ├── Dockerfile ├── LICENSE ├── README.md ├── builder ├── fetch_models.py └── requirements.txt ├── locustfile.py ├── public ├── banner.jpeg └── banner.png ├── src ├── predict.py ├── rp_handler.py └── rp_schema.py └── test_input.json /.github/tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "hardwareConfig": { 4 | "endpointConfig": { 5 | "gpuIds": "AMPERE_16", 6 | "name": "16GB GPU" 7 | } 8 | }, 9 | "input": { 10 | "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 11 | "model": "base", 12 | "transcription": "plain text", 13 | "translate": false, 14 | "temperature": 0, 15 | "best_of": 5, 16 | "beam_size": 5, 17 | "suppress_tokens": "-1", 18 | "condition_on_previous_text": false, 19 | "temperature_increment_on_fallback": 0.2, 20 | "compression_ratio_threshold": 2.4, 21 | "logprob_threshold": -1, 22 | "no_speech_threshold": 0.6 23 | } 24 | } 25 | ] 26 | -------------------------------------------------------------------------------- /.github/workflows/CD-docker_dev.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | 8 | jobs: 9 | docker: 10 | runs-on: DO 11 | steps: 12 | - name: Set up QEMU 13 | uses: docker/setup-qemu-action@v2 14 | 15 | - name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v2 17 | 18 | - name: Login to Docker Hub 19 | uses: docker/login-action@v2 20 | with: 21 | username: ${{ secrets.DOCKERHUB_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_TOKEN }} 23 | 24 | - name: Build and push 25 | uses: docker/build-push-action@v4 26 | with: 27 | push: true 28 | tags: ${{ vars.DOCKERHUB_REPO }}/${{ vars.DOCKERHUB_IMG }}:dev 29 | - uses: 
actions/checkout@v3 30 | - name: Run Tests 31 | uses: direlines/runpod-test-runner@v1.7 32 | with: 33 | image-tag: ${{ vars.DOCKERHUB_REPO }}/${{ vars.DOCKERHUB_IMG }}:dev 34 | runpod-api-key: ${{ secrets.RUNPOD_API_KEY }} 35 | request-timeout: 600 36 | -------------------------------------------------------------------------------- /.github/workflows/CD-docker_release.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | docker: 9 | runs-on: DO 10 | steps: 11 | - name: Set up QEMU 12 | uses: docker/setup-qemu-action@v2 13 | 14 | - name: Set up Docker Buildx 15 | uses: docker/setup-buildx-action@v2 16 | 17 | - name: Login to Docker Hub 18 | uses: docker/login-action@v2 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_TOKEN }} 22 | 23 | - name: Build and push 24 | uses: docker/build-push-action@v4 25 | with: 26 | push: true 27 | tags: ${{ secrets.DOCKERHUB_REPO }}/${{ secrets.DOCKERHUB_IMG }}:${{ github.event.release.tag_name }} 28 | -------------------------------------------------------------------------------- /.github/workflows/CI-runpod_dep.yml: -------------------------------------------------------------------------------- 1 | name: CI | Update runpod package version 2 | 3 | on: 4 | repository_dispatch: 5 | types: [python-package-release] 6 | 7 | push: 8 | branches: 9 | - "main" 10 | - "master" 11 | 12 | workflow_dispatch: 13 | 14 | jobs: 15 | check_dep: 16 | runs-on: ubuntu-latest 17 | name: Check python requirements file and update 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | 22 | - name: Check for new package version and update 23 | run: | 24 | echo "Fetching the current runpod version from requirements.txt..." 
25 | 26 | # Get current version, allowing for '~=' operator 27 | current_version=$(grep -oP 'runpod~=\K[^ ]+' ./builder/requirements.txt) 28 | echo "Current version: $current_version" 29 | 30 | # Get new version from PyPI 31 | new_version=$(curl -s https://pypi.org/pypi/runpod/json | jq -r .info.version) 32 | echo "NEW_VERSION_ENV=$new_version" >> $GITHUB_ENV 33 | echo "New version: $new_version" 34 | 35 | if [ -z "$new_version" ]; then 36 | echo "ERROR: Failed to fetch the new version from PyPI." 37 | exit 1 38 | fi 39 | 40 | # Extract major and minor from current version (e.g., 1.7) 41 | current_major_minor=$(echo $current_version | cut -d. -f1,2) 42 | new_major_minor=$(echo $new_version | cut -d. -f1,2) 43 | 44 | echo "Current major.minor: $current_major_minor" 45 | echo "New major.minor: $new_major_minor" 46 | 47 | # Check if the new version is within the current major.minor range (e.g., 1.7.x) 48 | if [ "$new_major_minor" = "$current_major_minor" ]; then 49 | echo "No update needed. The new version ($new_version) is within the allowed range (~= $current_major_minor)." 50 | exit 0 51 | fi 52 | 53 | echo "New major/minor detected ($new_major_minor). Updating runpod version..." 54 | 55 | # Update requirements.txt with the new version while keeping '~=' 56 | sed -i "s/runpod~=.*/runpod~=$new_version/" ./builder/requirements.txt 57 | echo "requirements.txt has been updated." 
58 | 59 | - name: Create Pull Request 60 | uses: peter-evans/create-pull-request@v3 61 | with: 62 | token: ${{ secrets.GITHUB_TOKEN }} 63 | commit-message: Update runpod package version 64 | title: Update runpod package version 65 | body: The package version has been updated to ${{ env.NEW_VERSION_ENV }} 66 | branch: runpod-package-update 67 | -------------------------------------------------------------------------------- /.github/workflows/CI-test_handler.yml: -------------------------------------------------------------------------------- 1 | name: CI | Test Worker 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | workflow_dispatch: 13 | 14 | jobs: 15 | initialize_worker: 16 | runs-on: ubuntu-latest 17 | outputs: 18 | id: ${{ steps.extract_id.outputs.runpod_job_id }} 19 | 20 | steps: 21 | - name: Deploy Worker 22 | id: deploy 23 | uses: fjogeleit/http-request-action@v1 24 | with: 25 | url: "https://api.runpod.ai/v2/${{ secrets.RUNPOD_ENDPOINT }}/run" 26 | method: "POST" 27 | customHeaders: '{"Content-Type": "application/json"}' 28 | bearerToken: ${{ secrets.RUNPOD_API_KEY }} 29 | data: '{"input":{"github_pat": "${{ secrets.GH_PAT }}", "github_org":"${{ secrets.GH_ORG }}"}}' 30 | 31 | - name: Extract Job ID 32 | id: extract_id 33 | run: | 34 | ID=$(echo '${{ steps.deploy.outputs.response }}' | jq -r '.id') 35 | echo "::set-output name=runpod_job_id::$ID" 36 | 37 | run_tests: 38 | needs: initialize_worker 39 | runs-on: runpod 40 | 41 | steps: 42 | - uses: actions/checkout@v3 43 | 44 | - name: Set up environment 45 | run: | 46 | rm -f /etc/apt/sources.list.d/*.list 47 | apt-get update -y 48 | apt-get upgrade -y 49 | apt-get install --yes --no-install-recommends sudo ca-certificates git wget curl bash libgl1 libx11-6 software-properties-common ffmpeg build-essential -y 50 | apt-get autoremove -y 51 | apt-get clean -y 52 | rm -rf /var/lib/apt/lists/* 53 | 54 | - name: Set up Python 3.10 & install dependencies 55 
| uses: actions/setup-python@v4 56 | with: 57 | python-version: "3.10.12" 58 | 59 | - name: Install Dependencies 60 | run: | 61 | 62 | python -m pip install --upgrade pip 63 | pip install -r builder/requirements.txt 64 | 65 | - name: Fetch and run models 66 | run: | 67 | python builder/fetch_models.py 68 | 69 | - name: Execute Tests 70 | run: | 71 | python src/rp_handler.py --test_input='{"input": {"audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav"}}' 72 | 73 | terminate_worker: 74 | if: ${{ always() && !success() }} 75 | needs: initialize_worker 76 | runs-on: ubuntu-latest 77 | 78 | steps: 79 | - name: Shutdown Worker 80 | uses: fjogeleit/http-request-action@v1 81 | with: 82 | url: "https://api.runpod.ai/v2/${{ secrets.RUNPOD_ENDPOINT }}/cancel/${{ needs.initialize_worker.outputs.id }}" 83 | method: "POST" 84 | customHeaders: '{"Content-Type": "application/json"}' 85 | bearerToken: ${{ secrets.RUNPOD_API_KEY }} 86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | runpod.toml 162 | -------------------------------------------------------------------------------- /.runpod/hub.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Faster Whisper", 3 | "description": "Process audio files using various Whisper models, with options for transcription formatting, language translation and more.", 4 | "type": "serverless", 5 | "category": "audio", 6 | "iconUrl": "https://dummyimage.com/100x100/0066ff/fff&text=FW", 7 | "config": { 8 | "runsOn": "GPU", 9 | "containerDiskInGb": 20, 10 | "gpuIds": "ADA_24", 11 | "gpuCount": 1, 12 | "allowedCudaVersions": ["12.7", "12.6", "12.5", "12.4", "12.3"] 13 | } 14 | } -------------------------------------------------------------------------------- /.runpod/tests.json: -------------------------------------------------------------------------------- 1 | { 2 | "tests": [ 3 | { 4 | "name": "basic_test", 5 | "input": { 6 | "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 7 | "model": "turbo", 8 | "transcription": "plain text", 9 | "translate": false, 10 | "temperature": 0, 11 | "best_of": 5, 12 | "beam_size": 5, 13 | "suppress_tokens": "-1", 14 | "condition_on_previous_text": false, 15 | "temperature_increment_on_fallback": 0.2, 16 | "compression_ratio_threshold": 2.4, 17 | "logprob_threshold": -1, 18 | "no_speech_threshold": 0.6 19 | }, 20 | "timeout": 10000 21 | } 22 | ], 23 | "config": { 24 | "gpuTypeId": "NVIDIA GeForce RTX 4090", 25 | "gpuCount": 1, 26 | "allowedCudaVersions": [ 27 | "12.7", 28 | "12.6", 29 | "12.5", 30 | "12.4", 31 | "12.3" 32 | ] 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # faster-whisper turbo needs cudnnn >= 9 2 | # see https://github.com/runpod-workers/worker-faster_whisper/pull/44 3 | FROM 
nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04 4 | 5 | # Remove any third-party apt sources to avoid issues with expiring keys. 6 | RUN rm -f /etc/apt/sources.list.d/*.list 7 | 8 | # Set shell and noninteractive environment variables 9 | SHELL ["/bin/bash", "-c"] 10 | ENV DEBIAN_FRONTEND=noninteractive 11 | ENV SHELL=/bin/bash 12 | 13 | # Set working directory 14 | WORKDIR / 15 | 16 | # Update and upgrade the system packages 17 | RUN apt-get update -y && \ 18 | apt-get upgrade -y && \ 19 | apt-get install --yes --no-install-recommends sudo ca-certificates git wget curl bash libgl1 libx11-6 software-properties-common ffmpeg build-essential -y &&\ 20 | apt-get autoremove -y && \ 21 | apt-get clean -y && \ 22 | rm -rf /var/lib/apt/lists/* 23 | 24 | # Install Python 3.10 25 | RUN apt-get update -y && \ 26 | apt-get install python3.10 python3.10-dev python3.10-venv python3-pip -y --no-install-recommends && \ 27 | ln -s /usr/bin/python3.10 /usr/bin/python && \ 28 | rm -f /usr/bin/python3 && \ 29 | ln -s /usr/bin/python3.10 /usr/bin/python3 && \ 30 | apt-get autoremove -y && \ 31 | apt-get clean -y && \ 32 | rm -rf /var/lib/apt/lists/* 33 | 34 | # Install Python dependencies 35 | COPY builder/requirements.txt /requirements.txt 36 | RUN --mount=type=cache,target=/root/.cache/pip \ 37 | pip install --upgrade pip && \ 38 | pip install huggingface_hub[hf_xet] && \ 39 | pip install -r /requirements.txt --no-cache-dir 40 | 41 | # Copy and run script to fetch models 42 | COPY builder/fetch_models.py /fetch_models.py 43 | RUN python /fetch_models.py && \ 44 | rm /fetch_models.py 45 | 46 | # Copy handler and other code 47 | COPY src . 48 | 49 | # test input that will be used when the container runs outside of runpod 50 | COPY test_input.json . 
51 | 52 | # Set default command 53 | CMD python -u /rp_handler.py 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 RunPod 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Faster Whisper Logo](https://5ccaof7hvfzuzf4p.public.blob.vercel-storage.com/banner-pjbGKw0buxbWGhMVC165Gf9qgqWo7I.jpeg) 2 | 3 | [Faster Whisper](https://github.com/guillaumekln/faster-whisper) is designed to process audio files using various Whisper models, with options for transcription formatting, language translation and more. 
4 | 5 | --- 6 | 7 | [![RunPod](https://api.runpod.io/badge/runpod-workers/worker-faster_whisper)](https://www.runpod.io/console/hub/runpod-workers/worker-faster_whisper) 8 | 9 | --- 10 | 11 | ## Models 12 | 13 | - tiny 14 | - base 15 | - small 16 | - medium 17 | - large-v1 18 | - large-v2 19 | - large-v3 20 | - distil-large-v2 21 | - distil-large-v3 22 | - turbo 23 | 24 | ## Input 25 | 26 | | Input | Type | Description | 27 | | ----------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 28 | | `audio` | Path | URL to Audio file | 29 | | `audio_base64` | str | Base64-encoded audio file | 30 | | `model` | str | Choose a Whisper model. Choices: "tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3", "distil-large-v2", "distil-large-v3", "turbo". Default: "base" | 31 | | `transcription` | str | Choose the format for the transcription. Choices: "plain_text", "formatted_text", "srt", "vtt". Default: "plain_text" | 32 | | `translate` | bool | Translate the text to English when set to True. Default: False | 33 | | `translation` | str | Choose the format for the translation. Choices: "plain_text", "formatted_text", "srt", "vtt". Default: "plain_text" | 34 | | `language` | str | Language spoken in the audio, specify None to perform language detection. Default: None | 35 | | `temperature` | float | Temperature to use for sampling. Default: 0 | 36 | | `best_of` | int | Number of candidates when sampling with non-zero temperature. Default: 5 | 37 | | `beam_size` | int | Number of beams in beam search, only applicable when temperature is zero. Default: 5 | 38 | | `patience` | float | Optional patience value to use in beam decoding. Default: None | 39 | | `length_penalty` | float | Optional token length penalty coefficient (alpha). 
Default: None | 40 | | `suppress_tokens` | str | Comma-separated list of token ids to suppress during sampling. Default: "-1" | 41 | | `initial_prompt` | str | Optional text to provide as a prompt for the first window. Default: None | 42 | | `condition_on_previous_text` | bool | If True, provide the previous output of the model as a prompt for the next window. Default: True | 43 | | `temperature_increment_on_fallback` | float | Temperature to increase when falling back when the decoding fails. Default: 0.2 | 44 | | `compression_ratio_threshold` | float | If the gzip compression ratio is higher than this value, treat the decoding as failed. Default: 2.4 | 45 | | `logprob_threshold` | float | If the average log probability is lower than this value, treat the decoding as failed. Default: -1.0 | 46 | | `no_speech_threshold` | float | If the probability of the token is higher than this value, consider the segment as silence. Default: 0.6 | 47 | | `enable_vad` | bool | If True, use the voice activity detection (VAD) to filter out parts of the audio without speech. This step is using the Silero VAD model. Default: False | 48 | | `word_timestamps` | bool | If True, include word timestamps in the output. 
Default: False | 49 | 50 | ### Example 51 | 52 | The following inputs can be used for testing the model: 53 | 54 | ```json 55 | { 56 | "input": { 57 | "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 58 | "model": "turbo" 59 | } 60 | } 61 | ``` 62 | 63 | producing an output like this: 64 | 65 | ```json 66 | { 67 | "segments": [ 68 | { 69 | "id": 1, 70 | "seek": 106, 71 | "start": 0.11, 72 | "end": 3.11, 73 | "text": " Hello and welcome!", 74 | "tokens": [50364, 25, 7, 287, 50514], 75 | "temperature": 0.1, 76 | "avg_logprob": -0.8348079785480325, 77 | "compression_ratio": 0.5789473684210527, 78 | "no_speech_prob": 0.1453857421875 79 | } 80 | ], 81 | "detected_language": "en", 82 | "transcription": "Hello and welcome!", 83 | "translation": null, 84 | "device": "cuda", 85 | "model": "turbo", 86 | "translation_time": 0.3796223163604736 87 | } 88 | ``` 89 | -------------------------------------------------------------------------------- /builder/fetch_models.py: -------------------------------------------------------------------------------- 1 | from faster_whisper.utils import download_model 2 | 3 | model_names = [ 4 | "tiny", 5 | "base", 6 | "small", 7 | "medium", 8 | "large-v1", 9 | "large-v2", 10 | "large-v3", 11 | "distil-large-v2", 12 | "distil-large-v3", 13 | "turbo", 14 | ] 15 | 16 | 17 | def download_model_weights(selected_model): 18 | """ 19 | Download model weights. 
20 | """ 21 | print(f"Downloading {selected_model}...") 22 | download_model(selected_model, cache_dir=None) 23 | print(f"Finished downloading {selected_model}.") 24 | 25 | 26 | # Loop through models sequentially 27 | for model_name in model_names: 28 | download_model_weights(model_name) 29 | 30 | print("Finished downloading all models.") 31 | -------------------------------------------------------------------------------- /builder/requirements.txt: -------------------------------------------------------------------------------- 1 | runpod~=1.7.9 2 | 3 | faster-whisper==1.1.0 -------------------------------------------------------------------------------- /locustfile.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import numpy as np 4 | from locust import HttpUser, task 5 | import base64 6 | from pydub import AudioSegment 7 | 8 | 9 | def generate_random_audio(duration_ms): 10 | # Generate random data 11 | samples = np.random.normal(0, 1, int(44100 * duration_ms / 1000.0)) 12 | 13 | # Convert to int16 array so we can make use of the pydub package 14 | samples = (samples * np.iinfo(np.int16).max).astype(np.int16) 15 | 16 | # Create an audio segment 17 | audio_segment = AudioSegment( 18 | samples.tobytes(), 19 | frame_rate=44100, 20 | sample_width=samples.dtype.itemsize, 21 | channels=1 22 | ) 23 | 24 | # Convert the audio segment to a base64 string 25 | buffer = io.BytesIO() 26 | audio_segment.export(buffer, format="wav") 27 | base64_audio = base64.b64encode(buffer.getvalue()).decode('utf-8') 28 | 29 | return base64_audio 30 | 31 | class ApiUser(HttpUser): 32 | @task 33 | def send_audio_request(self): 34 | headers = { 35 | 'Content-Type': 'application/json', 36 | } 37 | audio_data = generate_random_audio(1000) # 1 second audio 38 | 39 | data = { 40 | "input": { 41 | "audio": audio_data 42 | } 43 | } 44 | 45 | self.client.post("/v2/xxxxx/runsync", json=data, headers=headers) # Replace with your endpoint ID 46 | 
47 | if __name__ == "__main__": 48 | import os 49 | os.system("locust -f locustfile.py") 50 | -------------------------------------------------------------------------------- /public/banner.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runpod-workers/worker-faster_whisper/bd500dc88f3828b4a03252473684059aa1b5b41c/public/banner.jpeg -------------------------------------------------------------------------------- /public/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runpod-workers/worker-faster_whisper/bd500dc88f3828b4a03252473684059aa1b5b41c/public/banner.png -------------------------------------------------------------------------------- /src/predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the Predictor class, which is used to run predictions on the 3 | Whisper model. It is based on the Predictor class from the original Whisper 4 | repository, with some modifications to make it work with the RP platform. 5 | """ 6 | 7 | import gc 8 | import threading 9 | from concurrent.futures import ( 10 | ThreadPoolExecutor, 11 | ) # Still needed for transcribe potentially? 
12 | import numpy as np 13 | 14 | from runpod.serverless.utils import rp_cuda 15 | 16 | from faster_whisper import WhisperModel 17 | from faster_whisper.utils import format_timestamp 18 | 19 | # Define available models (for validation) 20 | AVAILABLE_MODELS = { 21 | "tiny", 22 | "base", 23 | "small", 24 | "medium", 25 | "large-v1", 26 | "large-v2", 27 | "large-v3", 28 | "turbo", 29 | } 30 | 31 | 32 | class Predictor: 33 | """A Predictor class for the Whisper model with lazy loading""" 34 | 35 | def __init__(self): 36 | """Initializes the predictor with no models loaded.""" 37 | self.models = {} 38 | self.model_lock = ( 39 | threading.Lock() 40 | ) # Lock for thread-safe model loading/unloading 41 | 42 | def setup(self): 43 | """No models are pre-loaded. Setup is minimal.""" 44 | pass 45 | 46 | def predict( 47 | self, 48 | audio, 49 | model_name="base", 50 | transcription="plain_text", 51 | translate=False, 52 | translation="plain_text", # Added in a previous PR 53 | language=None, 54 | temperature=0, 55 | best_of=5, 56 | beam_size=5, 57 | patience=1, 58 | length_penalty=None, 59 | suppress_tokens="-1", 60 | initial_prompt=None, 61 | condition_on_previous_text=True, 62 | temperature_increment_on_fallback=0.2, 63 | compression_ratio_threshold=2.4, 64 | logprob_threshold=-1.0, 65 | no_speech_threshold=0.6, 66 | enable_vad=False, 67 | word_timestamps=False, 68 | ): 69 | """ 70 | Run a single prediction on the model, loading/unloading models as needed. 71 | """ 72 | if model_name not in AVAILABLE_MODELS: 73 | raise ValueError( 74 | f"Invalid model name: {model_name}. 
Available models are: {AVAILABLE_MODELS}" 75 | ) 76 | 77 | with self.model_lock: 78 | model = None 79 | if model_name not in self.models: 80 | # Unload existing model if necessary 81 | if self.models: 82 | existing_model_name = list(self.models.keys())[0] 83 | print(f"Unloading model: {existing_model_name}...") 84 | # Remove reference and clear dict 85 | del self.models[existing_model_name] 86 | self.models.clear() 87 | # Hint Python to release memory 88 | gc.collect() 89 | if rp_cuda.is_available(): 90 | # If using PyTorch models, you might call torch.cuda.empty_cache() 91 | # FasterWhisper uses CTranslate2; explicit cache clearing might not be needed 92 | # but gc.collect() is generally helpful. 93 | pass 94 | print(f"Model {existing_model_name} unloaded.") 95 | 96 | # Load the requested model 97 | print(f"Loading model: {model_name}...") 98 | try: 99 | loaded_model = WhisperModel( 100 | model_name, 101 | device="cuda" if rp_cuda.is_available() else "cpu", 102 | compute_type="float16" if rp_cuda.is_available() else "int8", 103 | ) 104 | self.models[model_name] = loaded_model 105 | model = loaded_model 106 | print(f"Model {model_name} loaded successfully.") 107 | except Exception as e: 108 | print(f"Error loading model {model_name}: {e}") 109 | raise ValueError(f"Failed to load model {model_name}: {e}") from e 110 | else: 111 | # Model already loaded 112 | model = self.models[model_name] 113 | print(f"Using already loaded model: {model_name}") 114 | 115 | # Ensure model is loaded before proceeding 116 | if model is None: 117 | raise RuntimeError( 118 | f"Model {model_name} could not be loaded or retrieved." 119 | ) 120 | 121 | # Model is now loaded and ready, proceed with prediction (outside the lock?) 
122 | # Consider if transcribe is thread-safe or if it should also be within the lock 123 | # For now, keeping transcribe outside as it's CPU/GPU bound work 124 | 125 | if temperature_increment_on_fallback is not None: 126 | temperature = tuple( 127 | np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback) 128 | ) 129 | else: 130 | temperature = [temperature] 131 | 132 | # Note: FasterWhisper's transcribe might release the GIL, potentially allowing 133 | # other threads to acquire the model_lock if transcribe is lengthy. 134 | # If issues arise, the lock might need to encompass the transcribe call too. 135 | segments, info = list( 136 | model.transcribe( 137 | str(audio), 138 | language=language, 139 | task="transcribe", 140 | beam_size=beam_size, 141 | best_of=best_of, 142 | patience=patience, 143 | length_penalty=length_penalty, 144 | temperature=temperature, 145 | compression_ratio_threshold=compression_ratio_threshold, 146 | log_prob_threshold=logprob_threshold, 147 | no_speech_threshold=no_speech_threshold, 148 | condition_on_previous_text=condition_on_previous_text, 149 | initial_prompt=initial_prompt, 150 | prefix=None, 151 | suppress_blank=True, 152 | suppress_tokens=[-1], # Might need conversion from string 153 | without_timestamps=False, 154 | max_initial_timestamp=1.0, 155 | word_timestamps=word_timestamps, 156 | vad_filter=enable_vad, 157 | ) 158 | ) 159 | 160 | segments = list(segments) 161 | 162 | # Format transcription 163 | transcription_output = format_segments(transcription, segments) 164 | 165 | # Handle translation if requested 166 | translation_output = None 167 | if translate: 168 | translation_segments, _ = model.transcribe( 169 | str(audio), 170 | task="translate", 171 | temperature=temperature, # Reuse temperature settings for translation 172 | ) 173 | translation_output = format_segments( 174 | translation, list(translation_segments) 175 | ) 176 | 177 | results = { 178 | "segments": serialize_segments(segments), 179 | 
"detected_language": info.language, 180 | "transcription": transcription_output, 181 | "translation": translation_output, 182 | "device": "cuda" if rp_cuda.is_available() else "cpu", 183 | "model": model_name, 184 | } 185 | 186 | if word_timestamps: 187 | word_timestamps_list = [] 188 | for segment in segments: 189 | for word in segment.words: 190 | word_timestamps_list.append( 191 | { 192 | "word": word.word, 193 | "start": word.start, 194 | "end": word.end, 195 | } 196 | ) 197 | results["word_timestamps"] = word_timestamps_list 198 | 199 | return results 200 | 201 | 202 | def serialize_segments(transcript): 203 | """ 204 | Serialize the segments to be returned in the API response. 205 | """ 206 | return [ 207 | { 208 | "id": segment.id, 209 | "seek": segment.seek, 210 | "start": segment.start, 211 | "end": segment.end, 212 | "text": segment.text, 213 | "tokens": segment.tokens, 214 | "temperature": segment.temperature, 215 | "avg_logprob": segment.avg_logprob, 216 | "compression_ratio": segment.compression_ratio, 217 | "no_speech_prob": segment.no_speech_prob, 218 | } 219 | for segment in transcript 220 | ] 221 | 222 | 223 | def format_segments(format_type, segments): 224 | """ 225 | Format the segments to the desired format 226 | """ 227 | 228 | if format_type == "plain_text": 229 | return " ".join([segment.text.lstrip() for segment in segments]) 230 | elif format_type == "formatted_text": 231 | return "\n".join([segment.text.lstrip() for segment in segments]) 232 | elif format_type == "srt": 233 | return write_srt(segments) 234 | elif format_type == "vtt": # Added VTT case 235 | return write_vtt(segments) 236 | else: # Default or unknown format 237 | print(f"Warning: Unknown format '{format_type}', defaulting to plain text.") 238 | return " ".join([segment.text.lstrip() for segment in segments]) 239 | 240 | 241 | def write_vtt(transcript): 242 | """ 243 | Write the transcript in VTT format. 
def write_vtt(transcript):
    """
    Render the transcript as a WebVTT document.

    Fix: a valid WebVTT file must begin with the "WEBVTT" signature line
    followed by a blank line (W3C WebVTT spec); the previous implementation
    emitted only the cues, producing output that VTT parsers reject.
    """
    result = "WEBVTT\n\n"

    for segment in transcript:
        # VTT timestamps use '.' as the decimal marker (format_timestamp default).
        result += f"{format_timestamp(segment.start, always_include_hours=True)} --> {format_timestamp(segment.end, always_include_hours=True)}\n"
        # A literal '-->' inside cue text would be parsed as a timing line; soften it.
        result += f"{segment.text.strip().replace('-->', '->')}\n"
        result += "\n"

    return result


def write_srt(transcript):
    """
    Render the transcript as an SRT document: sequentially numbered cues
    with comma-decimal timestamps.
    """
    result = ""

    for i, segment in enumerate(transcript, start=1):
        result += f"{i}\n"
        result += f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> "
        result += f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n"
        result += f"{segment.text.strip().replace('-->', '->')}\n"
        result += "\n"

    return result
# --------------------------------------------------------------------------------
# /src/rp_handler.py:
# --------------------------------------------------------------------------------
"""
rp_handler.py for runpod worker

rp_debugger:
  - Utility that provides additional debugging information.
    The handler must be called with --rp_debugger flag to enable it.
"""
import base64
import tempfile

from rp_schema import INPUT_VALIDATIONS
from runpod.serverless.utils import download_files_from_urls, rp_cleanup, rp_debugger
from runpod.serverless.utils.rp_validator import validate
import runpod
import predict


# Load the model once at import time so every job reuses the warm instance.
MODEL = predict.Predictor()
MODEL.setup()


def base64_to_tempfile(base64_file: str) -> str:
    '''
    Convert base64 file to tempfile.

    Parameters:
        base64_file (str): Base64-encoded audio payload

    Returns:
        str: Path to tempfile (delete=False — the caller owns cleanup of this file)
    '''
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file.write(base64.b64decode(base64_file))

    return temp_file.name
def base64_to_tempfile(base64_file: str) -> str:
    '''
    Decode a base64 payload into a .wav temp file on disk.

    Parameters:
        base64_file (str): Base64-encoded audio

    Returns:
        str: Filesystem path of the decoded temp file (delete=False,
             so the file survives the context manager)
    '''
    decoded = base64.b64decode(base64_file)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file.write(decoded)
    return temp_file.name


@rp_debugger.FunctionTimer
def run_whisper_job(job):
    '''
    Run inference on the model.

    Parameters:
        job (dict): Input job containing the model parameters

    Returns:
        dict: The prediction result, or {'error': ...} when validation fails
              or the audio input is missing/ambiguous.
    '''
    raw_input = job['input']

    with rp_debugger.LineTimer('validation_step'):
        validation = validate(raw_input, INPUT_VALIDATIONS)

    if 'errors' in validation:
        return {"error": validation['errors']}
    params = validation['validated_input']

    # Exactly one of the two audio sources must be supplied.
    has_url = bool(params.get('audio', False))
    has_b64 = bool(params.get('audio_base64', False))

    if not has_url and not has_b64:
        return {'error': 'Must provide either audio or audio_base64'}

    if has_url and has_b64:
        return {'error': 'Must provide either audio or audio_base64, not both'}

    if has_url:
        with rp_debugger.LineTimer('download_step'):
            audio_input = download_files_from_urls(job['id'], [params['audio']])[0]
    else:
        audio_input = base64_to_tempfile(params['audio_base64'])

    with rp_debugger.LineTimer('prediction_step'):
        whisper_results = MODEL.predict(
            audio=audio_input,
            model_name=params["model"],
            transcription=params["transcription"],
            translation=params["translation"],
            translate=params["translate"],
            language=params["language"],
            temperature=params["temperature"],
            best_of=params["best_of"],
            beam_size=params["beam_size"],
            patience=params["patience"],
            length_penalty=params["length_penalty"],
            suppress_tokens=params.get("suppress_tokens", "-1"),
            initial_prompt=params["initial_prompt"],
            condition_on_previous_text=params["condition_on_previous_text"],
            temperature_increment_on_fallback=params["temperature_increment_on_fallback"],
            compression_ratio_threshold=params["compression_ratio_threshold"],
            logprob_threshold=params["logprob_threshold"],
            no_speech_threshold=params["no_speech_threshold"],
            enable_vad=params["enable_vad"],
            word_timestamps=params["word_timestamps"]
        )

    with rp_debugger.LineTimer('cleanup_step'):
        rp_cleanup.clean(['input_objects'])

    return whisper_results


runpod.serverless.start({"handler": run_whisper_job})
# --------------------------------------------------------------------------------
# /src/rp_schema.py:
# --------------------------------------------------------------------------------
# Validation schema consumed by runpod's rp_validator: every key is optional
# and falls back to its 'default' when absent from the job input.
INPUT_VALIDATIONS = {
    # Audio source: exactly one of these two is expected by the handler.
    'audio': {
        'type': str,
        'required': False,
        'default': None
    },
    'audio_base64': {
        'type': str,
        'required': False,
        'default': None
    },
    'model': {
        'type': str,
        'required': False,
        'default': 'base'
    },
    # Output formats: 'plain_text', 'formatted_text', 'srt' or 'vtt'.
    'transcription': {
        'type': str,
        'required': False,
        'default': 'plain_text'
    },
    'translate': {
        'type': bool,
        'required': False,
        'default': False
    },
    'translation': {
        'type': str,
        'required': False,
        'default': 'plain_text'
    },
    'language': {
        'type': str,
        'required': False,
        'default': None
    },
    # Decoding parameters forwarded to faster-whisper's transcribe().
    'temperature': {
        'type': float,
        'required': False,
        'default': 0
    },
    'best_of': {
        'type': int,
        'required': False,
        'default': 5
    },
    'beam_size': {
        'type': int,
        'required': False,
        'default': 5
    },
    'patience': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'length_penalty': {
        'type': float,
        'required': False,
        'default': 0
    },
    'suppress_tokens': {
        'type': str,
        'required': False,
        'default': '-1'
    },
    'initial_prompt': {
        'type': str,
        'required': False,
        'default': None
    },
    'condition_on_previous_text': {
        'type': bool,
        'required': False,
        'default': True
    },
    'temperature_increment_on_fallback': {
        'type': float,
        'required': False,
        'default': 0.2
    },
    # Fallback/quality thresholds.
    'compression_ratio_threshold': {
        'type': float,
        'required': False,
        'default': 2.4
    },
    'logprob_threshold': {
        'type': float,
        'required': False,
        'default': -1.0
    },
    'no_speech_threshold': {
        'type': float,
        'required': False,
        'default': 0.6
    },
    'enable_vad': {
        'type': bool,
        'required': False,
        'default': False
    },
    'word_timestamps': {
        'type': bool,
        'required': False,
        'default': False
    },
}
'type': float, 59 |             'required': False, 60 |             'default': 0 61 |         }, 62 |         'suppress_tokens': { 63 |             'type': str, 64 |             'required': False, 65 |             'default': '-1' 66 |         }, 67 |         'initial_prompt': { 68 |             'type': str, 69 |             'required': False, 70 |             'default': None 71 |         }, 72 |         'condition_on_previous_text': { 73 |             'type': bool, 74 |             'required': False, 75 |             'default': True 76 |         }, 77 |         'temperature_increment_on_fallback': { 78 |             'type': float, 79 |             'required': False, 80 |             'default': 0.2 81 |         }, 82 |         'compression_ratio_threshold': { 83 |             'type': float, 84 |             'required': False, 85 |             'default': 2.4 86 |         }, 87 |         'logprob_threshold': { 88 |             'type': float, 89 |             'required': False, 90 |             'default': -1.0 91 |         }, 92 |         'no_speech_threshold': { 93 |             'type': float, 94 |             'required': False, 95 |             'default': 0.6 96 |         }, 97 |         'enable_vad': { 98 |             'type': bool, 99 |             'required': False, 100 |             'default': False 101 |         }, 102 |         'word_timestamps': { 103 |             'type': bool, 104 |             'required': False, 105 |             'default': False 106 |         }, 107 | } 108 | -------------------------------------------------------------------------------- /test_input.json: -------------------------------------------------------------------------------- 1 | { 2 |     "input": { 3 |         "audio": "https://github.com/runpod-workers/sample-inputs/raw/main/audio/gettysburg.wav", 4 |         "model": "turbo", 5 |         "transcription": "plain_text", 6 |         "translate": false, 7 |         "temperature": 0, 8 |         "best_of": 5, 9 |         "beam_size": 5, 10 |         "suppress_tokens": "-1", 11 |         "condition_on_previous_text": false, 12 |         "temperature_increment_on_fallback": 0.2, 13 |         "compression_ratio_threshold": 2.4, 14 |         "logprob_threshold": -1, 15 |         "no_speech_threshold": 0.6 16 |     } 17 | } 18 | --------------------------------------------------------------------------------