├── .dockerignore ├── .github ├── FUNDING.yml └── workflows │ ├── docker-publish.yml │ └── documentation.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.gpu ├── LICENCE ├── README.md ├── app ├── asr_models │ ├── asr_model.py │ ├── faster_whisper_engine.py │ ├── mbain_whisperx_engine.py │ └── openai_whisper_engine.py ├── config.py ├── factory │ └── asr_model_factory.py ├── utils.py └── webservice.py ├── docker-compose.gpu.yml ├── docker-compose.yml ├── docs ├── .overrides │ └── main.html ├── assets │ ├── css │ │ └── extra.css │ └── images │ │ └── swagger-ui.png ├── build.md ├── changelog.md ├── endpoints.md ├── environmental-variables.md ├── index.md ├── licence.md └── run.md ├── mkdocs.yml ├── poetry.lock └── pyproject.toml /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .venv 3 | venv -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [ahmetoner] 4 | custom: ['https://bmc.link/ahmetoner'] 5 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker Image 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - debug 8 | 9 | env: 10 | DOCKER_USER: ${{secrets.DOCKER_USER}} 11 | DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}} 12 | REPO_NAME: ${{secrets.REPO_NAME}} 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | include: 19 | - docker_file: Dockerfile 20 | platforms: linux/arm64,linux/amd64 21 | - docker_file: Dockerfile.gpu 22 | tag_extension: -gpu 23 | platforms: linux/amd64 24 | steps: 25 | - name: Checkout 26 | uses: actions/checkout@v3 27 | - name: Free up disk space 28 | run: | 29 | sudo rm -rf /usr/share/dotnet 30 | sudo rm -rf /opt/ghc 31 | sudo rm -rf "/usr/local/share/boost" 32 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 33 | - name: Set up QEMU 34 | uses: docker/setup-qemu-action@v1 35 | - name: Set up Docker Buildx 36 | uses: docker/setup-buildx-action@v1 37 | - name: Login to DockerHub 38 | uses: docker/login-action@v1 39 | with: 40 | username: ${{ secrets.DOCKER_USER }} 41 | password: ${{ secrets.DOCKER_PASSWORD }} 42 | - name: Build and Publish the Docker debug image 43 | if: github.ref == 'refs/heads/debug' 44 | run: | 45 | DOCKER_IMAGE_DEBUG=$DOCKER_USER/$REPO_NAME:debug${{ matrix.tag_extension }} 46 | docker buildx build . --no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_DEBUG}" -f ${{ matrix.docker_file }} --push 47 | - name: Build and Publish the Docker image 48 | if: github.ref != 'refs/heads/debug' 49 | run: | 50 | DOCKER_IMAGE_LATEST=$DOCKER_USER/$REPO_NAME:latest${{ matrix.tag_extension }} 51 | DOCKER_IMAGE_VERSION=$DOCKER_USER/$REPO_NAME:$GITHUB_REF_NAME${{ matrix.tag_extension }} 52 | docker buildx build . 
--no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_LATEST}" -t "${DOCKER_IMAGE_VERSION}" -f ${{ matrix.docker_file }} --push 53 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - docs 8 | permissions: 9 | contents: write 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | if: github.event.repository.fork == false 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v4 17 | with: 18 | python-version: 3.x 19 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 20 | - uses: actions/cache@v3 21 | with: 22 | key: mkdocs-material-${{ env.cache_id }} 23 | path: .cache 24 | restore-keys: | 25 | mkdocs-material- 26 | - run: pip install mkdocs-material pymdown-extensions 27 | - run: mkdocs gh-deploy --force 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | # Packages 4 | *.egg 5 | !/tests/**/*.egg 6 | /*.egg-info 7 | /dist/* 8 | build 9 | _build 10 | .cache 11 | *.so 12 | venv 13 | 14 | # Installer logs 15 | pip-log.txt 16 | 17 | # Unit test / coverage reports 18 | .coverage 19 | .pytest_cache 20 | 21 | .DS_Store 22 | .idea/* 23 | .python-version 24 | .vscode/* 25 | 26 | /test.py 27 | /test_*.* 28 | 29 | /setup.cfg 30 | MANIFEST.in 31 | /setup.py 32 | /docs/site/* 33 | /tests/fixtures/simple_project/setup.py 34 | /tests/fixtures/project_with_extras/setup.py 35 | .mypy_cache 36 | 37 | .venv 38 | /releases/* 39 | pip-wheel-metadata 40 | /poetry.toml 41 | 42 | poetry/core/* 43 | 44 | public 45 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Unreleased 5 | ---------- 6 | 7 | [1.9.1] (2025-07-01) 8 | -------------------- 9 | 10 | ### Fixed 11 | 12 | - Fixed Whisperx diarization pipeline initialization 13 | - Fixed Whisperx language detection 14 | 15 | [1.9.0] (2025-06-29) 16 | -------------------- 17 | 18 | ### Changed 19 | 20 | - Upgraded 21 | - Poetry to v2.1.3 22 | - [openai/whisper](https://github.com/openai/whisper)@[v20250625](https://github.com/openai/whisper/releases/tag/v20250625) 23 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.1.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.1) 24 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.4.2](https://github.com/m-bain/whisperX/releases/tag/v3.4.2) 25 | - torch to v2.7.1 26 | - torchaudio to v2.7.1 27 | - numpy to v2.2.6 28 | - fastapi to v0.115.14 29 | - uvicorn to v0.35.0 30 | - numba to v0.61.2 31 | 32 | [1.8.2] (2025-02-18) 33 | -------------------- 34 | 35 | ### Changed 36 | 37 | - Reduced GPU image size by using `nvidia/cuda:12.6.3-base-ubuntu22.04` 38 | 39 | [1.8.1] (2025-02-18) 40 | -------------------- 41 | 42 | ### Fixed 43 | 44 | - Fixed issues with Torch CUDA and cuDNN 45 | - Updated Torch and Torchaudio dependencies for multi-architecture support 46 | 47 | [1.8.0] (2025-02-17) 48 | -------------------- 49 | 50 | ### Added 51 | 52 | - Added support for [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1) 53 | 54 | ### Changed 55 | 56 | - 
Upgraded Cuda GPU image to v12.6.3 57 | - Upgraded dependencies 58 | - torch to v2.6.0 59 | - fastapi to v0.115.8 60 | - llvmlite to v0.44.0 61 | - numba to v0.61.0 62 | - ruff to v0.9.6 63 | - black to v25.1.0 64 | - mkdocs-material to v9.6.4 65 | - pymdown-extensions to v10.14.3 66 | 67 | [1.7.1] (2024-12-18) 68 | -------------------- 69 | 70 | ### Fixed 71 | 72 | - Fix JSON serialization of segments due to Faster Whisper v1.1.0 changes 73 | 74 | [1.7.0] (2024-12-17) 75 | -------------------- 76 | 77 | ### Added 78 | 79 | - Timeout configured to allow model to be unloaded when idle 80 | - Added detection confidence to langauge detection endpoint 81 | - Set mel generation to adjust n_dims automatically to match the loaded model 82 | - Refactor classes, Add comments, implement abstract methods, and add factory method for engine selection 83 | 84 | ### Changed 85 | 86 | - Upgraded 87 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0) 88 | - uvicorn to v0.34.0 89 | - tqdm to v4.67.1 90 | - python-multipart to v0.0.20 91 | - fastapi to v0.115.6 92 | - pytest to v8.3.4 93 | - ruff to v0.8.3 94 | - black to v24.10.0 95 | - mkdocs to v1.6.1 96 | - mkdocs-material to v9.5.49 97 | - pymdown-extensions to v10.12 98 | 99 | [1.6.0] (2024-10-06) 100 | -------------------- 101 | 102 | ### Changed 103 | 104 | - Upgraded 105 | - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930) 106 | - fastapi to v0.115.0 107 | - uvicorn to v0.31.0 108 | - tqdm to v4.66.5 109 | - python-multipart to v0.0.12 110 | 111 | [1.5.0] (2024-07-04) 112 | -------------------- 113 | 114 | ### Changed 115 | 116 | - Upgraded 117 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.3) 118 | - fastapi to v0.111.0 119 | - uvicorn to v0.30.1 120 | - gunicorn to v22.0.0 121 | - tqdm to v4.66.4 122 | - llvmlite to v0.43.0 123 | - numba to v0.60.0 124 | 125 | [1.4.1] (2024-04-17) 126 | -------------------- 127 | 128 | ### Changed 129 | 130 | - Upgraded torch to v1.13.1 131 | 132 | [1.4.0] (2024-04-17) 133 | -------------------- 134 | 135 | ### Changed 136 | 137 | - Upgraded 138 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.1) 139 | - fastapi to v0.110.1 140 | - uvicorn to v0.29.0 141 | - gunicorn to v21.2.0 142 | - tqdm to v4.66.2 143 | - python-multipart to v0.0.9 144 | - llvmlite to v0.42.0 145 | - numba to v0.59.1 146 | 147 | [1.3.0] (2024-02-15) 148 | -------------------- 149 | 150 | ### Added 151 | 152 | - Compiled and added FFmpeg without LGPL libraries for license compliance 153 | 154 | [1.2.4] (2023-11-27) 155 | -------------------- 156 | 157 | ### Changed 158 | 159 | - Upgraded 160 | - [openai/whisper](https://github.com/openai/whisper) to [v20231117](https://github.com/openai/whisper/releases/tag/v20231117) 161 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v0.10.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v0.10.0) 162 | 163 | [1.2.3] (2023-11-07) 164 | -------------------- 165 | 166 | ### Changed 167 | 168 | - Upgraded 169 | - [openai/whisper](https://github.com/openai/whisper) to [v20231106](https://github.com/openai/whisper/releases/tag/v20231106) 170 | 171 | [1.2.2] (2023-11-03) 172 | -------------------- 173 | 174 | ### Fixed 175 | 
176 | - Fixed `swagger-ui` rendering issues by upgrading to `v5.9.1`, fixes #153 and #154 177 | 178 | [1.2.1] (2023-11-03) 179 | -------------------- 180 | 181 | ### Enabled 182 | 183 | - Enabled `vad_filter` for `faster-whisper` engine 184 | 185 | ### Changed 186 | 187 | - Changed misspelling in "Word level timestamps" 188 | - Removed unused unidecode dependency 189 | - Upgraded 190 | - uvicorn to v0.23.2 191 | - gunicorn to v21.0.1 192 | - tqdm to v4.66.1 193 | - python-multipart to v0.0.6 194 | - fastapi to v0.104.1 195 | - llvmlite to v0.41.1 196 | - numba to v0.58.0 197 | 198 | [1.2.0] (2023-10-01) 199 | -------------------- 200 | 201 | ### Changed 202 | 203 | - Upgraded 204 | - [openai/whisper](https://github.com/openai/whisper) to [v20230918](https://github.com/openai/whisper/releases/tag/v20230918) 205 | - [guillaumekln/faster-whisper](https://github.com/guillaumekln/faster-whisper) to [v0.9.0](https://github.com/guillaumekln/faster-whisper/releases/tag/v0.9.0) 206 | 207 | ### Updated 208 | 209 | - Updated model conversion method (for Faster Whisper) to use Hugging Face downloader 210 | - Updated default model paths to `~/.cache/whisper` or `/root/.cache/whisper`. 211 | - For customization, modify the `ASR_MODEL_PATH` environment variable. 212 | - Ensure Docker volume is set for the corresponding directory to use caching. 213 | 214 | ```bash 215 | docker run -d -p 9000:9000 -e ASR_MODEL_PATH=/data/whisper -v $PWD/yourlocaldir:/data/whisper onerahmet/openai-whisper-asr-webservice:latest 216 | ``` 217 | 218 | - Removed the `triton` dependency from `poetry.lock` to ensure the stability of the pipeline for `ARM-based` Docker images 219 | 220 | [1.1.1] (2023-05-29) 221 | -------------------- 222 | 223 | ### Changed 224 | 225 | - 94 gpus that don't support float16 in #103 226 | - Update compute type in #108 227 | - Add word level functionality for Faster Whisper in #109 228 | 229 | [1.1.0] (2023-04-17) 230 | -------------------- 231 | 232 | ### Changed 233 | 234 | - Docs in #72 235 | - Fix language code typo in #77 236 | - Adds support for FasterWhisper in #81 237 | - Add an optional param to skip the encoding step in #82 238 | - Faster whisper in #92 239 | 240 | [1.0.6] (2023-02-05) 241 | -------------------- 242 | 243 | ### Changed 244 | 245 | - Update README.md in #58 246 | - 68 update the versions in #69 247 | - Fix gunicorn run command and remove deprecated poetry run script in #70 248 | - Move torch installation method into the pyproject.toml file in #71 249 | - Add prompt to ASR in #66 250 | 251 | [1.0.5] (2022-12-08) 252 | -------------------- 253 | 254 | ### Changed 255 | 256 | - 43 make swagger doc not depend on internet connection in #52 257 | - Add new large model v2 in #53 258 | 259 | [1.0.4] (2022-11-28) 260 | -------------------- 261 | 262 | ### Changed 263 | 264 | - 43 make swagger doc not depend on internet connection in #51 265 | - Anally retentively fixed markdown linting warnings in README. Sorry. 
in #48 266 | - Explicit macOS readme with explanation for no-GPU [closes #44] in #47 267 | 268 | [1.0.3-beta] (2022-11-17) 269 | ------------------------- 270 | 271 | ### Changed 272 | 273 | - Combine transcribe endpoints in #36 274 | - Add multi worker support with gunicorn in #37 275 | - Add multi platform (amd & arm) support in #39 276 | - Upgrade Cuda version to 11.7 in #40 277 | - Lock to the latest whisper version (eff383) in #41 278 | 279 | [1.0.2-beta] (2022-10-04) 280 | ------------------------- 281 | 282 | ### Changed 283 | 284 | - add mutex lock to the model in #19 285 | - Subtitles in #21 286 | - Add gpu support and create Docker image for cuda with GitHub flow in #22 287 | 288 | [1.0.1-beta] (2022-09-27) 289 | ------------------------- 290 | 291 | ### Changed 292 | 293 | - Init GitHub runners in #10 294 | - Lock Whisper dependency with b4308... revision number to prevent build crashes in #15 295 | 296 | [1.0.0-beta] (2022-09-25) 297 | ------------------------- 298 | 299 | ### Changed 300 | 301 | - Docker init in #1 302 | - Create LICENCE in #2 303 | - Fastapi init in #3 304 | - Avoid temp file in #4 305 | - Translate init in #5 306 | - mp3 support by using FFmpeg instead of librosa in #8 307 | - add language detection endpoint in #9 308 | 309 | [1.9.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.9.1 310 | [1.9.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.9.0 311 | [1.8.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.2 312 | [1.8.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.1 313 | [1.8.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.0 314 | [1.7.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.1 315 | [1.7.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.0 316 | [1.6.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.6.0 317 | [1.5.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.5.0 318 | [1.4.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.1 319 | [1.4.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.0 320 | [1.3.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.3.0 321 | [1.2.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.4 322 | [1.2.3]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.3 323 | [1.2.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.2 324 | [1.2.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.1 325 | [1.2.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.0 326 | [1.1.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.1 327 | [1.1.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.0 328 | [1.0.6]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.6 329 | [1.0.5]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.5 330 | [1.0.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.4 331 | [1.0.3-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.3-beta 332 | [1.0.2-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.2-beta 333 | [1.0.1-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.1-beta 334 | [1.0.0-beta]: 
https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/1.0.0-beta 335 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM python:3.10-bookworm 6 | 7 | LABEL org.opencontainers.image.source="https://github.com/ahmetoner/whisper-asr-webservice" 8 | 9 | ENV POETRY_VENV=/app/.venv 10 | 11 | RUN python3 -m venv $POETRY_VENV \ 12 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 13 | && $POETRY_VENV/bin/pip install poetry==2.1.3 14 | 15 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 16 | 17 | WORKDIR /app 18 | 19 | COPY . . 20 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 21 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 22 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 23 | 24 | RUN poetry config virtualenvs.in-project true 25 | RUN poetry install --extras cpu 26 | 27 | EXPOSE 9000 28 | 29 | ENTRYPOINT ["whisper-asr-webservice"] 30 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 6 | 7 | LABEL org.opencontainers.image.source="https://github.com/ahmetoner/whisper-asr-webservice" 8 | 9 | ENV PYTHON_VERSION=3.10 10 | 11 | ENV POETRY_VENV=/app/.venv 12 | 13 | RUN export DEBIAN_FRONTEND=noninteractive \ 14 | && apt-get -qq update \ 15 | && apt-get -qq install --no-install-recommends \ 16 | python${PYTHON_VERSION} \ 17 | python${PYTHON_VERSION}-venv \ 18 | python3-pip \ 19 | libcudnn8 \ 20 | python3-pip \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \ 24 | ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \ 25 | ln -s -f /usr/bin/pip3 /usr/bin/pip 26 | 27 | RUN python3 -m venv $POETRY_VENV \ 28 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 29 | && $POETRY_VENV/bin/pip install poetry==2.1.3 30 | 31 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 32 | 33 | WORKDIR /app 34 | 35 | COPY . . 
36 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 37 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 38 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 39 | 40 | RUN poetry config virtualenvs.in-project true 41 | RUN poetry install --extras cuda 42 | 43 | EXPOSE 9000 44 | 45 | ENTRYPOINT ["whisper-asr-webservice"] 46 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ahmet Oner & Besim Alibegovic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Release](https://img.shields.io/github/v/release/ahmetoner/whisper-asr-webservice.svg) 2 | ![Docker Pulls](https://img.shields.io/docker/pulls/onerahmet/openai-whisper-asr-webservice.svg) 3 | ![Build](https://img.shields.io/github/actions/workflow/status/ahmetoner/whisper-asr-webservice/docker-publish.yml.svg) 4 | ![Licence](https://img.shields.io/github/license/ahmetoner/whisper-asr-webservice.svg) 5 | 6 | > 🎉 **Join our Discord Community!** Connect with other users, get help, and stay updated on the latest features: [https://discord.gg/4Q5YVrePzZ](https://discord.gg/4Q5YVrePzZ) 7 | 8 | # Whisper ASR Box 9 | 10 | Whisper ASR Box is a general-purpose speech recognition toolkit. Whisper Models are trained on a large dataset of diverse audio and is also a multitask model that can perform multilingual speech recognition as well as speech translation and language identification. 
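Once the service is running (see the Quick Usage section below), a typical request posts an audio file to the `/asr` endpoint. A minimal sketch with cURL (the host, port, and file path are placeholders; adjust them to your setup):

```shell
curl -X POST "http://localhost:9000/asr?output=json&task=transcribe" \
  -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/audio.mp3"
```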
11 | 12 | ## Features 13 | 14 | Current release (v1.9.1) supports following whisper models: 15 | 16 | - [openai/whisper](https://github.com/openai/whisper)@[v20250625](https://github.com/openai/whisper/releases/tag/v20250625) 17 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.1) 18 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.4.2](https://github.com/m-bain/whisperX/releases/tag/v3.4.2) 19 | 20 | ## Quick Usage 21 | 22 | ### CPU 23 | 24 | ```shell 25 | docker run -d -p 9000:9000 \ 26 | -e ASR_MODEL=base \ 27 | -e ASR_ENGINE=openai_whisper \ 28 | onerahmet/openai-whisper-asr-webservice:latest 29 | ``` 30 | 31 | ### GPU 32 | 33 | ```shell 34 | docker run -d --gpus all -p 9000:9000 \ 35 | -e ASR_MODEL=base \ 36 | -e ASR_ENGINE=openai_whisper \ 37 | onerahmet/openai-whisper-asr-webservice:latest-gpu 38 | ``` 39 | 40 | #### Cache 41 | 42 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory: 43 | 44 | ```shell 45 | docker run -d -p 9000:9000 \ 46 | -v $PWD/cache:/root/.cache/ \ 47 | onerahmet/openai-whisper-asr-webservice:latest 48 | ``` 49 | 50 | ## Key Features 51 | 52 | - Multiple ASR engines support (OpenAI Whisper, Faster Whisper, WhisperX) 53 | - Multiple output formats (text, JSON, VTT, SRT, TSV) 54 | - Word-level timestamps support 55 | - Voice activity detection (VAD) filtering 56 | - Speaker diarization (with WhisperX) 57 | - FFmpeg integration for broad audio/video format support 58 | - GPU acceleration support 59 | - Configurable model loading/unloading 60 | - REST API with Swagger documentation 61 | 62 | ## Environment Variables 63 | 64 | Key configuration options: 65 | 66 | - `ASR_ENGINE`: Engine selection (openai_whisper, faster_whisper, whisperx) 67 | - `ASR_MODEL`: Model selection (tiny, base, small, medium, large-v3, etc.) 68 | - `ASR_MODEL_PATH`: Custom path to store/load models 69 | - `ASR_DEVICE`: Device selection (cuda, cpu) 70 | - `MODEL_IDLE_TIMEOUT`: Timeout for model unloading 71 | 72 | ## Documentation 73 | 74 | For complete documentation, visit: 75 | [https://ahmetoner.github.io/whisper-asr-webservice](https://ahmetoner.github.io/whisper-asr-webservice) 76 | 77 | ## Development 78 | 79 | ```shell 80 | # Install poetry v2.X 81 | pip3 install poetry 82 | 83 | # Install dependencies for cpu 84 | poetry install --extras cpu 85 | 86 | # Install dependencies for cuda 87 | poetry install --extras cuda 88 | 89 | # Run service 90 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 91 | ``` 92 | 93 | After starting the service, visit `http://localhost:9000` or `http://0.0.0.0:9000` in your browser to access the Swagger UI documentation and try out the API endpoints. 94 | 95 | ## Credits 96 | 97 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) 98 | -------------------------------------------------------------------------------- /app/asr_models/asr_model.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | from abc import ABC, abstractmethod 4 | from threading import Lock 5 | from typing import Union 6 | 7 | import torch 8 | 9 | from app.config import CONFIG 10 | 11 | 12 | class ASRModel(ABC): 13 | """ 14 | Abstract base class for ASR (Automatic Speech Recognition) models. 
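    Subclasses provide load_model(), transcribe() and language_detection().
    Idle unloading is handled by monitor_idleness()/release_model() when
    CONFIG.MODEL_IDLE_TIMEOUT is set to a positive number of seconds.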
15 | """ 16 | 17 | model = None 18 | model_lock = Lock() 19 | last_activity_time = time.time() 20 | 21 | def __init__(self): 22 | pass 23 | 24 | @abstractmethod 25 | def load_model(self): 26 | """ 27 | Loads the model from the specified path. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def transcribe( 33 | self, 34 | audio, 35 | task: Union[str, None], 36 | language: Union[str, None], 37 | initial_prompt: Union[str, None], 38 | vad_filter: Union[bool, None], 39 | word_timestamps: Union[bool, None], 40 | options: Union[dict, None], 41 | output, 42 | ): 43 | """ 44 | Perform transcription on the given audio file. 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def language_detection(self, audio): 50 | """ 51 | Perform language detection on the given audio file. 52 | """ 53 | pass 54 | 55 | def monitor_idleness(self): 56 | """ 57 | Monitors the idleness of the ASR model and releases the model if it has been idle for too long. 58 | """ 59 | if CONFIG.MODEL_IDLE_TIMEOUT <= 0: 60 | return 61 | while True: 62 | time.sleep(15) 63 | if time.time() - self.last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT: 64 | with self.model_lock: 65 | self.release_model() 66 | break 67 | 68 | def release_model(self): 69 | """ 70 | Unloads the model from memory and clears any cached GPU memory. 71 | """ 72 | del self.model 73 | torch.cuda.empty_cache() 74 | gc.collect() 75 | self.model = None 76 | print("Model unloaded due to timeout") 77 | -------------------------------------------------------------------------------- /app/asr_models/faster_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisper 7 | from faster_whisper import WhisperModel 8 | 9 | from app.asr_models.asr_model import ASRModel 10 | from app.config import CONFIG 11 | from app.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 12 | 13 | 14 | class FasterWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | self.model = WhisperModel( 19 | model_size_or_path=CONFIG.MODEL_NAME, 20 | device=CONFIG.DEVICE, 21 | compute_type=CONFIG.MODEL_QUANTIZATION, 22 | download_root=CONFIG.MODEL_PATH 23 | ) 24 | 25 | Thread(target=self.monitor_idleness, daemon=True).start() 26 | 27 | def transcribe( 28 | self, 29 | audio, 30 | task: Union[str, None], 31 | language: Union[str, None], 32 | initial_prompt: Union[str, None], 33 | vad_filter: Union[bool, None], 34 | word_timestamps: Union[bool, None], 35 | options: Union[dict, None], 36 | output, 37 | ): 38 | self.last_activity_time = time.time() 39 | 40 | with self.model_lock: 41 | if self.model is None: 42 | self.load_model() 43 | 44 | options_dict = {"task": task} 45 | if language: 46 | options_dict["language"] = language 47 | if initial_prompt: 48 | options_dict["initial_prompt"] = initial_prompt 49 | if vad_filter: 50 | options_dict["vad_filter"] = True 51 | if word_timestamps: 52 | options_dict["word_timestamps"] = True 53 | with self.model_lock: 54 | segments = [] 55 | text = "" 56 | segment_generator, info = self.model.transcribe(audio, beam_size=5, **options_dict) 57 | for segment in segment_generator: 58 | segments.append(segment) 59 | text = text + segment.text 60 | result = {"language": options_dict.get("language", info.language), "segments": segments, "text": text} 61 | 62 | output_file = StringIO() 63 | self.write_result(result, output_file, output) 64 | output_file.seek(0) 65 | 66 | return 
output_file 67 | 68 | def language_detection(self, audio): 69 | 70 | self.last_activity_time = time.time() 71 | 72 | with self.model_lock: 73 | if self.model is None: self.load_model() 74 | 75 | # load audio and pad/trim it to fit 30 seconds 76 | audio = whisper.pad_or_trim(audio) 77 | 78 | # detect the spoken language 79 | with self.model_lock: 80 | segments, info = self.model.transcribe(audio, beam_size=5) 81 | detected_lang_code = info.language 82 | detected_language_confidence = info.language_probability 83 | 84 | return detected_lang_code, detected_language_confidence 85 | 86 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 87 | if output == "srt": 88 | WriteSRT(ResultWriter).write_result(result, file=file) 89 | elif output == "vtt": 90 | WriteVTT(ResultWriter).write_result(result, file=file) 91 | elif output == "tsv": 92 | WriteTSV(ResultWriter).write_result(result, file=file) 93 | elif output == "json": 94 | WriteJSON(ResultWriter).write_result(result, file=file) 95 | else: 96 | WriteTXT(ResultWriter).write_result(result, file=file) 97 | -------------------------------------------------------------------------------- /app/asr_models/mbain_whisperx_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisperx 7 | from whisperx.audio import N_SAMPLES 8 | from whisperx.diarize import DiarizationPipeline 9 | from whisperx.utils import ResultWriter, SubtitlesWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 10 | 11 | from app.asr_models.asr_model import ASRModel 12 | from app.config import CONFIG 13 | 14 | 15 | class WhisperXASR(ASRModel): 16 | def __init__(self): 17 | super().__init__() 18 | self.model = { 19 | 'whisperx': None, 20 | 'diarize_model': None, 21 | 'align_model': {} 22 | } 23 | 24 | def load_model(self): 25 | asr_options = {"without_timestamps": False} 26 | self.model['whisperx'] = whisperx.load_model( 27 | CONFIG.MODEL_NAME, 28 | device=CONFIG.DEVICE, 29 | compute_type=CONFIG.MODEL_QUANTIZATION, 30 | asr_options=asr_options 31 | ) 32 | 33 | if CONFIG.HF_TOKEN != "": 34 | self.model['diarize_model'] = DiarizationPipeline( 35 | use_auth_token=CONFIG.HF_TOKEN, 36 | device=CONFIG.DEVICE 37 | ) 38 | 39 | Thread(target=self.monitor_idleness, daemon=True).start() 40 | 41 | def transcribe( 42 | self, 43 | audio, 44 | task: Union[str, None], 45 | language: Union[str, None], 46 | initial_prompt: Union[str, None], 47 | vad_filter: Union[bool, None], 48 | word_timestamps: Union[bool, None], 49 | options: Union[dict, None], 50 | output, 51 | ): 52 | self.last_activity_time = time.time() 53 | with self.model_lock: 54 | if self.model is None: 55 | self.load_model() 56 | 57 | options_dict = {"task": task} 58 | if language: 59 | options_dict["language"] = language 60 | if initial_prompt: 61 | options_dict["initial_prompt"] = initial_prompt 62 | with self.model_lock: 63 | result = self.model['whisperx'].transcribe(audio, **options_dict) 64 | language = result["language"] 65 | 66 | # Load the required model and cache it 67 | # If we transcribe models in many different languages, this may lead to OOM propblems 68 | if result["language"] in self.model['align_model']: 69 | model_x, metadata = self.model['align_model'][result["language"]] 70 | else: 71 | self.model['align_model'][result["language"]] = whisperx.load_align_model( 72 | language_code=result["language"], device=CONFIG.DEVICE 
73 | ) 74 | model_x, metadata = self.model['align_model'][result["language"]] 75 | 76 | # Align whisper output 77 | result = whisperx.align( 78 | result["segments"], model_x, metadata, audio, CONFIG.DEVICE, return_char_alignments=False 79 | ) 80 | 81 | if options.get("diarize", False) and CONFIG.HF_TOKEN != "": 82 | min_speakers = options.get("min_speakers", None) 83 | max_speakers = options.get("max_speakers", None) 84 | # add min/max number of speakers if known 85 | diarize_segments = self.model['diarize_model'](audio, min_speakers, max_speakers) 86 | result = whisperx.assign_word_speakers(diarize_segments, result) 87 | result["language"] = language 88 | 89 | output_file = StringIO() 90 | self.write_result(result, output_file, output) 91 | output_file.seek(0) 92 | 93 | return output_file 94 | 95 | def language_detection(self, audio): 96 | with self.model_lock: 97 | if self.model is None: 98 | self.load_model() 99 | if audio.shape[0] < N_SAMPLES: 100 | print("Warning: audio is shorter than 30s, language detection may be inaccurate.") 101 | results = self.model['whisperx'].model.detect_language(audio) 102 | language = results[0] 103 | language_probability = round(float(results[1]), 2) 104 | print(f"Detected language: {language} ({language_probability}) in first 30s of audio...") 105 | return language, language_probability 106 | 107 | 108 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 109 | default_options = { 110 | "max_line_width": CONFIG.SUBTITLE_MAX_LINE_WIDTH, 111 | "max_line_count": CONFIG.SUBTITLE_MAX_LINE_COUNT, 112 | "highlight_words": CONFIG.SUBTITLE_HIGHLIGHT_WORDS 113 | } 114 | 115 | if output == "srt": 116 | WriteSRT(SubtitlesWriter).write_result(result, file=file, options=default_options) 117 | elif output == "vtt": 118 | WriteVTT(SubtitlesWriter).write_result(result, file=file, options=default_options) 119 | elif output == "tsv": 120 | WriteTSV(ResultWriter).write_result(result, file=file, options=default_options) 121 | elif output == "json": 122 | WriteJSON(ResultWriter).write_result(result, file=file, options=default_options) 123 | else: 124 | WriteTXT(ResultWriter).write_result(result, file=file, options=default_options) 125 | -------------------------------------------------------------------------------- /app/asr_models/openai_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import torch 7 | import whisper 8 | from whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 9 | 10 | from app.asr_models.asr_model import ASRModel 11 | from app.config import CONFIG 12 | 13 | 14 | class OpenAIWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | if torch.cuda.is_available(): 19 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH).cuda() 20 | else: 21 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH) 22 | 23 | Thread(target=self.monitor_idleness, daemon=True).start() 24 | 25 | def transcribe( 26 | self, 27 | audio, 28 | task: Union[str, None], 29 | language: Union[str, None], 30 | initial_prompt: Union[str, None], 31 | vad_filter: Union[bool, None], 32 | word_timestamps: Union[bool, None], 33 | options: Union[dict, None], 34 | output, 35 | ): 36 | self.last_activity_time = time.time() 37 | 38 | with self.model_lock: 39 | if self.model is None: 40 | 
self.load_model() 41 | 42 | options_dict = {"task": task} 43 | if language: 44 | options_dict["language"] = language 45 | if initial_prompt: 46 | options_dict["initial_prompt"] = initial_prompt 47 | if word_timestamps: 48 | options_dict["word_timestamps"] = word_timestamps 49 | with self.model_lock: 50 | result = self.model.transcribe(audio, **options_dict) 51 | 52 | output_file = StringIO() 53 | self.write_result(result, output_file, output) 54 | output_file.seek(0) 55 | 56 | return output_file 57 | 58 | def language_detection(self, audio): 59 | 60 | self.last_activity_time = time.time() 61 | 62 | with self.model_lock: 63 | if self.model is None: 64 | self.load_model() 65 | 66 | # load audio and pad/trim it to fit 30 seconds 67 | audio = whisper.pad_or_trim(audio) 68 | 69 | # make log-Mel spectrogram and move to the same device as the model 70 | mel = whisper.log_mel_spectrogram(audio, self.model.dims.n_mels).to(self.model.device) 71 | 72 | # detect the spoken language 73 | with self.model_lock: 74 | _, probs = self.model.detect_language(mel) 75 | detected_lang_code = max(probs, key=probs.get) 76 | 77 | return detected_lang_code, probs[max(probs)] 78 | 79 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 80 | options = {"max_line_width": 1000, "max_line_count": 10, "highlight_words": False} 81 | if output == "srt": 82 | WriteSRT(ResultWriter).write_result(result, file=file, options=options) 83 | elif output == "vtt": 84 | WriteVTT(ResultWriter).write_result(result, file=file, options=options) 85 | elif output == "tsv": 86 | WriteTSV(ResultWriter).write_result(result, file=file, options=options) 87 | elif output == "json": 88 | WriteJSON(ResultWriter).write_result(result, file=file, options=options) 89 | else: 90 | WriteTXT(ResultWriter).write_result(result, file=file, options=options) 91 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | class CONFIG: 7 | """ 8 | Configuration class for ASR models. 9 | Reads environment variables for runtime configuration, with sensible defaults. 10 | """ 11 | # Determine the ASR engine ('faster_whisper', 'openai_whisper' or 'whisperx') 12 | ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper") 13 | 14 | # Retrieve Huggingface Token 15 | HF_TOKEN = os.getenv("HF_TOKEN", "") 16 | if ASR_ENGINE == "whisperx" and HF_TOKEN == "": 17 | print("You must set the HF_TOKEN environment variable to download the diarization model used by WhisperX.") 18 | 19 | # Determine the computation device (GPU or CPU) 20 | DEVICE = os.getenv("ASR_DEVICE", "cuda" if torch.cuda.is_available() else "cpu") 21 | 22 | # Model name to use (e.g., "base", "small", etc.) 23 | MODEL_NAME = os.getenv("ASR_MODEL", "base") 24 | 25 | # Path to the model directory 26 | MODEL_PATH = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper")) 27 | 28 | # Model quantization level. Defines the precision for model weights: 29 | # 'float32' - 32-bit floating-point precision (higher precision, slower inference) 30 | # 'float16' - 16-bit floating-point precision (lower precision, faster inference) 31 | # 'int8' - 8-bit integer precision (lowest precision, fastest inference) 32 | # Defaults to 'float32' for GPU availability, 'int8' for CPU. 
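    # Note: this value is passed as `compute_type` to the faster_whisper and whisperx
    # engines; the openai_whisper engine loads full-precision weights and ignores it.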
33 | MODEL_QUANTIZATION = os.getenv("ASR_QUANTIZATION", "float32" if torch.cuda.is_available() else "int8") 34 | if MODEL_QUANTIZATION not in {"float32", "float16", "int8"}: 35 | raise ValueError("Invalid MODEL_QUANTIZATION. Choose 'float32', 'float16', or 'int8'.") 36 | 37 | # Idle timeout in seconds. If set to a non-zero value, the model will be unloaded 38 | # after being idle for this many seconds. A value of 0 means the model will never be unloaded. 39 | MODEL_IDLE_TIMEOUT = int(os.getenv("MODEL_IDLE_TIMEOUT", 0)) 40 | 41 | # Default sample rate for audio input. 16 kHz is commonly used in speech-to-text tasks. 42 | SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", 16000)) 43 | 44 | # Subtitle output options for whisperx 45 | SUBTITLE_MAX_LINE_WIDTH = int(os.getenv("SUBTITLE_MAX_LINE_WIDTH", 1000)) 46 | SUBTITLE_MAX_LINE_COUNT = int(os.getenv("SUBTITLE_MAX_LINE_COUNT", 2)) 47 | SUBTITLE_HIGHLIGHT_WORDS = os.getenv("SUBTITLE_HIGHLIGHT_WORDS", "false").lower() == "true" 48 | -------------------------------------------------------------------------------- /app/factory/asr_model_factory.py: -------------------------------------------------------------------------------- 1 | from app.asr_models.asr_model import ASRModel 2 | from app.asr_models.faster_whisper_engine import FasterWhisperASR 3 | from app.asr_models.mbain_whisperx_engine import WhisperXASR 4 | from app.asr_models.openai_whisper_engine import OpenAIWhisperASR 5 | from app.config import CONFIG 6 | 7 | 8 | class ASRModelFactory: 9 | @staticmethod 10 | def create_asr_model() -> ASRModel: 11 | if CONFIG.ASR_ENGINE == "openai_whisper": 12 | return OpenAIWhisperASR() 13 | elif CONFIG.ASR_ENGINE == "faster_whisper": 14 | return FasterWhisperASR() 15 | elif CONFIG.ASR_ENGINE == "whisperx": 16 | return WhisperXASR() 17 | else: 18 | raise ValueError(f"Unsupported ASR engine: {CONFIG.ASR_ENGINE}") 19 | -------------------------------------------------------------------------------- /app/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dataclasses import asdict 4 | from typing import BinaryIO, TextIO 5 | 6 | import ffmpeg 7 | import numpy as np 8 | from faster_whisper.utils import format_timestamp 9 | 10 | from app.config import CONFIG 11 | 12 | 13 | class ResultWriter: 14 | extension: str 15 | 16 | def __init__(self, output_dir: str): 17 | self.output_dir = output_dir 18 | 19 | def __call__(self, result: dict, audio_path: str): 20 | audio_basename = os.path.basename(audio_path) 21 | output_path = os.path.join(self.output_dir, audio_basename + "." 
+ self.extension) 22 | 23 | with open(output_path, "w", encoding="utf-8") as f: 24 | self.write_result(result, file=f) 25 | 26 | def write_result(self, result: dict, file: TextIO): 27 | raise NotImplementedError 28 | 29 | 30 | class WriteTXT(ResultWriter): 31 | extension: str = "txt" 32 | 33 | def write_result(self, result: dict, file: TextIO): 34 | for segment in result["segments"]: 35 | print(segment.text.strip(), file=file, flush=True) 36 | 37 | 38 | class WriteVTT(ResultWriter): 39 | extension: str = "vtt" 40 | 41 | def write_result(self, result: dict, file: TextIO): 42 | print("WEBVTT\n", file=file) 43 | for segment in result["segments"]: 44 | print( 45 | f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n" 46 | f"{segment.text.strip().replace('-->', '->')}\n", 47 | file=file, 48 | flush=True, 49 | ) 50 | 51 | 52 | class WriteSRT(ResultWriter): 53 | extension: str = "srt" 54 | 55 | def write_result(self, result: dict, file: TextIO): 56 | for i, segment in enumerate(result["segments"], start=1): 57 | # write srt lines 58 | print( 59 | f"{i}\n" 60 | f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> " 61 | f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n" 62 | f"{segment.text.strip().replace('-->', '->')}\n", 63 | file=file, 64 | flush=True, 65 | ) 66 | 67 | 68 | class WriteTSV(ResultWriter): 69 | """ 70 | Write a transcript to a file in TSV (tab-separated values) format containing lines like: 71 | \t\t 72 | 73 | Using integer milliseconds as start and end times means there's no chance of interference from 74 | an environment setting a language encoding that causes the decimal in a floating point number 75 | to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. 76 | """ 77 | 78 | extension: str = "tsv" 79 | 80 | def write_result(self, result: dict, file: TextIO): 81 | print("start", "end", "text", sep="\t", file=file) 82 | for segment in result["segments"]: 83 | print(round(1000 * segment.start), file=file, end="\t") 84 | print(round(1000 * segment.end), file=file, end="\t") 85 | print(segment.text.strip().replace("\t", " "), file=file, flush=True) 86 | 87 | 88 | class WriteJSON(ResultWriter): 89 | extension: str = "json" 90 | 91 | def write_result(self, result: dict, file: TextIO): 92 | if "segments" in result: 93 | result["segments"] = [asdict(segment) for segment in result["segments"]] 94 | json.dump(result, file) 95 | 96 | 97 | def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE): 98 | """ 99 | Open an audio file object and read as mono waveform, resampling as necessary. 100 | Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object 101 | Parameters 102 | ---------- 103 | file: BinaryIO 104 | The audio file like object 105 | encode: Boolean 106 | If true, encode audio stream to WAV before sending to whisper 107 | sr: int 108 | The sample rate to resample the audio if necessary 109 | Returns 110 | ------- 111 | A NumPy array containing the audio waveform, in float32 dtype. 112 | """ 113 | if encode: 114 | try: 115 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 116 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
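            # The command below emits 16-bit little-endian mono PCM at the requested sample
            # rate; np.frombuffer(...) then scales it to float32 in the [-1.0, 1.0] range.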
117 | out, _ = ( 118 | ffmpeg.input("pipe:", threads=0) 119 | .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) 120 | .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read()) 121 | ) 122 | except ffmpeg.Error as e: 123 | raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e 124 | else: 125 | out = file.read() 126 | 127 | return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 128 | -------------------------------------------------------------------------------- /app/webservice.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import os 3 | from os import path 4 | from typing import Annotated, Optional, Union 5 | from urllib.parse import quote 6 | 7 | import click 8 | import uvicorn 9 | from fastapi import FastAPI, File, Query, UploadFile, applications 10 | from fastapi.openapi.docs import get_swagger_ui_html 11 | from fastapi.responses import RedirectResponse, StreamingResponse 12 | from fastapi.staticfiles import StaticFiles 13 | from whisper import tokenizer 14 | 15 | from app.config import CONFIG 16 | from app.factory.asr_model_factory import ASRModelFactory 17 | from app.utils import load_audio 18 | 19 | asr_model = ASRModelFactory.create_asr_model() 20 | asr_model.load_model() 21 | 22 | LANGUAGE_CODES = sorted(tokenizer.LANGUAGES.keys()) 23 | 24 | projectMetadata = importlib.metadata.metadata("whisper-asr-webservice") 25 | app = FastAPI( 26 | title=projectMetadata["Name"].title().replace("-", " "), 27 | description=projectMetadata["Summary"], 28 | version=projectMetadata["Version"], 29 | contact={"url": projectMetadata["Home-page"]}, 30 | swagger_ui_parameters={"defaultModelsExpandDepth": -1}, 31 | license_info={"name": "MIT License", "url": "https://github.com/ahmetoner/whisper-asr-webservice/blob/main/LICENCE"}, 32 | ) 33 | 34 | assets_path = os.getcwd() + "/swagger-ui-assets" 35 | if path.exists(assets_path + "/swagger-ui.css") and path.exists(assets_path + "/swagger-ui-bundle.js"): 36 | app.mount("/assets", StaticFiles(directory=assets_path), name="static") 37 | 38 | def swagger_monkey_patch(*args, **kwargs): 39 | return get_swagger_ui_html( 40 | *args, 41 | **kwargs, 42 | swagger_favicon_url="", 43 | swagger_css_url="/assets/swagger-ui.css", 44 | swagger_js_url="/assets/swagger-ui-bundle.js", 45 | ) 46 | 47 | applications.get_swagger_ui_html = swagger_monkey_patch 48 | 49 | 50 | @app.get("/", response_class=RedirectResponse, include_in_schema=False) 51 | async def index(): 52 | return "/docs" 53 | 54 | 55 | @app.post("/asr", tags=["Endpoints"]) 56 | async def asr( 57 | audio_file: UploadFile = File(...), # noqa: B008 58 | encode: bool = Query(default=True, description="Encode audio first through ffmpeg"), 59 | task: Union[str, None] = Query(default="transcribe", enum=["transcribe", "translate"]), 60 | language: Union[str, None] = Query(default=None, enum=LANGUAGE_CODES), 61 | initial_prompt: Union[str, None] = Query(default=None), 62 | vad_filter: Annotated[ 63 | bool | None, 64 | Query( 65 | description="Enable the voice activity detection (VAD) to filter out parts of the audio without speech", 66 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 67 | ), 68 | ] = False, 69 | word_timestamps: bool = Query( 70 | default=False, 71 | description="Word level timestamps", 72 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 73 | ), 74 | diarize: bool = Query( 75 | default=False, 76 | 
description="Diarize the input", 77 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" and CONFIG.HF_TOKEN != "" else False), 78 | ), 79 | min_speakers: Union[int, None] = Query( 80 | default=None, 81 | description="Min speakers in this file", 82 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 83 | ), 84 | max_speakers: Union[int, None] = Query( 85 | default=None, 86 | description="Max speakers in this file", 87 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 88 | ), 89 | output: Union[str, None] = Query(default="txt", enum=["txt", "vtt", "srt", "tsv", "json"]), 90 | ): 91 | result = asr_model.transcribe( 92 | load_audio(audio_file.file, encode), 93 | task, 94 | language, 95 | initial_prompt, 96 | vad_filter, 97 | word_timestamps, 98 | {"diarize": diarize, "min_speakers": min_speakers, "max_speakers": max_speakers}, 99 | output, 100 | ) 101 | return StreamingResponse( 102 | result, 103 | media_type="text/plain", 104 | headers={ 105 | "Asr-Engine": CONFIG.ASR_ENGINE, 106 | "Content-Disposition": f'attachment; filename="{quote(audio_file.filename)}.{output}"', 107 | }, 108 | ) 109 | 110 | 111 | @app.post("/detect-language", tags=["Endpoints"]) 112 | async def detect_language( 113 | audio_file: UploadFile = File(...), # noqa: B008 114 | encode: bool = Query(default=True, description="Encode audio first through FFmpeg"), 115 | ): 116 | detected_lang_code, confidence = asr_model.language_detection(load_audio(audio_file.file, encode)) 117 | return { 118 | "detected_language": tokenizer.LANGUAGES[detected_lang_code], 119 | "language_code": detected_lang_code, 120 | "confidence": confidence, 121 | } 122 | 123 | 124 | @click.command() 125 | @click.option( 126 | "-h", 127 | "--host", 128 | metavar="HOST", 129 | default="0.0.0.0", 130 | help="Host for the webservice (default: 0.0.0.0)", 131 | ) 132 | @click.option( 133 | "-p", 134 | "--port", 135 | metavar="PORT", 136 | default=9000, 137 | help="Port for the webservice (default: 9000)", 138 | ) 139 | @click.version_option(version=projectMetadata["Version"]) 140 | def start(host: str, port: Optional[int] = None): 141 | uvicorn.run(app, host=host, port=port) 142 | 143 | 144 | if __name__ == "__main__": 145 | start() 146 | -------------------------------------------------------------------------------- /docker-compose.gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice-gpu: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile.gpu 8 | deploy: 9 | resources: 10 | reservations: 11 | devices: 12 | - driver: nvidia 13 | count: 1 14 | capabilities: [gpu] 15 | environment: 16 | - ASR_MODEL=base 17 | ports: 18 | - "9000:9000" 19 | volumes: 20 | - ./app:/app/app 21 | - cache-whisper:/root/.cache 22 | 23 | volumes: 24 | cache-whisper: 25 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile 8 | environment: 9 | - ASR_MODEL=base 10 | ports: 11 | - "9000:9000" 12 | volumes: 13 | - ./app:/app/app 14 | - cache-whisper:/root/.cache 15 | 16 | volumes: 17 | cache-whisper: 18 | -------------------------------------------------------------------------------- /docs/.overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | -------------------------------------------------------------------------------- /docs/assets/css/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #3d6178; 3 | --md-primary-fg-color--light: #3d6178; 4 | --md-primary-fg-color--dark: #3d6178; 5 | } 6 | -------------------------------------------------------------------------------- /docs/assets/images/swagger-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmetoner/whisper-asr-webservice/ea12224ef33941d69aa1ba9fca01d95ecea7e8e7/docs/assets/images/swagger-ui.png -------------------------------------------------------------------------------- /docs/build.md: -------------------------------------------------------------------------------- 1 | ## Development Environment 2 | 3 | Install poetry v2.X with following command: 4 | 5 | ```shell 6 | pip3 install poetry 7 | ``` 8 | 9 | ### Installation 10 | 11 | Install dependencies for cpu 12 | 13 | ```shell 14 | poetry install --extras cpu 15 | ``` 16 | 17 | Install dependencies for cuda 18 | 19 | ```shell 20 | poetry install --extras cuda 21 | ``` 22 | 23 | !!! Note 24 | By default, this will install the CPU version of PyTorch. For GPU support, you'll need to install the appropriate CUDA version of PyTorch separately: 25 | ```shell 26 | # For CUDA support (example for CUDA 11.8): 27 | pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu121 28 | ``` 29 | 30 | ### Run 31 | 32 | Starting the Webservice: 33 | 34 | ```shell 35 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 36 | ``` 37 | 38 | ### Build 39 | 40 | === ":octicons-file-code-16: `Docker`" 41 | 42 | With `Dockerfile`: 43 | 44 | === ":octicons-file-code-16: `CPU`" 45 | 46 | ```shell 47 | # Build Image 48 | docker build -t whisper-asr-webservice . 49 | 50 | # Run Container 51 | docker run -d -p 9000:9000 whisper-asr-webservice 52 | # or with specific model 53 | docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice 54 | ``` 55 | 56 | === ":octicons-file-code-16: `GPU`" 57 | 58 | ```shell 59 | # Build Image 60 | docker build -f Dockerfile.gpu -t whisper-asr-webservice-gpu . 
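    # Note: the `--gpus all` flag used below assumes the NVIDIA Container Toolkit
    # is installed and configured on the host.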
61 | 62 | # Run Container 63 | docker run -d --gpus all -p 9000:9000 whisper-asr-webservice-gpu 64 | # or with specific model 65 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice-gpu 66 | ``` 67 | 68 | With `docker-compose`: 69 | 70 | === ":octicons-file-code-16: `CPU`" 71 | 72 | ```shell 73 | docker-compose up --build 74 | ``` 75 | 76 | === ":octicons-file-code-16: `GPU`" 77 | 78 | ```shell 79 | docker-compose -f docker-compose.gpu.yml up --build 80 | ``` 81 | === ":octicons-file-code-16: `Poetry`" 82 | 83 | Build .whl package 84 | 85 | ```shell 86 | poetry build 87 | ``` -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | --8<-- "CHANGELOG.md" 2 | -------------------------------------------------------------------------------- /docs/endpoints.md: -------------------------------------------------------------------------------- 1 | ## Quick start 2 | 3 | After running the docker image interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs) 4 | 5 | There are 2 endpoints available: 6 | 7 | - [/asr](##Automatic-Speech-recognition-service-/asr) (Automatic Speech Recognition) 8 | - [/detect-language](##Language-detection-service-/detect-language) 9 | 10 | ## Automatic speech recognition service /asr 11 | 12 | - 2 task choices: 13 | - **transcribe**: (default) task, transcribes the uploaded file. 14 | - **translate**: will provide an English transcript no matter which language was spoken. 15 | - Files are automatically converted with FFmpeg. 16 | - Full list of supported [audio](https://ffmpeg.org/general.html#Audio-Codecs) and [video](https://ffmpeg.org/general.html#Video-Codecs) formats. 17 | - You can enable word level timestamps output by `word_timestamps` parameter 18 | - You can Enable the voice activity detection (VAD) to filter out parts of the audio without speech by `vad_filter` parameter (only with `Faster Whisper` for now). 19 | 20 | ### Request URL Query Params 21 | 22 | | Name | Values | Description | 23 | |-----------------|------------------------------------------------|----------------------------------------------------------------| 24 | | audio_file | File | Audio or video file to transcribe | 25 | | output | `text` (default), `json`, `vtt`, `srt`, `tsv` | Output format | 26 | | task | `transcribe`, `translate` | Task type - transcribe in source language or translate to English | 27 | | language | `en` (default is auto recognition) | Source language code (see supported languages) | 28 | | word_timestamps | false (default) | Enable word-level timestamps (Faster Whisper only) | 29 | | vad_filter | false (default) | Enable voice activity detection filtering (Faster Whisper only) | 30 | | encode | true (default) | Encode audio through FFmpeg before processing | 31 | | diarize | false (default) | Enable speaker diarization (WhisperX only) | 32 | | min_speakers | null (default) | Minimum number of speakers for diarization (WhisperX only) | 33 | | max_speakers | null (default) | Maximum number of speakers for diarization (WhisperX only) | 34 | 35 | Example request with cURL 36 | 37 | ```bash 38 | curl -X POST -H "content-type: multipart/form-data" -F "audio_file=@/path/to/file" 0.0.0.0:9000/asr?output=json 39 | ``` 40 | 41 | ### Response (JSON) 42 | 43 | - **text**: Contains the full transcript 44 | - **segments**: Contains an entry per segment. 
Each entry provides `timestamps`, `transcript`, `token ids`, `word level timestamps`, and other metadata
45 | - **language**: Detected or provided language (as a language code)
46 | 
47 | ### Response Formats
48 | 
49 | The API supports multiple output formats:
50 | 
51 | - **text**: Plain text transcript (default)
52 | - **json**: Detailed JSON with segments, timestamps, and metadata
53 | - **vtt**: WebVTT subtitle format
54 | - **srt**: SubRip subtitle format
55 | - **tsv**: Tab-separated values with timestamps
56 | 
57 | ### Supported Languages
58 | 
59 | The service supports all languages supported by Whisper. Some common language codes:
60 | 
61 | - Turkish (tr)
62 | - English (en)
63 | - Spanish (es)
64 | - French (fr)
65 | - German (de)
66 | - Italian (it)
67 | - Portuguese (pt)
68 | - And many more...
69 | 
70 | See the [Whisper documentation](https://github.com/openai/whisper#available-models-and-languages) for the full list of supported languages.
71 | 
72 | ### Speaker Diarization
73 | 
74 | When using the WhisperX engine with diarization enabled (`diarize=true`), the output will include speaker labels for each segment. This requires:
75 | 
76 | 1. The WhisperX engine to be configured
77 | 2. A valid Hugging Face token set in `HF_TOKEN`
78 | 3. Sufficient memory for the diarization models
79 | 
80 | You can optionally specify `min_speakers` and `max_speakers` if you know the expected number of speakers.
81 | 
82 | ## Language detection service /detect-language
83 | 
84 | Detects the language spoken in the uploaded file. Only the first 30 seconds of audio are processed.
85 | 
86 | Returns a JSON response with the following fields:
87 | 
88 | - **detected_language**: Human-readable language name (e.g. "english")
89 | - **language_code**: ISO language code (e.g. "en")
90 | - **confidence**: Confidence score between 0 and 1 indicating detection reliability
91 | 
92 | Example response:
93 | 
94 | ```json
95 | {
96 | "detected_language": "english",
97 | "language_code": "en",
98 | "confidence": 0.98
99 | }
100 | ```
101 | 
--------------------------------------------------------------------------------
/docs/environmental-variables.md:
--------------------------------------------------------------------------------
1 | ### Configuring the `Engine`
2 | 
3 | === ":octicons-file-code-16: `openai_whisper`"
4 | 
5 | ```shell
6 | export ASR_ENGINE=openai_whisper
7 | ```
8 | 
9 | === ":octicons-file-code-16: `faster_whisper`"
10 | 
11 | ```shell
12 | export ASR_ENGINE=faster_whisper
13 | ```
14 | 
15 | === ":octicons-file-code-16: `whisperx`"
16 | 
17 | ```shell
18 | export ASR_ENGINE=whisperx
19 | ```
20 | 
21 | ### Configuring the `Model`
22 | 
23 | ```shell
24 | export ASR_MODEL=base
25 | ```
26 | 
27 | Available `ASR_MODEL` values are:
28 | 
29 | - Standard models: `tiny`, `base`, `small`, `medium`, `large-v1`, `large-v2`, `large-v3` (or `large`), `large-v3-turbo` (or `turbo`)
30 | - English-optimized models: `tiny.en`, `base.en`, `small.en`, `medium.en`
31 | - Distilled models: `distil-large-v2`, `distil-medium.en`, `distil-small.en`, `distil-large-v3` (only for whisperx and faster-whisper)
32 | 
33 | For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en`
34 | models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
35 | 
36 | The distilled models offer improved inference speed while maintaining good accuracy.
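For example, an engine that supports the distilled models can be paired with one of them when starting the container. A minimal sketch using the Docker image and the `ASR_ENGINE`/`ASR_MODEL` variables described above (the chosen values are illustrative, not a recommendation):

```shell
# Run the webservice with the faster_whisper engine and a distilled model
# (illustrative values; any engine/model combination listed above works the same way)
docker run -d -p 9000:9000 \
  -e ASR_ENGINE=faster_whisper \
  -e ASR_MODEL=distil-large-v3 \
  onerahmet/openai-whisper-asr-webservice:latest
```

The same variables can also be set in the `environment` section of the `docker-compose.yml` shown earlier.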
37 | 38 | ### Configuring the `Model Path` 39 | 40 | ```shell 41 | export ASR_MODEL_PATH=/data/whisper 42 | ``` 43 | 44 | ### Configuring the `Model Unloading Timeout` 45 | 46 | ```shell 47 | export MODEL_IDLE_TIMEOUT=300 48 | ``` 49 | 50 | Defaults to `0`. After no activity for this period (in seconds), unload the model until it is requested again. Setting 51 | `0` disables the timeout, keeping the model loaded indefinitely. 52 | 53 | ### Configuring the `SAMPLE_RATE` 54 | 55 | ```shell 56 | export SAMPLE_RATE=16000 57 | ``` 58 | 59 | Defaults to `16000`. Default sample rate for audio input. `16 kHz` is commonly used in `speech-to-text` tasks. 60 | 61 | ### Configuring Device and Quantization 62 | 63 | ```shell 64 | export ASR_DEVICE=cuda # or 'cpu' 65 | export ASR_QUANTIZATION=float32 # or 'float16', 'int8' 66 | ``` 67 | 68 | The `ASR_DEVICE` defaults to `cuda` if GPU is available, otherwise `cpu`. 69 | 70 | The `ASR_QUANTIZATION` defines the precision for model weights: 71 | 72 | - `float32`: 32-bit floating-point precision (higher precision, slower inference) 73 | - `float16`: 16-bit floating-point precision (lower precision, faster inference) 74 | - `int8`: 8-bit integer precision (lowest precision, fastest inference) 75 | 76 | Defaults to `float32` for GPU, `int8` for CPU. 77 | 78 | ### Configuring Subtitle Options (WhisperX) 79 | 80 | ```shell 81 | export SUBTITLE_MAX_LINE_WIDTH=1000 82 | export SUBTITLE_MAX_LINE_COUNT=2 83 | export SUBTITLE_HIGHLIGHT_WORDS=false 84 | ``` 85 | 86 | These options only apply when using the WhisperX engine: 87 | 88 | - `SUBTITLE_MAX_LINE_WIDTH`: Maximum width of subtitle lines (default: 1000) 89 | - `SUBTITLE_MAX_LINE_COUNT`: Maximum number of lines per subtitle (default: 2) 90 | - `SUBTITLE_HIGHLIGHT_WORDS`: Enable word highlighting in subtitles (default: false) 91 | 92 | ### Hugging Face Token 93 | 94 | ```shell 95 | export HF_TOKEN=your_token_here 96 | ``` 97 | 98 | Required when using the WhisperX engine to download the diarization model. 99 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitask model that can perform multilingual speech recognition as well as speech translation and language identification. 2 | 3 | !!! tip "Join our Discord Community!" 
4 | 🎉 **Connect with other users, get help, and stay updated on the latest features!**
5 | [Join our Discord Server](https://discord.gg/4Q5YVrePzZ){target=_blank}
6 | 
7 | ## Features
8 | 
9 | The current release (v1.9.1) supports the following Whisper models:
10 | 
11 | - [openai/whisper](https://github.com/openai/whisper)@[v20250625](https://github.com/openai/whisper/releases/tag/v20250625)
12 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.1)
13 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.4.2](https://github.com/m-bain/whisperX/releases/tag/v3.4.2)
14 | 
15 | ## Quick Usage
16 | 
17 | === ":octicons-file-code-16: `CPU`"
18 | 
19 | ```shell
20 | docker run -d -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest
21 | ```
22 | 
23 | === ":octicons-file-code-16: `GPU`"
24 | 
25 | ```shell
26 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest-gpu
27 | ```
28 | 
29 | For more information:
30 | 
31 | - [Documentation/Run](https://ahmetoner.github.io/whisper-asr-webservice/run)
32 | - [Docker Hub](https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice)
33 | 
34 | ## Credits
35 | 
36 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html)
37 | 
--------------------------------------------------------------------------------
/docs/licence.md:
--------------------------------------------------------------------------------
1 | # Licence
2 | 
3 | ```
4 | --8<-- "LICENCE"
5 | ```
6 | 
--------------------------------------------------------------------------------
/docs/run.md:
--------------------------------------------------------------------------------
1 | ## Usage
2 | 
3 | Whisper ASR Webservice is now available on Docker Hub. You can find the latest CPU and GPU images of this project on Docker Hub.
4 | 
5 | Docker Hub: [onerahmet/openai-whisper-asr-webservice](https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice)
6 | 
7 | === ":octicons-file-code-16: `CPU`"
8 | 
9 | ```shell
10 | docker pull onerahmet/openai-whisper-asr-webservice:latest
11 | docker run -d -p 9000:9000 \
12 | -e ASR_MODEL=base \
13 | -e ASR_ENGINE=openai_whisper \
14 | onerahmet/openai-whisper-asr-webservice:latest
15 | ```
16 | 
17 | === ":octicons-file-code-16: `CPU (macOS)`"
18 | 
19 | > GPU passthrough does not work on macOS due to fundamental design limitations of Docker. Docker actually runs containers within a Linux VM on macOS. If you wish to run GPU-accelerated containers, I'm afraid Linux is your only option.
20 | >
21 | > The `:latest` image tag provides both amd64 and arm64 architectures:
22 | 
23 | ```shell
24 | docker pull onerahmet/openai-whisper-asr-webservice:latest
25 | docker run -d -p 9000:9000 \
26 | -e ASR_MODEL=base \
27 | -e ASR_ENGINE=openai_whisper \
28 | onerahmet/openai-whisper-asr-webservice:latest
29 | ```
30 | 
31 | === ":octicons-file-code-16: `GPU`"
32 | 
33 | ```shell
34 | docker pull onerahmet/openai-whisper-asr-webservice:latest-gpu
35 | docker run -d --gpus all -p 9000:9000 \
36 | -e ASR_MODEL=base \
37 | -e ASR_ENGINE=openai_whisper \
38 | onerahmet/openai-whisper-asr-webservice:latest-gpu
39 | ```
40 | 
41 | ### Environment Variables
42 | 
43 | The following environment variables can be used to configure the service:
44 | 
45 | - `ASR_MODEL`: Whisper model to use (e.g. tiny, base, small, medium, large) [default: base]
46 | - `ASR_ENGINE`: ASR engine to use (openai_whisper, faster_whisper, whisperx) [default: openai_whisper]
47 | - `ASR_MODEL_PATH`: Custom path to store/load model files [optional]
48 | 
49 | > Interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs)
50 | 
51 | ![Swagger UI](assets/images/swagger-ui.png)
52 | 
53 | ## Cache
54 | 
55 | The ASR model is downloaded each time you start the container. Using the large model can take significant time to download.
56 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory to local storage.
57 | The model will then be loaded from the cache instead of being downloaded again on subsequent container starts.
58 | 
59 | **Important: Using a persistent cache will prevent you from receiving model updates.**
60 | 
61 | === ":octicons-file-code-16: `Default cache dir`"
62 | 
63 | ```shell
64 | docker run -d -p 9000:9000 \
65 | -v $PWD/cache:/root/.cache \
66 | onerahmet/openai-whisper-asr-webservice:latest
67 | ```
68 | 
69 | === ":octicons-file-code-16: `With ASR_MODEL_PATH`"
70 | 
71 | ```shell
72 | docker run -d -p 9000:9000 \
73 | -e ASR_MODEL_PATH=/data/whisper \
74 | -v $PWD/cache:/data/whisper \
75 | onerahmet/openai-whisper-asr-webservice:latest
76 | ```
77 | 
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Whisper ASR Webservice
2 | site_url: https://ahmetoner.github.io/whisper-asr-webservice
3 | site_dir: public
4 | 
5 | site_description: "OpenAI Whisper ASR Webservice API"
6 | repo_url: "https://github.com/ahmetoner/whisper-asr-webservice"
7 | repo_name: "ahmetoner/whisper-asr-webservice"
8 | copyright: Copyright © 2025
9 | edit_uri: edit/main/docs/
10 | 
11 | validation:
12 | omitted_files: warn
13 | absolute_links: warn
14 | unrecognized_links: warn
15 | 
16 | nav:
17 | - Overview: index.md
18 | - Installation & Usage: run.md
19 | - API Endpoints: endpoints.md
20 | - Configuration: environmental-variables.md
21 | - Development: build.md
22 | - Changelog: changelog.md
23 | - License: licence.md
24 | - Releases: https://github.com/ahmetoner/whisper-asr-webservice/releases
25 | - Docker Hub: https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice
26 | 
27 | theme:
28 | name: material
29 | custom_dir: docs/.overrides
30 | icon:
31 | logo: material/subtitles
32 | features:
33 | - announce.dismiss
34 | - content.action.edit
35 | - content.action.view
36 | - content.code.annotate
37 | - content.code.copy
38 | - content.tooltips
39 | - navigation.footer
40 | - navigation.indexes
41 | # - navigation.sections # important
42 | - navigation.top
43 | # - 
navigation.tabs 44 | # - navigation.tabs.sticky 45 | - search.highlight 46 | - search.suggest 47 | - toc.follow 48 | - toc.integrate 49 | palette: 50 | # System preference 51 | - media: "(prefers-color-scheme)" 52 | toggle: 53 | icon: material/brightness-auto 54 | name: Switch to light mode 55 | # Light mode 56 | - media: "(prefers-color-scheme: light)" 57 | scheme: default 58 | primary: custom 59 | accent: teal 60 | toggle: 61 | icon: material/brightness-7 62 | name: Switch to dark mode 63 | # Dark mode 64 | - media: "(prefers-color-scheme: dark)" 65 | scheme: slate 66 | primary: black 67 | accent: lime 68 | toggle: 69 | icon: material/brightness-4 70 | name: Switch to system preference 71 | 72 | 73 | 74 | extra_css: 75 | - assets/css/extra.css 76 | markdown_extensions: 77 | - attr_list 78 | - admonition 79 | - footnotes 80 | - pymdownx.emoji: 81 | emoji_index: !!python/name:materialx.emoji.twemoji 82 | emoji_generator: !!python/name:materialx.emoji.to_svg 83 | - pymdownx.magiclink 84 | - pymdownx.snippets: 85 | check_paths: true 86 | dedent_subsections: true 87 | - pymdownx.superfences 88 | - pymdownx.tabbed: 89 | alternate_style: true 90 | slugify: !!python/object/apply:pymdownx.slugs.slugify 91 | kwds: 92 | case: lower 93 | - pymdownx.tasklist: 94 | custom_checkbox: true 95 | - toc: 96 | permalink: "¶" 97 | - pymdownx.superfences: 98 | custom_fences: 99 | - name: mermaid 100 | class: mermaid 101 | format: !!python/name:pymdownx.superfences.fence_code_format 102 | 103 | plugins: 104 | - search 105 | 106 | extra: 107 | generator: false 108 | social: 109 | - icon: fontawesome/brands/github 110 | link: https://github.com/ahmetoner 111 | - icon: fontawesome/brands/docker 112 | link: https://hub.docker.com/u/onerahmet 113 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "whisper-asr-webservice" 3 | version = "1.10.0-dev" 4 | description = "Whisper ASR Webservice is a general-purpose speech recognition webservice." 
5 | requires-python = ">=3.10,<3.13" 6 | dependencies = [ 7 | "fastapi (>=0.115.14)", 8 | "uvicorn[standard] (>=0.35.0)", 9 | "python-multipart (>=0.0.20)", 10 | "ffmpeg-python (>=0.2.0)", 11 | "numpy (>=2.2.6)", 12 | "openai-whisper (>=20250625)", 13 | "faster-whisper (>=1.1.1)", 14 | "whisperx (>=3.4.2)", 15 | "tqdm (>=4.67.1)", 16 | "llvmlite (>=0.44.0)", 17 | "numba (>=0.61.2)", 18 | ] 19 | authors = [ 20 | { name = "Ahmet Öner" }, 21 | { name = "Besim Alibegovic" } 22 | ] 23 | license = { text = "MIT" } 24 | readme = "README.md" 25 | keywords = ["speech-recognition", "whisper", "asr", "webservice"] 26 | 27 | [tool.poetry] 28 | requires-poetry = ">=2.0" 29 | packages = [{ include = "app" }] 30 | 31 | [project.urls] 32 | Homepage = "https://github.com/ahmetoner/whisper-asr-webservice/" 33 | Repository = "https://github.com/ahmetoner/whisper-asr-webservice" 34 | 35 | [project.scripts] 36 | whisper-asr-webservice = "app.webservice:start" 37 | 38 | [project.optional-dependencies] 39 | cpu = [ 40 | "torch (==2.7.1)", 41 | "torchaudio (==2.7.1)" 42 | ] 43 | cuda = [ 44 | "torch (==2.7.1+cu126)", 45 | "torchaudio (==2.7.1+cu126)" 46 | ] 47 | 48 | [[tool.poetry.source]] 49 | name = "pytorch-cpu" 50 | url = "https://download.pytorch.org/whl/cpu" 51 | priority = "explicit" 52 | 53 | [[tool.poetry.source]] 54 | name = "pytorch-cuda" 55 | url = "https://download.pytorch.org/whl/cu126" 56 | priority = "explicit" 57 | 58 | [tool.poetry.dependencies] 59 | torch = [ 60 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cpu"}, 61 | { markers = "extra == 'cuda' and extra != 'cpu' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cuda"}, 62 | { markers = "extra == 'cpu' and extra != 'cuda' and sys_platform == 'darwin'", source = "pypi"}, 63 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'aarch64' and sys_platform != 'darwin'", source = "pypi"} 64 | ] 65 | torchaudio = [ 66 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cpu"}, 67 | { markers = "extra == 'cuda' and extra != 'cpu' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cuda"}, 68 | { markers = "extra == 'cpu' and extra != 'cuda' and sys_platform == 'darwin'", source = "pypi"}, 69 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'aarch64' and sys_platform != 'darwin'", source = "pypi"} 70 | ] 71 | 72 | 73 | [tool.poetry.group.dev.dependencies] 74 | pytest = ">=8.3.4,<9.0.0" 75 | ruff = ">=0.9.6,<1.0.0" 76 | black = ">=25.1.0,<26.0.0" 77 | mkdocs-material = ">=9.6.4,<10.0.0" 78 | pymdown-extensions = ">=10.14.3,<11.0.0" 79 | 80 | [build-system] 81 | requires = ["poetry-core>=2.0"] 82 | build-backend = "poetry.core.masonry.api" 83 | 84 | [tool.black] 85 | skip-string-normalization = true 86 | line-length = 120 87 | 88 | [tool.ruff] 89 | line-length = 120 90 | 91 | [tool.ruff.lint] 92 | select = [ 93 | "E", # pycodestyle errors 94 | "W", # pycodestyle warnings 95 | "F", # pyflakes 96 | "I", # isort 97 | "C", # flake8-comprehensions 98 | "B", # flake8-bugbear 99 | ] 100 | ignore = [ 101 | "E501", # line too long, handled by black 102 | "C901", # too complex 103 | ] 104 | 105 | [tool.ruff.lint.isort] 106 | order-by-type = true 107 | relative-imports-order = "closest-to-furthest" 108 | extra-standard-library = ["typing"] 109 | section-order = [ 110 | "future", 111 | 
"standard-library", 112 | "third-party", 113 | "first-party", 114 | "local-folder", 115 | ] 116 | known-first-party = [] 117 | --------------------------------------------------------------------------------