├── .dockerignore ├── .github ├── FUNDING.yml └── workflows │ ├── docker-publish.yml │ └── documentation.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.gpu ├── LICENCE ├── README.md ├── app ├── asr_models │ ├── asr_model.py │ ├── faster_whisper_engine.py │ ├── mbain_whisperx_engine.py │ └── openai_whisper_engine.py ├── config.py ├── factory │ └── asr_model_factory.py ├── utils.py └── webservice.py ├── docker-compose.gpu.yml ├── docker-compose.yml ├── docs ├── .overrides │ └── main.html ├── assets │ ├── css │ │ └── extra.css │ └── images │ │ └── swagger-ui.png ├── build.md ├── changelog.md ├── endpoints.md ├── environmental-variables.md ├── index.md ├── licence.md └── run.md ├── mkdocs.yml ├── poetry.lock └── pyproject.toml /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .venv 3 | venv -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [ahmetoner] 4 | custom: ['https://bmc.link/ahmetoner'] 5 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker Image 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - debug 8 | 9 | env: 10 | DOCKER_USER: ${{secrets.DOCKER_USER}} 11 | DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}} 12 | REPO_NAME: ${{secrets.REPO_NAME}} 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | include: 19 | - docker_file: Dockerfile 20 | platforms: linux/arm64,linux/amd64 21 | - docker_file: Dockerfile.gpu 22 | tag_extension: -gpu 23 | platforms: linux/amd64 24 | steps: 25 | - name: Checkout 26 | uses: actions/checkout@v3 27 | - name: Free up disk space 28 | run: | 29 | sudo rm -rf /usr/share/dotnet 30 | sudo rm -rf /opt/ghc 31 | sudo rm -rf "/usr/local/share/boost" 32 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 33 | - name: Set up QEMU 34 | uses: docker/setup-qemu-action@v1 35 | - name: Set up Docker Buildx 36 | uses: docker/setup-buildx-action@v1 37 | - name: Login to DockerHub 38 | uses: docker/login-action@v1 39 | with: 40 | username: ${{ secrets.DOCKER_USER }} 41 | password: ${{ secrets.DOCKER_PASSWORD }} 42 | - name: Build and Publish the Docker debug image 43 | if: github.ref == 'refs/heads/debug' 44 | run: | 45 | DOCKER_IMAGE_DEBUG=$DOCKER_USER/$REPO_NAME:debug${{ matrix.tag_extension }} 46 | docker buildx build . --no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_DEBUG}" -f ${{ matrix.docker_file }} --push 47 | - name: Build and Publish the Docker image 48 | if: github.ref != 'refs/heads/debug' 49 | run: | 50 | DOCKER_IMAGE_LATEST=$DOCKER_USER/$REPO_NAME:latest${{ matrix.tag_extension }} 51 | DOCKER_IMAGE_VERSION=$DOCKER_USER/$REPO_NAME:$GITHUB_REF_NAME${{ matrix.tag_extension }} 52 | docker buildx build . 
--no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_LATEST}" -t "${DOCKER_IMAGE_VERSION}" -f ${{ matrix.docker_file }} --push 53 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - docs 8 | permissions: 9 | contents: write 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | if: github.event.repository.fork == false 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v4 17 | with: 18 | python-version: 3.x 19 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 20 | - uses: actions/cache@v3 21 | with: 22 | key: mkdocs-material-${{ env.cache_id }} 23 | path: .cache 24 | restore-keys: | 25 | mkdocs-material- 26 | - run: pip install mkdocs-material pymdown-extensions 27 | - run: mkdocs gh-deploy --force 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | # Packages 4 | *.egg 5 | !/tests/**/*.egg 6 | /*.egg-info 7 | /dist/* 8 | build 9 | _build 10 | .cache 11 | *.so 12 | venv 13 | 14 | # Installer logs 15 | pip-log.txt 16 | 17 | # Unit test / coverage reports 18 | .coverage 19 | .pytest_cache 20 | 21 | .DS_Store 22 | .idea/* 23 | .python-version 24 | .vscode/* 25 | 26 | /test.py 27 | /test_*.* 28 | 29 | /setup.cfg 30 | MANIFEST.in 31 | /setup.py 32 | /docs/site/* 33 | /tests/fixtures/simple_project/setup.py 34 | /tests/fixtures/project_with_extras/setup.py 35 | .mypy_cache 36 | 37 | .venv 38 | /releases/* 39 | pip-wheel-metadata 40 | /poetry.toml 41 | 42 | poetry/core/* 43 | 44 | public 45 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Unreleased 5 | ---------- 6 | 7 | [1.9.1] (2025-07-01) 8 | -------------------- 9 | 10 | ### Fixed 11 | 12 | - Fixed Whisperx diarization pipeline initialization 13 | - Fixed Whisperx language detection 14 | 15 | [1.9.0] (2025-06-29) 16 | -------------------- 17 | 18 | ### Changed 19 | 20 | - Upgraded 21 | - Poetry to v2.1.3 22 | - [openai/whisper](https://github.com/openai/whisper)@[v20250625](https://github.com/openai/whisper/releases/tag/v20250625) 23 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.1.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.1) 24 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.4.2](https://github.com/m-bain/whisperX/releases/tag/v3.4.2) 25 | - torch to v2.7.1 26 | - torchaudio to v2.7.1 27 | - numpy to v2.2.6 28 | - fastapi to v0.115.14 29 | - uvicorn to v0.35.0 30 | - numba to v0.61.2 31 | 32 | [1.8.2] (2025-02-18) 33 | -------------------- 34 | 35 | ### Changed 36 | 37 | - Reduced GPU image size by using `nvidia/cuda:12.6.3-base-ubuntu22.04` 38 | 39 | [1.8.1] (2025-02-18) 40 | -------------------- 41 | 42 | ### Fixed 43 | 44 | - Fixed issues with Torch CUDA and cuDNN 45 | - Updated Torch and Torchaudio dependencies for multi-architecture support 46 | 47 | [1.8.0] (2025-02-17) 48 | -------------------- 49 | 50 | ### Added 51 | 52 | - Added support for [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1) 53 | 54 | ### Changed 55 | 56 | - 
Upgraded Cuda GPU image to v12.6.3 57 | - Upgraded dependencies 58 | - torch to v2.6.0 59 | - fastapi to v0.115.8 60 | - llvmlite to v0.44.0 61 | - numba to v0.61.0 62 | - ruff to v0.9.6 63 | - black to v25.1.0 64 | - mkdocs-material to v9.6.4 65 | - pymdown-extensions to v10.14.3 66 | 67 | [1.7.1] (2024-12-18) 68 | -------------------- 69 | 70 | ### Fixed 71 | 72 | - Fix JSON serialization of segments due to Faster Whisper v1.1.0 changes 73 | 74 | [1.7.0] (2024-12-17) 75 | -------------------- 76 | 77 | ### Added 78 | 79 | - Timeout configured to allow model to be unloaded when idle 80 | - Added detection confidence to langauge detection endpoint 81 | - Set mel generation to adjust n_dims automatically to match the loaded model 82 | - Refactor classes, Add comments, implement abstract methods, and add factory method for engine selection 83 | 84 | ### Changed 85 | 86 | - Upgraded 87 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0) 88 | - uvicorn to v0.34.0 89 | - tqdm to v4.67.1 90 | - python-multipart to v0.0.20 91 | - fastapi to v0.115.6 92 | - pytest to v8.3.4 93 | - ruff to v0.8.3 94 | - black to v24.10.0 95 | - mkdocs to v1.6.1 96 | - mkdocs-material to v9.5.49 97 | - pymdown-extensions to v10.12 98 | 99 | [1.6.0] (2024-10-06) 100 | -------------------- 101 | 102 | ### Changed 103 | 104 | - Upgraded 105 | - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930) 106 | - fastapi to v0.115.0 107 | - uvicorn to v0.31.0 108 | - tqdm to v4.66.5 109 | - python-multipart to v0.0.12 110 | 111 | [1.5.0] (2024-07-04) 112 | -------------------- 113 | 114 | ### Changed 115 | 116 | - Upgraded 117 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.3) 118 | - fastapi to v0.111.0 119 | - uvicorn to v0.30.1 120 | - gunicorn to v22.0.0 121 | - tqdm to v4.66.4 122 | - llvmlite to v0.43.0 123 | - numba to v0.60.0 124 | 125 | [1.4.1] (2024-04-17) 126 | -------------------- 127 | 128 | ### Changed 129 | 130 | - Upgraded torch to v1.13.1 131 | 132 | [1.4.0] (2024-04-17) 133 | -------------------- 134 | 135 | ### Changed 136 | 137 | - Upgraded 138 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.1) 139 | - fastapi to v0.110.1 140 | - uvicorn to v0.29.0 141 | - gunicorn to v21.2.0 142 | - tqdm to v4.66.2 143 | - python-multipart to v0.0.9 144 | - llvmlite to v0.42.0 145 | - numba to v0.59.1 146 | 147 | [1.3.0] (2024-02-15) 148 | -------------------- 149 | 150 | ### Added 151 | 152 | - Compiled and added FFmpeg without LGPL libraries for license compliance 153 | 154 | [1.2.4] (2023-11-27) 155 | -------------------- 156 | 157 | ### Changed 158 | 159 | - Upgraded 160 | - [openai/whisper](https://github.com/openai/whisper) to [v20231117](https://github.com/openai/whisper/releases/tag/v20231117) 161 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v0.10.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v0.10.0) 162 | 163 | [1.2.3] (2023-11-07) 164 | -------------------- 165 | 166 | ### Changed 167 | 168 | - Upgraded 169 | - [openai/whisper](https://github.com/openai/whisper) to [v20231106](https://github.com/openai/whisper/releases/tag/v20231106) 170 | 171 | [1.2.2] (2023-11-03) 172 | -------------------- 173 | 174 | ### Fixed 175 | 
176 | - Fixed `swagger-ui` rendering issues by upgrading to `v5.9.1`, fixes #153 and #154 177 | 178 | [1.2.1] (2023-11-03) 179 | -------------------- 180 | 181 | ### Enabled 182 | 183 | - Enabled `vad_filter` for `faster-whisper` engine 184 | 185 | ### Changed 186 | 187 | - Changed misspelling in "Word level timestamps" 188 | - Removed unused unidecode dependency 189 | - Upgraded 190 | - uvicorn to v0.23.2 191 | - gunicorn to v21.0.1 192 | - tqdm to v4.66.1 193 | - python-multipart to v0.0.6 194 | - fastapi to v0.104.1 195 | - llvmlite to v0.41.1 196 | - numba to v0.58.0 197 | 198 | [1.2.0] (2023-10-01) 199 | -------------------- 200 | 201 | ### Changed 202 | 203 | - Upgraded 204 | - [openai/whisper](https://github.com/openai/whisper) to [v20230918](https://github.com/openai/whisper/releases/tag/v20230918) 205 | - [guillaumekln/faster-whisper](https://github.com/guillaumekln/faster-whisper) to [v0.9.0](https://github.com/guillaumekln/faster-whisper/releases/tag/v0.9.0) 206 | 207 | ### Updated 208 | 209 | - Updated model conversion method (for Faster Whisper) to use Hugging Face downloader 210 | - Updated default model paths to `~/.cache/whisper` or `/root/.cache/whisper`. 211 | - For customization, modify the `ASR_MODEL_PATH` environment variable. 212 | - Ensure Docker volume is set for the corresponding directory to use caching. 213 | 214 | ```bash 215 | docker run -d -p 9000:9000 -e ASR_MODEL_PATH=/data/whisper -v $PWD/yourlocaldir:/data/whisper onerahmet/openai-whisper-asr-webservice:latest 216 | ``` 217 | 218 | - Removed the `triton` dependency from `poetry.lock` to ensure the stability of the pipeline for `ARM-based` Docker images 219 | 220 | [1.1.1] (2023-05-29) 221 | -------------------- 222 | 223 | ### Changed 224 | 225 | - 94 gpus that don't support float16 in #103 226 | - Update compute type in #108 227 | - Add word level functionality for Faster Whisper in #109 228 | 229 | [1.1.0] (2023-04-17) 230 | -------------------- 231 | 232 | ### Changed 233 | 234 | - Docs in #72 235 | - Fix language code typo in #77 236 | - Adds support for FasterWhisper in #81 237 | - Add an optional param to skip the encoding step in #82 238 | - Faster whisper in #92 239 | 240 | [1.0.6] (2023-02-05) 241 | -------------------- 242 | 243 | ### Changed 244 | 245 | - Update README.md in #58 246 | - 68 update the versions in #69 247 | - Fix gunicorn run command and remove deprecated poetry run script in #70 248 | - Move torch installation method into the pyproject.toml file in #71 249 | - Add prompt to ASR in #66 250 | 251 | [1.0.5] (2022-12-08) 252 | -------------------- 253 | 254 | ### Changed 255 | 256 | - 43 make swagger doc not depend on internet connection in #52 257 | - Add new large model v2 in #53 258 | 259 | [1.0.4] (2022-11-28) 260 | -------------------- 261 | 262 | ### Changed 263 | 264 | - 43 make swagger doc not depend on internet connection in #51 265 | - Anally retentively fixed markdown linting warnings in README. Sorry. 
in #48 266 | - Explicit macOS readme with explanation for no-GPU [closes #44] in #47 267 | 268 | [1.0.3-beta] (2022-11-17) 269 | ------------------------- 270 | 271 | ### Changed 272 | 273 | - Combine transcribe endpoints in #36 274 | - Add multi worker support with gunicorn in #37 275 | - Add multi platform (amd & arm) support in #39 276 | - Upgrade Cuda version to 11.7 in #40 277 | - Lock to the latest whisper version (eff383) in #41 278 | 279 | [1.0.2-beta] (2022-10-04) 280 | ------------------------- 281 | 282 | ### Changed 283 | 284 | - add mutex lock to the model in #19 285 | - Subtitles in #21 286 | - Add gpu support and create Docker image for cuda with GitHub flow in #22 287 | 288 | [1.0.1-beta] (2022-09-27) 289 | ------------------------- 290 | 291 | ### Changed 292 | 293 | - Init GitHub runners in #10 294 | - Lock Whisper dependency with b4308... revision number to prevent build crashes in #15 295 | 296 | [1.0.0-beta] (2022-09-25) 297 | ------------------------- 298 | 299 | ### Changed 300 | 301 | - Docker init in #1 302 | - Create LICENCE in #2 303 | - Fastapi init in #3 304 | - Avoid temp file in #4 305 | - Translate init in #5 306 | - mp3 support by using FFmpeg instead of librosa in #8 307 | - add language detection endpoint in #9 308 | 309 | [1.9.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.9.1 310 | [1.9.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.9.0 311 | [1.8.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.2 312 | [1.8.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.1 313 | [1.8.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.0 314 | [1.7.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.1 315 | [1.7.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.0 316 | [1.6.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.6.0 317 | [1.5.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.5.0 318 | [1.4.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.1 319 | [1.4.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.0 320 | [1.3.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.3.0 321 | [1.2.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.4 322 | [1.2.3]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.3 323 | [1.2.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.2 324 | [1.2.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.1 325 | [1.2.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.0 326 | [1.1.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.1 327 | [1.1.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.0 328 | [1.0.6]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.6 329 | [1.0.5]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.5 330 | [1.0.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.4 331 | [1.0.3-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.3-beta 332 | [1.0.2-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.2-beta 333 | [1.0.1-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.1-beta 334 | [1.0.0-beta]: 
https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/1.0.0-beta 335 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM python:3.10-bookworm 6 | 7 | LABEL org.opencontainers.image.source="https://github.com/ahmetoner/whisper-asr-webservice" 8 | 9 | ENV POETRY_VENV=/app/.venv 10 | 11 | RUN python3 -m venv $POETRY_VENV \ 12 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 13 | && $POETRY_VENV/bin/pip install poetry==2.1.3 14 | 15 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 16 | 17 | WORKDIR /app 18 | 19 | COPY . . 20 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 21 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 22 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 23 | 24 | RUN poetry config virtualenvs.in-project true 25 | RUN poetry install --extras cpu 26 | 27 | EXPOSE 9000 28 | 29 | ENTRYPOINT ["whisper-asr-webservice"] 30 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 6 | 7 | LABEL org.opencontainers.image.source="https://github.com/ahmetoner/whisper-asr-webservice" 8 | 9 | ENV PYTHON_VERSION=3.10 10 | 11 | ENV POETRY_VENV=/app/.venv 12 | 13 | RUN export DEBIAN_FRONTEND=noninteractive \ 14 | && apt-get -qq update \ 15 | && apt-get -qq install --no-install-recommends \ 16 | python${PYTHON_VERSION} \ 17 | python${PYTHON_VERSION}-venv \ 18 | python3-pip \ 19 | libcudnn8 \ 20 | python3-pip \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \ 24 | ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \ 25 | ln -s -f /usr/bin/pip3 /usr/bin/pip 26 | 27 | RUN python3 -m venv $POETRY_VENV \ 28 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 29 | && $POETRY_VENV/bin/pip install poetry==2.1.3 30 | 31 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 32 | 33 | WORKDIR /app 34 | 35 | COPY . . 
36 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 37 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 38 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 39 | 40 | RUN poetry config virtualenvs.in-project true 41 | RUN poetry install --extras cuda 42 | 43 | EXPOSE 9000 44 | 45 | ENTRYPOINT ["whisper-asr-webservice"] 46 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ahmet Oner & Besim Alibegovic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Release](https://img.shields.io/github/v/release/ahmetoner/whisper-asr-webservice.svg) 2 | ![Docker Pulls](https://img.shields.io/docker/pulls/onerahmet/openai-whisper-asr-webservice.svg) 3 | ![Build](https://img.shields.io/github/actions/workflow/status/ahmetoner/whisper-asr-webservice/docker-publish.yml.svg) 4 | ![Licence](https://img.shields.io/github/license/ahmetoner/whisper-asr-webservice.svg) 5 | 6 | > 🎉 **Join our Discord Community!** Connect with other users, get help, and stay updated on the latest features: [https://discord.gg/4Q5YVrePzZ](https://discord.gg/4Q5YVrePzZ) 7 | 8 | # Whisper ASR Box 9 | 10 | Whisper ASR Box is a general-purpose speech recognition toolkit. Whisper Models are trained on a large dataset of diverse audio and is also a multitask model that can perform multilingual speech recognition as well as speech translation and language identification. 
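Once the service is running (see the Quick Usage section below), a typical request posts an audio file to the `/asr` endpoint. A minimal sketch with cURL (the host, port, and file path are placeholders; adjust them to your setup):

```shell
curl -X POST "http://localhost:9000/asr?output=json&task=transcribe" \
  -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/audio.mp3"
```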
11 | 12 | ## Features 13 | 14 | Current release (v1.9.1) supports following whisper models: 15 | 16 | - [openai/whisper](https://github.com/openai/whisper)@[v20250625](https://github.com/openai/whisper/releases/tag/v20250625) 17 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.1) 18 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.4.2](https://github.com/m-bain/whisperX/releases/tag/v3.4.2) 19 | 20 | ## Quick Usage 21 | 22 | ### CPU 23 | 24 | ```shell 25 | docker run -d -p 9000:9000 \ 26 | -e ASR_MODEL=base \ 27 | -e ASR_ENGINE=openai_whisper \ 28 | onerahmet/openai-whisper-asr-webservice:latest 29 | ``` 30 | 31 | ### GPU 32 | 33 | ```shell 34 | docker run -d --gpus all -p 9000:9000 \ 35 | -e ASR_MODEL=base \ 36 | -e ASR_ENGINE=openai_whisper \ 37 | onerahmet/openai-whisper-asr-webservice:latest-gpu 38 | ``` 39 | 40 | #### Cache 41 | 42 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory: 43 | 44 | ```shell 45 | docker run -d -p 9000:9000 \ 46 | -v $PWD/cache:/root/.cache/ \ 47 | onerahmet/openai-whisper-asr-webservice:latest 48 | ``` 49 | 50 | ## Key Features 51 | 52 | - Multiple ASR engines support (OpenAI Whisper, Faster Whisper, WhisperX) 53 | - Multiple output formats (text, JSON, VTT, SRT, TSV) 54 | - Word-level timestamps support 55 | - Voice activity detection (VAD) filtering 56 | - Speaker diarization (with WhisperX) 57 | - FFmpeg integration for broad audio/video format support 58 | - GPU acceleration support 59 | - Configurable model loading/unloading 60 | - REST API with Swagger documentation 61 | 62 | ## Environment Variables 63 | 64 | Key configuration options: 65 | 66 | - `ASR_ENGINE`: Engine selection (openai_whisper, faster_whisper, whisperx) 67 | - `ASR_MODEL`: Model selection (tiny, base, small, medium, large-v3, etc.) 68 | - `ASR_MODEL_PATH`: Custom path to store/load models 69 | - `ASR_DEVICE`: Device selection (cuda, cpu) 70 | - `MODEL_IDLE_TIMEOUT`: Timeout for model unloading 71 | 72 | ## Documentation 73 | 74 | For complete documentation, visit: 75 | [https://ahmetoner.github.io/whisper-asr-webservice](https://ahmetoner.github.io/whisper-asr-webservice) 76 | 77 | ## Development 78 | 79 | ```shell 80 | # Install poetry v2.X 81 | pip3 install poetry 82 | 83 | # Install dependencies for cpu 84 | poetry install --extras cpu 85 | 86 | # Install dependencies for cuda 87 | poetry install --extras cuda 88 | 89 | # Run service 90 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 91 | ``` 92 | 93 | After starting the service, visit `http://localhost:9000` or `http://0.0.0.0:9000` in your browser to access the Swagger UI documentation and try out the API endpoints. 94 | 95 | ## Credits 96 | 97 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) 98 | -------------------------------------------------------------------------------- /app/asr_models/asr_model.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | from abc import ABC, abstractmethod 4 | from threading import Lock 5 | from typing import Union 6 | 7 | import torch 8 | 9 | from app.config import CONFIG 10 | 11 | 12 | class ASRModel(ABC): 13 | """ 14 | Abstract base class for ASR (Automatic Speech Recognition) models. 
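    Subclasses provide load_model(), transcribe() and language_detection().
    Idle unloading is handled by monitor_idleness()/release_model() when
    CONFIG.MODEL_IDLE_TIMEOUT is set to a positive number of seconds.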
15 | """ 16 | 17 | model = None 18 | model_lock = Lock() 19 | last_activity_time = time.time() 20 | 21 | def __init__(self): 22 | pass 23 | 24 | @abstractmethod 25 | def load_model(self): 26 | """ 27 | Loads the model from the specified path. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def transcribe( 33 | self, 34 | audio, 35 | task: Union[str, None], 36 | language: Union[str, None], 37 | initial_prompt: Union[str, None], 38 | vad_filter: Union[bool, None], 39 | word_timestamps: Union[bool, None], 40 | options: Union[dict, None], 41 | output, 42 | ): 43 | """ 44 | Perform transcription on the given audio file. 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def language_detection(self, audio): 50 | """ 51 | Perform language detection on the given audio file. 52 | """ 53 | pass 54 | 55 | def monitor_idleness(self): 56 | """ 57 | Monitors the idleness of the ASR model and releases the model if it has been idle for too long. 58 | """ 59 | if CONFIG.MODEL_IDLE_TIMEOUT <= 0: 60 | return 61 | while True: 62 | time.sleep(15) 63 | if time.time() - self.last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT: 64 | with self.model_lock: 65 | self.release_model() 66 | break 67 | 68 | def release_model(self): 69 | """ 70 | Unloads the model from memory and clears any cached GPU memory. 71 | """ 72 | del self.model 73 | torch.cuda.empty_cache() 74 | gc.collect() 75 | self.model = None 76 | print("Model unloaded due to timeout") 77 | -------------------------------------------------------------------------------- /app/asr_models/faster_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisper 7 | from faster_whisper import WhisperModel 8 | 9 | from app.asr_models.asr_model import ASRModel 10 | from app.config import CONFIG 11 | from app.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 12 | 13 | 14 | class FasterWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | self.model = WhisperModel( 19 | model_size_or_path=CONFIG.MODEL_NAME, 20 | device=CONFIG.DEVICE, 21 | compute_type=CONFIG.MODEL_QUANTIZATION, 22 | download_root=CONFIG.MODEL_PATH 23 | ) 24 | 25 | Thread(target=self.monitor_idleness, daemon=True).start() 26 | 27 | def transcribe( 28 | self, 29 | audio, 30 | task: Union[str, None], 31 | language: Union[str, None], 32 | initial_prompt: Union[str, None], 33 | vad_filter: Union[bool, None], 34 | word_timestamps: Union[bool, None], 35 | options: Union[dict, None], 36 | output, 37 | ): 38 | self.last_activity_time = time.time() 39 | 40 | with self.model_lock: 41 | if self.model is None: 42 | self.load_model() 43 | 44 | options_dict = {"task": task} 45 | if language: 46 | options_dict["language"] = language 47 | if initial_prompt: 48 | options_dict["initial_prompt"] = initial_prompt 49 | if vad_filter: 50 | options_dict["vad_filter"] = True 51 | if word_timestamps: 52 | options_dict["word_timestamps"] = True 53 | with self.model_lock: 54 | segments = [] 55 | text = "" 56 | segment_generator, info = self.model.transcribe(audio, beam_size=5, **options_dict) 57 | for segment in segment_generator: 58 | segments.append(segment) 59 | text = text + segment.text 60 | result = {"language": options_dict.get("language", info.language), "segments": segments, "text": text} 61 | 62 | output_file = StringIO() 63 | self.write_result(result, output_file, output) 64 | output_file.seek(0) 65 | 66 | return 
output_file 67 | 68 | def language_detection(self, audio): 69 | 70 | self.last_activity_time = time.time() 71 | 72 | with self.model_lock: 73 | if self.model is None: self.load_model() 74 | 75 | # load audio and pad/trim it to fit 30 seconds 76 | audio = whisper.pad_or_trim(audio) 77 | 78 | # detect the spoken language 79 | with self.model_lock: 80 | segments, info = self.model.transcribe(audio, beam_size=5) 81 | detected_lang_code = info.language 82 | detected_language_confidence = info.language_probability 83 | 84 | return detected_lang_code, detected_language_confidence 85 | 86 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 87 | if output == "srt": 88 | WriteSRT(ResultWriter).write_result(result, file=file) 89 | elif output == "vtt": 90 | WriteVTT(ResultWriter).write_result(result, file=file) 91 | elif output == "tsv": 92 | WriteTSV(ResultWriter).write_result(result, file=file) 93 | elif output == "json": 94 | WriteJSON(ResultWriter).write_result(result, file=file) 95 | else: 96 | WriteTXT(ResultWriter).write_result(result, file=file) 97 | -------------------------------------------------------------------------------- /app/asr_models/mbain_whisperx_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisperx 7 | from whisperx.audio import N_SAMPLES 8 | from whisperx.diarize import DiarizationPipeline 9 | from whisperx.utils import ResultWriter, SubtitlesWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 10 | 11 | from app.asr_models.asr_model import ASRModel 12 | from app.config import CONFIG 13 | 14 | 15 | class WhisperXASR(ASRModel): 16 | def __init__(self): 17 | super().__init__() 18 | self.model = { 19 | 'whisperx': None, 20 | 'diarize_model': None, 21 | 'align_model': {} 22 | } 23 | 24 | def load_model(self): 25 | asr_options = {"without_timestamps": False} 26 | self.model['whisperx'] = whisperx.load_model( 27 | CONFIG.MODEL_NAME, 28 | device=CONFIG.DEVICE, 29 | compute_type=CONFIG.MODEL_QUANTIZATION, 30 | asr_options=asr_options 31 | ) 32 | 33 | if CONFIG.HF_TOKEN != "": 34 | self.model['diarize_model'] = DiarizationPipeline( 35 | use_auth_token=CONFIG.HF_TOKEN, 36 | device=CONFIG.DEVICE 37 | ) 38 | 39 | Thread(target=self.monitor_idleness, daemon=True).start() 40 | 41 | def transcribe( 42 | self, 43 | audio, 44 | task: Union[str, None], 45 | language: Union[str, None], 46 | initial_prompt: Union[str, None], 47 | vad_filter: Union[bool, None], 48 | word_timestamps: Union[bool, None], 49 | options: Union[dict, None], 50 | output, 51 | ): 52 | self.last_activity_time = time.time() 53 | with self.model_lock: 54 | if self.model is None: 55 | self.load_model() 56 | 57 | options_dict = {"task": task} 58 | if language: 59 | options_dict["language"] = language 60 | if initial_prompt: 61 | options_dict["initial_prompt"] = initial_prompt 62 | with self.model_lock: 63 | result = self.model['whisperx'].transcribe(audio, **options_dict) 64 | language = result["language"] 65 | 66 | # Load the required model and cache it 67 | # If we transcribe models in many different languages, this may lead to OOM propblems 68 | if result["language"] in self.model['align_model']: 69 | model_x, metadata = self.model['align_model'][result["language"]] 70 | else: 71 | self.model['align_model'][result["language"]] = whisperx.load_align_model( 72 | language_code=result["language"], device=CONFIG.DEVICE 
73 | ) 74 | model_x, metadata = self.model['align_model'][result["language"]] 75 | 76 | # Align whisper output 77 | result = whisperx.align( 78 | result["segments"], model_x, metadata, audio, CONFIG.DEVICE, return_char_alignments=False 79 | ) 80 | 81 | if options.get("diarize", False) and CONFIG.HF_TOKEN != "": 82 | min_speakers = options.get("min_speakers", None) 83 | max_speakers = options.get("max_speakers", None) 84 | # add min/max number of speakers if known 85 | diarize_segments = self.model['diarize_model'](audio, min_speakers, max_speakers) 86 | result = whisperx.assign_word_speakers(diarize_segments, result) 87 | result["language"] = language 88 | 89 | output_file = StringIO() 90 | self.write_result(result, output_file, output) 91 | output_file.seek(0) 92 | 93 | return output_file 94 | 95 | def language_detection(self, audio): 96 | with self.model_lock: 97 | if self.model is None: 98 | self.load_model() 99 | if audio.shape[0] < N_SAMPLES: 100 | print("Warning: audio is shorter than 30s, language detection may be inaccurate.") 101 | results = self.model['whisperx'].model.detect_language(audio) 102 | language = results[0] 103 | language_probability = round(float(results[1]), 2) 104 | print(f"Detected language: {language} ({language_probability}) in first 30s of audio...") 105 | return language, language_probability 106 | 107 | 108 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 109 | default_options = { 110 | "max_line_width": CONFIG.SUBTITLE_MAX_LINE_WIDTH, 111 | "max_line_count": CONFIG.SUBTITLE_MAX_LINE_COUNT, 112 | "highlight_words": CONFIG.SUBTITLE_HIGHLIGHT_WORDS 113 | } 114 | 115 | if output == "srt": 116 | WriteSRT(SubtitlesWriter).write_result(result, file=file, options=default_options) 117 | elif output == "vtt": 118 | WriteVTT(SubtitlesWriter).write_result(result, file=file, options=default_options) 119 | elif output == "tsv": 120 | WriteTSV(ResultWriter).write_result(result, file=file, options=default_options) 121 | elif output == "json": 122 | WriteJSON(ResultWriter).write_result(result, file=file, options=default_options) 123 | else: 124 | WriteTXT(ResultWriter).write_result(result, file=file, options=default_options) 125 | -------------------------------------------------------------------------------- /app/asr_models/openai_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import torch 7 | import whisper 8 | from whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 9 | 10 | from app.asr_models.asr_model import ASRModel 11 | from app.config import CONFIG 12 | 13 | 14 | class OpenAIWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | if torch.cuda.is_available(): 19 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH).cuda() 20 | else: 21 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH) 22 | 23 | Thread(target=self.monitor_idleness, daemon=True).start() 24 | 25 | def transcribe( 26 | self, 27 | audio, 28 | task: Union[str, None], 29 | language: Union[str, None], 30 | initial_prompt: Union[str, None], 31 | vad_filter: Union[bool, None], 32 | word_timestamps: Union[bool, None], 33 | options: Union[dict, None], 34 | output, 35 | ): 36 | self.last_activity_time = time.time() 37 | 38 | with self.model_lock: 39 | if self.model is None: 40 | 
self.load_model() 41 | 42 | options_dict = {"task": task} 43 | if language: 44 | options_dict["language"] = language 45 | if initial_prompt: 46 | options_dict["initial_prompt"] = initial_prompt 47 | if word_timestamps: 48 | options_dict["word_timestamps"] = word_timestamps 49 | with self.model_lock: 50 | result = self.model.transcribe(audio, **options_dict) 51 | 52 | output_file = StringIO() 53 | self.write_result(result, output_file, output) 54 | output_file.seek(0) 55 | 56 | return output_file 57 | 58 | def language_detection(self, audio): 59 | 60 | self.last_activity_time = time.time() 61 | 62 | with self.model_lock: 63 | if self.model is None: 64 | self.load_model() 65 | 66 | # load audio and pad/trim it to fit 30 seconds 67 | audio = whisper.pad_or_trim(audio) 68 | 69 | # make log-Mel spectrogram and move to the same device as the model 70 | mel = whisper.log_mel_spectrogram(audio, self.model.dims.n_mels).to(self.model.device) 71 | 72 | # detect the spoken language 73 | with self.model_lock: 74 | _, probs = self.model.detect_language(mel) 75 | detected_lang_code = max(probs, key=probs.get) 76 | 77 | return detected_lang_code, probs[max(probs)] 78 | 79 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 80 | options = {"max_line_width": 1000, "max_line_count": 10, "highlight_words": False} 81 | if output == "srt": 82 | WriteSRT(ResultWriter).write_result(result, file=file, options=options) 83 | elif output == "vtt": 84 | WriteVTT(ResultWriter).write_result(result, file=file, options=options) 85 | elif output == "tsv": 86 | WriteTSV(ResultWriter).write_result(result, file=file, options=options) 87 | elif output == "json": 88 | WriteJSON(ResultWriter).write_result(result, file=file, options=options) 89 | else: 90 | WriteTXT(ResultWriter).write_result(result, file=file, options=options) 91 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | class CONFIG: 7 | """ 8 | Configuration class for ASR models. 9 | Reads environment variables for runtime configuration, with sensible defaults. 10 | """ 11 | # Determine the ASR engine ('faster_whisper', 'openai_whisper' or 'whisperx') 12 | ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper") 13 | 14 | # Retrieve Huggingface Token 15 | HF_TOKEN = os.getenv("HF_TOKEN", "") 16 | if ASR_ENGINE == "whisperx" and HF_TOKEN == "": 17 | print("You must set the HF_TOKEN environment variable to download the diarization model used by WhisperX.") 18 | 19 | # Determine the computation device (GPU or CPU) 20 | DEVICE = os.getenv("ASR_DEVICE", "cuda" if torch.cuda.is_available() else "cpu") 21 | 22 | # Model name to use (e.g., "base", "small", etc.) 23 | MODEL_NAME = os.getenv("ASR_MODEL", "base") 24 | 25 | # Path to the model directory 26 | MODEL_PATH = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper")) 27 | 28 | # Model quantization level. Defines the precision for model weights: 29 | # 'float32' - 32-bit floating-point precision (higher precision, slower inference) 30 | # 'float16' - 16-bit floating-point precision (lower precision, faster inference) 31 | # 'int8' - 8-bit integer precision (lowest precision, fastest inference) 32 | # Defaults to 'float32' for GPU availability, 'int8' for CPU. 
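    # Note: this value is passed as `compute_type` to the faster_whisper and whisperx
    # engines; the openai_whisper engine loads full-precision weights and ignores it.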
33 | MODEL_QUANTIZATION = os.getenv("ASR_QUANTIZATION", "float32" if torch.cuda.is_available() else "int8") 34 | if MODEL_QUANTIZATION not in {"float32", "float16", "int8"}: 35 | raise ValueError("Invalid MODEL_QUANTIZATION. Choose 'float32', 'float16', or 'int8'.") 36 | 37 | # Idle timeout in seconds. If set to a non-zero value, the model will be unloaded 38 | # after being idle for this many seconds. A value of 0 means the model will never be unloaded. 39 | MODEL_IDLE_TIMEOUT = int(os.getenv("MODEL_IDLE_TIMEOUT", 0)) 40 | 41 | # Default sample rate for audio input. 16 kHz is commonly used in speech-to-text tasks. 42 | SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", 16000)) 43 | 44 | # Subtitle output options for whisperx 45 | SUBTITLE_MAX_LINE_WIDTH = int(os.getenv("SUBTITLE_MAX_LINE_WIDTH", 1000)) 46 | SUBTITLE_MAX_LINE_COUNT = int(os.getenv("SUBTITLE_MAX_LINE_COUNT", 2)) 47 | SUBTITLE_HIGHLIGHT_WORDS = os.getenv("SUBTITLE_HIGHLIGHT_WORDS", "false").lower() == "true" 48 | -------------------------------------------------------------------------------- /app/factory/asr_model_factory.py: -------------------------------------------------------------------------------- 1 | from app.asr_models.asr_model import ASRModel 2 | from app.asr_models.faster_whisper_engine import FasterWhisperASR 3 | from app.asr_models.mbain_whisperx_engine import WhisperXASR 4 | from app.asr_models.openai_whisper_engine import OpenAIWhisperASR 5 | from app.config import CONFIG 6 | 7 | 8 | class ASRModelFactory: 9 | @staticmethod 10 | def create_asr_model() -> ASRModel: 11 | if CONFIG.ASR_ENGINE == "openai_whisper": 12 | return OpenAIWhisperASR() 13 | elif CONFIG.ASR_ENGINE == "faster_whisper": 14 | return FasterWhisperASR() 15 | elif CONFIG.ASR_ENGINE == "whisperx": 16 | return WhisperXASR() 17 | else: 18 | raise ValueError(f"Unsupported ASR engine: {CONFIG.ASR_ENGINE}") 19 | -------------------------------------------------------------------------------- /app/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dataclasses import asdict 4 | from typing import BinaryIO, TextIO 5 | 6 | import ffmpeg 7 | import numpy as np 8 | from faster_whisper.utils import format_timestamp 9 | 10 | from app.config import CONFIG 11 | 12 | 13 | class ResultWriter: 14 | extension: str 15 | 16 | def __init__(self, output_dir: str): 17 | self.output_dir = output_dir 18 | 19 | def __call__(self, result: dict, audio_path: str): 20 | audio_basename = os.path.basename(audio_path) 21 | output_path = os.path.join(self.output_dir, audio_basename + "." 
+ self.extension) 22 | 23 | with open(output_path, "w", encoding="utf-8") as f: 24 | self.write_result(result, file=f) 25 | 26 | def write_result(self, result: dict, file: TextIO): 27 | raise NotImplementedError 28 | 29 | 30 | class WriteTXT(ResultWriter): 31 | extension: str = "txt" 32 | 33 | def write_result(self, result: dict, file: TextIO): 34 | for segment in result["segments"]: 35 | print(segment.text.strip(), file=file, flush=True) 36 | 37 | 38 | class WriteVTT(ResultWriter): 39 | extension: str = "vtt" 40 | 41 | def write_result(self, result: dict, file: TextIO): 42 | print("WEBVTT\n", file=file) 43 | for segment in result["segments"]: 44 | print( 45 | f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n" 46 | f"{segment.text.strip().replace('-->', '->')}\n", 47 | file=file, 48 | flush=True, 49 | ) 50 | 51 | 52 | class WriteSRT(ResultWriter): 53 | extension: str = "srt" 54 | 55 | def write_result(self, result: dict, file: TextIO): 56 | for i, segment in enumerate(result["segments"], start=1): 57 | # write srt lines 58 | print( 59 | f"{i}\n" 60 | f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> " 61 | f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n" 62 | f"{segment.text.strip().replace('-->', '->')}\n", 63 | file=file, 64 | flush=True, 65 | ) 66 | 67 | 68 | class WriteTSV(ResultWriter): 69 | """ 70 | Write a transcript to a file in TSV (tab-separated values) format containing lines like: 71 | \t\t 72 | 73 | Using integer milliseconds as start and end times means there's no chance of interference from 74 | an environment setting a language encoding that causes the decimal in a floating point number 75 | to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. 76 | """ 77 | 78 | extension: str = "tsv" 79 | 80 | def write_result(self, result: dict, file: TextIO): 81 | print("start", "end", "text", sep="\t", file=file) 82 | for segment in result["segments"]: 83 | print(round(1000 * segment.start), file=file, end="\t") 84 | print(round(1000 * segment.end), file=file, end="\t") 85 | print(segment.text.strip().replace("\t", " "), file=file, flush=True) 86 | 87 | 88 | class WriteJSON(ResultWriter): 89 | extension: str = "json" 90 | 91 | def write_result(self, result: dict, file: TextIO): 92 | if "segments" in result: 93 | result["segments"] = [asdict(segment) for segment in result["segments"]] 94 | json.dump(result, file) 95 | 96 | 97 | def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE): 98 | """ 99 | Open an audio file object and read as mono waveform, resampling as necessary. 100 | Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object 101 | Parameters 102 | ---------- 103 | file: BinaryIO 104 | The audio file like object 105 | encode: Boolean 106 | If true, encode audio stream to WAV before sending to whisper 107 | sr: int 108 | The sample rate to resample the audio if necessary 109 | Returns 110 | ------- 111 | A NumPy array containing the audio waveform, in float32 dtype. 112 | """ 113 | if encode: 114 | try: 115 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 116 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
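            # The command below emits 16-bit little-endian mono PCM at the requested sample
            # rate; np.frombuffer(...) then scales it to float32 in the [-1.0, 1.0] range.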
117 | out, _ = ( 118 | ffmpeg.input("pipe:", threads=0) 119 | .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) 120 | .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read()) 121 | ) 122 | except ffmpeg.Error as e: 123 | raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e 124 | else: 125 | out = file.read() 126 | 127 | return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 128 | -------------------------------------------------------------------------------- /app/webservice.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import os 3 | from os import path 4 | from typing import Annotated, Optional, Union 5 | from urllib.parse import quote 6 | 7 | import click 8 | import uvicorn 9 | from fastapi import FastAPI, File, Query, UploadFile, applications 10 | from fastapi.openapi.docs import get_swagger_ui_html 11 | from fastapi.responses import RedirectResponse, StreamingResponse 12 | from fastapi.staticfiles import StaticFiles 13 | from whisper import tokenizer 14 | 15 | from app.config import CONFIG 16 | from app.factory.asr_model_factory import ASRModelFactory 17 | from app.utils import load_audio 18 | 19 | asr_model = ASRModelFactory.create_asr_model() 20 | asr_model.load_model() 21 | 22 | LANGUAGE_CODES = sorted(tokenizer.LANGUAGES.keys()) 23 | 24 | projectMetadata = importlib.metadata.metadata("whisper-asr-webservice") 25 | app = FastAPI( 26 | title=projectMetadata["Name"].title().replace("-", " "), 27 | description=projectMetadata["Summary"], 28 | version=projectMetadata["Version"], 29 | contact={"url": projectMetadata["Home-page"]}, 30 | swagger_ui_parameters={"defaultModelsExpandDepth": -1}, 31 | license_info={"name": "MIT License", "url": "https://github.com/ahmetoner/whisper-asr-webservice/blob/main/LICENCE"}, 32 | ) 33 | 34 | assets_path = os.getcwd() + "/swagger-ui-assets" 35 | if path.exists(assets_path + "/swagger-ui.css") and path.exists(assets_path + "/swagger-ui-bundle.js"): 36 | app.mount("/assets", StaticFiles(directory=assets_path), name="static") 37 | 38 | def swagger_monkey_patch(*args, **kwargs): 39 | return get_swagger_ui_html( 40 | *args, 41 | **kwargs, 42 | swagger_favicon_url="", 43 | swagger_css_url="/assets/swagger-ui.css", 44 | swagger_js_url="/assets/swagger-ui-bundle.js", 45 | ) 46 | 47 | applications.get_swagger_ui_html = swagger_monkey_patch 48 | 49 | 50 | @app.get("/", response_class=RedirectResponse, include_in_schema=False) 51 | async def index(): 52 | return "/docs" 53 | 54 | 55 | @app.post("/asr", tags=["Endpoints"]) 56 | async def asr( 57 | audio_file: UploadFile = File(...), # noqa: B008 58 | encode: bool = Query(default=True, description="Encode audio first through ffmpeg"), 59 | task: Union[str, None] = Query(default="transcribe", enum=["transcribe", "translate"]), 60 | language: Union[str, None] = Query(default=None, enum=LANGUAGE_CODES), 61 | initial_prompt: Union[str, None] = Query(default=None), 62 | vad_filter: Annotated[ 63 | bool | None, 64 | Query( 65 | description="Enable the voice activity detection (VAD) to filter out parts of the audio without speech", 66 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 67 | ), 68 | ] = False, 69 | word_timestamps: bool = Query( 70 | default=False, 71 | description="Word level timestamps", 72 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 73 | ), 74 | diarize: bool = Query( 75 | default=False, 76 | 
description="Diarize the input", 77 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" and CONFIG.HF_TOKEN != "" else False), 78 | ), 79 | min_speakers: Union[int, None] = Query( 80 | default=None, 81 | description="Min speakers in this file", 82 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 83 | ), 84 | max_speakers: Union[int, None] = Query( 85 | default=None, 86 | description="Max speakers in this file", 87 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 88 | ), 89 | output: Union[str, None] = Query(default="txt", enum=["txt", "vtt", "srt", "tsv", "json"]), 90 | ): 91 | result = asr_model.transcribe( 92 | load_audio(audio_file.file, encode), 93 | task, 94 | language, 95 | initial_prompt, 96 | vad_filter, 97 | word_timestamps, 98 | {"diarize": diarize, "min_speakers": min_speakers, "max_speakers": max_speakers}, 99 | output, 100 | ) 101 | return StreamingResponse( 102 | result, 103 | media_type="text/plain", 104 | headers={ 105 | "Asr-Engine": CONFIG.ASR_ENGINE, 106 | "Content-Disposition": f'attachment; filename="{quote(audio_file.filename)}.{output}"', 107 | }, 108 | ) 109 | 110 | 111 | @app.post("/detect-language", tags=["Endpoints"]) 112 | async def detect_language( 113 | audio_file: UploadFile = File(...), # noqa: B008 114 | encode: bool = Query(default=True, description="Encode audio first through FFmpeg"), 115 | ): 116 | detected_lang_code, confidence = asr_model.language_detection(load_audio(audio_file.file, encode)) 117 | return { 118 | "detected_language": tokenizer.LANGUAGES[detected_lang_code], 119 | "language_code": detected_lang_code, 120 | "confidence": confidence, 121 | } 122 | 123 | 124 | @click.command() 125 | @click.option( 126 | "-h", 127 | "--host", 128 | metavar="HOST", 129 | default="0.0.0.0", 130 | help="Host for the webservice (default: 0.0.0.0)", 131 | ) 132 | @click.option( 133 | "-p", 134 | "--port", 135 | metavar="PORT", 136 | default=9000, 137 | help="Port for the webservice (default: 9000)", 138 | ) 139 | @click.version_option(version=projectMetadata["Version"]) 140 | def start(host: str, port: Optional[int] = None): 141 | uvicorn.run(app, host=host, port=port) 142 | 143 | 144 | if __name__ == "__main__": 145 | start() 146 | -------------------------------------------------------------------------------- /docker-compose.gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice-gpu: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile.gpu 8 | deploy: 9 | resources: 10 | reservations: 11 | devices: 12 | - driver: nvidia 13 | count: 1 14 | capabilities: [gpu] 15 | environment: 16 | - ASR_MODEL=base 17 | ports: 18 | - "9000:9000" 19 | volumes: 20 | - ./app:/app/app 21 | - cache-whisper:/root/.cache 22 | 23 | volumes: 24 | cache-whisper: 25 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile 8 | environment: 9 | - ASR_MODEL=base 10 | ports: 11 | - "9000:9000" 12 | volumes: 13 | - ./app:/app/app 14 | - cache-whisper:/root/.cache 15 | 16 | volumes: 17 | cache-whisper: 18 | -------------------------------------------------------------------------------- /docs/.overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | -------------------------------------------------------------------------------- /docs/assets/css/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #3d6178; 3 | --md-primary-fg-color--light: #3d6178; 4 | --md-primary-fg-color--dark: #3d6178; 5 | } 6 | -------------------------------------------------------------------------------- /docs/assets/images/swagger-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmetoner/whisper-asr-webservice/ea12224ef33941d69aa1ba9fca01d95ecea7e8e7/docs/assets/images/swagger-ui.png -------------------------------------------------------------------------------- /docs/build.md: -------------------------------------------------------------------------------- 1 | ## Development Environment 2 | 3 | Install poetry v2.X with following command: 4 | 5 | ```shell 6 | pip3 install poetry 7 | ``` 8 | 9 | ### Installation 10 | 11 | Install dependencies for cpu 12 | 13 | ```shell 14 | poetry install --extras cpu 15 | ``` 16 | 17 | Install dependencies for cuda 18 | 19 | ```shell 20 | poetry install --extras cuda 21 | ``` 22 | 23 | !!! Note 24 | By default, this will install the CPU version of PyTorch. For GPU support, you'll need to install the appropriate CUDA version of PyTorch separately: 25 | ```shell 26 | # For CUDA support (example for CUDA 11.8): 27 | pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu121 28 | ``` 29 | 30 | ### Run 31 | 32 | Starting the Webservice: 33 | 34 | ```shell 35 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 36 | ``` 37 | 38 | ### Build 39 | 40 | === ":octicons-file-code-16: `Docker`" 41 | 42 | With `Dockerfile`: 43 | 44 | === ":octicons-file-code-16: `CPU`" 45 | 46 | ```shell 47 | # Build Image 48 | docker build -t whisper-asr-webservice . 49 | 50 | # Run Container 51 | docker run -d -p 9000:9000 whisper-asr-webservice 52 | # or with specific model 53 | docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice 54 | ``` 55 | 56 | === ":octicons-file-code-16: `GPU`" 57 | 58 | ```shell 59 | # Build Image 60 | docker build -f Dockerfile.gpu -t whisper-asr-webservice-gpu . 
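    # Note: the `--gpus all` flag used below assumes the NVIDIA Container Toolkit
    # is installed and configured on the host.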
61 | 62 | # Run Container 63 | docker run -d --gpus all -p 9000:9000 whisper-asr-webservice-gpu 64 | # or with specific model 65 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice-gpu 66 | ``` 67 | 68 | With `docker-compose`: 69 | 70 | === ":octicons-file-code-16: `CPU`" 71 | 72 | ```shell 73 | docker-compose up --build 74 | ``` 75 | 76 | === ":octicons-file-code-16: `GPU`" 77 | 78 | ```shell 79 | docker-compose -f docker-compose.gpu.yml up --build 80 | ``` 81 | === ":octicons-file-code-16: `Poetry`" 82 | 83 | Build .whl package 84 | 85 | ```shell 86 | poetry build 87 | ``` -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | --8<-- "CHANGELOG.md" 2 | -------------------------------------------------------------------------------- /docs/endpoints.md: -------------------------------------------------------------------------------- 1 | ## Quick start 2 | 3 | After running the docker image interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs) 4 | 5 | There are 2 endpoints available: 6 | 7 | - [/asr](##Automatic-Speech-recognition-service-/asr) (Automatic Speech Recognition) 8 | - [/detect-language](##Language-detection-service-/detect-language) 9 | 10 | ## Automatic speech recognition service /asr 11 | 12 | - 2 task choices: 13 | - **transcribe**: (default) task, transcribes the uploaded file. 14 | - **translate**: will provide an English transcript no matter which language was spoken. 15 | - Files are automatically converted with FFmpeg. 16 | - Full list of supported [audio](https://ffmpeg.org/general.html#Audio-Codecs) and [video](https://ffmpeg.org/general.html#Video-Codecs) formats. 17 | - You can enable word level timestamps output by `word_timestamps` parameter 18 | - You can Enable the voice activity detection (VAD) to filter out parts of the audio without speech by `vad_filter` parameter (only with `Faster Whisper` for now). 19 | 20 | ### Request URL Query Params 21 | 22 | | Name | Values | Description | 23 | |-----------------|------------------------------------------------|----------------------------------------------------------------| 24 | | audio_file | File | Audio or video file to transcribe | 25 | | output | `text` (default), `json`, `vtt`, `srt`, `tsv` | Output format | 26 | | task | `transcribe`, `translate` | Task type - transcribe in source language or translate to English | 27 | | language | `en` (default is auto recognition) | Source language code (see supported languages) | 28 | | word_timestamps | false (default) | Enable word-level timestamps (Faster Whisper only) | 29 | | vad_filter | false (default) | Enable voice activity detection filtering (Faster Whisper only) | 30 | | encode | true (default) | Encode audio through FFmpeg before processing | 31 | | diarize | false (default) | Enable speaker diarization (WhisperX only) | 32 | | min_speakers | null (default) | Minimum number of speakers for diarization (WhisperX only) | 33 | | max_speakers | null (default) | Maximum number of speakers for diarization (WhisperX only) | 34 | 35 | Example request with cURL 36 | 37 | ```bash 38 | curl -X POST -H "content-type: multipart/form-data" -F "audio_file=@/path/to/file" 0.0.0.0:9000/asr?output=json 39 | ``` 40 | 41 | ### Response (JSON) 42 | 43 | - **text**: Contains the full transcript 44 | - **segments**: Contains an entry per segment. 
Each entry provides `timestamps`, `transcript`, `token ids`, `word level timestamps`, and other metadata
45 | - **language**: Detected or provided language (as a language code)
46 | 
47 | ### Response Formats
48 | 
49 | The API supports multiple output formats:
50 | 
51 | - **text**: Plain text transcript (default)
52 | - **json**: Detailed JSON with segments, timestamps, and metadata
53 | - **vtt**: WebVTT subtitle format
54 | - **srt**: SubRip subtitle format
55 | - **tsv**: Tab-separated values with timestamps
56 | 
57 | ### Supported Languages
58 | 
59 | The service supports all languages supported by Whisper. Some common language codes:
60 | 
61 | - Turkish (tr)
62 | - English (en)
63 | - Spanish (es)
64 | - French (fr)
65 | - German (de)
66 | - Italian (it)
67 | - Portuguese (pt)
68 | - And many more...
69 | 
70 | See the [Whisper documentation](https://github.com/openai/whisper#available-models-and-languages) for the full list of supported languages.
71 | 
72 | ### Speaker Diarization
73 | 
74 | When using the WhisperX engine with diarization enabled (`diarize=true`), the output will include speaker labels for each segment. This requires:
75 | 
76 | 1. The WhisperX engine to be configured
77 | 2. A valid Hugging Face token set in `HF_TOKEN`
78 | 3. Sufficient memory for the diarization models
79 | 
80 | You can optionally specify `min_speakers` and `max_speakers` if you know the expected number of speakers.
81 | 
82 | ## Language detection service /detect-language
83 | 
84 | Detects the language spoken in the uploaded file. Only the first 30 seconds of audio are processed.
85 | 
86 | Returns a JSON response with the following fields:
87 | 
88 | - **detected_language**: Human-readable language name (e.g. "english")
89 | - **language_code**: ISO language code (e.g. "en")
90 | - **confidence**: Confidence score between 0 and 1 indicating detection reliability
91 | 
92 | Example response:
93 | 
94 | ```json
95 | {
96 | "detected_language": "english",
97 | "language_code": "en",
98 | "confidence": 0.98
99 | }
100 | ```
101 | 
--------------------------------------------------------------------------------
/docs/environmental-variables.md:
--------------------------------------------------------------------------------
1 | ### Configuring the `Engine`
2 | 
3 | === ":octicons-file-code-16: `openai_whisper`"
4 | 
5 | ```shell
6 | export ASR_ENGINE=openai_whisper
7 | ```
8 | 
9 | === ":octicons-file-code-16: `faster_whisper`"
10 | 
11 | ```shell
12 | export ASR_ENGINE=faster_whisper
13 | ```
14 | 
15 | === ":octicons-file-code-16: `whisperx`"
16 | 
17 | ```shell
18 | export ASR_ENGINE=whisperx
19 | ```
20 | 
21 | ### Configuring the `Model`
22 | 
23 | ```shell
24 | export ASR_MODEL=base
25 | ```
26 | 
27 | Available `ASR_MODEL` values are:
28 | 
29 | - Standard models: `tiny`, `base`, `small`, `medium`, `large-v1`, `large-v2`, `large-v3` (or `large`), `large-v3-turbo` (or `turbo`)
30 | - English-optimized models: `tiny.en`, `base.en`, `small.en`, `medium.en`
31 | - Distilled models: `distil-large-v2`, `distil-medium.en`, `distil-small.en`, `distil-large-v3` (only for whisperx and faster-whisper)
32 | 
33 | For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en`
34 | models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
35 | 
36 | The distilled models offer improved inference speed while maintaining good accuracy.
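For example, an engine that supports the distilled models can be paired with one of them when starting the container. A minimal sketch using the Docker image and the `ASR_ENGINE`/`ASR_MODEL` variables described above (the chosen values are illustrative, not a recommendation):

```shell
# Run the webservice with the faster_whisper engine and a distilled model
# (illustrative values; any engine/model combination listed above works the same way)
docker run -d -p 9000:9000 \
  -e ASR_ENGINE=faster_whisper \
  -e ASR_MODEL=distil-large-v3 \
  onerahmet/openai-whisper-asr-webservice:latest
```

The same variables can also be set in the `environment` section of the `docker-compose.yml` shown earlier.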
37 | 38 | ### Configuring the `Model Path` 39 | 40 | ```shell 41 | export ASR_MODEL_PATH=/data/whisper 42 | ``` 43 | 44 | ### Configuring the `Model Unloading Timeout` 45 | 46 | ```shell 47 | export MODEL_IDLE_TIMEOUT=300 48 | ``` 49 | 50 | Defaults to `0`. After no activity for this period (in seconds), unload the model until it is requested again. Setting 51 | `0` disables the timeout, keeping the model loaded indefinitely. 52 | 53 | ### Configuring the `SAMPLE_RATE` 54 | 55 | ```shell 56 | export SAMPLE_RATE=16000 57 | ``` 58 | 59 | Defaults to `16000`. Default sample rate for audio input. `16 kHz` is commonly used in `speech-to-text` tasks. 60 | 61 | ### Configuring Device and Quantization 62 | 63 | ```shell 64 | export ASR_DEVICE=cuda # or 'cpu' 65 | export ASR_QUANTIZATION=float32 # or 'float16', 'int8' 66 | ``` 67 | 68 | The `ASR_DEVICE` defaults to `cuda` if GPU is available, otherwise `cpu`. 69 | 70 | The `ASR_QUANTIZATION` defines the precision for model weights: 71 | 72 | - `float32`: 32-bit floating-point precision (higher precision, slower inference) 73 | - `float16`: 16-bit floating-point precision (lower precision, faster inference) 74 | - `int8`: 8-bit integer precision (lowest precision, fastest inference) 75 | 76 | Defaults to `float32` for GPU, `int8` for CPU. 77 | 78 | ### Configuring Subtitle Options (WhisperX) 79 | 80 | ```shell 81 | export SUBTITLE_MAX_LINE_WIDTH=1000 82 | export SUBTITLE_MAX_LINE_COUNT=2 83 | export SUBTITLE_HIGHLIGHT_WORDS=false 84 | ``` 85 | 86 | These options only apply when using the WhisperX engine: 87 | 88 | - `SUBTITLE_MAX_LINE_WIDTH`: Maximum width of subtitle lines (default: 1000) 89 | - `SUBTITLE_MAX_LINE_COUNT`: Maximum number of lines per subtitle (default: 2) 90 | - `SUBTITLE_HIGHLIGHT_WORDS`: Enable word highlighting in subtitles (default: false) 91 | 92 | ### Hugging Face Token 93 | 94 | ```shell 95 | export HF_TOKEN=your_token_here 96 | ``` 97 | 98 | Required when using the WhisperX engine to download the diarization model. 99 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitask model that can perform multilingual speech recognition as well as speech translation and language identification. 2 | 3 | !!! tip "Join our Discord Community!" 
4 | 🎉 **Connect with other users, get help, and stay updated on the latest features!**
5 | [Join our Discord Server](https://discord.gg/4Q5YVrePzZ){target=_blank}
6 | 
7 | ## Features
8 | 
9 | The current release (v1.9.1) supports the following Whisper models:
10 | 
11 | - [openai/whisper](https://github.com/openai/whisper)@[v20250625](https://github.com/openai/whisper/releases/tag/v20250625)
12 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.1)
13 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.4.2](https://github.com/m-bain/whisperX/releases/tag/v3.4.2)
14 | 
15 | ## Quick Usage
16 | 
17 | === ":octicons-file-code-16: `CPU`"
18 | 
19 | ```shell
20 | docker run -d -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest
21 | ```
22 | 
23 | === ":octicons-file-code-16: `GPU`"
24 | 
25 | ```shell
26 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest-gpu
27 | ```
28 | 
29 | For more information:
30 | 
31 | - [Documentation/Run](https://ahmetoner.github.io/whisper-asr-webservice/run)
32 | - [Docker Hub](https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice)
33 | 
34 | ## Credits
35 | 
36 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html)
37 | 
--------------------------------------------------------------------------------
/docs/licence.md:
--------------------------------------------------------------------------------
1 | # Licence
2 | 
3 | ```
4 | --8<-- "LICENCE"
5 | ```
6 | 
--------------------------------------------------------------------------------
/docs/run.md:
--------------------------------------------------------------------------------
1 | ## Usage
2 | 
3 | Whisper ASR Webservice is now available on Docker Hub. You can find the latest CPU and GPU images of this project on Docker Hub.
4 | 
5 | Docker Hub: [onerahmet/openai-whisper-asr-webservice](https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice)
6 | 
7 | === ":octicons-file-code-16: `CPU`"
8 | 
9 | ```shell
10 | docker pull onerahmet/openai-whisper-asr-webservice:latest
11 | docker run -d -p 9000:9000 \
12 | -e ASR_MODEL=base \
13 | -e ASR_ENGINE=openai_whisper \
14 | onerahmet/openai-whisper-asr-webservice:latest
15 | ```
16 | 
17 | === ":octicons-file-code-16: `CPU (macOS)`"
18 | 
19 | > GPU passthrough does not work on macOS due to fundamental design limitations of Docker. Docker actually runs containers within a Linux VM on macOS. If you wish to run GPU-accelerated containers, I'm afraid Linux is your only option.
20 | >
21 | > The `:latest` image tag provides both amd64 and arm64 architectures:
22 | 
23 | ```shell
24 | docker pull onerahmet/openai-whisper-asr-webservice:latest
25 | docker run -d -p 9000:9000 \
26 | -e ASR_MODEL=base \
27 | -e ASR_ENGINE=openai_whisper \
28 | onerahmet/openai-whisper-asr-webservice:latest
29 | ```
30 | 
31 | === ":octicons-file-code-16: `GPU`"
32 | 
33 | ```shell
34 | docker pull onerahmet/openai-whisper-asr-webservice:latest-gpu
35 | docker run -d --gpus all -p 9000:9000 \
36 | -e ASR_MODEL=base \
37 | -e ASR_ENGINE=openai_whisper \
38 | onerahmet/openai-whisper-asr-webservice:latest-gpu
39 | ```
40 | 
41 | ### Environment Variables
42 | 
43 | The following environment variables can be used to configure the service:
44 | 
45 | - `ASR_MODEL`: Whisper model to use (e.g. tiny, base, small, medium, large) [default: base]
46 | - `ASR_ENGINE`: ASR engine to use (openai_whisper, faster_whisper, whisperx) [default: openai_whisper]
47 | - `ASR_MODEL_PATH`: Custom path to store/load model files [optional]
48 | 
49 | > Interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs)
50 | 
51 | ![Swagger UI](assets/images/swagger-ui.png)
52 | 
53 | ## Cache
54 | 
55 | The ASR model is downloaded each time you start the container. Using the large model can take significant time to download.
56 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory to local storage.
57 | The model will then be loaded from the cache instead of being downloaded again on subsequent container starts.
58 | 
59 | **Important: Using a persistent cache will prevent you from receiving model updates.**
60 | 
61 | === ":octicons-file-code-16: `Default cache dir`"
62 | 
63 | ```shell
64 | docker run -d -p 9000:9000 \
65 | -v $PWD/cache:/root/.cache \
66 | onerahmet/openai-whisper-asr-webservice:latest
67 | ```
68 | 
69 | === ":octicons-file-code-16: `With ASR_MODEL_PATH`"
70 | 
71 | ```shell
72 | docker run -d -p 9000:9000 \
73 | -e ASR_MODEL_PATH=/data/whisper \
74 | -v $PWD/cache:/data/whisper \
75 | onerahmet/openai-whisper-asr-webservice:latest
76 | ```
77 | 
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Whisper ASR Webservice
2 | site_url: https://ahmetoner.github.io/whisper-asr-webservice
3 | site_dir: public
4 | 
5 | site_description: "OpenAI Whisper ASR Webservice API"
6 | repo_url: "https://github.com/ahmetoner/whisper-asr-webservice"
7 | repo_name: "ahmetoner/whisper-asr-webservice"
8 | copyright: Copyright © 2025
9 | edit_uri: edit/main/docs/
10 | 
11 | validation:
12 | omitted_files: warn
13 | absolute_links: warn
14 | unrecognized_links: warn
15 | 
16 | nav:
17 | - Overview: index.md
18 | - Installation & Usage: run.md
19 | - API Endpoints: endpoints.md
20 | - Configuration: environmental-variables.md
21 | - Development: build.md
22 | - Changelog: changelog.md
23 | - License: licence.md
24 | - Releases: https://github.com/ahmetoner/whisper-asr-webservice/releases
25 | - Docker Hub: https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice
26 | 
27 | theme:
28 | name: material
29 | custom_dir: docs/.overrides
30 | icon:
31 | logo: material/subtitles
32 | features:
33 | - announce.dismiss
34 | - content.action.edit
35 | - content.action.view
36 | - content.code.annotate
37 | - content.code.copy
38 | - content.tooltips
39 | - navigation.footer
40 | - navigation.indexes
41 | # - navigation.sections # important
42 | - navigation.top
43 | # - 
navigation.tabs 44 | # - navigation.tabs.sticky 45 | - search.highlight 46 | - search.suggest 47 | - toc.follow 48 | - toc.integrate 49 | palette: 50 | # System preference 51 | - media: "(prefers-color-scheme)" 52 | toggle: 53 | icon: material/brightness-auto 54 | name: Switch to light mode 55 | # Light mode 56 | - media: "(prefers-color-scheme: light)" 57 | scheme: default 58 | primary: custom 59 | accent: teal 60 | toggle: 61 | icon: material/brightness-7 62 | name: Switch to dark mode 63 | # Dark mode 64 | - media: "(prefers-color-scheme: dark)" 65 | scheme: slate 66 | primary: black 67 | accent: lime 68 | toggle: 69 | icon: material/brightness-4 70 | name: Switch to system preference 71 | 72 | 73 | 74 | extra_css: 75 | - assets/css/extra.css 76 | markdown_extensions: 77 | - attr_list 78 | - admonition 79 | - footnotes 80 | - pymdownx.emoji: 81 | emoji_index: !!python/name:materialx.emoji.twemoji 82 | emoji_generator: !!python/name:materialx.emoji.to_svg 83 | - pymdownx.magiclink 84 | - pymdownx.snippets: 85 | check_paths: true 86 | dedent_subsections: true 87 | - pymdownx.superfences 88 | - pymdownx.tabbed: 89 | alternate_style: true 90 | slugify: !!python/object/apply:pymdownx.slugs.slugify 91 | kwds: 92 | case: lower 93 | - pymdownx.tasklist: 94 | custom_checkbox: true 95 | - toc: 96 | permalink: "¶" 97 | - pymdownx.superfences: 98 | custom_fences: 99 | - name: mermaid 100 | class: mermaid 101 | format: !!python/name:pymdownx.superfences.fence_code_format 102 | 103 | plugins: 104 | - search 105 | 106 | extra: 107 | generator: false 108 | social: 109 | - icon: fontawesome/brands/github 110 | link: https://github.com/ahmetoner 111 | - icon: fontawesome/brands/docker 112 | link: https://hub.docker.com/u/onerahmet 113 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "whisper-asr-webservice" 3 | version = "1.10.0-dev" 4 | description = "Whisper ASR Webservice is a general-purpose speech recognition webservice." 
5 | requires-python = ">=3.10,<3.13" 6 | dependencies = [ 7 | "fastapi (>=0.115.14)", 8 | "uvicorn[standard] (>=0.35.0)", 9 | "python-multipart (>=0.0.20)", 10 | "ffmpeg-python (>=0.2.0)", 11 | "numpy (>=2.2.6)", 12 | "openai-whisper (>=20250625)", 13 | "faster-whisper (>=1.1.1)", 14 | "whisperx (>=3.4.2)", 15 | "tqdm (>=4.67.1)", 16 | "llvmlite (>=0.44.0)", 17 | "numba (>=0.61.2)", 18 | ] 19 | authors = [ 20 | { name = "Ahmet Öner" }, 21 | { name = "Besim Alibegovic" } 22 | ] 23 | license = { text = "MIT" } 24 | readme = "README.md" 25 | keywords = ["speech-recognition", "whisper", "asr", "webservice"] 26 | 27 | [tool.poetry] 28 | requires-poetry = ">=2.0" 29 | packages = [{ include = "app" }] 30 | 31 | [project.urls] 32 | Homepage = "https://github.com/ahmetoner/whisper-asr-webservice/" 33 | Repository = "https://github.com/ahmetoner/whisper-asr-webservice" 34 | 35 | [project.scripts] 36 | whisper-asr-webservice = "app.webservice:start" 37 | 38 | [project.optional-dependencies] 39 | cpu = [ 40 | "torch (==2.7.1)", 41 | "torchaudio (==2.7.1)" 42 | ] 43 | cuda = [ 44 | "torch (==2.7.1+cu126)", 45 | "torchaudio (==2.7.1+cu126)" 46 | ] 47 | 48 | [[tool.poetry.source]] 49 | name = "pytorch-cpu" 50 | url = "https://download.pytorch.org/whl/cpu" 51 | priority = "explicit" 52 | 53 | [[tool.poetry.source]] 54 | name = "pytorch-cuda" 55 | url = "https://download.pytorch.org/whl/cu126" 56 | priority = "explicit" 57 | 58 | [tool.poetry.dependencies] 59 | torch = [ 60 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cpu"}, 61 | { markers = "extra == 'cuda' and extra != 'cpu' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cuda"}, 62 | { markers = "extra == 'cpu' and extra != 'cuda' and sys_platform == 'darwin'", source = "pypi"}, 63 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'aarch64' and sys_platform != 'darwin'", source = "pypi"} 64 | ] 65 | torchaudio = [ 66 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cpu"}, 67 | { markers = "extra == 'cuda' and extra != 'cpu' and platform_machine == 'x86_64' and sys_platform != 'darwin'", source = "pytorch-cuda"}, 68 | { markers = "extra == 'cpu' and extra != 'cuda' and sys_platform == 'darwin'", source = "pypi"}, 69 | { markers = "extra == 'cpu' and extra != 'cuda' and platform_machine == 'aarch64' and sys_platform != 'darwin'", source = "pypi"} 70 | ] 71 | 72 | 73 | [tool.poetry.group.dev.dependencies] 74 | pytest = ">=8.3.4,<9.0.0" 75 | ruff = ">=0.9.6,<1.0.0" 76 | black = ">=25.1.0,<26.0.0" 77 | mkdocs-material = ">=9.6.4,<10.0.0" 78 | pymdown-extensions = ">=10.14.3,<11.0.0" 79 | 80 | [build-system] 81 | requires = ["poetry-core>=2.0"] 82 | build-backend = "poetry.core.masonry.api" 83 | 84 | [tool.black] 85 | skip-string-normalization = true 86 | line-length = 120 87 | 88 | [tool.ruff] 89 | line-length = 120 90 | 91 | [tool.ruff.lint] 92 | select = [ 93 | "E", # pycodestyle errors 94 | "W", # pycodestyle warnings 95 | "F", # pyflakes 96 | "I", # isort 97 | "C", # flake8-comprehensions 98 | "B", # flake8-bugbear 99 | ] 100 | ignore = [ 101 | "E501", # line too long, handled by black 102 | "C901", # too complex 103 | ] 104 | 105 | [tool.ruff.lint.isort] 106 | order-by-type = true 107 | relative-imports-order = "closest-to-furthest" 108 | extra-standard-library = ["typing"] 109 | section-order = [ 110 | "future", 111 | 
"standard-library", 112 | "third-party", 113 | "first-party", 114 | "local-folder", 115 | ] 116 | known-first-party = [] 117 | --------------------------------------------------------------------------------