├── .dockerignore ├── docs ├── changelog.md ├── .overrides │ └── main.html ├── licence.md ├── assets │ ├── images │ │ └── swagger-ui.png │ └── css │ │ └── extra.css ├── index.md ├── build.md ├── run.md ├── environmental-variables.md └── endpoints.md ├── .github ├── FUNDING.yml └── workflows │ ├── documentation.yml │ └── docker-publish.yml ├── docker-compose.yml ├── docker-compose.gpu.yml ├── .gitignore ├── app ├── factory │ └── asr_model_factory.py ├── asr_models │ ├── asr_model.py │ ├── openai_whisper_engine.py │ ├── faster_whisper_engine.py │ └── mbain_whisperx_engine.py ├── config.py ├── utils.py └── webservice.py ├── Dockerfile ├── LICENCE ├── Dockerfile.gpu ├── pyproject.toml ├── mkdocs.yml ├── README.md └── CHANGELOG.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .venv 3 | venv -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | --8<-- "CHANGELOG.md" 2 | -------------------------------------------------------------------------------- /docs/.overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | -------------------------------------------------------------------------------- /docs/licence.md: -------------------------------------------------------------------------------- 1 | # Licence 2 | 3 | ``` 4 | --8<-- "LICENCE" 5 | ``` 6 | -------------------------------------------------------------------------------- /docs/assets/images/swagger-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/whisper-asr-webservice/main/docs/assets/images/swagger-ui.png -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [ahmetoner] 4 | custom: ['https://bmc.link/ahmetoner'] 5 | -------------------------------------------------------------------------------- /docs/assets/css/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #3d6178; 3 | --md-primary-fg-color--light: #3d6178; 4 | --md-primary-fg-color--dark: #3d6178; 5 | } 6 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | environment: 9 | - ASR_MODEL=base 10 | ports: 11 | - "9000:9000" 12 | volumes: 13 | - ./app:/app/app 14 | - cache-whisper:/root/.cache 15 | 16 | volumes: 17 | cache-whisper: 18 | -------------------------------------------------------------------------------- /docker-compose.gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice-gpu: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile.gpu 8 | deploy: 9 | resources: 10 | reservations: 11 | devices: 12 | - driver: nvidia 13 | count: 1 14 | capabilities: [gpu] 15 | environment: 16 | - ASR_MODEL=base 17 | ports: 18 | - "9000:9000" 19 | volumes: 20 | - ./app:/app/app 21 | - cache-whisper:/root/.cache 22 | 23 | volumes: 24 | cache-whisper: 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | # Packages 4 | *.egg 5 | !/tests/**/*.egg 6 | /*.egg-info 7 | /dist/* 8 | build 9 | _build 10 | .cache 11 | *.so 12 | venv 13 | 14 | # Installer logs 15 | pip-log.txt 16 | 17 | # Unit test / coverage reports 18 | .coverage 19 | .pytest_cache 20 | 21 | .DS_Store 22 | .idea/* 23 | .python-version 24 | .vscode/* 25 | 26 | /test.py 27 | /test_*.* 28 | 29 | /setup.cfg 30 | MANIFEST.in 31 | /setup.py 32 | /docs/site/* 33 | /tests/fixtures/simple_project/setup.py 34 | /tests/fixtures/project_with_extras/setup.py 35 | .mypy_cache 36 | 37 | .venv 38 | /releases/* 39 | pip-wheel-metadata 40 | /poetry.toml 41 | 42 | poetry/core/* 43 | 44 | public 45 | -------------------------------------------------------------------------------- /app/factory/asr_model_factory.py: -------------------------------------------------------------------------------- 1 | from app.asr_models.asr_model import ASRModel 2 | from app.asr_models.faster_whisper_engine import FasterWhisperASR 3 | from app.asr_models.mbain_whisperx_engine import WhisperXASR 4 | from app.asr_models.openai_whisper_engine import OpenAIWhisperASR 5 | from app.config import CONFIG 6 | 7 | 8 | class ASRModelFactory: 9 | @staticmethod 10 | def create_asr_model() -> ASRModel: 11 | if CONFIG.ASR_ENGINE == "openai_whisper": 12 | return OpenAIWhisperASR() 13 | elif CONFIG.ASR_ENGINE == "faster_whisper": 14 | return FasterWhisperASR() 15 | elif CONFIG.ASR_ENGINE == "whisperx": 16 | return WhisperXASR() 17 | else: 18 | raise ValueError(f"Unsupported ASR engine: {CONFIG.ASR_ENGINE}") 19 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - docs 8 | permissions: 9 | contents: write 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | if: github.event.repository.fork == false 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v4 17 | with: 18 | python-version: 3.x 19 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 20 | - uses: actions/cache@v3 21 | with: 22 | key: mkdocs-material-${{ env.cache_id }} 23 | path: .cache 24 | restore-keys: | 25 | mkdocs-material- 26 | - run: pip install mkdocs-material pymdown-extensions 27 | - run: mkdocs gh-deploy --force 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM python:3.10-bookworm 6 | 7 | ENV POETRY_VENV=/app/.venv 8 | 9 | RUN python3 -m venv $POETRY_VENV \ 10 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 11 | && $POETRY_VENV/bin/pip install poetry==2.1.1 12 | 13 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 14 | 15 | WORKDIR /app 16 | 17 | COPY . 
/app 18 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 19 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 20 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 21 | 22 | RUN poetry config virtualenvs.in-project true 23 | RUN poetry install 24 | 25 | EXPOSE 9000 26 | 27 | ENTRYPOINT ["whisper-asr-webservice"] 28 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ahmet Oner & Besim Alibegovic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitask model that can perform multilingual speech recognition as well as speech translation and language identification. 
2 | 3 | ## Features 4 | 5 | Current release (v1.8.2) supports following whisper models: 6 | 7 | - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930) 8 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0) 9 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1) 10 | 11 | ## Quick Usage 12 | 13 | === ":octicons-file-code-16: `CPU`" 14 | 15 | ```shell 16 | docker run -d -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest 17 | ``` 18 | 19 | === ":octicons-file-code-16: `GPU`" 20 | 21 | ```shell 22 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest-gpu 23 | ``` 24 | 25 | for more information: 26 | 27 | - [Documentation/Run](https://ahmetoner.github.io/whisper-asr-webservice/run) 28 | - [Docker Hub](https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice) 29 | 30 | ## Credits 31 | 32 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) 33 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 6 | 7 | ENV PYTHON_VERSION=3.10 8 | 9 | ENV POETRY_VENV=/app/.venv 10 | 11 | RUN export DEBIAN_FRONTEND=noninteractive \ 12 | && apt-get -qq update \ 13 | && apt-get -qq install --no-install-recommends \ 14 | python${PYTHON_VERSION} \ 15 | python${PYTHON_VERSION}-venv \ 16 | python3-pip \ 17 | libcudnn8 \ 18 | python3-pip \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \ 22 | ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \ 23 | ln -s -f /usr/bin/pip3 /usr/bin/pip 24 | 25 | RUN python3 -m venv $POETRY_VENV \ 26 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 27 | && $POETRY_VENV/bin/pip install poetry==2.1.1 28 | 29 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 30 | 31 | WORKDIR /app 32 | 33 | COPY poetry.lock pyproject.toml ./ 34 | 35 | RUN poetry config virtualenvs.in-project true 36 | RUN poetry install --no-root 37 | 38 | COPY . . 
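# The ffmpeg binary and Swagger UI assets below are copied in from the ffmpeg and swagger-ui build stages declared at the top of this file.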
39 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 40 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 41 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 42 | 43 | RUN poetry install 44 | RUN $POETRY_VENV/bin/pip install torch==2.6.0+cu126 torchaudio==2.6.0+cu126 --index-url https://download.pytorch.org/whl/cu126 45 | 46 | EXPOSE 9000 47 | 48 | CMD whisper-asr-webservice 49 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker Image 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - debug 8 | 9 | env: 10 | DOCKER_USER: ${{secrets.DOCKER_USER}} 11 | DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}} 12 | REPO_NAME: ${{secrets.REPO_NAME}} 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | include: 19 | - docker_file: Dockerfile 20 | platforms: linux/arm64,linux/amd64 21 | - docker_file: Dockerfile.gpu 22 | tag_extension: -gpu 23 | platforms: linux/amd64 24 | steps: 25 | - name: Checkout 26 | uses: actions/checkout@v3 27 | - name: Set up QEMU 28 | uses: docker/setup-qemu-action@v1 29 | - name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v1 31 | - name: Login to DockerHub 32 | uses: docker/login-action@v1 33 | with: 34 | username: ${{ secrets.DOCKER_USER }} 35 | password: ${{ secrets.DOCKER_PASSWORD }} 36 | - name: Build and Publish the Docker debug image 37 | if: github.ref == 'refs/heads/debug' 38 | run: | 39 | DOCKER_IMAGE_DEBUG=$DOCKER_USER/$REPO_NAME:debug${{ matrix.tag_extension }} 40 | docker buildx build . --no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_DEBUG}" -f ${{ matrix.docker_file }} --push 41 | - name: Build and Publish the Docker image 42 | if: github.ref != 'refs/heads/debug' 43 | run: | 44 | DOCKER_IMAGE_LATEST=$DOCKER_USER/$REPO_NAME:latest${{ matrix.tag_extension }} 45 | DOCKER_IMAGE_VERSION=$DOCKER_USER/$REPO_NAME:$GITHUB_REF_NAME${{ matrix.tag_extension }} 46 | docker buildx build . --no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_LATEST}" -t "${DOCKER_IMAGE_VERSION}" -f ${{ matrix.docker_file }} --push 47 | -------------------------------------------------------------------------------- /docs/build.md: -------------------------------------------------------------------------------- 1 | ## Development Environment 2 | 3 | Install poetry with following command: 4 | 5 | ```shell 6 | pip3 install poetry 7 | ``` 8 | 9 | ### Installation 10 | 11 | Install packages: 12 | 13 | ```shell 14 | poetry install 15 | ``` 16 | 17 | !!! Note 18 | By default, this will install the CPU version of PyTorch. For GPU support, you'll need to install the appropriate CUDA version of PyTorch separately: 19 | ```shell 20 | # For CUDA support (example for CUDA 11.8): 21 | pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu121 22 | ``` 23 | 24 | ### Run 25 | 26 | Starting the Webservice: 27 | 28 | ```shell 29 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 30 | ``` 31 | 32 | ### Build 33 | 34 | === ":octicons-file-code-16: `Docker`" 35 | 36 | With `Dockerfile`: 37 | 38 | === ":octicons-file-code-16: `CPU`" 39 | 40 | ```shell 41 | # Build Image 42 | docker build -t whisper-asr-webservice . 
43 | 44 | # Run Container 45 | docker run -d -p 9000:9000 whisper-asr-webservice 46 | # or with specific model 47 | docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice 48 | ``` 49 | 50 | === ":octicons-file-code-16: `GPU`" 51 | 52 | ```shell 53 | # Build Image 54 | docker build -f Dockerfile.gpu -t whisper-asr-webservice-gpu . 55 | 56 | # Run Container 57 | docker run -d --gpus all -p 9000:9000 whisper-asr-webservice-gpu 58 | # or with specific model 59 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice-gpu 60 | ``` 61 | 62 | With `docker-compose`: 63 | 64 | === ":octicons-file-code-16: `CPU`" 65 | 66 | ```shell 67 | docker-compose up --build 68 | ``` 69 | 70 | === ":octicons-file-code-16: `GPU`" 71 | 72 | ```shell 73 | docker-compose -f docker-compose.gpu.yml up --build 74 | ``` 75 | === ":octicons-file-code-16: `Poetry`" 76 | 77 | Build .whl package 78 | 79 | ```shell 80 | poetry build 81 | ``` -------------------------------------------------------------------------------- /app/asr_models/asr_model.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | from abc import ABC, abstractmethod 4 | from threading import Lock 5 | from typing import Union 6 | 7 | import torch 8 | 9 | from app.config import CONFIG 10 | 11 | 12 | class ASRModel(ABC): 13 | """ 14 | Abstract base class for ASR (Automatic Speech Recognition) models. 15 | """ 16 | 17 | model = None 18 | model_lock = Lock() 19 | last_activity_time = time.time() 20 | 21 | def __init__(self): 22 | pass 23 | 24 | @abstractmethod 25 | def load_model(self): 26 | """ 27 | Loads the model from the specified path. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def transcribe( 33 | self, 34 | audio, 35 | task: Union[str, None], 36 | language: Union[str, None], 37 | initial_prompt: Union[str, None], 38 | vad_filter: Union[bool, None], 39 | word_timestamps: Union[bool, None], 40 | options: Union[dict, None], 41 | output, 42 | ): 43 | """ 44 | Perform transcription on the given audio file. 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def language_detection(self, audio): 50 | """ 51 | Perform language detection on the given audio file. 52 | """ 53 | pass 54 | 55 | def monitor_idleness(self): 56 | """ 57 | Monitors the idleness of the ASR model and releases the model if it has been idle for too long. 58 | """ 59 | if CONFIG.MODEL_IDLE_TIMEOUT <= 0: 60 | return 61 | while True: 62 | time.sleep(15) 63 | if time.time() - self.last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT: 64 | with self.model_lock: 65 | self.release_model() 66 | break 67 | 68 | def release_model(self): 69 | """ 70 | Unloads the model from memory and clears any cached GPU memory. 71 | """ 72 | del self.model 73 | torch.cuda.empty_cache() 74 | gc.collect() 75 | self.model = None 76 | print("Model unloaded due to timeout") 77 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | class CONFIG: 7 | """ 8 | Configuration class for ASR models. 9 | Reads environment variables for runtime configuration, with sensible defaults. 
10 | """ 11 | # Determine the ASR engine ('faster_whisper', 'openai_whisper' or 'whisperx') 12 | ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper") 13 | 14 | # Retrieve Huggingface Token 15 | HF_TOKEN = os.getenv("HF_TOKEN", "") 16 | if ASR_ENGINE == "whisperx" and HF_TOKEN == "": 17 | print("You must set the HF_TOKEN environment variable to download the diarization model used by WhisperX.") 18 | 19 | # Determine the computation device (GPU or CPU) 20 | DEVICE = os.getenv("ASR_DEVICE", "cuda" if torch.cuda.is_available() else "cpu") 21 | 22 | # Model name to use (e.g., "base", "small", etc.) 23 | MODEL_NAME = os.getenv("ASR_MODEL", "base") 24 | 25 | # Path to the model directory 26 | MODEL_PATH = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper")) 27 | 28 | # Model quantization level. Defines the precision for model weights: 29 | # 'float32' - 32-bit floating-point precision (higher precision, slower inference) 30 | # 'float16' - 16-bit floating-point precision (lower precision, faster inference) 31 | # 'int8' - 8-bit integer precision (lowest precision, fastest inference) 32 | # Defaults to 'float32' for GPU availability, 'int8' for CPU. 33 | MODEL_QUANTIZATION = os.getenv("ASR_QUANTIZATION", "float32" if torch.cuda.is_available() else "int8") 34 | if MODEL_QUANTIZATION not in {"float32", "float16", "int8"}: 35 | raise ValueError("Invalid MODEL_QUANTIZATION. Choose 'float32', 'float16', or 'int8'.") 36 | 37 | # Idle timeout in seconds. If set to a non-zero value, the model will be unloaded 38 | # after being idle for this many seconds. A value of 0 means the model will never be unloaded. 39 | MODEL_IDLE_TIMEOUT = int(os.getenv("MODEL_IDLE_TIMEOUT", 0)) 40 | 41 | # Default sample rate for audio input. 16 kHz is commonly used in speech-to-text tasks. 42 | SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", 16000)) 43 | 44 | # Subtitle output options for whisperx 45 | SUBTITLE_MAX_LINE_WIDTH = int(os.getenv("SUBTITLE_MAX_LINE_WIDTH", 1000)) 46 | SUBTITLE_MAX_LINE_COUNT = int(os.getenv("SUBTITLE_MAX_LINE_COUNT", 2)) 47 | SUBTITLE_HIGHLIGHT_WORDS = os.getenv("SUBTITLE_HIGHLIGHT_WORDS", "false").lower() == "true" 48 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "whisper-asr-webservice" 3 | version = "1.9.0-dev" 4 | description = "Whisper ASR Webservice is a general-purpose speech recognition webservice." 
5 | homepage = "https://github.com/ahmetoner/whisper-asr-webservice/" 6 | license = "https://github.com/ahmetoner/whisper-asr-webservice/blob/main/LICENCE" 7 | authors = ["Ahmet Öner", "Besim Alibegovic"] 8 | readme = "README.md" 9 | packages = [{ include = "app" }] 10 | 11 | [tool.poetry.scripts] 12 | whisper-asr-webservice = "app.webservice:start" 13 | 14 | [[tool.poetry.source]] 15 | name = "pytorch-cpu" 16 | url = "https://download.pytorch.org/whl/cpu" 17 | priority = "explicit" 18 | 19 | [tool.poetry.dependencies] 20 | python = "<3.13,>=3.10" 21 | fastapi = "^0.115.8" 22 | uvicorn = { extras = ["standard"], version = "^0.34.0" } 23 | python-multipart = "^0.0.20" 24 | ffmpeg-python = "^0.2.0" 25 | numpy = "<2.0.0" 26 | openai-whisper = "^20240930" 27 | faster-whisper = "^1.1.0" 28 | whisperx = "^3.3.1" 29 | tqdm = "^4.67.1" 30 | llvmlite = "^0.44.0" 31 | numba = "^0.61.0" 32 | torch = [ 33 | { version = "2.6.0", source = "pypi", markers = "sys_platform == 'darwin'"}, 34 | { version = "2.6.0", source = "pypi", markers = "platform_machine == 'aarch64' and sys_platform != 'darwin'"}, 35 | { version = "2.6.0", source = "pytorch-cpu", markers = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, 36 | ] 37 | torchaudio = [ 38 | { version = "2.6.0", source = "pypi", markers = "sys_platform == 'darwin'"}, 39 | { version = "2.6.0", source = "pypi", markers = "platform_machine == 'aarch64' and sys_platform != 'darwin'"}, 40 | { version = "2.6.0", source = "pytorch-cpu", markers = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, 41 | ] 42 | 43 | [tool.poetry.group.dev.dependencies] 44 | pytest = "^8.3.4" 45 | ruff = "^0.9.6" 46 | black = "^25.1.0" 47 | mkdocs-material = "^9.6.4" 48 | pymdown-extensions = "^10.14.3" 49 | 50 | [build-system] 51 | requires = ["poetry-core>=1.0.0"] 52 | build-backend = "poetry.core.masonry.api" 53 | 54 | [tool.black] 55 | skip-string-normalization = true 56 | line-length = 120 57 | 58 | [tool.ruff] 59 | line-length = 120 60 | 61 | [tool.ruff.lint] 62 | select = [ 63 | "E", # pycodestyle errors 64 | "W", # pycodestyle warnings 65 | "F", # pyflakes 66 | "I", # isort 67 | "C", # flake8-comprehensions 68 | "B", # flake8-bugbear 69 | ] 70 | ignore = [ 71 | "E501", # line too long, handled by black 72 | "C901", # too complex 73 | ] 74 | 75 | [tool.ruff.lint.isort] 76 | order-by-type = true 77 | relative-imports-order = "closest-to-furthest" 78 | extra-standard-library = ["typing"] 79 | section-order = [ 80 | "future", 81 | "standard-library", 82 | "third-party", 83 | "first-party", 84 | "local-folder", 85 | ] 86 | known-first-party = [] 87 | -------------------------------------------------------------------------------- /docs/run.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 3 | Whisper ASR Webservice is now available on Docker Hub. You can find the latest CPU and GPU images of this project there. 4 | 5 | Docker Hub: <https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice> 6 | 7 | === ":octicons-file-code-16: `CPU`" 8 | 9 | ```shell 10 | docker pull onerahmet/openai-whisper-asr-webservice:latest 11 | docker run -d -p 9000:9000 \ 12 | -e ASR_MODEL=base \ 13 | -e ASR_ENGINE=openai_whisper \ 14 | onerahmet/openai-whisper-asr-webservice:latest 15 | ``` 16 | 17 | === ":octicons-file-code-16: `CPU (macOS)`" 18 | 19 | > GPU passthrough does not work on macOS due to fundamental design limitations of Docker. Docker actually runs containers within a Linux VM on macOS. If you wish to run GPU-accelerated containers, I'm afraid Linux is your only option.
20 | > 21 | > The `:latest` image tag provides both amd64 and arm64 architectures: 22 | 23 | ```shell 24 | docker pull onerahmet/openai-whisper-asr-webservice:latest 25 | docker run -d -p 9000:9000 \ 26 | -e ASR_MODEL=base \ 27 | -e ASR_ENGINE=openai_whisper \ 28 | onerahmet/openai-whisper-asr-webservice:latest 29 | ``` 30 | 31 | === ":octicons-file-code-16: `GPU`" 32 | 33 | ```shell 34 | docker pull onerahmet/openai-whisper-asr-webservice:latest-gpu 35 | docker run -d --gpus all -p 9000:9000 \ 36 | -e ASR_MODEL=base \ 37 | -e ASR_ENGINE=openai_whisper \ 38 | onerahmet/openai-whisper-asr-webservice:latest-gpu 39 | ``` 40 | 41 | ### Environment Variables 42 | 43 | The following environment variables can be used to configure the service: 44 | 45 | - `ASR_MODEL`: Whisper model to use (tiny, base, small, medium, large) [default: base] 46 | - `ASR_ENGINE`: ASR engine to use (openai_whisper, faster_whisper) [default: openai_whisper] 47 | - `ASR_MODEL_PATH`: Custom path to store/load model files [optional] 48 | 49 | > Interactive Swagger API documentation is available at 50 | 51 | ![Swagger UI](assets/images/swagger-ui.png) 52 | 53 | ## Cache 54 | 55 | The ASR model is downloaded each time you start the container. Using the large model can take significant time to download. 56 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory to local storage. 57 | The model will then be loaded from the cache instead of being downloaded again on subsequent container starts. 58 | 59 | **Important: Using a persistent cache will prevent you from receiving model updates.** 60 | 61 | === ":octicons-file-code-16: `Default cache dir`" 62 | 63 | ```shell 64 | docker run -d -p 9000:9000 \ 65 | -v $PWD/cache:/root/.cache \ 66 | onerahmet/openai-whisper-asr-webservice:latest 67 | ``` 68 | 69 | === ":octicons-file-code-16: `With ASR_MODEL_PATH`" 70 | 71 | ```shell 72 | docker run -d -p 9000:9000 \ 73 | -e ASR_MODEL_PATH=/data/whisper \ 74 | -v $PWD/cache:/data/whisper \ 75 | onerahmet/openai-whisper-asr-webservice:latest 76 | ``` 77 | -------------------------------------------------------------------------------- /docs/environmental-variables.md: -------------------------------------------------------------------------------- 1 | ### Configuring the `Engine` 2 | 3 | === ":octicons-file-code-16: `openai_whisper`" 4 | 5 | ```shell 6 | export ASR_ENGINE=openai_whisper 7 | ``` 8 | 9 | === ":octicons-file-code-16: `faster_whisper`" 10 | 11 | ```shell 12 | export ASR_ENGINE=faster_whisper 13 | ``` 14 | 15 | === ":octicons-file-code-16: `whisperx`" 16 | 17 | ```shell 18 | export ASR_ENGINE=whisperx 19 | ``` 20 | 21 | ### Configuring the `Model` 22 | 23 | ```shell 24 | export ASR_MODEL=base 25 | ``` 26 | 27 | Available ASR_MODELs are: 28 | 29 | - Standard models: `tiny`, `base`, `small`, `medium`, `large-v1`, `large-v2`, `large-v3` (or `large`), `large-v3-turbo` (or `turbo`) 30 | - English-optimized models: `tiny.en`, `base.en`, `small.en`, `medium.en` 31 | - Distilled models: `distil-large-v2`, `distil-medium.en`, `distil-small.en`, `distil-large-v3` (only for whisperx and faster-whisper) 32 | 33 | For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en` 34 | models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models. 35 | 36 | The distilled models offer improved inference speed while maintaining good accuracy. 
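For example, pairing an engine with one of the models above is just a matter of setting both variables before starting the service (a sketch; as noted above, the distilled models require the `faster_whisper` or `whisperx` engine):

```shell
# Example: serve a distilled model with the faster_whisper engine
export ASR_ENGINE=faster_whisper
export ASR_MODEL=distil-large-v3
```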
37 | 38 | ### Configuring the `Model Path` 39 | 40 | ```shell 41 | export ASR_MODEL_PATH=/data/whisper 42 | ``` 43 | 44 | ### Configuring the `Model Unloading Timeout` 45 | 46 | ```shell 47 | export MODEL_IDLE_TIMEOUT=300 48 | ``` 49 | 50 | Defaults to `0`. After no activity for this period (in seconds), unload the model until it is requested again. Setting 51 | `0` disables the timeout, keeping the model loaded indefinitely. 52 | 53 | ### Configuring the `SAMPLE_RATE` 54 | 55 | ```shell 56 | export SAMPLE_RATE=16000 57 | ``` 58 | 59 | Defaults to `16000`. Default sample rate for audio input. `16 kHz` is commonly used in `speech-to-text` tasks. 60 | 61 | ### Configuring Device and Quantization 62 | 63 | ```shell 64 | export ASR_DEVICE=cuda # or 'cpu' 65 | export ASR_QUANTIZATION=float32 # or 'float16', 'int8' 66 | ``` 67 | 68 | The `ASR_DEVICE` defaults to `cuda` if GPU is available, otherwise `cpu`. 69 | 70 | The `ASR_QUANTIZATION` defines the precision for model weights: 71 | 72 | - `float32`: 32-bit floating-point precision (higher precision, slower inference) 73 | - `float16`: 16-bit floating-point precision (lower precision, faster inference) 74 | - `int8`: 8-bit integer precision (lowest precision, fastest inference) 75 | 76 | Defaults to `float32` for GPU, `int8` for CPU. 77 | 78 | ### Configuring Subtitle Options (WhisperX) 79 | 80 | ```shell 81 | export SUBTITLE_MAX_LINE_WIDTH=1000 82 | export SUBTITLE_MAX_LINE_COUNT=2 83 | export SUBTITLE_HIGHLIGHT_WORDS=false 84 | ``` 85 | 86 | These options only apply when using the WhisperX engine: 87 | 88 | - `SUBTITLE_MAX_LINE_WIDTH`: Maximum width of subtitle lines (default: 1000) 89 | - `SUBTITLE_MAX_LINE_COUNT`: Maximum number of lines per subtitle (default: 2) 90 | - `SUBTITLE_HIGHLIGHT_WORDS`: Enable word highlighting in subtitles (default: false) 91 | 92 | ### Hugging Face Token 93 | 94 | ```shell 95 | export HF_TOKEN=your_token_here 96 | ``` 97 | 98 | Required when using the WhisperX engine to download the diarization model. 
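As an example, the token can be passed to the container together with the engine selection (a sketch based on the Docker examples in the run guide; replace the placeholder with your own token):

```shell
docker run -d --gpus all -p 9000:9000 \
  -e ASR_ENGINE=whisperx \
  -e ASR_MODEL=base \
  -e HF_TOKEN=your_token_here \
  onerahmet/openai-whisper-asr-webservice:latest-gpu
```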
99 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Whisper ASR Webservice 2 | site_url: https://ahmetoner.github.io/whisper-asr-webservice 3 | site_dir: public 4 | 5 | site_description: "OpenAI Whisper ASR Webservice API" 6 | repo_url: "https://github.com/ahmetoner/whisper-asr-webservice" 7 | repo_name: "ahmetoner/whisper-asr-webservice" 8 | copyright: Copyright © 2025 9 | edit_uri: edit/main/docs/ 10 | 11 | validation: 12 | omitted_files: warn 13 | absolute_links: warn 14 | unrecognized_links: warn 15 | 16 | nav: 17 | - Overview: index.md 18 | - Installation & Usage: run.md 19 | - API Endpoints: endpoints.md 20 | - Configuration: environmental-variables.md 21 | - Development: build.md 22 | - Changelog: changelog.md 23 | - License: licence.md 24 | - Releases: https://github.com/ahmetoner/whisper-asr-webservice/releases 25 | - Docker Hub: https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice 26 | 27 | theme: 28 | name: material 29 | custom_dir: docs/.overrides 30 | icon: 31 | logo: material/subtitles 32 | features: 33 | - announce.dismiss 34 | - content.action.edit 35 | - content.action.view 36 | - content.code.annotate 37 | - content.code.copy 38 | - content.tooltips 39 | - navigation.footer 40 | - navigation.indexes 41 | # - navigation.sections # important 42 | - navigation.top 43 | # - navigation.tabs 44 | # - navigation.tabs.sticky 45 | - search.highlight 46 | - search.suggest 47 | - toc.follow 48 | - toc.integrate 49 | palette: 50 | # System preference 51 | - media: "(prefers-color-scheme)" 52 | toggle: 53 | icon: material/brightness-auto 54 | name: Switch to light mode 55 | # Light mode 56 | - media: "(prefers-color-scheme: light)" 57 | scheme: default 58 | primary: custom 59 | accent: teal 60 | toggle: 61 | icon: material/brightness-7 62 | name: Switch to dark mode 63 | # Dark mode 64 | - media: "(prefers-color-scheme: dark)" 65 | scheme: slate 66 | primary: black 67 | accent: lime 68 | toggle: 69 | icon: material/brightness-4 70 | name: Switch to system preference 71 | 72 | 73 | 74 | extra_css: 75 | - assets/css/extra.css 76 | markdown_extensions: 77 | - attr_list 78 | - admonition 79 | - footnotes 80 | - pymdownx.emoji: 81 | emoji_index: !!python/name:materialx.emoji.twemoji 82 | emoji_generator: !!python/name:materialx.emoji.to_svg 83 | - pymdownx.magiclink 84 | - pymdownx.snippets: 85 | check_paths: true 86 | dedent_subsections: true 87 | - pymdownx.superfences 88 | - pymdownx.tabbed: 89 | alternate_style: true 90 | slugify: !!python/object/apply:pymdownx.slugs.slugify 91 | kwds: 92 | case: lower 93 | - pymdownx.tasklist: 94 | custom_checkbox: true 95 | - toc: 96 | permalink: "¶" 97 | - pymdownx.superfences: 98 | custom_fences: 99 | - name: mermaid 100 | class: mermaid 101 | format: !!python/name:pymdownx.superfences.fence_code_format 102 | 103 | plugins: 104 | - search 105 | 106 | extra: 107 | generator: false 108 | social: 109 | - icon: fontawesome/brands/github 110 | link: https://github.com/ahmetoner 111 | - icon: fontawesome/brands/docker 112 | link: https://hub.docker.com/u/onerahmet 113 | -------------------------------------------------------------------------------- /app/asr_models/openai_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import 
torch 7 | import whisper 8 | from whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 9 | 10 | from app.asr_models.asr_model import ASRModel 11 | from app.config import CONFIG 12 | 13 | 14 | class OpenAIWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | if torch.cuda.is_available(): 19 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH).cuda() 20 | else: 21 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH) 22 | 23 | Thread(target=self.monitor_idleness, daemon=True).start() 24 | 25 | def transcribe( 26 | self, 27 | audio, 28 | task: Union[str, None], 29 | language: Union[str, None], 30 | initial_prompt: Union[str, None], 31 | vad_filter: Union[bool, None], 32 | word_timestamps: Union[bool, None], 33 | options: Union[dict, None], 34 | output, 35 | ): 36 | self.last_activity_time = time.time() 37 | 38 | with self.model_lock: 39 | if self.model is None: 40 | self.load_model() 41 | 42 | options_dict = {"task": task} 43 | if language: 44 | options_dict["language"] = language 45 | if initial_prompt: 46 | options_dict["initial_prompt"] = initial_prompt 47 | if word_timestamps: 48 | options_dict["word_timestamps"] = word_timestamps 49 | with self.model_lock: 50 | result = self.model.transcribe(audio, **options_dict) 51 | 52 | output_file = StringIO() 53 | self.write_result(result, output_file, output) 54 | output_file.seek(0) 55 | 56 | return output_file 57 | 58 | def language_detection(self, audio): 59 | 60 | self.last_activity_time = time.time() 61 | 62 | with self.model_lock: 63 | if self.model is None: 64 | self.load_model() 65 | 66 | # load audio and pad/trim it to fit 30 seconds 67 | audio = whisper.pad_or_trim(audio) 68 | 69 | # make log-Mel spectrogram and move to the same device as the model 70 | mel = whisper.log_mel_spectrogram(audio, self.model.dims.n_mels).to(self.model.device) 71 | 72 | # detect the spoken language 73 | with self.model_lock: 74 | _, probs = self.model.detect_language(mel) 75 | detected_lang_code = max(probs, key=probs.get) 76 | 77 | return detected_lang_code, probs[max(probs)] 78 | 79 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 80 | options = {"max_line_width": 1000, "max_line_count": 10, "highlight_words": False} 81 | if output == "srt": 82 | WriteSRT(ResultWriter).write_result(result, file=file, options=options) 83 | elif output == "vtt": 84 | WriteVTT(ResultWriter).write_result(result, file=file, options=options) 85 | elif output == "tsv": 86 | WriteTSV(ResultWriter).write_result(result, file=file, options=options) 87 | elif output == "json": 88 | WriteJSON(ResultWriter).write_result(result, file=file, options=options) 89 | else: 90 | WriteTXT(ResultWriter).write_result(result, file=file, options=options) 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Release](https://img.shields.io/github/v/release/ahmetoner/whisper-asr-webservice.svg) 2 | ![Docker Pulls](https://img.shields.io/docker/pulls/onerahmet/openai-whisper-asr-webservice.svg) 3 | ![Build](https://img.shields.io/github/actions/workflow/status/ahmetoner/whisper-asr-webservice/docker-publish.yml.svg) 4 | ![Licence](https://img.shields.io/github/license/ahmetoner/whisper-asr-webservice.svg) 5 | 6 | # Whisper ASR Box 7 | 8 | Whisper ASR Box is a general-purpose speech recognition toolkit. 
Whisper models are trained on a large dataset of diverse audio and are also multitask models that can perform multilingual speech recognition as well as speech translation and language identification. 9 | 10 | ## Features 11 | 12 | The current release (v1.8.2) supports the following Whisper models: 13 | 14 | - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930) 15 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0) 16 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1) 17 | 18 | ## Quick Usage 19 | 20 | ### CPU 21 | 22 | ```shell 23 | docker run -d -p 9000:9000 \ 24 | -e ASR_MODEL=base \ 25 | -e ASR_ENGINE=openai_whisper \ 26 | onerahmet/openai-whisper-asr-webservice:latest 27 | ``` 28 | 29 | ### GPU 30 | 31 | ```shell 32 | docker run -d --gpus all -p 9000:9000 \ 33 | -e ASR_MODEL=base \ 34 | -e ASR_ENGINE=openai_whisper \ 35 | onerahmet/openai-whisper-asr-webservice:latest-gpu 36 | ``` 37 | 38 | #### Cache 39 | 40 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory: 41 | 42 | ```shell 43 | docker run -d -p 9000:9000 \ 44 | -v $PWD/cache:/root/.cache/ \ 45 | onerahmet/openai-whisper-asr-webservice:latest 46 | ``` 47 | 48 | ## Key Features 49 | 50 | - Support for multiple ASR engines (OpenAI Whisper, Faster Whisper, WhisperX) 51 | - Multiple output formats (text, JSON, VTT, SRT, TSV) 52 | - Word-level timestamp support 53 | - Voice activity detection (VAD) filtering 54 | - Speaker diarization (with WhisperX) 55 | - FFmpeg integration for broad audio/video format support 56 | - GPU acceleration support 57 | - Configurable model loading/unloading 58 | - REST API with Swagger documentation 59 | 60 | ## Environment Variables 61 | 62 | Key configuration options: 63 | 64 | - `ASR_ENGINE`: Engine selection (openai_whisper, faster_whisper, whisperx) 65 | - `ASR_MODEL`: Model selection (tiny, base, small, medium, large-v3, etc.) 66 | - `ASR_MODEL_PATH`: Custom path to store/load models 67 | - `ASR_DEVICE`: Device selection (cuda, cpu) 68 | - `MODEL_IDLE_TIMEOUT`: Timeout for model unloading 69 | 70 | ## Documentation 71 | 72 | For complete documentation, visit: 73 | [https://ahmetoner.github.io/whisper-asr-webservice](https://ahmetoner.github.io/whisper-asr-webservice) 74 | 75 | ## Development 76 | 77 | ```shell 78 | # Install poetry 79 | pip3 install poetry 80 | 81 | # Install dependencies 82 | poetry install 83 | 84 | # Run service 85 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 86 | ``` 87 | 88 | After starting the service, visit `http://localhost:9000` or `http://0.0.0.0:9000` in your browser to access the Swagger UI documentation and try out the API endpoints.
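For a quick check from the command line, you can also send a file straight to the `/asr` endpoint (a sketch; replace the path with one of your own audio files):

```shell
curl -X POST -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/file" \
  "http://localhost:9000/asr?output=json&task=transcribe"
```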
89 | 90 | ## Credits 91 | 92 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) 93 | -------------------------------------------------------------------------------- /app/asr_models/faster_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisper 7 | from faster_whisper import WhisperModel 8 | 9 | from app.asr_models.asr_model import ASRModel 10 | from app.config import CONFIG 11 | from app.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 12 | 13 | 14 | class FasterWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | self.model = WhisperModel( 19 | model_size_or_path=CONFIG.MODEL_NAME, 20 | device=CONFIG.DEVICE, 21 | compute_type=CONFIG.MODEL_QUANTIZATION, 22 | download_root=CONFIG.MODEL_PATH 23 | ) 24 | 25 | Thread(target=self.monitor_idleness, daemon=True).start() 26 | 27 | def transcribe( 28 | self, 29 | audio, 30 | task: Union[str, None], 31 | language: Union[str, None], 32 | initial_prompt: Union[str, None], 33 | vad_filter: Union[bool, None], 34 | word_timestamps: Union[bool, None], 35 | options: Union[dict, None], 36 | output, 37 | ): 38 | self.last_activity_time = time.time() 39 | 40 | with self.model_lock: 41 | if self.model is None: 42 | self.load_model() 43 | 44 | options_dict = {"task": task} 45 | if language: 46 | options_dict["language"] = language 47 | if initial_prompt: 48 | options_dict["initial_prompt"] = initial_prompt 49 | if vad_filter: 50 | options_dict["vad_filter"] = True 51 | if word_timestamps: 52 | options_dict["word_timestamps"] = True 53 | with self.model_lock: 54 | segments = [] 55 | text = "" 56 | segment_generator, info = self.model.transcribe(audio, beam_size=5, **options_dict) 57 | for segment in segment_generator: 58 | segments.append(segment) 59 | text = text + segment.text 60 | result = {"language": options_dict.get("language", info.language), "segments": segments, "text": text} 61 | 62 | output_file = StringIO() 63 | self.write_result(result, output_file, output) 64 | output_file.seek(0) 65 | 66 | return output_file 67 | 68 | def language_detection(self, audio): 69 | 70 | self.last_activity_time = time.time() 71 | 72 | with self.model_lock: 73 | if self.model is None: self.load_model() 74 | 75 | # load audio and pad/trim it to fit 30 seconds 76 | audio = whisper.pad_or_trim(audio) 77 | 78 | # detect the spoken language 79 | with self.model_lock: 80 | segments, info = self.model.transcribe(audio, beam_size=5) 81 | detected_lang_code = info.language 82 | detected_language_confidence = info.language_probability 83 | 84 | return detected_lang_code, detected_language_confidence 85 | 86 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 87 | if output == "srt": 88 | WriteSRT(ResultWriter).write_result(result, file=file) 89 | elif output == "vtt": 90 | WriteVTT(ResultWriter).write_result(result, file=file) 91 | elif output == "tsv": 92 | WriteTSV(ResultWriter).write_result(result, file=file) 93 | elif output == "json": 94 | WriteJSON(ResultWriter).write_result(result, file=file) 95 | else: 96 | WriteTXT(ResultWriter).write_result(result, file=file) 97 | -------------------------------------------------------------------------------- /docs/endpoints.md: 
-------------------------------------------------------------------------------- 1 | ## Quick start 2 | 3 | After running the docker image interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs) 4 | 5 | There are 2 endpoints available: 6 | 7 | - [/asr](##Automatic-Speech-recognition-service-/asr) (Automatic Speech Recognition) 8 | - [/detect-language](##Language-detection-service-/detect-language) 9 | 10 | ## Automatic speech recognition service /asr 11 | 12 | - 2 task choices: 13 | - **transcribe**: (default) task, transcribes the uploaded file. 14 | - **translate**: will provide an English transcript no matter which language was spoken. 15 | - Files are automatically converted with FFmpeg. 16 | - Full list of supported [audio](https://ffmpeg.org/general.html#Audio-Codecs) and [video](https://ffmpeg.org/general.html#Video-Codecs) formats. 17 | - You can enable word level timestamps output by `word_timestamps` parameter 18 | - You can Enable the voice activity detection (VAD) to filter out parts of the audio without speech by `vad_filter` parameter (only with `Faster Whisper` for now). 19 | 20 | ### Request URL Query Params 21 | 22 | | Name | Values | Description | 23 | |-----------------|------------------------------------------------|----------------------------------------------------------------| 24 | | audio_file | File | Audio or video file to transcribe | 25 | | output | `text` (default), `json`, `vtt`, `srt`, `tsv` | Output format | 26 | | task | `transcribe`, `translate` | Task type - transcribe in source language or translate to English | 27 | | language | `en` (default is auto recognition) | Source language code (see supported languages) | 28 | | word_timestamps | false (default) | Enable word-level timestamps (Faster Whisper only) | 29 | | vad_filter | false (default) | Enable voice activity detection filtering (Faster Whisper only) | 30 | | encode | true (default) | Encode audio through FFmpeg before processing | 31 | | diarize | false (default) | Enable speaker diarization (WhisperX only) | 32 | | min_speakers | null (default) | Minimum number of speakers for diarization (WhisperX only) | 33 | | max_speakers | null (default) | Maximum number of speakers for diarization (WhisperX only) | 34 | 35 | Example request with cURL 36 | 37 | ```bash 38 | curl -X POST -H "content-type: multipart/form-data" -F "audio_file=@/path/to/file" 0.0.0.0:9000/asr?output=json 39 | ``` 40 | 41 | ### Response (JSON) 42 | 43 | - **text**: Contains the full transcript 44 | - **segments**: Contains an entry per segment. Each entry provides `timestamps`, `transcript`, `token ids`, `word level timestamps` and other metadata 45 | - **language**: Detected or provided language (as a language code) 46 | 47 | ### Response Formats 48 | 49 | The API supports multiple output formats: 50 | 51 | - **text**: Plain text transcript (default) 52 | - **json**: Detailed JSON with segments, timestamps, and metadata 53 | - **vtt**: WebVTT subtitle format 54 | - **srt**: SubRip subtitle format 55 | - **tsv**: Tab-separated values with timestamps 56 | 57 | ### Supported Languages 58 | 59 | The service supports all languages supported by Whisper. Some common language codes: 60 | 61 | - Turkish (tr) 62 | - English (en) 63 | - Spanish (es) 64 | - French (fr) 65 | - German (de) 66 | - Italian (it) 67 | - Portuguese (pt) 68 | - And many more... 
69 | 70 | See the [Whisper documentation](https://github.com/openai/whisper#available-models-and-languages) for the full list of supported languages. 71 | 72 | ### Speaker Diarization 73 | 74 | When using the WhisperX engine with diarization enabled (`diarize=true`), the output will include speaker labels for each segment. This requires: 75 | 76 | 1. WhisperX engine to be configured 77 | 2. Valid Hugging Face token set in HF_TOKEN 78 | 3. Sufficient memory for diarization models 79 | 80 | You can optionally specify `min_speakers` and `max_speakers` if you know the expected number of speakers. 81 | 82 | ## Language detection service /detect-language 83 | 84 | Detects the language spoken in the uploaded file. Only processes first 30 seconds. 85 | 86 | Returns a json with following fields: 87 | 88 | - **detected_language**: Human readable language name (e.g. "english") 89 | - **language_code**: ISO language code (e.g. "en") 90 | - **confidence**: Confidence score between 0 and 1 indicating detection reliability 91 | 92 | Example response: 93 | 94 | ```json 95 | { 96 | "detected_language": "english", 97 | "language_code": "en", 98 | "confidence": 0.98 99 | } 100 | ``` 101 | -------------------------------------------------------------------------------- /app/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dataclasses import asdict 4 | from typing import BinaryIO, TextIO 5 | 6 | import ffmpeg 7 | import numpy as np 8 | from faster_whisper.utils import format_timestamp 9 | 10 | from app.config import CONFIG 11 | 12 | 13 | class ResultWriter: 14 | extension: str 15 | 16 | def __init__(self, output_dir: str): 17 | self.output_dir = output_dir 18 | 19 | def __call__(self, result: dict, audio_path: str): 20 | audio_basename = os.path.basename(audio_path) 21 | output_path = os.path.join(self.output_dir, audio_basename + "." 
+ self.extension) 22 | 23 | with open(output_path, "w", encoding="utf-8") as f: 24 | self.write_result(result, file=f) 25 | 26 | def write_result(self, result: dict, file: TextIO): 27 | raise NotImplementedError 28 | 29 | 30 | class WriteTXT(ResultWriter): 31 | extension: str = "txt" 32 | 33 | def write_result(self, result: dict, file: TextIO): 34 | for segment in result["segments"]: 35 | print(segment.text.strip(), file=file, flush=True) 36 | 37 | 38 | class WriteVTT(ResultWriter): 39 | extension: str = "vtt" 40 | 41 | def write_result(self, result: dict, file: TextIO): 42 | print("WEBVTT\n", file=file) 43 | for segment in result["segments"]: 44 | print( 45 | f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n" 46 | f"{segment.text.strip().replace('-->', '->')}\n", 47 | file=file, 48 | flush=True, 49 | ) 50 | 51 | 52 | class WriteSRT(ResultWriter): 53 | extension: str = "srt" 54 | 55 | def write_result(self, result: dict, file: TextIO): 56 | for i, segment in enumerate(result["segments"], start=1): 57 | # write srt lines 58 | print( 59 | f"{i}\n" 60 | f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> " 61 | f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n" 62 | f"{segment.text.strip().replace('-->', '->')}\n", 63 | file=file, 64 | flush=True, 65 | ) 66 | 67 | 68 | class WriteTSV(ResultWriter): 69 | """ 70 | Write a transcript to a file in TSV (tab-separated values) format containing lines like: 71 | \t\t 72 | 73 | Using integer milliseconds as start and end times means there's no chance of interference from 74 | an environment setting a language encoding that causes the decimal in a floating point number 75 | to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. 76 | """ 77 | 78 | extension: str = "tsv" 79 | 80 | def write_result(self, result: dict, file: TextIO): 81 | print("start", "end", "text", sep="\t", file=file) 82 | for segment in result["segments"]: 83 | print(round(1000 * segment.start), file=file, end="\t") 84 | print(round(1000 * segment.end), file=file, end="\t") 85 | print(segment.text.strip().replace("\t", " "), file=file, flush=True) 86 | 87 | 88 | class WriteJSON(ResultWriter): 89 | extension: str = "json" 90 | 91 | def write_result(self, result: dict, file: TextIO): 92 | if "segments" in result: 93 | result["segments"] = [asdict(segment) for segment in result["segments"]] 94 | json.dump(result, file) 95 | 96 | 97 | def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE): 98 | """ 99 | Open an audio file object and read as mono waveform, resampling as necessary. 100 | Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object 101 | Parameters 102 | ---------- 103 | file: BinaryIO 104 | The audio file like object 105 | encode: Boolean 106 | If true, encode audio stream to WAV before sending to whisper 107 | sr: int 108 | The sample rate to resample the audio if necessary 109 | Returns 110 | ------- 111 | A NumPy array containing the audio waveform, in float32 dtype. 112 | """ 113 | if encode: 114 | try: 115 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 116 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
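# ffmpeg reads the uploaded bytes from stdin ("pipe:") and writes 16-bit mono PCM at the target sample rate to stdout; the buffer is converted to float32 below.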
117 | out, _ = ( 118 | ffmpeg.input("pipe:", threads=0) 119 | .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) 120 | .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read()) 121 | ) 122 | except ffmpeg.Error as e: 123 | raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e 124 | else: 125 | out = file.read() 126 | 127 | return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 128 | -------------------------------------------------------------------------------- /app/asr_models/mbain_whisperx_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisper 7 | import whisperx 8 | from whisperx.utils import ResultWriter, SubtitlesWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 9 | 10 | from app.asr_models.asr_model import ASRModel 11 | from app.config import CONFIG 12 | 13 | 14 | class WhisperXASR(ASRModel): 15 | def __init__(self): 16 | super().__init__() 17 | self.model = { 18 | 'whisperx': None, 19 | 'diarize_model': None, 20 | 'align_model': {} 21 | } 22 | 23 | def load_model(self): 24 | asr_options = {"without_timestamps": False} 25 | self.model['whisperx'] = whisperx.load_model( 26 | CONFIG.MODEL_NAME, 27 | device=CONFIG.DEVICE, 28 | compute_type=CONFIG.MODEL_QUANTIZATION, 29 | asr_options=asr_options 30 | ) 31 | 32 | if CONFIG.HF_TOKEN != "": 33 | self.model['diarize_model'] = whisperx.DiarizationPipeline( 34 | use_auth_token=CONFIG.HF_TOKEN, 35 | device=CONFIG.DEVICE 36 | ) 37 | 38 | Thread(target=self.monitor_idleness, daemon=True).start() 39 | 40 | def transcribe( 41 | self, 42 | audio, 43 | task: Union[str, None], 44 | language: Union[str, None], 45 | initial_prompt: Union[str, None], 46 | vad_filter: Union[bool, None], 47 | word_timestamps: Union[bool, None], 48 | options: Union[dict, None], 49 | output, 50 | ): 51 | self.last_activity_time = time.time() 52 | with self.model_lock: 53 | if self.model is None: 54 | self.load_model() 55 | 56 | options_dict = {"task": task} 57 | if language: 58 | options_dict["language"] = language 59 | if initial_prompt: 60 | options_dict["initial_prompt"] = initial_prompt 61 | with self.model_lock: 62 | result = self.model['whisperx'].transcribe(audio, **options_dict) 63 | language = result["language"] 64 | 65 | # Load the required model and cache it 66 | # If we transcribe models in many different languages, this may lead to OOM propblems 67 | if result["language"] in self.model['align_model']: 68 | model_x, metadata = self.model['align_model'][result["language"]] 69 | else: 70 | self.model['align_model'][result["language"]] = whisperx.load_align_model( 71 | language_code=result["language"], device=CONFIG.DEVICE 72 | ) 73 | model_x, metadata = self.model['align_model'][result["language"]] 74 | 75 | # Align whisper output 76 | result = whisperx.align( 77 | result["segments"], model_x, metadata, audio, CONFIG.DEVICE, return_char_alignments=False 78 | ) 79 | 80 | if options.get("diarize", False) and CONFIG.HF_TOKEN != "": 81 | min_speakers = options.get("min_speakers", None) 82 | max_speakers = options.get("max_speakers", None) 83 | # add min/max number of speakers if known 84 | diarize_segments = self.model['diarize_model'](audio, min_speakers, max_speakers) 85 | result = whisperx.assign_word_speakers(diarize_segments, result) 86 | result["language"] = language 87 | 88 | output_file = StringIO() 89 | 
self.write_result(result, output_file, output) 90 | output_file.seek(0) 91 | 92 | return output_file 93 | 94 | def language_detection(self, audio): 95 | # load audio and pad/trim it to fit 30 seconds 96 | audio = whisper.pad_or_trim(audio) 97 | 98 | # make log-Mel spectrogram and move to the same device as the model 99 | mel = whisper.log_mel_spectrogram(audio).to(self.model.device) 100 | 101 | # detect the spoken language 102 | with self.model_lock: 103 | if self.model is None: 104 | self.load_model() 105 | _, probs = self.model.detect_language(mel) 106 | detected_lang_code = max(probs, key=probs.get) 107 | 108 | return detected_lang_code 109 | 110 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 111 | default_options = { 112 | "max_line_width": CONFIG.SUBTITLE_MAX_LINE_WIDTH, 113 | "max_line_count": CONFIG.SUBTITLE_MAX_LINE_COUNT, 114 | "highlight_words": CONFIG.SUBTITLE_HIGHLIGHT_WORDS 115 | } 116 | 117 | if output == "srt": 118 | WriteSRT(SubtitlesWriter).write_result(result, file=file, options=default_options) 119 | elif output == "vtt": 120 | WriteVTT(SubtitlesWriter).write_result(result, file=file, options=default_options) 121 | elif output == "tsv": 122 | WriteTSV(ResultWriter).write_result(result, file=file, options=default_options) 123 | elif output == "json": 124 | WriteJSON(ResultWriter).write_result(result, file=file, options=default_options) 125 | else: 126 | WriteTXT(ResultWriter).write_result(result, file=file, options=default_options) 127 | -------------------------------------------------------------------------------- /app/webservice.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import os 3 | from os import path 4 | from typing import Annotated, Optional, Union 5 | from urllib.parse import quote 6 | 7 | import click 8 | import uvicorn 9 | from fastapi import FastAPI, File, Query, UploadFile, applications 10 | from fastapi.openapi.docs import get_swagger_ui_html 11 | from fastapi.responses import RedirectResponse, StreamingResponse 12 | from fastapi.staticfiles import StaticFiles 13 | from whisper import tokenizer 14 | 15 | from app.config import CONFIG 16 | from app.factory.asr_model_factory import ASRModelFactory 17 | from app.utils import load_audio 18 | 19 | asr_model = ASRModelFactory.create_asr_model() 20 | asr_model.load_model() 21 | 22 | LANGUAGE_CODES = sorted(tokenizer.LANGUAGES.keys()) 23 | 24 | projectMetadata = importlib.metadata.metadata("whisper-asr-webservice") 25 | app = FastAPI( 26 | title=projectMetadata["Name"].title().replace("-", " "), 27 | description=projectMetadata["Summary"], 28 | version=projectMetadata["Version"], 29 | contact={"url": projectMetadata["Home-page"]}, 30 | swagger_ui_parameters={"defaultModelsExpandDepth": -1}, 31 | license_info={"name": "MIT License", "url": projectMetadata["License"]}, 32 | ) 33 | 34 | assets_path = os.getcwd() + "/swagger-ui-assets" 35 | if path.exists(assets_path + "/swagger-ui.css") and path.exists(assets_path + "/swagger-ui-bundle.js"): 36 | app.mount("/assets", StaticFiles(directory=assets_path), name="static") 37 | 38 | def swagger_monkey_patch(*args, **kwargs): 39 | return get_swagger_ui_html( 40 | *args, 41 | **kwargs, 42 | swagger_favicon_url="", 43 | swagger_css_url="/assets/swagger-ui.css", 44 | swagger_js_url="/assets/swagger-ui-bundle.js", 45 | ) 46 | 47 | applications.get_swagger_ui_html = swagger_monkey_patch 48 | 49 | 50 | @app.get("/", response_class=RedirectResponse, 
include_in_schema=False) 51 | async def index(): 52 | return "/docs" 53 | 54 | 55 | @app.post("/asr", tags=["Endpoints"]) 56 | async def asr( 57 | audio_file: UploadFile = File(...), # noqa: B008 58 | encode: bool = Query(default=True, description="Encode audio first through ffmpeg"), 59 | task: Union[str, None] = Query(default="transcribe", enum=["transcribe", "translate"]), 60 | language: Union[str, None] = Query(default=None, enum=LANGUAGE_CODES), 61 | initial_prompt: Union[str, None] = Query(default=None), 62 | vad_filter: Annotated[ 63 | bool | None, 64 | Query( 65 | description="Enable the voice activity detection (VAD) to filter out parts of the audio without speech", 66 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 67 | ), 68 | ] = False, 69 | word_timestamps: bool = Query( 70 | default=False, 71 | description="Word level timestamps", 72 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 73 | ), 74 | diarize: bool = Query( 75 | default=False, 76 | description="Diarize the input", 77 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" and CONFIG.HF_TOKEN != "" else False), 78 | ), 79 | min_speakers: Union[int, None] = Query( 80 | default=None, 81 | description="Min speakers in this file", 82 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 83 | ), 84 | max_speakers: Union[int, None] = Query( 85 | default=None, 86 | description="Max speakers in this file", 87 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 88 | ), 89 | output: Union[str, None] = Query(default="txt", enum=["txt", "vtt", "srt", "tsv", "json"]), 90 | ): 91 | result = asr_model.transcribe( 92 | load_audio(audio_file.file, encode), 93 | task, 94 | language, 95 | initial_prompt, 96 | vad_filter, 97 | word_timestamps, 98 | {"diarize": diarize, "min_speakers": min_speakers, "max_speakers": max_speakers}, 99 | output, 100 | ) 101 | return StreamingResponse( 102 | result, 103 | media_type="text/plain", 104 | headers={ 105 | "Asr-Engine": CONFIG.ASR_ENGINE, 106 | "Content-Disposition": f'attachment; filename="{quote(audio_file.filename)}.{output}"', 107 | }, 108 | ) 109 | 110 | 111 | @app.post("/detect-language", tags=["Endpoints"]) 112 | async def detect_language( 113 | audio_file: UploadFile = File(...), # noqa: B008 114 | encode: bool = Query(default=True, description="Encode audio first through FFmpeg"), 115 | ): 116 | detected_lang_code, confidence = asr_model.language_detection(load_audio(audio_file.file, encode)) 117 | return { 118 | "detected_language": tokenizer.LANGUAGES[detected_lang_code], 119 | "language_code": detected_lang_code, 120 | "confidence": confidence, 121 | } 122 | 123 | 124 | @click.command() 125 | @click.option( 126 | "-h", 127 | "--host", 128 | metavar="HOST", 129 | default="0.0.0.0", 130 | help="Host for the webservice (default: 0.0.0.0)", 131 | ) 132 | @click.option( 133 | "-p", 134 | "--port", 135 | metavar="PORT", 136 | default=9000, 137 | help="Port for the webservice (default: 9000)", 138 | ) 139 | @click.version_option(version=projectMetadata["Version"]) 140 | def start(host: str, port: Optional[int] = None): 141 | uvicorn.run(app, host=host, port=port) 142 | 143 | 144 | if __name__ == "__main__": 145 | start() 146 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Unreleased 5 | ---------- 6 | 7 | [1.8.2] 
(2025-02-18)
8 | --------------------
9 |
10 | ### Changed
11 |
12 | - Reduced GPU image size by using `nvidia/cuda:12.6.3-base-ubuntu22.04`
13 |
14 | [1.8.1] (2025-02-18)
15 | --------------------
16 |
17 | ### Fixed
18 |
19 | - Fixed issues with Torch CUDA and cuDNN
20 | - Updated Torch and Torchaudio dependencies for multi-architecture support
21 |
22 | [1.8.0] (2025-02-17)
23 | --------------------
24 |
25 | ### Added
26 |
27 | - Added support for [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1)
28 |
29 | ### Changed
30 |
31 | - Upgraded CUDA GPU image to v12.6.3
32 | - Upgraded dependencies
33 |   - torch to v2.6.0
34 |   - fastapi to v0.115.8
35 |   - llvmlite to v0.44.0
36 |   - numba to v0.61.0
37 |   - ruff to v0.9.6
38 |   - black to v25.1.0
39 |   - mkdocs-material to v9.6.4
40 |   - pymdown-extensions to v10.14.3
41 |
42 | [1.7.1] (2024-12-18)
43 | --------------------
44 |
45 | ### Fixed
46 |
47 | - Fix JSON serialization of segments due to Faster Whisper v1.1.0 changes
48 |
49 | [1.7.0] (2024-12-17)
50 | --------------------
51 |
52 | ### Added
53 |
54 | - Timeout configured to allow model to be unloaded when idle
55 | - Added detection confidence to language detection endpoint
56 | - Set mel generation to adjust n_dims automatically to match the loaded model
57 | - Refactor classes, add comments, implement abstract methods, and add factory method for engine selection
58 |
59 | ### Changed
60 |
61 | - Upgraded
62 |   - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0)
63 |   - uvicorn to v0.34.0
64 |   - tqdm to v4.67.1
65 |   - python-multipart to v0.0.20
66 |   - fastapi to v0.115.6
67 |   - pytest to v8.3.4
68 |   - ruff to v0.8.3
69 |   - black to v24.10.0
70 |   - mkdocs to v1.6.1
71 |   - mkdocs-material to v9.5.49
72 |   - pymdown-extensions to v10.12
73 |
74 | [1.6.0] (2024-10-06)
75 | --------------------
76 |
77 | ### Changed
78 |
79 | - Upgraded
80 |   - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
81 |   - fastapi to v0.115.0
82 |   - uvicorn to v0.31.0
83 |   - tqdm to v4.66.5
84 |   - python-multipart to v0.0.12
85 |
86 | [1.5.0] (2024-07-04)
87 | --------------------
88 |
89 | ### Changed
90 |
91 | - Upgraded
92 |   - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.3)
93 |   - fastapi to v0.111.0
94 |   - uvicorn to v0.30.1
95 |   - gunicorn to v22.0.0
96 |   - tqdm to v4.66.4
97 |   - llvmlite to v0.43.0
98 |   - numba to v0.60.0
99 |
100 | [1.4.1] (2024-04-17)
101 | --------------------
102 |
103 | ### Changed
104 |
105 | - Upgraded torch to v1.13.1
106 |
107 | [1.4.0] (2024-04-17)
108 | --------------------
109 |
110 | ### Changed
111 |
112 | - Upgraded
113 |   - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.1)
114 |   - fastapi to v0.110.1
115 |   - uvicorn to v0.29.0
116 |   - gunicorn to v21.2.0
117 |   - tqdm to v4.66.2
118 |   - python-multipart to v0.0.9
119 |   - llvmlite to v0.42.0
120 |   - numba to v0.59.1
121 |
122 | [1.3.0] (2024-02-15)
123 | --------------------
124 |
125 | ### Added
126 |
127 | - Compiled and added FFmpeg without LGPL libraries for license compliance
128 |
129 | [1.2.4] (2023-11-27)
130 | --------------------
131 |
132 | ### Changed
133 |
134 | - Upgraded
135 |   - 
[openai/whisper](https://github.com/openai/whisper) to [v20231117](https://github.com/openai/whisper/releases/tag/v20231117) 136 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v0.10.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v0.10.0) 137 | 138 | [1.2.3] (2023-11-07) 139 | -------------------- 140 | 141 | ### Changed 142 | 143 | - Upgraded 144 | - [openai/whisper](https://github.com/openai/whisper) to [v20231106](https://github.com/openai/whisper/releases/tag/v20231106) 145 | 146 | [1.2.2] (2023-11-03) 147 | -------------------- 148 | 149 | ### Fixed 150 | 151 | - Fixed `swagger-ui` rendering issues by upgrading to `v5.9.1`, fixes #153 and #154 152 | 153 | [1.2.1] (2023-11-03) 154 | -------------------- 155 | 156 | ### Enabled 157 | 158 | - Enabled `vad_filter` for `faster-whisper` engine 159 | 160 | ### Changed 161 | 162 | - Changed misspelling in "Word level timestamps" 163 | - Removed unused unidecode dependency 164 | - Upgraded 165 | - uvicorn to v0.23.2 166 | - gunicorn to v21.0.1 167 | - tqdm to v4.66.1 168 | - python-multipart to v0.0.6 169 | - fastapi to v0.104.1 170 | - llvmlite to v0.41.1 171 | - numba to v0.58.0 172 | 173 | [1.2.0] (2023-10-01) 174 | -------------------- 175 | 176 | ### Changed 177 | 178 | - Upgraded 179 | - [openai/whisper](https://github.com/openai/whisper) to [v20230918](https://github.com/openai/whisper/releases/tag/v20230918) 180 | - [guillaumekln/faster-whisper](https://github.com/guillaumekln/faster-whisper) to [v0.9.0](https://github.com/guillaumekln/faster-whisper/releases/tag/v0.9.0) 181 | 182 | ### Updated 183 | 184 | - Updated model conversion method (for Faster Whisper) to use Hugging Face downloader 185 | - Updated default model paths to `~/.cache/whisper` or `/root/.cache/whisper`. 186 | - For customization, modify the `ASR_MODEL_PATH` environment variable. 187 | - Ensure Docker volume is set for the corresponding directory to use caching. 
188 | 189 | ```bash 190 | docker run -d -p 9000:9000 -e ASR_MODEL_PATH=/data/whisper -v $PWD/yourlocaldir:/data/whisper onerahmet/openai-whisper-asr-webservice:latest 191 | ``` 192 | 193 | - Removed the `triton` dependency from `poetry.lock` to ensure the stability of the pipeline for `ARM-based` Docker images 194 | 195 | [1.1.1] (2023-05-29) 196 | -------------------- 197 | 198 | ### Changed 199 | 200 | - 94 gpus that don't support float16 in #103 201 | - Update compute type in #108 202 | - Add word level functionality for Faster Whisper in #109 203 | 204 | [1.1.0] (2023-04-17) 205 | -------------------- 206 | 207 | ### Changed 208 | 209 | - Docs in #72 210 | - Fix language code typo in #77 211 | - Adds support for FasterWhisper in #81 212 | - Add an optional param to skip the encoding step in #82 213 | - Faster whisper in #92 214 | 215 | [1.0.6] (2023-02-05) 216 | -------------------- 217 | 218 | ### Changed 219 | 220 | - Update README.md in #58 221 | - 68 update the versions in #69 222 | - Fix gunicorn run command and remove deprecated poetry run script in #70 223 | - Move torch installation method into the pyproject.toml file in #71 224 | - Add prompt to ASR in #66 225 | 226 | [1.0.5] (2022-12-08) 227 | -------------------- 228 | 229 | ### Changed 230 | 231 | - 43 make swagger doc not depend on internet connection in #52 232 | - Add new large model v2 in #53 233 | 234 | [1.0.4] (2022-11-28) 235 | -------------------- 236 | 237 | ### Changed 238 | 239 | - 43 make swagger doc not depend on internet connection in #51 240 | - Anally retentively fixed markdown linting warnings in README. Sorry. in #48 241 | - Explicit macOS readme with explanation for no-GPU [closes #44] in #47 242 | 243 | [1.0.3-beta] (2022-11-17) 244 | ------------------------- 245 | 246 | ### Changed 247 | 248 | - Combine transcribe endpoints in #36 249 | - Add multi worker support with gunicorn in #37 250 | - Add multi platform (amd & arm) support in #39 251 | - Upgrade Cuda version to 11.7 in #40 252 | - Lock to the latest whisper version (eff383) in #41 253 | 254 | [1.0.2-beta] (2022-10-04) 255 | ------------------------- 256 | 257 | ### Changed 258 | 259 | - add mutex lock to the model in #19 260 | - Subtitles in #21 261 | - Add gpu support and create Docker image for cuda with GitHub flow in #22 262 | 263 | [1.0.1-beta] (2022-09-27) 264 | ------------------------- 265 | 266 | ### Changed 267 | 268 | - Init GitHub runners in #10 269 | - Lock Whisper dependency with b4308... 
revision number to prevent build crashes in #15 270 | 271 | [1.0.0-beta] (2022-09-25) 272 | ------------------------- 273 | 274 | ### Changed 275 | 276 | - Docker init in #1 277 | - Create LICENCE in #2 278 | - Fastapi init in #3 279 | - Avoid temp file in #4 280 | - Translate init in #5 281 | - mp3 support by using FFmpeg instead of librosa in #8 282 | - add language detection endpoint in #9 283 | 284 | [1.8.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.2 285 | [1.8.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.1 286 | [1.8.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.0 287 | [1.7.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.1 288 | [1.7.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.0 289 | [1.6.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.6.0 290 | [1.5.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.5.0 291 | [1.4.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.1 292 | [1.4.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.0 293 | [1.3.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.3.0 294 | [1.2.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.4 295 | [1.2.3]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.3 296 | [1.2.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.2 297 | [1.2.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.1 298 | [1.2.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.0 299 | [1.1.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.1 300 | [1.1.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.0 301 | [1.0.6]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.6 302 | [1.0.5]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.5 303 | [1.0.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.4 304 | [1.0.3-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.3-beta 305 | [1.0.2-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.2-beta 306 | [1.0.1-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.1-beta 307 | [1.0.0-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/1.0.0-beta 308 | --------------------------------------------------------------------------------
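
For quick manual testing of the `/asr` and `/detect-language` endpoints defined in `app/webservice.py`, a minimal client sketch follows. It assumes the service is reachable on `localhost:9000` (the port published in `docker-compose.yml`), that the `requests` package is installed on the client side, and that `sample.wav` stands in for any local audio file; none of these names are part of the repository itself.

```python
import requests

BASE_URL = "http://localhost:9000"  # assumed host/port, matching docker-compose.yml

# Transcribe a local file and request SRT subtitles.
with open("sample.wav", "rb") as f:  # "sample.wav" is a placeholder path
    resp = requests.post(
        f"{BASE_URL}/asr",
        params={"task": "transcribe", "output": "srt"},
        files={"audio_file": f},
    )
resp.raise_for_status()
print(resp.headers.get("Asr-Engine"))  # engine name reported by the service
print(resp.text)                       # subtitle/text payload streamed back

# Detect the spoken language of the same file.
with open("sample.wav", "rb") as f:
    detection = requests.post(f"{BASE_URL}/detect-language", files={"audio_file": f}).json()
print(detection["detected_language"], detection["language_code"], detection["confidence"])
```

The `output` parameter accepts `txt`, `vtt`, `srt`, `tsv`, or `json`, and the diarization-related parameters (`diarize`, `min_speakers`, `max_speakers`) only appear in the OpenAPI schema when the `whisperx` engine is configured (with `HF_TOKEN` set in the case of `diarize`).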