├── .dockerignore ├── docs ├── changelog.md ├── .overrides │ └── main.html ├── licence.md ├── assets │ ├── images │ │ └── swagger-ui.png │ └── css │ │ └── extra.css ├── index.md ├── build.md ├── run.md ├── environmental-variables.md └── endpoints.md ├── .github ├── FUNDING.yml └── workflows │ ├── documentation.yml │ └── docker-publish.yml ├── docker-compose.yml ├── docker-compose.gpu.yml ├── .gitignore ├── app ├── factory │ └── asr_model_factory.py ├── asr_models │ ├── asr_model.py │ ├── openai_whisper_engine.py │ ├── faster_whisper_engine.py │ └── mbain_whisperx_engine.py ├── config.py ├── utils.py └── webservice.py ├── Dockerfile ├── LICENCE ├── Dockerfile.gpu ├── pyproject.toml ├── mkdocs.yml ├── README.md └── CHANGELOG.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .venv 3 | venv -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | --8<-- "CHANGELOG.md" 2 | -------------------------------------------------------------------------------- /docs/.overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | -------------------------------------------------------------------------------- /docs/licence.md: -------------------------------------------------------------------------------- 1 | # Licence 2 | 3 | ``` 4 | --8<-- "LICENCE" 5 | ``` 6 | -------------------------------------------------------------------------------- /docs/assets/images/swagger-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/whisper-asr-webservice/main/docs/assets/images/swagger-ui.png -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [ahmetoner] 4 | custom: ['https://bmc.link/ahmetoner'] 5 | -------------------------------------------------------------------------------- /docs/assets/css/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #3d6178; 3 | --md-primary-fg-color--light: #3d6178; 4 | --md-primary-fg-color--dark: #3d6178; 5 | } 6 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | environment: 9 | - ASR_MODEL=base 10 | ports: 11 | - "9000:9000" 12 | volumes: 13 | - ./app:/app/app 14 | - cache-whisper:/root/.cache 15 | 16 | volumes: 17 | cache-whisper: 18 | -------------------------------------------------------------------------------- /docker-compose.gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | whisper-asr-webservice-gpu: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile.gpu 8 | deploy: 9 | resources: 10 | reservations: 11 | devices: 12 | - driver: nvidia 13 | count: 1 14 | capabilities: [gpu] 15 | environment: 16 | - ASR_MODEL=base 17 | ports: 18 | - "9000:9000" 19 | volumes: 20 | - ./app:/app/app 21 | - cache-whisper:/root/.cache 22 | 23 | volumes: 24 | cache-whisper: 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | # Packages 4 | *.egg 5 | !/tests/**/*.egg 6 | /*.egg-info 7 | /dist/* 8 | build 9 | _build 10 | .cache 11 | *.so 12 | venv 13 | 14 | # Installer logs 15 | pip-log.txt 16 | 17 | # Unit test / coverage reports 18 | .coverage 19 | .pytest_cache 20 | 21 | .DS_Store 22 | .idea/* 23 | .python-version 24 | .vscode/* 25 | 26 | /test.py 27 | /test_*.* 28 | 29 | /setup.cfg 30 | MANIFEST.in 31 | /setup.py 32 | /docs/site/* 33 | /tests/fixtures/simple_project/setup.py 34 | /tests/fixtures/project_with_extras/setup.py 35 | .mypy_cache 36 | 37 | .venv 38 | /releases/* 39 | pip-wheel-metadata 40 | /poetry.toml 41 | 42 | poetry/core/* 43 | 44 | public 45 | -------------------------------------------------------------------------------- /app/factory/asr_model_factory.py: -------------------------------------------------------------------------------- 1 | from app.asr_models.asr_model import ASRModel 2 | from app.asr_models.faster_whisper_engine import FasterWhisperASR 3 | from app.asr_models.mbain_whisperx_engine import WhisperXASR 4 | from app.asr_models.openai_whisper_engine import OpenAIWhisperASR 5 | from app.config import CONFIG 6 | 7 | 8 | class ASRModelFactory: 9 | @staticmethod 10 | def create_asr_model() -> ASRModel: 11 | if CONFIG.ASR_ENGINE == "openai_whisper": 12 | return OpenAIWhisperASR() 13 | elif CONFIG.ASR_ENGINE == "faster_whisper": 14 | return FasterWhisperASR() 15 | elif CONFIG.ASR_ENGINE == "whisperx": 16 | return WhisperXASR() 17 | else: 18 | raise ValueError(f"Unsupported ASR engine: {CONFIG.ASR_ENGINE}") 19 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - docs 8 | permissions: 9 | contents: write 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | if: github.event.repository.fork == false 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v4 17 | with: 18 | python-version: 3.x 19 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 20 | - uses: actions/cache@v3 21 | with: 22 | key: mkdocs-material-${{ env.cache_id }} 23 | path: .cache 24 | restore-keys: | 25 | mkdocs-material- 26 | - run: pip install mkdocs-material pymdown-extensions 27 | - run: mkdocs gh-deploy --force 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM python:3.10-bookworm 6 | 7 | ENV POETRY_VENV=/app/.venv 8 | 9 | RUN python3 -m venv $POETRY_VENV \ 10 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 11 | && $POETRY_VENV/bin/pip install poetry==2.1.1 12 | 13 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 14 | 15 | WORKDIR /app 16 | 17 | COPY . 
/app 18 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 19 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 20 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 21 | 22 | RUN poetry config virtualenvs.in-project true 23 | RUN poetry install 24 | 25 | EXPOSE 9000 26 | 27 | ENTRYPOINT ["whisper-asr-webservice"] 28 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ahmet Oner & Besim Alibegovic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitask model that can perform multilingual speech recognition as well as speech translation and language identification. 
2 | 3 | ## Features 4 | 5 | Current release (v1.8.2) supports following whisper models: 6 | 7 | - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930) 8 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0) 9 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1) 10 | 11 | ## Quick Usage 12 | 13 | === ":octicons-file-code-16: `CPU`" 14 | 15 | ```shell 16 | docker run -d -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest 17 | ``` 18 | 19 | === ":octicons-file-code-16: `GPU`" 20 | 21 | ```shell 22 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest-gpu 23 | ``` 24 | 25 | for more information: 26 | 27 | - [Documentation/Run](https://ahmetoner.github.io/whisper-asr-webservice/run) 28 | - [Docker Hub](https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice) 29 | 30 | ## Credits 31 | 32 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) 33 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM onerahmet/ffmpeg:n7.1 AS ffmpeg 2 | 3 | FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui 4 | 5 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 6 | 7 | ENV PYTHON_VERSION=3.10 8 | 9 | ENV POETRY_VENV=/app/.venv 10 | 11 | RUN export DEBIAN_FRONTEND=noninteractive \ 12 | && apt-get -qq update \ 13 | && apt-get -qq install --no-install-recommends \ 14 | python${PYTHON_VERSION} \ 15 | python${PYTHON_VERSION}-venv \ 16 | python3-pip \ 17 | libcudnn8 \ 18 | python3-pip \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \ 22 | ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \ 23 | ln -s -f /usr/bin/pip3 /usr/bin/pip 24 | 25 | RUN python3 -m venv $POETRY_VENV \ 26 | && $POETRY_VENV/bin/pip install -U pip setuptools \ 27 | && $POETRY_VENV/bin/pip install poetry==2.1.1 28 | 29 | ENV PATH="${PATH}:${POETRY_VENV}/bin" 30 | 31 | WORKDIR /app 32 | 33 | COPY poetry.lock pyproject.toml ./ 34 | 35 | RUN poetry config virtualenvs.in-project true 36 | RUN poetry install --no-root 37 | 38 | COPY . . 
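# The ffmpeg binary and Swagger UI assets below are copied in from the ffmpeg and swagger-ui build stages declared at the top of this file.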
39 | COPY --from=ffmpeg /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg 40 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui.css swagger-ui-assets/swagger-ui.css 41 | COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-assets/swagger-ui-bundle.js 42 | 43 | RUN poetry install 44 | RUN $POETRY_VENV/bin/pip install torch==2.6.0+cu126 torchaudio==2.6.0+cu126 --index-url https://download.pytorch.org/whl/cu126 45 | 46 | EXPOSE 9000 47 | 48 | CMD whisper-asr-webservice 49 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker Image 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | branches: 7 | - debug 8 | 9 | env: 10 | DOCKER_USER: ${{secrets.DOCKER_USER}} 11 | DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}} 12 | REPO_NAME: ${{secrets.REPO_NAME}} 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | include: 19 | - docker_file: Dockerfile 20 | platforms: linux/arm64,linux/amd64 21 | - docker_file: Dockerfile.gpu 22 | tag_extension: -gpu 23 | platforms: linux/amd64 24 | steps: 25 | - name: Checkout 26 | uses: actions/checkout@v3 27 | - name: Set up QEMU 28 | uses: docker/setup-qemu-action@v1 29 | - name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v1 31 | - name: Login to DockerHub 32 | uses: docker/login-action@v1 33 | with: 34 | username: ${{ secrets.DOCKER_USER }} 35 | password: ${{ secrets.DOCKER_PASSWORD }} 36 | - name: Build and Publish the Docker debug image 37 | if: github.ref == 'refs/heads/debug' 38 | run: | 39 | DOCKER_IMAGE_DEBUG=$DOCKER_USER/$REPO_NAME:debug${{ matrix.tag_extension }} 40 | docker buildx build . --no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_DEBUG}" -f ${{ matrix.docker_file }} --push 41 | - name: Build and Publish the Docker image 42 | if: github.ref != 'refs/heads/debug' 43 | run: | 44 | DOCKER_IMAGE_LATEST=$DOCKER_USER/$REPO_NAME:latest${{ matrix.tag_extension }} 45 | DOCKER_IMAGE_VERSION=$DOCKER_USER/$REPO_NAME:$GITHUB_REF_NAME${{ matrix.tag_extension }} 46 | docker buildx build . --no-cache --platform=${{ matrix.platforms }} -t "${DOCKER_IMAGE_LATEST}" -t "${DOCKER_IMAGE_VERSION}" -f ${{ matrix.docker_file }} --push 47 | -------------------------------------------------------------------------------- /docs/build.md: -------------------------------------------------------------------------------- 1 | ## Development Environment 2 | 3 | Install poetry with following command: 4 | 5 | ```shell 6 | pip3 install poetry 7 | ``` 8 | 9 | ### Installation 10 | 11 | Install packages: 12 | 13 | ```shell 14 | poetry install 15 | ``` 16 | 17 | !!! Note 18 | By default, this will install the CPU version of PyTorch. For GPU support, you'll need to install the appropriate CUDA version of PyTorch separately: 19 | ```shell 20 | # For CUDA support (example for CUDA 11.8): 21 | pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu121 22 | ``` 23 | 24 | ### Run 25 | 26 | Starting the Webservice: 27 | 28 | ```shell 29 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 30 | ``` 31 | 32 | ### Build 33 | 34 | === ":octicons-file-code-16: `Docker`" 35 | 36 | With `Dockerfile`: 37 | 38 | === ":octicons-file-code-16: `CPU`" 39 | 40 | ```shell 41 | # Build Image 42 | docker build -t whisper-asr-webservice . 
43 | 44 | # Run Container 45 | docker run -d -p 9000:9000 whisper-asr-webservice 46 | # or with specific model 47 | docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice 48 | ``` 49 | 50 | === ":octicons-file-code-16: `GPU`" 51 | 52 | ```shell 53 | # Build Image 54 | docker build -f Dockerfile.gpu -t whisper-asr-webservice-gpu . 55 | 56 | # Run Container 57 | docker run -d --gpus all -p 9000:9000 whisper-asr-webservice-gpu 58 | # or with specific model 59 | docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice-gpu 60 | ``` 61 | 62 | With `docker-compose`: 63 | 64 | === ":octicons-file-code-16: `CPU`" 65 | 66 | ```shell 67 | docker-compose up --build 68 | ``` 69 | 70 | === ":octicons-file-code-16: `GPU`" 71 | 72 | ```shell 73 | docker-compose -f docker-compose.gpu.yml up --build 74 | ``` 75 | === ":octicons-file-code-16: `Poetry`" 76 | 77 | Build .whl package 78 | 79 | ```shell 80 | poetry build 81 | ``` -------------------------------------------------------------------------------- /app/asr_models/asr_model.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | from abc import ABC, abstractmethod 4 | from threading import Lock 5 | from typing import Union 6 | 7 | import torch 8 | 9 | from app.config import CONFIG 10 | 11 | 12 | class ASRModel(ABC): 13 | """ 14 | Abstract base class for ASR (Automatic Speech Recognition) models. 15 | """ 16 | 17 | model = None 18 | model_lock = Lock() 19 | last_activity_time = time.time() 20 | 21 | def __init__(self): 22 | pass 23 | 24 | @abstractmethod 25 | def load_model(self): 26 | """ 27 | Loads the model from the specified path. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def transcribe( 33 | self, 34 | audio, 35 | task: Union[str, None], 36 | language: Union[str, None], 37 | initial_prompt: Union[str, None], 38 | vad_filter: Union[bool, None], 39 | word_timestamps: Union[bool, None], 40 | options: Union[dict, None], 41 | output, 42 | ): 43 | """ 44 | Perform transcription on the given audio file. 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def language_detection(self, audio): 50 | """ 51 | Perform language detection on the given audio file. 52 | """ 53 | pass 54 | 55 | def monitor_idleness(self): 56 | """ 57 | Monitors the idleness of the ASR model and releases the model if it has been idle for too long. 58 | """ 59 | if CONFIG.MODEL_IDLE_TIMEOUT <= 0: 60 | return 61 | while True: 62 | time.sleep(15) 63 | if time.time() - self.last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT: 64 | with self.model_lock: 65 | self.release_model() 66 | break 67 | 68 | def release_model(self): 69 | """ 70 | Unloads the model from memory and clears any cached GPU memory. 71 | """ 72 | del self.model 73 | torch.cuda.empty_cache() 74 | gc.collect() 75 | self.model = None 76 | print("Model unloaded due to timeout") 77 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | class CONFIG: 7 | """ 8 | Configuration class for ASR models. 9 | Reads environment variables for runtime configuration, with sensible defaults. 
10 | """ 11 | # Determine the ASR engine ('faster_whisper', 'openai_whisper' or 'whisperx') 12 | ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper") 13 | 14 | # Retrieve Huggingface Token 15 | HF_TOKEN = os.getenv("HF_TOKEN", "") 16 | if ASR_ENGINE == "whisperx" and HF_TOKEN == "": 17 | print("You must set the HF_TOKEN environment variable to download the diarization model used by WhisperX.") 18 | 19 | # Determine the computation device (GPU or CPU) 20 | DEVICE = os.getenv("ASR_DEVICE", "cuda" if torch.cuda.is_available() else "cpu") 21 | 22 | # Model name to use (e.g., "base", "small", etc.) 23 | MODEL_NAME = os.getenv("ASR_MODEL", "base") 24 | 25 | # Path to the model directory 26 | MODEL_PATH = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper")) 27 | 28 | # Model quantization level. Defines the precision for model weights: 29 | # 'float32' - 32-bit floating-point precision (higher precision, slower inference) 30 | # 'float16' - 16-bit floating-point precision (lower precision, faster inference) 31 | # 'int8' - 8-bit integer precision (lowest precision, fastest inference) 32 | # Defaults to 'float32' for GPU availability, 'int8' for CPU. 33 | MODEL_QUANTIZATION = os.getenv("ASR_QUANTIZATION", "float32" if torch.cuda.is_available() else "int8") 34 | if MODEL_QUANTIZATION not in {"float32", "float16", "int8"}: 35 | raise ValueError("Invalid MODEL_QUANTIZATION. Choose 'float32', 'float16', or 'int8'.") 36 | 37 | # Idle timeout in seconds. If set to a non-zero value, the model will be unloaded 38 | # after being idle for this many seconds. A value of 0 means the model will never be unloaded. 39 | MODEL_IDLE_TIMEOUT = int(os.getenv("MODEL_IDLE_TIMEOUT", 0)) 40 | 41 | # Default sample rate for audio input. 16 kHz is commonly used in speech-to-text tasks. 42 | SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", 16000)) 43 | 44 | # Subtitle output options for whisperx 45 | SUBTITLE_MAX_LINE_WIDTH = int(os.getenv("SUBTITLE_MAX_LINE_WIDTH", 1000)) 46 | SUBTITLE_MAX_LINE_COUNT = int(os.getenv("SUBTITLE_MAX_LINE_COUNT", 2)) 47 | SUBTITLE_HIGHLIGHT_WORDS = os.getenv("SUBTITLE_HIGHLIGHT_WORDS", "false").lower() == "true" 48 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "whisper-asr-webservice" 3 | version = "1.9.0-dev" 4 | description = "Whisper ASR Webservice is a general-purpose speech recognition webservice." 
5 | homepage = "https://github.com/ahmetoner/whisper-asr-webservice/" 6 | license = "https://github.com/ahmetoner/whisper-asr-webservice/blob/main/LICENCE" 7 | authors = ["Ahmet Öner", "Besim Alibegovic"] 8 | readme = "README.md" 9 | packages = [{ include = "app" }] 10 | 11 | [tool.poetry.scripts] 12 | whisper-asr-webservice = "app.webservice:start" 13 | 14 | [[tool.poetry.source]] 15 | name = "pytorch-cpu" 16 | url = "https://download.pytorch.org/whl/cpu" 17 | priority = "explicit" 18 | 19 | [tool.poetry.dependencies] 20 | python = "<3.13,>=3.10" 21 | fastapi = "^0.115.8" 22 | uvicorn = { extras = ["standard"], version = "^0.34.0" } 23 | python-multipart = "^0.0.20" 24 | ffmpeg-python = "^0.2.0" 25 | numpy = "<2.0.0" 26 | openai-whisper = "^20240930" 27 | faster-whisper = "^1.1.0" 28 | whisperx = "^3.3.1" 29 | tqdm = "^4.67.1" 30 | llvmlite = "^0.44.0" 31 | numba = "^0.61.0" 32 | torch = [ 33 | { version = "2.6.0", source = "pypi", markers = "sys_platform == 'darwin'"}, 34 | { version = "2.6.0", source = "pypi", markers = "platform_machine == 'aarch64' and sys_platform != 'darwin'"}, 35 | { version = "2.6.0", source = "pytorch-cpu", markers = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, 36 | ] 37 | torchaudio = [ 38 | { version = "2.6.0", source = "pypi", markers = "sys_platform == 'darwin'"}, 39 | { version = "2.6.0", source = "pypi", markers = "platform_machine == 'aarch64' and sys_platform != 'darwin'"}, 40 | { version = "2.6.0", source = "pytorch-cpu", markers = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, 41 | ] 42 | 43 | [tool.poetry.group.dev.dependencies] 44 | pytest = "^8.3.4" 45 | ruff = "^0.9.6" 46 | black = "^25.1.0" 47 | mkdocs-material = "^9.6.4" 48 | pymdown-extensions = "^10.14.3" 49 | 50 | [build-system] 51 | requires = ["poetry-core>=1.0.0"] 52 | build-backend = "poetry.core.masonry.api" 53 | 54 | [tool.black] 55 | skip-string-normalization = true 56 | line-length = 120 57 | 58 | [tool.ruff] 59 | line-length = 120 60 | 61 | [tool.ruff.lint] 62 | select = [ 63 | "E", # pycodestyle errors 64 | "W", # pycodestyle warnings 65 | "F", # pyflakes 66 | "I", # isort 67 | "C", # flake8-comprehensions 68 | "B", # flake8-bugbear 69 | ] 70 | ignore = [ 71 | "E501", # line too long, handled by black 72 | "C901", # too complex 73 | ] 74 | 75 | [tool.ruff.lint.isort] 76 | order-by-type = true 77 | relative-imports-order = "closest-to-furthest" 78 | extra-standard-library = ["typing"] 79 | section-order = [ 80 | "future", 81 | "standard-library", 82 | "third-party", 83 | "first-party", 84 | "local-folder", 85 | ] 86 | known-first-party = [] 87 | -------------------------------------------------------------------------------- /docs/run.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 3 | Whisper ASR Webservice is now available on Docker Hub. You can find the latest CPU and GPU images of this project there. 4 | 5 | Docker Hub: <https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice> 6 | 7 | === ":octicons-file-code-16: `CPU`" 8 | 9 | ```shell 10 | docker pull onerahmet/openai-whisper-asr-webservice:latest 11 | docker run -d -p 9000:9000 \ 12 | -e ASR_MODEL=base \ 13 | -e ASR_ENGINE=openai_whisper \ 14 | onerahmet/openai-whisper-asr-webservice:latest 15 | ``` 16 | 17 | === ":octicons-file-code-16: `CPU (macOS)`" 18 | 19 | > GPU passthrough does not work on macOS due to fundamental design limitations of Docker. Docker actually runs containers within a Linux VM on macOS. If you wish to run GPU-accelerated containers, I'm afraid Linux is your only option.
20 | > 21 | > The `:latest` image tag provides both amd64 and arm64 architectures: 22 | 23 | ```shell 24 | docker pull onerahmet/openai-whisper-asr-webservice:latest 25 | docker run -d -p 9000:9000 \ 26 | -e ASR_MODEL=base \ 27 | -e ASR_ENGINE=openai_whisper \ 28 | onerahmet/openai-whisper-asr-webservice:latest 29 | ``` 30 | 31 | === ":octicons-file-code-16: `GPU`" 32 | 33 | ```shell 34 | docker pull onerahmet/openai-whisper-asr-webservice:latest-gpu 35 | docker run -d --gpus all -p 9000:9000 \ 36 | -e ASR_MODEL=base \ 37 | -e ASR_ENGINE=openai_whisper \ 38 | onerahmet/openai-whisper-asr-webservice:latest-gpu 39 | ``` 40 | 41 | ### Environment Variables 42 | 43 | The following environment variables can be used to configure the service: 44 | 45 | - `ASR_MODEL`: Whisper model to use (tiny, base, small, medium, large) [default: base] 46 | - `ASR_ENGINE`: ASR engine to use (openai_whisper, faster_whisper) [default: openai_whisper] 47 | - `ASR_MODEL_PATH`: Custom path to store/load model files [optional] 48 | 49 | > Interactive Swagger API documentation is available at 50 | 51 | ![Swagger UI](assets/images/swagger-ui.png) 52 | 53 | ## Cache 54 | 55 | The ASR model is downloaded each time you start the container. Using the large model can take significant time to download. 56 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory to local storage. 57 | The model will then be loaded from the cache instead of being downloaded again on subsequent container starts. 58 | 59 | **Important: Using a persistent cache will prevent you from receiving model updates.** 60 | 61 | === ":octicons-file-code-16: `Default cache dir`" 62 | 63 | ```shell 64 | docker run -d -p 9000:9000 \ 65 | -v $PWD/cache:/root/.cache \ 66 | onerahmet/openai-whisper-asr-webservice:latest 67 | ``` 68 | 69 | === ":octicons-file-code-16: `With ASR_MODEL_PATH`" 70 | 71 | ```shell 72 | docker run -d -p 9000:9000 \ 73 | -e ASR_MODEL_PATH=/data/whisper \ 74 | -v $PWD/cache:/data/whisper \ 75 | onerahmet/openai-whisper-asr-webservice:latest 76 | ``` 77 | -------------------------------------------------------------------------------- /docs/environmental-variables.md: -------------------------------------------------------------------------------- 1 | ### Configuring the `Engine` 2 | 3 | === ":octicons-file-code-16: `openai_whisper`" 4 | 5 | ```shell 6 | export ASR_ENGINE=openai_whisper 7 | ``` 8 | 9 | === ":octicons-file-code-16: `faster_whisper`" 10 | 11 | ```shell 12 | export ASR_ENGINE=faster_whisper 13 | ``` 14 | 15 | === ":octicons-file-code-16: `whisperx`" 16 | 17 | ```shell 18 | export ASR_ENGINE=whisperx 19 | ``` 20 | 21 | ### Configuring the `Model` 22 | 23 | ```shell 24 | export ASR_MODEL=base 25 | ``` 26 | 27 | Available ASR_MODELs are: 28 | 29 | - Standard models: `tiny`, `base`, `small`, `medium`, `large-v1`, `large-v2`, `large-v3` (or `large`), `large-v3-turbo` (or `turbo`) 30 | - English-optimized models: `tiny.en`, `base.en`, `small.en`, `medium.en` 31 | - Distilled models: `distil-large-v2`, `distil-medium.en`, `distil-small.en`, `distil-large-v3` (only for whisperx and faster-whisper) 32 | 33 | For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en` 34 | models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models. 35 | 36 | The distilled models offer improved inference speed while maintaining good accuracy. 
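For example, pairing an engine with one of the models above is just a matter of setting both variables before starting the service (a sketch; as noted above, the distilled models require the `faster_whisper` or `whisperx` engine):

```shell
# Example: serve a distilled model with the faster_whisper engine
export ASR_ENGINE=faster_whisper
export ASR_MODEL=distil-large-v3
```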
37 | 38 | ### Configuring the `Model Path` 39 | 40 | ```shell 41 | export ASR_MODEL_PATH=/data/whisper 42 | ``` 43 | 44 | ### Configuring the `Model Unloading Timeout` 45 | 46 | ```shell 47 | export MODEL_IDLE_TIMEOUT=300 48 | ``` 49 | 50 | Defaults to `0`. After no activity for this period (in seconds), unload the model until it is requested again. Setting 51 | `0` disables the timeout, keeping the model loaded indefinitely. 52 | 53 | ### Configuring the `SAMPLE_RATE` 54 | 55 | ```shell 56 | export SAMPLE_RATE=16000 57 | ``` 58 | 59 | Defaults to `16000`. Default sample rate for audio input. `16 kHz` is commonly used in `speech-to-text` tasks. 60 | 61 | ### Configuring Device and Quantization 62 | 63 | ```shell 64 | export ASR_DEVICE=cuda # or 'cpu' 65 | export ASR_QUANTIZATION=float32 # or 'float16', 'int8' 66 | ``` 67 | 68 | The `ASR_DEVICE` defaults to `cuda` if GPU is available, otherwise `cpu`. 69 | 70 | The `ASR_QUANTIZATION` defines the precision for model weights: 71 | 72 | - `float32`: 32-bit floating-point precision (higher precision, slower inference) 73 | - `float16`: 16-bit floating-point precision (lower precision, faster inference) 74 | - `int8`: 8-bit integer precision (lowest precision, fastest inference) 75 | 76 | Defaults to `float32` for GPU, `int8` for CPU. 77 | 78 | ### Configuring Subtitle Options (WhisperX) 79 | 80 | ```shell 81 | export SUBTITLE_MAX_LINE_WIDTH=1000 82 | export SUBTITLE_MAX_LINE_COUNT=2 83 | export SUBTITLE_HIGHLIGHT_WORDS=false 84 | ``` 85 | 86 | These options only apply when using the WhisperX engine: 87 | 88 | - `SUBTITLE_MAX_LINE_WIDTH`: Maximum width of subtitle lines (default: 1000) 89 | - `SUBTITLE_MAX_LINE_COUNT`: Maximum number of lines per subtitle (default: 2) 90 | - `SUBTITLE_HIGHLIGHT_WORDS`: Enable word highlighting in subtitles (default: false) 91 | 92 | ### Hugging Face Token 93 | 94 | ```shell 95 | export HF_TOKEN=your_token_here 96 | ``` 97 | 98 | Required when using the WhisperX engine to download the diarization model. 
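As an example, the token can be passed to the container together with the engine selection (a sketch based on the Docker examples in the run guide; replace the placeholder with your own token):

```shell
docker run -d --gpus all -p 9000:9000 \
  -e ASR_ENGINE=whisperx \
  -e ASR_MODEL=base \
  -e HF_TOKEN=your_token_here \
  onerahmet/openai-whisper-asr-webservice:latest-gpu
```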
99 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Whisper ASR Webservice 2 | site_url: https://ahmetoner.github.io/whisper-asr-webservice 3 | site_dir: public 4 | 5 | site_description: "OpenAI Whisper ASR Webservice API" 6 | repo_url: "https://github.com/ahmetoner/whisper-asr-webservice" 7 | repo_name: "ahmetoner/whisper-asr-webservice" 8 | copyright: Copyright © 2025 9 | edit_uri: edit/main/docs/ 10 | 11 | validation: 12 | omitted_files: warn 13 | absolute_links: warn 14 | unrecognized_links: warn 15 | 16 | nav: 17 | - Overview: index.md 18 | - Installation & Usage: run.md 19 | - API Endpoints: endpoints.md 20 | - Configuration: environmental-variables.md 21 | - Development: build.md 22 | - Changelog: changelog.md 23 | - License: licence.md 24 | - Releases: https://github.com/ahmetoner/whisper-asr-webservice/releases 25 | - Docker Hub: https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice 26 | 27 | theme: 28 | name: material 29 | custom_dir: docs/.overrides 30 | icon: 31 | logo: material/subtitles 32 | features: 33 | - announce.dismiss 34 | - content.action.edit 35 | - content.action.view 36 | - content.code.annotate 37 | - content.code.copy 38 | - content.tooltips 39 | - navigation.footer 40 | - navigation.indexes 41 | # - navigation.sections # important 42 | - navigation.top 43 | # - navigation.tabs 44 | # - navigation.tabs.sticky 45 | - search.highlight 46 | - search.suggest 47 | - toc.follow 48 | - toc.integrate 49 | palette: 50 | # System preference 51 | - media: "(prefers-color-scheme)" 52 | toggle: 53 | icon: material/brightness-auto 54 | name: Switch to light mode 55 | # Light mode 56 | - media: "(prefers-color-scheme: light)" 57 | scheme: default 58 | primary: custom 59 | accent: teal 60 | toggle: 61 | icon: material/brightness-7 62 | name: Switch to dark mode 63 | # Dark mode 64 | - media: "(prefers-color-scheme: dark)" 65 | scheme: slate 66 | primary: black 67 | accent: lime 68 | toggle: 69 | icon: material/brightness-4 70 | name: Switch to system preference 71 | 72 | 73 | 74 | extra_css: 75 | - assets/css/extra.css 76 | markdown_extensions: 77 | - attr_list 78 | - admonition 79 | - footnotes 80 | - pymdownx.emoji: 81 | emoji_index: !!python/name:materialx.emoji.twemoji 82 | emoji_generator: !!python/name:materialx.emoji.to_svg 83 | - pymdownx.magiclink 84 | - pymdownx.snippets: 85 | check_paths: true 86 | dedent_subsections: true 87 | - pymdownx.superfences 88 | - pymdownx.tabbed: 89 | alternate_style: true 90 | slugify: !!python/object/apply:pymdownx.slugs.slugify 91 | kwds: 92 | case: lower 93 | - pymdownx.tasklist: 94 | custom_checkbox: true 95 | - toc: 96 | permalink: "¶" 97 | - pymdownx.superfences: 98 | custom_fences: 99 | - name: mermaid 100 | class: mermaid 101 | format: !!python/name:pymdownx.superfences.fence_code_format 102 | 103 | plugins: 104 | - search 105 | 106 | extra: 107 | generator: false 108 | social: 109 | - icon: fontawesome/brands/github 110 | link: https://github.com/ahmetoner 111 | - icon: fontawesome/brands/docker 112 | link: https://hub.docker.com/u/onerahmet 113 | -------------------------------------------------------------------------------- /app/asr_models/openai_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import 
torch 7 | import whisper 8 | from whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 9 | 10 | from app.asr_models.asr_model import ASRModel 11 | from app.config import CONFIG 12 | 13 | 14 | class OpenAIWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | if torch.cuda.is_available(): 19 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH).cuda() 20 | else: 21 | self.model = whisper.load_model(name=CONFIG.MODEL_NAME, download_root=CONFIG.MODEL_PATH) 22 | 23 | Thread(target=self.monitor_idleness, daemon=True).start() 24 | 25 | def transcribe( 26 | self, 27 | audio, 28 | task: Union[str, None], 29 | language: Union[str, None], 30 | initial_prompt: Union[str, None], 31 | vad_filter: Union[bool, None], 32 | word_timestamps: Union[bool, None], 33 | options: Union[dict, None], 34 | output, 35 | ): 36 | self.last_activity_time = time.time() 37 | 38 | with self.model_lock: 39 | if self.model is None: 40 | self.load_model() 41 | 42 | options_dict = {"task": task} 43 | if language: 44 | options_dict["language"] = language 45 | if initial_prompt: 46 | options_dict["initial_prompt"] = initial_prompt 47 | if word_timestamps: 48 | options_dict["word_timestamps"] = word_timestamps 49 | with self.model_lock: 50 | result = self.model.transcribe(audio, **options_dict) 51 | 52 | output_file = StringIO() 53 | self.write_result(result, output_file, output) 54 | output_file.seek(0) 55 | 56 | return output_file 57 | 58 | def language_detection(self, audio): 59 | 60 | self.last_activity_time = time.time() 61 | 62 | with self.model_lock: 63 | if self.model is None: 64 | self.load_model() 65 | 66 | # load audio and pad/trim it to fit 30 seconds 67 | audio = whisper.pad_or_trim(audio) 68 | 69 | # make log-Mel spectrogram and move to the same device as the model 70 | mel = whisper.log_mel_spectrogram(audio, self.model.dims.n_mels).to(self.model.device) 71 | 72 | # detect the spoken language 73 | with self.model_lock: 74 | _, probs = self.model.detect_language(mel) 75 | detected_lang_code = max(probs, key=probs.get) 76 | 77 | return detected_lang_code, probs[max(probs)] 78 | 79 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 80 | options = {"max_line_width": 1000, "max_line_count": 10, "highlight_words": False} 81 | if output == "srt": 82 | WriteSRT(ResultWriter).write_result(result, file=file, options=options) 83 | elif output == "vtt": 84 | WriteVTT(ResultWriter).write_result(result, file=file, options=options) 85 | elif output == "tsv": 86 | WriteTSV(ResultWriter).write_result(result, file=file, options=options) 87 | elif output == "json": 88 | WriteJSON(ResultWriter).write_result(result, file=file, options=options) 89 | else: 90 | WriteTXT(ResultWriter).write_result(result, file=file, options=options) 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Release](https://img.shields.io/github/v/release/ahmetoner/whisper-asr-webservice.svg) 2 | ![Docker Pulls](https://img.shields.io/docker/pulls/onerahmet/openai-whisper-asr-webservice.svg) 3 | ![Build](https://img.shields.io/github/actions/workflow/status/ahmetoner/whisper-asr-webservice/docker-publish.yml.svg) 4 | ![Licence](https://img.shields.io/github/license/ahmetoner/whisper-asr-webservice.svg) 5 | 6 | # Whisper ASR Box 7 | 8 | Whisper ASR Box is a general-purpose speech recognition toolkit. 
Whisper models are trained on a large dataset of diverse audio and are also multitask models that can perform multilingual speech recognition as well as speech translation and language identification. 9 | 10 | ## Features 11 | 12 | The current release (v1.8.2) supports the following Whisper models: 13 | 14 | - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930) 15 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0) 16 | - [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1) 17 | 18 | ## Quick Usage 19 | 20 | ### CPU 21 | 22 | ```shell 23 | docker run -d -p 9000:9000 \ 24 | -e ASR_MODEL=base \ 25 | -e ASR_ENGINE=openai_whisper \ 26 | onerahmet/openai-whisper-asr-webservice:latest 27 | ``` 28 | 29 | ### GPU 30 | 31 | ```shell 32 | docker run -d --gpus all -p 9000:9000 \ 33 | -e ASR_MODEL=base \ 34 | -e ASR_ENGINE=openai_whisper \ 35 | onerahmet/openai-whisper-asr-webservice:latest-gpu 36 | ``` 37 | 38 | #### Cache 39 | 40 | To reduce container startup time by avoiding repeated downloads, you can persist the cache directory: 41 | 42 | ```shell 43 | docker run -d -p 9000:9000 \ 44 | -v $PWD/cache:/root/.cache/ \ 45 | onerahmet/openai-whisper-asr-webservice:latest 46 | ``` 47 | 48 | ## Key Features 49 | 50 | - Support for multiple ASR engines (OpenAI Whisper, Faster Whisper, WhisperX) 51 | - Multiple output formats (text, JSON, VTT, SRT, TSV) 52 | - Word-level timestamp support 53 | - Voice activity detection (VAD) filtering 54 | - Speaker diarization (with WhisperX) 55 | - FFmpeg integration for broad audio/video format support 56 | - GPU acceleration support 57 | - Configurable model loading/unloading 58 | - REST API with Swagger documentation 59 | 60 | ## Environment Variables 61 | 62 | Key configuration options: 63 | 64 | - `ASR_ENGINE`: Engine selection (openai_whisper, faster_whisper, whisperx) 65 | - `ASR_MODEL`: Model selection (tiny, base, small, medium, large-v3, etc.) 66 | - `ASR_MODEL_PATH`: Custom path to store/load models 67 | - `ASR_DEVICE`: Device selection (cuda, cpu) 68 | - `MODEL_IDLE_TIMEOUT`: Timeout for model unloading 69 | 70 | ## Documentation 71 | 72 | For complete documentation, visit: 73 | [https://ahmetoner.github.io/whisper-asr-webservice](https://ahmetoner.github.io/whisper-asr-webservice) 74 | 75 | ## Development 76 | 77 | ```shell 78 | # Install poetry 79 | pip3 install poetry 80 | 81 | # Install dependencies 82 | poetry install 83 | 84 | # Run service 85 | poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000 86 | ``` 87 | 88 | After starting the service, visit `http://localhost:9000` or `http://0.0.0.0:9000` in your browser to access the Swagger UI documentation and try out the API endpoints.
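For a quick check from the command line, you can also send a file straight to the `/asr` endpoint (a sketch; replace the path with one of your own audio files):

```shell
curl -X POST -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/file" \
  "http://localhost:9000/asr?output=json&task=transcribe"
```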
89 | 90 | ## Credits 91 | 92 | - This software uses libraries from the [FFmpeg](http://ffmpeg.org) project under the [LGPLv2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) 93 | -------------------------------------------------------------------------------- /app/asr_models/faster_whisper_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisper 7 | from faster_whisper import WhisperModel 8 | 9 | from app.asr_models.asr_model import ASRModel 10 | from app.config import CONFIG 11 | from app.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 12 | 13 | 14 | class FasterWhisperASR(ASRModel): 15 | 16 | def load_model(self): 17 | 18 | self.model = WhisperModel( 19 | model_size_or_path=CONFIG.MODEL_NAME, 20 | device=CONFIG.DEVICE, 21 | compute_type=CONFIG.MODEL_QUANTIZATION, 22 | download_root=CONFIG.MODEL_PATH 23 | ) 24 | 25 | Thread(target=self.monitor_idleness, daemon=True).start() 26 | 27 | def transcribe( 28 | self, 29 | audio, 30 | task: Union[str, None], 31 | language: Union[str, None], 32 | initial_prompt: Union[str, None], 33 | vad_filter: Union[bool, None], 34 | word_timestamps: Union[bool, None], 35 | options: Union[dict, None], 36 | output, 37 | ): 38 | self.last_activity_time = time.time() 39 | 40 | with self.model_lock: 41 | if self.model is None: 42 | self.load_model() 43 | 44 | options_dict = {"task": task} 45 | if language: 46 | options_dict["language"] = language 47 | if initial_prompt: 48 | options_dict["initial_prompt"] = initial_prompt 49 | if vad_filter: 50 | options_dict["vad_filter"] = True 51 | if word_timestamps: 52 | options_dict["word_timestamps"] = True 53 | with self.model_lock: 54 | segments = [] 55 | text = "" 56 | segment_generator, info = self.model.transcribe(audio, beam_size=5, **options_dict) 57 | for segment in segment_generator: 58 | segments.append(segment) 59 | text = text + segment.text 60 | result = {"language": options_dict.get("language", info.language), "segments": segments, "text": text} 61 | 62 | output_file = StringIO() 63 | self.write_result(result, output_file, output) 64 | output_file.seek(0) 65 | 66 | return output_file 67 | 68 | def language_detection(self, audio): 69 | 70 | self.last_activity_time = time.time() 71 | 72 | with self.model_lock: 73 | if self.model is None: self.load_model() 74 | 75 | # load audio and pad/trim it to fit 30 seconds 76 | audio = whisper.pad_or_trim(audio) 77 | 78 | # detect the spoken language 79 | with self.model_lock: 80 | segments, info = self.model.transcribe(audio, beam_size=5) 81 | detected_lang_code = info.language 82 | detected_language_confidence = info.language_probability 83 | 84 | return detected_lang_code, detected_language_confidence 85 | 86 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 87 | if output == "srt": 88 | WriteSRT(ResultWriter).write_result(result, file=file) 89 | elif output == "vtt": 90 | WriteVTT(ResultWriter).write_result(result, file=file) 91 | elif output == "tsv": 92 | WriteTSV(ResultWriter).write_result(result, file=file) 93 | elif output == "json": 94 | WriteJSON(ResultWriter).write_result(result, file=file) 95 | else: 96 | WriteTXT(ResultWriter).write_result(result, file=file) 97 | -------------------------------------------------------------------------------- /docs/endpoints.md: 
-------------------------------------------------------------------------------- 1 | ## Quick start 2 | 3 | After running the docker image interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs) 4 | 5 | There are 2 endpoints available: 6 | 7 | - [/asr](##Automatic-Speech-recognition-service-/asr) (Automatic Speech Recognition) 8 | - [/detect-language](##Language-detection-service-/detect-language) 9 | 10 | ## Automatic speech recognition service /asr 11 | 12 | - 2 task choices: 13 | - **transcribe**: (default) task, transcribes the uploaded file. 14 | - **translate**: will provide an English transcript no matter which language was spoken. 15 | - Files are automatically converted with FFmpeg. 16 | - Full list of supported [audio](https://ffmpeg.org/general.html#Audio-Codecs) and [video](https://ffmpeg.org/general.html#Video-Codecs) formats. 17 | - You can enable word level timestamps output by `word_timestamps` parameter 18 | - You can Enable the voice activity detection (VAD) to filter out parts of the audio without speech by `vad_filter` parameter (only with `Faster Whisper` for now). 19 | 20 | ### Request URL Query Params 21 | 22 | | Name | Values | Description | 23 | |-----------------|------------------------------------------------|----------------------------------------------------------------| 24 | | audio_file | File | Audio or video file to transcribe | 25 | | output | `text` (default), `json`, `vtt`, `srt`, `tsv` | Output format | 26 | | task | `transcribe`, `translate` | Task type - transcribe in source language or translate to English | 27 | | language | `en` (default is auto recognition) | Source language code (see supported languages) | 28 | | word_timestamps | false (default) | Enable word-level timestamps (Faster Whisper only) | 29 | | vad_filter | false (default) | Enable voice activity detection filtering (Faster Whisper only) | 30 | | encode | true (default) | Encode audio through FFmpeg before processing | 31 | | diarize | false (default) | Enable speaker diarization (WhisperX only) | 32 | | min_speakers | null (default) | Minimum number of speakers for diarization (WhisperX only) | 33 | | max_speakers | null (default) | Maximum number of speakers for diarization (WhisperX only) | 34 | 35 | Example request with cURL 36 | 37 | ```bash 38 | curl -X POST -H "content-type: multipart/form-data" -F "audio_file=@/path/to/file" 0.0.0.0:9000/asr?output=json 39 | ``` 40 | 41 | ### Response (JSON) 42 | 43 | - **text**: Contains the full transcript 44 | - **segments**: Contains an entry per segment. Each entry provides `timestamps`, `transcript`, `token ids`, `word level timestamps` and other metadata 45 | - **language**: Detected or provided language (as a language code) 46 | 47 | ### Response Formats 48 | 49 | The API supports multiple output formats: 50 | 51 | - **text**: Plain text transcript (default) 52 | - **json**: Detailed JSON with segments, timestamps, and metadata 53 | - **vtt**: WebVTT subtitle format 54 | - **srt**: SubRip subtitle format 55 | - **tsv**: Tab-separated values with timestamps 56 | 57 | ### Supported Languages 58 | 59 | The service supports all languages supported by Whisper. Some common language codes: 60 | 61 | - Turkish (tr) 62 | - English (en) 63 | - Spanish (es) 64 | - French (fr) 65 | - German (de) 66 | - Italian (it) 67 | - Portuguese (pt) 68 | - And many more... 
69 | 70 | See the [Whisper documentation](https://github.com/openai/whisper#available-models-and-languages) for the full list of supported languages. 71 | 72 | ### Speaker Diarization 73 | 74 | When using the WhisperX engine with diarization enabled (`diarize=true`), the output will include speaker labels for each segment. This requires: 75 | 76 | 1. WhisperX engine to be configured 77 | 2. Valid Hugging Face token set in HF_TOKEN 78 | 3. Sufficient memory for diarization models 79 | 80 | You can optionally specify `min_speakers` and `max_speakers` if you know the expected number of speakers. 81 | 82 | ## Language detection service /detect-language 83 | 84 | Detects the language spoken in the uploaded file. Only processes first 30 seconds. 85 | 86 | Returns a json with following fields: 87 | 88 | - **detected_language**: Human readable language name (e.g. "english") 89 | - **language_code**: ISO language code (e.g. "en") 90 | - **confidence**: Confidence score between 0 and 1 indicating detection reliability 91 | 92 | Example response: 93 | 94 | ```json 95 | { 96 | "detected_language": "english", 97 | "language_code": "en", 98 | "confidence": 0.98 99 | } 100 | ``` 101 | -------------------------------------------------------------------------------- /app/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dataclasses import asdict 4 | from typing import BinaryIO, TextIO 5 | 6 | import ffmpeg 7 | import numpy as np 8 | from faster_whisper.utils import format_timestamp 9 | 10 | from app.config import CONFIG 11 | 12 | 13 | class ResultWriter: 14 | extension: str 15 | 16 | def __init__(self, output_dir: str): 17 | self.output_dir = output_dir 18 | 19 | def __call__(self, result: dict, audio_path: str): 20 | audio_basename = os.path.basename(audio_path) 21 | output_path = os.path.join(self.output_dir, audio_basename + "." 
+ self.extension) 22 | 23 | with open(output_path, "w", encoding="utf-8") as f: 24 | self.write_result(result, file=f) 25 | 26 | def write_result(self, result: dict, file: TextIO): 27 | raise NotImplementedError 28 | 29 | 30 | class WriteTXT(ResultWriter): 31 | extension: str = "txt" 32 | 33 | def write_result(self, result: dict, file: TextIO): 34 | for segment in result["segments"]: 35 | print(segment.text.strip(), file=file, flush=True) 36 | 37 | 38 | class WriteVTT(ResultWriter): 39 | extension: str = "vtt" 40 | 41 | def write_result(self, result: dict, file: TextIO): 42 | print("WEBVTT\n", file=file) 43 | for segment in result["segments"]: 44 | print( 45 | f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n" 46 | f"{segment.text.strip().replace('-->', '->')}\n", 47 | file=file, 48 | flush=True, 49 | ) 50 | 51 | 52 | class WriteSRT(ResultWriter): 53 | extension: str = "srt" 54 | 55 | def write_result(self, result: dict, file: TextIO): 56 | for i, segment in enumerate(result["segments"], start=1): 57 | # write srt lines 58 | print( 59 | f"{i}\n" 60 | f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> " 61 | f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n" 62 | f"{segment.text.strip().replace('-->', '->')}\n", 63 | file=file, 64 | flush=True, 65 | ) 66 | 67 | 68 | class WriteTSV(ResultWriter): 69 | """ 70 | Write a transcript to a file in TSV (tab-separated values) format containing lines like: 71 | \t\t 72 | 73 | Using integer milliseconds as start and end times means there's no chance of interference from 74 | an environment setting a language encoding that causes the decimal in a floating point number 75 | to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. 76 | """ 77 | 78 | extension: str = "tsv" 79 | 80 | def write_result(self, result: dict, file: TextIO): 81 | print("start", "end", "text", sep="\t", file=file) 82 | for segment in result["segments"]: 83 | print(round(1000 * segment.start), file=file, end="\t") 84 | print(round(1000 * segment.end), file=file, end="\t") 85 | print(segment.text.strip().replace("\t", " "), file=file, flush=True) 86 | 87 | 88 | class WriteJSON(ResultWriter): 89 | extension: str = "json" 90 | 91 | def write_result(self, result: dict, file: TextIO): 92 | if "segments" in result: 93 | result["segments"] = [asdict(segment) for segment in result["segments"]] 94 | json.dump(result, file) 95 | 96 | 97 | def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE): 98 | """ 99 | Open an audio file object and read as mono waveform, resampling as necessary. 100 | Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object 101 | Parameters 102 | ---------- 103 | file: BinaryIO 104 | The audio file like object 105 | encode: Boolean 106 | If true, encode audio stream to WAV before sending to whisper 107 | sr: int 108 | The sample rate to resample the audio if necessary 109 | Returns 110 | ------- 111 | A NumPy array containing the audio waveform, in float32 dtype. 112 | """ 113 | if encode: 114 | try: 115 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 116 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
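# ffmpeg reads the uploaded bytes from stdin ("pipe:") and writes 16-bit mono PCM at the target sample rate to stdout; the buffer is converted to float32 below.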
117 | out, _ = ( 118 | ffmpeg.input("pipe:", threads=0) 119 | .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) 120 | .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read()) 121 | ) 122 | except ffmpeg.Error as e: 123 | raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e 124 | else: 125 | out = file.read() 126 | 127 | return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 128 | -------------------------------------------------------------------------------- /app/asr_models/mbain_whisperx_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import StringIO 3 | from threading import Thread 4 | from typing import BinaryIO, Union 5 | 6 | import whisper 7 | import whisperx 8 | from whisperx.utils import ResultWriter, SubtitlesWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT 9 | 10 | from app.asr_models.asr_model import ASRModel 11 | from app.config import CONFIG 12 | 13 | 14 | class WhisperXASR(ASRModel): 15 | def __init__(self): 16 | super().__init__() 17 | self.model = { 18 | 'whisperx': None, 19 | 'diarize_model': None, 20 | 'align_model': {} 21 | } 22 | 23 | def load_model(self): 24 | asr_options = {"without_timestamps": False} 25 | self.model['whisperx'] = whisperx.load_model( 26 | CONFIG.MODEL_NAME, 27 | device=CONFIG.DEVICE, 28 | compute_type=CONFIG.MODEL_QUANTIZATION, 29 | asr_options=asr_options 30 | ) 31 | 32 | if CONFIG.HF_TOKEN != "": 33 | self.model['diarize_model'] = whisperx.DiarizationPipeline( 34 | use_auth_token=CONFIG.HF_TOKEN, 35 | device=CONFIG.DEVICE 36 | ) 37 | 38 | Thread(target=self.monitor_idleness, daemon=True).start() 39 | 40 | def transcribe( 41 | self, 42 | audio, 43 | task: Union[str, None], 44 | language: Union[str, None], 45 | initial_prompt: Union[str, None], 46 | vad_filter: Union[bool, None], 47 | word_timestamps: Union[bool, None], 48 | options: Union[dict, None], 49 | output, 50 | ): 51 | self.last_activity_time = time.time() 52 | with self.model_lock: 53 | if self.model is None: 54 | self.load_model() 55 | 56 | options_dict = {"task": task} 57 | if language: 58 | options_dict["language"] = language 59 | if initial_prompt: 60 | options_dict["initial_prompt"] = initial_prompt 61 | with self.model_lock: 62 | result = self.model['whisperx'].transcribe(audio, **options_dict) 63 | language = result["language"] 64 | 65 | # Load the required model and cache it 66 | # If we transcribe models in many different languages, this may lead to OOM propblems 67 | if result["language"] in self.model['align_model']: 68 | model_x, metadata = self.model['align_model'][result["language"]] 69 | else: 70 | self.model['align_model'][result["language"]] = whisperx.load_align_model( 71 | language_code=result["language"], device=CONFIG.DEVICE 72 | ) 73 | model_x, metadata = self.model['align_model'][result["language"]] 74 | 75 | # Align whisper output 76 | result = whisperx.align( 77 | result["segments"], model_x, metadata, audio, CONFIG.DEVICE, return_char_alignments=False 78 | ) 79 | 80 | if options.get("diarize", False) and CONFIG.HF_TOKEN != "": 81 | min_speakers = options.get("min_speakers", None) 82 | max_speakers = options.get("max_speakers", None) 83 | # add min/max number of speakers if known 84 | diarize_segments = self.model['diarize_model'](audio, min_speakers, max_speakers) 85 | result = whisperx.assign_word_speakers(diarize_segments, result) 86 | result["language"] = language 87 | 88 | output_file = StringIO() 89 | 
self.write_result(result, output_file, output) 90 | output_file.seek(0) 91 | 92 | return output_file 93 | 94 | def language_detection(self, audio): 95 | # load audio and pad/trim it to fit 30 seconds 96 | audio = whisper.pad_or_trim(audio) 97 | 98 | # make log-Mel spectrogram and move to the same device as the model 99 | mel = whisper.log_mel_spectrogram(audio).to(self.model.device) 100 | 101 | # detect the spoken language 102 | with self.model_lock: 103 | if self.model is None: 104 | self.load_model() 105 | _, probs = self.model.detect_language(mel) 106 | detected_lang_code = max(probs, key=probs.get) 107 | 108 | return detected_lang_code 109 | 110 | def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]): 111 | default_options = { 112 | "max_line_width": CONFIG.SUBTITLE_MAX_LINE_WIDTH, 113 | "max_line_count": CONFIG.SUBTITLE_MAX_LINE_COUNT, 114 | "highlight_words": CONFIG.SUBTITLE_HIGHLIGHT_WORDS 115 | } 116 | 117 | if output == "srt": 118 | WriteSRT(SubtitlesWriter).write_result(result, file=file, options=default_options) 119 | elif output == "vtt": 120 | WriteVTT(SubtitlesWriter).write_result(result, file=file, options=default_options) 121 | elif output == "tsv": 122 | WriteTSV(ResultWriter).write_result(result, file=file, options=default_options) 123 | elif output == "json": 124 | WriteJSON(ResultWriter).write_result(result, file=file, options=default_options) 125 | else: 126 | WriteTXT(ResultWriter).write_result(result, file=file, options=default_options) 127 | -------------------------------------------------------------------------------- /app/webservice.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import os 3 | from os import path 4 | from typing import Annotated, Optional, Union 5 | from urllib.parse import quote 6 | 7 | import click 8 | import uvicorn 9 | from fastapi import FastAPI, File, Query, UploadFile, applications 10 | from fastapi.openapi.docs import get_swagger_ui_html 11 | from fastapi.responses import RedirectResponse, StreamingResponse 12 | from fastapi.staticfiles import StaticFiles 13 | from whisper import tokenizer 14 | 15 | from app.config import CONFIG 16 | from app.factory.asr_model_factory import ASRModelFactory 17 | from app.utils import load_audio 18 | 19 | asr_model = ASRModelFactory.create_asr_model() 20 | asr_model.load_model() 21 | 22 | LANGUAGE_CODES = sorted(tokenizer.LANGUAGES.keys()) 23 | 24 | projectMetadata = importlib.metadata.metadata("whisper-asr-webservice") 25 | app = FastAPI( 26 | title=projectMetadata["Name"].title().replace("-", " "), 27 | description=projectMetadata["Summary"], 28 | version=projectMetadata["Version"], 29 | contact={"url": projectMetadata["Home-page"]}, 30 | swagger_ui_parameters={"defaultModelsExpandDepth": -1}, 31 | license_info={"name": "MIT License", "url": projectMetadata["License"]}, 32 | ) 33 | 34 | assets_path = os.getcwd() + "/swagger-ui-assets" 35 | if path.exists(assets_path + "/swagger-ui.css") and path.exists(assets_path + "/swagger-ui-bundle.js"): 36 | app.mount("/assets", StaticFiles(directory=assets_path), name="static") 37 | 38 | def swagger_monkey_patch(*args, **kwargs): 39 | return get_swagger_ui_html( 40 | *args, 41 | **kwargs, 42 | swagger_favicon_url="", 43 | swagger_css_url="/assets/swagger-ui.css", 44 | swagger_js_url="/assets/swagger-ui-bundle.js", 45 | ) 46 | 47 | applications.get_swagger_ui_html = swagger_monkey_patch 48 | 49 | 50 | @app.get("/", response_class=RedirectResponse, 
include_in_schema=False) 51 | async def index(): 52 | return "/docs" 53 | 54 | 55 | @app.post("/asr", tags=["Endpoints"]) 56 | async def asr( 57 | audio_file: UploadFile = File(...), # noqa: B008 58 | encode: bool = Query(default=True, description="Encode audio first through ffmpeg"), 59 | task: Union[str, None] = Query(default="transcribe", enum=["transcribe", "translate"]), 60 | language: Union[str, None] = Query(default=None, enum=LANGUAGE_CODES), 61 | initial_prompt: Union[str, None] = Query(default=None), 62 | vad_filter: Annotated[ 63 | bool | None, 64 | Query( 65 | description="Enable the voice activity detection (VAD) to filter out parts of the audio without speech", 66 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 67 | ), 68 | ] = False, 69 | word_timestamps: bool = Query( 70 | default=False, 71 | description="Word level timestamps", 72 | include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False), 73 | ), 74 | diarize: bool = Query( 75 | default=False, 76 | description="Diarize the input", 77 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" and CONFIG.HF_TOKEN != "" else False), 78 | ), 79 | min_speakers: Union[int, None] = Query( 80 | default=None, 81 | description="Min speakers in this file", 82 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 83 | ), 84 | max_speakers: Union[int, None] = Query( 85 | default=None, 86 | description="Max speakers in this file", 87 | include_in_schema=(True if CONFIG.ASR_ENGINE == "whisperx" else False), 88 | ), 89 | output: Union[str, None] = Query(default="txt", enum=["txt", "vtt", "srt", "tsv", "json"]), 90 | ): 91 | result = asr_model.transcribe( 92 | load_audio(audio_file.file, encode), 93 | task, 94 | language, 95 | initial_prompt, 96 | vad_filter, 97 | word_timestamps, 98 | {"diarize": diarize, "min_speakers": min_speakers, "max_speakers": max_speakers}, 99 | output, 100 | ) 101 | return StreamingResponse( 102 | result, 103 | media_type="text/plain", 104 | headers={ 105 | "Asr-Engine": CONFIG.ASR_ENGINE, 106 | "Content-Disposition": f'attachment; filename="{quote(audio_file.filename)}.{output}"', 107 | }, 108 | ) 109 | 110 | 111 | @app.post("/detect-language", tags=["Endpoints"]) 112 | async def detect_language( 113 | audio_file: UploadFile = File(...), # noqa: B008 114 | encode: bool = Query(default=True, description="Encode audio first through FFmpeg"), 115 | ): 116 | detected_lang_code, confidence = asr_model.language_detection(load_audio(audio_file.file, encode)) 117 | return { 118 | "detected_language": tokenizer.LANGUAGES[detected_lang_code], 119 | "language_code": detected_lang_code, 120 | "confidence": confidence, 121 | } 122 | 123 | 124 | @click.command() 125 | @click.option( 126 | "-h", 127 | "--host", 128 | metavar="HOST", 129 | default="0.0.0.0", 130 | help="Host for the webservice (default: 0.0.0.0)", 131 | ) 132 | @click.option( 133 | "-p", 134 | "--port", 135 | metavar="PORT", 136 | default=9000, 137 | help="Port for the webservice (default: 9000)", 138 | ) 139 | @click.version_option(version=projectMetadata["Version"]) 140 | def start(host: str, port: Optional[int] = None): 141 | uvicorn.run(app, host=host, port=port) 142 | 143 | 144 | if __name__ == "__main__": 145 | start() 146 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Unreleased 5 | ---------- 6 | 7 | [1.8.2] 
(2025-02-18)
8 | --------------------
9 |
10 | ### Changed
11 |
12 | - Reduced GPU image size by using `nvidia/cuda:12.6.3-base-ubuntu22.04`
13 |
14 | [1.8.1] (2025-02-18)
15 | --------------------
16 |
17 | ### Fixed
18 |
19 | - Fixed issues with Torch CUDA and cuDNN
20 | - Updated Torch and Torchaudio dependencies for multi-architecture support
21 |
22 | [1.8.0] (2025-02-17)
23 | --------------------
24 |
25 | ### Added
26 |
27 | - Added support for [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1)
28 |
29 | ### Changed
30 |
31 | - Upgraded CUDA GPU image to v12.6.3
32 | - Upgraded dependencies
33 |   - torch to v2.6.0
34 |   - fastapi to v0.115.8
35 |   - llvmlite to v0.44.0
36 |   - numba to v0.61.0
37 |   - ruff to v0.9.6
38 |   - black to v25.1.0
39 |   - mkdocs-material to v9.6.4
40 |   - pymdown-extensions to v10.14.3
41 |
42 | [1.7.1] (2024-12-18)
43 | --------------------
44 |
45 | ### Fixed
46 |
47 | - Fix JSON serialization of segments due to Faster Whisper v1.1.0 changes
48 |
49 | [1.7.0] (2024-12-17)
50 | --------------------
51 |
52 | ### Added
53 |
54 | - Timeout configured to allow model to be unloaded when idle
55 | - Added detection confidence to language detection endpoint
56 | - Set mel generation to adjust n_dims automatically to match the loaded model
57 | - Refactor classes, add comments, implement abstract methods, and add factory method for engine selection
58 |
59 | ### Changed
60 |
61 | - Upgraded
62 |   - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0)
63 |   - uvicorn to v0.34.0
64 |   - tqdm to v4.67.1
65 |   - python-multipart to v0.0.20
66 |   - fastapi to v0.115.6
67 |   - pytest to v8.3.4
68 |   - ruff to v0.8.3
69 |   - black to v24.10.0
70 |   - mkdocs to v1.6.1
71 |   - mkdocs-material to v9.5.49
72 |   - pymdown-extensions to v10.12
73 |
74 | [1.6.0] (2024-10-06)
75 | --------------------
76 |
77 | ### Changed
78 |
79 | - Upgraded
80 |   - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
81 |   - fastapi to v0.115.0
82 |   - uvicorn to v0.31.0
83 |   - tqdm to v4.66.5
84 |   - python-multipart to v0.0.12
85 |
86 | [1.5.0] (2024-07-04)
87 | --------------------
88 |
89 | ### Changed
90 |
91 | - Upgraded
92 |   - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.3)
93 |   - fastapi to v0.111.0
94 |   - uvicorn to v0.30.1
95 |   - gunicorn to v22.0.0
96 |   - tqdm to v4.66.4
97 |   - llvmlite to v0.43.0
98 |   - numba to v0.60.0
99 |
100 | [1.4.1] (2024-04-17)
101 | --------------------
102 |
103 | ### Changed
104 |
105 | - Upgraded torch to v1.13.1
106 |
107 | [1.4.0] (2024-04-17)
108 | --------------------
109 |
110 | ### Changed
111 |
112 | - Upgraded
113 |   - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.0.1](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.0.1)
114 |   - fastapi to v0.110.1
115 |   - uvicorn to v0.29.0
116 |   - gunicorn to v21.2.0
117 |   - tqdm to v4.66.2
118 |   - python-multipart to v0.0.9
119 |   - llvmlite to v0.42.0
120 |   - numba to v0.59.1
121 |
122 | [1.3.0] (2024-02-15)
123 | --------------------
124 |
125 | ### Added
126 |
127 | - Compiled and added FFmpeg without LGPL libraries for license compliance
128 |
129 | [1.2.4] (2023-11-27)
130 | --------------------
131 |
132 | ### Changed
133 |
134 | - Upgraded
135 |   - 
[openai/whisper](https://github.com/openai/whisper) to [v20231117](https://github.com/openai/whisper/releases/tag/v20231117) 136 | - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v0.10.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v0.10.0) 137 | 138 | [1.2.3] (2023-11-07) 139 | -------------------- 140 | 141 | ### Changed 142 | 143 | - Upgraded 144 | - [openai/whisper](https://github.com/openai/whisper) to [v20231106](https://github.com/openai/whisper/releases/tag/v20231106) 145 | 146 | [1.2.2] (2023-11-03) 147 | -------------------- 148 | 149 | ### Fixed 150 | 151 | - Fixed `swagger-ui` rendering issues by upgrading to `v5.9.1`, fixes #153 and #154 152 | 153 | [1.2.1] (2023-11-03) 154 | -------------------- 155 | 156 | ### Enabled 157 | 158 | - Enabled `vad_filter` for `faster-whisper` engine 159 | 160 | ### Changed 161 | 162 | - Changed misspelling in "Word level timestamps" 163 | - Removed unused unidecode dependency 164 | - Upgraded 165 | - uvicorn to v0.23.2 166 | - gunicorn to v21.0.1 167 | - tqdm to v4.66.1 168 | - python-multipart to v0.0.6 169 | - fastapi to v0.104.1 170 | - llvmlite to v0.41.1 171 | - numba to v0.58.0 172 | 173 | [1.2.0] (2023-10-01) 174 | -------------------- 175 | 176 | ### Changed 177 | 178 | - Upgraded 179 | - [openai/whisper](https://github.com/openai/whisper) to [v20230918](https://github.com/openai/whisper/releases/tag/v20230918) 180 | - [guillaumekln/faster-whisper](https://github.com/guillaumekln/faster-whisper) to [v0.9.0](https://github.com/guillaumekln/faster-whisper/releases/tag/v0.9.0) 181 | 182 | ### Updated 183 | 184 | - Updated model conversion method (for Faster Whisper) to use Hugging Face downloader 185 | - Updated default model paths to `~/.cache/whisper` or `/root/.cache/whisper`. 186 | - For customization, modify the `ASR_MODEL_PATH` environment variable. 187 | - Ensure Docker volume is set for the corresponding directory to use caching. 
188 | 189 | ```bash 190 | docker run -d -p 9000:9000 -e ASR_MODEL_PATH=/data/whisper -v $PWD/yourlocaldir:/data/whisper onerahmet/openai-whisper-asr-webservice:latest 191 | ``` 192 | 193 | - Removed the `triton` dependency from `poetry.lock` to ensure the stability of the pipeline for `ARM-based` Docker images 194 | 195 | [1.1.1] (2023-05-29) 196 | -------------------- 197 | 198 | ### Changed 199 | 200 | - 94 gpus that don't support float16 in #103 201 | - Update compute type in #108 202 | - Add word level functionality for Faster Whisper in #109 203 | 204 | [1.1.0] (2023-04-17) 205 | -------------------- 206 | 207 | ### Changed 208 | 209 | - Docs in #72 210 | - Fix language code typo in #77 211 | - Adds support for FasterWhisper in #81 212 | - Add an optional param to skip the encoding step in #82 213 | - Faster whisper in #92 214 | 215 | [1.0.6] (2023-02-05) 216 | -------------------- 217 | 218 | ### Changed 219 | 220 | - Update README.md in #58 221 | - 68 update the versions in #69 222 | - Fix gunicorn run command and remove deprecated poetry run script in #70 223 | - Move torch installation method into the pyproject.toml file in #71 224 | - Add prompt to ASR in #66 225 | 226 | [1.0.5] (2022-12-08) 227 | -------------------- 228 | 229 | ### Changed 230 | 231 | - 43 make swagger doc not depend on internet connection in #52 232 | - Add new large model v2 in #53 233 | 234 | [1.0.4] (2022-11-28) 235 | -------------------- 236 | 237 | ### Changed 238 | 239 | - 43 make swagger doc not depend on internet connection in #51 240 | - Anally retentively fixed markdown linting warnings in README. Sorry. in #48 241 | - Explicit macOS readme with explanation for no-GPU [closes #44] in #47 242 | 243 | [1.0.3-beta] (2022-11-17) 244 | ------------------------- 245 | 246 | ### Changed 247 | 248 | - Combine transcribe endpoints in #36 249 | - Add multi worker support with gunicorn in #37 250 | - Add multi platform (amd & arm) support in #39 251 | - Upgrade Cuda version to 11.7 in #40 252 | - Lock to the latest whisper version (eff383) in #41 253 | 254 | [1.0.2-beta] (2022-10-04) 255 | ------------------------- 256 | 257 | ### Changed 258 | 259 | - add mutex lock to the model in #19 260 | - Subtitles in #21 261 | - Add gpu support and create Docker image for cuda with GitHub flow in #22 262 | 263 | [1.0.1-beta] (2022-09-27) 264 | ------------------------- 265 | 266 | ### Changed 267 | 268 | - Init GitHub runners in #10 269 | - Lock Whisper dependency with b4308... 
revision number to prevent build crashes in #15 270 | 271 | [1.0.0-beta] (2022-09-25) 272 | ------------------------- 273 | 274 | ### Changed 275 | 276 | - Docker init in #1 277 | - Create LICENCE in #2 278 | - Fastapi init in #3 279 | - Avoid temp file in #4 280 | - Translate init in #5 281 | - mp3 support by using FFmpeg instead of librosa in #8 282 | - add language detection endpoint in #9 283 | 284 | [1.8.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.2 285 | [1.8.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.1 286 | [1.8.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.8.0 287 | [1.7.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.1 288 | [1.7.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.7.0 289 | [1.6.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.6.0 290 | [1.5.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.5.0 291 | [1.4.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.1 292 | [1.4.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.0 293 | [1.3.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.3.0 294 | [1.2.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.4 295 | [1.2.3]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.3 296 | [1.2.2]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.2 297 | [1.2.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.1 298 | [1.2.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.2.0 299 | [1.1.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.1 300 | [1.1.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.1.0 301 | [1.0.6]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.6 302 | [1.0.5]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.5 303 | [1.0.4]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.4 304 | [1.0.3-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.3-beta 305 | [1.0.2-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.2-beta 306 | [1.0.1-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.0.1-beta 307 | [1.0.0-beta]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/1.0.0-beta 308 | --------------------------------------------------------------------------------
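
For quick manual testing of the `/asr` and `/detect-language` endpoints defined in `app/webservice.py`, a minimal client sketch follows. It assumes the service is reachable on `localhost:9000` (the port published in `docker-compose.yml`), that the `requests` package is installed on the client side, and that `sample.wav` stands in for any local audio file; none of these names are part of the repository itself.

```python
import requests

BASE_URL = "http://localhost:9000"  # assumed host/port, matching docker-compose.yml

# Transcribe a local file and request SRT subtitles.
with open("sample.wav", "rb") as f:  # "sample.wav" is a placeholder path
    resp = requests.post(
        f"{BASE_URL}/asr",
        params={"task": "transcribe", "output": "srt"},
        files={"audio_file": f},
    )
resp.raise_for_status()
print(resp.headers.get("Asr-Engine"))  # engine name reported by the service
print(resp.text)                       # subtitle/text payload streamed back

# Detect the spoken language of the same file.
with open("sample.wav", "rb") as f:
    detection = requests.post(f"{BASE_URL}/detect-language", files={"audio_file": f}).json()
print(detection["detected_language"], detection["language_code"], detection["confidence"])
```

The `output` parameter accepts `txt`, `vtt`, `srt`, `tsv`, or `json`, and the diarization-related parameters (`diarize`, `min_speakers`, `max_speakers`) only appear in the OpenAPI schema when the `whisperx` engine is configured (with `HF_TOKEN` set in the case of `diarize`).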