├── .editorconfig ├── .env.dev ├── .env.example ├── .env.test ├── .github ├── renovate.json └── workflows │ └── ci.yml ├── .gitignore ├── Makefile ├── README.md ├── alembic.ini ├── app ├── __init__.py ├── shared │ ├── __init__.py │ ├── celery.py │ ├── db │ │ ├── __init__.py │ │ ├── alembic │ │ │ ├── README │ │ │ ├── env.py │ │ │ ├── script.py.mako │ │ │ └── versions │ │ │ │ └── 0eee2b7913b7_add_tables.py │ │ ├── base.py │ │ └── models.py │ ├── logger.py │ └── settings.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_api.py │ └── test_auth.py ├── web │ ├── __init__.py │ ├── dtos.py │ ├── injections │ │ ├── __init__.py │ │ ├── db.py │ │ ├── security.py │ │ ├── settings.py │ │ └── task_queue.py │ ├── main.py │ └── task_queue.py └── worker │ ├── __init__.py │ ├── main.py │ └── strategies │ ├── base.py │ └── local.py ├── conf └── rabbitmq.conf ├── docker-compose.base.yml ├── docker-compose.dev.yml ├── docker-compose.prod.yml ├── mypy.ini ├── pyproject.toml ├── scripts └── download_models.py ├── web.Dockerfile ├── worker.Dockerfile └── worker.gpu.Dockerfile /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | charset = utf-8 9 | end_of_line = lf 10 | insert_final_newline = true 11 | trim_trailing_whitespace = true 12 | 13 | # 2 space indentation for every file 14 | [*] 15 | indent_style = space 16 | indent_size = 2 17 | 18 | # 4 space indentation for python 19 | [*.py] 20 | indent_size = 4 21 | 22 | # allow trailing whitespace in markdown files 23 | [*.md] 24 | trim_trailing_whitespace = false 25 | 26 | [Makefile] 27 | indent_style = tab 28 | -------------------------------------------------------------------------------- /.env.dev: -------------------------------------------------------------------------------- 1 | API_SECRET="a_very_secret_token" 2 | TRAEFIK_DOMAIN="whisperbox-transcribe.localhost" 3 | WHISPER_MODEL="tiny" 4 | ENVIRONMENT="development" 5 | DATABASE_URI="sqlite:///./whisperbox-transcribe.sqlite" 6 | 7 | RABBITMQ_DEFAULT_USER="rabbitmq" 8 | RABBITMQ_DEFAULT_PASS="rabbitmq_password" 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # this key is later used to authenticate against the API. 2 | API_SECRET="change_me" 3 | 4 | # see https://github.com/openai/whisper#available-models-and-languages 5 | WHISPER_MODEL="small" 6 | 7 | # If enabled, GET requests to routes `/job/:id` and `/job/:id/artifacts` will be unauthenticated. 8 | ENABLE_SHARING="false" 9 | 10 | # the domain you want to access the service from. Its A records need to point to the host IP. 11 | TRAEFIK_DOMAIN="whisperbox-transcribe.localhost" 12 | 13 | # an email which is used to verify domain ownership before a TLS certificate is issued. 14 | TRAEFIK_SSLEMAIL="" 15 | 16 | # --- 17 | # below settings match the default docker-compose configuration. 
18 | 19 | RABBITMQ_DEFAULT_USER="rabbitmq" 20 | RABBITMQ_DEFAULT_PASS="rabbitmq_password" 21 | 22 | DATABASE_URI="sqlite:////etc/whisperbox-transcribe/data/whisperbox-transcribe.sqlite" 23 | ENVIRONMENT="production" 24 | -------------------------------------------------------------------------------- /.env.test: -------------------------------------------------------------------------------- 1 | API_SECRET="test_secret" 2 | BROKER_URL="memory://" 3 | DATABASE_URI="sqlite://" 4 | ENVIRONMENT="test" 5 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["config:base", "schedule:monthly"], 4 | "timezone": "Europe/Berlin", 5 | "enabledManagers": ["dockerfile", "docker-compose", "pep621"] 6 | } 7 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: push 3 | 4 | jobs: 5 | lint: 6 | runs-on: ubuntu-latest 7 | name: Lint 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: actions/setup-python@v4 11 | with: 12 | python-version: '3.11' 13 | cache: 'pip' 14 | cache-dependency-path: '**/pyproject.toml' 15 | - run: pip install -e .[web,tooling] 16 | - run: make lint 17 | 18 | test: 19 | runs-on: ubuntu-latest 20 | name: Test 21 | steps: 22 | - uses: actions/checkout@v3 23 | - uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.11' 26 | cache: 'pip' 27 | cache-dependency-path: '**/pyproject.toml' 28 | - run: pip install -e .[web,tooling] 29 | - run: pytest 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | # VS Code 163 | .vscode 164 | .DS_Store 165 | 166 | whisperbox-transcribe.sqlite* 167 | *shm 168 | *wal 169 | 170 | # ruff 171 | .ruff_cache 172 | 173 | # other private files 174 | /data 175 | .env.prod 176 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | docker compose -f docker-compose.base.yml -f docker-compose.dev.yml down --volumes --remove-orphans 3 | 4 | dev: 5 | docker compose -f docker-compose.base.yml -f docker-compose.dev.yml build 6 | docker compose -f docker-compose.base.yml -f docker-compose.dev.yml up --remove-orphans 7 | 8 | fmt: 9 | black app 10 | ruff check app --fix 11 | 12 | lint: 13 | black --check app 14 | ruff check app 15 | mypy app 16 | 17 | test: 18 | pytest 19 | 20 | run: 21 | docker compose -f docker-compose.base.yml -f docker-compose.prod.yml build 22 | docker compose -f docker-compose.base.yml -f docker-compose.prod.yml up -d --remove-orphans 23 | 24 | stop: 25 | docker compose -f docker-compose.base.yml -f docker-compose.prod.yml down 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # whisperbox-transcribe 2 | 3 | > HTTP wrapper around [openai/whisper](https://github.com/openai/whisper). 4 | 5 | ## Overview 6 | 7 | This project wraps OpenAI's `whisper` speech-to-text models with an HTTP API. 8 | 9 | The API design draws inspiration from the [rev.ai async speech-to-text API](https://docs.rev.ai/api/asynchronous/get-started/). Transcription jobs are submitted by making an HTTP POST request to the service. Once the job is accepted, an ID is returned, which can later be used to retrieve the transcription results. These results are stored in an internal database until they are retrieved and can optionally be deleted afterwards. 10 | 11 | It is assumed that the service is used by exactly one consumer, so a pre-shared API key is used as the authentication method. OpenAPI documentation for the service is available at `/docs`. 12 | 13 | ## Deploy 14 | 15 |
16 | 0. Choose model & instance size 17 | Whisper offers a range of models in [different sizes](https://github.com/openai/whisper#available-models-and-languages). The model size affects factors such as accuracy, resource usage, and transcription speed. Smaller models are generally faster and consume fewer resources, but they may be less accurate, especially when working with non-English languages or translation tasks. 18 | 19 | Whisper supports inference on both CPU and GPU, and this project includes slightly modified Docker Compose configurations to enable both options. CPU inference is slower but usually more cost-effective for hosting purposes. CPU inference performance typically scales well with CPU speed. 20 | 21 | When selecting an instance for your application, it's important to consider the disk size. Media files need to be downloaded before they can be transcribed, so the disk must have sufficient free space to accommodate them. 22 | 23 | As a starting point, the "small" model can run on a 4GB Digital Ocean droplet, achieving a transcription speed of roughly 1-2x the original audio length. 24 |
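For reference, the chosen model is applied through the `WHISPER_MODEL` variable in `.env` (see `.env.example` and step 2 below), e.g.:

```
WHISPER_MODEL="small"
```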
25 | 26 | ### 1. Prepare host environment 27 | 28 | This project is intended to be run via [docker compose](https://docs.docker.com/compose/). In order to get started, [install](https://docs.docker.com/engine/install/) docker engine. Then, clone this repository to the machine. 29 | 30 | > **Note** 31 | > If you want to use a GPU, uncomment the sections tagged `` in `docker-compose.prod.yml`. 32 | 33 | ### 2. Configure service 34 | 35 | 2. Create an `.env` file from `.env.example` and configure it. Refer to comments for available envs and their usage. 36 | 37 | ### 3. Run service 38 | 39 | Run `make run` to start the server. To launch at system startup, wrap it in a systemd launch service. 40 | 41 | ## Develop 42 | 43 | [docker compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development. 44 | 45 | It is recommended to setup a virtual environment for python tooling. To install dependencies in your virtual env, run `pip install -e .[tooling,web,worker]`. 46 | 47 | Copy `.env.dev` to `.env` to configure the service. 48 | 49 | ### Start 50 | 51 | ``` 52 | make dev 53 | ``` 54 | 55 | Builds and starts the docker containers. 56 | 57 | ``` 58 | # Bindings 59 | http://localhost:5555 => Celery dashboard 60 | http://localhost:15672 => RabbitMQ dashboard 61 | http://whisperbox-transcribe.localhost => API 62 | http://whisperbox-transcribe.localhost/docs => API docs 63 | ./whisperbox-transcribe.sqlite => Database 64 | ``` 65 | 66 | #### Clean 67 | 68 | This removes all containers and attached volumes. 69 | 70 | ``` 71 | make clean 72 | ``` 73 | 74 | ### Test 75 | 76 | ``` 77 | make test 78 | ``` 79 | 80 | ### Lint 81 | 82 | ``` 83 | make lint 84 | ``` 85 | 86 | ### Format 87 | 88 | ``` 89 | make fmt 90 | ``` 91 | -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = app/shared/db/alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to src/alembic/versions. 
When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:src/alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # the output encoding used when revision files 55 | # are written from script.py.mako 56 | # output_encoding = utf-8 57 | 58 | # sqlalchemy.url = driver://user:pass@localhost/dbname 59 | 60 | [post_write_hooks] 61 | # post_write_hooks defines scripts or Python functions that are run 62 | # on newly generated revision scripts. See the documentation for further 63 | # detail and examples 64 | 65 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 66 | hooks = black 67 | black.type = console_scripts 68 | black.entrypoint = black 69 | black.options = -l 79 REVISION_SCRIPT_FILENAME 70 | 71 | # Logging configuration 72 | [loggers] 73 | keys = root,sqlalchemy,alembic 74 | 75 | [handlers] 76 | keys = console 77 | 78 | [formatters] 79 | keys = generic 80 | 81 | [logger_root] 82 | level = WARN 83 | handlers = console 84 | qualname = 85 | 86 | [logger_sqlalchemy] 87 | level = WARN 88 | handlers = 89 | qualname = sqlalchemy.engine 90 | 91 | [logger_alembic] 92 | level = INFO 93 | handlers = 94 | qualname = alembic 95 | 96 | [handler_console] 97 | class = StreamHandler 98 | args = (sys.stderr,) 99 | level = NOTSET 100 | formatter = generic 101 | 102 | [formatter_generic] 103 | format = %(levelname)-5.5s [%(name)s] %(message)s 104 | datefmt = %H:%M:%S 105 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/__init__.py -------------------------------------------------------------------------------- /app/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/shared/__init__.py -------------------------------------------------------------------------------- /app/shared/celery.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | 4 | def get_celery_binding(broker_url: str) -> Celery: 5 | return Celery( 6 | broker_url=broker_url, 7 | broker_connection_retry=False, 8 | broker_connection_retry_on_startup=False, 9 | ) 10 | -------------------------------------------------------------------------------- /app/shared/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/shared/db/__init__.py 
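The `get_celery_binding` helper above gives both the web API and the worker a Celery handle on the same broker. As a minimal sketch of how the web side dispatches work against such a binding purely by task name (this mirrors `app/web/task_queue.py` further below; the broker URL and job id here are placeholder values):

```python
from app.shared.celery import get_celery_binding

# Bind to the broker without importing any worker code or its heavy
# dependencies (torch, whisper) into this process.
celery = get_celery_binding("amqp://rabbitmq:rabbitmq_password@localhost:5672")

# Reference the worker task purely by its registered name ("signature"),
# then enqueue it with the job's database id as the only argument.
transcribe = celery.signature("app.worker.main.transcribe")
transcribe.delay("5c790c76-2cc1-4e91-a305-443df55a4a4c")
```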
-------------------------------------------------------------------------------- /app/shared/db/alembic/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /app/shared/db/alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from alembic import context 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from app.shared.db.models import Base 7 | from app.shared.settings import Settings 8 | 9 | settings = Settings() # type: ignore 10 | 11 | # this is the Alembic Config object, which provides 12 | # access to the values within the .ini file in use. 13 | config = context.config 14 | 15 | # Interpret the config file for Python logging. 16 | # This line sets up loggers basically. 17 | if config.config_file_name is not None: 18 | fileConfig(config.config_file_name) 19 | 20 | config.set_main_option("sqlalchemy.url", settings.DATABASE_URI) 21 | 22 | # add your model's MetaData object here 23 | # for 'autogenerate' support 24 | # from myapp import mymodel 25 | # target_metadata = mymodel.Base.metadata 26 | target_metadata = Base.metadata 27 | 28 | # other values from the config, defined by the needs of env.py, 29 | # can be acquired: 30 | # my_important_option = config.get_main_option("my_important_option") 31 | # ... etc. 32 | 33 | 34 | def run_migrations_offline() -> None: 35 | """Run migrations in 'offline' mode. 36 | 37 | This configures the context with just a URL 38 | and not an Engine, though an Engine is acceptable 39 | here as well. By skipping the Engine creation 40 | we don't even need a DBAPI to be available. 41 | 42 | Calls to context.execute() here emit the given string to the 43 | script output. 44 | 45 | """ 46 | url = config.get_main_option("sqlalchemy.url") 47 | context.configure( 48 | url=url, 49 | target_metadata=target_metadata, 50 | literal_binds=True, 51 | dialect_opts={"paramstyle": "named"}, 52 | ) 53 | 54 | with context.begin_transaction(): 55 | context.run_migrations() 56 | 57 | 58 | def run_migrations_online() -> None: 59 | """Run migrations in 'online' mode. 60 | 61 | In this scenario we need to create an Engine 62 | and associate a connection with the context. 63 | 64 | """ 65 | 66 | connectable = engine_from_config( 67 | config.get_section(config.config_ini_section), # type: ignore 68 | prefix="sqlalchemy.", 69 | poolclass=pool.NullPool, 70 | ) 71 | 72 | with connectable.connect() as connection: 73 | context.configure(connection=connection, target_metadata=target_metadata) 74 | 75 | with context.begin_transaction(): 76 | context.run_migrations() 77 | 78 | 79 | if context.is_offline_mode(): 80 | run_migrations_offline() 81 | else: 82 | run_migrations_online() 83 | -------------------------------------------------------------------------------- /app/shared/db/alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /app/shared/db/alembic/versions/0eee2b7913b7_add_tables.py: -------------------------------------------------------------------------------- 1 | """add_tables 2 | 3 | Revision ID: 0eee2b7913b7 4 | Revises: 5 | Create Date: 2023-06-29 08:33:26.123728 6 | 7 | """ 8 | import sqlalchemy as sa 9 | from alembic import op 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = "0eee2b7913b7" 13 | down_revision = None 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | # ### commands auto generated by Alembic - please adjust! ### 20 | op.create_table( 21 | "jobs", 22 | sa.Column("url", sa.String(length=2048), nullable=True), 23 | sa.Column( 24 | "status", 25 | sa.Enum("create", "processing", "error", "success", name="jobstatus"), 26 | nullable=False, 27 | ), 28 | sa.Column("config", sa.JSON(none_as_null=True), nullable=True), 29 | sa.Column("meta", sa.JSON(none_as_null=True), nullable=True), 30 | sa.Column( 31 | "type", 32 | sa.Enum( 33 | "transcript", 34 | "translation", 35 | "language_detection", 36 | name="jobtype", 37 | ), 38 | nullable=False, 39 | ), 40 | sa.Column( 41 | "created_at", 42 | sa.DateTime(), 43 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 44 | nullable=False, 45 | ), 46 | sa.Column("updated_at", sa.DateTime(), nullable=True), 47 | sa.Column("id", sa.VARCHAR(length=36), nullable=False), 48 | sa.PrimaryKeyConstraint("id"), 49 | ) 50 | op.create_index(op.f("ix_jobs_id"), "jobs", ["id"], unique=False) 51 | op.create_table( 52 | "artifacts", 53 | sa.Column("job_id", sa.VARCHAR(length=36), nullable=False), 54 | sa.Column("data", sa.JSON(none_as_null=True), nullable=True), 55 | sa.Column( 56 | "type", 57 | sa.Enum("raw_transcript", "language_detection", name="artifacttype"), 58 | nullable=False, 59 | ), 60 | sa.Column( 61 | "created_at", 62 | sa.DateTime(), 63 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 64 | nullable=False, 65 | ), 66 | sa.Column("updated_at", sa.DateTime(), nullable=True), 67 | sa.Column("id", sa.VARCHAR(length=36), nullable=False), 68 | sa.ForeignKeyConstraint(["job_id"], ["jobs.id"], ondelete="CASCADE"), 69 | sa.PrimaryKeyConstraint("id"), 70 | ) 71 | op.create_index(op.f("ix_artifacts_id"), "artifacts", ["id"], unique=False) 72 | # ### end Alembic commands ### 73 | 74 | 75 | def downgrade() -> None: 76 | # ### commands auto generated by Alembic - please adjust! 
### 77 | op.drop_index(op.f("ix_artifacts_id"), table_name="artifacts") 78 | op.drop_table("artifacts") 79 | op.drop_index(op.f("ix_jobs_id"), table_name="jobs") 80 | op.drop_table("jobs") 81 | # ### end Alembic commands ### 82 | -------------------------------------------------------------------------------- /app/shared/db/base.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from sqlalchemy import Engine, create_engine, event 4 | from sqlalchemy.orm import sessionmaker 5 | 6 | 7 | def make_engine(database_url: str): 8 | engine = create_engine(database_url, connect_args={"check_same_thread": False}) 9 | 10 | @event.listens_for(engine, "connect") 11 | def set_sqlite_pragma(conn: Any, _: Any) -> None: 12 | cursor = conn.cursor() 13 | cursor.execute("PRAGMA journal_mode=WAL") 14 | cursor.close() 15 | 16 | return engine 17 | 18 | 19 | def make_session_local(engine: Engine): 20 | session_local = sessionmaker(autocommit=False, autoflush=False, bind=engine) 21 | return session_local 22 | -------------------------------------------------------------------------------- /app/shared/db/models.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import uuid 3 | 4 | from pydantic import BaseModel, Field 5 | from sqlalchemy import JSON, VARCHAR, Column, DateTime, Enum, ForeignKey, String, func 6 | from sqlalchemy.dialects.postgresql import UUID 7 | from sqlalchemy.orm import Mapped, declarative_base, declarative_mixin, declared_attr 8 | 9 | Base = declarative_base() 10 | 11 | 12 | # Enums 13 | 14 | 15 | class JobType(str, enum.Enum): 16 | """Requested type of a job.""" 17 | 18 | transcript = "transcribe" 19 | translation = "translate" 20 | language_detection = "detect_language" 21 | 22 | 23 | class JobStatus(str, enum.Enum): 24 | """Processing status of a job.""" 25 | 26 | create = "create" 27 | processing = "processing" 28 | error = "error" 29 | success = "success" 30 | 31 | 32 | class ArtifactType(str, enum.Enum): 33 | raw_transcript = "transcript_raw" 34 | language_detection = "language_detection" 35 | 36 | 37 | # JSON field types 38 | 39 | 40 | class JobConfig(BaseModel): 41 | """(JSON) Configuration for a job.""" 42 | 43 | language: str | None = Field( 44 | default=None, 45 | description=( 46 | "Spoken language in the media file. " 47 | "While optional, this can improve output." 
48 | ), 49 | ) 50 | 51 | 52 | class JobMeta(BaseModel): 53 | """(JSON) Metadata relating to a job's execution.""" 54 | 55 | attempts: int | None = Field( 56 | default=None, 57 | description="Number of processing attempts a job has taken.", 58 | ) 59 | 60 | error: str | None = Field( 61 | default=None, 62 | description="Will contain a descriptive error message if processing failed.", 63 | ) 64 | 65 | task_id: uuid.UUID | None = Field( 66 | default=None, 67 | description="Internal celery id of this job submission.", 68 | ) 69 | 70 | 71 | class RawTranscript(BaseModel): 72 | """(JSON) A single transcript passage returned by whisper.""" 73 | 74 | id: int 75 | seek: int 76 | start: float 77 | end: float 78 | text: str 79 | tokens: list[int] 80 | temperature: float 81 | avg_logprob: float 82 | compression_ratio: float 83 | no_speech_prob: float 84 | 85 | 86 | class LanguageDetection(BaseModel): 87 | """A language detection""" 88 | 89 | language_code: str 90 | 91 | 92 | # Sum type for all possible artifact data values 93 | ArtifactData = list[RawTranscript] | LanguageDetection | None 94 | 95 | 96 | @declarative_mixin 97 | class WithStandardFields: 98 | """Mixin that adds standard fields (id, created_at, updated_at).""" 99 | 100 | @declared_attr 101 | def created_at(cls) -> Mapped[DateTime]: 102 | return Column(DateTime, server_default=func.now(), nullable=False) 103 | 104 | @declared_attr 105 | def updated_at(cls) -> Mapped[DateTime | None]: 106 | return Column(DateTime, onupdate=func.now()) 107 | 108 | @declared_attr 109 | def id(cls) -> Mapped[UUID]: 110 | return Column( 111 | VARCHAR(36), primary_key=True, index=True, default=lambda: str(uuid.uuid4()) 112 | ) 113 | 114 | 115 | class Job(Base, WithStandardFields): 116 | __tablename__ = "jobs" 117 | 118 | url = Column(String(length=2048)) 119 | status = Column(Enum(JobStatus), nullable=False) 120 | config = Column(JSON(none_as_null=True)) 121 | meta = Column(JSON(none_as_null=True)) 122 | type = Column(Enum(JobType), nullable=False) 123 | 124 | 125 | class Artifact(Base, WithStandardFields): 126 | __tablename__ = "artifacts" 127 | 128 | job_id = Column( 129 | VARCHAR(36), 130 | ForeignKey("jobs.id", ondelete="CASCADE"), 131 | nullable=False, 132 | ) 133 | 134 | data = Column(JSON(none_as_null=True)) 135 | type = Column(Enum(ArtifactType), nullable=False) 136 | -------------------------------------------------------------------------------- /app/shared/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig() 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | logger.setLevel(logging.INFO) 8 | -------------------------------------------------------------------------------- /app/shared/settings.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | 3 | 4 | class Settings(BaseSettings): 5 | API_SECRET: str 6 | BROKER_URL: str 7 | DATABASE_URI: str 8 | ENVIRONMENT: str 9 | 10 | TASK_SOFT_TIME_LIMIT: int = 3 * 60 * 60 11 | TASK_HARD_TIME_LIMIT: int = 4 * 60 * 60 12 | 13 | ENABLE_SHARING: bool = False 14 | -------------------------------------------------------------------------------- /app/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/tests/__init__.py 
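The `Settings` class above is a pydantic `BaseSettings` model, so configuration is read from the process environment; the test suite below instead points it at `.env.test` explicitly. A small sketch of both modes (the printed values are the ones defined in `.env.test`):

```python
from app.shared.settings import Settings

# Read configuration from the current process environment
# (API_SECRET, BROKER_URL, DATABASE_URI and ENVIRONMENT must be set).
settings = Settings()

# Or load a specific env file, as app/tests/conftest.py does.
test_settings = Settings(_env_file=".env.test")
print(test_settings.ENVIRONMENT)     # "test"
print(test_settings.ENABLE_SHARING)  # False (default)
```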
-------------------------------------------------------------------------------- /app/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | from sqlalchemy_utils import create_database, database_exists, drop_database 4 | 5 | import app.shared.db.models as models 6 | from app.shared.db.base import make_engine, make_session_local 7 | from app.shared.settings import Settings 8 | from app.web.injections.db import get_session 9 | from app.web.injections.settings import get_settings 10 | from app.web.main import app_factory 11 | 12 | 13 | @pytest.fixture() 14 | def settings(): 15 | return Settings(_env_file=".env.test") # type: ignore 16 | 17 | 18 | @pytest.fixture() 19 | def auth_headers(settings) -> dict[str, str]: 20 | return {"Authorization": f"Bearer {settings.API_SECRET}"} 21 | 22 | 23 | @pytest.fixture() 24 | def test_db(settings): 25 | engine = make_engine(settings.DATABASE_URI) 26 | 27 | if not database_exists(engine.url): 28 | create_database(engine.url) 29 | 30 | models.Base.metadata.create_all(engine) 31 | 32 | connection = engine.connect() 33 | yield connection 34 | connection.close() 35 | 36 | models.Base.metadata.drop_all(bind=engine) 37 | drop_database(engine.url) 38 | 39 | 40 | @pytest.fixture() 41 | def db_session(test_db): 42 | session_local = make_session_local(test_db) 43 | with session_local() as session: 44 | yield session 45 | 46 | 47 | @pytest.fixture() 48 | def app(db_session, settings): 49 | app = app_factory() 50 | app.dependency_overrides[get_settings] = lambda: settings 51 | app.dependency_overrides[get_session] = lambda: db_session 52 | return app 53 | 54 | 55 | @pytest.fixture() 56 | def client(app): 57 | client = TestClient(app) 58 | return client 59 | 60 | 61 | @pytest.fixture() 62 | def mock_job(db_session): 63 | job = models.Job( 64 | url="https://example.com", 65 | type=models.JobType.transcript, 66 | status=models.JobStatus.processing, 67 | meta={"task_id": "5c790c76-2cc1-4e91-a305-443df55a4a4c"}, 68 | ) 69 | db_session.add(job) 70 | db_session.commit() 71 | return job 72 | 73 | 74 | @pytest.fixture() 75 | def mock_artifact(db_session, mock_job): 76 | artifact = models.Artifact( 77 | data=None, job_id=str(mock_job.id), type=models.ArtifactType.raw_transcript 78 | ) 79 | db_session.add(artifact) 80 | db_session.commit() 81 | return artifact 82 | -------------------------------------------------------------------------------- /app/tests/test_api.py: -------------------------------------------------------------------------------- 1 | import app.shared.db.models as models 2 | from app.shared.settings import Settings 3 | from app.web.injections.settings import get_settings 4 | 5 | 6 | # POST /api/v1/jobs 7 | # --- 8 | def test_create_job_pass(client, auth_headers: dict[str, str]): 9 | res = client.post( 10 | "/api/v1/jobs", 11 | headers=auth_headers, 12 | json={"url": "https://example.com", "type": models.JobType.transcript}, 13 | ) 14 | assert res.status_code == 201 15 | assert isinstance(res.json()["id"], str) 16 | 17 | 18 | def test_create_job_missing_body(client, auth_headers: dict[str, str]): 19 | res = client.post("/api/v1/jobs", headers=auth_headers, json={}) 20 | assert res.status_code == 422 21 | 22 | 23 | def test_create_job_malformed_url(client, auth_headers: dict[str, str]): 24 | res = client.post( 25 | "/api/v1/jobs", 26 | headers=auth_headers, 27 | json={"url": "example.com", "type": models.JobType.transcript}, 28 | ) 29 | assert 
res.status_code == 422 30 | 31 | 32 | # GET /api/v1/jobs 33 | # --- 34 | def test_get_jobs_pass(client, auth_headers: dict[str, str], mock_job: models.Job): 35 | res = client.get( 36 | "/api/v1/jobs?type=transcribe", 37 | headers=auth_headers, 38 | ) 39 | assert len(res.json()) == 1 40 | assert res.status_code == 200 41 | 42 | 43 | # GET /api/v1/jobs/:id 44 | # --- 45 | def test_get_job_pass(client, auth_headers: dict[str, str], mock_job: models.Job): 46 | res = client.get( 47 | f"/api/v1/jobs/{mock_job.id}", 48 | headers=auth_headers, 49 | ) 50 | assert res.status_code == 200 51 | assert res.json()["id"] == str(mock_job.id) 52 | 53 | 54 | def test_get_job_not_found(client, auth_headers: dict[str, str], mock_job): 55 | res = client.get( 56 | "/api/v1/jobs/c8ecf5ea-77cf-48a2-9ecd-199ef35e0ccb", 57 | headers=auth_headers, 58 | ) 59 | 60 | assert res.status_code == 404 61 | 62 | 63 | def test_get_job_sharing_disabled(client, mock_job): 64 | res = client.get( 65 | f"/api/v1/jobs/{mock_job.id}", 66 | headers={}, 67 | ) 68 | assert res.status_code == 401 69 | 70 | 71 | def test_get_job_sharing_enabled(client, app, mock_job): 72 | app.dependency_overrides[get_settings] = lambda: Settings( 73 | _env_file=".env.test", ENABLE_SHARING=True # type: ignore 74 | ) 75 | 76 | res = client.get( 77 | f"/api/v1/jobs/{mock_job.id}", 78 | headers={}, 79 | ) 80 | 81 | assert res.status_code == 200 82 | 83 | 84 | # GET /api/v1/jobs/:id/artifacts 85 | # --- 86 | def test_get_artifacts_pass(client, auth_headers, db_session, mock_job, mock_artifact): 87 | res = client.get( 88 | f"/api/v1/jobs/{mock_job.id}/artifacts", 89 | headers=auth_headers, 90 | ) 91 | 92 | assert res.status_code == 200 93 | assert res.json()[0]["job_id"] == str(mock_job.id) 94 | assert res.json()[0]["id"] == str(mock_artifact.id) 95 | 96 | 97 | def test_get_artifacts_not_found(client, auth_headers, mock_job): 98 | res = client.get( 99 | f"/api/v1/jobs/{mock_job.id}/artifacts", 100 | headers=auth_headers, 101 | ) 102 | 103 | assert len(res.json()) == 0 104 | assert res.status_code == 200 105 | 106 | 107 | # DELETE /api/v1/jobs 108 | # --- 109 | def test_delete_job_pass(client, auth_headers, mock_job, db_session): 110 | res_job = client.get( 111 | f"/api/v1/jobs/{mock_job.id}", 112 | headers=auth_headers, 113 | ) 114 | 115 | assert res_job.status_code == 200 116 | 117 | client.delete( 118 | f"/api/v1/jobs/{mock_job.id}", 119 | headers=auth_headers, 120 | ) 121 | 122 | # HACK: this catches a missed .commit(). 123 | # TODO: clean up pytest database handling. 
124 | db_session.rollback() 125 | 126 | res_job_missing = client.get( 127 | f"/api/v1/jobs/{mock_job.id}", 128 | headers=auth_headers, 129 | ) 130 | 131 | assert res_job_missing.status_code == 404 132 | -------------------------------------------------------------------------------- /app/tests/test_auth.py: -------------------------------------------------------------------------------- 1 | def test_authorization_header_missing(client): 2 | res = client.get("/api/v1/jobs") 3 | assert res.status_code == 401 4 | 5 | 6 | def test_authorization_header_malformed(client): 7 | res = client.get("/api/v1/jobs", headers={"Authorization": "Bearer"}) 8 | assert res.status_code == 401 9 | 10 | 11 | def test_incorrect_api_key(client): 12 | res = client.get("/api/v1/jobs", headers={"Authorization": "Bearer incorrect"}) 13 | assert res.status_code == 401 14 | 15 | 16 | def test_existing_api_key(client, auth_headers): 17 | res = client.get("/api/v1/jobs", headers=auth_headers) 18 | assert res.status_code == 200 19 | -------------------------------------------------------------------------------- /app/web/__init__.py: -------------------------------------------------------------------------------- 1 | from app.web.main import app_factory 2 | 3 | app = app_factory 4 | -------------------------------------------------------------------------------- /app/web/dtos.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from uuid import UUID 3 | 4 | from pydantic import AnyHttpUrl, BaseModel, ConfigDict 5 | 6 | from app.shared.db.models import ( 7 | ArtifactData, 8 | ArtifactType, 9 | JobConfig, 10 | JobMeta, 11 | JobStatus, 12 | JobType, 13 | ) 14 | 15 | # DB objects 16 | 17 | 18 | class WithDbFields(BaseModel): 19 | id: UUID 20 | created_at: datetime 21 | updated_at: datetime | None = None 22 | model_config = ConfigDict(from_attributes=True) 23 | 24 | 25 | class Job(WithDbFields): 26 | """A transcription job for one media file.""" 27 | 28 | status: JobStatus 29 | type: JobType 30 | url: AnyHttpUrl 31 | meta: JobMeta | None = None 32 | config: JobConfig | None = None 33 | 34 | 35 | class Artifact(WithDbFields): 36 | """A transcription artifact.""" 37 | 38 | job_id: UUID 39 | data: ArtifactData 40 | type: ArtifactType 41 | -------------------------------------------------------------------------------- /app/web/injections/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/web/injections/__init__.py -------------------------------------------------------------------------------- /app/web/injections/db.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from typing import Generator 3 | 4 | from fastapi import Depends 5 | from sqlalchemy.orm import Session 6 | 7 | from app.shared.db.base import make_engine, make_session_local 8 | from app.shared.settings import Settings 9 | from app.web.injections.settings import get_settings 10 | 11 | 12 | @lru_cache 13 | def session_local(database_url: str): 14 | engine = make_engine(database_url) 15 | return make_session_local(engine) 16 | 17 | 18 | def get_session_local(settings: Settings = Depends(get_settings)): 19 | return session_local(settings.DATABASE_URI) 20 | 21 | 22 | def get_session( 23 | session_local=Depends(get_session_local), 24 | ) -> Generator[Session, None, None]: 25 | 
with session_local() as session: 26 | yield session 27 | -------------------------------------------------------------------------------- /app/web/injections/security.py: -------------------------------------------------------------------------------- 1 | from hmac import compare_digest 2 | from typing import Annotated 3 | 4 | from fastapi import Depends, HTTPException 5 | from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer 6 | 7 | from app.shared.settings import Settings 8 | from app.web.injections.settings import get_settings 9 | 10 | 11 | def api_key_auth( 12 | credentials: Annotated[ 13 | HTTPAuthorizationCredentials, Depends(HTTPBearer(auto_error=False)) 14 | ], 15 | settings: Annotated[Settings, Depends(get_settings)], 16 | ): 17 | validate_credentials(credentials, settings.API_SECRET) 18 | 19 | 20 | def sharing_auth( 21 | credentials: Annotated[ 22 | HTTPAuthorizationCredentials, Depends(HTTPBearer(auto_error=False)) 23 | ], 24 | settings: Annotated[Settings, Depends(get_settings)], 25 | ): 26 | if settings.ENABLE_SHARING: 27 | pass 28 | else: 29 | validate_credentials(credentials, settings.API_SECRET) 30 | 31 | 32 | def validate_credentials(credentials: HTTPAuthorizationCredentials, secret: str): 33 | # use compare_digest to counter timing attacks. 34 | if ( 35 | not credentials 36 | or not secret 37 | or not compare_digest(secret, credentials.credentials) 38 | ): 39 | raise HTTPException(status_code=401) 40 | -------------------------------------------------------------------------------- /app/web/injections/settings.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | from app.shared.settings import Settings 4 | 5 | 6 | @lru_cache 7 | def get_settings(): 8 | return Settings() # type: ignore 9 | -------------------------------------------------------------------------------- /app/web/injections/task_queue.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | from fastapi import Depends 4 | 5 | from app.shared.settings import Settings 6 | from app.web.injections.settings import get_settings 7 | from app.web.task_queue import TaskQueue 8 | 9 | 10 | @lru_cache 11 | def task_queue(broker_url: str): 12 | return TaskQueue(broker_url) 13 | 14 | 15 | def get_task_queue(settings: Settings = Depends(get_settings)): 16 | return task_queue(settings.BROKER_URL) 17 | -------------------------------------------------------------------------------- /app/web/main.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | from uuid import UUID 3 | 4 | from fastapi import APIRouter, Depends, FastAPI, HTTPException, Path 5 | from pydantic import AnyHttpUrl, BaseModel, Field 6 | from sqlalchemy.orm import Session 7 | 8 | import app.shared.db.models as models 9 | import app.web.dtos as dtos 10 | from app.web.injections.db import get_session 11 | from app.web.injections.security import api_key_auth, sharing_auth 12 | from app.web.injections.task_queue import get_task_queue 13 | from app.web.task_queue import TaskQueue 14 | 15 | DatabaseSession = Annotated[Session, Depends(get_session)] 16 | 17 | 18 | def app_factory(): 19 | app = FastAPI( 20 | description=( 21 | "whisperbox-transcribe is an async HTTP wrapper for openai/whisper." 
22 | ), 23 | title="whisperbox-transcribe", 24 | ) 25 | 26 | api_router = APIRouter(prefix="/api/v1") 27 | 28 | @api_router.get("/", status_code=204) 29 | def api_root(): 30 | return None 31 | 32 | @api_router.get( 33 | "/jobs", 34 | dependencies=[Depends(api_key_auth)], 35 | response_model=list[dtos.Job], 36 | summary="Get metadata for all jobs", 37 | ) 38 | def get_jobs( 39 | session: DatabaseSession, 40 | type: dtos.JobType | None = None, 41 | ) -> list[models.Job]: 42 | """Get metadata for all jobs.""" 43 | query = session.query(models.Job).order_by(models.Job.created_at.desc()) 44 | 45 | if type: 46 | query = query.filter(models.Job.type == type) 47 | 48 | return query.all() 49 | 50 | @api_router.get( 51 | "/jobs/{id}", 52 | dependencies=[Depends(sharing_auth)], 53 | response_model=dtos.Job, 54 | summary="Get metadata for one job", 55 | ) 56 | def get_job( 57 | session: DatabaseSession, 58 | id: UUID = Path(), 59 | ) -> models.Job | None: 60 | """ 61 | Use this route to check transcription status of any given job. 62 | """ 63 | job = session.query(models.Job).filter(models.Job.id == str(id)).one_or_none() 64 | 65 | if not job: 66 | raise HTTPException(status_code=404) 67 | 68 | return job 69 | 70 | @api_router.get( 71 | "/jobs/{id}/artifacts", 72 | dependencies=[Depends(api_key_auth)], 73 | response_model=list[dtos.Artifact], 74 | summary="Get all artifacts for one job", 75 | ) 76 | def get_artifacts_for_job( 77 | session: DatabaseSession, 78 | id: UUID = Path(), 79 | ) -> list[models.Artifact]: 80 | """ 81 | Returns all artifacts for one job. 82 | See the type of `data` for possible data types. 83 | Returns an empty array for unfinished or non-existant jobs. 84 | """ 85 | artifacts = ( 86 | session.query(models.Artifact).filter(models.Artifact.job_id == str(id)) 87 | ).all() 88 | 89 | return artifacts 90 | 91 | @api_router.delete( 92 | "/jobs/{id}", 93 | dependencies=[Depends(sharing_auth)], 94 | status_code=204, 95 | summary="Delete a job with all artifacts", 96 | ) 97 | def delete_transcript( 98 | session: DatabaseSession, 99 | id: UUID = Path(), 100 | ) -> None: 101 | """Remove metadata and artifacts for a single job.""" 102 | session.query(models.Job).filter(models.Job.id == str(id)).delete() 103 | session.commit() 104 | return None 105 | 106 | class PostJobPayload(BaseModel): 107 | url: AnyHttpUrl = Field( 108 | description=( 109 | "URL where the media file is available. This needs to be a direct link." 110 | ) 111 | ) 112 | 113 | type: models.JobType = Field( 114 | description="""Type of this job. 115 | `transcript` uses the original language of the audio. 116 | `translation` creates an automatic translation to english. 117 | `language_detection` detects language from the first 30 seconds of audio.""" 118 | ) 119 | 120 | language: str | None = Field( 121 | default=None, 122 | description=( 123 | "Spoken language in the media file. " 124 | "While optional, this can improve output when set." 125 | ), 126 | ) 127 | 128 | @api_router.post( 129 | "/jobs", 130 | dependencies=[Depends(api_key_auth)], 131 | response_model=dtos.Job, 132 | status_code=201, 133 | summary="Enqueue a new job", 134 | ) 135 | def create_job( 136 | payload: PostJobPayload, 137 | session: DatabaseSession, 138 | task_queue: Annotated[TaskQueue, Depends(get_task_queue)], 139 | ) -> models.Job: 140 | """ 141 | Enqueue a new whisper job for processing. 142 | Notes: 143 | * Jobs are processed one-by-one in order of creation. 144 | * `payload.url` needs to point directly to a media file. 
145 | * The media file is downloaded to a tmp file for the duration of processing. 146 | enough free space needs to be available on disk. 147 | * Media files ideally are audio files with a sampling rate of 16kHz. 148 | other files will be transcoded automatically via ffmpeg which might 149 | consume considerable resources while active. 150 | * Once a job is created, you can query its status by its id. 151 | """ 152 | 153 | # create a job with status "create" and save it to the database. 154 | job = models.Job( 155 | url=str(payload.url), 156 | status=dtos.JobStatus.create, 157 | type=payload.type, 158 | config={"language": payload.language} if payload.language else None, 159 | ) 160 | 161 | session.add(job) 162 | session.commit() 163 | 164 | task_queue.queue_task(job) 165 | 166 | return job 167 | 168 | app.include_router(api_router) 169 | 170 | return app 171 | -------------------------------------------------------------------------------- /app/web/task_queue.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | import app.shared.db.models as models 4 | from app.shared.celery import get_celery_binding 5 | 6 | 7 | class TaskQueue: 8 | celery: Celery 9 | 10 | def __init__(self, broker_url: str) -> None: 11 | self.celery = get_celery_binding(broker_url=broker_url) 12 | 13 | def queue_task(self, job: models.Job): 14 | """ 15 | Queues an async transcription job. We use a celery signature here to 16 | allow for full separation of worker processes and dependencies. 17 | """ 18 | transcribe = self.celery.signature("app.worker.main.transcribe") 19 | # TODO: catch delivery errors? 20 | transcribe.delay(job.id) 21 | -------------------------------------------------------------------------------- /app/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/worker/__init__.py -------------------------------------------------------------------------------- /app/worker/main.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from uuid import UUID 3 | 4 | from celery import Task 5 | from sqlalchemy.orm import Session 6 | 7 | import app.shared.db.models as models 8 | from app.shared.celery import get_celery_binding 9 | from app.shared.db.base import make_engine, make_session_local 10 | from app.shared.logger import logger 11 | from app.shared.settings import Settings 12 | from app.worker.strategies.local import LocalStrategy 13 | 14 | # TODO: refactor to be part of a Task instance. 15 | settings = Settings() # type: ignore 16 | celery = get_celery_binding(settings.BROKER_URL) 17 | engine = make_engine(settings.DATABASE_URI) 18 | SessionLocal = make_session_local(engine) 19 | 20 | 21 | class TranscribeTask(Task): 22 | """ 23 | Decorate the transcribe task with an instance of the transcription strategy. 24 | This is important for the local strategy, where loading the model is expensive. 25 | """ 26 | 27 | abstract = True 28 | 29 | def __init__(self) -> None: 30 | super().__init__() 31 | # currently only `LocalStrategy` is implemented. 32 | self.strategy: LocalStrategy | None = None 33 | 34 | def __call__(self, *args: Any, **kwargs: Any) -> Any: 35 | # load model into memory once when the first task is processed. 
36 | if not self.strategy: 37 | self.strategy = LocalStrategy() 38 | return self.run(*args, **kwargs) 39 | 40 | 41 | @celery.task( 42 | base=TranscribeTask, 43 | bind=True, 44 | soft_time_limit=settings.TASK_SOFT_TIME_LIMIT, 45 | time_limit=settings.TASK_HARD_TIME_LIMIT, 46 | task_acks_late=True, 47 | task_acks_on_failure_or_timeout=True, 48 | task_reject_on_worker_lost=True, 49 | ) 50 | def transcribe(self: TranscribeTask, job_id: UUID) -> None: 51 | session: Session | None = None 52 | job: models.Job | None = None 53 | 54 | try: 55 | if not self.strategy: 56 | raise Exception("expected a transcription strategy to be defined.") 57 | 58 | # runs in a separate thread => requires sqlite's WAL mode to be enabled. 59 | session = SessionLocal() 60 | 61 | # work around mypy not inferring the sum type correctly. 62 | if not session: 63 | raise Exception("failed to acquire a session.") 64 | 65 | # check if passed job should be processed. 66 | 67 | job = session.query(models.Job).filter(models.Job.id == job_id).one_or_none() 68 | 69 | if job is None: 70 | logger.warn("[unknown]: Received unknown job, abort.") 71 | return 72 | 73 | if job.status in [models.JobStatus.error, models.JobStatus.success]: 74 | logger.warn(f"[{job.id}]: job has already been processed, abort.") 75 | return 76 | 77 | logger.debug(f"[{job.id}]: start processing {job.type} job.") 78 | 79 | if job.meta is not None: 80 | attempts = 1 + (job.meta.get("attempts") or 0) 81 | else: 82 | attempts = 1 83 | 84 | # SAFEGUARD: celery's retry policies do not handle lost workers, retry once. 85 | # @see https://github.com/celery/celery/pull/6103 86 | if attempts > 2: 87 | raise Exception("Maximum number of retries exceeded for killed worker.") 88 | 89 | # unit of work: set task status to processing. 90 | 91 | job.meta = {"task_id": self.request.id, "attempts": attempts} 92 | 93 | job.status = models.JobStatus.processing 94 | session.commit() 95 | 96 | logger.debug(f"[{job.id}]: finished setting task to {job.status}.") 97 | 98 | # unit of work: process job with whisper. 99 | result_type, result = self.strategy.process(job) 100 | logger.debug(f"[{job.id}]: successfully processed audio.") 101 | 102 | artifact = models.Artifact(job_id=str(job.id), data=result, type=result_type) 103 | session.add(artifact) 104 | 105 | job.status = models.JobStatus.success 106 | session.commit() 107 | 108 | logger.debug(f"[{job.id}]: successfully stored artifact.") 109 | 110 | except Exception as e: 111 | if job and session: 112 | if session.in_transaction(): 113 | session.rollback() 114 | if job.meta is not None: 115 | job.meta = {**job.meta, "error": str(e)} 116 | else: 117 | job.meta = {"error": str(e)} 118 | 119 | job.status = models.JobStatus.error 120 | session.commit() 121 | raise 122 | finally: 123 | if self.strategy: 124 | self.strategy.cleanup(job_id) 125 | if session: 126 | session.close() 127 | -------------------------------------------------------------------------------- /app/worker/strategies/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from abc import ABC 4 | from typing import Any, Protocol, Tuple 5 | from uuid import UUID 6 | 7 | import requests 8 | 9 | import app.shared.db.models as models 10 | 11 | TaskReturnValue = Tuple[models.ArtifactType, Any] 12 | 13 | 14 | class TaskProtocol(Protocol): 15 | def __call__(self, job: models.Job) -> TaskReturnValue: 16 | ... 
17 | 18 | 19 | class BaseStrategy(ABC): 20 | def process(self, job: models.Job) -> TaskReturnValue: 21 | if job.type == models.JobType.transcript: 22 | return self.transcribe(job) 23 | elif job.type == models.JobType.translation: 24 | return self.translate(job) 25 | else: 26 | return self.detect_language(job) 27 | 28 | def cleanup(self, job_id: UUID) -> None: 29 | try: 30 | os.remove(self._get_tmp_file(job_id)) 31 | except OSError: 32 | ... 33 | 34 | def transcribe(self, job: models.Job) -> TaskReturnValue: 35 | raise NotImplementedError() 36 | 37 | def translate(self, job: models.Job) -> TaskReturnValue: 38 | raise NotImplementedError() 39 | 40 | def detect_language(self, job: models.Job) -> TaskReturnValue: 41 | raise NotImplementedError() 42 | 43 | def _get_tmp_file(self, job_id: UUID) -> str: 44 | tmp = tempfile.gettempdir() 45 | return os.path.join(tmp, str(job_id)) 46 | 47 | def _download(self, url: str, job_id: UUID) -> str: 48 | # re-create folder. 49 | filename = self._get_tmp_file(job_id) 50 | self.cleanup(job_id) 51 | 52 | # stream media to disk. 53 | with requests.get(url, stream=True) as r: 54 | r.raise_for_status() 55 | with open(filename, "wb") as f: 56 | for chunk in r.iter_content(chunk_size=8192): 57 | f.write(chunk) 58 | 59 | return filename 60 | -------------------------------------------------------------------------------- /app/worker/strategies/local.py: -------------------------------------------------------------------------------- 1 | import os 2 | from asyncio.log import logger 3 | from typing import Any, Literal 4 | from uuid import UUID 5 | 6 | import torch 7 | import whisper 8 | from pydantic import BaseModel 9 | 10 | import app.shared.db.models as models 11 | from app.worker.strategies.base import BaseStrategy, TaskReturnValue 12 | 13 | 14 | class DecodingOptions(BaseModel): 15 | """ 16 | Options passed to the whipser model. 17 | This mirrors private type `whisper.DecodingOptions`. 
18 | """ 19 | 20 | language: str | None = None 21 | task: Literal["translate", "transcribe"] 22 | 23 | 24 | class LocalStrategy(BaseStrategy): 25 | def __init__(self) -> None: 26 | if torch.cuda.is_available(): 27 | logger.debug("initializing GPU model.") 28 | self.model = whisper.load_model( 29 | os.environ["WHISPER_MODEL"], download_root="/models" 30 | ).cuda() 31 | else: 32 | logger.debug("initializing CPU model.") 33 | self.model = whisper.load_model( 34 | os.environ["WHISPER_MODEL"], download_root="/models" 35 | ) 36 | 37 | logger.debug("initialized local strategy.") 38 | 39 | def transcribe(self, job): 40 | result = self._run_whisper( 41 | self._download(job.url, job.id), "transcribe", job.config, job.id 42 | ) 43 | 44 | return (models.ArtifactType.raw_transcript, result) 45 | 46 | def translate(self, job) -> TaskReturnValue: 47 | result = self._run_whisper( 48 | self._download(job.url, job.id), 49 | "translate", 50 | job.config, 51 | job.id, 52 | ) 53 | return (models.ArtifactType.raw_transcript, result) 54 | 55 | def detect_language(self, job) -> TaskReturnValue: 56 | file = self._download(job.url, job.id) 57 | 58 | # see: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/README.md?plain=1#L114 59 | audio = whisper.pad_or_trim(whisper.load_audio(file)) 60 | mel = whisper.log_mel_spectrogram(audio).to(self.model.device) 61 | _, probs = self.model.detect_language(mel) 62 | 63 | return ( 64 | models.ArtifactType.language_detection, 65 | {"code": max(probs, key=probs.get)}, 66 | ) 67 | 68 | def _run_whisper( 69 | self, 70 | filepath: str, 71 | task: Literal["translate", "transcribe"], 72 | config: dict[str, Any], 73 | job_id: UUID, 74 | ) -> list[Any]: 75 | result = self.model.transcribe( 76 | filepath, 77 | # turning this off might make the transcription less accurate, 78 | # but significantly reduces amount of model halucinations. 79 | condition_on_previous_text=False, 80 | **DecodingOptions( 81 | task=task, 82 | language=models.JobConfig(**config).language if config else None, 83 | ).dict(), 84 | ) 85 | 86 | return result["segments"] 87 | -------------------------------------------------------------------------------- /conf/rabbitmq.conf: -------------------------------------------------------------------------------- 1 | vm_memory_high_watermark.absolute = 192MB 2 | consumer_timeout = 31622400000 3 | -------------------------------------------------------------------------------- /docker-compose.base.yml: -------------------------------------------------------------------------------- 1 | x-broker-environment: &broker-environment 2 | BROKER_URL: "amqp://${RABBITMQ_DEFAULT_USER}:${RABBITMQ_DEFAULT_PASS}@rabbitmq:5672" 3 | 4 | version: "3.8" 5 | name: whisperbox-transcribe 6 | 7 | services: 8 | traefik: 9 | image: "traefik:latest" 10 | restart: unless-stopped 11 | volumes: 12 | - /var/run/docker.sock:/var/run/docker.sock:ro 13 | depends_on: 14 | - web 15 | networks: 16 | - traefik 17 | 18 | rabbitmq: 19 | env_file: .env 20 | image: rabbitmq:3-alpine 21 | networks: 22 | - app 23 | deploy: 24 | resources: 25 | limits: 26 | memory: 256M 27 | healthcheck: 28 | test: rabbitmq-diagnostics check_port_connectivity 29 | interval: 3s 30 | timeout: 3s 31 | retries: 10 32 | 33 | volumes: 34 | - ./conf/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf 35 | - rabbitmq-data:/var/lib/rabbitmq/mnesia/ 36 | 37 | worker: 38 | env_file: .env 39 | environment: 40 | <<: *broker-environment 41 | build: 42 | context: . 
43 |       dockerfile: worker.Dockerfile
44 |       args:
45 |         WHISPER_MODEL: ${WHISPER_MODEL}
46 |     depends_on:
47 |       rabbitmq:
48 |         condition: service_healthy
49 |     networks:
50 |       - app
51 |
52 |   web:
53 |     env_file: .env
54 |     environment:
55 |       <<: *broker-environment
56 |     build:
57 |       context: .
58 |       dockerfile: web.Dockerfile
59 |     depends_on:
60 |       rabbitmq:
61 |         condition: service_healthy
62 |     networks:
63 |       - app
64 |       - traefik
65 |
66 | networks:
67 |   app:
68 |     driver: bridge
69 |   traefik:
70 |     driver: bridge
71 |
72 | volumes:
73 |   rabbitmq-data:
74 |
--------------------------------------------------------------------------------
/docker-compose.dev.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | name: whisperbox-transcribe-dev
 3 |
 4 | services:
 5 |   traefik:
 6 |     ports:
 7 |       - "80:80"
 8 |     command:
 9 |       - "--providers.docker=true"
10 |       - "--providers.docker.exposedbydefault=false"
11 |       - "--providers.docker.network=whisperbox-transcribe-dev_traefik"
12 |       - "--entrypoints.web.address=:80"
13 |
14 |   web:
15 |     command: bash -c "alembic upgrade head && uvicorn app.web:app --reload --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --log-level info --factory"
16 |     # NOTE: the Docker for Mac mount adapter (virtioFS) does not support flock.
17 |     # This can corrupt the SQLite database when the worker and the API write to it simultaneously.
18 |     volumes:
19 |       - ./:/etc/whisperbox-transcribe/
20 |     labels:
21 |       - "traefik.http.routers.web.entrypoints=web"
22 |       - "traefik.enable=true"
23 |       - "traefik.http.services.web.loadbalancer.server.port=8000"
24 |       - "traefik.http.routers.web.rule=(Host(`${TRAEFIK_DOMAIN}`))"
25 |
26 |   worker:
27 |     command: watchmedo auto-restart -d app/worker -p *.py --recursive celery -- --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool prefork
28 |     volumes:
29 |       - ./:/etc/whisperbox-transcribe/
30 |
31 |   rabbitmq:
32 |     image: rabbitmq:3-management-alpine
33 |     ports:
34 |       - 15672:15672
35 |
36 |   flower:
37 |     image: mher/flower
38 |     command: celery --broker amqp://${RABBITMQ_DEFAULT_USER}:${RABBITMQ_DEFAULT_PASS}@rabbitmq:5672 flower --port=5555
39 |     ports:
40 |       - 5555:5555
41 |     depends_on:
42 |       - worker
43 |       - rabbitmq
44 |     networks:
45 |       - app
46 |
--------------------------------------------------------------------------------
/docker-compose.prod.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | name: whisperbox-transcribe
 3 |
 4 | services:
 5 |   traefik:
 6 |     ports:
 7 |       - "80:80"
 8 |       - "443:443"
 9 |     command:
10 |       - "--providers.docker=true"
11 |       - "--providers.docker.exposedbydefault=false"
12 |       - "--providers.docker.network=whisperbox-transcribe_traefik"
13 |       - "--entrypoints.web.address=:80"
14 |       - "--entrypoints.websecure.address=:443"
15 |       - "--entrypoints.web.http.redirections.entryPoint.to=websecure"
16 |       - "--entrypoints.web.http.redirections.entryPoint.scheme=https"
17 |       - "--entrypoints.web.http.redirections.entrypoint.permanent=true"
18 |       - "--certificatesresolvers.le.acme.email=${TRAEFIK_SSLEMAIL}"
19 |       - "--certificatesresolvers.le.acme.storage=/letsencrypt/acme.json"
20 |       - "--certificatesresolvers.le.acme.tlschallenge=true"
21 |     volumes:
22 |       - ./data/letsencrypt:/letsencrypt
23 |       - /var/run/docker.sock:/var/run/docker.sock:ro
24 |
25 |   worker:
26 |     #
27 |     # build:
28 |     #   dockerfile: worker.gpu.Dockerfile
29 |     volumes:
30 |       - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data
31 |     #
32 |     # deploy:
33 |     #   resources:
34 |     #     reservations:
35 |     #       devices:
36 |     #         - driver: nvidia
37 |     #           count: 1
38 |     #           capabilities: [gpu]
39 |
40 |   web:
41 |     volumes:
42 |       - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data/
43 |     labels:
44 |       - "traefik.enable=true"
45 |       - "traefik.http.services.web.loadbalancer.server.port=8000"
46 |       - "traefik.http.routers.web.rule=(Host(`${TRAEFIK_DOMAIN}`))"
47 |       - "traefik.http.routers.web.entrypoints=websecure"
48 |       - "traefik.http.routers.web.tls=true"
49 |       - "traefik.http.routers.web.tls.certresolver=le"
50 |
51 | volumes:
52 |   whisperbox-transcribe-data:
53 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins = sqlalchemy.ext.mypy.plugin
3 | ignore_missing_imports = True
4 | disallow_untyped_defs = False
5 | check_untyped_defs = True
6 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "whisperbox-transcribe"
 3 | description = ""
 4 | version = "1.0.1"
 5 |
 6 | dependencies=[
 7 |   "celery ==5.3.6",
 8 |   "sqlalchemy[mypy] ==2.0.24",
 9 |   "pydantic ==2.5.3",
10 |   "pydantic-settings ==2.1.0"
11 | ]
12 |
13 | [project.optional-dependencies]
14 | web=[
15 |   "alembic ==1.11.3",
16 |   "fastapi ==0.101.1",
17 |   "uvicorn[standard] ==0.23.2",
18 |   "gunicorn ==21.2.0"
19 | ]
20 |
21 | worker=[
22 |   "watchdog[watchmedo] ==3.0.0",
23 |   "openai-whisper ==20230314",
24 |   "requests ==2.31.0"
25 | ]
26 |
27 | tooling = [
28 |   # code formatting
29 |   "black ==23.12.1",
30 |   # linting
31 |   "ruff ==0.0.292",
32 |   # tests
33 |   "httpx ==0.26.0",
34 |   "sqlalchemy-utils ==0.41.1",
35 |   "python-dotenv ==1.0.0",
36 |   "pytest ==7.4.4",
37 |   # types
38 |   "mypy ==1.5.1",
39 |   "types-requests ==2.31.0.20231231"
40 | ]
41 |
42 | [tool.ruff]
43 | # pyflakes, pycodestyle, isort
44 | select = ["F", "E", "W", "I001"]
45 |
46 | [tool.setuptools]
47 | py-modules = []
48 |
--------------------------------------------------------------------------------
/scripts/download_models.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from whisper import _download, _MODELS  # type: ignore
4 |
5 | if __name__ == "__main__":
6 |     model_name = sys.argv[1].strip()
7 |     _download(_MODELS[model_name], "/models/", False)
8 |
--------------------------------------------------------------------------------
/web.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11-slim as python-build
 2 |
 3 | WORKDIR /etc/whisperbox-transcribe
 4 |
 5 | COPY pyproject.toml .
 6 |
 7 | RUN python -m venv /opt/venv && \
 8 |   /opt/venv/bin/pip install -U pip wheel && \
 9 |   /opt/venv/bin/pip install -U .[web]
10 |
11 | FROM python:3.11-slim as python-deploy
12 |
13 | WORKDIR /etc/whisperbox-transcribe
14 |
15 | COPY --from=python-build /opt/venv /opt/venv
16 |
17 | COPY app ./app
18 | COPY alembic.ini .
19 |
20 | ENV VIRTUAL_ENV /opt/venv
21 | ENV PATH /opt/venv/bin:$PATH
22 |
23 | CMD alembic upgrade head && uvicorn app.web:app --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --log-level info --workers 4 --proxy-headers --factory
24 |
--------------------------------------------------------------------------------
/worker.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11-slim AS python-build
 2 |
 3 | WORKDIR /etc/whisperbox-transcribe
 4 |
 5 | # Create and build virtual env from requirements.
 6 | COPY pyproject.toml .
 7 |
 8 | RUN python -m venv /opt/venv && \
 9 |   /opt/venv/bin/pip install -U pip wheel && \
10 |   /opt/venv/bin/pip install -U .[worker]
11 |
12 | FROM python:3.11-slim as python-deploy
13 |
14 | ARG WHISPER_MODEL
15 |
16 | WORKDIR /etc/whisperbox-transcribe
17 |
18 | COPY --from=python-build /opt/venv /opt/venv
19 |
20 | COPY --from=mwader/static-ffmpeg:latest /ffmpeg /usr/local/bin/
21 | COPY --from=mwader/static-ffmpeg:latest /ffprobe /usr/local/bin/
22 |
23 | ENV VIRTUAL_ENV /opt/venv
24 | ENV PATH /opt/venv/bin:$PATH
25 |
26 | COPY scripts/download_models.py .
27 | RUN python download_models.py ${WHISPER_MODEL}
28 |
29 | COPY app ./app
30 |
31 | CMD celery --app=app.worker.main.celery worker --loglevel=info --pool=prefork --concurrency=1
32 |
--------------------------------------------------------------------------------
/worker.gpu.Dockerfile:
--------------------------------------------------------------------------------
 1 | # TODO: clean up
 2 | FROM nvidia/cuda:11.8.0-base-ubuntu22.04 AS python-deploy
 3 |
 4 | ENV PYTHON_VERSION=3.11
 5 |
 6 | ARG WHISPER_MODEL
 7 |
 8 | WORKDIR /etc/whisperbox-transcribe
 9 |
10 | RUN export DEBIAN_FRONTEND=noninteractive \
11 |   && apt-get -qq update \
12 |   && apt-get -qq install --no-install-recommends \
13 |     python${PYTHON_VERSION} \
14 |     python${PYTHON_VERSION}-venv \
15 |     python3-pip \
16 |   && rm -rf /var/lib/apt/lists/*
17 |
18 | RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \
19 |   ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \
20 |   ln -s -f /usr/bin/pip3 /usr/bin/pip
21 |
22 | COPY pyproject.toml .
23 |
24 | RUN python -m venv /opt/venv && \
25 |   /opt/venv/bin/pip install -U pip wheel && \
26 |   /opt/venv/bin/pip install -U .[worker]
27 |
28 | COPY --from=mwader/static-ffmpeg:latest /ffmpeg /usr/local/bin/
29 | COPY --from=mwader/static-ffmpeg:latest /ffprobe /usr/local/bin/
30 |
31 | COPY app ./app
32 |
33 | ENV VIRTUAL_ENV /opt/venv
34 | ENV PATH /opt/venv/bin:$PATH
35 |
36 | COPY scripts/download_models.py .
37 | RUN python download_models.py ${WHISPER_MODEL}
38 |
39 | CMD celery --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool=prefork
40 |
--------------------------------------------------------------------------------
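
Note: the worker's extension seam is BaseStrategy in app/worker/strategies/base.py. Its process() method dispatches a job to transcribe(), translate() or detect_language(), and LocalStrategy is the only implementation included above. The sketch below is illustrative only and is not a file in this repository; the EchoStrategy name is invented, and it assumes TaskReturnValue is the (ArtifactType, payload) tuple that LocalStrategy returns.

import app.shared.db.models as models
from app.worker.strategies.base import BaseStrategy, TaskReturnValue


class EchoStrategy(BaseStrategy):
    """Minimal sketch of a strategy that skips model inference entirely."""

    def transcribe(self, job: models.Job) -> TaskReturnValue:
        # exercise the shared download/cleanup helpers, then return an empty transcript.
        self._download(job.url, job.id)
        self.cleanup(job.id)
        return (models.ArtifactType.raw_transcript, [])

    def translate(self, job: models.Job) -> TaskReturnValue:
        return (models.ArtifactType.raw_transcript, [])

    def detect_language(self, job: models.Job) -> TaskReturnValue:
        # mirrors the payload shape LocalStrategy.detect_language returns.
        return (models.ArtifactType.language_detection, {"code": "en"})

A stub like this could help exercise the Celery task without pulling in torch or whisper, but using it would require swapping the strategy instantiated in app/worker/main.py, which is not shown here.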