├── .editorconfig ├── .env.dev ├── .env.example ├── .env.test ├── .github ├── renovate.json └── workflows │ └── ci.yml ├── .gitignore ├── Makefile ├── README.md ├── alembic.ini ├── app ├── __init__.py ├── shared │ ├── __init__.py │ ├── celery.py │ ├── db │ │ ├── __init__.py │ │ ├── alembic │ │ │ ├── README │ │ │ ├── env.py │ │ │ ├── script.py.mako │ │ │ └── versions │ │ │ │ └── 0eee2b7913b7_add_tables.py │ │ ├── base.py │ │ └── models.py │ ├── logger.py │ └── settings.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_api.py │ └── test_auth.py ├── web │ ├── __init__.py │ ├── dtos.py │ ├── injections │ │ ├── __init__.py │ │ ├── db.py │ │ ├── security.py │ │ ├── settings.py │ │ └── task_queue.py │ ├── main.py │ └── task_queue.py └── worker │ ├── __init__.py │ ├── main.py │ └── strategies │ ├── base.py │ └── local.py ├── conf └── rabbitmq.conf ├── docker-compose.base.yml ├── docker-compose.dev.yml ├── docker-compose.prod.yml ├── mypy.ini ├── pyproject.toml ├── scripts └── download_models.py ├── web.Dockerfile ├── worker.Dockerfile └── worker.gpu.Dockerfile /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | charset = utf-8 9 | end_of_line = lf 10 | insert_final_newline = true 11 | trim_trailing_whitespace = true 12 | 13 | # 2 space indentation for every file 14 | [*] 15 | indent_style = space 16 | indent_size = 2 17 | 18 | # 4 space indentation for python 19 | [*.py] 20 | indent_size = 4 21 | 22 | # allow trailing whitespace in markdown files 23 | [*.md] 24 | trim_trailing_whitespace = false 25 | 26 | [Makefile] 27 | indent_style = tab 28 | -------------------------------------------------------------------------------- /.env.dev: -------------------------------------------------------------------------------- 1 | API_SECRET="a_very_secret_token" 2 | TRAEFIK_DOMAIN="whisperbox-transcribe.localhost" 3 | WHISPER_MODEL="tiny" 4 | ENVIRONMENT="development" 5 | DATABASE_URI="sqlite:///./whisperbox-transcribe.sqlite" 6 | 7 | RABBITMQ_DEFAULT_USER="rabbitmq" 8 | RABBITMQ_DEFAULT_PASS="rabbitmq_password" 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # this key is later used to authenticate against the API. 2 | API_SECRET="change_me" 3 | 4 | # see https://github.com/openai/whisper#available-models-and-languages 5 | WHISPER_MODEL="small" 6 | 7 | # If enabled, GET requests to routes `/job/:id` and `/job/:id/artifacts` will be unauthenticated. 8 | ENABLE_SHARING="false" 9 | 10 | # the domain you want to access the service from. Its A records need to point to the host IP. 11 | TRAEFIK_DOMAIN="whisperbox-transcribe.localhost" 12 | 13 | # an email which is used to verify domain ownership before a TLS certificate is issued. 14 | TRAEFIK_SSLEMAIL="" 15 | 16 | # --- 17 | # below settings match the default docker-compose configuration. 
18 | 19 | RABBITMQ_DEFAULT_USER="rabbitmq" 20 | RABBITMQ_DEFAULT_PASS="rabbitmq_password" 21 | 22 | DATABASE_URI="sqlite:////etc/whisperbox-transcribe/data/whisperbox-transcribe.sqlite" 23 | ENVIRONMENT="production" 24 | -------------------------------------------------------------------------------- /.env.test: -------------------------------------------------------------------------------- 1 | API_SECRET="test_secret" 2 | BROKER_URL="memory://" 3 | DATABASE_URI="sqlite://" 4 | ENVIRONMENT="test" 5 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["config:base", "schedule:monthly"], 4 | "timezone": "Europe/Berlin", 5 | "enabledManagers": ["dockerfile", "docker-compose", "pep621"] 6 | } 7 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: push 3 | 4 | jobs: 5 | lint: 6 | runs-on: ubuntu-latest 7 | name: Lint 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: actions/setup-python@v4 11 | with: 12 | python-version: '3.11' 13 | cache: 'pip' 14 | cache-dependency-path: '**/pyproject.toml' 15 | - run: pip install -e .[web,tooling] 16 | - run: make lint 17 | 18 | test: 19 | runs-on: ubuntu-latest 20 | name: Test 21 | steps: 22 | - uses: actions/checkout@v3 23 | - uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.11' 26 | cache: 'pip' 27 | cache-dependency-path: '**/pyproject.toml' 28 | - run: pip install -e .[web,tooling] 29 | - run: pytest 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | # VS Code 163 | .vscode 164 | .DS_Store 165 | 166 | whisperbox-transcribe.sqlite* 167 | *shm 168 | *wal 169 | 170 | # ruff 171 | .ruff_cache 172 | 173 | # other private files 174 | /data 175 | .env.prod 176 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | docker compose -f docker-compose.base.yml -f docker-compose.dev.yml down --volumes --remove-orphans 3 | 4 | dev: 5 | docker compose -f docker-compose.base.yml -f docker-compose.dev.yml build 6 | docker compose -f docker-compose.base.yml -f docker-compose.dev.yml up --remove-orphans 7 | 8 | fmt: 9 | black app 10 | ruff check app --fix 11 | 12 | lint: 13 | black --check app 14 | ruff check app 15 | mypy app 16 | 17 | test: 18 | pytest 19 | 20 | run: 21 | docker compose -f docker-compose.base.yml -f docker-compose.prod.yml build 22 | docker compose -f docker-compose.base.yml -f docker-compose.prod.yml up -d --remove-orphans 23 | 24 | stop: 25 | docker compose -f docker-compose.base.yml -f docker-compose.prod.yml down 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # whisperbox-transcribe 2 | 3 | > HTTP wrapper around [openai/whisper](https://github.com/openai/whisper). 4 | 5 | ## Overview 6 | 7 | This project wraps OpenAI's `whisper` speech-to-text models with an HTTP API. 8 | 9 | The API design draws inspiration from the [rev.ai async speech-to-text API](https://docs.rev.ai/api/asynchronous/get-started/). Transcription jobs are submitted by making an HTTP POST request to the service. Once the job is accepted, an ID is returned, which can later be used to retrieve the transcription results. These results are stored in an internal database until they are retrieved and can optionally be deleted afterwards. 10 | 11 | It is assumed that the service is used by exactly one consumer, so a pre-shared API key is used as the authentication method. OpenAPI documentation for the service is available at `/docs`. 12 | 13 | ## Deploy 14 | 15 |
16 | 0. Choose model & instance size 17 | Whisper offers a range of models in [different sizes](https://github.com/openai/whisper#available-models-and-languages). The model size affects factors such as accuracy, resource usage, and transcription speed. Smaller models are generally faster and consume fewer resources, but they may be less accurate, especially when working with non-English languages or translation tasks. 18 | 19 | Whisper supports inference on both CPU and GPU, and this project includes slightly modified Docker Compose configurations to enable both options. CPU inference is slower but usually more cost-effective for hosting purposes. CPU inference performance typically scales well with CPU speed. 20 | 21 | When selecting an instance for your application, it's important to consider the disk size. Media files need to be downloaded before they can be transcribed, so the disk must have sufficient free space to accommodate them. 22 | 23 | As a starting point, the "small" model can run on a 4GB Digital Ocean droplet, achieving a transcription speed of roughly 1-2x the original audio length. 24 |
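For reference, the chosen model is applied through the `WHISPER_MODEL` variable in `.env` (see `.env.example` and step 2 below), e.g.:

```
WHISPER_MODEL="small"
```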
25 | 26 | ### 1. Prepare host environment 27 | 28 | This project is intended to be run via [docker compose](https://docs.docker.com/compose/). In order to get started, [install](https://docs.docker.com/engine/install/) docker engine. Then, clone this repository to the machine. 29 | 30 | > **Note** 31 | > If you want to use a GPU, uncomment the sections tagged `` in `docker-compose.prod.yml`. 32 | 33 | ### 2. Configure service 34 | 35 | 2. Create an `.env` file from `.env.example` and configure it. Refer to comments for available envs and their usage. 36 | 37 | ### 3. Run service 38 | 39 | Run `make run` to start the server. To launch at system startup, wrap it in a systemd launch service. 40 | 41 | ## Develop 42 | 43 | [docker compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development. 44 | 45 | It is recommended to setup a virtual environment for python tooling. To install dependencies in your virtual env, run `pip install -e .[tooling,web,worker]`. 46 | 47 | Copy `.env.dev` to `.env` to configure the service. 48 | 49 | ### Start 50 | 51 | ``` 52 | make dev 53 | ``` 54 | 55 | Builds and starts the docker containers. 56 | 57 | ``` 58 | # Bindings 59 | http://localhost:5555 => Celery dashboard 60 | http://localhost:15672 => RabbitMQ dashboard 61 | http://whisperbox-transcribe.localhost => API 62 | http://whisperbox-transcribe.localhost/docs => API docs 63 | ./whisperbox-transcribe.sqlite => Database 64 | ``` 65 | 66 | #### Clean 67 | 68 | This removes all containers and attached volumes. 69 | 70 | ``` 71 | make clean 72 | ``` 73 | 74 | ### Test 75 | 76 | ``` 77 | make test 78 | ``` 79 | 80 | ### Lint 81 | 82 | ``` 83 | make lint 84 | ``` 85 | 86 | ### Format 87 | 88 | ``` 89 | make fmt 90 | ``` 91 | -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = app/shared/db/alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to src/alembic/versions. 
When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:src/alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # the output encoding used when revision files 55 | # are written from script.py.mako 56 | # output_encoding = utf-8 57 | 58 | # sqlalchemy.url = driver://user:pass@localhost/dbname 59 | 60 | [post_write_hooks] 61 | # post_write_hooks defines scripts or Python functions that are run 62 | # on newly generated revision scripts. See the documentation for further 63 | # detail and examples 64 | 65 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 66 | hooks = black 67 | black.type = console_scripts 68 | black.entrypoint = black 69 | black.options = -l 79 REVISION_SCRIPT_FILENAME 70 | 71 | # Logging configuration 72 | [loggers] 73 | keys = root,sqlalchemy,alembic 74 | 75 | [handlers] 76 | keys = console 77 | 78 | [formatters] 79 | keys = generic 80 | 81 | [logger_root] 82 | level = WARN 83 | handlers = console 84 | qualname = 85 | 86 | [logger_sqlalchemy] 87 | level = WARN 88 | handlers = 89 | qualname = sqlalchemy.engine 90 | 91 | [logger_alembic] 92 | level = INFO 93 | handlers = 94 | qualname = alembic 95 | 96 | [handler_console] 97 | class = StreamHandler 98 | args = (sys.stderr,) 99 | level = NOTSET 100 | formatter = generic 101 | 102 | [formatter_generic] 103 | format = %(levelname)-5.5s [%(name)s] %(message)s 104 | datefmt = %H:%M:%S 105 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/__init__.py -------------------------------------------------------------------------------- /app/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/shared/__init__.py -------------------------------------------------------------------------------- /app/shared/celery.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | 4 | def get_celery_binding(broker_url: str) -> Celery: 5 | return Celery( 6 | broker_url=broker_url, 7 | broker_connection_retry=False, 8 | broker_connection_retry_on_startup=False, 9 | ) 10 | -------------------------------------------------------------------------------- /app/shared/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/shared/db/__init__.py 
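The `get_celery_binding` helper above gives both the web API and the worker a Celery handle on the same broker. As a minimal sketch of how the web side dispatches work against such a binding purely by task name (this mirrors `app/web/task_queue.py` further below; the broker URL and job id here are placeholder values):

```python
from app.shared.celery import get_celery_binding

# Bind to the broker without importing any worker code or its heavy
# dependencies (torch, whisper) into this process.
celery = get_celery_binding("amqp://rabbitmq:rabbitmq_password@localhost:5672")

# Reference the worker task purely by its registered name ("signature"),
# then enqueue it with the job's database id as the only argument.
transcribe = celery.signature("app.worker.main.transcribe")
transcribe.delay("5c790c76-2cc1-4e91-a305-443df55a4a4c")
```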
-------------------------------------------------------------------------------- /app/shared/db/alembic/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /app/shared/db/alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from alembic import context 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from app.shared.db.models import Base 7 | from app.shared.settings import Settings 8 | 9 | settings = Settings() # type: ignore 10 | 11 | # this is the Alembic Config object, which provides 12 | # access to the values within the .ini file in use. 13 | config = context.config 14 | 15 | # Interpret the config file for Python logging. 16 | # This line sets up loggers basically. 17 | if config.config_file_name is not None: 18 | fileConfig(config.config_file_name) 19 | 20 | config.set_main_option("sqlalchemy.url", settings.DATABASE_URI) 21 | 22 | # add your model's MetaData object here 23 | # for 'autogenerate' support 24 | # from myapp import mymodel 25 | # target_metadata = mymodel.Base.metadata 26 | target_metadata = Base.metadata 27 | 28 | # other values from the config, defined by the needs of env.py, 29 | # can be acquired: 30 | # my_important_option = config.get_main_option("my_important_option") 31 | # ... etc. 32 | 33 | 34 | def run_migrations_offline() -> None: 35 | """Run migrations in 'offline' mode. 36 | 37 | This configures the context with just a URL 38 | and not an Engine, though an Engine is acceptable 39 | here as well. By skipping the Engine creation 40 | we don't even need a DBAPI to be available. 41 | 42 | Calls to context.execute() here emit the given string to the 43 | script output. 44 | 45 | """ 46 | url = config.get_main_option("sqlalchemy.url") 47 | context.configure( 48 | url=url, 49 | target_metadata=target_metadata, 50 | literal_binds=True, 51 | dialect_opts={"paramstyle": "named"}, 52 | ) 53 | 54 | with context.begin_transaction(): 55 | context.run_migrations() 56 | 57 | 58 | def run_migrations_online() -> None: 59 | """Run migrations in 'online' mode. 60 | 61 | In this scenario we need to create an Engine 62 | and associate a connection with the context. 63 | 64 | """ 65 | 66 | connectable = engine_from_config( 67 | config.get_section(config.config_ini_section), # type: ignore 68 | prefix="sqlalchemy.", 69 | poolclass=pool.NullPool, 70 | ) 71 | 72 | with connectable.connect() as connection: 73 | context.configure(connection=connection, target_metadata=target_metadata) 74 | 75 | with context.begin_transaction(): 76 | context.run_migrations() 77 | 78 | 79 | if context.is_offline_mode(): 80 | run_migrations_offline() 81 | else: 82 | run_migrations_online() 83 | -------------------------------------------------------------------------------- /app/shared/db/alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /app/shared/db/alembic/versions/0eee2b7913b7_add_tables.py: -------------------------------------------------------------------------------- 1 | """add_tables 2 | 3 | Revision ID: 0eee2b7913b7 4 | Revises: 5 | Create Date: 2023-06-29 08:33:26.123728 6 | 7 | """ 8 | import sqlalchemy as sa 9 | from alembic import op 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = "0eee2b7913b7" 13 | down_revision = None 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | # ### commands auto generated by Alembic - please adjust! ### 20 | op.create_table( 21 | "jobs", 22 | sa.Column("url", sa.String(length=2048), nullable=True), 23 | sa.Column( 24 | "status", 25 | sa.Enum("create", "processing", "error", "success", name="jobstatus"), 26 | nullable=False, 27 | ), 28 | sa.Column("config", sa.JSON(none_as_null=True), nullable=True), 29 | sa.Column("meta", sa.JSON(none_as_null=True), nullable=True), 30 | sa.Column( 31 | "type", 32 | sa.Enum( 33 | "transcript", 34 | "translation", 35 | "language_detection", 36 | name="jobtype", 37 | ), 38 | nullable=False, 39 | ), 40 | sa.Column( 41 | "created_at", 42 | sa.DateTime(), 43 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 44 | nullable=False, 45 | ), 46 | sa.Column("updated_at", sa.DateTime(), nullable=True), 47 | sa.Column("id", sa.VARCHAR(length=36), nullable=False), 48 | sa.PrimaryKeyConstraint("id"), 49 | ) 50 | op.create_index(op.f("ix_jobs_id"), "jobs", ["id"], unique=False) 51 | op.create_table( 52 | "artifacts", 53 | sa.Column("job_id", sa.VARCHAR(length=36), nullable=False), 54 | sa.Column("data", sa.JSON(none_as_null=True), nullable=True), 55 | sa.Column( 56 | "type", 57 | sa.Enum("raw_transcript", "language_detection", name="artifacttype"), 58 | nullable=False, 59 | ), 60 | sa.Column( 61 | "created_at", 62 | sa.DateTime(), 63 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 64 | nullable=False, 65 | ), 66 | sa.Column("updated_at", sa.DateTime(), nullable=True), 67 | sa.Column("id", sa.VARCHAR(length=36), nullable=False), 68 | sa.ForeignKeyConstraint(["job_id"], ["jobs.id"], ondelete="CASCADE"), 69 | sa.PrimaryKeyConstraint("id"), 70 | ) 71 | op.create_index(op.f("ix_artifacts_id"), "artifacts", ["id"], unique=False) 72 | # ### end Alembic commands ### 73 | 74 | 75 | def downgrade() -> None: 76 | # ### commands auto generated by Alembic - please adjust! 
### 77 | op.drop_index(op.f("ix_artifacts_id"), table_name="artifacts") 78 | op.drop_table("artifacts") 79 | op.drop_index(op.f("ix_jobs_id"), table_name="jobs") 80 | op.drop_table("jobs") 81 | # ### end Alembic commands ### 82 | -------------------------------------------------------------------------------- /app/shared/db/base.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from sqlalchemy import Engine, create_engine, event 4 | from sqlalchemy.orm import sessionmaker 5 | 6 | 7 | def make_engine(database_url: str): 8 | engine = create_engine(database_url, connect_args={"check_same_thread": False}) 9 | 10 | @event.listens_for(engine, "connect") 11 | def set_sqlite_pragma(conn: Any, _: Any) -> None: 12 | cursor = conn.cursor() 13 | cursor.execute("PRAGMA journal_mode=WAL") 14 | cursor.close() 15 | 16 | return engine 17 | 18 | 19 | def make_session_local(engine: Engine): 20 | session_local = sessionmaker(autocommit=False, autoflush=False, bind=engine) 21 | return session_local 22 | -------------------------------------------------------------------------------- /app/shared/db/models.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import uuid 3 | 4 | from pydantic import BaseModel, Field 5 | from sqlalchemy import JSON, VARCHAR, Column, DateTime, Enum, ForeignKey, String, func 6 | from sqlalchemy.dialects.postgresql import UUID 7 | from sqlalchemy.orm import Mapped, declarative_base, declarative_mixin, declared_attr 8 | 9 | Base = declarative_base() 10 | 11 | 12 | # Enums 13 | 14 | 15 | class JobType(str, enum.Enum): 16 | """Requested type of a job.""" 17 | 18 | transcript = "transcribe" 19 | translation = "translate" 20 | language_detection = "detect_language" 21 | 22 | 23 | class JobStatus(str, enum.Enum): 24 | """Processing status of a job.""" 25 | 26 | create = "create" 27 | processing = "processing" 28 | error = "error" 29 | success = "success" 30 | 31 | 32 | class ArtifactType(str, enum.Enum): 33 | raw_transcript = "transcript_raw" 34 | language_detection = "language_detection" 35 | 36 | 37 | # JSON field types 38 | 39 | 40 | class JobConfig(BaseModel): 41 | """(JSON) Configuration for a job.""" 42 | 43 | language: str | None = Field( 44 | default=None, 45 | description=( 46 | "Spoken language in the media file. " 47 | "While optional, this can improve output." 
48 | ), 49 | ) 50 | 51 | 52 | class JobMeta(BaseModel): 53 | """(JSON) Metadata relating to a job's execution.""" 54 | 55 | attempts: int | None = Field( 56 | default=None, 57 | description="Number of processing attempts a job has taken.", 58 | ) 59 | 60 | error: str | None = Field( 61 | default=None, 62 | description="Will contain a descriptive error message if processing failed.", 63 | ) 64 | 65 | task_id: uuid.UUID | None = Field( 66 | default=None, 67 | description="Internal celery id of this job submission.", 68 | ) 69 | 70 | 71 | class RawTranscript(BaseModel): 72 | """(JSON) A single transcript passage returned by whisper.""" 73 | 74 | id: int 75 | seek: int 76 | start: float 77 | end: float 78 | text: str 79 | tokens: list[int] 80 | temperature: float 81 | avg_logprob: float 82 | compression_ratio: float 83 | no_speech_prob: float 84 | 85 | 86 | class LanguageDetection(BaseModel): 87 | """A language detection""" 88 | 89 | language_code: str 90 | 91 | 92 | # Sum type for all possible artifact data values 93 | ArtifactData = list[RawTranscript] | LanguageDetection | None 94 | 95 | 96 | @declarative_mixin 97 | class WithStandardFields: 98 | """Mixin that adds standard fields (id, created_at, updated_at).""" 99 | 100 | @declared_attr 101 | def created_at(cls) -> Mapped[DateTime]: 102 | return Column(DateTime, server_default=func.now(), nullable=False) 103 | 104 | @declared_attr 105 | def updated_at(cls) -> Mapped[DateTime | None]: 106 | return Column(DateTime, onupdate=func.now()) 107 | 108 | @declared_attr 109 | def id(cls) -> Mapped[UUID]: 110 | return Column( 111 | VARCHAR(36), primary_key=True, index=True, default=lambda: str(uuid.uuid4()) 112 | ) 113 | 114 | 115 | class Job(Base, WithStandardFields): 116 | __tablename__ = "jobs" 117 | 118 | url = Column(String(length=2048)) 119 | status = Column(Enum(JobStatus), nullable=False) 120 | config = Column(JSON(none_as_null=True)) 121 | meta = Column(JSON(none_as_null=True)) 122 | type = Column(Enum(JobType), nullable=False) 123 | 124 | 125 | class Artifact(Base, WithStandardFields): 126 | __tablename__ = "artifacts" 127 | 128 | job_id = Column( 129 | VARCHAR(36), 130 | ForeignKey("jobs.id", ondelete="CASCADE"), 131 | nullable=False, 132 | ) 133 | 134 | data = Column(JSON(none_as_null=True)) 135 | type = Column(Enum(ArtifactType), nullable=False) 136 | -------------------------------------------------------------------------------- /app/shared/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig() 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | logger.setLevel(logging.INFO) 8 | -------------------------------------------------------------------------------- /app/shared/settings.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | 3 | 4 | class Settings(BaseSettings): 5 | API_SECRET: str 6 | BROKER_URL: str 7 | DATABASE_URI: str 8 | ENVIRONMENT: str 9 | 10 | TASK_SOFT_TIME_LIMIT: int = 3 * 60 * 60 11 | TASK_HARD_TIME_LIMIT: int = 4 * 60 * 60 12 | 13 | ENABLE_SHARING: bool = False 14 | -------------------------------------------------------------------------------- /app/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/tests/__init__.py 
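The `Settings` class above is a pydantic `BaseSettings` model, so configuration is read from the process environment; the test suite below instead points it at `.env.test` explicitly. A small sketch of both modes (the printed values are the ones defined in `.env.test`):

```python
from app.shared.settings import Settings

# Read configuration from the current process environment
# (API_SECRET, BROKER_URL, DATABASE_URI and ENVIRONMENT must be set).
settings = Settings()

# Or load a specific env file, as app/tests/conftest.py does.
test_settings = Settings(_env_file=".env.test")
print(test_settings.ENVIRONMENT)     # "test"
print(test_settings.ENABLE_SHARING)  # False (default)
```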
-------------------------------------------------------------------------------- /app/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | from sqlalchemy_utils import create_database, database_exists, drop_database 4 | 5 | import app.shared.db.models as models 6 | from app.shared.db.base import make_engine, make_session_local 7 | from app.shared.settings import Settings 8 | from app.web.injections.db import get_session 9 | from app.web.injections.settings import get_settings 10 | from app.web.main import app_factory 11 | 12 | 13 | @pytest.fixture() 14 | def settings(): 15 | return Settings(_env_file=".env.test") # type: ignore 16 | 17 | 18 | @pytest.fixture() 19 | def auth_headers(settings) -> dict[str, str]: 20 | return {"Authorization": f"Bearer {settings.API_SECRET}"} 21 | 22 | 23 | @pytest.fixture() 24 | def test_db(settings): 25 | engine = make_engine(settings.DATABASE_URI) 26 | 27 | if not database_exists(engine.url): 28 | create_database(engine.url) 29 | 30 | models.Base.metadata.create_all(engine) 31 | 32 | connection = engine.connect() 33 | yield connection 34 | connection.close() 35 | 36 | models.Base.metadata.drop_all(bind=engine) 37 | drop_database(engine.url) 38 | 39 | 40 | @pytest.fixture() 41 | def db_session(test_db): 42 | session_local = make_session_local(test_db) 43 | with session_local() as session: 44 | yield session 45 | 46 | 47 | @pytest.fixture() 48 | def app(db_session, settings): 49 | app = app_factory() 50 | app.dependency_overrides[get_settings] = lambda: settings 51 | app.dependency_overrides[get_session] = lambda: db_session 52 | return app 53 | 54 | 55 | @pytest.fixture() 56 | def client(app): 57 | client = TestClient(app) 58 | return client 59 | 60 | 61 | @pytest.fixture() 62 | def mock_job(db_session): 63 | job = models.Job( 64 | url="https://example.com", 65 | type=models.JobType.transcript, 66 | status=models.JobStatus.processing, 67 | meta={"task_id": "5c790c76-2cc1-4e91-a305-443df55a4a4c"}, 68 | ) 69 | db_session.add(job) 70 | db_session.commit() 71 | return job 72 | 73 | 74 | @pytest.fixture() 75 | def mock_artifact(db_session, mock_job): 76 | artifact = models.Artifact( 77 | data=None, job_id=str(mock_job.id), type=models.ArtifactType.raw_transcript 78 | ) 79 | db_session.add(artifact) 80 | db_session.commit() 81 | return artifact 82 | -------------------------------------------------------------------------------- /app/tests/test_api.py: -------------------------------------------------------------------------------- 1 | import app.shared.db.models as models 2 | from app.shared.settings import Settings 3 | from app.web.injections.settings import get_settings 4 | 5 | 6 | # POST /api/v1/jobs 7 | # --- 8 | def test_create_job_pass(client, auth_headers: dict[str, str]): 9 | res = client.post( 10 | "/api/v1/jobs", 11 | headers=auth_headers, 12 | json={"url": "https://example.com", "type": models.JobType.transcript}, 13 | ) 14 | assert res.status_code == 201 15 | assert isinstance(res.json()["id"], str) 16 | 17 | 18 | def test_create_job_missing_body(client, auth_headers: dict[str, str]): 19 | res = client.post("/api/v1/jobs", headers=auth_headers, json={}) 20 | assert res.status_code == 422 21 | 22 | 23 | def test_create_job_malformed_url(client, auth_headers: dict[str, str]): 24 | res = client.post( 25 | "/api/v1/jobs", 26 | headers=auth_headers, 27 | json={"url": "example.com", "type": models.JobType.transcript}, 28 | ) 29 | assert 
res.status_code == 422 30 | 31 | 32 | # GET /api/v1/jobs 33 | # --- 34 | def test_get_jobs_pass(client, auth_headers: dict[str, str], mock_job: models.Job): 35 | res = client.get( 36 | "/api/v1/jobs?type=transcribe", 37 | headers=auth_headers, 38 | ) 39 | assert len(res.json()) == 1 40 | assert res.status_code == 200 41 | 42 | 43 | # GET /api/v1/jobs/:id 44 | # --- 45 | def test_get_job_pass(client, auth_headers: dict[str, str], mock_job: models.Job): 46 | res = client.get( 47 | f"/api/v1/jobs/{mock_job.id}", 48 | headers=auth_headers, 49 | ) 50 | assert res.status_code == 200 51 | assert res.json()["id"] == str(mock_job.id) 52 | 53 | 54 | def test_get_job_not_found(client, auth_headers: dict[str, str], mock_job): 55 | res = client.get( 56 | "/api/v1/jobs/c8ecf5ea-77cf-48a2-9ecd-199ef35e0ccb", 57 | headers=auth_headers, 58 | ) 59 | 60 | assert res.status_code == 404 61 | 62 | 63 | def test_get_job_sharing_disabled(client, mock_job): 64 | res = client.get( 65 | f"/api/v1/jobs/{mock_job.id}", 66 | headers={}, 67 | ) 68 | assert res.status_code == 401 69 | 70 | 71 | def test_get_job_sharing_enabled(client, app, mock_job): 72 | app.dependency_overrides[get_settings] = lambda: Settings( 73 | _env_file=".env.test", ENABLE_SHARING=True # type: ignore 74 | ) 75 | 76 | res = client.get( 77 | f"/api/v1/jobs/{mock_job.id}", 78 | headers={}, 79 | ) 80 | 81 | assert res.status_code == 200 82 | 83 | 84 | # GET /api/v1/jobs/:id/artifacts 85 | # --- 86 | def test_get_artifacts_pass(client, auth_headers, db_session, mock_job, mock_artifact): 87 | res = client.get( 88 | f"/api/v1/jobs/{mock_job.id}/artifacts", 89 | headers=auth_headers, 90 | ) 91 | 92 | assert res.status_code == 200 93 | assert res.json()[0]["job_id"] == str(mock_job.id) 94 | assert res.json()[0]["id"] == str(mock_artifact.id) 95 | 96 | 97 | def test_get_artifacts_not_found(client, auth_headers, mock_job): 98 | res = client.get( 99 | f"/api/v1/jobs/{mock_job.id}/artifacts", 100 | headers=auth_headers, 101 | ) 102 | 103 | assert len(res.json()) == 0 104 | assert res.status_code == 200 105 | 106 | 107 | # DELETE /api/v1/jobs 108 | # --- 109 | def test_delete_job_pass(client, auth_headers, mock_job, db_session): 110 | res_job = client.get( 111 | f"/api/v1/jobs/{mock_job.id}", 112 | headers=auth_headers, 113 | ) 114 | 115 | assert res_job.status_code == 200 116 | 117 | client.delete( 118 | f"/api/v1/jobs/{mock_job.id}", 119 | headers=auth_headers, 120 | ) 121 | 122 | # HACK: this catches a missed .commit(). 123 | # TODO: clean up pytest database handling. 
124 | db_session.rollback() 125 | 126 | res_job_missing = client.get( 127 | f"/api/v1/jobs/{mock_job.id}", 128 | headers=auth_headers, 129 | ) 130 | 131 | assert res_job_missing.status_code == 404 132 | -------------------------------------------------------------------------------- /app/tests/test_auth.py: -------------------------------------------------------------------------------- 1 | def test_authorization_header_missing(client): 2 | res = client.get("/api/v1/jobs") 3 | assert res.status_code == 401 4 | 5 | 6 | def test_authorization_header_malformed(client): 7 | res = client.get("/api/v1/jobs", headers={"Authorization": "Bearer"}) 8 | assert res.status_code == 401 9 | 10 | 11 | def test_incorrect_api_key(client): 12 | res = client.get("/api/v1/jobs", headers={"Authorization": "Bearer incorrect"}) 13 | assert res.status_code == 401 14 | 15 | 16 | def test_existing_api_key(client, auth_headers): 17 | res = client.get("/api/v1/jobs", headers=auth_headers) 18 | assert res.status_code == 200 19 | -------------------------------------------------------------------------------- /app/web/__init__.py: -------------------------------------------------------------------------------- 1 | from app.web.main import app_factory 2 | 3 | app = app_factory 4 | -------------------------------------------------------------------------------- /app/web/dtos.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from uuid import UUID 3 | 4 | from pydantic import AnyHttpUrl, BaseModel, ConfigDict 5 | 6 | from app.shared.db.models import ( 7 | ArtifactData, 8 | ArtifactType, 9 | JobConfig, 10 | JobMeta, 11 | JobStatus, 12 | JobType, 13 | ) 14 | 15 | # DB objects 16 | 17 | 18 | class WithDbFields(BaseModel): 19 | id: UUID 20 | created_at: datetime 21 | updated_at: datetime | None = None 22 | model_config = ConfigDict(from_attributes=True) 23 | 24 | 25 | class Job(WithDbFields): 26 | """A transcription job for one media file.""" 27 | 28 | status: JobStatus 29 | type: JobType 30 | url: AnyHttpUrl 31 | meta: JobMeta | None = None 32 | config: JobConfig | None = None 33 | 34 | 35 | class Artifact(WithDbFields): 36 | """A transcription artifact.""" 37 | 38 | job_id: UUID 39 | data: ArtifactData 40 | type: ArtifactType 41 | -------------------------------------------------------------------------------- /app/web/injections/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/web/injections/__init__.py -------------------------------------------------------------------------------- /app/web/injections/db.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from typing import Generator 3 | 4 | from fastapi import Depends 5 | from sqlalchemy.orm import Session 6 | 7 | from app.shared.db.base import make_engine, make_session_local 8 | from app.shared.settings import Settings 9 | from app.web.injections.settings import get_settings 10 | 11 | 12 | @lru_cache 13 | def session_local(database_url: str): 14 | engine = make_engine(database_url) 15 | return make_session_local(engine) 16 | 17 | 18 | def get_session_local(settings: Settings = Depends(get_settings)): 19 | return session_local(settings.DATABASE_URI) 20 | 21 | 22 | def get_session( 23 | session_local=Depends(get_session_local), 24 | ) -> Generator[Session, None, None]: 25 | 
with session_local() as session: 26 | yield session 27 | -------------------------------------------------------------------------------- /app/web/injections/security.py: -------------------------------------------------------------------------------- 1 | from hmac import compare_digest 2 | from typing import Annotated 3 | 4 | from fastapi import Depends, HTTPException 5 | from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer 6 | 7 | from app.shared.settings import Settings 8 | from app.web.injections.settings import get_settings 9 | 10 | 11 | def api_key_auth( 12 | credentials: Annotated[ 13 | HTTPAuthorizationCredentials, Depends(HTTPBearer(auto_error=False)) 14 | ], 15 | settings: Annotated[Settings, Depends(get_settings)], 16 | ): 17 | validate_credentials(credentials, settings.API_SECRET) 18 | 19 | 20 | def sharing_auth( 21 | credentials: Annotated[ 22 | HTTPAuthorizationCredentials, Depends(HTTPBearer(auto_error=False)) 23 | ], 24 | settings: Annotated[Settings, Depends(get_settings)], 25 | ): 26 | if settings.ENABLE_SHARING: 27 | pass 28 | else: 29 | validate_credentials(credentials, settings.API_SECRET) 30 | 31 | 32 | def validate_credentials(credentials: HTTPAuthorizationCredentials, secret: str): 33 | # use compare_digest to counter timing attacks. 34 | if ( 35 | not credentials 36 | or not secret 37 | or not compare_digest(secret, credentials.credentials) 38 | ): 39 | raise HTTPException(status_code=401) 40 | -------------------------------------------------------------------------------- /app/web/injections/settings.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | from app.shared.settings import Settings 4 | 5 | 6 | @lru_cache 7 | def get_settings(): 8 | return Settings() # type: ignore 9 | -------------------------------------------------------------------------------- /app/web/injections/task_queue.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | from fastapi import Depends 4 | 5 | from app.shared.settings import Settings 6 | from app.web.injections.settings import get_settings 7 | from app.web.task_queue import TaskQueue 8 | 9 | 10 | @lru_cache 11 | def task_queue(broker_url: str): 12 | return TaskQueue(broker_url) 13 | 14 | 15 | def get_task_queue(settings: Settings = Depends(get_settings)): 16 | return task_queue(settings.BROKER_URL) 17 | -------------------------------------------------------------------------------- /app/web/main.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | from uuid import UUID 3 | 4 | from fastapi import APIRouter, Depends, FastAPI, HTTPException, Path 5 | from pydantic import AnyHttpUrl, BaseModel, Field 6 | from sqlalchemy.orm import Session 7 | 8 | import app.shared.db.models as models 9 | import app.web.dtos as dtos 10 | from app.web.injections.db import get_session 11 | from app.web.injections.security import api_key_auth, sharing_auth 12 | from app.web.injections.task_queue import get_task_queue 13 | from app.web.task_queue import TaskQueue 14 | 15 | DatabaseSession = Annotated[Session, Depends(get_session)] 16 | 17 | 18 | def app_factory(): 19 | app = FastAPI( 20 | description=( 21 | "whisperbox-transcribe is an async HTTP wrapper for openai/whisper." 
22 | ), 23 | title="whisperbox-transcribe", 24 | ) 25 | 26 | api_router = APIRouter(prefix="/api/v1") 27 | 28 | @api_router.get("/", status_code=204) 29 | def api_root(): 30 | return None 31 | 32 | @api_router.get( 33 | "/jobs", 34 | dependencies=[Depends(api_key_auth)], 35 | response_model=list[dtos.Job], 36 | summary="Get metadata for all jobs", 37 | ) 38 | def get_jobs( 39 | session: DatabaseSession, 40 | type: dtos.JobType | None = None, 41 | ) -> list[models.Job]: 42 | """Get metadata for all jobs.""" 43 | query = session.query(models.Job).order_by(models.Job.created_at.desc()) 44 | 45 | if type: 46 | query = query.filter(models.Job.type == type) 47 | 48 | return query.all() 49 | 50 | @api_router.get( 51 | "/jobs/{id}", 52 | dependencies=[Depends(sharing_auth)], 53 | response_model=dtos.Job, 54 | summary="Get metadata for one job", 55 | ) 56 | def get_job( 57 | session: DatabaseSession, 58 | id: UUID = Path(), 59 | ) -> models.Job | None: 60 | """ 61 | Use this route to check transcription status of any given job. 62 | """ 63 | job = session.query(models.Job).filter(models.Job.id == str(id)).one_or_none() 64 | 65 | if not job: 66 | raise HTTPException(status_code=404) 67 | 68 | return job 69 | 70 | @api_router.get( 71 | "/jobs/{id}/artifacts", 72 | dependencies=[Depends(api_key_auth)], 73 | response_model=list[dtos.Artifact], 74 | summary="Get all artifacts for one job", 75 | ) 76 | def get_artifacts_for_job( 77 | session: DatabaseSession, 78 | id: UUID = Path(), 79 | ) -> list[models.Artifact]: 80 | """ 81 | Returns all artifacts for one job. 82 | See the type of `data` for possible data types. 83 | Returns an empty array for unfinished or non-existant jobs. 84 | """ 85 | artifacts = ( 86 | session.query(models.Artifact).filter(models.Artifact.job_id == str(id)) 87 | ).all() 88 | 89 | return artifacts 90 | 91 | @api_router.delete( 92 | "/jobs/{id}", 93 | dependencies=[Depends(sharing_auth)], 94 | status_code=204, 95 | summary="Delete a job with all artifacts", 96 | ) 97 | def delete_transcript( 98 | session: DatabaseSession, 99 | id: UUID = Path(), 100 | ) -> None: 101 | """Remove metadata and artifacts for a single job.""" 102 | session.query(models.Job).filter(models.Job.id == str(id)).delete() 103 | session.commit() 104 | return None 105 | 106 | class PostJobPayload(BaseModel): 107 | url: AnyHttpUrl = Field( 108 | description=( 109 | "URL where the media file is available. This needs to be a direct link." 110 | ) 111 | ) 112 | 113 | type: models.JobType = Field( 114 | description="""Type of this job. 115 | `transcript` uses the original language of the audio. 116 | `translation` creates an automatic translation to english. 117 | `language_detection` detects language from the first 30 seconds of audio.""" 118 | ) 119 | 120 | language: str | None = Field( 121 | default=None, 122 | description=( 123 | "Spoken language in the media file. " 124 | "While optional, this can improve output when set." 125 | ), 126 | ) 127 | 128 | @api_router.post( 129 | "/jobs", 130 | dependencies=[Depends(api_key_auth)], 131 | response_model=dtos.Job, 132 | status_code=201, 133 | summary="Enqueue a new job", 134 | ) 135 | def create_job( 136 | payload: PostJobPayload, 137 | session: DatabaseSession, 138 | task_queue: Annotated[TaskQueue, Depends(get_task_queue)], 139 | ) -> models.Job: 140 | """ 141 | Enqueue a new whisper job for processing. 142 | Notes: 143 | * Jobs are processed one-by-one in order of creation. 144 | * `payload.url` needs to point directly to a media file. 
145 | * The media file is downloaded to a tmp file for the duration of processing. 146 | enough free space needs to be available on disk. 147 | * Media files ideally are audio files with a sampling rate of 16kHz. 148 | other files will be transcoded automatically via ffmpeg which might 149 | consume considerable resources while active. 150 | * Once a job is created, you can query its status by its id. 151 | """ 152 | 153 | # create a job with status "create" and save it to the database. 154 | job = models.Job( 155 | url=str(payload.url), 156 | status=dtos.JobStatus.create, 157 | type=payload.type, 158 | config={"language": payload.language} if payload.language else None, 159 | ) 160 | 161 | session.add(job) 162 | session.commit() 163 | 164 | task_queue.queue_task(job) 165 | 166 | return job 167 | 168 | app.include_router(api_router) 169 | 170 | return app 171 | -------------------------------------------------------------------------------- /app/web/task_queue.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | import app.shared.db.models as models 4 | from app.shared.celery import get_celery_binding 5 | 6 | 7 | class TaskQueue: 8 | celery: Celery 9 | 10 | def __init__(self, broker_url: str) -> None: 11 | self.celery = get_celery_binding(broker_url=broker_url) 12 | 13 | def queue_task(self, job: models.Job): 14 | """ 15 | Queues an async transcription job. We use a celery signature here to 16 | allow for full separation of worker processes and dependencies. 17 | """ 18 | transcribe = self.celery.signature("app.worker.main.transcribe") 19 | # TODO: catch delivery errors? 20 | transcribe.delay(job.id) 21 | -------------------------------------------------------------------------------- /app/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bellingcat/whisperbox-transcribe/e52d1e136437fef34566a221cd4ba50a60698a2a/app/worker/__init__.py -------------------------------------------------------------------------------- /app/worker/main.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from uuid import UUID 3 | 4 | from celery import Task 5 | from sqlalchemy.orm import Session 6 | 7 | import app.shared.db.models as models 8 | from app.shared.celery import get_celery_binding 9 | from app.shared.db.base import make_engine, make_session_local 10 | from app.shared.logger import logger 11 | from app.shared.settings import Settings 12 | from app.worker.strategies.local import LocalStrategy 13 | 14 | # TODO: refactor to be part of a Task instance. 15 | settings = Settings() # type: ignore 16 | celery = get_celery_binding(settings.BROKER_URL) 17 | engine = make_engine(settings.DATABASE_URI) 18 | SessionLocal = make_session_local(engine) 19 | 20 | 21 | class TranscribeTask(Task): 22 | """ 23 | Decorate the transcribe task with an instance of the transcription strategy. 24 | This is important for the local strategy, where loading the model is expensive. 25 | """ 26 | 27 | abstract = True 28 | 29 | def __init__(self) -> None: 30 | super().__init__() 31 | # currently only `LocalStrategy` is implemented. 32 | self.strategy: LocalStrategy | None = None 33 | 34 | def __call__(self, *args: Any, **kwargs: Any) -> Any: 35 | # load model into memory once when the first task is processed. 
36 | if not self.strategy: 37 | self.strategy = LocalStrategy() 38 | return self.run(*args, **kwargs) 39 | 40 | 41 | @celery.task( 42 | base=TranscribeTask, 43 | bind=True, 44 | soft_time_limit=settings.TASK_SOFT_TIME_LIMIT, 45 | time_limit=settings.TASK_HARD_TIME_LIMIT, 46 | task_acks_late=True, 47 | task_acks_on_failure_or_timeout=True, 48 | task_reject_on_worker_lost=True, 49 | ) 50 | def transcribe(self: TranscribeTask, job_id: UUID) -> None: 51 | session: Session | None = None 52 | job: models.Job | None = None 53 | 54 | try: 55 | if not self.strategy: 56 | raise Exception("expected a transcription strategy to be defined.") 57 | 58 | # runs in a separate thread => requires sqlite's WAL mode to be enabled. 59 | session = SessionLocal() 60 | 61 | # work around mypy not inferring the sum type correctly. 62 | if not session: 63 | raise Exception("failed to acquire a session.") 64 | 65 | # check if passed job should be processed. 66 | 67 | job = session.query(models.Job).filter(models.Job.id == job_id).one_or_none() 68 | 69 | if job is None: 70 | logger.warn("[unknown]: Received unknown job, abort.") 71 | return 72 | 73 | if job.status in [models.JobStatus.error, models.JobStatus.success]: 74 | logger.warn(f"[{job.id}]: job has already been processed, abort.") 75 | return 76 | 77 | logger.debug(f"[{job.id}]: start processing {job.type} job.") 78 | 79 | if job.meta is not None: 80 | attempts = 1 + (job.meta.get("attempts") or 0) 81 | else: 82 | attempts = 1 83 | 84 | # SAFEGUARD: celery's retry policies do not handle lost workers, retry once. 85 | # @see https://github.com/celery/celery/pull/6103 86 | if attempts > 2: 87 | raise Exception("Maximum number of retries exceeded for killed worker.") 88 | 89 | # unit of work: set task status to processing. 90 | 91 | job.meta = {"task_id": self.request.id, "attempts": attempts} 92 | 93 | job.status = models.JobStatus.processing 94 | session.commit() 95 | 96 | logger.debug(f"[{job.id}]: finished setting task to {job.status}.") 97 | 98 | # unit of work: process job with whisper. 99 | result_type, result = self.strategy.process(job) 100 | logger.debug(f"[{job.id}]: successfully processed audio.") 101 | 102 | artifact = models.Artifact(job_id=str(job.id), data=result, type=result_type) 103 | session.add(artifact) 104 | 105 | job.status = models.JobStatus.success 106 | session.commit() 107 | 108 | logger.debug(f"[{job.id}]: successfully stored artifact.") 109 | 110 | except Exception as e: 111 | if job and session: 112 | if session.in_transaction(): 113 | session.rollback() 114 | if job.meta is not None: 115 | job.meta = {**job.meta, "error": str(e)} 116 | else: 117 | job.meta = {"error": str(e)} 118 | 119 | job.status = models.JobStatus.error 120 | session.commit() 121 | raise 122 | finally: 123 | if self.strategy: 124 | self.strategy.cleanup(job_id) 125 | if session: 126 | session.close() 127 | -------------------------------------------------------------------------------- /app/worker/strategies/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from abc import ABC 4 | from typing import Any, Protocol, Tuple 5 | from uuid import UUID 6 | 7 | import requests 8 | 9 | import app.shared.db.models as models 10 | 11 | TaskReturnValue = Tuple[models.ArtifactType, Any] 12 | 13 | 14 | class TaskProtocol(Protocol): 15 | def __call__(self, job: models.Job) -> TaskReturnValue: 16 | ... 
17 | 18 | 19 | class BaseStrategy(ABC): 20 | def process(self, job: models.Job) -> TaskReturnValue: 21 | if job.type == models.JobType.transcript: 22 | return self.transcribe(job) 23 | elif job.type == models.JobType.translation: 24 | return self.translate(job) 25 | else: 26 | return self.detect_language(job) 27 | 28 | def cleanup(self, job_id: UUID) -> None: 29 | try: 30 | os.remove(self._get_tmp_file(job_id)) 31 | except OSError: 32 | ... 33 | 34 | def transcribe(self, job: models.Job) -> TaskReturnValue: 35 | raise NotImplementedError() 36 | 37 | def translate(self, job: models.Job) -> TaskReturnValue: 38 | raise NotImplementedError() 39 | 40 | def detect_language(self, job: models.Job) -> TaskReturnValue: 41 | raise NotImplementedError() 42 | 43 | def _get_tmp_file(self, job_id: UUID) -> str: 44 | tmp = tempfile.gettempdir() 45 | return os.path.join(tmp, str(job_id)) 46 | 47 | def _download(self, url: str, job_id: UUID) -> str: 48 | # re-create folder. 49 | filename = self._get_tmp_file(job_id) 50 | self.cleanup(job_id) 51 | 52 | # stream media to disk. 53 | with requests.get(url, stream=True) as r: 54 | r.raise_for_status() 55 | with open(filename, "wb") as f: 56 | for chunk in r.iter_content(chunk_size=8192): 57 | f.write(chunk) 58 | 59 | return filename 60 | -------------------------------------------------------------------------------- /app/worker/strategies/local.py: -------------------------------------------------------------------------------- 1 | import os 2 | from asyncio.log import logger 3 | from typing import Any, Literal 4 | from uuid import UUID 5 | 6 | import torch 7 | import whisper 8 | from pydantic import BaseModel 9 | 10 | import app.shared.db.models as models 11 | from app.worker.strategies.base import BaseStrategy, TaskReturnValue 12 | 13 | 14 | class DecodingOptions(BaseModel): 15 | """ 16 | Options passed to the whipser model. 17 | This mirrors private type `whisper.DecodingOptions`. 
18 | """ 19 | 20 | language: str | None = None 21 | task: Literal["translate", "transcribe"] 22 | 23 | 24 | class LocalStrategy(BaseStrategy): 25 | def __init__(self) -> None: 26 | if torch.cuda.is_available(): 27 | logger.debug("initializing GPU model.") 28 | self.model = whisper.load_model( 29 | os.environ["WHISPER_MODEL"], download_root="/models" 30 | ).cuda() 31 | else: 32 | logger.debug("initializing CPU model.") 33 | self.model = whisper.load_model( 34 | os.environ["WHISPER_MODEL"], download_root="/models" 35 | ) 36 | 37 | logger.debug("initialized local strategy.") 38 | 39 | def transcribe(self, job): 40 | result = self._run_whisper( 41 | self._download(job.url, job.id), "transcribe", job.config, job.id 42 | ) 43 | 44 | return (models.ArtifactType.raw_transcript, result) 45 | 46 | def translate(self, job) -> TaskReturnValue: 47 | result = self._run_whisper( 48 | self._download(job.url, job.id), 49 | "translate", 50 | job.config, 51 | job.id, 52 | ) 53 | return (models.ArtifactType.raw_transcript, result) 54 | 55 | def detect_language(self, job) -> TaskReturnValue: 56 | file = self._download(job.url, job.id) 57 | 58 | # see: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/README.md?plain=1#L114 59 | audio = whisper.pad_or_trim(whisper.load_audio(file)) 60 | mel = whisper.log_mel_spectrogram(audio).to(self.model.device) 61 | _, probs = self.model.detect_language(mel) 62 | 63 | return ( 64 | models.ArtifactType.language_detection, 65 | {"code": max(probs, key=probs.get)}, 66 | ) 67 | 68 | def _run_whisper( 69 | self, 70 | filepath: str, 71 | task: Literal["translate", "transcribe"], 72 | config: dict[str, Any], 73 | job_id: UUID, 74 | ) -> list[Any]: 75 | result = self.model.transcribe( 76 | filepath, 77 | # turning this off might make the transcription less accurate, 78 | # but significantly reduces amount of model halucinations. 79 | condition_on_previous_text=False, 80 | **DecodingOptions( 81 | task=task, 82 | language=models.JobConfig(**config).language if config else None, 83 | ).dict(), 84 | ) 85 | 86 | return result["segments"] 87 | -------------------------------------------------------------------------------- /conf/rabbitmq.conf: -------------------------------------------------------------------------------- 1 | vm_memory_high_watermark.absolute = 192MB 2 | consumer_timeout = 31622400000 3 | -------------------------------------------------------------------------------- /docker-compose.base.yml: -------------------------------------------------------------------------------- 1 | x-broker-environment: &broker-environment 2 | BROKER_URL: "amqp://${RABBITMQ_DEFAULT_USER}:${RABBITMQ_DEFAULT_PASS}@rabbitmq:5672" 3 | 4 | version: "3.8" 5 | name: whisperbox-transcribe 6 | 7 | services: 8 | traefik: 9 | image: "traefik:latest" 10 | restart: unless-stopped 11 | volumes: 12 | - /var/run/docker.sock:/var/run/docker.sock:ro 13 | depends_on: 14 | - web 15 | networks: 16 | - traefik 17 | 18 | rabbitmq: 19 | env_file: .env 20 | image: rabbitmq:3-alpine 21 | networks: 22 | - app 23 | deploy: 24 | resources: 25 | limits: 26 | memory: 256M 27 | healthcheck: 28 | test: rabbitmq-diagnostics check_port_connectivity 29 | interval: 3s 30 | timeout: 3s 31 | retries: 10 32 | 33 | volumes: 34 | - ./conf/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf 35 | - rabbitmq-data:/var/lib/rabbitmq/mnesia/ 36 | 37 | worker: 38 | env_file: .env 39 | environment: 40 | <<: *broker-environment 41 | build: 42 | context: . 
43 |       dockerfile: worker.Dockerfile
44 |       args:
45 |         WHISPER_MODEL: ${WHISPER_MODEL}
46 |     depends_on:
47 |       rabbitmq:
48 |         condition: service_healthy
49 |     networks:
50 |       - app
51 |
52 |   web:
53 |     env_file: .env
54 |     environment:
55 |       <<: *broker-environment
56 |     build:
57 |       context: .
58 |       dockerfile: web.Dockerfile
59 |     depends_on:
60 |       rabbitmq:
61 |         condition: service_healthy
62 |     networks:
63 |       - app
64 |       - traefik
65 |
66 | networks:
67 |   app:
68 |     driver: bridge
69 |   traefik:
70 |     driver: bridge
71 |
72 | volumes:
73 |   rabbitmq-data:
74 |
--------------------------------------------------------------------------------
/docker-compose.dev.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | name: whisperbox-transcribe-dev
 3 |
 4 | services:
 5 |   traefik:
 6 |     ports:
 7 |       - "80:80"
 8 |     command:
 9 |       - "--providers.docker=true"
10 |       - "--providers.docker.exposedbydefault=false"
11 |       - "--providers.docker.network=whisperbox-transcribe-dev_traefik"
12 |       - "--entrypoints.web.address=:80"
13 |
14 |   web:
15 |     command: bash -c "alembic upgrade head && uvicorn app.web:app --reload --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --log-level info --factory"
16 |     # NOTE: the Docker for Mac mount adapter (virtioFS) does not support flock.
17 |     # This can corrupt the SQLite database when the worker and the API write to it simultaneously.
18 |     volumes:
19 |       - ./:/etc/whisperbox-transcribe/
20 |     labels:
21 |       - "traefik.http.routers.web.entrypoints=web"
22 |       - "traefik.enable=true"
23 |       - "traefik.http.services.web.loadbalancer.server.port=8000"
24 |       - "traefik.http.routers.web.rule=(Host(`${TRAEFIK_DOMAIN}`))"
25 |
26 |   worker:
27 |     command: watchmedo auto-restart -d app/worker -p *.py --recursive celery -- --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool prefork
28 |     volumes:
29 |       - ./:/etc/whisperbox-transcribe/
30 |
31 |   rabbitmq:
32 |     image: rabbitmq:3-management-alpine
33 |     ports:
34 |       - 15672:15672
35 |
36 |   flower:
37 |     image: mher/flower
38 |     command: celery --broker amqp://${RABBITMQ_DEFAULT_USER}:${RABBITMQ_DEFAULT_PASS}@rabbitmq:5672 flower --port=5555
39 |     ports:
40 |       - 5555:5555
41 |     depends_on:
42 |       - worker
43 |       - rabbitmq
44 |     networks:
45 |       - app
46 |
--------------------------------------------------------------------------------
/docker-compose.prod.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | name: whisperbox-transcribe
 3 |
 4 | services:
 5 |   traefik:
 6 |     ports:
 7 |       - "80:80"
 8 |       - "443:443"
 9 |     command:
10 |       - "--providers.docker=true"
11 |       - "--providers.docker.exposedbydefault=false"
12 |       - "--providers.docker.network=whisperbox-transcribe_traefik"
13 |       - "--entrypoints.web.address=:80"
14 |       - "--entrypoints.websecure.address=:443"
15 |       - "--entrypoints.web.http.redirections.entryPoint.to=websecure"
16 |       - "--entrypoints.web.http.redirections.entryPoint.scheme=https"
17 |       - "--entrypoints.web.http.redirections.entrypoint.permanent=true"
18 |       - "--certificatesresolvers.le.acme.email=${TRAEFIK_SSLEMAIL}"
19 |       - "--certificatesresolvers.le.acme.storage=/letsencrypt/acme.json"
20 |       - "--certificatesresolvers.le.acme.tlschallenge=true"
21 |     volumes:
22 |       - ./data/letsencrypt:/letsencrypt
23 |       - /var/run/docker.sock:/var/run/docker.sock:ro
24 |
25 |   worker:
26 |     #
27 |     # build:
28 |     #   dockerfile: worker.gpu.Dockerfile
29 |     volumes:
30 |       - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data
31 |     #
32 |     # deploy:
33 |     #   resources:
34 |     #     reservations:
35 |     #       devices:
36 |     #         - driver: nvidia
37 |     #           count: 1
38 |     #           capabilities: [gpu]
39 |
40 |   web:
41 |     volumes:
42 |       - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data/
43 |     labels:
44 |       - "traefik.enable=true"
45 |       - "traefik.http.services.web.loadbalancer.server.port=8000"
46 |       - "traefik.http.routers.web.rule=(Host(`${TRAEFIK_DOMAIN}`))"
47 |       - "traefik.http.routers.web.entrypoints=websecure"
48 |       - "traefik.http.routers.web.tls=true"
49 |       - "traefik.http.routers.web.tls.certresolver=le"
50 |
51 | volumes:
52 |   whisperbox-transcribe-data:
53 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins = sqlalchemy.ext.mypy.plugin
3 | ignore_missing_imports = True
4 | disallow_untyped_defs = False
5 | check_untyped_defs = True
6 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "whisperbox-transcribe"
 3 | description = ""
 4 | version = "1.0.1"
 5 |
 6 | dependencies=[
 7 |   "celery ==5.3.6",
 8 |   "sqlalchemy[mypy] ==2.0.24",
 9 |   "pydantic ==2.5.3",
10 |   "pydantic-settings ==2.1.0"
11 | ]
12 |
13 | [project.optional-dependencies]
14 | web=[
15 |   "alembic ==1.11.3",
16 |   "fastapi ==0.101.1",
17 |   "uvicorn[standard] ==0.23.2",
18 |   "gunicorn ==21.2.0"
19 | ]
20 |
21 | worker=[
22 |   "watchdog[watchmedo] ==3.0.0",
23 |   "openai-whisper ==20230314",
24 |   "requests ==2.31.0"
25 | ]
26 |
27 | tooling = [
28 |   # code formatting
29 |   "black ==23.12.1",
30 |   # linting
31 |   "ruff ==0.0.292",
32 |   # tests
33 |   "httpx ==0.26.0",
34 |   "sqlalchemy-utils ==0.41.1",
35 |   "python-dotenv ==1.0.0",
36 |   "pytest ==7.4.4",
37 |   # types
38 |   "mypy ==1.5.1",
39 |   "types-requests ==2.31.0.20231231"
40 | ]
41 |
42 | [tool.ruff]
43 | # pyflakes, pycodestyle, isort
44 | select = ["F", "E", "W", "I001"]
45 |
46 | [tool.setuptools]
47 | py-modules = []
48 |
--------------------------------------------------------------------------------
/scripts/download_models.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from whisper import _download, _MODELS  # type: ignore
4 |
5 | if __name__ == "__main__":
6 |     model_name = sys.argv[1].strip()
7 |     _download(_MODELS[model_name], "/models/", False)
8 |
--------------------------------------------------------------------------------
/web.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11-slim as python-build
 2 |
 3 | WORKDIR /etc/whisperbox-transcribe
 4 |
 5 | COPY pyproject.toml .
 6 |
 7 | RUN python -m venv /opt/venv && \
 8 |   /opt/venv/bin/pip install -U pip wheel && \
 9 |   /opt/venv/bin/pip install -U .[web]
10 |
11 | FROM python:3.11-slim as python-deploy
12 |
13 | WORKDIR /etc/whisperbox-transcribe
14 |
15 | COPY --from=python-build /opt/venv /opt/venv
16 |
17 | COPY app ./app
18 | COPY alembic.ini .
19 |
20 | ENV VIRTUAL_ENV /opt/venv
21 | ENV PATH /opt/venv/bin:$PATH
22 |
23 | CMD alembic upgrade head && uvicorn app.web:app --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --log-level info --workers 4 --proxy-headers --factory
24 |
--------------------------------------------------------------------------------
/worker.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11-slim AS python-build
 2 |
 3 | WORKDIR /etc/whisperbox-transcribe
 4 |
 5 | # Create and build virtual env from requirements.
 6 | COPY pyproject.toml .
 7 |
 8 | RUN python -m venv /opt/venv && \
 9 |   /opt/venv/bin/pip install -U pip wheel && \
10 |   /opt/venv/bin/pip install -U .[worker]
11 |
12 | FROM python:3.11-slim as python-deploy
13 |
14 | ARG WHISPER_MODEL
15 |
16 | WORKDIR /etc/whisperbox-transcribe
17 |
18 | COPY --from=python-build /opt/venv /opt/venv
19 |
20 | COPY --from=mwader/static-ffmpeg:latest /ffmpeg /usr/local/bin/
21 | COPY --from=mwader/static-ffmpeg:latest /ffprobe /usr/local/bin/
22 |
23 | ENV VIRTUAL_ENV /opt/venv
24 | ENV PATH /opt/venv/bin:$PATH
25 |
26 | COPY scripts/download_models.py .
27 | RUN python download_models.py ${WHISPER_MODEL}
28 |
29 | COPY app ./app
30 |
31 | CMD celery --app=app.worker.main.celery worker --loglevel=info --pool=prefork --concurrency=1
32 |
--------------------------------------------------------------------------------
/worker.gpu.Dockerfile:
--------------------------------------------------------------------------------
 1 | # TODO: clean up
 2 | FROM nvidia/cuda:11.8.0-base-ubuntu22.04 AS python-deploy
 3 |
 4 | ENV PYTHON_VERSION=3.11
 5 |
 6 | ARG WHISPER_MODEL
 7 |
 8 | WORKDIR /etc/whisperbox-transcribe
 9 |
10 | RUN export DEBIAN_FRONTEND=noninteractive \
11 |   && apt-get -qq update \
12 |   && apt-get -qq install --no-install-recommends \
13 |     python${PYTHON_VERSION} \
14 |     python${PYTHON_VERSION}-venv \
15 |     python3-pip \
16 |   && rm -rf /var/lib/apt/lists/*
17 |
18 | RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \
19 |   ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \
20 |   ln -s -f /usr/bin/pip3 /usr/bin/pip
21 |
22 | COPY pyproject.toml .
23 |
24 | RUN python -m venv /opt/venv && \
25 |   /opt/venv/bin/pip install -U pip wheel && \
26 |   /opt/venv/bin/pip install -U .[worker]
27 |
28 | COPY --from=mwader/static-ffmpeg:latest /ffmpeg /usr/local/bin/
29 | COPY --from=mwader/static-ffmpeg:latest /ffprobe /usr/local/bin/
30 |
31 | COPY app ./app
32 |
33 | ENV VIRTUAL_ENV /opt/venv
34 | ENV PATH /opt/venv/bin:$PATH
35 |
36 | COPY scripts/download_models.py .
37 | RUN python download_models.py ${WHISPER_MODEL}
38 |
39 | CMD celery --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool=prefork
40 |
--------------------------------------------------------------------------------
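
Note: the worker's extension seam is BaseStrategy in app/worker/strategies/base.py. Its process() method dispatches a job to transcribe(), translate() or detect_language(), and LocalStrategy is the only implementation included above. The sketch below is illustrative only and is not a file in this repository; the EchoStrategy name is invented, and it assumes TaskReturnValue is the (ArtifactType, payload) tuple that LocalStrategy returns.

import app.shared.db.models as models
from app.worker.strategies.base import BaseStrategy, TaskReturnValue


class EchoStrategy(BaseStrategy):
    """Minimal sketch of a strategy that skips model inference entirely."""

    def transcribe(self, job: models.Job) -> TaskReturnValue:
        # exercise the shared download/cleanup helpers, then return an empty transcript.
        self._download(job.url, job.id)
        self.cleanup(job.id)
        return (models.ArtifactType.raw_transcript, [])

    def translate(self, job: models.Job) -> TaskReturnValue:
        return (models.ArtifactType.raw_transcript, [])

    def detect_language(self, job: models.Job) -> TaskReturnValue:
        # mirrors the payload shape LocalStrategy.detect_language returns.
        return (models.ArtifactType.language_detection, {"code": "en"})

A stub like this could help exercise the Celery task without pulling in torch or whisper, but using it would require swapping the strategy instantiated in app/worker/main.py, which is not shown here.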