├── .flake8
├── requirements.txt
├── ttsfm-web
├── run.py
├── requirements.txt
├── static
│ └── js
│ │ ├── api-client.js
│ │ └── i18n.js
├── templates
│ ├── index.html
│ └── base.html
├── i18n.py
├── translations
│ ├── zh.json
│ └── en.json
└── websocket_handler.py
├── .github
├── ISSUE_TEMPLATE
│ ├── feature_request.md
│ └── bug_report.md
└── workflows
│ ├── release.yml
│ ├── docker-build-full.yml
│ └── docker-build-slim.yml
├── .env.example
├── LICENSE
├── docs
├── architecture.md
├── docker-workflows.md
├── v3.4-dual-image-implementation.md
└── websocket-streaming.md
├── CONTRIBUTING.md
├── Dockerfile
├── .gitignore
├── tests
├── test_utils.py
├── test_web_app.py
├── test_clients.py
└── test_audio_processing.py
├── ttsfm
├── capabilities.py
├── audio.py
├── __init__.py
├── audio_processing.py
├── exceptions.py
├── models.py
└── cli.py
├── README.zh.md
├── pyproject.toml
├── scripts
└── test_websocket.py
└── README.md
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | extend-ignore = E203,W503,E501
4 | exclude = .venv,build,dist,ttsfm.egg-info
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies for the TTSFM package
2 | requests>=2.25.0
3 | aiohttp>=3.8.0
4 | python-dotenv>=1.0.1
5 |
--------------------------------------------------------------------------------
/ttsfm-web/run.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Run script for TTSFM web application with proper eventlet initialization
"""

import eventlet

# Patch the standard library before the application module is imported below.
# The import that follows is deliberately placed after this call, which is why
# it carries the E402 (module-level import not at top of file) suppression.
eventlet.monkey_patch()

from app import DEBUG, HOST, PORT, app, socketio  # noqa: E402

if __name__ == "__main__":
    print(f"Starting TTSFM with WebSocket support on {HOST}:{PORT}")
    # HOST, PORT and DEBUG come from the app module; the server is started
    # through Socket.IO's runner rather than app.run().
    socketio.run(app, host=HOST, port=PORT, debug=DEBUG, allow_unsafe_werkzeug=True)
15 |
--------------------------------------------------------------------------------
/ttsfm-web/requirements.txt:
--------------------------------------------------------------------------------
1 | # Web application dependencies
2 | argon2-cffi>=23.1.0
3 | flask>=2.0.0
4 | flask-cors>=3.0.10
5 | flask-socketio>=5.3.0
6 | python-socketio>=5.10.0
7 | eventlet>=0.33.3
8 | waitress>=3.0.0
9 | python-dotenv>=1.0.0
10 |
11 | # Audio processing (optional, for combining audio files)
12 | # If pydub is not installed, audio combining falls back to simple concatenation for WAV files
13 | pydub>=0.25.0
14 |
15 | # TTSFM package (install from local directory or PyPI)
16 | # For local development: pip install -e ../
17 | # For Docker/production: installed via the [web] extra declared in pyproject.toml
18 |
19 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # TTSFM Environment Configuration
2 |
3 | # Server Configuration
4 | HOST=0.0.0.0
5 | PORT=8000
6 |
7 | # SSL Configuration
8 | VERIFY_SSL=true
9 |
10 | # Flask Configuration
11 | FLASK_ENV=production
12 | FLASK_APP=app.py
13 | DEBUG=false
14 |
15 | # API Key Protection (Optional)
16 | # Set REQUIRE_API_KEY=true to enable API key authentication
17 | REQUIRE_API_KEY=false
18 |
19 | # Set your API key here when protection is enabled
20 | # This key will be required for all TTS generation requests
21 | TTSFM_API_KEY=your-secret-api-key-here
22 |
23 | # Example usage:
24 | # 1. Set REQUIRE_API_KEY=true
25 | # 2. Set TTSFM_API_KEY to your desired secret key
26 | # 3. Restart the application
27 | # 4. All TTS requests will now require the API key in:
28 | # - Authorization header (Bearer token) - OpenAI compatible
29 | # - X-API-Key header
30 | # - api_key query parameter
31 | # - api_key in JSON body
32 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 dbcccc
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/ttsfm-web/static/js/api-client.js:
--------------------------------------------------------------------------------
// Default lifetime applied to cache entries written by fetchWithCache/primeCache.
const CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
// In-memory cache keyed by request URL; each entry is { data, expiresAt }.
const cache = new Map();
3 |
/**
 * Decide whether a cache entry may be served.
 *
 * @param {{data: *, expiresAt: (number|null)}|undefined} entry - entry read from the cache.
 * @returns {boolean} false for a missing entry; true when the entry never
 *   expires (expiresAt === null) or its expiry time is still in the future.
 */
function shouldUseCache(entry) {
  if (!entry) {
    return false;
  }
  const { expiresAt } = entry;
  return expiresAt === null || Date.now() < expiresAt;
}
13 |
/**
 * Fetch JSON from `url`, serving a still-valid cached copy when allowed.
 *
 * @param {string} url - request URL, also used as the cache key.
 * @param {{signal?: AbortSignal, refresh?: boolean}} [options] - `signal` is
 *   forwarded to fetch(); `refresh: true` skips the cache lookup.
 * @returns {Promise<*>} the parsed JSON body.
 * @throws {Error} when the response status is not ok.
 */
async function fetchWithCache(url, { signal, refresh = false } = {}) {
  // Only consult the cache when the caller did not ask for a refresh.
  const cachedEntry = refresh ? undefined : cache.get(url);
  if (shouldUseCache(cachedEntry)) {
    return cachedEntry.data;
  }

  const response = await fetch(url, { signal });
  if (!response.ok) {
    throw new Error(`Request to ${url} failed with status ${response.status}`);
  }

  const data = await response.json();
  const expiresAt = Date.now() + CACHE_TTL_MS;
  cache.set(url, { data, expiresAt });
  return data;
}
30 |
/**
 * Drop cached entries.
 *
 * @param {string} [urlPrefix] - when falsy the whole cache is cleared;
 *   otherwise only entries whose URL starts with this prefix are removed.
 */
export function clearCache(urlPrefix) {
  if (!urlPrefix) {
    cache.clear();
    return;
  }
  // Snapshot the keys first so deletion does not interfere with iteration.
  [...cache.keys()]
    .filter((url) => url.startsWith(urlPrefix))
    .forEach((url) => {
      cache.delete(url);
    });
}
42 |
/**
 * Load the voice list from `/api/voices` through the shared cache.
 *
 * @param {{signal?: AbortSignal, refresh?: boolean}} [options] - passed to fetchWithCache.
 * @returns {Promise<*>} the parsed JSON response.
 */
export function fetchVoices(options = {}) {
  const endpoint = '/api/voices';
  return fetchWithCache(endpoint, options);
}
46 |
/**
 * Load the format list from `/api/formats` through the shared cache.
 *
 * @param {{signal?: AbortSignal, refresh?: boolean}} [options] - passed to fetchWithCache.
 * @returns {Promise<*>} the parsed JSON response.
 */
export function fetchFormats(options = {}) {
  const endpoint = '/api/formats';
  return fetchWithCache(endpoint, options);
}
50 |
/**
 * Seed the cache with data obtained elsewhere.
 *
 * @param {string} url - cache key.
 * @param {*} data - value to store.
 * @param {number|null} [ttlMs=CACHE_TTL_MS] - lifetime in milliseconds;
 *   `null` stores an entry that never expires.
 */
export function primeCache(url, data, ttlMs = CACHE_TTL_MS) {
  let expiresAt = null;
  if (ttlMs !== null) {
    expiresAt = Date.now() + ttlMs;
  }
  cache.set(url, { data, expiresAt });
}
54 |
--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
1 | # TTSFM Architecture Overview
2 |
3 | ```
4 | +----------------+ +--------------------+ +----------------------+
5 | | Frontend (JS) | <---> | Flask REST Endpoints| <---> | OpenAI.fm upstream |
6 | | Playground UI | | /api/* + /v1/audio | | reverse-engineered |
7 | +----------------+ +--------------------+ +----------------------+
8 | | ^
9 | v |
10 | +----------------+ +--------------------+
11 | | Socket.IO WS | <---> | WebSocket Handler |
12 | | streaming UI | | (background tasks) |
13 | +----------------+ +--------------------+
14 | ```
15 |
16 | - **Synchronous Client (`TTSClient`)** – Used by both REST endpoints and the WebSocket handler. Each request gets an isolated client instance, preventing shared session races.
17 | - **Async Client (`AsyncTTSClient`)** – Available to external consumers that want fully asynchronous workflows.
18 | - **Utilities** – Shared helpers handle sanitisation, deterministic headers, and text splitting for both HTTP and WebSocket flows.
19 |
20 | The repo ships with a Docker image that bundles the Flask app, Socket.IO server, and static assets. A per-request TTS client ensures concurrency safety; outgoing prompt tuning is opt-in through the `use_default_prompt` flag.
21 |
22 | For more implementation details see:
23 |
24 | - `ttsfm-web/app.py` – Flask routes, streaming combination logic, API key security.
25 | - `ttsfm-web/websocket_handler.py` – Background task orchestration and streaming chunk delivery.
26 | - `ttsfm/utils.py` – Sanitisation, deterministic headers, and text chunk helpers.
27 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to TTSFM
2 |
3 | Thanks for your interest in improving TTSFM! This document outlines the local development workflow and quality gates that every pull request must satisfy.
4 |
5 | ## 1. Set Up Your Environment
6 |
7 | ```bash
8 | # Clone and create a virtual environment of your choice
9 | python -m venv .venv
10 | source .venv/bin/activate # Windows: .venv\Scripts\activate
11 |
12 | # Install the package with all tooling and web extras
13 | pip install -e .[web,dev]
14 | ```
15 |
16 | ## 2. Run the Test Suite
17 |
18 | ```bash
19 | pytest
20 | ```
21 |
22 | Add new tests alongside your changes—patches without coverage for new behaviour will be sent back for revision.
23 |
24 | ## 3. Lint and Type-Check
25 |
26 | We keep the codebase consistent and catch regressions early with these checks:
27 |
28 | ```bash
29 | black --check ttsfm ttsfm-web tests
30 | flake8 ttsfm ttsfm-web
31 | mypy ttsfm
32 | ```
33 |
34 | Format your code with `black` and resolve lint/type errors before opening a pull request.
35 |
36 | ## 4. Web UI Smoke Tests
37 |
38 | If you touch the Flask app or frontend assets, run the web server locally and exercise the basic flows (text input, long-form combine, WebSocket streaming). For asynchronous features, open two browser tabs and confirm cancellation works.
39 |
40 | ## 5. Commit & Pull Request Guidelines
41 |
42 | - Keep commits focused; squash trivial fixups before submitting.
43 | - Describe _why_ a change is needed in the PR description.
44 | - Link to an issue if one exists.
45 | - Document behaviour changes in `CHANGELOG.md` when relevant.
46 |
47 | Questions or ideas? Open a discussion thread or drop by the issue tracker—we’re happy to help.
48 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Build argument to control image variant (full or slim)
ARG VARIANT=full

FROM python:3.11-slim AS builder

WORKDIR /app

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential \
    && rm -rf /var/lib/apt/lists/*

COPY pyproject.toml ./
COPY README.md ./
COPY requirements.txt ./
COPY ttsfm/ ./ttsfm/

ARG VERSION=0.0.0
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}

# Install the package with its web extra into a staging prefix, then strip
# caches, bundled tests and pip bookkeeping files to shrink the copied tree.
#
# Each best-effort cleanup keeps its "|| true" inside its own subshell. In a
# shell AND-OR list "&&" and "||" have equal precedence and associate left to
# right, so a bare "... || true" in the middle of the chain would also swallow
# a failure of the preceding "pip install" and let a broken build succeed.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir --prefix /install .[web] \
    && (find /install -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true) \
    && find /install -type f -name '*.pyc' -delete \
    && find /install -type f -name '*.pyo' -delete \
    && (find /install -type d -name 'tests' -exec rm -rf {} + 2>/dev/null || true) \
    && (find /install -type d -name 'test' -exec rm -rf {} + 2>/dev/null || true) \
    && (find /install -name '*.dist-info' -type d -exec sh -c 'rm -f "$1"/RECORD "$1"/INSTALLER' sh {} \; 2>/dev/null || true)

FROM python:3.11-slim

# Re-declare ARG after FROM to make it available in this stage
ARG VARIANT=full

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=8000 \
    TTSFM_VARIANT=${VARIANT}

WORKDIR /app

# Conditional ffmpeg installation based on variant
# Full variant: includes ffmpeg for MP3 combining, speed adjustment, and format conversion
# Slim variant: minimal image without ffmpeg (WAV-only auto-combine, no speed adjustment)
RUN apt-get update \
    && if [ "$VARIANT" = "full" ]; then \
        apt-get install -y --no-install-recommends ffmpeg; \
    fi \
    && rm -rf /var/lib/apt/lists/* \
    && useradd --create-home --shell /usr/sbin/nologin ttsfm

# Bring in the packages staged by the builder stage.
COPY --from=builder /install /usr/local
ENV PATH="/usr/local/bin:$PATH"

COPY --chown=ttsfm:ttsfm ttsfm-web/ ./ttsfm-web/

# Run as the unprivileged user created above.
USER ttsfm

EXPOSE 8000

# NOTE(review): the probe URL hard-codes port 8000; it will not follow a PORT
# override at runtime - confirm that is intended.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health', timeout=5)"]

WORKDIR /app/ttsfm-web
CMD ["python", "run.py"]
68 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | build/
8 | develop-eggs/
9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | MANIFEST
23 |
24 | # Virtual Environment
25 | venv/
26 | env/
27 | ENV/
28 | .venv/
29 |
30 | # Environment variables
31 | .env
32 | .env.local
33 | .env.production
34 |
35 | # IDE
36 | .idea/
37 | .vscode/
38 | *.swp
39 | *.swo
40 | .spyderproject
41 | .spyproject
42 |
43 | # OS
44 | .DS_Store
45 | .DS_Store?
46 | ._*
47 | .Spotlight-V100
48 | .Trashes
49 | ehthumbs.db
50 | Thumbs.db
51 |
52 | # Generated audio files (for testing)
53 | *.mp3
54 | *.wav
55 | *.opus
56 | *.aac
57 | *.flac
58 | *.pcm
59 | test_output.*
60 | output.*
61 | hello.*
62 | speech.*
63 |
64 | # Logs
65 | *.log
66 | logs/
67 | .pytest_cache/
68 |
69 | # Temporary files
70 | tmp/
71 | temp/
72 | .tmp/
73 |
74 | # Coverage reports
75 | htmlcov/
76 | .coverage
77 | .coverage.*
78 | coverage.xml
79 | *.cover
80 | .hypothesis/
81 |
82 | # Documentation builds
83 | docs/_build/
84 | site/
85 |
86 | # Package builds
87 | *.tar.gz
88 | *.whl
89 | dist/
90 | build/
91 |
92 | # MyPy
93 | .mypy_cache/
94 | .dmypy.json
95 | dmypy.json
96 |
97 | # Jupyter Notebook
98 | .ipynb_checkpoints
99 |
100 | # pyenv
101 | .python-version
102 |
103 | # pipenv
104 | Pipfile.lock
105 |
106 | # PEP 582
107 | __pypackages__/
108 |
109 | # Celery
110 | celerybeat-schedule
111 | celerybeat.pid
112 |
113 | # SageMath parsed files
114 | *.sage.py
115 |
116 | # Rope project settings
117 | .ropeproject
118 |
119 | # mkdocs documentation
120 | /site
121 |
122 | # Pyre type checker
123 | .pyre/
124 |
125 | # Additional exclusions for GitHub
126 |
127 | # API Keys and Secrets
128 | config.json
129 | secrets.json
130 | .secrets
131 | api_keys.txt
132 |
133 | # Database files
134 | *.db
135 | *.sqlite
136 | *.sqlite3
137 |
138 | # Backup files
139 | *.bak
140 | *.backup
141 | *~
142 |
143 | # Node.js (if using any JS tools)
144 | node_modules/
145 | npm-debug.log*
146 | yarn-debug.log*
147 | yarn-error.log*
148 |
149 | # Docker
150 | .dockerignore
151 | Dockerfile.dev
152 | docker-compose.override.yml
153 |
154 | # Local configuration
155 | local_settings.py
156 | local_config.py
157 |
158 | # Claude
159 | .claude/
160 | VERSION_BUMP_GUIDE.md
161 | scripts/test_audio_generation.py
162 | /artifacts
163 | test.py
164 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import importlib
2 |
3 | import pytest
4 |
5 | import ttsfm.utils as utils
6 |
7 |
def test_split_text_preserves_sentence_punctuation():
    """Sentence-ending punctuation stays attached to the chunk it terminates."""
    source = "First sentence! Second question? Final statement."
    pieces = utils.split_text_by_length(source, max_length=15)

    opening, closing = pieces[0], pieces[-1]
    assert opening.endswith("!"), pieces
    assert any(piece.endswith("?") for piece in pieces), pieces
    assert closing.endswith("."), pieces
15 |
16 |
def test_split_text_handles_oversized_sentence():
    """A single 600-word sentence is split into chunks within the limit, losing no words."""
    limit = 120
    long_sentence = " ".join("word" for _ in range(600))
    pieces = utils.split_text_by_length(long_sentence, max_length=limit)

    assert all(len(piece) <= 120 for piece in pieces)
    word_total = sum(len(piece.split()) for piece in pieces)
    assert word_total == 600
23 |
24 |
def test_split_text_handles_extremely_long_word():
    """A word longer than max_length is broken up instead of overflowing a chunk."""
    max_length = 50
    painful_word = "a" * 140
    pieces = utils.split_text_by_length(
        f"start {painful_word} end", max_length=max_length
    )

    # The leading max_length characters of the word must appear in some chunk.
    word_head = painful_word[:max_length]
    assert any(word_head in piece for piece in pieces)
    assert all(len(piece) <= max_length for piece in pieces)
34 |
35 |
def test_sanitize_text_retains_ampersands():
    """Ampersands in ordinary text survive sanitisation."""
    raw = "R&D and Fish & Chips & Co. Bold"
    cleaned = utils.sanitize_text(raw)

    for expected in ("R&D", "Fish & Chips", "Bold"):
        assert expected in cleaned
    # NOTE(review): the input as shown contains no "<", so this only guards
    # against the sanitiser introducing one - confirm the fixture is intact.
    assert "<" not in cleaned
44 |
45 |
def test_header_generation_deterministic_upgrade_flag(monkeypatch):
    """After a module reload, the first two generated header sets differ as asserted."""
    reloaded = importlib.reload(utils)

    first = reloaded.get_realistic_headers()
    second = reloaded.get_realistic_headers()

    upgrade_header = "Upgrade-Insecure-Requests"
    assert upgrade_header in first
    assert upgrade_header not in second
    assert first["Accept-Language"] != second["Accept-Language"]
55 |
56 |
@pytest.mark.asyncio
async def test_async_batch_propagates_original_exception(monkeypatch):
    """A NetworkException raised by the request layer escapes the batch call unchanged."""
    from ttsfm.async_client import AsyncTTSClient
    from ttsfm.exceptions import NetworkException
    from ttsfm.models import TTSRequest, Voice

    async def always_fail(_request):
        raise NetworkException("boom")

    client = AsyncTTSClient()
    monkeypatch.setattr(client, "_make_request", always_fail)

    batch = [TTSRequest(input="hello", voice=Voice.ALLOY)]

    with pytest.raises(NetworkException):
        await client.generate_speech_batch(batch)
74 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release and Publish
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*' # Triggers on version tags like v1.0.0, v3.0.1, etc.
7 |
8 | permissions:
9 | contents: write
10 | id-token: write
11 |
12 | jobs:
13 | release-and-publish:
14 | runs-on: ubuntu-latest
15 |
16 | steps:
17 | - uses: actions/checkout@v4
18 |
19 | - name: Set up Python
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: '3.11'
23 |
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install build twine
28 | pip install '.[web,dev]'
29 |
30 | - name: Run linters and type checks
31 | run: |
32 | flake8 ttsfm ttsfm-web
33 | mypy ttsfm
34 | black --check ttsfm ttsfm-web tests
35 |
36 | - name: Run tests
37 | run: pytest
38 |
39 | - name: Test package install and import
40 | run: |
41 | python -c "import ttsfm; print('TTSFM imported successfully')"
42 | python -c "from ttsfm import TTSClient; print('TTSClient imported successfully')"
43 | python -m ttsfm.cli --help > /dev/null
44 | echo 'CLI smoke test passed'
45 |
46 | - name: Build package
47 | run: |
48 | python -m build
49 | echo "Package built successfully"
50 | ls -la dist/
51 |
52 | - name: Check package
53 | run: |
54 | twine check dist/*
55 | echo "Package validation passed"
56 |
57 | - name: Publish to PyPI
58 | uses: pypa/gh-action-pypi-publish@release/v1
59 | with:
60 | attestations: true
61 | skip-existing: true
62 |
63 | - name: Extract version (strip leading v)
64 | id: ver
65 | run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"
66 |
67 | - name: Create GitHub Release
68 | uses: softprops/action-gh-release@v1
69 | with:
70 | body: |
71 | ## TTSFM ${{ github.ref_name }}
72 |
73 | New release of TTSFM - Free Text-to-Speech API with OpenAI compatibility.
74 |
75 | ### Installation
76 | ```bash
77 | pip install ttsfm==${{ steps.ver.outputs.version }}
78 | ```
79 |
80 | ### Quick Start
81 | ```python
82 | from ttsfm import TTSClient
83 |
84 | client = TTSClient()
85 | response = client.generate_speech("Hello from TTSFM!")
86 | response.save_to_file("hello")
87 | ```
88 |
89 | ### Docker
90 | ```bash
91 | docker run -p 8000:8000 dbcccc/ttsfm:latest
92 | ```
93 |
94 | ### Features
95 | - Completely free (uses openai.fm service)
96 | - OpenAI-compatible API
97 | - 11 voices available
98 | - 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM)
99 | - Async and sync clients
100 | - Web interface included
101 | - CLI tool available
102 |
103 | ### Documentation
104 | See [README](https://github.com/dbccccccc/ttsfm#readme) for full documentation.
105 | draft: false
106 | prerelease: ${{ contains(github.ref_name, '-' ) }}
107 |
108 |
--------------------------------------------------------------------------------
/tests/test_web_app.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import importlib.util
3 | import sys
4 | from pathlib import Path
5 |
6 | import pytest
7 |
# Directory holding the web app sources: the "ttsfm-web" folder next to the
# directory that contains this test file.
WEB_DIR = Path(__file__).resolve().parents[1] / "ttsfm-web"
# Dotted module name used when building the import spec for app.py.
MODULE_NAME = "ttsfm_web.app"
10 |
11 |
def load_web_app(monkeypatch, **env):
    """Execute ``ttsfm-web/app.py`` as a fresh module under the given environment.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture, used to set or remove
            environment variables for the duration of the test.
        **env: Environment variables to apply before the import. A value of
            ``None`` removes the variable; any other value is set as given.

    Returns:
        The newly executed module object for ``app.py``.
    """
    for key, value in env.items():
        if value is None:
            monkeypatch.delenv(key, raising=False)
        else:
            monkeypatch.setenv(key, value)

    # Forget previously imported copies so the import below runs the code again.
    for cached_name in (MODULE_NAME, "ttsfm_web", "websocket_handler"):
        sys.modules.pop(cached_name, None)

    # Make the web directory importable for modules that are imported by bare name.
    web_dir_str = str(WEB_DIR)
    if web_dir_str not in sys.path:
        sys.path.insert(0, web_dir_str)

    # Register a synthetic "ttsfm_web" package whose search path is the web directory.
    pkg_spec = importlib.util.spec_from_loader("ttsfm_web", loader=None)
    pkg = importlib.util.module_from_spec(pkg_spec)
    pkg.__path__ = [web_dir_str]  # type: ignore[attr-defined]
    sys.modules.setdefault("ttsfm_web", pkg)

    spec = importlib.util.spec_from_file_location(MODULE_NAME, WEB_DIR / "app.py")
    # Check the spec before its first use; previously this ran only after
    # module_from_spec() had already dereferenced it.
    assert spec and spec.loader
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module
37 |
38 |
def test_voices_endpoint_returns_data(monkeypatch):
    """GET /api/voices returns 200 and a count matching the voices list."""
    web_app = load_web_app(monkeypatch, REQUIRE_API_KEY="false", TTSFM_API_KEY=None)
    http = web_app.app.test_client()

    reply = http.get("/api/voices")
    assert reply.status_code == 200

    body = reply.get_json()
    assert body["count"] == len(body["voices"])
46 |
47 |
def test_combine_audio_chunks_uses_format_hint(monkeypatch):
    """With an "opus" hint, chunks are decoded and exported through the WAV path."""
    load_web_app(monkeypatch, REQUIRE_API_KEY="false", TTSFM_API_KEY=None)

    from ttsfm import audio as audio_module

    class DummySegment:
        """Stand-in segment that records which decoders produced it."""

        def __init__(self, tag: str):
            self.tag = tag

        def __iadd__(self, other: "DummySegment"):
            self.tag = self.tag + other.tag
            return self

        def export(self, buffer, format: str):
            rendered = f"{format}:{self.tag}"
            buffer.write(rendered.encode())

    class DummyAudioSegment:
        """Stand-in for AudioSegment that logs every decoder call."""

        formats = []

        @classmethod
        def from_mp3(cls, buffer):
            cls.formats.append("mp3")
            return DummySegment("mp3")

        @classmethod
        def from_wav(cls, buffer):
            cls.formats.append("wav")
            return DummySegment("wav")

    monkeypatch.setattr(audio_module, "AudioSegment", DummyAudioSegment)

    chunks = [b"one", b"two"]
    combined = audio_module.combine_audio_chunks(chunks, "opus")

    assert combined == b"wav:wavwav"
    assert DummyAudioSegment.formats == ["wav", "wav"]
83 |
84 |
@pytest.mark.parametrize(
    "header_name, header_value",
    [
        ("Authorization", "Bearer super-secret"),
        ("X-API-Key", "super-secret"),
    ],
)
def test_api_key_hash_verification(monkeypatch, header_name, header_value):
    """With key protection on, requests need the key in either supported header."""
    web_app = load_web_app(
        monkeypatch, REQUIRE_API_KEY="true", TTSFM_API_KEY="super-secret"
    )
    http = web_app.app.test_client()
    payload = {"text": "hello"}

    # Without credentials the endpoint must refuse the request.
    rejected = http.post("/api/validate-text", json=payload)
    assert rejected.status_code == 401

    accepted = http.post(
        "/api/validate-text", json=payload, headers={header_name: header_value}
    )
    assert accepted.status_code == 200
102 |
--------------------------------------------------------------------------------
/tests/test_clients.py:
--------------------------------------------------------------------------------
1 | import types
2 |
3 | import pytest
4 |
5 | from ttsfm.async_client import AsyncTTSClient
6 | from ttsfm.client import TTSClient
7 | from ttsfm.models import AudioFormat, TTSResponse
8 |
9 |
def _mk_response(data: bytes) -> TTSResponse:
    """Build an MP3 ``TTSResponse`` wrapping ``data``, with ``size`` set to its length."""
    fields = {
        "audio_data": data,
        "content_type": "audio/mpeg",
        "format": AudioFormat.MP3,
        "size": len(data),
    }
    return TTSResponse(**fields)
17 |
18 |
class _DummyResponse:
    """Minimal stand-in for a successful HTTP response object.

    Exposes the attributes set below (``status_code``, ``headers``,
    ``content``, ``url``, ``text``) plus a ``json()`` method.
    """

    def __init__(
        self,
        content_type: str,
        content: bytes,
        url: str = "https://example.test/audio",
    ):
        self.url = url
        self.content = content
        self.headers = {"content-type": content_type}
        self.status_code = 200
        self.text = ""

    def json(self):  # pragma: no cover - not used on success path
        return {}
29 |
30 |
def test_sync_request_normalizes_non_mp3_format(monkeypatch):
    """A FLAC request is sent upstream as "wav" and the response is reported as WAV."""
    client = TTSClient()
    sent = {}

    def fake_post(self, url, data=None, headers=None, timeout=None, verify=None):
        sent["data"] = data
        body = b"RIFF" + b"\x00" * 64
        return _DummyResponse("audio/wav", body, url)

    bound_post = types.MethodType(fake_post, client.session)
    monkeypatch.setattr(client.session, "post", bound_post)

    response = client.generate_speech(
        text="hello", voice="alloy", response_format=AudioFormat.FLAC
    )

    assert sent["data"]["response_format"] == "wav"
    assert response.format is AudioFormat.WAV
45 |
46 |
def test_sync_request_preserves_mp3_format(monkeypatch):
    """An MP3 request is sent upstream as "mp3" and the response stays MP3."""
    client = TTSClient()
    sent = {}

    def fake_post(self, url, data=None, headers=None, timeout=None, verify=None):
        sent["data"] = data
        body = b"ID3" + b"\x00" * 64
        return _DummyResponse("audio/mpeg", body, url)

    bound_post = types.MethodType(fake_post, client.session)
    monkeypatch.setattr(client.session, "post", bound_post)

    response = client.generate_speech(
        text="hello", voice="alloy", response_format=AudioFormat.MP3
    )

    assert sent["data"]["response_format"] == "mp3"
    assert response.format is AudioFormat.MP3
61 |
62 |
def test_sync_long_text_auto_combine(monkeypatch):
    """With auto_combine=True the batch results go through combine_responses."""
    client = TTSClient()

    def fake_batch(**kwargs):
        return [_mk_response(b"one"), _mk_response(b"two")]

    monkeypatch.setattr(client, "generate_speech_batch", fake_batch)

    combine_state = {}

    def fake_combine(responses):
        combine_state["called"] = True
        return _mk_response(b"onetwo")

    monkeypatch.setattr("ttsfm.client.combine_responses", fake_combine)

    result = client.generate_speech_long_text(text="dummy", auto_combine=True)

    assert combine_state["called"] is True
    assert isinstance(result, TTSResponse)
    assert result.audio_data == b"onetwo"
88 |
89 |
def test_sync_long_text_returns_list_without_auto_combine(monkeypatch):
    """With auto_combine=False the batch result list is handed back as-is."""
    client = TTSClient()
    batch_result = [_mk_response(b"one")]

    def fake_batch(**_):
        return batch_result

    monkeypatch.setattr(client, "generate_speech_batch", fake_batch)

    outcome = client.generate_speech_long_text(text="dummy", auto_combine=False)

    # Identity, not equality: the very same list object must come back.
    assert outcome is batch_result
99 |
100 |
@pytest.mark.asyncio
async def test_async_long_text_auto_combine(monkeypatch):
    """Async client: auto_combine=True returns the single combined response."""
    client = AsyncTTSClient()

    async def fake_batch(**kwargs):
        parts = (b"one", b"two")
        return [_mk_response(part) for part in parts]

    def fake_combine(responses):
        return _mk_response(b"onetwo")

    monkeypatch.setattr(client, "generate_speech_batch", fake_batch)
    monkeypatch.setattr("ttsfm.async_client.combine_responses", fake_combine)

    combined = await client.generate_speech_long_text(text="dummy", auto_combine=True)

    assert isinstance(combined, TTSResponse)
    assert combined.audio_data == b"onetwo"
122 |
--------------------------------------------------------------------------------
/ttsfm/capabilities.py:
--------------------------------------------------------------------------------
1 | """System capabilities detection for TTSFM.
2 |
3 | This module provides runtime detection of available features based on
4 | system dependencies (primarily ffmpeg availability).
5 | """
6 |
7 | from __future__ import annotations
8 |
9 | import shutil
10 | from typing import Dict, List
11 |
12 |
class SystemCapabilities:
    """Detect and report system capabilities.

    This class checks for the availability of optional dependencies
    (like ffmpeg) and reports which features are available in the
    current environment.

    The names of ffmpeg-dependent features and formats are declared once as
    class-level constants so that the capability report, the supported-format
    list and ``requires_ffmpeg`` cannot drift apart.
    """

    # Formats reported regardless of ffmpeg availability.
    _BASIC_FORMATS = ("mp3", "wav")
    # Formats reported only when ffmpeg is available.
    _FFMPEG_FORMATS = ("opus", "aac", "flac", "pcm")
    # Feature flags whose availability mirrors ffmpeg availability.
    _FFMPEG_FEATURES = ("speed_adjustment", "format_conversion", "mp3_auto_combine")
    # Every feature or format name for which requires_ffmpeg() returns True.
    _REQUIRES_FFMPEG = frozenset(_FFMPEG_FEATURES + _FFMPEG_FORMATS)

    def __init__(self) -> None:
        """Initialize capabilities detection."""
        # ffmpeg counts as available when an executable of that name is on PATH.
        self.ffmpeg_available = shutil.which("ffmpeg") is not None

    def get_capabilities(self) -> Dict:
        """Get complete system capabilities report.

        Returns:
            Dict containing:
                - ffmpeg_available: bool
                - image_variant: "full" or "slim" (derived from ffmpeg availability)
                - features: dict of feature availability
                - supported_formats: list of supported audio formats
        """
        features: Dict[str, bool] = {
            name: self.ffmpeg_available for name in self._FFMPEG_FEATURES
        }
        features["basic_formats"] = True  # MP3, WAV always available
        return {
            "ffmpeg_available": self.ffmpeg_available,
            "image_variant": "full" if self.ffmpeg_available else "slim",
            "features": features,
            "supported_formats": self.get_supported_formats(),
        }

    def get_supported_formats(self) -> List[str]:
        """Get list of supported audio formats.

        Returns:
            A new list of format names: the basic formats, followed by the
            ffmpeg-dependent formats when ffmpeg is available.
        """
        formats = list(self._BASIC_FORMATS)
        if self.ffmpeg_available:
            formats.extend(self._FFMPEG_FORMATS)
        return formats

    def requires_ffmpeg(self, feature: str) -> bool:
        """Check if a feature requires ffmpeg.

        Args:
            feature: Feature name or format name to check (case-insensitive)

        Returns:
            True if the feature requires ffmpeg, False otherwise
        """
        return feature.lower() in self._REQUIRES_FFMPEG

    def check_feature_available(self, feature: str) -> bool:
        """Check if a specific feature is available.

        Args:
            feature: Feature name to check

        Returns:
            True if the feature does not require ffmpeg, or if ffmpeg is
            available; False otherwise
        """
        if not self.requires_ffmpeg(feature):
            return True
        return self.ffmpeg_available

    def get_unavailable_reason(self, feature: str) -> str | None:
        """Get reason why a feature is unavailable.

        Args:
            feature: Feature name to check

        Returns:
            Error message if unavailable, None if available
        """
        if self.check_feature_available(feature):
            return None

        return (
            f"Feature '{feature}' requires ffmpeg. "
            "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
        )
107 |
108 |
# Process-wide instance, created lazily by get_capabilities() on first use.
_capabilities_instance: SystemCapabilities | None = None


def get_capabilities() -> SystemCapabilities:
    """Return the shared ``SystemCapabilities`` object, creating it on first call.

    Returns:
        The same ``SystemCapabilities`` instance on every call within a process.
    """
    global _capabilities_instance
    shared = _capabilities_instance
    if shared is None:
        shared = SystemCapabilities()
        _capabilities_instance = shared
    return shared
123 |
--------------------------------------------------------------------------------
/README.zh.md:
--------------------------------------------------------------------------------
1 | # TTSFM - 文本转语音 API 客户端
2 |
3 | > **⚠️ 告示:由于 openai.fm 体验网站已关闭,本项目已无法使用。**
4 |
5 | > **Language / 语言**: [English](README.md) | [中文](README.zh.md)
6 |
[Docker Hub](https://hub.docker.com/r/dbcccc/ttsfm)
[GitHub](https://github.com/dbccccccc/ttsfm)
[MIT 许可证](https://opensource.org/licenses/MIT)
10 | 
11 |
12 | ## Star History
13 |
[Star History Chart](https://www.star-history.com/#dbccccccc/ttsfm&Date)
15 |
16 | ## 概述
17 |
TTSFM 是一个免费的、兼容 OpenAI 的文本转语音 API 服务,提供将文本转换为自然语音的完整解决方案,使用 OpenAI 的 GPT-4o mini TTS。基于 openai.fm 后端构建,提供强大的 Python SDK、RESTful API 接口以及直观的网页 Playground,方便测试和集成。
19 |
20 | **TTSFM 的功能:**
21 | - 🎤 **多种语音选择**:11 种兼容 OpenAI 的语音(alloy、ash、ballad、coral、echo、fable、nova、onyx、sage、shimmer、verse)
22 | - 🎵 **灵活的音频格式**:支持 6 种音频格式(MP3、WAV、OPUS、AAC、FLAC、PCM)
23 | - ⚡ **语速控制**:0.25x 到 4.0x 的播放速度调节,适应不同使用场景
24 | - 📝 **长文本支持**:自动文本分割和音频合并,支持任意长度内容
25 | - 🔄 **实时流式传输**:WebSocket 支持流式音频生成
26 | - 🐍 **Python SDK**:易用的同步和异步客户端
27 | - 🌐 **网页 Playground**:交互式网页界面,方便测试和实验
28 | - 🐳 **Docker 就绪**:预构建的 Docker 镜像,即刻部署
29 | - 🔍 **智能检测**:自动功能检测和友好的错误提示
30 | - 🤖 **OpenAI 兼容**:可直接替代 OpenAI 的 TTS API
31 |
32 | **v3.4.0 版本的主要特性:**
33 | - 🎯 镜像变体检测(完整版 vs 精简版 Docker 镜像)
34 | - 🔍 运行时功能 API,检查特性可用性
35 | - ⚡ 基于 ffmpeg 的语速调节
36 | - 🎵 所有 6 种音频格式的真实格式转换
37 | - 📊 增强的错误处理,提供清晰、可操作的错误信息
38 | - 🐳 针对不同使用场景优化的双镜像版本
39 |
40 | > **⚠️ 免责声明**:本项目仅用于**学习和研究目的**。这是对 openai.fm 服务的逆向工程实现,不应用于商业用途或生产环境。用户需自行确保遵守适用的法律法规和服务条款。
41 |
42 | ## 安装
43 |
44 | ### Python 包
45 |
46 | ```bash
47 | pip install ttsfm # 核心客户端
48 | pip install ttsfm[web] # 核心客户端 + Web/服务端依赖
49 | ```
50 |
51 | ### Docker 镜像
52 |
53 | TTSFM 提供两种 Docker 镜像变体以满足不同需求:
54 |
55 | #### 完整版(推荐)
56 | ```bash
57 | docker run -p 8000:8000 dbcccc/ttsfm:latest
58 | ```
59 |
60 | **包含 ffmpeg,支持高级功能:**
61 | - ✅ 所有 6 种音频格式(MP3、WAV、OPUS、AAC、FLAC、PCM)
62 | - ✅ 语速调节(0.25x - 4.0x)
63 | - ✅ 使用 ffmpeg 进行格式转换
64 | - ✅ 长文本 MP3 自动合并
65 | - ✅ 长文本 WAV 自动合并
66 |
67 | #### 精简版
68 | ```bash
69 | docker run -p 8000:8000 dbcccc/ttsfm:slim
70 | ```
71 |
72 | **不含 ffmpeg 的最小化镜像:**
73 | - ✅ 基础 TTS 功能
74 | - ✅ 2 种音频格式(仅 MP3、WAV)
75 | - ✅ 长文本 WAV 自动合并
76 | - ❌ 不支持语速调节
77 | - ❌ 不支持格式转换
78 | - ❌ 不支持 MP3 自动合并
79 |
80 | 容器默认开放网页 Playground(`http://localhost:8000`)以及兼容 OpenAI 的 `/v1/audio/speech` 接口。
81 |
82 | **检查可用功能:**
83 | ```bash
84 | curl http://localhost:8000/api/capabilities
85 | ```
86 |
87 | ## 快速开始
88 |
89 | ### Python 客户端
90 |
91 | ```python
92 | from ttsfm import TTSClient, AudioFormat, Voice
93 |
94 | client = TTSClient()
95 |
96 | # 基础用法
97 | response = client.generate_speech(
98 | text="来自 TTSFM 的问候!",
99 | voice=Voice.ALLOY,
100 | response_format=AudioFormat.MP3,
101 | )
102 | response.save_to_file("hello") # -> hello.mp3
103 |
104 | # 使用语速调节(需要 ffmpeg)
105 | response = client.generate_speech(
106 | text="这段语音会更快!",
107 | voice=Voice.NOVA,
108 | response_format=AudioFormat.MP3,
109 | speed=1.5, # 1.5 倍速(范围:0.25 - 4.0)
110 | )
111 | response.save_to_file("fast") # -> fast.mp3
112 | ```
113 |
114 | ### 命令行
115 |
116 | ```bash
117 | ttsfm "你好,世界" --voice nova --format mp3 --output hello.mp3
118 | ```
119 |
120 | ### REST API(兼容 OpenAI)
121 |
122 | ```bash
123 | # 基础请求
124 | curl -X POST http://localhost:8000/v1/audio/speech \
125 | -H "Content-Type: application/json" \
126 | -d '{
127 | "model": "tts-1",
128 | "input": "你好,世界",
129 | "voice": "alloy",
130 | "response_format": "mp3"
131 | }' --output speech.mp3
132 |
133 | # 使用语速调节(需要完整版镜像)
134 | curl -X POST http://localhost:8000/v1/audio/speech \
135 | -H "Content-Type: application/json" \
136 | -d '{
137 | "model": "tts-1",
138 | "input": "你好,世界",
139 | "voice": "alloy",
140 | "response_format": "mp3",
141 | "speed": 1.5
142 | }' --output speech_fast.mp3
143 | ```
144 |
145 | **可用语音:** alloy、ash、ballad、coral、echo、fable、nova、onyx、sage、shimmer、verse
146 | **可用格式:** mp3、wav(始终可用)+ opus、aac、flac、pcm(仅完整版镜像)
147 | **语速范围:** 0.25 - 4.0(需要完整版镜像)
148 |
149 | ## 了解更多
150 |
151 | - 在 [Web 文档](http://localhost:8000/docs)(或 `ttsfm-web/templates/docs.html`)查看完整接口说明与运行注意事项。
152 | - 查看 [架构概览](docs/architecture.md) 了解组件间的关系。
153 | - 欢迎参与贡献,流程说明请见 [CONTRIBUTING.md](CONTRIBUTING.md)。
154 |
155 | ## 许可证
156 |
157 | TTSFM 采用 [MIT 许可证](LICENSE) 发布。
158 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "ttsfm"
7 | dynamic = ["version"]
8 | description = "Text-to-Speech API Client with OpenAI compatibility"
9 | readme = "README.md"
10 | license = "MIT"
11 | authors = [
12 | {name = "dbcccc", email = "120614547+dbccccccc@users.noreply.github.com"}
13 | ]
14 | maintainers = [
15 | {name = "dbcccc", email = "120614547+dbccccccc@users.noreply.github.com"}
16 | ]
17 | classifiers = [
18 | "Development Status :: 4 - Beta",
19 | "Intended Audience :: Developers",
20 |
21 | "Operating System :: OS Independent",
22 | "Programming Language :: Python :: 3",
23 | "Programming Language :: Python :: 3.8",
24 | "Programming Language :: Python :: 3.9",
25 | "Programming Language :: Python :: 3.10",
26 | "Programming Language :: Python :: 3.11",
27 | "Programming Language :: Python :: 3.12",
28 | "Topic :: Multimedia :: Sound/Audio :: Speech",
29 | "Topic :: Software Development :: Libraries :: Python Modules",
30 | "Topic :: Internet :: WWW/HTTP :: Dynamic Content",
31 | ]
32 | keywords = [
33 | "tts",
34 | "text-to-speech",
35 | "speech-synthesis",
36 | "openai",
37 | "api-client",
38 | "audio",
39 | "voice",
40 | "speech"
41 | ]
42 | requires-python = ">=3.8"
43 | dependencies = [
44 | "requests>=2.25.0",
45 | "aiohttp>=3.8.0",
46 | "python-dotenv>=1.0.1",
47 | ]
48 |
49 | [project.optional-dependencies]
50 | dev = [
51 | "pytest>=6.0",
52 | "pytest-asyncio>=0.18.0",
53 | "pytest-cov>=2.0",
54 | "black>=22.0",
55 | "isort>=5.0",
56 | "flake8>=4.0",
57 | "mypy>=0.900",
58 | "pre-commit>=2.0",
59 | ]
60 | docs = [
61 | "sphinx>=4.0",
62 | "sphinx-rtd-theme>=1.0",
63 | "myst-parser>=0.17",
64 | ]
65 | web = [
66 | "flask>=2.0.0",
67 | "flask-cors>=3.0.10",
68 | "flask-socketio>=5.3.0",
69 | "python-socketio>=5.10.0",
70 | "eventlet>=0.33.3",
71 | "waitress>=3.0.0",
72 | "pydub>=0.25.0",
73 | "argon2-cffi>=23.1.0",
74 | ]
75 |
76 | [project.urls]
77 | Homepage = "https://github.com/dbccccccc/ttsfm"
78 | Documentation = "https://github.com/dbccccccc/ttsfm/blob/main/docs/"
79 | Repository = "https://github.com/dbccccccc/ttsfm"
80 | "Bug Tracker" = "https://github.com/dbccccccc/ttsfm/issues"
81 |
82 | [project.scripts]
83 | ttsfm = "ttsfm.cli:main"
84 |
85 | [tool.setuptools_scm]
86 | version_scheme = "no-guess-dev"
87 | local_scheme = "no-local-version"
88 |
89 | fallback_version = "3.4.2"
90 | [tool.setuptools]
91 | packages = ["ttsfm"]
92 |
93 | [tool.setuptools.package-data]
94 | ttsfm = ["py.typed"]
95 |
96 | [tool.black]
97 | line-length = 100
98 | target-version = ['py38']
99 | include = '\\.pyi?$'
100 | extend-exclude = '''
101 | /(
102 | # directories
103 | \.eggs
104 | | \.git
105 | | \.hg
106 | | \.mypy_cache
107 | | \.tox
108 | | \.venv
109 | | build
110 | | dist
111 | )/
112 | '''
113 |
114 | [tool.isort]
115 | profile = "black"
116 | line_length = 100
117 | multi_line_output = 3
118 | include_trailing_comma = true
119 | force_grid_wrap = 0
120 | use_parentheses = true
121 | ensure_newline_before_comments = true
122 |
123 | [tool.mypy]
124 | python_version = "3.9"
125 | warn_return_any = false
126 | warn_unused_configs = true
127 | disallow_untyped_defs = false
128 | disallow_incomplete_defs = false
129 | check_untyped_defs = true
130 | disallow_untyped_decorators = false
131 | no_implicit_optional = false
132 | warn_redundant_casts = true
133 | warn_unused_ignores = false
134 | warn_no_return = true
135 | warn_unreachable = false
136 | strict_equality = true
137 |
138 | [[tool.mypy.overrides]]
139 | module = "requests.*"
140 | ignore_missing_imports = true
141 |
142 | [[tool.mypy.overrides]]
143 | module = "pydub.*"
144 | ignore_missing_imports = true
145 |
146 | [[tool.mypy.overrides]]
147 | module = "fake_useragent.*"
148 | ignore_missing_imports = true
149 |
150 | [tool.pytest.ini_options]
151 | minversion = "6.0"
152 | addopts = "-ra -q --strict-markers --strict-config"
153 | testpaths = ["tests"]
154 | python_files = ["test_*.py", "*_test.py"]
155 | python_classes = ["Test*"]
156 | python_functions = ["test_*"]
157 | markers = [
158 | "slow: marks tests as slow (deselect with '-m \"not slow\"')",
159 | "integration: marks tests as integration tests",
160 | "unit: marks tests as unit tests",
161 | ]
162 |
163 | [tool.coverage.run]
164 | source = ["ttsfm"]
165 | omit = [
166 | "*/tests/*",
167 | "*/test_*",
168 | "setup.py",
169 | ]
170 |
171 | [tool.coverage.report]
172 | exclude_lines = [
173 | "pragma: no cover",
174 | "def __repr__",
175 | "if self.debug:",
176 | "if settings.DEBUG",
177 | "raise AssertionError",
178 | "raise NotImplementedError",
179 | "if 0:",
180 | "if __name__ == .__main__.:",
181 | "class .*\\bProtocol\\):",
182 | "@(abc\\.)?abstractmethod",
183 | ]
184 |
185 |
--------------------------------------------------------------------------------
/tests/test_audio_processing.py:
--------------------------------------------------------------------------------
1 | """Tests for audio processing functionality."""
2 |
3 | import pytest
4 | import shutil
5 | from ttsfm.audio_processing import adjust_audio_speed, _build_atempo_filter_chain
6 |
7 |
class TestAudioProcessing:
    """Unit tests for the speed-adjustment helpers in ``ttsfm.audio_processing``."""

    def test_build_atempo_filter_chain_normal_range(self):
        """Speeds inside 0.5-2.0 map to exactly one ``atempo`` filter."""
        expected_by_speed = {
            1.0: "atempo=1.0",
            1.5: "atempo=1.5",
            0.5: "atempo=0.5",
            2.0: "atempo=2.0",
        }
        for speed, expected in expected_by_speed.items():
            assert _build_atempo_filter_chain(speed) == expected

    def test_build_atempo_filter_chain_high_speed(self):
        """Speeds above 2.0 are expressed as several chained filters."""
        chain = _build_atempo_filter_chain(4.0)
        assert "atempo=2.0" in chain
        assert "," in chain  # a comma separates the chained filters

    def test_build_atempo_filter_chain_low_speed(self):
        """Speeds below 0.5 are expressed as several chained filters."""
        chain = _build_atempo_filter_chain(0.25)
        assert "atempo=0.5" in chain
        assert "," in chain  # a comma separates the chained filters

    def test_adjust_audio_speed_validation(self):
        """Out-of-range speeds are rejected with ``ValueError``."""
        payload = b"dummy audio data"

        # One value below the accepted range, one above it.
        for out_of_range in (0.1, 5.0):
            with pytest.raises(ValueError, match="Speed must be between 0.25 and 4.0"):
                adjust_audio_speed(payload, speed=out_of_range)

    def test_adjust_audio_speed_no_change(self):
        """A speed of 1.0 hands the input back unchanged."""
        payload = b"dummy audio data"
        assert adjust_audio_speed(payload, speed=1.0) == payload

    @pytest.mark.skipif(not shutil.which("ffmpeg"), reason="ffmpeg not available")
    def test_adjust_audio_speed_requires_ffmpeg(self):
        """Placeholder that only runs when ffmpeg is installed; asserts nothing."""

    def test_adjust_audio_speed_no_ffmpeg(self, monkeypatch):
        """Speed adjustment raises ``RuntimeError`` when ffmpeg cannot be found."""
        # Make every executable lookup fail so ffmpeg appears to be missing.
        monkeypatch.setattr("shutil.which", lambda _cmd: None)

        with pytest.raises(RuntimeError, match="Speed adjustment requires ffmpeg"):
            adjust_audio_speed(b"dummy audio data", speed=1.5)
66 |
67 |
class TestFFmpegDetection:
    """Checks the module-level ffmpeg flag exposed by ``ttsfm.audio``."""

    def test_ffmpeg_detection(self):
        """``FFMPEG_AVAILABLE`` is a bool that agrees with a fresh PATH lookup."""
        from ttsfm.audio import FFMPEG_AVAILABLE

        assert isinstance(FFMPEG_AVAILABLE, bool)

        # Compare against what the current environment reports.
        found_on_path = shutil.which("ffmpeg") is not None
        assert FFMPEG_AVAILABLE == found_on_path
81 |
82 |
class TestAudioCombineWithFFmpeg:
    """Exercises ``combine_audio_chunks`` when pydub and/or ffmpeg are absent."""

    def test_combine_mp3_without_ffmpeg(self, monkeypatch):
        """MP3 combining raises ``AudioProcessingException`` without pydub and ffmpeg."""
        import ttsfm.audio as audio_module
        from ttsfm.exceptions import AudioProcessingException

        # Simulate an environment with neither pydub nor ffmpeg.
        monkeypatch.setattr(audio_module, "AudioSegment", None)
        monkeypatch.setattr(audio_module, "FFMPEG_AVAILABLE", False)

        with pytest.raises(AudioProcessingException, match="MP3 audio requires pydub and ffmpeg"):
            audio_module.combine_audio_chunks([b"chunk1", b"chunk2"], format_type="mp3")

    def test_combine_wav_without_ffmpeg(self, monkeypatch):
        """WAV combining still returns bytes when pydub is unavailable."""
        import ttsfm.audio as audio_module

        # Only pydub is removed; the WAV path should not need it.
        monkeypatch.setattr(audio_module, "AudioSegment", None)

        # Simplified stand-in for a WAV file - real files carry a proper header.
        wav_like = b"RIFF" + b"\x00" * 40 + b"data"

        combined = audio_module.combine_audio_chunks([wav_like, wav_like], format_type="wav")
        assert isinstance(combined, bytes)
117 |
--------------------------------------------------------------------------------
/ttsfm/audio.py:
--------------------------------------------------------------------------------
1 | """Audio helper utilities shared across TTSFM components."""
2 |
3 | from __future__ import annotations
4 |
5 | import io
6 | import logging
7 | import shutil
8 | from typing import Iterable, List, Sequence
9 |
10 | from .exceptions import AudioProcessingException
11 | from .models import TTSResponse
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | try: # Optional dependency for non-WAV combining
17 | from pydub import AudioSegment
18 | except ImportError: # pragma: no cover - optional dependency
19 | AudioSegment = None
20 |
21 |
22 | # Detect ffmpeg availability at runtime
23 | FFMPEG_AVAILABLE = shutil.which("ffmpeg") is not None
24 |
25 | SUPPORTED_EXPORT_FORMATS = {"mp3", "wav", "aac", "flac", "opus", "pcm"}
26 |
27 |
def combine_audio_chunks(audio_chunks: Iterable[bytes], format_type: str = "mp3") -> bytes:
    """Merge encoded audio chunks into a single audio payload.

    Args:
        audio_chunks: Raw audio byte strings, joined in the order given.
        format_type: Format of the chunks (case-insensitive). ``"mp3"`` is
            decoded and re-exported as MP3; any other value is handled as WAV.

    Returns:
        The combined audio as bytes, or ``b""`` when no chunks were supplied.

    Raises:
        AudioProcessingException: If MP3 combining is requested while pydub
            could not be imported or ffmpeg was not detected.
    """
    pieces = list(audio_chunks)
    if not pieces:
        return b""

    wants_mp3 = format_type.lower() == "mp3"

    # Without pydub only the dependency-free WAV fallback is possible.
    if AudioSegment is None:
        if not wants_mp3:
            return _simple_wav_concatenation(pieces)
        raise AudioProcessingException(
            "Combining MP3 audio requires pydub and ffmpeg. "
            "Install ttsfm[web] and use the full Docker image (dbcccc/ttsfm:latest) "
            "instead of the slim variant.",
            audio_format="mp3",
        )

    # pydub is importable, but MP3 handling additionally needs ffmpeg.
    if wants_mp3 and not FFMPEG_AVAILABLE:
        raise AudioProcessingException(
            "MP3 auto-combine requires ffmpeg. "
            "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant, "
            "or disable auto_combine and handle chunks separately.",
            audio_format="mp3",
        )

    # Decode every chunk first, then append them one after another.
    decode = AudioSegment.from_mp3 if wants_mp3 else AudioSegment.from_wav
    segments = [decode(io.BytesIO(piece)) for piece in pieces]

    merged = segments[0]
    for following in segments[1:]:
        merged += following

    sink = io.BytesIO()
    merged.export(sink, format="mp3" if wants_mp3 else "wav")
    return sink.getvalue()
85 |
86 |
87 | def _simple_wav_concatenation(wav_chunks: List[bytes]) -> bytes:
88 | """Simple WAV concatenation fallback that avoids external deps."""
89 | if not wav_chunks:
90 | return b""
91 |
92 | if len(wav_chunks) == 1:
93 | return wav_chunks[0]
94 |
95 | try:
96 | first_wav = wav_chunks[0]
97 | if len(first_wav) < 44:
98 | return b"".join(wav_chunks)
99 |
100 | header = bytearray(first_wav[:44])
101 | audio_data = first_wav[44:]
102 |
103 | for wav_chunk in wav_chunks[1:]:
104 | if len(wav_chunk) > 44:
105 | audio_data += wav_chunk[44:]
106 |
107 | total_size = len(header) + len(audio_data) - 8
108 | header[4:8] = total_size.to_bytes(4, byteorder="little")
109 |
110 | data_size = len(audio_data)
111 | header[40:44] = data_size.to_bytes(4, byteorder="little")
112 |
113 | return bytes(header) + audio_data
114 | except Exception as exc:
115 | logger.error("Error in simple WAV concatenation: %s", exc)
116 | return b"".join(wav_chunks)
117 |
118 |
def combine_responses(responses: Sequence["TTSResponse"]) -> "TTSResponse":
    """Fold several ``TTSResponse`` objects into one.

    The audio of all responses is merged with ``combine_audio_chunks`` using
    the first response's format. Content type, format and base metadata are
    taken from the first response; ``chunks_combined`` and ``auto_combined``
    are added to the metadata.

    Args:
        responses: Responses to merge, in order.

    Returns:
        A new ``TTSResponse`` holding the merged audio.

    Raises:
        ValueError: If ``responses`` is empty.
    """
    parts = list(responses)
    if not parts:
        raise ValueError("No responses provided for combination")

    head = parts[0]
    audio_format = head.format

    merged_audio = combine_audio_chunks((part.audio_data for part in parts), audio_format.value)

    # Sum the known durations; fall back to the first response's value when
    # no response reports one.
    durations = [part.duration for part in parts]
    if any(value is not None for value in durations):
        duration = sum(value for value in durations if value)
    else:
        duration = head.duration

    metadata = dict(head.metadata or {})
    metadata["chunks_combined"] = len(parts)
    metadata["auto_combined"] = True

    return TTSResponse(
        audio_data=merged_audio,
        content_type=head.content_type,
        format=audio_format,
        size=len(merged_audio),
        duration=duration,
        metadata=metadata,
    )
151 |
--------------------------------------------------------------------------------
/scripts/test_websocket.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Test WebSocket connection to TTSFM server.
4 |
5 | This script tests the WebSocket functionality by connecting to the server
6 | and performing a simple TTS generation request.
7 | """
8 |
9 | import time
10 | import socketio
11 |
# Socket.IO client shared by every handler below; both the Socket.IO and
# Engine.IO loggers are switched on.
sio = socketio.Client(logger=True, engineio_logger=True)

# Module-level state written by the event handlers and polled by
# test_connection():
#   connected       - set True by the connect handler, False by the disconnect handler
#   stream_complete - set True by the stream_complete handler
#   chunks_received - incremented by the audio_chunk handler
connected = False
stream_complete = False
chunks_received = 0
19 |
20 |
@sio.on('connect')
def on_connect():
    """Record that the client is connected and print the session id."""
    global connected
    connected = True
    banner = '\n✅ Connected to WebSocket server!'
    print(banner)
    session_line = f'Session ID: {sio.sid}'
    print(session_line)
28 |
29 |
@sio.on('connected')
def on_session_ready(data):
    """Print the payload of the server's ``connected`` event."""
    message = f'\n✅ Session established: {data}'
    print(message)
34 |
35 |
@sio.on('disconnect')
def on_disconnect():
    """Record that the client is no longer connected and say so."""
    global connected
    connected = False
    notice = '\n❌ Disconnected from WebSocket server'
    print(notice)
42 |
43 |
@sio.on('connect_error')
def on_connect_error(data):
    """Print the details delivered with a ``connect_error`` event."""
    message = f'\n❌ Connection error: {data}'
    print(message)
48 |
49 |
@sio.on('pong')
def on_pong(data):
    """Print the payload of a ``pong`` event."""
    message = f'\n✅ Pong received: {data}'
    print(message)
54 |
55 |
@sio.on('stream_started')
def on_stream_started(data):
    """Print the payload of a ``stream_started`` event."""
    message = f'\n✅ Stream started: {data}'
    print(message)
60 |
61 |
@sio.on('stream_progress')
def on_stream_progress(data):
    """Print the progress percentage and status carried by the event.

    Missing keys fall back to ``0`` for progress and ``'unknown'`` for status.
    """
    print(
        f"📊 Progress: {data.get('progress', 0)}% - "
        f"Status: {data.get('status', 'unknown')}"
    )
68 |
69 |
@sio.on('audio_chunk')
def on_audio_chunk(data):
    """Count the received chunk and print its one-based position.

    Missing ``chunk_index`` / ``total_chunks`` keys are treated as ``0``.
    """
    global chunks_received
    chunks_received = chunks_received + 1

    # chunk_index is printed one-based, hence the + 1.
    position = data.get('chunk_index', 0) + 1
    total = data.get('total_chunks', 0)
    print(f'🎵 Received audio chunk {position}/{total}')
78 |
79 |
@sio.on('stream_complete')
def on_stream_complete(data):
    """Flag the stream as finished and print a short summary."""
    global stream_complete
    stream_complete = True
    summary = f'\n✅ Stream complete: {data}'
    print(summary)
    tally = f'Total chunks received: {chunks_received}'
    print(tally)
87 |
88 |
@sio.on('stream_error')
def on_stream_error(data):
    """Print the details delivered with a ``stream_error`` event."""
    message = f'\n❌ Stream error: {data}'
    print(message)
93 |
94 |
def test_connection(url='http://localhost:8000'):
    """Run the end-to-end WebSocket check against a TTSFM server.

    Connects, emits a ``ping``, requests a streamed TTS generation and waits
    for the ``stream_complete`` handler to set its flag.

    Args:
        url: Base URL of the server to connect to.

    Returns:
        True when the stream completed in time; False on connection timeout,
        stream timeout, or any exception raised along the way.
    """
    print(f'🔌 Connecting to {url}...')

    try:
        sio.connect(url, transports=['polling', 'websocket'])

        # Poll the flag set by the connect handler for up to 10 seconds.
        connect_timeout = 10
        began = time.time()
        while not connected and (time.time() - began) < connect_timeout:
            time.sleep(0.1)

        if not connected:
            print('❌ Failed to connect within timeout')
            return False

        # Ping/pong round trip; the reply is printed by the pong handler.
        print('\n📡 Testing ping/pong...')
        sio.emit('ping', {'timestamp': time.time()})
        time.sleep(1)

        # Ask the server for a streamed generation.
        print('\n🎤 Testing TTS generation...')
        sio.emit(
            'generate_stream',
            {
                'request_id': f'test_{int(time.time())}',
                'text': 'Hello, this is a WebSocket test!',
                'voice': 'alloy',
                'format': 'mp3',
                'chunk_size': 512,
            },
        )

        # Poll the flag set by the stream_complete handler for up to 30 seconds.
        stream_timeout = 30
        began = time.time()
        while not stream_complete and (time.time() - began) < stream_timeout:
            time.sleep(0.1)

        if not stream_complete:
            print('\n⚠️ Stream did not complete within timeout')
            return False

        print('\n✅ WebSocket test completed successfully!')
        return True

    except Exception as e:
        print(f'\n❌ Error during test: {e}')
        import traceback
        traceback.print_exc()
        return False

    finally:
        # Only disconnect when the connect handler reported a live session.
        if connected:
            print('\n🔌 Disconnecting...')
            sio.disconnect()
            time.sleep(1)
155 |
156 |
if __name__ == '__main__':
    import sys

    # First command-line argument overrides the default server URL.
    url = sys.argv[1] if sys.argv[1:] else 'http://localhost:8000'

    rule = '=' * 60
    print(rule)
    print('TTSFM WebSocket Connection Test')
    print(rule)

    success = test_connection(url)

    print('\n' + rule)
    print('✅ All tests passed!' if success else '❌ Some tests failed')
    # Exit status mirrors the test outcome: 0 on success, 1 on failure.
    sys.exit(0 if success else 1)
176 |
177 |
--------------------------------------------------------------------------------
/docs/docker-workflows.md:
--------------------------------------------------------------------------------
1 | # Docker Build Workflows
2 |
3 | ## Overview
4 |
5 | Starting with v3.4.0, TTSFM uses **separate GitHub Actions workflows** for building the full and slim Docker image variants. This provides better clarity, easier debugging, and independent execution.
6 |
7 | ## Workflow Files
8 |
9 | ### 1. `.github/workflows/docker-build-full.yml`
10 |
11 | **Purpose**: Builds the full variant with ffmpeg support
12 |
13 | **Triggers**:
14 | - Push to `main` branch
15 | - Pull requests to `main` branch
16 | - Release published
17 |
18 | **Image Tags** (on release):
19 | - `dbcccc/ttsfm:vX.X.X`
20 | - `dbcccc/ttsfm:latest` (only for stable releases, not pre-releases)
21 | - `ghcr.io/dbccccccc/ttsfm:vX.X.X`
22 | - `ghcr.io/dbccccccc/ttsfm:latest` (only for stable releases)
23 |
24 | **Features**:
25 | - ✅ ffmpeg included
26 | - ✅ MP3 auto-combine
27 | - ✅ Speed adjustment (0.25x - 4.0x)
28 | - ✅ Format conversion
29 | - ✅ Multi-platform builds (linux/amd64, linux/arm64)
30 | - ✅ Smoke test on PR/push
31 | - ✅ GitHub Actions cache (scope: `full`)
32 |
33 | ---
34 |
35 | ### 2. `.github/workflows/docker-build-slim.yml`
36 |
37 | **Purpose**: Builds the slim variant without ffmpeg
38 |
39 | **Triggers**:
40 | - Push to `main` branch
41 | - Pull requests to `main` branch
42 | - Release published
43 |
44 | **Image Tags** (on release):
45 | - `dbcccc/ttsfm:vX.X.X-slim`
46 | - `dbcccc/ttsfm:vX.X-slim` (only for stable releases, not pre-releases)
47 | - `ghcr.io/dbccccccc/ttsfm:vX.X.X-slim`
48 | - `ghcr.io/dbccccccc/ttsfm:vX.X-slim` (only for stable releases)
49 |
50 | **Features**:
51 | - ✅ No ffmpeg (smaller image)
52 | - ✅ Basic TTS (MP3/WAV)
53 | - ✅ WAV auto-combine (simple concatenation)
54 | - ❌ No MP3 auto-combine
55 | - ❌ No speed adjustment
56 | - ❌ No format conversion
57 | - ✅ Multi-platform builds (linux/amd64, linux/arm64)
58 | - ✅ Smoke test on PR/push (port 8001)
59 | - ✅ GitHub Actions cache (scope: `slim`)
60 |
61 | ---
62 |
63 | ## Build Behavior
64 |
65 | ### On Pull Request or Push to Main
66 |
67 | Both workflows run in parallel:
68 | - Build for `linux/amd64` only (faster)
69 | - Images are **not pushed** to registries
70 | - Images are loaded locally for smoke testing
71 | - Temporary tags: `ghcr.io/dbccccccc/ttsfm:ci-{RUN_ID}-full` and `ci-{RUN_ID}-slim`
72 |
73 | ### On Release Published
74 |
75 | Both workflows run in parallel:
76 | - Build for `linux/amd64` and `linux/arm64` (multi-platform)
77 | - Images are **pushed** to Docker Hub and GitHub Container Registry
78 | - No local loading (images go directly to registries)
79 | - Production tags based on release version
80 |
81 | ### Pre-release vs Stable Release
82 |
83 | **Pre-release** (e.g., `v3.4.0-alpha1`):
84 | - Full variant: `vX.X.X` only (no `latest` tag)
85 | - Slim variant: `vX.X.X-slim` only (no `vX.X-slim` tag)
86 |
87 | **Stable release** (e.g., `v3.4.0`):
88 | - Full variant: `vX.X.X` + `latest`
89 | - Slim variant: `vX.X.X-slim` + `vX.X-slim`
90 |
91 | ---
92 |
93 | ## Advantages of Separate Workflows
94 |
95 | 1. **Clarity**: Each workflow has a single, clear purpose
96 | 2. **Easier debugging**: When a build fails, you immediately know which variant failed
97 | 3. **Independent execution**: Can trigger/retry builds independently
98 | 4. **Simpler logic**: No complex conditionals or fallback logic
99 | 5. **Better visibility**: GitHub Actions UI shows them as separate jobs
100 | 6. **Parallel execution**: Both variants build truly in parallel
101 | 7. **Independent caching**: Each variant has its own cache scope
102 |
103 | ---
104 |
105 | ## Monitoring Builds
106 |
107 | ### GitHub Actions UI
108 |
109 | When you create a release, you'll see **two separate workflow runs**:
110 | - ✅ Docker Build and Push (Full)
111 | - ✅ Docker Build and Push (Slim)
112 |
113 | Each can succeed or fail independently.
114 |
115 | ### Checking Build Status
116 |
117 | **Via GitHub UI**:
118 | 1. Go to repository → Actions tab
119 | 2. Look for the two workflow runs
120 | 3. Click on each to see detailed logs
121 |
**Via GitHub CLI**:
123 | ```bash
124 | # Check latest workflow runs
125 | gh run list --workflow=docker-build-full.yml
126 | gh run list --workflow=docker-build-slim.yml
127 | ```
128 |
129 | ---
130 |
131 | ## Troubleshooting
132 |
133 | ### Slim variant not building
134 |
135 | 1. Check if the workflow file exists: `.github/workflows/docker-build-slim.yml`
136 | 2. Check the Actions tab for the "Docker Build and Push (Slim)" workflow
137 | 3. Look for error messages in the workflow logs
138 | 4. Verify Docker Hub and GitHub Container Registry credentials
139 |
140 | ### Images not pushed to registry
141 |
142 | 1. Verify the event is a "release published" (not draft)
143 | 2. Check Docker Hub credentials in repository secrets:
144 | - `DOCKERHUB_USERNAME`
145 | - `DOCKERHUB_TOKEN`
146 | 3. Check GitHub Container Registry permissions (automatic via `GITHUB_TOKEN`)
147 |
148 | ### Smoke test failing
149 |
150 | 1. Check the smoke test logs in the workflow run
151 | 2. Verify the health endpoint is working: `/api/health`
152 | 3. For slim variant, ensure it's using port 8001 (not 8000)
153 |
154 | ---
155 |
156 | ## Future Enhancements
157 |
158 | Potential improvements for the workflows:
159 |
160 | 1. **Matrix builds**: Use a single workflow with matrix strategy
161 | 2. **Reusable workflows**: Extract common steps into a reusable workflow
162 | 3. **Build notifications**: Send notifications on build success/failure
163 | 4. **Image scanning**: Add security scanning with Trivy or Snyk
164 | 5. **Performance metrics**: Track and report build times and image sizes
165 |
166 |
--------------------------------------------------------------------------------
/docs/v3.4-dual-image-implementation.md:
--------------------------------------------------------------------------------
1 | # TTSFM v3.4.x Dual-Image Implementation
2 |
3 | ## Overview
4 |
5 | Starting with v3.4.0-alpha1, TTSFM provides two Docker image variants to balance functionality and image size:
6 |
7 | 1. **Full variant** (`dbcccc/ttsfm:latest`, `dbcccc/ttsfm:v3.4.0-alpha1`)
8 | - Includes ffmpeg for advanced audio processing
9 | - Supports all features including speed adjustment and format conversion
10 |
11 | 2. **Slim variant** (`dbcccc/ttsfm:v3.4.0-alpha1-slim`)
12 | - Minimal image without ffmpeg
13 | - Basic TTS functionality only
14 |
15 | ## Implementation Details
16 |
17 | ### 1. Dockerfile Changes
18 |
19 | The Dockerfile now accepts a `VARIANT` build argument:
20 |
21 | ```dockerfile
22 | ARG VARIANT=full # Can be 'full' or 'slim'
23 | ```
24 |
25 | - **Full variant**: Installs ffmpeg in the runtime stage
26 | - **Slim variant**: Skips ffmpeg installation
27 |
28 | ### 2. GitHub Actions Workflow
29 |
30 | Two workflows, `.github/workflows/docker-build-full.yml` and `.github/workflows/docker-build-slim.yml`, build the variants:
31 |
32 | - **Full image tags**: `vX.X.X`, `latest`
33 | - **Slim image tags**: `vX.X.X-slim`
34 |
35 | Both variants are built for `linux/amd64` and `linux/arm64` platforms on release.
36 |
37 | ### 3. Runtime Feature Detection
38 |
39 | `ttsfm/audio.py` now includes runtime detection:
40 |
41 | ```python
42 | import shutil
43 | FFMPEG_AVAILABLE = shutil.which("ffmpeg") is not None
44 | ```
45 |
46 | Functions that require ffmpeg provide helpful error messages when it's not available.
47 |
48 | ### 4. Speed Adjustment Feature
49 |
50 | New module `ttsfm/audio_processing.py` provides:
51 |
52 | - `adjust_audio_speed()`: Adjust playback speed using ffmpeg (0.25x - 4.0x)
53 | - `convert_audio_format()`: Convert between audio formats using ffmpeg
54 |
55 | Both sync (`TTSClient`) and async (`AsyncTTSClient`) clients now support the `speed` parameter:
56 |
57 | ```python
58 | response = client.generate_speech(
59 | text="Hello!",
60 | voice=Voice.ALLOY,
61 | speed=1.5, # 1.5x faster
62 | )
63 | ```
64 |
65 | Speed adjustment is applied post-generation using ffmpeg's `atempo` filter.
66 |
67 | ## Feature Matrix
68 |
69 | | Feature | Full Image | Slim Image | Python Package |
70 | |---------|-----------|------------|----------------|
71 | | Basic TTS (MP3/WAV) | ✅ | ✅ | ✅ |
72 | | WAV auto-combine | ✅ | ✅ (simple) | ✅ (simple) |
73 | | MP3 auto-combine | ✅ | ❌ | ✅ (with pydub) |
74 | | Speed adjustment | ✅ | ❌ | ✅ (with ffmpeg) |
75 | | Format conversion | ✅ | ❌ | ✅ (with ffmpeg) |
76 |
77 | ## Usage Examples
78 |
79 | ### Full Image (Recommended)
80 |
81 | ```bash
82 | # Pull and run full image
83 | docker run -p 8000:8000 dbcccc/ttsfm:latest
84 |
85 | # Use speed adjustment
86 | curl -X POST http://localhost:8000/v1/audio/speech \
87 | -H "Content-Type: application/json" \
88 | -d '{"input":"Hello!","voice":"alloy","speed":1.5}' \
89 | --output fast.mp3
90 | ```
91 |
92 | ### Slim Image (Minimal)
93 |
94 | ```bash
95 | # Pull and run slim image
96 | docker run -p 8000:8000 dbcccc/ttsfm:v3.4.0-alpha1-slim
97 |
98 | # Basic TTS works fine
99 | curl -X POST http://localhost:8000/v1/audio/speech \
100 | -H "Content-Type: application/json" \
101 | -d '{"input":"Hello!","voice":"alloy"}' \
102 | --output speech.mp3
103 |
104 | # Speed parameter will be ignored (no error, just logged warning)
105 | ```
106 |
107 | ### Python Package
108 |
109 | ```python
110 | from ttsfm import TTSClient, Voice
111 |
112 | client = TTSClient()
113 |
114 | # Speed adjustment requires ffmpeg installed on system
115 | response = client.generate_speech(
116 | text="This will be faster!",
117 | voice=Voice.NOVA,
118 | speed=1.5,
119 | )
120 | response.save_to_file("fast.mp3")
121 | ```
122 |
123 | ## Error Handling
124 |
125 | When ffmpeg-dependent features are used without ffmpeg:
126 |
127 | ```python
128 | # Graceful degradation with helpful error messages
129 | RuntimeError: "Speed adjustment requires ffmpeg.
130 | Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
131 | ```
132 |
133 | ## Migration Guide
134 |
135 | ### From v3.3.x to v3.4.x
136 |
137 | **No breaking changes** - existing code continues to work:
138 |
139 | 1. **Docker users**:
140 | - `dbcccc/ttsfm:latest` now includes speed adjustment
141 | - Use `dbcccc/ttsfm:v3.4.0-alpha1-slim` for minimal image
142 |
143 | 2. **Python package users**:
144 | - Speed parameter now functional (requires ffmpeg)
145 | - Install ffmpeg: `apt-get install ffmpeg` (Linux) or `brew install ffmpeg` (Mac)
146 |
147 | 3. **API users**:
148 | - Speed parameter now works in `/v1/audio/speech` endpoint
149 | - Response metadata includes `speed_applied: true/false`
150 |
151 | ## Technical Notes
152 |
153 | ### Speed Adjustment Implementation
154 |
155 | - Uses ffmpeg's `atempo` filter for speed adjustment
156 | - Supports 0.25x to 4.0x range (OpenAI TTS API compatible)
157 | - Chains multiple `atempo` filters for speeds outside 0.5-2.0 range
158 | - Adjusts estimated duration based on speed multiplier
159 | - Runs in thread pool for async client to avoid blocking
160 |
161 | ### Build Optimization
162 |
163 | - Shared builder stage for both variants
164 | - Separate cache scopes (`scope=full`, `scope=slim`) for efficient caching
165 | - Multi-platform builds only on release (saves CI time)
166 |
167 | ## Future Enhancements
168 |
169 | Potential additions for future versions:
170 |
171 | 1. **Additional format support**: Real AAC, FLAC, OPUS output (currently mapped to WAV)
172 | 2. **Audio effects**: Pitch adjustment, noise reduction
173 | 3. **Streaming support**: Real-time audio streaming with speed adjustment
174 | 4. **Ultra-slim variant**: Alpine-based image (~50MB) with no Python web server
175 |
176 | ## References
177 |
178 | - [OpenAI TTS API Documentation](https://platform.openai.com/docs/guides/text-to-speech)
179 | - [ffmpeg atempo filter](https://ffmpeg.org/ffmpeg-filters.html#atempo)
180 | - [Docker multi-stage builds](https://docs.docker.com/build/building/multi-stage/)
181 |
182 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TTSFM - Text-to-Speech API Client
2 |
3 | > **⚠️ NOTICE: This project is no longer functional as the openai.fm demo website has been shut down.**
4 |
5 | > **Language / 语言**: [English](README.md) | [中文](README.zh.md)
6 |
7 | [](https://hub.docker.com/r/dbcccc/ttsfm)
8 | [](https://github.com/dbccccccc/ttsfm)
9 | [](https://opensource.org/licenses/MIT)
10 | 
11 |
12 | ## Star History
13 |
14 | [](https://www.star-history.com/#dbccccccc/ttsfm&Date)
15 |
16 | ## Overview
17 |
18 | TTSFM is a free, OpenAI-compatible text-to-speech API service that provides a complete solution for converting text to natural-sounding speech based on OpenAI's GPT-4o mini TTS. Built on top of the openai.fm backend, it offers a powerful Python SDK, RESTful API endpoints, and an intuitive web playground for easy testing and integration.
19 |
20 | **What TTSFM Can Do:**
21 | - 🎤 **Multiple Voices**: Choose from 11 OpenAI-compatible voices (alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse)
22 | - 🎵 **Flexible Audio Formats**: Support for 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM)
23 | - ⚡ **Speed Control**: Adjust playback speed from 0.25x to 4.0x for different use cases
24 | - 📝 **Long Text Support**: Automatic text splitting and audio combining for content of any length
25 | - 🔄 **Real-time Streaming**: WebSocket support for streaming audio generation
26 | - 🐍 **Python SDK**: Easy-to-use synchronous and asynchronous clients
27 | - 🌐 **Web Playground**: Interactive web interface for testing and experimentation
28 | - 🐳 **Docker Ready**: Pre-built Docker images for instant deployment
29 | - 🔍 **Smart Detection**: Automatic capability detection and helpful error messages
30 | - 🤖 **OpenAI Compatible**: Drop-in replacement for OpenAI's TTS API
31 |
32 | **Key Features in v3.4.0:**
33 | - 🎯 Image variant detection (full vs slim Docker images)
34 | - 🔍 Runtime capabilities API for feature availability checking
35 | - ⚡ Speed adjustment with ffmpeg-based audio processing
36 | - 🎵 Real format conversion for all 6 audio formats
37 | - 📊 Enhanced error handling with clear, actionable messages
38 | - 🐳 Dual Docker images optimized for different use cases
39 |
40 | > **⚠️ Disclaimer**: This project is intended for **educational and research purposes only**. It is a reverse-engineered implementation of the openai.fm service and should not be used for commercial purposes or in production environments. Users are responsible for ensuring compliance with applicable laws and terms of service.
41 |
42 | ## Installation
43 |
44 | ### Python package
45 |
46 | ```bash
47 | pip install ttsfm # core client
48 | pip install ttsfm[web] # core client + web/server dependencies
49 | ```
50 |
51 | ### Docker image
52 |
53 | TTSFM offers two Docker image variants to suit different needs:
54 |
55 | #### Full variant (recommended)
56 | ```bash
57 | docker run -p 8000:8000 dbcccc/ttsfm:latest
58 | ```
59 |
60 | **Includes ffmpeg for advanced features:**
61 | - ✅ All 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM)
62 | - ✅ Speed adjustment (0.25x - 4.0x)
63 | - ✅ Format conversion with ffmpeg
64 | - ✅ MP3 auto-combine for long text
65 | - ✅ WAV auto-combine for long text
66 |
67 | #### Slim variant - ~100MB
68 | ```bash
69 | docker run -p 8000:8000 dbcccc/ttsfm:slim
70 | ```
71 |
72 | **Minimal image without ffmpeg:**
73 | - ✅ Basic TTS functionality
74 | - ✅ 2 audio formats (MP3, WAV only)
75 | - ✅ WAV auto-combine for long text
76 | - ❌ No speed adjustment
77 | - ❌ No format conversion
78 | - ❌ No MP3 auto-combine
79 |
80 | The container exposes the web playground at `http://localhost:8000` and an OpenAI-compatible endpoint at `/v1/audio/speech`.
81 |
82 | **Check available features:**
83 | ```bash
84 | curl http://localhost:8000/api/capabilities
85 | ```
86 |
87 | ## Quick start
88 |
89 | ### Python client
90 |
91 | ```python
92 | from ttsfm import TTSClient, AudioFormat, Voice
93 |
94 | client = TTSClient()
95 |
96 | # Basic usage
97 | response = client.generate_speech(
98 | text="Hello from TTSFM!",
99 | voice=Voice.ALLOY,
100 | response_format=AudioFormat.MP3,
101 | )
102 | response.save_to_file("hello") # -> hello.mp3
103 |
104 | # With speed adjustment (requires ffmpeg)
105 | response = client.generate_speech(
106 | text="This will be faster!",
107 | voice=Voice.NOVA,
108 | response_format=AudioFormat.MP3,
109 | speed=1.5, # 1.5x speed (0.25 - 4.0)
110 | )
111 | response.save_to_file("fast") # -> fast.mp3
112 | ```
113 |
114 | ### CLI
115 |
116 | ```bash
117 | ttsfm "Hello, world" --voice nova --format mp3 --output hello.mp3
118 | ```
119 |
120 | ### REST API (OpenAI-compatible)
121 |
122 | ```bash
123 | # Basic request
124 | curl -X POST http://localhost:8000/v1/audio/speech \
125 | -H "Content-Type: application/json" \
126 | -d '{
127 | "model": "tts-1",
128 | "input": "Hello world!",
129 | "voice": "alloy",
130 | "response_format": "mp3"
131 | }' --output speech.mp3
132 |
133 | # With speed adjustment (requires full image)
134 | curl -X POST http://localhost:8000/v1/audio/speech \
135 | -H "Content-Type: application/json" \
136 | -d '{
137 | "model": "tts-1",
138 | "input": "Hello world!",
139 | "voice": "alloy",
140 | "response_format": "mp3",
141 | "speed": 1.5
142 | }' --output speech_fast.mp3
143 | ```
144 |
145 | - **Available voices:** alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse
146 | - **Available formats:** mp3, wav (always) + opus, aac, flac, pcm (full image only)
147 | - **Speed range:** 0.25 - 4.0 (requires full image)
148 |
149 | ## Learn more
150 |
151 | - Browse the full API reference and operational notes in the [web documentation](http://localhost:8000/docs) (or see `ttsfm-web/templates/docs.html`).
152 | - Read the [architecture overview](docs/architecture.md) for component diagrams.
153 | - Contributions are welcome—see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
154 |
155 | ## License
156 |
157 | TTSFM is released under the [MIT License](LICENSE).
158 |
--------------------------------------------------------------------------------
/docs/websocket-streaming.md:
--------------------------------------------------------------------------------
1 | # 🚀 WebSocket Streaming for TTSFM
2 |
3 | Real-time audio streaming for text-to-speech generation using WebSockets.
4 |
5 | ## Overview
6 |
7 | The WebSocket streaming feature provides:
8 | - **Real-time audio chunk delivery** as they're generated
9 | - **Progress tracking** with live updates
10 | - **Lower perceived latency** - start receiving audio before generation completes
11 | - **Cancellable operations** - stop mid-generation if needed
12 |
13 | ## Quick Start
14 |
15 | ### 1. Docker Deployment (Recommended)
16 |
17 | ```bash
18 | # Build with WebSocket support
19 | docker build -t ttsfm-websocket .
20 |
21 | # Run with WebSocket enabled
22 | docker run -p 8000:8000 \
23 | -e DEBUG=false \
24 | ttsfm-websocket
25 | ```
26 |
27 | ### 2. Test WebSocket Connection
28 |
29 | Visit `http://localhost:8000/websocket-demo` for an interactive demo.
30 |
31 | ### 3. Client Usage
32 |
33 | ```javascript
34 | // Initialize WebSocket client
35 | const client = new WebSocketTTSClient({
36 | socketUrl: 'http://localhost:8000',
37 | debug: true
38 | });
39 |
40 | // Generate speech with streaming
41 | const result = await client.generateSpeech('Hello, WebSocket world!', {
42 | voice: 'alloy',
43 | format: 'mp3',
44 | onProgress: (progress) => {
45 | console.log(`Progress: ${progress.progress}%`);
46 | },
47 | onChunk: (chunk) => {
48 | console.log(`Received chunk ${chunk.chunkIndex + 1}`);
49 | // Process audio chunk in real-time
50 | },
51 | onComplete: (result) => {
52 | console.log('Generation complete!');
53 | // Play or download the combined audio
54 | }
55 | });
56 | ```
57 |
58 | ## API Reference
59 |
60 | ### WebSocket Events
61 |
62 | #### Client → Server
63 |
64 | **`generate_stream`**
65 | ```javascript
66 | {
67 | text: string, // Text to convert
68 | voice: string, // Voice ID (alloy, echo, etc.)
69 | format: string, // Audio format (mp3, wav, opus)
70 | chunk_size: number // Optional, default 1024
71 | }
72 | ```
73 |
74 | **`cancel_stream`**
75 | ```javascript
76 | {
77 | request_id: string // Request ID to cancel
78 | }
79 | ```
80 |
81 | #### Server → Client
82 |
83 | **`stream_started`**
84 | ```javascript
85 | {
86 | request_id: string,
87 | timestamp: number
88 | }
89 | ```
90 |
91 | **`audio_chunk`**
92 | ```javascript
93 | {
94 | request_id: string,
95 | chunk_index: number,
96 | total_chunks: number,
97 | audio_data: string, // Hex-encoded audio data
98 | format: string,
99 | duration: number,
100 | generation_time: number,
101 | chunk_text: string // Preview of chunk text
102 | }
103 | ```
104 |
105 | **`stream_progress`**
106 | ```javascript
107 | {
108 | request_id: string,
109 | progress: number, // 0-100
110 | total_chunks: number,
111 | chunks_completed: number,
112 | status: string
113 | }
114 | ```
115 |
116 | **`stream_complete`**
117 | ```javascript
118 | {
119 | request_id: string,
120 | total_chunks: number,
121 | status: 'completed',
122 | timestamp: number
123 | }
124 | ```
125 |
126 | **`stream_error`**
127 | ```javascript
128 | {
129 | request_id: string,
130 | error: string,
131 | timestamp: number
132 | }
133 | ```
134 |
135 | ## Performance Considerations
136 |
137 | 1. **Chunk Size**: Smaller chunks (512-1024 chars) provide more frequent updates but increase overhead
138 | 2. **Network Latency**: WebSocket reduces latency compared to HTTP polling
139 | 3. **Audio Buffering**: Client should buffer chunks for smooth playback
140 | 4. **Concurrent Streams**: Server supports multiple concurrent streaming sessions
141 |
142 | ## Browser Support
143 |
144 | - Chrome/Edge: Full support
145 | - Firefox: Full support
146 | - Safari: Full support (iOS 11.3+)
147 | - IE11: Not supported (use polling fallback)
148 |
149 | ## Troubleshooting
150 |
151 | ### Connection Issues
152 | ```javascript
153 | // Check WebSocket status
154 | fetch('/api/websocket/status')
155 | .then(res => res.json())
156 | .then(data => console.log('WebSocket status:', data));
157 | ```
158 |
159 | ### Debug Mode
160 | ```javascript
161 | const client = new WebSocketTTSClient({
162 | debug: true // Enable console logging
163 | });
164 | ```
165 |
166 | ### Common Issues
167 |
168 | 1. **"WebSocket connection failed"**
169 | - Check if port 8000 is accessible
170 | - Ensure eventlet is installed: `pip install "eventlet>=0.33.3"` (quote the specifier so the shell does not treat `>` as a redirect)
171 | - Try polling transport as fallback
172 |
173 | 2. **"Chunks arriving out of order"**
174 | - Client automatically sorts chunks by index
175 | - Check network stability
176 |
177 | 3. **"Audio playback stuttering"**
178 | - Increase chunk size for better buffering
179 | - Check client-side audio buffer implementation
180 |
181 | ## Advanced Usage
182 |
183 | ### Custom Chunk Processing
184 | ```javascript
185 | client.generateSpeech(text, {
186 | onChunk: async (chunk) => {
187 | // Custom processing per chunk
188 | const processed = await processAudioChunk(chunk.audioData);
189 | audioQueue.push(processed);
190 |
191 | // Start playback after first chunk
192 | if (chunk.chunkIndex === 0) {
193 | startStreamingPlayback(audioQueue);
194 | }
195 | }
196 | });
197 | ```
198 |
199 | ### Progress Visualization
200 | ```javascript
201 | client.generateSpeech(text, {
202 | onProgress: (progress) => {
203 | // Update UI progress bar
204 | progressBar.style.width = `${progress.progress}%`;
205 | statusText.textContent = `Processing chunk ${progress.chunksCompleted}/${progress.totalChunks}`;
206 | }
207 | });
208 | ```
209 |
210 | ## Security
211 |
212 | - WebSocket connections respect API key authentication if enabled
213 | - CORS is configured for cross-origin requests
214 | - SSL/TLS recommended for production deployments
215 |
216 | ## Deployment Notes
217 |
218 | For production deployment with your existing setup:
219 |
220 | ```bash
221 | # Build new image with WebSocket support
222 | docker build -t ttsfm-websocket:latest .
223 |
224 | # Deploy to your server (e.g. 192.168.1.150)
225 | docker stop ttsfm-container
226 | docker rm ttsfm-container
227 | docker run -d \
228 | --name ttsfm-container \
229 | -p 8000:8000 \
230 | -e REQUIRE_API_KEY=true \
231 | -e TTSFM_API_KEY=your-secret-key \
232 | -e DEBUG=false \
233 | ttsfm-websocket:latest
234 | ```
235 |
236 | ## Performance Metrics
237 |
238 | Based on testing with openai.fm backend:
239 | - First chunk delivery: ~0.5-1s
240 | - Streaming overhead: ~10-15% vs batch processing
241 | - Concurrent connections: 100+ (limited by server resources)
242 | - Memory usage: ~50MB per active stream
243 |
244 | *Built by a grumpy senior engineer who thinks HTTP was good enough*
--------------------------------------------------------------------------------
/ttsfm/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | TTSFM - Text-to-Speech for Free using OpenAI.fm
3 |
4 | A Python library for generating high-quality text-to-speech audio using the free OpenAI.fm service.
5 | Supports multiple voices and audio formats with a simple, intuitive API.
6 |
7 | Example:
8 | >>> from ttsfm import TTSClient, Voice, AudioFormat
9 | >>>
10 | >>> client = TTSClient()
11 | >>>
12 | >>> # Generate MP3 audio
13 | >>> mp3_response = client.generate_speech(
14 | ... text="Hello, world!",
15 | ... voice=Voice.ALLOY,
16 | ... response_format=AudioFormat.MP3
17 | ... )
18 | >>> mp3_response.save_to_file("hello") # Saves as hello.mp3
19 | >>>
20 | >>> # Generate WAV audio
21 | >>> wav_response = client.generate_speech(
22 | ... text="High quality audio",
23 | ... voice=Voice.NOVA,
24 | ... response_format=AudioFormat.WAV
25 | ... )
26 | >>> wav_response.save_to_file("audio") # Saves as audio.wav
27 | >>>
28 | >>> # Generate OPUS audio
29 | >>> opus_response = client.generate_speech(
30 | ... text="Compressed audio",
31 | ... voice=Voice.ECHO,
32 | ... response_format=AudioFormat.OPUS
33 | ... )
34 | >>> opus_response.save_to_file("compressed") # Saves as compressed.wav
35 | """
36 |
37 | from typing import Optional
38 |
39 | from .async_client import AsyncTTSClient
40 | from .audio import combine_audio_chunks, combine_responses
41 | from .client import TTSClient
42 | from .exceptions import (
43 | APIException,
44 | AudioProcessingException,
45 | AuthenticationException,
46 | NetworkException,
47 | QuotaExceededException,
48 | RateLimitException,
49 | ServiceUnavailableException,
50 | TTSException,
51 | ValidationException,
52 | )
53 | from .models import (
54 | APIError,
55 | AudioFormat,
56 | NetworkError,
57 | TTSError,
58 | TTSRequest,
59 | TTSResponse,
60 | ValidationError,
61 | Voice,
62 | )
63 | from .utils import split_text_by_length, validate_text_length
64 |
65 | __version__ = "3.4.2"
66 | __author__ = "dbcccc"
67 | __email__ = "120614547+dbccccccc@users.noreply.github.com"
68 | __description__ = "Text-to-Speech API Client with OpenAI compatibility"
69 | __url__ = "https://github.com/dbccccccc/ttsfm"
70 |
71 | # Default client instance for convenience
72 | default_client = None
73 |
74 |
def create_client(base_url: Optional[str] = None, api_key: Optional[str] = None, **kwargs) -> TTSClient:  # type: ignore[misc]
    """
    Build a new synchronous TTS client from optional settings.

    Args:
        base_url: Base URL for the TTS service; only forwarded when not None
        api_key: API key for authentication; only forwarded when not None
        **kwargs: Any further keyword arguments, passed through to TTSClient

    Returns:
        TTSClient: The newly constructed client
    """
    explicit = {"base_url": base_url, "api_key": api_key}
    # Arguments left as None are omitted from the constructor call entirely.
    provided = {name: value for name, value in explicit.items() if value is not None}
    return TTSClient(**{**kwargs, **provided})
93 |
94 |
def create_async_client(base_url: Optional[str] = None, api_key: Optional[str] = None, **kwargs) -> AsyncTTSClient:  # type: ignore[misc]
    """
    Build a new asynchronous TTS client from optional settings.

    Args:
        base_url: Base URL for the TTS service; only forwarded when not None
        api_key: API key for authentication; only forwarded when not None
        **kwargs: Any further keyword arguments, passed through to AsyncTTSClient

    Returns:
        AsyncTTSClient: The newly constructed async client
    """
    explicit = {"base_url": base_url, "api_key": api_key}
    # Arguments left as None are omitted from the constructor call entirely.
    provided = {name: value for name, value in explicit.items() if value is not None}
    return AsyncTTSClient(**{**kwargs, **provided})
113 |
114 |
def set_default_client(client: TTSClient) -> None:
    """
    Set the default client instance for convenience functions.

    Rebinds the module-level ``default_client`` variable, which the
    module-level ``generate_speech`` and ``generate_speech_long_text``
    helpers read on every call.

    Args:
        client: Client instance to store as the module-wide default
    """
    # Module-level state: the assignment below replaces any previous default.
    global default_client
    default_client = client
119 |
120 |
def generate_speech(text: str, voice: str = "alloy", **kwargs) -> TTSResponse:  # type: ignore[misc]
    """
    Convenience function to generate speech using the default client.

    The default client is the module-level ``default_client``, which is only
    assigned by ``set_default_client()``. ``create_client()`` returns a new
    client but does not register it as the default.

    Args:
        text: Text to convert to speech
        voice: Voice to use for generation
        **kwargs: Additional generation parameters, forwarded unchanged to
            the default client's ``generate_speech`` method

    Returns:
        TTSResponse: Generated audio response

    Raises:
        TTSException: If no default client is set. Errors raised by the
            client itself are not caught here and propagate to the caller.
    """
    if default_client is None:
        # create_client() alone never assigns default_client, so the message
        # has to name set_default_client() for the hint to be actionable.
        raise TTSException(
            "No default client set. Call set_default_client(create_client()) first."
        )

    return default_client.generate_speech(text=text, voice=voice, **kwargs)
140 |
141 |
def generate_speech_long_text(text: str, voice: str = "alloy", **kwargs):  # type: ignore[no-untyped-def]
    """
    Convenience function to generate speech from long text using the default client.

    Delegates to the default client's ``generate_speech_long_text`` method,
    which is documented as splitting long text into chunks and generating
    speech for each chunk. The default client is the module-level
    ``default_client``, which is only assigned by ``set_default_client()``;
    ``create_client()`` does not register the client it returns.

    Args:
        text: Text to convert to speech (can be longer than 1000 characters)
        voice: Voice to use for generation
        **kwargs: Additional generation parameters (max_length, preserve_words, etc.),
            forwarded unchanged to the default client

    Returns:
        The value returned by the default client's ``generate_speech_long_text``
        (documented as a list of TTSResponse objects, one per chunk)

    Raises:
        TTSException: If no default client is set. Errors raised by the
            client itself are not caught here and propagate to the caller.
    """
    if default_client is None:
        # create_client() alone never assigns default_client, so the message
        # has to name set_default_client() for the hint to be actionable.
        raise TTSException(
            "No default client set. Call set_default_client(create_client()) first."
        )

    return default_client.generate_speech_long_text(text=text, voice=voice, **kwargs)
163 |
164 |
165 | # Export all public components
166 | __all__ = [
167 | # Main classes
168 | "TTSClient",
169 | "AsyncTTSClient",
170 | # Models
171 | "TTSRequest",
172 | "TTSResponse",
173 | "Voice",
174 | "AudioFormat",
175 | "TTSError",
176 | "APIError",
177 | "NetworkError",
178 | "ValidationError",
179 | # Exceptions
180 | "TTSException",
181 | "APIException",
182 | "NetworkException",
183 | "ValidationException",
184 | "RateLimitException",
185 | "AuthenticationException",
186 | "ServiceUnavailableException",
187 | "QuotaExceededException",
188 | "AudioProcessingException",
189 | # Factory functions
190 | "create_client",
191 | "create_async_client",
192 | "set_default_client",
193 | "generate_speech",
194 | "generate_speech_long_text",
195 | # Utility functions
196 | "validate_text_length",
197 | "split_text_by_length",
198 | "combine_audio_chunks",
199 | "combine_responses",
200 | # Package metadata
201 | "__version__",
202 | "__author__",
203 | "__email__",
204 | "__description__",
205 | "__url__",
206 | ]
207 |
--------------------------------------------------------------------------------
/.github/workflows/docker-build-full.yml:
--------------------------------------------------------------------------------
1 | name: Docker Build and Push (Full)
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 | release:
9 | types: [published]
10 |
11 | env:
12 | REGISTRY_DOCKERHUB: docker.io
13 | REGISTRY_GHCR: ghcr.io
14 | IMAGE_NAME: ${{ github.repository }}
15 | DOCKERHUB_NAMESPACE: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_USERNAME || github.repository_owner }}
16 |
17 | jobs:
18 | build-and-push-full:
19 | runs-on: ubuntu-latest
20 | permissions:
21 | contents: read
22 | packages: write
23 | steps:
24 | - name: Checkout repository
25 | uses: actions/checkout@v4
26 |
27 | - name: Determine build settings
28 | id: build-config
29 | env:
30 | EVENT_NAME: ${{ github.event_name }}
31 | EVENT_ACTION: ${{ github.event.action }}
32 | run: |
33 | if [ "$EVENT_NAME" = "release" ] && [ "$EVENT_ACTION" = "published" ]; then
34 | echo "push=true" >> "$GITHUB_OUTPUT"
35 | echo "platforms=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
36 | echo "load=false" >> "$GITHUB_OUTPUT"
37 | else
38 | echo "push=false" >> "$GITHUB_OUTPUT"
39 | echo "platforms=linux/amd64" >> "$GITHUB_OUTPUT"
40 | echo "load=true" >> "$GITHUB_OUTPUT"
41 | fi
42 |
43 | - name: Derive image version
44 | id: version
45 | env:
46 | EVENT_NAME: ${{ github.event_name }}
47 | TAG_NAME: ${{ github.event.release.tag_name }}
48 | REF_NAME: ${{ github.ref_name }}
49 | GITHUB_SHA: ${{ github.sha }}
50 | run: |
51 | version=""
52 | if [ "$EVENT_NAME" = "release" ] && [ -n "$TAG_NAME" ]; then
53 | version="$TAG_NAME"
54 | elif [ -n "$REF_NAME" ]; then
55 | version="$REF_NAME"
56 | fi
57 | version="${version##*/}"
58 | if [ "${version#v}" != "$version" ]; then
59 | version="${version#v}"
60 | fi
61 | if [ -z "$version" ]; then
62 | version="${GITHUB_SHA:0:12}"
63 | fi
64 | if ! echo "$version" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+'; then
65 | safe_branch=$(printf %s "$version" | tr -c 'A-Za-z0-9' '-')
66 | safe_branch=${safe_branch%-}
67 | if [ -z "$safe_branch" ]; then
68 | safe_branch="sha-${GITHUB_SHA:0:12}"
69 | fi
70 | version="0.0.0+${safe_branch}"
71 | fi
72 | echo "version=$version" >> "$GITHUB_OUTPUT"
73 |
74 | - name: Set up QEMU
75 | if: steps.build-config.outputs.platforms == 'linux/amd64,linux/arm64'
76 | uses: docker/setup-qemu-action@v3
77 |
78 | - name: Set up Docker Buildx
79 | uses: docker/setup-buildx-action@v3
80 | with:
81 | driver: docker-container
82 |
83 | - name: Login to Docker Hub
84 | if: steps.build-config.outputs.push == 'true'
85 | uses: docker/login-action@v3
86 | with:
87 | username: ${{ secrets.DOCKERHUB_USERNAME }}
88 | password: ${{ secrets.DOCKERHUB_TOKEN }}
89 |
90 | - name: Login to GitHub Container Registry
91 | if: steps.build-config.outputs.push == 'true'
92 | uses: docker/login-action@v3
93 | with:
94 | registry: ${{ env.REGISTRY_GHCR }}
95 | username: ${{ github.actor }}
96 | password: ${{ secrets.GITHUB_TOKEN }}
97 |
98 | - name: Extract metadata
99 | id: meta
100 | if: steps.build-config.outputs.push == 'true'
101 | uses: docker/metadata-action@v5
102 | with:
103 | images: |
104 | ${{ env.DOCKERHUB_NAMESPACE }}/ttsfm
105 | ${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}
106 | tags: |
107 | type=semver,pattern=v{{version}}
108 | type=raw,value=latest,enable=${{ github.event.release.prerelease == false }}
109 | labels: |
110 | org.opencontainers.image.source=${{ github.repositoryUrl }}
111 | org.opencontainers.image.description=Free TTS API server compatible with OpenAI's TTS API format using openai.fm (full variant with ffmpeg)
112 | org.opencontainers.image.licenses=MIT
113 | org.opencontainers.image.title=TTSFM - Free TTS API Server (Full)
114 | org.opencontainers.image.vendor=dbcccc
115 | flavor: |
116 | latest=auto
117 |
118 | - name: Set local image metadata
119 | id: meta-local
120 | if: steps.build-config.outputs.push != 'true'
121 | run: |
122 | echo "tags=${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}:ci-${GITHUB_RUN_ID}-full" >> "$GITHUB_OUTPUT"
123 | echo "labels=org.opencontainers.image.source=${{ github.repositoryUrl }}" >> "$GITHUB_OUTPUT"
124 |
125 | - name: Build and push image
126 | id: build-and-push
127 | uses: docker/build-push-action@v5
128 | with:
129 | context: .
130 | platforms: ${{ steps.build-config.outputs.platforms }}
131 | push: ${{ steps.build-config.outputs.push == 'true' }}
132 | load: ${{ steps.build-config.outputs.load == 'true' }}
133 | tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}
134 | labels: ${{ steps.meta.outputs.labels || steps.meta-local.outputs.labels }}
135 | cache-from: type=gha,scope=full
136 | cache-to: type=gha,mode=max,scope=full
137 | build-args: |
138 | VERSION=${{ steps.version.outputs.version }}
139 | VARIANT=full
140 |
141 | - name: Smoke test image
142 | if: steps.build-config.outputs.load == 'true'
143 | run: |
144 | set -euo pipefail
145 | IMAGE="${{ steps.meta-local.outputs.tags }}"
146 | echo "Running smoke test for full image: $IMAGE"
147 | docker rm -f ttsfm-smoke >/dev/null 2>&1 || true
148 | docker run -d --name ttsfm-smoke -p 127.0.0.1:8000:8000 "$IMAGE"
149 | success=""
150 | for attempt in $(seq 1 10); do
151 | if curl --fail --silent --max-time 5 http://127.0.0.1:8000/api/health > /tmp/ttsfm-health.json; then
152 | success="yes"
153 | cat /tmp/ttsfm-health.json
154 | break
155 | fi
156 | sleep 3
157 | done
158 | docker logs ttsfm-smoke || true
159 | docker rm -f ttsfm-smoke >/dev/null 2>&1 || true
160 | if [ -z "$success" ]; then
161 | echo "Container health check failed" >&2
162 | exit 1
163 | fi
164 |
165 | - name: Show image info
166 | run: |
167 | echo "Variant: full"
168 | echo "Push enabled: ${{ steps.build-config.outputs.push }}"
169 | echo "Image tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}"
170 | echo "Image digest: ${{ steps.build-and-push.outputs.digest }}"
171 |
172 |
--------------------------------------------------------------------------------
/.github/workflows/docker-build-slim.yml:
--------------------------------------------------------------------------------
1 | name: Docker Build and Push (Slim)
2 |
3 | on:  # Triggers: pushes and pull requests targeting main, plus published releases
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 | release:
9 | types: [published]
10 |
11 | env:  # Workflow-level values available to every step below
12 | REGISTRY_DOCKERHUB: docker.io  # NOTE(review): not referenced by any step in this workflow
13 | REGISTRY_GHCR: ghcr.io
14 | IMAGE_NAME: ${{ github.repository }}  # owner/repo, used as the GHCR image path
15 | DOCKERHUB_NAMESPACE: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_USERNAME || github.repository_owner }}  # the DOCKERHUB_USERNAME secret when non-empty, otherwise the repository owner
16 |
17 | jobs:
18 | build-and-push-slim:  # Single job: build the slim image, smoke-test it locally or publish it on release
19 | runs-on: ubuntu-latest
20 | permissions:
21 | contents: read
22 | packages: write  # lets GITHUB_TOKEN publish packages; the GHCR login below authenticates with it
23 | steps:
24 | - name: Checkout repository  # The checked-out tree is the Docker build context (context: . in the build step)
25 | uses: actions/checkout@v4
26 |
27 | - name: Determine build settings  # Emit push/platforms/load outputs consumed by the later steps
28 | id: build-config
29 | env:
30 | EVENT_NAME: ${{ github.event_name }}
31 | EVENT_ACTION: ${{ github.event.action }}
32 | run: |
33 | if [ "$EVENT_NAME" = "release" ] && [ "$EVENT_ACTION" = "published" ]; then  # published release: multi-arch build that is pushed, not loaded
34 | echo "push=true" >> "$GITHUB_OUTPUT"
35 | echo "platforms=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
36 | echo "load=false" >> "$GITHUB_OUTPUT"
37 | else  # any other event: amd64-only build that is loaded locally and never pushed
38 | echo "push=false" >> "$GITHUB_OUTPUT"
39 | echo "platforms=linux/amd64" >> "$GITHUB_OUTPUT"
40 | echo "load=true" >> "$GITHUB_OUTPUT"
41 | fi
42 |
43 | - name: Derive image version  # Compute the VERSION build argument from the release tag, ref name or commit SHA
44 | id: version
45 | env:
46 | EVENT_NAME: ${{ github.event_name }}
47 | TAG_NAME: ${{ github.event.release.tag_name }}
48 | REF_NAME: ${{ github.ref_name }}
49 | GITHUB_SHA: ${{ github.sha }}
50 | run: |
51 | version=""
52 | if [ "$EVENT_NAME" = "release" ] && [ -n "$TAG_NAME" ]; then  # releases use the release tag
53 | version="$TAG_NAME"
54 | elif [ -n "$REF_NAME" ]; then  # everything else uses the ref name
55 | version="$REF_NAME"
56 | fi
57 | version="${version##*/}"  # keep only the part after the last '/'
58 | if [ "${version#v}" != "$version" ]; then  # strip a single leading "v" when present
59 | version="${version#v}"
60 | fi
61 | if [ -z "$version" ]; then  # nothing usable so far: fall back to the first 12 characters of the commit SHA
62 | version="${GITHUB_SHA:0:12}"
63 | fi
64 | if ! echo "$version" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+'; then  # does not start with digits.digits.digits
65 | safe_branch=$(printf %s "$version" | tr -c 'A-Za-z0-9' '-')  # replace every non-alphanumeric character with '-'
66 | safe_branch=${safe_branch%-}  # drop one trailing '-'
67 | if [ -z "$safe_branch" ]; then
68 | safe_branch="sha-${GITHUB_SHA:0:12}"
69 | fi
70 | version="0.0.0+${safe_branch}"  # placeholder version carrying the sanitized name as a suffix
71 | fi
72 | echo "version=$version" >> "$GITHUB_OUTPUT"
73 |
74 | - name: Set up QEMU  # Only for the multi-arch platform list that build-config emits for published releases
75 | if: steps.build-config.outputs.platforms == 'linux/amd64,linux/arm64'
76 | uses: docker/setup-qemu-action@v3
77 |
78 | - name: Set up Docker Buildx  # Create the Buildx builder used by the build step
79 | uses: docker/setup-buildx-action@v3
80 | with:
81 | driver: docker-container
82 |
83 | - name: Login to Docker Hub  # Authenticate with the DOCKERHUB_USERNAME / DOCKERHUB_TOKEN secrets
84 | if: steps.build-config.outputs.push == 'true'  # skipped for builds that are not pushed
85 | uses: docker/login-action@v3
86 | with:
87 | username: ${{ secrets.DOCKERHUB_USERNAME }}  # NOTE(review): read directly from the secret, without the repository-owner fallback DOCKERHUB_NAMESPACE has
88 | password: ${{ secrets.DOCKERHUB_TOKEN }}
89 |
90 | - name: Login to GitHub Container Registry  # Authenticate to ghcr.io as the triggering actor using GITHUB_TOKEN
91 | if: steps.build-config.outputs.push == 'true'  # skipped for builds that are not pushed
92 | uses: docker/login-action@v3
93 | with:
94 | registry: ${{ env.REGISTRY_GHCR }}
95 | username: ${{ github.actor }}
96 | password: ${{ secrets.GITHUB_TOKEN }}
97 |
98 | - name: Extract metadata  # Compute registry tags and OCI labels for images that will be pushed
99 | id: meta
100 | if: steps.build-config.outputs.push == 'true'  # non-push builds get their tag/labels from the meta-local step instead
101 | uses: docker/metadata-action@v5
102 | with:
103 | images: |
104 | ${{ env.DOCKERHUB_NAMESPACE }}/ttsfm
105 | ${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}
106 | tags: |  # semver tag with a -slim suffix, plus a floating "slim" tag unless the ref contains alpha or beta
107 | type=semver,pattern=v{{version}},suffix=-slim
108 | type=raw,value=slim,enable=${{ !contains(github.ref, 'alpha') && !contains(github.ref, 'beta') }}
109 | labels: |  # image.source is the https repository URL; github.repositoryUrl would yield a git:// URL
110 | org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
111 | org.opencontainers.image.description=Free TTS API server compatible with OpenAI's TTS API format using openai.fm (slim variant without ffmpeg)
112 | org.opencontainers.image.licenses=MIT
113 | org.opencontainers.image.title=TTSFM - Free TTS API Server (Slim)
114 | org.opencontainers.image.vendor=dbcccc
115 |
116 | - name: Set local image metadata  # Provide a throwaway CI tag and a source label for builds that are not pushed
117 | id: meta-local
118 | if: steps.build-config.outputs.push != 'true'
119 | run: |
120 | echo "tags=${{ env.REGISTRY_GHCR }}/${IMAGE_NAME,,}:ci-${GITHUB_RUN_ID}-slim" >> "$GITHUB_OUTPUT"  # IMAGE_NAME comes from the workflow env; lowercased because Docker rejects uppercase repository names
121 | echo "labels=org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}" >> "$GITHUB_OUTPUT"  # https repository URL rather than the git:// form of github.repositoryUrl
122 |
123 | - name: Build and push image  # Build the slim-variant image; push/load behaviour is driven by the build-config outputs
124 | id: build-and-push
125 | uses: docker/build-push-action@v5
126 | with:
127 | context: .
128 | platforms: ${{ steps.build-config.outputs.platforms }}
129 | push: ${{ steps.build-config.outputs.push == 'true' }}  # true only for published releases (see build-config)
130 | load: ${{ steps.build-config.outputs.load == 'true' }}  # the smoke test step below is gated on the same output
131 | tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}  # meta runs only for pushes; otherwise meta-local supplies the tag
132 | labels: ${{ steps.meta.outputs.labels || steps.meta-local.outputs.labels }}
133 | cache-from: type=gha,scope=slim  # GitHub Actions cache, kept under the "slim" scope
134 | cache-to: type=gha,mode=max,scope=slim
135 | build-args: |
136 | VERSION=${{ steps.version.outputs.version }}
137 | VARIANT=slim
138 |
139 | - name: Smoke test image  # Start the freshly built slim container and poll its health endpoint
140 | if: steps.build-config.outputs.load == 'true'  # runs only when build-config's load output is 'true'
141 | run: |
142 | set -euo pipefail
143 | IMAGE="${{ steps.meta-local.outputs.tags }}"
144 | echo "Running smoke test for slim image: $IMAGE"
145 | docker rm -f ttsfm-smoke-slim >/dev/null 2>&1 || true  # remove any leftover container; a failure here is ignored
146 | docker run -d --name ttsfm-smoke-slim -p 127.0.0.1:8001:8000 "$IMAGE"  # host port 8001 mapped to container port 8000
147 | success=""
148 | for attempt in $(seq 1 10); do  # up to 10 probes with a 3 s pause between them
149 | if curl --fail --silent --max-time 5 http://127.0.0.1:8001/api/health > /tmp/ttsfm-health-slim.json; then
150 | success="yes"
151 | cat /tmp/ttsfm-health-slim.json
152 | break
153 | fi
154 | sleep 3
155 | done
156 | docker logs ttsfm-smoke-slim || true  # dump container logs whether or not a probe succeeded
157 | docker rm -f ttsfm-smoke-slim >/dev/null 2>&1 || true
158 | if [ -z "$success" ]; then  # no probe succeeded: fail the step
159 | echo "Container health check failed" >&2
160 | exit 1
161 | fi
162 |
163 | - name: Show image info  # Print a summary of this build to the job log
164 | run: |
165 | echo "Variant: slim"
166 | echo "Push enabled: ${{ steps.build-config.outputs.push }}"
167 | echo "Image tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}"
168 | echo "Image digest: ${{ steps.build-and-push.outputs.digest }}"  # digest output of the build-and-push step
169 |
170 |
--------------------------------------------------------------------------------
/ttsfm-web/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block title %}TTSFM - {{ _('home.title') }}{% endblock %}
4 |
5 | {% block content %}
6 |
7 |
19 | {{ _('home.subtitle') }}
20 |
45 | {{ _('home.features_subtitle') }}
46 | {{ _('home.feature_free_desc') }} {{ _('home.feature_openai_desc') }} {{ _('home.feature_async_desc') }} {{ _('home.feature_voices_desc') }} {{ _('home.feature_formats_desc') }}
101 | {{ _('home.subtitle') }}
102 |
16 | {{ _('home.title') }}
17 |
18 | {{ _('home.features_title') }}
44 | {{ _('home.feature_free_title') }}
57 | {{ _('home.feature_openai_title') }}
67 | {{ _('home.feature_async_title') }}
77 | {{ _('home.feature_voices_title') }} & {{ _('home.feature_formats_title') }}
87 | {{ _('home.quick_start_title') }}
100 |
111 | {{ _('home.installation_title') }}
112 |
113 |
114 | Requires Python 3.8+
115 | {{ _('home.installation_code') }}
123 | {{ _('home.usage_title') }}
124 |
125 |
134 | No API keys required
135 | from ttsfm import TTSClient, Voice, AudioFormat
126 |
127 | client = TTSClient()
128 | response = client.generate_speech(
129 | text="Hello, world!",
130 | voice=Voice.ALLOY,
131 | response_format=AudioFormat.MP3
132 | )
133 | response.save_to_file("hello")