├── .flake8 ├── requirements.txt ├── ttsfm-web ├── run.py ├── requirements.txt ├── static │ └── js │ │ ├── api-client.js │ │ └── i18n.js ├── templates │ ├── index.html │ └── base.html ├── i18n.py ├── translations │ ├── zh.json │ └── en.json └── websocket_handler.py ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ ├── release.yml │ ├── docker-build-full.yml │ └── docker-build-slim.yml ├── .env.example ├── LICENSE ├── docs ├── architecture.md ├── docker-workflows.md ├── v3.4-dual-image-implementation.md └── websocket-streaming.md ├── CONTRIBUTING.md ├── Dockerfile ├── .gitignore ├── tests ├── test_utils.py ├── test_web_app.py ├── test_clients.py └── test_audio_processing.py ├── ttsfm ├── capabilities.py ├── audio.py ├── __init__.py ├── audio_processing.py ├── exceptions.py ├── models.py └── cli.py ├── README.zh.md ├── pyproject.toml ├── scripts └── test_websocket.py └── README.md /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | extend-ignore = E203,W503,E501 4 | exclude = .venv,build,dist,ttsfm.egg-info 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies for the TTSFM package 2 | requests>=2.25.0 3 | aiohttp>=3.8.0 4 | python-dotenv>=1.0.1 5 | -------------------------------------------------------------------------------- /ttsfm-web/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Run script for TTSFM web application with proper eventlet initialization 4 | """ 5 | 6 | import eventlet 7 | 8 | eventlet.monkey_patch() 9 | 10 | from app import DEBUG, HOST, PORT, app, socketio # noqa: E402 11 | 12 | if __name__ == "__main__": 13 | print(f"Starting TTSFM with WebSocket support on {HOST}:{PORT}") 14 | 
socketio.run(app, host=HOST, port=PORT, debug=DEBUG, allow_unsafe_werkzeug=True) 15 | -------------------------------------------------------------------------------- /ttsfm-web/requirements.txt: -------------------------------------------------------------------------------- 1 | # Web application dependencies 2 | argon2-cffi>=23.1.0 3 | flask>=2.0.0 4 | flask-cors>=3.0.10 5 | flask-socketio>=5.3.0 6 | python-socketio>=5.10.0 7 | eventlet>=0.33.3 8 | waitress>=3.0.0 9 | python-dotenv>=1.0.0 10 | 11 | # Audio processing (optional, for combining audio files) 12 | # If not installed, will fall back to simple concatenation for WAV files 13 | pydub>=0.25.0 14 | 15 | # TTSFM package (install from local directory or PyPI) 16 | # For local development: pip install -e ../ 17 | # For Docker/production: installed via pyproject.toml[web] dependencies 18 | 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # TTSFM Environment Configuration 2 | 3 | # Server Configuration 4 | HOST=0.0.0.0 5 | PORT=8000 6 | 7 | # SSL Configuration 8 | VERIFY_SSL=true 9 | 10 | # Flask Configuration 11 | FLASK_ENV=production 12 | FLASK_APP=app.py 13 | DEBUG=false 14 | 15 | # API Key Protection (Optional) 16 | # Set REQUIRE_API_KEY=true to enable API key authentication 17 | REQUIRE_API_KEY=false 18 | 19 | # Set your API key here when protection is enabled 20 | # This key will be required for all TTS generation requests 21 | TTSFM_API_KEY=your-secret-api-key-here 22 | 23 | # Example usage: 24 | # 1. Set REQUIRE_API_KEY=true 25 | # 2. Set TTSFM_API_KEY to your desired secret key 26 | # 3. Restart the application 27 | # 4. All TTS requests will now require the API key in: 28 | # - Authorization header (Bearer token) - OpenAI compatible 29 | # - X-API-Key header 30 | # - api_key query parameter 31 | # - api_key in JSON body 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. 
chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 dbcccc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ttsfm-web/static/js/api-client.js: -------------------------------------------------------------------------------- 1 | const CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes 2 | const cache = new Map(); 3 | 4 | function shouldUseCache(entry) { 5 | if (!entry) { 6 | return false; 7 | } 8 | if (entry.expiresAt === null) { 9 | return true; 10 | } 11 | return Date.now() < entry.expiresAt; 12 | } 13 | 14 | async function fetchWithCache(url, { signal, refresh = false } = {}) { 15 | if (!refresh) { 16 | const cached = cache.get(url); 17 | if (shouldUseCache(cached)) { 18 | return cached.data; 19 | } 20 | } 21 | 22 | const response = await fetch(url, { signal }); 23 | if (!response.ok) { 24 | throw new Error(`Request to ${url} failed with status ${response.status}`); 25 | } 26 | const data = await response.json(); 27 | cache.set(url, { data, expiresAt: Date.now() + CACHE_TTL_MS }); 28 | return data; 29 | } 30 | 31 | export function clearCache(urlPrefix) { 32 | if (!urlPrefix) { 33 | cache.clear(); 34 | return; 35 | } 36 | for (const key of Array.from(cache.keys())) { 37 | if (key.startsWith(urlPrefix)) { 38 | cache.delete(key); 39 | } 40 | } 41 | } 42 | 43 | export function fetchVoices(options = {}) { 44 | return fetchWithCache('/api/voices', options); 45 | } 46 | 47 | export function fetchFormats(options = {}) { 48 | return fetchWithCache('/api/formats', options); 49 | } 50 | 51 | export function primeCache(url, data, ttlMs = CACHE_TTL_MS) { 52 | cache.set(url, { data, expiresAt: ttlMs === null ? 
null : Date.now() + ttlMs }); 53 | } 54 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # TTSFM Architecture Overview 2 | 3 | ``` 4 | +----------------+ +--------------------+ +----------------------+ 5 | | Frontend (JS) | <---> | Flask REST Endpoints| <---> | OpenAI.fm upstream | 6 | | Playground UI | | /api/* + /v1/audio | | reverse-engineered | 7 | +----------------+ +--------------------+ +----------------------+ 8 | | ^ 9 | v | 10 | +----------------+ +--------------------+ 11 | | Socket.IO WS | <---> | WebSocket Handler | 12 | | streaming UI | | (background tasks) | 13 | +----------------+ +--------------------+ 14 | ``` 15 | 16 | - **Synchronous Client (`TTSClient`)** – Used by both REST endpoints and the WebSocket handler. Each request gets an isolated client instance, preventing shared session races. 17 | - **Async Client (`AsyncTTSClient`)** – Available to external consumers that want fully asynchronous workflows. 18 | - **Utilities** – Shared helpers handle sanitisation, deterministic headers, and text splitting for both HTTP and WebSocket flows. 19 | 20 | The repo ships with a Docker image that bundles the Flask app, Socket.IO server, and static assets. A per-request TTS client ensures concurrency safety; outgoing prompt tuning is opt-in through the `use_default_prompt` flag. 21 | 22 | For more implementation details see: 23 | 24 | - `ttsfm-web/app.py` – Flask routes, streaming combination logic, API key security. 25 | - `ttsfm-web/websocket_handler.py` – Background task orchestration and streaming chunk delivery. 26 | - `ttsfm/utils.py` – Sanitisation, deterministic headers, and text chunk helpers. 
27 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to TTSFM 2 | 3 | Thanks for your interest in improving TTSFM! This document outlines the local development workflow and quality gates that every pull request must satisfy. 4 | 5 | ## 1. Set Up Your Environment 6 | 7 | ```bash 8 | # Clone and create a virtual environment of your choice 9 | python -m venv .venv 10 | source .venv/bin/activate # Windows: .venv\Scripts\activate 11 | 12 | # Install the package with all tooling and web extras 13 | pip install -e .[web,dev] 14 | ``` 15 | 16 | ## 2. Run the Test Suite 17 | 18 | ```bash 19 | pytest 20 | ``` 21 | 22 | Add new tests alongside your changes—patches without coverage for new behaviour will be sent back for revision. 23 | 24 | ## 3. Lint and Type-Check 25 | 26 | We keep the codebase consistent and catch regressions early with these checks: 27 | 28 | ```bash 29 | black --check ttsfm ttsfm-web tests 30 | flake8 ttsfm ttsfm-web 31 | mypy ttsfm 32 | ``` 33 | 34 | Format your code with `black` and resolve lint/type errors before opening a pull request. 35 | 36 | ## 4. Web UI Smoke Tests 37 | 38 | If you touch the Flask app or frontend assets, run the web server locally and exercise the basic flows (text input, long-form combine, WebSocket streaming). For asynchronous features, open two browser tabs and confirm cancellation works. 39 | 40 | ## 5. Commit & Pull Request Guidelines 41 | 42 | - Keep commits focused; squash trivial fixups before submitting. 43 | - Describe _why_ a change is needed in the PR description. 44 | - Link to an issue if one exists. 45 | - Document behaviour changes in `CHANGELOG.md` when relevant. 46 | 47 | Questions or ideas? Open a discussion thread or drop by the issue tracker—we’re happy to help. 
48 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build argument to control image variant (full or slim) 2 | ARG VARIANT=full 3 | 4 | FROM python:3.11-slim AS builder 5 | 6 | WORKDIR /app 7 | 8 | ENV PYTHONDONTWRITEBYTECODE=1 \ 9 | PYTHONUNBUFFERED=1 10 | 11 | RUN apt-get update \ 12 | && apt-get install -y --no-install-recommends build-essential \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | COPY pyproject.toml ./ 16 | COPY README.md ./ 17 | COPY requirements.txt ./ 18 | COPY ttsfm/ ./ttsfm/ 19 | 20 | ARG VERSION=0.0.0 21 | ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION} 22 | # Cleanup steps are best-effort: each is grouped in a subshell so its "|| true" cannot mask a failed pip install 23 | RUN pip install --no-cache-dir --upgrade pip \ 24 | && pip install --no-cache-dir --prefix /install .[web] \ 25 | && (find /install -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true) \ 26 | && find /install -type f -name '*.pyc' -delete \ 27 | && find /install -type f -name '*.pyo' -delete \ 28 | && (find /install -type d -name 'tests' -exec rm -rf {} + 2>/dev/null || true) \ 29 | && (find /install -type d -name 'test' -exec rm -rf {} + 2>/dev/null || true) \ 30 | && (find /install -name '*.dist-info' -type d -exec sh -c 'rm -f "$1"/RECORD "$1"/INSTALLER' sh {} \; 2>/dev/null || true) 31 | 32 | FROM python:3.11-slim 33 | 34 | # Re-declare ARG after FROM to make it available in this stage 35 | ARG VARIANT=full 36 | 37 | ENV PYTHONDONTWRITEBYTECODE=1 \ 38 | PYTHONUNBUFFERED=1 \ 39 | PORT=8000 \ 40 | TTSFM_VARIANT=${VARIANT} 41 | 42 | WORKDIR /app 43 | 44 | # Conditional ffmpeg installation based on variant 45 | # Full variant: includes ffmpeg for MP3 combining, speed adjustment, and format conversion 46 | # Slim variant: minimal image without ffmpeg (WAV-only auto-combine, no speed adjustment) 47 | RUN apt-get update \ 48 | && if [ "$VARIANT" = "full" ]; then \ 49 | apt-get install -y --no-install-recommends ffmpeg; \ 50 | fi \ 51 | && rm -rf 
/var/lib/apt/lists/* \ 52 | && useradd --create-home --shell /usr/sbin/nologin ttsfm 53 | 54 | COPY --from=builder /install /usr/local 55 | ENV PATH="/usr/local/bin:$PATH" 56 | 57 | COPY --chown=ttsfm:ttsfm ttsfm-web/ ./ttsfm-web/ 58 | 59 | USER ttsfm 60 | 61 | EXPOSE 8000 62 | 63 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 64 | CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health', timeout=5)"] 65 | 66 | WORKDIR /app/ttsfm-web 67 | CMD ["python", "run.py"] 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | MANIFEST 23 | 24 | # Virtual Environment 25 | venv/ 26 | env/ 27 | ENV/ 28 | .venv/ 29 | 30 | # Environment variables 31 | .env 32 | .env.local 33 | .env.production 34 | 35 | # IDE 36 | .idea/ 37 | .vscode/ 38 | *.swp 39 | *.swo 40 | .spyderproject 41 | .spyproject 42 | 43 | # OS 44 | .DS_Store 45 | .DS_Store? 
46 | ._* 47 | .Spotlight-V100 48 | .Trashes 49 | ehthumbs.db 50 | Thumbs.db 51 | 52 | # Generated audio files (for testing) 53 | *.mp3 54 | *.wav 55 | *.opus 56 | *.aac 57 | *.flac 58 | *.pcm 59 | test_output.* 60 | output.* 61 | hello.* 62 | speech.* 63 | 64 | # Logs 65 | *.log 66 | logs/ 67 | .pytest_cache/ 68 | 69 | # Temporary files 70 | tmp/ 71 | temp/ 72 | .tmp/ 73 | 74 | # Coverage reports 75 | htmlcov/ 76 | .coverage 77 | .coverage.* 78 | coverage.xml 79 | *.cover 80 | .hypothesis/ 81 | 82 | # Documentation builds 83 | docs/_build/ 84 | site/ 85 | 86 | # Package builds 87 | *.tar.gz 88 | *.whl 89 | dist/ 90 | build/ 91 | 92 | # MyPy 93 | .mypy_cache/ 94 | .dmypy.json 95 | dmypy.json 96 | 97 | # Jupyter Notebook 98 | .ipynb_checkpoints 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | Pipfile.lock 105 | 106 | # PEP 582 107 | __pypackages__/ 108 | 109 | # Celery 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Additional exclusions for GitHub 126 | 127 | # API Keys and Secrets 128 | config.json 129 | secrets.json 130 | .secrets 131 | api_keys.txt 132 | 133 | # Database files 134 | *.db 135 | *.sqlite 136 | *.sqlite3 137 | 138 | # Backup files 139 | *.bak 140 | *.backup 141 | *~ 142 | 143 | # Node.js (if using any JS tools) 144 | node_modules/ 145 | npm-debug.log* 146 | yarn-debug.log* 147 | yarn-error.log* 148 | 149 | # Docker 150 | .dockerignore 151 | Dockerfile.dev 152 | docker-compose.override.yml 153 | 154 | # Local configuration 155 | local_settings.py 156 | local_config.py 157 | 158 | # Claude 159 | .claude/ 160 | VERSION_BUMP_GUIDE.md 161 | scripts/test_audio_generation.py 162 | /artifacts 163 | test.py 164 | -------------------------------------------------------------------------------- /tests/test_utils.py: 
-------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | 5 | import ttsfm.utils as utils 6 | 7 | 8 | def test_split_text_preserves_sentence_punctuation(): 9 | text = "First sentence! Second question? Final statement." 10 | chunks = utils.split_text_by_length(text, max_length=15) 11 | 12 | assert chunks[0].endswith("!"), chunks 13 | assert any(chunk.endswith("?") for chunk in chunks), chunks 14 | assert chunks[-1].endswith("."), chunks 15 | 16 | 17 | def test_split_text_handles_oversized_sentence(): 18 | long_sentence = " ".join(["word"] * 600) 19 | chunks = utils.split_text_by_length(long_sentence, max_length=120) 20 | 21 | assert all(len(chunk) <= 120 for chunk in chunks) 22 | assert sum(len(chunk.split()) for chunk in chunks) == 600 23 | 24 | 25 | def test_split_text_handles_extremely_long_word(): 26 | max_length = 50 27 | painful_word = "a" * 140 28 | text = f"start {painful_word} end" 29 | 30 | chunks = utils.split_text_by_length(text, max_length=max_length) 31 | 32 | assert any(painful_word[:max_length] in chunk for chunk in chunks) 33 | assert all(len(chunk) <= max_length for chunk in chunks) 34 | 35 | 36 | def test_sanitize_text_retains_ampersands(): 37 | text = "R&D and Fish & Chips & Co. 
Bold" 38 | sanitized = utils.sanitize_text(text) 39 | 40 | assert "R&D" in sanitized 41 | assert "Fish & Chips" in sanitized 42 | assert "Bold" in sanitized 43 | assert "<" not in sanitized 44 | 45 | 46 | def test_header_generation_deterministic_upgrade_flag(monkeypatch): 47 | module = importlib.reload(utils) 48 | 49 | headers_first = module.get_realistic_headers() 50 | headers_second = module.get_realistic_headers() 51 | 52 | assert "Upgrade-Insecure-Requests" in headers_first 53 | assert "Upgrade-Insecure-Requests" not in headers_second 54 | assert headers_first["Accept-Language"] != headers_second["Accept-Language"] 55 | 56 | 57 | @pytest.mark.asyncio 58 | async def test_async_batch_propagates_original_exception(monkeypatch): 59 | from ttsfm.async_client import AsyncTTSClient 60 | from ttsfm.exceptions import NetworkException 61 | from ttsfm.models import TTSRequest, Voice 62 | 63 | client = AsyncTTSClient() 64 | 65 | async def fail_request(_request): 66 | raise NetworkException("boom") 67 | 68 | monkeypatch.setattr(client, "_make_request", fail_request) 69 | 70 | request = TTSRequest(input="hello", voice=Voice.ALLOY) 71 | 72 | with pytest.raises(NetworkException): 73 | await client.generate_speech_batch([request]) 74 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release and Publish 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Triggers on version tags like v1.0.0, v3.0.1, etc. 
7 | 8 | permissions: 9 | contents: write 10 | id-token: write 11 | 12 | jobs: 13 | release-and-publish: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: '3.11' 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install build twine 28 | pip install '.[web,dev]' 29 | 30 | - name: Run linters and type checks 31 | run: | 32 | flake8 ttsfm ttsfm-web 33 | mypy ttsfm 34 | black --check ttsfm ttsfm-web tests 35 | 36 | - name: Run tests 37 | run: pytest 38 | 39 | - name: Test package install and import 40 | run: | 41 | python -c "import ttsfm; print('TTSFM imported successfully')" 42 | python -c "from ttsfm import TTSClient; print('TTSClient imported successfully')" 43 | python -m ttsfm.cli --help > /dev/null 44 | echo 'CLI smoke test passed' 45 | 46 | - name: Build package 47 | run: | 48 | python -m build 49 | echo "Package built successfully" 50 | ls -la dist/ 51 | 52 | - name: Check package 53 | run: | 54 | twine check dist/* 55 | echo "Package validation passed" 56 | 57 | - name: Publish to PyPI 58 | uses: pypa/gh-action-pypi-publish@release/v1 59 | with: 60 | attestations: true 61 | skip-existing: true 62 | 63 | - name: Extract version (strip leading v) 64 | id: ver 65 | run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT" 66 | 67 | - name: Create GitHub Release 68 | uses: softprops/action-gh-release@v1 69 | with: 70 | body: | 71 | ## TTSFM ${{ github.ref_name }} 72 | 73 | New release of TTSFM - Free Text-to-Speech API with OpenAI compatibility. 
74 | 75 | ### Installation 76 | ```bash 77 | pip install ttsfm==${{ steps.ver.outputs.version }} 78 | ``` 79 | 80 | ### Quick Start 81 | ```python 82 | from ttsfm import TTSClient 83 | 84 | client = TTSClient() 85 | response = client.generate_speech("Hello from TTSFM!") 86 | response.save_to_file("hello") 87 | ``` 88 | 89 | ### Docker 90 | ```bash 91 | docker run -p 8000:8000 dbcccc/ttsfm:latest 92 | ``` 93 | 94 | ### Features 95 | - Completely free (uses openai.fm service) 96 | - OpenAI-compatible API 97 | - 11 voices available 98 | - 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM) 99 | - Async and sync clients 100 | - Web interface included 101 | - CLI tool available 102 | 103 | ### Documentation 104 | See [README](https://github.com/dbccccccc/ttsfm#readme) for full documentation. 105 | draft: false 106 | prerelease: ${{ contains(github.ref_name, '-' ) }} 107 | 108 | -------------------------------------------------------------------------------- /tests/test_web_app.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | import sys 4 | from pathlib import Path 5 | 6 | import pytest 7 | 8 | WEB_DIR = Path(__file__).resolve().parents[1] / "ttsfm-web" 9 | MODULE_NAME = "ttsfm_web.app" 10 | 11 | 12 | def load_web_app(monkeypatch, **env): 13 | for key, value in env.items(): 14 | if value is None: 15 | monkeypatch.delenv(key, raising=False) 16 | else: 17 | monkeypatch.setenv(key, value) 18 | 19 | sys.modules.pop(MODULE_NAME, None) 20 | sys.modules.pop("ttsfm_web", None) 21 | sys.modules.pop("websocket_handler", None) 22 | 23 | web_dir_str = str(WEB_DIR) 24 | if web_dir_str not in sys.path: 25 | sys.path.insert(0, web_dir_str) 26 | 27 | pkg_spec = importlib.util.spec_from_loader("ttsfm_web", loader=None) 28 | pkg = importlib.util.module_from_spec(pkg_spec) 29 | pkg.__path__ = [web_dir_str] # type: ignore[attr-defined] 30 | sys.modules.setdefault("ttsfm_web", pkg) 31 | 32 | spec = 
importlib.util.spec_from_file_location(MODULE_NAME, WEB_DIR / "app.py") 33 | module = importlib.util.module_from_spec(spec) 34 | assert spec and spec.loader 35 | spec.loader.exec_module(module) # type: ignore[attr-defined] 36 | return module 37 | 38 | 39 | def test_voices_endpoint_returns_data(monkeypatch): 40 | module = load_web_app(monkeypatch, REQUIRE_API_KEY="false", TTSFM_API_KEY=None) 41 | client = module.app.test_client() 42 | response = client.get("/api/voices") 43 | assert response.status_code == 200 44 | payload = response.get_json() 45 | assert payload["count"] == len(payload["voices"]) 46 | 47 | 48 | def test_combine_audio_chunks_uses_format_hint(monkeypatch): 49 | load_web_app(monkeypatch, REQUIRE_API_KEY="false", TTSFM_API_KEY=None) 50 | 51 | from ttsfm import audio as audio_module 52 | 53 | class DummySegment: 54 | def __init__(self, tag: str): 55 | self.tag = tag 56 | 57 | def __iadd__(self, other: "DummySegment"): 58 | self.tag += other.tag 59 | return self 60 | 61 | def export(self, buffer, format: str): 62 | buffer.write(f"{format}:{self.tag}".encode()) 63 | 64 | class DummyAudioSegment: 65 | formats = [] 66 | 67 | @classmethod 68 | def from_mp3(cls, buffer): 69 | cls.formats.append("mp3") 70 | return DummySegment("mp3") 71 | 72 | @classmethod 73 | def from_wav(cls, buffer): 74 | cls.formats.append("wav") 75 | return DummySegment("wav") 76 | 77 | monkeypatch.setattr(audio_module, "AudioSegment", DummyAudioSegment) 78 | 79 | output = audio_module.combine_audio_chunks([b"one", b"two"], "opus") 80 | 81 | assert output == b"wav:wavwav" 82 | assert DummyAudioSegment.formats == ["wav", "wav"] 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "header_name, header_value", 87 | [ 88 | ("Authorization", "Bearer super-secret"), 89 | ("X-API-Key", "super-secret"), 90 | ], 91 | ) 92 | def test_api_key_hash_verification(monkeypatch, header_name, header_value): 93 | module = load_web_app(monkeypatch, REQUIRE_API_KEY="true", TTSFM_API_KEY="super-secret") 94 | client 
= module.app.test_client() 95 | 96 | denied = client.post("/api/validate-text", json={"text": "hello"}) 97 | assert denied.status_code == 401 98 | 99 | headers = {header_name: header_value} 100 | response = client.post("/api/validate-text", json={"text": "hello"}, headers=headers) 101 | assert response.status_code == 200 102 | -------------------------------------------------------------------------------- /tests/test_clients.py: -------------------------------------------------------------------------------- 1 | import types 2 | 3 | import pytest 4 | 5 | from ttsfm.async_client import AsyncTTSClient 6 | from ttsfm.client import TTSClient 7 | from ttsfm.models import AudioFormat, TTSResponse 8 | 9 | 10 | def _mk_response(data: bytes) -> TTSResponse: 11 | return TTSResponse( 12 | audio_data=data, 13 | content_type="audio/mpeg", 14 | format=AudioFormat.MP3, 15 | size=len(data), 16 | ) 17 | 18 | 19 | class _DummyResponse: 20 | def __init__(self, content_type: str, content: bytes, url: str = "https://example.test/audio"): 21 | self.status_code = 200 22 | self.headers = {"content-type": content_type} 23 | self.content = content 24 | self.url = url 25 | self.text = "" 26 | 27 | def json(self): # pragma: no cover - not used on success path 28 | return {} 29 | 30 | 31 | def test_sync_request_normalizes_non_mp3_format(monkeypatch): 32 | client = TTSClient() 33 | captured = {} 34 | 35 | def fake_post(self, url, data=None, headers=None, timeout=None, verify=None): 36 | captured["data"] = data 37 | return _DummyResponse("audio/wav", b"RIFF" + b"\x00" * 64, url) 38 | 39 | monkeypatch.setattr(client.session, "post", types.MethodType(fake_post, client.session)) 40 | 41 | response = client.generate_speech(text="hello", voice="alloy", response_format=AudioFormat.FLAC) 42 | 43 | assert captured["data"]["response_format"] == "wav" 44 | assert response.format is AudioFormat.WAV 45 | 46 | 47 | def test_sync_request_preserves_mp3_format(monkeypatch): 48 | client = TTSClient() 49 | 
captured = {} 50 | 51 | def fake_post(self, url, data=None, headers=None, timeout=None, verify=None): 52 | captured["data"] = data 53 | return _DummyResponse("audio/mpeg", b"ID3" + b"\x00" * 64, url) 54 | 55 | monkeypatch.setattr(client.session, "post", types.MethodType(fake_post, client.session)) 56 | 57 | response = client.generate_speech(text="hello", voice="alloy", response_format=AudioFormat.MP3) 58 | 59 | assert captured["data"]["response_format"] == "mp3" 60 | assert response.format is AudioFormat.MP3 61 | 62 | 63 | def test_sync_long_text_auto_combine(monkeypatch): 64 | client = TTSClient() 65 | 66 | monkeypatch.setattr( 67 | client, 68 | "generate_speech_batch", 69 | lambda **kwargs: [_mk_response(b"one"), _mk_response(b"two")], 70 | ) 71 | 72 | combined_flag = {} 73 | 74 | def fake_combine(responses): 75 | combined_flag["called"] = True 76 | return _mk_response(b"onetwo") 77 | 78 | monkeypatch.setattr("ttsfm.client.combine_responses", fake_combine) 79 | 80 | result = client.generate_speech_long_text( 81 | text="dummy", 82 | auto_combine=True, 83 | ) 84 | 85 | assert combined_flag["called"] is True 86 | assert isinstance(result, TTSResponse) 87 | assert result.audio_data == b"onetwo" 88 | 89 | 90 | def test_sync_long_text_returns_list_without_auto_combine(monkeypatch): 91 | client = TTSClient() 92 | 93 | responses = [_mk_response(b"one")] 94 | monkeypatch.setattr(client, "generate_speech_batch", lambda **_: responses) 95 | 96 | result = client.generate_speech_long_text(text="dummy", auto_combine=False) 97 | 98 | assert result is responses 99 | 100 | 101 | @pytest.mark.asyncio 102 | async def test_async_long_text_auto_combine(monkeypatch): 103 | client = AsyncTTSClient() 104 | 105 | async def fake_batch(**kwargs): 106 | return [_mk_response(b"one"), _mk_response(b"two")] 107 | 108 | monkeypatch.setattr(client, "generate_speech_batch", fake_batch) 109 | 110 | def fake_combine(responses): 111 | return _mk_response(b"onetwo") 112 | 113 | 
monkeypatch.setattr("ttsfm.async_client.combine_responses", fake_combine) 114 | 115 | result = await client.generate_speech_long_text( 116 | text="dummy", 117 | auto_combine=True, 118 | ) 119 | 120 | assert isinstance(result, TTSResponse) 121 | assert result.audio_data == b"onetwo" 122 | -------------------------------------------------------------------------------- /ttsfm/capabilities.py: -------------------------------------------------------------------------------- 1 | """System capabilities detection for TTSFM. 2 | 3 | This module provides runtime detection of available features based on 4 | system dependencies (primarily ffmpeg availability). 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | import shutil 10 | from typing import Dict, List 11 | 12 | 13 | class SystemCapabilities: 14 | """Detect and report system capabilities. 15 | 16 | This class checks for the availability of optional dependencies 17 | (like ffmpeg) and reports which features are available in the 18 | current environment. 19 | """ 20 | 21 | def __init__(self) -> None: 22 | """Initialize capabilities detection.""" 23 | self.ffmpeg_available = shutil.which("ffmpeg") is not None 24 | 25 | def get_capabilities(self) -> Dict: 26 | """Get complete system capabilities report. 
27 | 28 | Returns: 29 | Dict containing: 30 | - ffmpeg_available: bool 31 | - image_variant: "full" or "slim" 32 | - features: dict of feature availability 33 | - supported_formats: list of supported audio formats 34 | """ 35 | return { 36 | "ffmpeg_available": self.ffmpeg_available, 37 | "image_variant": "full" if self.ffmpeg_available else "slim", 38 | "features": { 39 | "speed_adjustment": self.ffmpeg_available, 40 | "format_conversion": self.ffmpeg_available, 41 | "mp3_auto_combine": self.ffmpeg_available, 42 | "basic_formats": True, # MP3, WAV always available 43 | }, 44 | "supported_formats": self.get_supported_formats(), 45 | } 46 | 47 | def get_supported_formats(self) -> List[str]: 48 | """Get list of supported audio formats. 49 | 50 | Returns: 51 | List of format names (e.g., ["mp3", "wav", "opus", ...]) 52 | """ 53 | basic = ["mp3", "wav"] 54 | if self.ffmpeg_available: 55 | return basic + ["opus", "aac", "flac", "pcm"] 56 | return basic 57 | 58 | def requires_ffmpeg(self, feature: str) -> bool: 59 | """Check if a feature requires ffmpeg. 60 | 61 | Args: 62 | feature: Feature name or format name to check 63 | 64 | Returns: 65 | True if the feature requires ffmpeg, False otherwise 66 | """ 67 | ffmpeg_features = { 68 | "speed_adjustment", 69 | "format_conversion", 70 | "mp3_auto_combine", 71 | "opus", 72 | "aac", 73 | "flac", 74 | "pcm", 75 | } 76 | return feature.lower() in ffmpeg_features 77 | 78 | def check_feature_available(self, feature: str) -> bool: 79 | """Check if a specific feature is available. 80 | 81 | Args: 82 | feature: Feature name to check 83 | 84 | Returns: 85 | True if feature is available, False otherwise 86 | """ 87 | if not self.requires_ffmpeg(feature): 88 | return True 89 | return self.ffmpeg_available 90 | 91 | def get_unavailable_reason(self, feature: str) -> str | None: 92 | """Get reason why a feature is unavailable. 
93 | 94 | Args: 95 | feature: Feature name to check 96 | 97 | Returns: 98 | Error message if unavailable, None if available 99 | """ 100 | if self.check_feature_available(feature): 101 | return None 102 | 103 | return ( 104 | f"Feature '{feature}' requires ffmpeg. " 105 | "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant." 106 | ) 107 | 108 | 109 | # Global instance for easy access 110 | _capabilities_instance: SystemCapabilities | None = None 111 | 112 | 113 | def get_capabilities() -> SystemCapabilities: 114 | """Get global SystemCapabilities instance. 115 | 116 | Returns: 117 | SystemCapabilities singleton instance 118 | """ 119 | global _capabilities_instance 120 | if _capabilities_instance is None: 121 | _capabilities_instance = SystemCapabilities() 122 | return _capabilities_instance 123 | -------------------------------------------------------------------------------- /README.zh.md: -------------------------------------------------------------------------------- 1 | # TTSFM - 文本转语音 API 客户端 2 | 3 | > **⚠️ 告示:由于 openai.fm 体验网站已关闭,本项目已无法使用。** 4 | 5 | > **Language / 语言**: [English](README.md) | [中文](README.zh.md) 6 | 7 | [![Docker Pulls](https://img.shields.io/docker/pulls/dbcccc/ttsfm?style=flat-square&logo=docker)](https://hub.docker.com/r/dbcccc/ttsfm) 8 | [![GitHub Stars](https://img.shields.io/github/stars/dbccccccc/ttsfm?style=social)](https://github.com/dbccccccc/ttsfm) 9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square)](https://opensource.org/licenses/MIT) 10 | ![ghcr pulls](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fghcr-badge.elias.eu.org%2Fapi%2Fdbccccccc%2Fttsfm%2Fttsfm&query=downloadCount&label=ghcr+pulls&logo=github) 11 | 12 | ## Star History 13 | 14 | [![Star History Chart](https://api.star-history.com/svg?repos=dbccccccc/ttsfm&type=Date)](https://www.star-history.com/#dbccccccc/ttsfm&Date) 15 | 16 | ## 概述 17 | 18 | TTSFM 是一个免费的、兼容 OpenAI 的文本转语音 API 
服务,提供将文本转换为自然语音的完整解决方案,使用OpenAI的GPT-4o mini TTS。基于 openai.fm 后端构建,提供强大的 Python SDK、RESTful API 接口以及直观的网页 Playground,方便测试和集成。 19 | 20 | **TTSFM 的功能:** 21 | - 🎤 **多种语音选择**:11 种兼容 OpenAI 的语音(alloy、ash、ballad、coral、echo、fable、nova、onyx、sage、shimmer、verse) 22 | - 🎵 **灵活的音频格式**:支持 6 种音频格式(MP3、WAV、OPUS、AAC、FLAC、PCM) 23 | - ⚡ **语速控制**:0.25x 到 4.0x 的播放速度调节,适应不同使用场景 24 | - 📝 **长文本支持**:自动文本分割和音频合并,支持任意长度内容 25 | - 🔄 **实时流式传输**:WebSocket 支持流式音频生成 26 | - 🐍 **Python SDK**:易用的同步和异步客户端 27 | - 🌐 **网页 Playground**:交互式网页界面,方便测试和实验 28 | - 🐳 **Docker 就绪**:预构建的 Docker 镜像,即刻部署 29 | - 🔍 **智能检测**:自动功能检测和友好的错误提示 30 | - 🤖 **OpenAI 兼容**:可直接替代 OpenAI 的 TTS API 31 | 32 | **v3.4.0 版本的主要特性:** 33 | - 🎯 镜像变体检测(完整版 vs 精简版 Docker 镜像) 34 | - 🔍 运行时功能 API,检查特性可用性 35 | - ⚡ 基于 ffmpeg 的语速调节 36 | - 🎵 所有 6 种音频格式的真实格式转换 37 | - 📊 增强的错误处理,提供清晰、可操作的错误信息 38 | - 🐳 针对不同使用场景优化的双镜像版本 39 | 40 | > **⚠️ 免责声明**:本项目仅用于**学习和研究目的**。这是对 openai.fm 服务的逆向工程实现,不应用于商业用途或生产环境。用户需自行确保遵守适用的法律法规和服务条款。 41 | 42 | ## 安装 43 | 44 | ### Python 包 45 | 46 | ```bash 47 | pip install ttsfm # 核心客户端 48 | pip install ttsfm[web] # 核心客户端 + Web/服务端依赖 49 | ``` 50 | 51 | ### Docker 镜像 52 | 53 | TTSFM 提供两种 Docker 镜像变体以满足不同需求: 54 | 55 | #### 完整版(推荐) 56 | ```bash 57 | docker run -p 8000:8000 dbcccc/ttsfm:latest 58 | ``` 59 | 60 | **包含 ffmpeg,支持高级功能:** 61 | - ✅ 所有 6 种音频格式(MP3、WAV、OPUS、AAC、FLAC、PCM) 62 | - ✅ 语速调节(0.25x - 4.0x) 63 | - ✅ 使用 ffmpeg 进行格式转换 64 | - ✅ 长文本 MP3 自动合并 65 | - ✅ 长文本 WAV 自动合并 66 | 67 | #### 精简版 68 | ```bash 69 | docker run -p 8000:8000 dbcccc/ttsfm:slim 70 | ``` 71 | 72 | **不含 ffmpeg 的最小化镜像:** 73 | - ✅ 基础 TTS 功能 74 | - ✅ 2 种音频格式(仅 MP3、WAV) 75 | - ✅ 长文本 WAV 自动合并 76 | - ❌ 不支持语速调节 77 | - ❌ 不支持格式转换 78 | - ❌ 不支持 MP3 自动合并 79 | 80 | 容器默认开放网页 Playground(`http://localhost:8000`)以及兼容 OpenAI 的 `/v1/audio/speech` 接口。 81 | 82 | **检查可用功能:** 83 | ```bash 84 | curl http://localhost:8000/api/capabilities 85 | ``` 86 | 87 | ## 快速开始 88 | 89 | ### Python 客户端 90 | 91 | ```python 92 | from ttsfm import TTSClient, AudioFormat, Voice 93 | 94 | client = 
TTSClient() 95 | 96 | # 基础用法 97 | response = client.generate_speech( 98 | text="来自 TTSFM 的问候!", 99 | voice=Voice.ALLOY, 100 | response_format=AudioFormat.MP3, 101 | ) 102 | response.save_to_file("hello") # -> hello.mp3 103 | 104 | # 使用语速调节(需要 ffmpeg) 105 | response = client.generate_speech( 106 | text="这段语音会更快!", 107 | voice=Voice.NOVA, 108 | response_format=AudioFormat.MP3, 109 | speed=1.5, # 1.5 倍速(范围:0.25 - 4.0) 110 | ) 111 | response.save_to_file("fast") # -> fast.mp3 112 | ``` 113 | 114 | ### 命令行 115 | 116 | ```bash 117 | ttsfm "你好,世界" --voice nova --format mp3 --output hello.mp3 118 | ``` 119 | 120 | ### REST API(兼容 OpenAI) 121 | 122 | ```bash 123 | # 基础请求 124 | curl -X POST http://localhost:8000/v1/audio/speech \ 125 | -H "Content-Type: application/json" \ 126 | -d '{ 127 | "model": "tts-1", 128 | "input": "你好,世界", 129 | "voice": "alloy", 130 | "response_format": "mp3" 131 | }' --output speech.mp3 132 | 133 | # 使用语速调节(需要完整版镜像) 134 | curl -X POST http://localhost:8000/v1/audio/speech \ 135 | -H "Content-Type: application/json" \ 136 | -d '{ 137 | "model": "tts-1", 138 | "input": "你好,世界", 139 | "voice": "alloy", 140 | "response_format": "mp3", 141 | "speed": 1.5 142 | }' --output speech_fast.mp3 143 | ``` 144 | 145 | **可用语音:** alloy、ash、ballad、coral、echo、fable、nova、onyx、sage、shimmer、verse 146 | **可用格式:** mp3、wav(始终可用)+ opus、aac、flac、pcm(仅完整版镜像) 147 | **语速范围:** 0.25 - 4.0(需要完整版镜像) 148 | 149 | ## 了解更多 150 | 151 | - 在 [Web 文档](http://localhost:8000/docs)(或 `ttsfm-web/templates/docs.html`)查看完整接口说明与运行注意事项。 152 | - 查看 [架构概览](docs/architecture.md) 了解组件间的关系。 153 | - 欢迎参与贡献,流程说明请见 [CONTRIBUTING.md](CONTRIBUTING.md)。 154 | 155 | ## 许可证 156 | 157 | TTSFM 采用 [MIT 许可证](LICENSE) 发布。 158 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"] 3 | build-backend = 
"setuptools.build_meta" 4 | 5 | [project] 6 | name = "ttsfm" 7 | dynamic = ["version"] 8 | description = "Text-to-Speech API Client with OpenAI compatibility" 9 | readme = "README.md" 10 | license = "MIT" 11 | authors = [ 12 | {name = "dbcccc", email = "120614547+dbccccccc@users.noreply.github.com"} 13 | ] 14 | maintainers = [ 15 | {name = "dbcccc", email = "120614547+dbccccccc@users.noreply.github.com"} 16 | ] 17 | classifiers = [ 18 | "Development Status :: 4 - Beta", 19 | "Intended Audience :: Developers", 20 | 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Topic :: Multimedia :: Sound/Audio :: Speech", 29 | "Topic :: Software Development :: Libraries :: Python Modules", 30 | "Topic :: Internet :: WWW/HTTP :: Dynamic Content", 31 | ] 32 | keywords = [ 33 | "tts", 34 | "text-to-speech", 35 | "speech-synthesis", 36 | "openai", 37 | "api-client", 38 | "audio", 39 | "voice", 40 | "speech" 41 | ] 42 | requires-python = ">=3.8" 43 | dependencies = [ 44 | "requests>=2.25.0", 45 | "aiohttp>=3.8.0", 46 | "python-dotenv>=1.0.1", 47 | ] 48 | 49 | [project.optional-dependencies] 50 | dev = [ 51 | "pytest>=6.0", 52 | "pytest-asyncio>=0.18.0", 53 | "pytest-cov>=2.0", 54 | "black>=22.0", 55 | "isort>=5.0", 56 | "flake8>=4.0", 57 | "mypy>=0.900", 58 | "pre-commit>=2.0", 59 | ] 60 | docs = [ 61 | "sphinx>=4.0", 62 | "sphinx-rtd-theme>=1.0", 63 | "myst-parser>=0.17", 64 | ] 65 | web = [ 66 | "flask>=2.0.0", 67 | "flask-cors>=3.0.10", 68 | "flask-socketio>=5.3.0", 69 | "python-socketio>=5.10.0", 70 | "eventlet>=0.33.3", 71 | "waitress>=3.0.0", 72 | "pydub>=0.25.0", 73 | "argon2-cffi>=23.1.0", 74 | ] 75 | 76 | [project.urls] 77 | Homepage = "https://github.com/dbccccccc/ttsfm" 78 | Documentation = 
"https://github.com/dbccccccc/ttsfm/blob/main/docs/" 79 | Repository = "https://github.com/dbccccccc/ttsfm" 80 | "Bug Tracker" = "https://github.com/dbccccccc/ttsfm/issues" 81 | 82 | [project.scripts] 83 | ttsfm = "ttsfm.cli:main" 84 | 85 | [tool.setuptools_scm] 86 | version_scheme = "no-guess-dev" 87 | local_scheme = "no-local-version" 88 | 89 | fallback_version = "3.4.2" 90 | [tool.setuptools] 91 | packages = ["ttsfm"] 92 | 93 | [tool.setuptools.package-data] 94 | ttsfm = ["py.typed"] 95 | 96 | [tool.black] 97 | line-length = 100 98 | target-version = ['py38'] 99 | include = '\\.pyi?$' 100 | extend-exclude = ''' 101 | /( 102 | # directories 103 | \.eggs 104 | | \.git 105 | | \.hg 106 | | \.mypy_cache 107 | | \.tox 108 | | \.venv 109 | | build 110 | | dist 111 | )/ 112 | ''' 113 | 114 | [tool.isort] 115 | profile = "black" 116 | line_length = 100 117 | multi_line_output = 3 118 | include_trailing_comma = true 119 | force_grid_wrap = 0 120 | use_parentheses = true 121 | ensure_newline_before_comments = true 122 | 123 | [tool.mypy] 124 | python_version = "3.9" 125 | warn_return_any = false 126 | warn_unused_configs = true 127 | disallow_untyped_defs = false 128 | disallow_incomplete_defs = false 129 | check_untyped_defs = true 130 | disallow_untyped_decorators = false 131 | no_implicit_optional = false 132 | warn_redundant_casts = true 133 | warn_unused_ignores = false 134 | warn_no_return = true 135 | warn_unreachable = false 136 | strict_equality = true 137 | 138 | [[tool.mypy.overrides]] 139 | module = "requests.*" 140 | ignore_missing_imports = true 141 | 142 | [[tool.mypy.overrides]] 143 | module = "pydub.*" 144 | ignore_missing_imports = true 145 | 146 | [[tool.mypy.overrides]] 147 | module = "fake_useragent.*" 148 | ignore_missing_imports = true 149 | 150 | [tool.pytest.ini_options] 151 | minversion = "6.0" 152 | addopts = "-ra -q --strict-markers --strict-config" 153 | testpaths = ["tests"] 154 | python_files = ["test_*.py", "*_test.py"] 155 | python_classes 
= ["Test*"] 156 | python_functions = ["test_*"] 157 | markers = [ 158 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 159 | "integration: marks tests as integration tests", 160 | "unit: marks tests as unit tests", 161 | ] 162 | 163 | [tool.coverage.run] 164 | source = ["ttsfm"] 165 | omit = [ 166 | "*/tests/*", 167 | "*/test_*", 168 | "setup.py", 169 | ] 170 | 171 | [tool.coverage.report] 172 | exclude_lines = [ 173 | "pragma: no cover", 174 | "def __repr__", 175 | "if self.debug:", 176 | "if settings.DEBUG", 177 | "raise AssertionError", 178 | "raise NotImplementedError", 179 | "if 0:", 180 | "if __name__ == .__main__.:", 181 | "class .*\\bProtocol\\):", 182 | "@(abc\\.)?abstractmethod", 183 | ] 184 | 185 | -------------------------------------------------------------------------------- /tests/test_audio_processing.py: -------------------------------------------------------------------------------- 1 | """Tests for audio processing functionality.""" 2 | 3 | import pytest 4 | import shutil 5 | from ttsfm.audio_processing import adjust_audio_speed, _build_atempo_filter_chain 6 | 7 | 8 | class TestAudioProcessing: 9 | """Test audio processing functions.""" 10 | 11 | def test_build_atempo_filter_chain_normal_range(self): 12 | """Test atempo filter chain for speeds in 0.5-2.0 range.""" 13 | # Single filter for speeds in range 14 | assert _build_atempo_filter_chain(1.0) == "atempo=1.0" 15 | assert _build_atempo_filter_chain(1.5) == "atempo=1.5" 16 | assert _build_atempo_filter_chain(0.5) == "atempo=0.5" 17 | assert _build_atempo_filter_chain(2.0) == "atempo=2.0" 18 | 19 | def test_build_atempo_filter_chain_high_speed(self): 20 | """Test atempo filter chain for speeds > 2.0.""" 21 | # Should chain multiple filters 22 | result = _build_atempo_filter_chain(4.0) 23 | assert "atempo=2.0" in result 24 | assert "," in result # Multiple filters chained 25 | 26 | def test_build_atempo_filter_chain_low_speed(self): 27 | """Test atempo filter chain for speeds < 
0.5.""" 28 | # Should chain multiple filters 29 | result = _build_atempo_filter_chain(0.25) 30 | assert "atempo=0.5" in result 31 | assert "," in result # Multiple filters chained 32 | 33 | def test_adjust_audio_speed_validation(self): 34 | """Test speed parameter validation.""" 35 | dummy_audio = b"dummy audio data" 36 | 37 | # Speed too low 38 | with pytest.raises(ValueError, match="Speed must be between 0.25 and 4.0"): 39 | adjust_audio_speed(dummy_audio, speed=0.1) 40 | 41 | # Speed too high 42 | with pytest.raises(ValueError, match="Speed must be between 0.25 and 4.0"): 43 | adjust_audio_speed(dummy_audio, speed=5.0) 44 | 45 | def test_adjust_audio_speed_no_change(self): 46 | """Test that speed=1.0 returns original audio.""" 47 | dummy_audio = b"dummy audio data" 48 | result = adjust_audio_speed(dummy_audio, speed=1.0) 49 | assert result == dummy_audio 50 | 51 | @pytest.mark.skipif(not shutil.which("ffmpeg"), reason="ffmpeg not available") 52 | def test_adjust_audio_speed_requires_ffmpeg(self): 53 | """Test that speed adjustment requires ffmpeg.""" 54 | # This test only runs if ffmpeg is available 55 | # If ffmpeg is not available, the function should raise RuntimeError 56 | pass 57 | 58 | def test_adjust_audio_speed_no_ffmpeg(self, monkeypatch): 59 | """Test error when ffmpeg is not available.""" 60 | # Mock shutil.which to return None (ffmpeg not found) 61 | monkeypatch.setattr("shutil.which", lambda x: None) 62 | 63 | dummy_audio = b"dummy audio data" 64 | with pytest.raises(RuntimeError, match="Speed adjustment requires ffmpeg"): 65 | adjust_audio_speed(dummy_audio, speed=1.5) 66 | 67 | 68 | class TestFFmpegDetection: 69 | """Test ffmpeg detection in audio module.""" 70 | 71 | def test_ffmpeg_detection(self): 72 | """Test that FFMPEG_AVAILABLE is set correctly.""" 73 | from ttsfm.audio import FFMPEG_AVAILABLE 74 | 75 | # Should be a boolean 76 | assert isinstance(FFMPEG_AVAILABLE, bool) 77 | 78 | # Should match actual ffmpeg availability 79 | expected = 
shutil.which("ffmpeg") is not None 80 | assert FFMPEG_AVAILABLE == expected 81 | 82 | 83 | class TestAudioCombineWithFFmpeg: 84 | """Test audio combining with ffmpeg detection.""" 85 | 86 | def test_combine_mp3_without_ffmpeg(self, monkeypatch): 87 | """Test that MP3 combining fails gracefully without ffmpeg.""" 88 | # Mock both pydub and ffmpeg as unavailable 89 | import ttsfm.audio 90 | 91 | monkeypatch.setattr(ttsfm.audio, "AudioSegment", None) 92 | monkeypatch.setattr(ttsfm.audio, "FFMPEG_AVAILABLE", False) 93 | 94 | from ttsfm.audio import combine_audio_chunks 95 | from ttsfm.exceptions import AudioProcessingException 96 | 97 | chunks = [b"chunk1", b"chunk2"] 98 | with pytest.raises(AudioProcessingException, match="MP3 audio requires pydub and ffmpeg"): 99 | combine_audio_chunks(chunks, format_type="mp3") 100 | 101 | def test_combine_wav_without_ffmpeg(self, monkeypatch): 102 | """Test that WAV combining works without ffmpeg.""" 103 | # Mock pydub as unavailable but allow WAV concatenation 104 | import ttsfm.audio 105 | 106 | monkeypatch.setattr(ttsfm.audio, "AudioSegment", None) 107 | 108 | from ttsfm.audio import combine_audio_chunks 109 | 110 | # Create simple WAV chunks (with minimal headers) 111 | # This is a simplified test - real WAV files have proper headers 112 | chunks = [b"RIFF" + b"\x00" * 40 + b"data", b"RIFF" + b"\x00" * 40 + b"data"] 113 | 114 | # Should not raise error for WAV 115 | result = combine_audio_chunks(chunks, format_type="wav") 116 | assert isinstance(result, bytes) 117 | -------------------------------------------------------------------------------- /ttsfm/audio.py: -------------------------------------------------------------------------------- 1 | """Audio helper utilities shared across TTSFM components.""" 2 | 3 | from __future__ import annotations 4 | 5 | import io 6 | import logging 7 | import shutil 8 | from typing import Iterable, List, Sequence 9 | 10 | from .exceptions import AudioProcessingException 11 | from .models 
import TTSResponse 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | try: # Optional dependency for non-WAV combining 17 | from pydub import AudioSegment 18 | except ImportError: # pragma: no cover - optional dependency 19 | AudioSegment = None 20 | 21 | 22 | # Detect ffmpeg availability at runtime 23 | FFMPEG_AVAILABLE = shutil.which("ffmpeg") is not None 24 | 25 | SUPPORTED_EXPORT_FORMATS = {"mp3", "wav", "aac", "flac", "opus", "pcm"} 26 | 27 | 28 | def combine_audio_chunks(audio_chunks: Iterable[bytes], format_type: str = "mp3") -> bytes: 29 | """Combine multiple audio chunks into a single audio file. 30 | 31 | Args: 32 | audio_chunks: Iterable of raw audio byte strings 33 | format_type: Requested output format 34 | 35 | Returns: 36 | Combined audio data as bytes 37 | 38 | Raises: 39 | RuntimeError: If non-WAV combining is requested without pydub/ffmpeg available 40 | """ 41 | 42 | chunks_list = list(audio_chunks) 43 | if not chunks_list: 44 | return b"" 45 | 46 | fmt = format_type.lower() 47 | 48 | # Check for pydub availability (which requires ffmpeg for MP3) 49 | if AudioSegment is None: 50 | if fmt == "mp3": 51 | raise AudioProcessingException( 52 | "Combining MP3 audio requires pydub and ffmpeg. " 53 | "Install ttsfm[web] and use the full Docker image (dbcccc/ttsfm:latest) " 54 | "instead of the slim variant.", 55 | audio_format="mp3", 56 | ) 57 | return _simple_wav_concatenation(chunks_list) 58 | 59 | # Check for ffmpeg availability when using pydub 60 | if not FFMPEG_AVAILABLE and fmt == "mp3": 61 | raise AudioProcessingException( 62 | "MP3 auto-combine requires ffmpeg. 
" 63 | "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant, " 64 | "or disable auto_combine and handle chunks separately.", 65 | audio_format="mp3", 66 | ) 67 | 68 | audio_segments = [] 69 | for chunk in chunks_list: 70 | buffer = io.BytesIO(chunk) 71 | if fmt == "mp3": 72 | segment = AudioSegment.from_mp3(buffer) 73 | else: 74 | segment = AudioSegment.from_wav(buffer) 75 | audio_segments.append(segment) 76 | 77 | combined = audio_segments[0] 78 | for segment in audio_segments[1:]: 79 | combined += segment 80 | 81 | output_buffer = io.BytesIO() 82 | export_format = "mp3" if fmt == "mp3" else "wav" 83 | combined.export(output_buffer, format=export_format) 84 | return output_buffer.getvalue() 85 | 86 | 87 | def _simple_wav_concatenation(wav_chunks: List[bytes]) -> bytes: 88 | """Simple WAV concatenation fallback that avoids external deps.""" 89 | if not wav_chunks: 90 | return b"" 91 | 92 | if len(wav_chunks) == 1: 93 | return wav_chunks[0] 94 | 95 | try: 96 | first_wav = wav_chunks[0] 97 | if len(first_wav) < 44: 98 | return b"".join(wav_chunks) 99 | 100 | header = bytearray(first_wav[:44]) 101 | audio_data = first_wav[44:] 102 | 103 | for wav_chunk in wav_chunks[1:]: 104 | if len(wav_chunk) > 44: 105 | audio_data += wav_chunk[44:] 106 | 107 | total_size = len(header) + len(audio_data) - 8 108 | header[4:8] = total_size.to_bytes(4, byteorder="little") 109 | 110 | data_size = len(audio_data) 111 | header[40:44] = data_size.to_bytes(4, byteorder="little") 112 | 113 | return bytes(header) + audio_data 114 | except Exception as exc: 115 | logger.error("Error in simple WAV concatenation: %s", exc) 116 | return b"".join(wav_chunks) 117 | 118 | 119 | def combine_responses(responses: Sequence["TTSResponse"]) -> "TTSResponse": 120 | """Combine multiple ``TTSResponse`` objects into a single response.""" 121 | 122 | responses = list(responses) 123 | if not responses: 124 | raise ValueError("No responses provided for combination") 125 | 126 | first = 
responses[0] 127 | audio_format = first.format 128 | 129 | audio_bytes = combine_audio_chunks((resp.audio_data for resp in responses), audio_format.value) 130 | 131 | total_duration = None 132 | if any(resp.duration is not None for resp in responses): 133 | total_duration = sum(filter(None, (resp.duration for resp in responses))) 134 | 135 | metadata = dict(first.metadata or {}) 136 | metadata.update( 137 | { 138 | "chunks_combined": len(responses), 139 | "auto_combined": True, 140 | } 141 | ) 142 | 143 | return TTSResponse( 144 | audio_data=audio_bytes, 145 | content_type=first.content_type, 146 | format=audio_format, 147 | size=len(audio_bytes), 148 | duration=total_duration if total_duration is not None else first.duration, 149 | metadata=metadata, 150 | ) 151 | -------------------------------------------------------------------------------- /scripts/test_websocket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test WebSocket connection to TTSFM server. 4 | 5 | This script tests the WebSocket functionality by connecting to the server 6 | and performing a simple TTS generation request. 
7 | """ 8 | 9 | import time 10 | import socketio 11 | 12 | # Create a Socket.IO client 13 | sio = socketio.Client(logger=True, engineio_logger=True) 14 | 15 | # Track connection state 16 | connected = False 17 | stream_complete = False 18 | chunks_received = 0 19 | 20 | 21 | @sio.on('connect') 22 | def on_connect(): 23 | """Handle connection event.""" 24 | global connected 25 | connected = True 26 | print('\n✅ Connected to WebSocket server!') 27 | print(f'Session ID: {sio.sid}') 28 | 29 | 30 | @sio.on('connected') 31 | def on_session_ready(data): 32 | """Handle session ready event.""" 33 | print(f'\n✅ Session established: {data}') 34 | 35 | 36 | @sio.on('disconnect') 37 | def on_disconnect(): 38 | """Handle disconnection event.""" 39 | global connected 40 | connected = False 41 | print('\n❌ Disconnected from WebSocket server') 42 | 43 | 44 | @sio.on('connect_error') 45 | def on_connect_error(data): 46 | """Handle connection error.""" 47 | print(f'\n❌ Connection error: {data}') 48 | 49 | 50 | @sio.on('pong') 51 | def on_pong(data): 52 | """Handle pong response.""" 53 | print(f'\n✅ Pong received: {data}') 54 | 55 | 56 | @sio.on('stream_started') 57 | def on_stream_started(data): 58 | """Handle stream started event.""" 59 | print(f'\n✅ Stream started: {data}') 60 | 61 | 62 | @sio.on('stream_progress') 63 | def on_stream_progress(data): 64 | """Handle stream progress event.""" 65 | progress = data.get('progress', 0) 66 | status = data.get('status', 'unknown') 67 | print(f'📊 Progress: {progress}% - Status: {status}') 68 | 69 | 70 | @sio.on('audio_chunk') 71 | def on_audio_chunk(data): 72 | """Handle audio chunk event.""" 73 | global chunks_received 74 | chunks_received += 1 75 | chunk_index = data.get('chunk_index', 0) 76 | total_chunks = data.get('total_chunks', 0) 77 | print(f'🎵 Received audio chunk {chunk_index + 1}/{total_chunks}') 78 | 79 | 80 | @sio.on('stream_complete') 81 | def on_stream_complete(data): 82 | """Handle stream complete event.""" 83 | global 
stream_complete 84 | stream_complete = True 85 | print(f'\n✅ Stream complete: {data}') 86 | print(f'Total chunks received: {chunks_received}') 87 | 88 | 89 | @sio.on('stream_error') 90 | def on_stream_error(data): 91 | """Handle stream error event.""" 92 | print(f'\n❌ Stream error: {data}') 93 | 94 | 95 | def test_connection(url='http://localhost:8000'): 96 | """Test WebSocket connection.""" 97 | print(f'🔌 Connecting to {url}...') 98 | 99 | try: 100 | # Connect to the server 101 | sio.connect(url, transports=['polling', 'websocket']) 102 | 103 | # Wait for connection 104 | timeout = 10 105 | start_time = time.time() 106 | while not connected and (time.time() - start_time) < timeout: 107 | time.sleep(0.1) 108 | 109 | if not connected: 110 | print('❌ Failed to connect within timeout') 111 | return False 112 | 113 | # Test ping/pong 114 | print('\n📡 Testing ping/pong...') 115 | sio.emit('ping', {'timestamp': time.time()}) 116 | time.sleep(1) 117 | 118 | # Test TTS generation 119 | print('\n🎤 Testing TTS generation...') 120 | request_data = { 121 | 'request_id': f'test_{int(time.time())}', 122 | 'text': 'Hello, this is a WebSocket test!', 123 | 'voice': 'alloy', 124 | 'format': 'mp3', 125 | 'chunk_size': 512 126 | } 127 | 128 | sio.emit('generate_stream', request_data) 129 | 130 | # Wait for stream to complete 131 | timeout = 30 132 | start_time = time.time() 133 | while not stream_complete and (time.time() - start_time) < timeout: 134 | time.sleep(0.1) 135 | 136 | if stream_complete: 137 | print('\n✅ WebSocket test completed successfully!') 138 | return True 139 | else: 140 | print('\n⚠️ Stream did not complete within timeout') 141 | return False 142 | 143 | except Exception as e: 144 | print(f'\n❌ Error during test: {e}') 145 | import traceback 146 | traceback.print_exc() 147 | return False 148 | 149 | finally: 150 | # Disconnect 151 | if connected: 152 | print('\n🔌 Disconnecting...') 153 | sio.disconnect() 154 | time.sleep(1) 155 | 156 | 157 | if __name__ == 
'__main__': 158 | import sys 159 | 160 | # Get URL from command line or use default 161 | url = sys.argv[1] if len(sys.argv) > 1 else 'http://localhost:8000' 162 | 163 | print('=' * 60) 164 | print('TTSFM WebSocket Connection Test') 165 | print('=' * 60) 166 | 167 | success = test_connection(url) 168 | 169 | print('\n' + '=' * 60) 170 | if success: 171 | print('✅ All tests passed!') 172 | sys.exit(0) 173 | else: 174 | print('❌ Some tests failed') 175 | sys.exit(1) 176 | 177 | -------------------------------------------------------------------------------- /docs/docker-workflows.md: -------------------------------------------------------------------------------- 1 | # Docker Build Workflows 2 | 3 | ## Overview 4 | 5 | Starting with v3.4.0, TTSFM uses **separate GitHub Actions workflows** for building the full and slim Docker image variants. This provides better clarity, easier debugging, and independent execution. 6 | 7 | ## Workflow Files 8 | 9 | ### 1. `.github/workflows/docker-build-full.yml` 10 | 11 | **Purpose**: Builds the full variant with ffmpeg support 12 | 13 | **Triggers**: 14 | - Push to `main` branch 15 | - Pull requests to `main` branch 16 | - Release published 17 | 18 | **Image Tags** (on release): 19 | - `dbcccc/ttsfm:vX.X.X` 20 | - `dbcccc/ttsfm:latest` (only for stable releases, not pre-releases) 21 | - `ghcr.io/dbccccccc/ttsfm:vX.X.X` 22 | - `ghcr.io/dbccccccc/ttsfm:latest` (only for stable releases) 23 | 24 | **Features**: 25 | - ✅ ffmpeg included 26 | - ✅ MP3 auto-combine 27 | - ✅ Speed adjustment (0.25x - 4.0x) 28 | - ✅ Format conversion 29 | - ✅ Multi-platform builds (linux/amd64, linux/arm64) 30 | - ✅ Smoke test on PR/push 31 | - ✅ GitHub Actions cache (scope: `full`) 32 | 33 | --- 34 | 35 | ### 2. 
`.github/workflows/docker-build-slim.yml` 36 | 37 | **Purpose**: Builds the slim variant without ffmpeg 38 | 39 | **Triggers**: 40 | - Push to `main` branch 41 | - Pull requests to `main` branch 42 | - Release published 43 | 44 | **Image Tags** (on release): 45 | - `dbcccc/ttsfm:vX.X.X-slim` 46 | - `dbcccc/ttsfm:vX.X-slim` (only for stable releases, not pre-releases) 47 | - `ghcr.io/dbccccccc/ttsfm:vX.X.X-slim` 48 | - `ghcr.io/dbccccccc/ttsfm:vX.X-slim` (only for stable releases) 49 | 50 | **Features**: 51 | - ✅ No ffmpeg (smaller image) 52 | - ✅ Basic TTS (MP3/WAV) 53 | - ✅ WAV auto-combine (simple concatenation) 54 | - ❌ No MP3 auto-combine 55 | - ❌ No speed adjustment 56 | - ❌ No format conversion 57 | - ✅ Multi-platform builds (linux/amd64, linux/arm64) 58 | - ✅ Smoke test on PR/push (port 8001) 59 | - ✅ GitHub Actions cache (scope: `slim`) 60 | 61 | --- 62 | 63 | ## Build Behavior 64 | 65 | ### On Pull Request or Push to Main 66 | 67 | Both workflows run in parallel: 68 | - Build for `linux/amd64` only (faster) 69 | - Images are **not pushed** to registries 70 | - Images are loaded locally for smoke testing 71 | - Temporary tags: `ghcr.io/dbccccccc/ttsfm:ci-{RUN_ID}-full` and `ci-{RUN_ID}-slim` 72 | 73 | ### On Release Published 74 | 75 | Both workflows run in parallel: 76 | - Build for `linux/amd64` and `linux/arm64` (multi-platform) 77 | - Images are **pushed** to Docker Hub and GitHub Container Registry 78 | - No local loading (images go directly to registries) 79 | - Production tags based on release version 80 | 81 | ### Pre-release vs Stable Release 82 | 83 | **Pre-release** (e.g., `v3.4.0-alpha1`): 84 | - Full variant: `vX.X.X` only (no `latest` tag) 85 | - Slim variant: `vX.X.X-slim` only (no `vX.X-slim` tag) 86 | 87 | **Stable release** (e.g., `v3.4.0`): 88 | - Full variant: `vX.X.X` + `latest` 89 | - Slim variant: `vX.X.X-slim` + `vX.X-slim` 90 | 91 | --- 92 | 93 | ## Advantages of Separate Workflows 94 | 95 | 1. 
**Clarity**: Each workflow has a single, clear purpose 96 | 2. **Easier debugging**: When a build fails, you immediately know which variant failed 97 | 3. **Independent execution**: Can trigger/retry builds independently 98 | 4. **Simpler logic**: No complex conditionals or fallback logic 99 | 5. **Better visibility**: GitHub Actions UI shows them as separate workflow runs 100 | 6. **Parallel execution**: Both variants build truly in parallel 101 | 7. **Independent caching**: Each variant has its own cache scope 102 | 103 | --- 104 | 105 | ## Monitoring Builds 106 | 107 | ### GitHub Actions UI 108 | 109 | When you create a release, you'll see **two separate workflow runs**: 110 | - ✅ Docker Build and Push (Full) 111 | - ✅ Docker Build and Push (Slim) 112 | 113 | Each can succeed or fail independently. 114 | 115 | ### Checking Build Status 116 | 117 | **Via GitHub UI**: 118 | 1. Go to repository → Actions tab 119 | 2. Look for the two workflow runs 120 | 3. Click on each to see detailed logs 121 | 122 | **Via GitHub CLI**: 123 | ```bash 124 | # Check latest workflow runs 125 | gh run list --workflow=docker-build-full.yml 126 | gh run list --workflow=docker-build-slim.yml 127 | ``` 128 | 129 | --- 130 | 131 | ## Troubleshooting 132 | 133 | ### Slim variant not building 134 | 135 | 1. Check if the workflow file exists: `.github/workflows/docker-build-slim.yml` 136 | 2. Check the Actions tab for the "Docker Build and Push (Slim)" workflow 137 | 3. Look for error messages in the workflow logs 138 | 4. Verify Docker Hub and GitHub Container Registry credentials 139 | 140 | ### Images not pushed to registry 141 | 142 | 1. Verify the triggering event is "release published" (not a draft release) 143 | 2. Check Docker Hub credentials in repository secrets: 144 | - `DOCKERHUB_USERNAME` 145 | - `DOCKERHUB_TOKEN` 146 | 3. Check GitHub Container Registry permissions (automatic via `GITHUB_TOKEN`) 147 | 148 | ### Smoke test failing 149 | 150 | 1. Check the smoke test logs in the workflow run 151 | 2. 
Verify the health endpoint is working: `/api/health` 152 | 3. For the slim variant, ensure it's using port 8001 (not 8000) 153 | 154 | --- 155 | 156 | ## Future Enhancements 157 | 158 | Potential improvements for the workflows: 159 | 160 | 1. **Matrix builds**: Use a single workflow with matrix strategy 161 | 2. **Reusable workflows**: Extract common steps into a reusable workflow 162 | 3. **Build notifications**: Send notifications on build success/failure 163 | 4. **Image scanning**: Add security scanning with Trivy or Snyk 164 | 5. **Performance metrics**: Track and report build times and image sizes 165 | 166 | -------------------------------------------------------------------------------- /docs/v3.4-dual-image-implementation.md: -------------------------------------------------------------------------------- 1 | # TTSFM v3.4.x Dual-Image Implementation 2 | 3 | ## Overview 4 | 5 | Starting with v3.4.0-alpha1, TTSFM provides two Docker image variants to balance functionality and image size: 6 | 7 | 1. **Full variant** (`dbcccc/ttsfm:latest`, `dbcccc/ttsfm:v3.4.0-alpha1`) 8 | - Includes ffmpeg for advanced audio processing 9 | - Supports all features including speed adjustment and format conversion 10 | 11 | 2. **Slim variant** (`dbcccc/ttsfm:v3.4.0-alpha1-slim`) 12 | - Minimal image without ffmpeg 13 | - Basic TTS functionality only 14 | 15 | ## Implementation Details 16 | 17 | ### 1. Dockerfile Changes 18 | 19 | The Dockerfile now accepts a `VARIANT` build argument: 20 | 21 | ```dockerfile 22 | ARG VARIANT=full # Can be 'full' or 'slim' 23 | ``` 24 | 25 | - **Full variant**: Installs ffmpeg in the runtime stage 26 | - **Slim variant**: Skips ffmpeg installation 27 | 28 | ### 2. GitHub Actions Workflows 29 | 30 | Two workflows, `.github/workflows/docker-build-full.yml` and `.github/workflows/docker-build-slim.yml`, build the two variants: 31 | 32 | - **Full image tags**: `vX.X.X`, `latest` 33 | - **Slim image tags**: `vX.X.X-slim` 34 | 35 | Both variants are built for `linux/amd64` and `linux/arm64` platforms on release. 
36 | 37 | ### 3. Runtime Feature Detection 38 | 39 | `ttsfm/audio.py` now includes runtime detection: 40 | 41 | ```python 42 | import shutil 43 | FFMPEG_AVAILABLE = shutil.which("ffmpeg") is not None 44 | ``` 45 | 46 | Functions that require ffmpeg provide helpful error messages when it's not available. 47 | 48 | ### 4. Speed Adjustment Feature 49 | 50 | New module `ttsfm/audio_processing.py` provides: 51 | 52 | - `adjust_audio_speed()`: Adjust playback speed using ffmpeg (0.25x - 4.0x) 53 | - `convert_audio_format()`: Convert between audio formats using ffmpeg 54 | 55 | Both sync (`TTSClient`) and async (`AsyncTTSClient`) clients now support the `speed` parameter: 56 | 57 | ```python 58 | response = client.generate_speech( 59 | text="Hello!", 60 | voice=Voice.ALLOY, 61 | speed=1.5, # 1.5x faster 62 | ) 63 | ``` 64 | 65 | Speed adjustment is applied post-generation using ffmpeg's `atempo` filter. 66 | 67 | ## Feature Matrix 68 | 69 | | Feature | Full Image | Slim Image | Python Package | 70 | |---------|-----------|------------|----------------| 71 | | Basic TTS (MP3/WAV) | ✅ | ✅ | ✅ | 72 | | WAV auto-combine | ✅ | ✅ (simple) | ✅ (simple) | 73 | | MP3 auto-combine | ✅ | ❌ | ✅ (with pydub) | 74 | | Speed adjustment | ✅ | ❌ | ✅ (with ffmpeg) | 75 | | Format conversion | ✅ | ❌ | ✅ (with ffmpeg) | 76 | 77 | ## Usage Examples 78 | 79 | ### Full Image (Recommended) 80 | 81 | ```bash 82 | # Pull and run full image 83 | docker run -p 8000:8000 dbcccc/ttsfm:latest 84 | 85 | # Use speed adjustment 86 | curl -X POST http://localhost:8000/v1/audio/speech \ 87 | -H "Content-Type: application/json" \ 88 | -d '{"input":"Hello!","voice":"alloy","speed":1.5}' \ 89 | --output fast.mp3 90 | ``` 91 | 92 | ### Slim Image (Minimal) 93 | 94 | ```bash 95 | # Pull and run slim image 96 | docker run -p 8000:8000 dbcccc/ttsfm:v3.4.0-alpha1-slim 97 | 98 | # Basic TTS works fine 99 | curl -X POST http://localhost:8000/v1/audio/speech \ 100 | -H "Content-Type: application/json" \ 101 | -d 
'{"input":"Hello!","voice":"alloy"}' \ 102 | --output speech.mp3 103 | 104 | # Speed parameter will be ignored (no error, just logged warning) 105 | ``` 106 | 107 | ### Python Package 108 | 109 | ```python 110 | from ttsfm import TTSClient, Voice 111 | 112 | client = TTSClient() 113 | 114 | # Speed adjustment requires ffmpeg installed on system 115 | response = client.generate_speech( 116 | text="This will be faster!", 117 | voice=Voice.NOVA, 118 | speed=1.5, 119 | ) 120 | response.save_to_file("fast.mp3") 121 | ``` 122 | 123 | ## Error Handling 124 | 125 | When ffmpeg-dependent features are used without ffmpeg: 126 | 127 | ```python 128 | # Graceful degradation with helpful error messages 129 | RuntimeError: "Speed adjustment requires ffmpeg. 130 | Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant." 131 | ``` 132 | 133 | ## Migration Guide 134 | 135 | ### From v3.3.x to v3.4.x 136 | 137 | **No breaking changes** - existing code continues to work: 138 | 139 | 1. **Docker users**: 140 | - `dbcccc/ttsfm:latest` now includes speed adjustment 141 | - Use `dbcccc/ttsfm:v3.4.0-alpha1-slim` for minimal image 142 | 143 | 2. **Python package users**: 144 | - Speed parameter now functional (requires ffmpeg) 145 | - Install ffmpeg: `apt-get install ffmpeg` (Linux) or `brew install ffmpeg` (Mac) 146 | 147 | 3. 
**API users**: 148 | - Speed parameter now works in `/v1/audio/speech` endpoint 149 | - Response metadata includes `speed_applied: true/false` 150 | 151 | ## Technical Notes 152 | 153 | ### Speed Adjustment Implementation 154 | 155 | - Uses ffmpeg's `atempo` filter for speed adjustment 156 | - Supports 0.25x to 4.0x range (OpenAI TTS API compatible) 157 | - Chains multiple `atempo` filters for speeds outside 0.5-2.0 range 158 | - Adjusts estimated duration based on speed multiplier 159 | - Runs in thread pool for async client to avoid blocking 160 | 161 | ### Build Optimization 162 | 163 | - Shared builder stage for both variants 164 | - Separate cache scopes (`scope=full`, `scope=slim`) for efficient caching 165 | - Multi-platform builds only on release (saves CI time) 166 | 167 | ## Future Enhancements 168 | 169 | Potential additions for future versions: 170 | 171 | 1. **Additional format support**: Real AAC, FLAC, OPUS output (currently mapped to WAV) 172 | 2. **Audio effects**: Pitch adjustment, noise reduction 173 | 3. **Streaming support**: Real-time audio streaming with speed adjustment 174 | 4. 
**Ultra-slim variant**: Alpine-based image (~50MB) with no Python web server 175 | 176 | ## References 177 | 178 | - [OpenAI TTS API Documentation](https://platform.openai.com/docs/guides/text-to-speech) 179 | - [ffmpeg atempo filter](https://ffmpeg.org/ffmpeg-filters.html#atempo) 180 | - [Docker multi-stage builds](https://docs.docker.com/build/building/multi-stage/) 181 | 182 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TTSFM - Text-to-Speech API Client 2 | 3 | > **⚠️ NOTICE: This project is no longer functional as the openai.fm demo website has been shut down.** 4 | 5 | > **Language / 语言**: [English](README.md) | [中文](README.zh.md) 6 | 7 | [![Docker Pulls](https://img.shields.io/docker/pulls/dbcccc/ttsfm?style=flat-square&logo=docker)](https://hub.docker.com/r/dbcccc/ttsfm) 8 | [![GitHub Stars](https://img.shields.io/github/stars/dbccccccc/ttsfm?style=social)](https://github.com/dbccccccc/ttsfm) 9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square)](https://opensource.org/licenses/MIT) 10 | ![ghcr pulls](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fghcr-badge.elias.eu.org%2Fapi%2Fdbccccccc%2Fttsfm%2Fttsfm&query=downloadCount&label=ghcr+pulls&logo=github) 11 | 12 | ## Star History 13 | 14 | [![Star History Chart](https://api.star-history.com/svg?repos=dbccccccc/ttsfm&type=Date)](https://www.star-history.com/#dbccccccc/ttsfm&Date) 15 | 16 | ## Overview 17 | 18 | TTSFM is a free, OpenAI-compatible text-to-speech API service that provides a complete solution for converting text to natural-sounding speech based on OpenAI's GPT-4o mini TTS. Built on top of the openai.fm backend, it offers a powerful Python SDK, RESTful API endpoints, and an intuitive web playground for easy testing and integration. 
19 | 20 | **What TTSFM Can Do:** 21 | - 🎤 **Multiple Voices**: Choose from 11 OpenAI-compatible voices (alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse) 22 | - 🎵 **Flexible Audio Formats**: Support for 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM) 23 | - ⚡ **Speed Control**: Adjust playback speed from 0.25x to 4.0x for different use cases 24 | - 📝 **Long Text Support**: Automatic text splitting and audio combining for content of any length 25 | - 🔄 **Real-time Streaming**: WebSocket support for streaming audio generation 26 | - 🐍 **Python SDK**: Easy-to-use synchronous and asynchronous clients 27 | - 🌐 **Web Playground**: Interactive web interface for testing and experimentation 28 | - 🐳 **Docker Ready**: Pre-built Docker images for instant deployment 29 | - 🔍 **Smart Detection**: Automatic capability detection and helpful error messages 30 | - 🤖 **OpenAI Compatible**: Drop-in replacement for OpenAI's TTS API 31 | 32 | **Key Features in v3.4.0:** 33 | - 🎯 Image variant detection (full vs slim Docker images) 34 | - 🔍 Runtime capabilities API for feature availability checking 35 | - ⚡ Speed adjustment with ffmpeg-based audio processing 36 | - 🎵 Real format conversion for all 6 audio formats 37 | - 📊 Enhanced error handling with clear, actionable messages 38 | - 🐳 Dual Docker images optimized for different use cases 39 | 40 | > **⚠️ Disclaimer**: This project is intended for **educational and research purposes only**. It is a reverse-engineered implementation of the openai.fm service and should not be used for commercial purposes or in production environments. Users are responsible for ensuring compliance with applicable laws and terms of service. 
41 | 42 | ## Installation 43 | 44 | ### Python package 45 | 46 | ```bash 47 | pip install ttsfm # core client 48 | pip install ttsfm[web] # core client + web/server dependencies 49 | ``` 50 | 51 | ### Docker image 52 | 53 | TTSFM offers two Docker image variants to suit different needs: 54 | 55 | #### Full variant (recommended) 56 | ```bash 57 | docker run -p 8000:8000 dbcccc/ttsfm:latest 58 | ``` 59 | 60 | **Includes ffmpeg for advanced features:** 61 | - ✅ All 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM) 62 | - ✅ Speed adjustment (0.25x - 4.0x) 63 | - ✅ Format conversion with ffmpeg 64 | - ✅ MP3 auto-combine for long text 65 | - ✅ WAV auto-combine for long text 66 | 67 | #### Slim variant - ~100MB 68 | ```bash 69 | docker run -p 8000:8000 dbcccc/ttsfm:slim 70 | ``` 71 | 72 | **Minimal image without ffmpeg:** 73 | - ✅ Basic TTS functionality 74 | - ✅ 2 audio formats (MP3, WAV only) 75 | - ✅ WAV auto-combine for long text 76 | - ❌ No speed adjustment 77 | - ❌ No format conversion 78 | - ❌ No MP3 auto-combine 79 | 80 | The container exposes the web playground at `http://localhost:8000` and an OpenAI-compatible endpoint at `/v1/audio/speech`. 
81 | 82 | **Check available features:** 83 | ```bash 84 | curl http://localhost:8000/api/capabilities 85 | ``` 86 | 87 | ## Quick start 88 | 89 | ### Python client 90 | 91 | ```python 92 | from ttsfm import TTSClient, AudioFormat, Voice 93 | 94 | client = TTSClient() 95 | 96 | # Basic usage 97 | response = client.generate_speech( 98 | text="Hello from TTSFM!", 99 | voice=Voice.ALLOY, 100 | response_format=AudioFormat.MP3, 101 | ) 102 | response.save_to_file("hello") # -> hello.mp3 103 | 104 | # With speed adjustment (requires ffmpeg) 105 | response = client.generate_speech( 106 | text="This will be faster!", 107 | voice=Voice.NOVA, 108 | response_format=AudioFormat.MP3, 109 | speed=1.5, # 1.5x speed (0.25 - 4.0) 110 | ) 111 | response.save_to_file("fast") # -> fast.mp3 112 | ``` 113 | 114 | ### CLI 115 | 116 | ```bash 117 | ttsfm "Hello, world" --voice nova --format mp3 --output hello.mp3 118 | ``` 119 | 120 | ### REST API (OpenAI-compatible) 121 | 122 | ```bash 123 | # Basic request 124 | curl -X POST http://localhost:8000/v1/audio/speech \ 125 | -H "Content-Type: application/json" \ 126 | -d '{ 127 | "model": "tts-1", 128 | "input": "Hello world!", 129 | "voice": "alloy", 130 | "response_format": "mp3" 131 | }' --output speech.mp3 132 | 133 | # With speed adjustment (requires full image) 134 | curl -X POST http://localhost:8000/v1/audio/speech \ 135 | -H "Content-Type: application/json" \ 136 | -d '{ 137 | "model": "tts-1", 138 | "input": "Hello world!", 139 | "voice": "alloy", 140 | "response_format": "mp3", 141 | "speed": 1.5 142 | }' --output speech_fast.mp3 143 | ``` 144 | 145 | **Available voices:** alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse 146 | **Available formats:** mp3, wav (always) + opus, aac, flac, pcm (full image only) 147 | **Speed range:** 0.25 - 4.0 (requires full image) 148 | 149 | ## Learn more 150 | 151 | - Browse the full API reference and operational notes in the [web documentation](http://localhost:8000/docs) 
(or see `ttsfm-web/templates/docs.html`). 152 | - Read the [architecture overview](docs/architecture.md) for component diagrams. 153 | - Contributions are welcome—see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. 154 | 155 | ## License 156 | 157 | TTSFM is released under the [MIT License](LICENSE). 158 | -------------------------------------------------------------------------------- /docs/websocket-streaming.md: -------------------------------------------------------------------------------- 1 | # 🚀 WebSocket Streaming for TTSFM 2 | 3 | Real-time audio streaming for text-to-speech generation using WebSockets. 4 | 5 | ## Overview 6 | 7 | The WebSocket streaming feature provides: 8 | - **Real-time audio chunk delivery** as they're generated 9 | - **Progress tracking** with live updates 10 | - **Lower perceived latency** - start receiving audio before complete generation 11 | - **Cancellable operations** - stop mid-generation if needed 12 | 13 | ## Quick Start 14 | 15 | ### 1. Docker Deployment (Recommended) 16 | 17 | ```bash 18 | # Build with WebSocket support 19 | docker build -t ttsfm-websocket . 20 | 21 | # Run with WebSocket enabled 22 | docker run -p 8000:8000 \ 23 | -e DEBUG=false \ 24 | ttsfm-websocket 25 | ``` 26 | 27 | ### 2. Test WebSocket Connection 28 | 29 | Visit `http://localhost:8000/websocket-demo` for an interactive demo. 30 | 31 | ### 3. 
Client Usage 32 | 33 | ```javascript 34 | // Initialize WebSocket client 35 | const client = new WebSocketTTSClient({ 36 | socketUrl: 'http://localhost:8000', 37 | debug: true 38 | }); 39 | 40 | // Generate speech with streaming 41 | const result = await client.generateSpeech('Hello, WebSocket world!', { 42 | voice: 'alloy', 43 | format: 'mp3', 44 | onProgress: (progress) => { 45 | console.log(`Progress: ${progress.progress}%`); 46 | }, 47 | onChunk: (chunk) => { 48 | console.log(`Received chunk ${chunk.chunkIndex + 1}`); 49 | // Process audio chunk in real-time 50 | }, 51 | onComplete: (result) => { 52 | console.log('Generation complete!'); 53 | // Play or download the combined audio 54 | } 55 | }); 56 | ``` 57 | 58 | ## API Reference 59 | 60 | ### WebSocket Events 61 | 62 | #### Client → Server 63 | 64 | **`generate_stream`** 65 | ```javascript 66 | { 67 | text: string, // Text to convert 68 | voice: string, // Voice ID (alloy, echo, etc.) 69 | format: string, // Audio format (mp3, wav, opus) 70 | chunk_size: number // Optional, default 1024 71 | } 72 | ``` 73 | 74 | **`cancel_stream`** 75 | ```javascript 76 | { 77 | request_id: string // Request ID to cancel 78 | } 79 | ``` 80 | 81 | #### Server → Client 82 | 83 | **`stream_started`** 84 | ```javascript 85 | { 86 | request_id: string, 87 | timestamp: number 88 | } 89 | ``` 90 | 91 | **`audio_chunk`** 92 | ```javascript 93 | { 94 | request_id: string, 95 | chunk_index: number, 96 | total_chunks: number, 97 | audio_data: string, // Hex-encoded audio data 98 | format: string, 99 | duration: number, 100 | generation_time: number, 101 | chunk_text: string // Preview of chunk text 102 | } 103 | ``` 104 | 105 | **`stream_progress`** 106 | ```javascript 107 | { 108 | request_id: string, 109 | progress: number, // 0-100 110 | total_chunks: number, 111 | chunks_completed: number, 112 | status: string 113 | } 114 | ``` 115 | 116 | **`stream_complete`** 117 | ```javascript 118 | { 119 | request_id: string, 120 | 
total_chunks: number, 121 | status: 'completed', 122 | timestamp: number 123 | } 124 | ``` 125 | 126 | **`stream_error`** 127 | ```javascript 128 | { 129 | request_id: string, 130 | error: string, 131 | timestamp: number 132 | } 133 | ``` 134 | 135 | ## Performance Considerations 136 | 137 | 1. **Chunk Size**: Smaller chunks (512-1024 chars) provide more frequent updates but increase overhead 138 | 2. **Network Latency**: WebSocket reduces latency compared to HTTP polling 139 | 3. **Audio Buffering**: Client should buffer chunks for smooth playback 140 | 4. **Concurrent Streams**: Server supports multiple concurrent streaming sessions 141 | 142 | ## Browser Support 143 | 144 | - Chrome/Edge: Full support 145 | - Firefox: Full support 146 | - Safari: Full support (iOS 11.3+) 147 | - IE11: Not supported (use polling fallback) 148 | 149 | ## Troubleshooting 150 | 151 | ### Connection Issues 152 | ```javascript 153 | // Check WebSocket status 154 | fetch('/api/websocket/status') 155 | .then(res => res.json()) 156 | .then(data => console.log('WebSocket status:', data)); 157 | ``` 158 | 159 | ### Debug Mode 160 | ```javascript 161 | const client = new WebSocketTTSClient({ 162 | debug: true // Enable console logging 163 | }); 164 | ``` 165 | 166 | ### Common Issues 167 | 168 | 1. **"WebSocket connection failed"** 169 | - Check if port 8000 is accessible 170 | - Ensure eventlet is installed: `pip install eventlet>=0.33.3` 171 | - Try polling transport as fallback 172 | 173 | 2. **"Chunks arriving out of order"** 174 | - Client automatically sorts chunks by index 175 | - Check network stability 176 | 177 | 3. 
**"Audio playback stuttering"** 178 | - Increase chunk size for better buffering 179 | - Check client-side audio buffer implementation 180 | 181 | ## Advanced Usage 182 | 183 | ### Custom Chunk Processing 184 | ```javascript 185 | client.generateSpeech(text, { 186 | onChunk: async (chunk) => { 187 | // Custom processing per chunk 188 | const processed = await processAudioChunk(chunk.audioData); 189 | audioQueue.push(processed); 190 | 191 | // Start playback after first chunk 192 | if (chunk.chunkIndex === 0) { 193 | startStreamingPlayback(audioQueue); 194 | } 195 | } 196 | }); 197 | ``` 198 | 199 | ### Progress Visualization 200 | ```javascript 201 | client.generateSpeech(text, { 202 | onProgress: (progress) => { 203 | // Update UI progress bar 204 | progressBar.style.width = `${progress.progress}%`; 205 | statusText.textContent = `Processing chunk ${progress.chunksCompleted}/${progress.totalChunks}`; 206 | } 207 | }); 208 | ``` 209 | 210 | ## Security 211 | 212 | - WebSocket connections respect API key authentication if enabled 213 | - CORS is configured for cross-origin requests 214 | - SSL/TLS recommended for production deployments 215 | 216 | ## Deployment Notes 217 | 218 | For production deployment with your existing setup: 219 | 220 | ```bash 221 | # Build new image with WebSocket support 222 | docker build -t ttsfm-websocket:latest . 
223 | 224 | # Deploy to your server (192.168.1.150) 225 | docker stop ttsfm-container 226 | docker rm ttsfm-container 227 | docker run -d \ 228 | --name ttsfm-container \ 229 | -p 8000:8000 \ 230 | -e REQUIRE_API_KEY=true \ 231 | -e TTSFM_API_KEY=your-secret-key \ 232 | -e DEBUG=false \ 233 | ttsfm-websocket:latest 234 | ``` 235 | 236 | ## Performance Metrics 237 | 238 | Based on testing with openai.fm backend: 239 | - First chunk delivery: ~0.5-1s 240 | - Streaming overhead: ~10-15% vs batch processing 241 | - Concurrent connections: 100+ (limited by server resources) 242 | - Memory usage: ~50MB per active stream 243 | 244 | *Built by a grumpy senior engineer who thinks HTTP was good enough* -------------------------------------------------------------------------------- /ttsfm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TTSFM - Text-to-Speech for Free using OpenAI.fm 3 | 4 | A Python library for generating high-quality text-to-speech audio using the free OpenAI.fm service. 5 | Supports multiple voices and audio formats with a simple, intuitive API. 6 | 7 | Example: 8 | >>> from ttsfm import TTSClient, Voice, AudioFormat 9 | >>> 10 | >>> client = TTSClient() 11 | >>> 12 | >>> # Generate MP3 audio 13 | >>> mp3_response = client.generate_speech( 14 | ... text="Hello, world!", 15 | ... voice=Voice.ALLOY, 16 | ... response_format=AudioFormat.MP3 17 | ... ) 18 | >>> mp3_response.save_to_file("hello") # Saves as hello.mp3 19 | >>> 20 | >>> # Generate WAV audio 21 | >>> wav_response = client.generate_speech( 22 | ... text="High quality audio", 23 | ... voice=Voice.NOVA, 24 | ... response_format=AudioFormat.WAV 25 | ... ) 26 | >>> wav_response.save_to_file("audio") # Saves as audio.wav 27 | >>> 28 | >>> # Generate OPUS audio 29 | >>> opus_response = client.generate_speech( 30 | ... text="Compressed audio", 31 | ... voice=Voice.ECHO, 32 | ... response_format=AudioFormat.OPUS 33 | ... 
) 34 | >>> opus_response.save_to_file("compressed") # Saves as compressed.wav 35 | """ 36 | 37 | from typing import Optional 38 | 39 | from .async_client import AsyncTTSClient 40 | from .audio import combine_audio_chunks, combine_responses 41 | from .client import TTSClient 42 | from .exceptions import ( 43 | APIException, 44 | AudioProcessingException, 45 | AuthenticationException, 46 | NetworkException, 47 | QuotaExceededException, 48 | RateLimitException, 49 | ServiceUnavailableException, 50 | TTSException, 51 | ValidationException, 52 | ) 53 | from .models import ( 54 | APIError, 55 | AudioFormat, 56 | NetworkError, 57 | TTSError, 58 | TTSRequest, 59 | TTSResponse, 60 | ValidationError, 61 | Voice, 62 | ) 63 | from .utils import split_text_by_length, validate_text_length 64 | 65 | __version__ = "3.4.2" 66 | __author__ = "dbcccc" 67 | __email__ = "120614547+dbccccccc@users.noreply.github.com" 68 | __description__ = "Text-to-Speech API Client with OpenAI compatibility" 69 | __url__ = "https://github.com/dbccccccc/ttsfm" 70 | 71 | # Default client instance for convenience 72 | default_client = None 73 | 74 | 75 | def create_client(base_url: Optional[str] = None, api_key: Optional[str] = None, **kwargs) -> TTSClient: # type: ignore[misc] 76 | """ 77 | Create a new TTS client instance. 78 | 79 | Args: 80 | base_url: Base URL for the TTS service 81 | api_key: API key for authentication (if required) 82 | **kwargs: Additional client configuration 83 | 84 | Returns: 85 | TTSClient: Configured client instance 86 | """ 87 | client_kwargs = kwargs.copy() 88 | if base_url is not None: 89 | client_kwargs["base_url"] = base_url 90 | if api_key is not None: 91 | client_kwargs["api_key"] = api_key 92 | return TTSClient(**client_kwargs) 93 | 94 | 95 | def create_async_client(base_url: Optional[str] = None, api_key: Optional[str] = None, **kwargs) -> AsyncTTSClient: # type: ignore[misc] 96 | """ 97 | Create a new async TTS client instance. 
98 | 99 | Args: 100 | base_url: Base URL for the TTS service 101 | api_key: API key for authentication (if required) 102 | **kwargs: Additional client configuration 103 | 104 | Returns: 105 | AsyncTTSClient: Configured async client instance 106 | """ 107 | client_kwargs = kwargs.copy() 108 | if base_url is not None: 109 | client_kwargs["base_url"] = base_url 110 | if api_key is not None: 111 | client_kwargs["api_key"] = api_key 112 | return AsyncTTSClient(**client_kwargs) 113 | 114 | 115 | def set_default_client(client: TTSClient) -> None: 116 | """Set the default client instance for convenience functions.""" 117 | global default_client 118 | default_client = client 119 | 120 | 121 | def generate_speech(text: str, voice: str = "alloy", **kwargs) -> TTSResponse: # type: ignore[misc] 122 | """ 123 | Convenience function to generate speech using the default client. 124 | 125 | Args: 126 | text: Text to convert to speech 127 | voice: Voice to use for generation 128 | **kwargs: Additional generation parameters 129 | 130 | Returns: 131 | TTSResponse: Generated audio response 132 | 133 | Raises: 134 | TTSException: If no default client is set or generation fails 135 | """ 136 | if default_client is None: 137 | raise TTSException("No default client set. Use create_client() first.") 138 | 139 | return default_client.generate_speech(text=text, voice=voice, **kwargs) 140 | 141 | 142 | def generate_speech_long_text(text: str, voice: str = "alloy", **kwargs): # type: ignore[no-untyped-def] 143 | """ 144 | Convenience function to generate speech from long text using the default client. 145 | 146 | Automatically splits long text into chunks and generates speech for each chunk. 147 | 148 | Args: 149 | text: Text to convert to speech (can be longer than 1000 characters) 150 | voice: Voice to use for generation 151 | **kwargs: Additional generation parameters (max_length, preserve_words, etc.) 
152 | 153 | Returns: 154 | list: List of TTSResponse objects for each chunk 155 | 156 | Raises: 157 | TTSException: If no default client is set or generation fails 158 | """ 159 | if default_client is None: 160 | raise TTSException("No default client set. Use create_client() first.") 161 | 162 | return default_client.generate_speech_long_text(text=text, voice=voice, **kwargs) 163 | 164 | 165 | # Export all public components 166 | __all__ = [ 167 | # Main classes 168 | "TTSClient", 169 | "AsyncTTSClient", 170 | # Models 171 | "TTSRequest", 172 | "TTSResponse", 173 | "Voice", 174 | "AudioFormat", 175 | "TTSError", 176 | "APIError", 177 | "NetworkError", 178 | "ValidationError", 179 | # Exceptions 180 | "TTSException", 181 | "APIException", 182 | "NetworkException", 183 | "ValidationException", 184 | "RateLimitException", 185 | "AuthenticationException", 186 | "ServiceUnavailableException", 187 | "QuotaExceededException", 188 | "AudioProcessingException", 189 | # Factory functions 190 | "create_client", 191 | "create_async_client", 192 | "set_default_client", 193 | "generate_speech", 194 | "generate_speech_long_text", 195 | # Utility functions 196 | "validate_text_length", 197 | "split_text_by_length", 198 | "combine_audio_chunks", 199 | "combine_responses", 200 | # Package metadata 201 | "__version__", 202 | "__author__", 203 | "__email__", 204 | "__description__", 205 | "__url__", 206 | ] 207 | -------------------------------------------------------------------------------- /.github/workflows/docker-build-full.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Push (Full) 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | release: 9 | types: [published] 10 | 11 | env: 12 | REGISTRY_DOCKERHUB: docker.io 13 | REGISTRY_GHCR: ghcr.io 14 | IMAGE_NAME: ${{ github.repository }} 15 | DOCKERHUB_NAMESPACE: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_USERNAME || 
github.repository_owner }} 16 | 17 | jobs: 18 | build-and-push-full: 19 | runs-on: ubuntu-latest 20 | permissions: 21 | contents: read 22 | packages: write 23 | steps: 24 | - name: Checkout repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Determine build settings 28 | id: build-config 29 | env: 30 | EVENT_NAME: ${{ github.event_name }} 31 | EVENT_ACTION: ${{ github.event.action }} 32 | run: | 33 | if [ "$EVENT_NAME" = "release" ] && [ "$EVENT_ACTION" = "published" ]; then 34 | echo "push=true" >> "$GITHUB_OUTPUT" 35 | echo "platforms=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT" 36 | echo "load=false" >> "$GITHUB_OUTPUT" 37 | else 38 | echo "push=false" >> "$GITHUB_OUTPUT" 39 | echo "platforms=linux/amd64" >> "$GITHUB_OUTPUT" 40 | echo "load=true" >> "$GITHUB_OUTPUT" 41 | fi 42 | 43 | - name: Derive image version 44 | id: version 45 | env: 46 | EVENT_NAME: ${{ github.event_name }} 47 | TAG_NAME: ${{ github.event.release.tag_name }} 48 | REF_NAME: ${{ github.ref_name }} 49 | GITHUB_SHA: ${{ github.sha }} 50 | run: | 51 | version="" 52 | if [ "$EVENT_NAME" = "release" ] && [ -n "$TAG_NAME" ]; then 53 | version="$TAG_NAME" 54 | elif [ -n "$REF_NAME" ]; then 55 | version="$REF_NAME" 56 | fi 57 | version="${version##*/}" 58 | if [ "${version#v}" != "$version" ]; then 59 | version="${version#v}" 60 | fi 61 | if [ -z "$version" ]; then 62 | version="${GITHUB_SHA:0:12}" 63 | fi 64 | if ! 
echo "$version" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+'; then 65 | safe_branch=$(printf %s "$version" | tr -c 'A-Za-z0-9' '-') 66 | safe_branch=${safe_branch%-} 67 | if [ -z "$safe_branch" ]; then 68 | safe_branch="sha-${GITHUB_SHA:0:12}" 69 | fi 70 | version="0.0.0+${safe_branch}" 71 | fi 72 | echo "version=$version" >> "$GITHUB_OUTPUT" 73 | 74 | - name: Set up QEMU 75 | if: steps.build-config.outputs.platforms == 'linux/amd64,linux/arm64' 76 | uses: docker/setup-qemu-action@v3 77 | 78 | - name: Set up Docker Buildx 79 | uses: docker/setup-buildx-action@v3 80 | with: 81 | driver: docker-container 82 | 83 | - name: Login to Docker Hub 84 | if: steps.build-config.outputs.push == 'true' 85 | uses: docker/login-action@v3 86 | with: 87 | username: ${{ secrets.DOCKERHUB_USERNAME }} 88 | password: ${{ secrets.DOCKERHUB_TOKEN }} 89 | 90 | - name: Login to GitHub Container Registry 91 | if: steps.build-config.outputs.push == 'true' 92 | uses: docker/login-action@v3 93 | with: 94 | registry: ${{ env.REGISTRY_GHCR }} 95 | username: ${{ github.actor }} 96 | password: ${{ secrets.GITHUB_TOKEN }} 97 | 98 | - name: Extract metadata 99 | id: meta 100 | if: steps.build-config.outputs.push == 'true' 101 | uses: docker/metadata-action@v5 102 | with: 103 | images: | 104 | ${{ env.DOCKERHUB_NAMESPACE }}/ttsfm 105 | ${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }} 106 | tags: | 107 | type=semver,pattern=v{{version}} 108 | type=raw,value=latest,enable=${{ github.event.release.prerelease == false }} 109 | labels: | 110 | org.opencontainers.image.source=${{ github.repositoryUrl }} 111 | org.opencontainers.image.description=Free TTS API server compatible with OpenAI's TTS API format using openai.fm (full variant with ffmpeg) 112 | org.opencontainers.image.licenses=MIT 113 | org.opencontainers.image.title=TTSFM - Free TTS API Server (Full) 114 | org.opencontainers.image.vendor=dbcccc 115 | flavor: | 116 | latest=auto 117 | 118 | - name: Set local image metadata 119 | id: meta-local 120 | if: 
steps.build-config.outputs.push != 'true' 121 | run: | 122 | echo "tags=${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}:ci-${GITHUB_RUN_ID}-full" >> "$GITHUB_OUTPUT" 123 | echo "labels=org.opencontainers.image.source=${{ github.repositoryUrl }}" >> "$GITHUB_OUTPUT" 124 | 125 | - name: Build and push image 126 | id: build-and-push 127 | uses: docker/build-push-action@v5 128 | with: 129 | context: . 130 | platforms: ${{ steps.build-config.outputs.platforms }} 131 | push: ${{ steps.build-config.outputs.push == 'true' }} 132 | load: ${{ steps.build-config.outputs.load == 'true' }} 133 | tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }} 134 | labels: ${{ steps.meta.outputs.labels || steps.meta-local.outputs.labels }} 135 | cache-from: type=gha,scope=full 136 | cache-to: type=gha,mode=max,scope=full 137 | build-args: | 138 | VERSION=${{ steps.version.outputs.version }} 139 | VARIANT=full 140 | 141 | - name: Smoke test image 142 | if: steps.build-config.outputs.load == 'true' 143 | run: | 144 | set -euo pipefail 145 | IMAGE="${{ steps.meta-local.outputs.tags }}" 146 | echo "Running smoke test for full image: $IMAGE" 147 | docker rm -f ttsfm-smoke >/dev/null 2>&1 || true 148 | docker run -d --name ttsfm-smoke -p 127.0.0.1:8000:8000 "$IMAGE" 149 | success="" 150 | for attempt in $(seq 1 10); do 151 | if curl --fail --silent --max-time 5 http://127.0.0.1:8000/api/health > /tmp/ttsfm-health.json; then 152 | success="yes" 153 | cat /tmp/ttsfm-health.json 154 | break 155 | fi 156 | sleep 3 157 | done 158 | docker logs ttsfm-smoke || true 159 | docker rm -f ttsfm-smoke >/dev/null 2>&1 || true 160 | if [ -z "$success" ]; then 161 | echo "Container health check failed" >&2 162 | exit 1 163 | fi 164 | 165 | - name: Show image info 166 | run: | 167 | echo "Variant: full" 168 | echo "Push enabled: ${{ steps.build-config.outputs.push }}" 169 | echo "Image tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}" 170 | echo "Image digest: ${{ 
steps.build-and-push.outputs.digest }}" 171 | 172 | -------------------------------------------------------------------------------- /.github/workflows/docker-build-slim.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Push (Slim) 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | release: 9 | types: [published] 10 | 11 | env: 12 | REGISTRY_DOCKERHUB: docker.io 13 | REGISTRY_GHCR: ghcr.io 14 | IMAGE_NAME: ${{ github.repository }} 15 | DOCKERHUB_NAMESPACE: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_USERNAME || github.repository_owner }} 16 | 17 | jobs: 18 | build-and-push-slim: 19 | runs-on: ubuntu-latest 20 | permissions: 21 | contents: read 22 | packages: write 23 | steps: 24 | - name: Checkout repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Determine build settings 28 | id: build-config 29 | env: 30 | EVENT_NAME: ${{ github.event_name }} 31 | EVENT_ACTION: ${{ github.event.action }} 32 | run: | 33 | if [ "$EVENT_NAME" = "release" ] && [ "$EVENT_ACTION" = "published" ]; then 34 | echo "push=true" >> "$GITHUB_OUTPUT" 35 | echo "platforms=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT" 36 | echo "load=false" >> "$GITHUB_OUTPUT" 37 | else 38 | echo "push=false" >> "$GITHUB_OUTPUT" 39 | echo "platforms=linux/amd64" >> "$GITHUB_OUTPUT" 40 | echo "load=true" >> "$GITHUB_OUTPUT" 41 | fi 42 | 43 | - name: Derive image version 44 | id: version 45 | env: 46 | EVENT_NAME: ${{ github.event_name }} 47 | TAG_NAME: ${{ github.event.release.tag_name }} 48 | REF_NAME: ${{ github.ref_name }} 49 | GITHUB_SHA: ${{ github.sha }} 50 | run: | 51 | version="" 52 | if [ "$EVENT_NAME" = "release" ] && [ -n "$TAG_NAME" ]; then 53 | version="$TAG_NAME" 54 | elif [ -n "$REF_NAME" ]; then 55 | version="$REF_NAME" 56 | fi 57 | version="${version##*/}" 58 | if [ "${version#v}" != "$version" ]; then 59 | version="${version#v}" 60 | fi 61 | if [ -z "$version" ]; then 62 | 
version="${GITHUB_SHA:0:12}" 63 | fi 64 | if ! echo "$version" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+'; then 65 | safe_branch=$(printf %s "$version" | tr -c 'A-Za-z0-9' '-') 66 | safe_branch=${safe_branch%-} 67 | if [ -z "$safe_branch" ]; then 68 | safe_branch="sha-${GITHUB_SHA:0:12}" 69 | fi 70 | version="0.0.0+${safe_branch}" 71 | fi 72 | echo "version=$version" >> "$GITHUB_OUTPUT" 73 | 74 | - name: Set up QEMU 75 | if: steps.build-config.outputs.platforms == 'linux/amd64,linux/arm64' 76 | uses: docker/setup-qemu-action@v3 77 | 78 | - name: Set up Docker Buildx 79 | uses: docker/setup-buildx-action@v3 80 | with: 81 | driver: docker-container 82 | 83 | - name: Login to Docker Hub 84 | if: steps.build-config.outputs.push == 'true' 85 | uses: docker/login-action@v3 86 | with: 87 | username: ${{ secrets.DOCKERHUB_USERNAME }} 88 | password: ${{ secrets.DOCKERHUB_TOKEN }} 89 | 90 | - name: Login to GitHub Container Registry 91 | if: steps.build-config.outputs.push == 'true' 92 | uses: docker/login-action@v3 93 | with: 94 | registry: ${{ env.REGISTRY_GHCR }} 95 | username: ${{ github.actor }} 96 | password: ${{ secrets.GITHUB_TOKEN }} 97 | 98 | - name: Extract metadata 99 | id: meta 100 | if: steps.build-config.outputs.push == 'true' 101 | uses: docker/metadata-action@v5 102 | with: 103 | images: | 104 | ${{ env.DOCKERHUB_NAMESPACE }}/ttsfm 105 | ${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }} 106 | tags: | 107 | type=semver,pattern=v{{version}},suffix=-slim 108 | type=raw,value=slim,enable=${{ !contains(github.ref, 'alpha') && !contains(github.ref, 'beta') }} 109 | labels: | 110 | org.opencontainers.image.source=${{ github.repositoryUrl }} 111 | org.opencontainers.image.description=Free TTS API server compatible with OpenAI's TTS API format using openai.fm (slim variant without ffmpeg) 112 | org.opencontainers.image.licenses=MIT 113 | org.opencontainers.image.title=TTSFM - Free TTS API Server (Slim) 114 | org.opencontainers.image.vendor=dbcccc 115 | 116 | - name: Set local 
image metadata 117 | id: meta-local 118 | if: steps.build-config.outputs.push != 'true' 119 | run: | 120 | echo "tags=${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}:ci-${GITHUB_RUN_ID}-slim" >> "$GITHUB_OUTPUT" 121 | echo "labels=org.opencontainers.image.source=${{ github.repositoryUrl }}" >> "$GITHUB_OUTPUT" 122 | 123 | - name: Build and push image 124 | id: build-and-push 125 | uses: docker/build-push-action@v5 126 | with: 127 | context: . 128 | platforms: ${{ steps.build-config.outputs.platforms }} 129 | push: ${{ steps.build-config.outputs.push == 'true' }} 130 | load: ${{ steps.build-config.outputs.load == 'true' }} 131 | tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }} 132 | labels: ${{ steps.meta.outputs.labels || steps.meta-local.outputs.labels }} 133 | cache-from: type=gha,scope=slim 134 | cache-to: type=gha,mode=max,scope=slim 135 | build-args: | 136 | VERSION=${{ steps.version.outputs.version }} 137 | VARIANT=slim 138 | 139 | - name: Smoke test image 140 | if: steps.build-config.outputs.load == 'true' 141 | run: | 142 | set -euo pipefail 143 | IMAGE="${{ steps.meta-local.outputs.tags }}" 144 | echo "Running smoke test for slim image: $IMAGE" 145 | docker rm -f ttsfm-smoke-slim >/dev/null 2>&1 || true 146 | docker run -d --name ttsfm-smoke-slim -p 127.0.0.1:8001:8000 "$IMAGE" 147 | success="" 148 | for attempt in $(seq 1 10); do 149 | if curl --fail --silent --max-time 5 http://127.0.0.1:8001/api/health > /tmp/ttsfm-health-slim.json; then 150 | success="yes" 151 | cat /tmp/ttsfm-health-slim.json 152 | break 153 | fi 154 | sleep 3 155 | done 156 | docker logs ttsfm-smoke-slim || true 157 | docker rm -f ttsfm-smoke-slim >/dev/null 2>&1 || true 158 | if [ -z "$success" ]; then 159 | echo "Container health check failed" >&2 160 | exit 1 161 | fi 162 | 163 | - name: Show image info 164 | run: | 165 | echo "Variant: slim" 166 | echo "Push enabled: ${{ steps.build-config.outputs.push }}" 167 | echo "Image tags: ${{ steps.meta.outputs.tags || 
steps.meta-local.outputs.tags }}" 168 | echo "Image digest: ${{ steps.build-and-push.outputs.digest }}" 169 | 170 | -------------------------------------------------------------------------------- /ttsfm-web/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}TTSFM - {{ _('home.title') }}{% endblock %} 4 | 5 | {% block content %} 6 | 7 |
8 |
9 |
10 |
11 |
12 |
13 | Python Package 14 |
15 |

16 | {{ _('home.title') }} 17 |

18 |

19 | {{ _('home.subtitle') }} 20 |

21 | 32 |
33 |
34 |
35 |
36 |
37 | 38 | 39 |
40 |
41 |
42 |
43 |

{{ _('home.features_title') }}

44 |

45 | {{ _('home.features_subtitle') }} 46 |

47 |
48 |
49 | 50 |
51 |
52 |
53 |
54 | 55 |
56 |
{{ _('home.feature_free_title') }}
57 |

{{ _('home.feature_free_desc') }}

58 |
59 |
60 | 61 |
62 |
63 |
64 | 65 |
66 |
{{ _('home.feature_openai_title') }}
67 |

{{ _('home.feature_openai_desc') }}

68 |
69 |
70 | 71 |
72 |
73 |
74 | 75 |
76 |
{{ _('home.feature_async_title') }}
77 |

{{ _('home.feature_async_desc') }}

78 |
79 |
80 | 81 |
82 |
83 |
84 | 85 |
86 |
{{ _('home.feature_voices_title') }} & {{ _('home.feature_formats_title') }}
87 |

{{ _('home.feature_voices_desc') }} {{ _('home.feature_formats_desc') }}

88 |
89 |
90 |
91 |
92 |
93 | 94 | 95 |
96 |
97 |
98 |
99 |

{{ _('home.quick_start_title') }}

100 |

101 | {{ _('home.subtitle') }} 102 |

103 |
104 |
105 | 106 |
107 |
108 |
109 |
110 |
111 | {{ _('home.installation_title') }} 112 |
113 |
{{ _('home.installation_code') }}
114 | Requires Python 3.8+ 115 |
116 |
117 |
118 | 119 |
120 |
121 |
122 |
123 | {{ _('home.usage_title') }} 124 |
125 |
from ttsfm import TTSClient, Voice, AudioFormat
126 | 
127 | client = TTSClient()
128 | response = client.generate_speech(
129 |     text="Hello, world!",
130 |     voice=Voice.ALLOY,
131 |     response_format=AudioFormat.MP3
132 | )
133 | response.save_to_file("hello")
134 | No API keys required 135 |
136 |
137 |
138 |
139 | 140 | 152 |
153 |
"""Audio processing utilities using ffmpeg for advanced features."""

from __future__ import annotations

import logging
import math
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)


def adjust_audio_speed(
    audio_data: bytes,
    speed: float,
    input_format: str = "mp3",
    output_format: str = "mp3",
) -> bytes:
    """
    Adjust audio playback speed using ffmpeg.

    The audio is written to a temporary directory, processed by the ffmpeg
    ``atempo`` filter and read back. When ``speed`` is exactly 1.0 the input
    bytes are returned untouched (no ffmpeg call and no format conversion,
    even if ``input_format`` and ``output_format`` differ).

    Args:
        audio_data: Input audio data as bytes
        speed: Speed multiplier (0.25 to 4.0). 1.0 = normal speed, 2.0 = 2x faster
        input_format: Input audio format (mp3, wav, etc.); used as the temp file extension
        output_format: Output audio format (mp3, wav, etc.); used as the temp file extension

    Returns:
        Processed audio data as bytes

    Raises:
        RuntimeError: If ffmpeg is not available, times out, or processing fails
        ValueError: If speed is out of valid range
    """
    # Validate speed range (OpenAI TTS API supports 0.25 to 4.0).
    # NaN also fails this chained comparison and is rejected here.
    if not 0.25 <= speed <= 4.0:
        raise ValueError(f"Speed must be between 0.25 and 4.0, got {speed}")

    # If speed is 1.0, no processing needed
    if speed == 1.0:
        return audio_data

    # Check ffmpeg availability
    if not shutil.which("ffmpeg"):
        raise RuntimeError(
            "Speed adjustment requires ffmpeg. "
            "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
        )

    try:
        # The temporary directory (and both files in it) is removed on exit.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            input_file = tmp_path / f"input.{input_format}"
            output_file = tmp_path / f"output.{output_format}"

            # Write input audio to temp file
            input_file.write_bytes(audio_data)

            # atempo only supports 0.5-2.0 per filter instance, so the
            # helper may return a comma-separated chain of filters.
            atempo_filters = _build_atempo_filter_chain(speed)

            cmd = [
                "ffmpeg",
                "-i",
                str(input_file),
                "-filter:a",
                atempo_filters,
                "-y",  # Overwrite output file
                "-loglevel",
                "error",  # Only show errors
                str(output_file),
            ]

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=30,
            )

            if result.returncode != 0:
                logger.error("ffmpeg error: %s", result.stderr)
                raise RuntimeError(f"ffmpeg processing failed: {result.stderr}")

            # Read processed audio before the temp directory is cleaned up
            return output_file.read_bytes()

    except subprocess.TimeoutExpired as exc:
        raise RuntimeError("Audio processing timed out") from exc
    except Exception as exc:
        # Every failure is surfaced as RuntimeError (the documented contract);
        # chaining keeps the original traceback available for debugging.
        logger.error("Error adjusting audio speed: %s", exc)
        raise RuntimeError(f"Failed to adjust audio speed: {exc}") from exc


def _build_atempo_filter_chain(speed: float) -> str:
    """
    Build atempo filter chain for ffmpeg.

    The atempo filter only supports 0.5-2.0 range, so for speeds outside
    this range, we need to chain multiple atempo filters whose product
    equals the requested speed.

    Args:
        speed: Target speed multiplier; must be a positive finite number

    Returns:
        Filter string for ffmpeg, e.g. ``"atempo=2.0,atempo=1.5"`` for 3.0

    Raises:
        ValueError: If speed is not a positive finite number
    """
    # Zero, negative and infinite speeds would make the loops below spin
    # forever (halving/doubling never reaches the 0.5-2.0 window), and NaN
    # would produce "atempo=nan"; reject them up front.
    if not math.isfinite(speed) or speed <= 0:
        raise ValueError(f"Speed must be a positive finite number, got {speed}")

    if 0.5 <= speed <= 2.0:
        return f"atempo={speed}"

    # For speeds outside 0.5-2.0, chain multiple atempo filters
    filters = []
    remaining_speed = speed

    while remaining_speed > 2.0:
        filters.append("atempo=2.0")
        remaining_speed /= 2.0

    while remaining_speed < 0.5:
        filters.append("atempo=0.5")
        remaining_speed /= 0.5

    # A leftover factor of exactly 1.0 would be a no-op filter; skip it.
    if remaining_speed != 1.0:
        filters.append(f"atempo={remaining_speed}")

    return ",".join(filters)


def convert_audio_format(
    audio_data: bytes,
    input_format: str,
    output_format: str,
    bitrate: Optional[str] = None,
) -> bytes:
    """
    Convert audio from one format to another using ffmpeg.

    Args:
        audio_data: Input audio data as bytes
        input_format: Input audio format (mp3, wav, opus, aac, flac, pcm);
            used as the temp file extension
        output_format: Output audio format (mp3, wav, opus, aac, flac, pcm)
        bitrate: Optional bitrate for output (e.g., "128k", "192k")

    Returns:
        Converted audio data as bytes

    Raises:
        RuntimeError: If ffmpeg is not available, times out, or conversion fails
    """
    # Check ffmpeg availability
    if not shutil.which("ffmpeg"):
        raise RuntimeError(
            "Format conversion requires ffmpeg. "
            "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
        )

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            input_file = tmp_path / f"input.{input_format}"
            output_file = tmp_path / f"output.{output_format}"

            # Write input audio to temp file
            input_file.write_bytes(audio_data)

            cmd = [
                "ffmpeg",
                "-i",
                str(input_file),
                "-y",  # Overwrite output file
                "-loglevel",
                "error",
            ]

            # Add bitrate if specified
            if bitrate:
                cmd.extend(["-b:a", bitrate])

            # Output options per target format; formats not listed here
            # (e.g. mp3, wav) get no explicit codec option.
            if output_format == "opus":
                cmd.extend(["-c:a", "libopus"])
            elif output_format == "aac":
                cmd.extend(["-c:a", "aac"])
            elif output_format == "flac":
                cmd.extend(["-c:a", "flac"])
            elif output_format == "pcm":
                # Raw signed 16-bit little-endian samples, no container.
                cmd.extend(["-f", "s16le", "-acodec", "pcm_s16le"])

            cmd.append(str(output_file))

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=30,
            )

            if result.returncode != 0:
                logger.error("ffmpeg error: %s", result.stderr)
                raise RuntimeError(f"ffmpeg conversion failed: {result.stderr}")

            # Read converted audio before the temp directory is cleaned up
            return output_file.read_bytes()

    except subprocess.TimeoutExpired as exc:
        raise RuntimeError("Audio conversion timed out") from exc
    except Exception as exc:
        # Surface every failure as RuntimeError while keeping the cause chained.
        logger.error("Error converting audio format: %s", exc)
        raise RuntimeError(f"Failed to convert audio format: {exc}") from exc
"""
Exception classes for the TTSFM package.

This module defines the exception hierarchy used throughout the package
for consistent error handling and reporting.
"""

from typing import Any, Dict, Optional


class TTSException(Exception):
    """
    Base exception class for all TTSFM-related errors.

    Attributes:
        message: Human-readable error message
        code: Error code for programmatic handling (defaults to the class name)
        details: Additional error details (defaults to an empty dict)
    """

    def __init__(
        self, message: str, code: Optional[str] = None, details: Optional[Dict[str, Any]] = None
    ):
        super().__init__(message)
        self.message = message
        self.code = code or self.__class__.__name__
        self.details = details or {}

    def __str__(self) -> str:
        if self.code:
            return f"[{self.code}] {self.message}"
        return self.message

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(message='{self.message}', code='{self.code}')"


class APIException(TTSException):
    """
    Exception raised for API-related errors.

    This includes HTTP errors, invalid responses, and server-side issues.

    Attributes:
        status_code: HTTP status code of the failed response, if known
        response_data: Parsed response payload (empty dict when absent)
    """

    def __init__(
        self,
        message: str,
        status_code: Optional[int] = None,
        response_data: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.status_code = status_code
        self.response_data = response_data or {}

    def __str__(self) -> str:
        # Prefer the HTTP status over the generic "[code]" prefix when known.
        if self.status_code:
            return f"[HTTP {self.status_code}] {self.message}"
        return super().__str__()


class NetworkException(TTSException):
    """
    Exception raised for network-related errors.

    This includes connection timeouts, DNS resolution failures, and other
    network connectivity issues.

    Attributes:
        timeout: Timeout value associated with the failure, if any
        retry_count: Number of retries recorded by the caller
    """

    def __init__(
        self, message: str, timeout: Optional[float] = None, retry_count: int = 0, **kwargs: Any
    ) -> None:
        super().__init__(message, **kwargs)
        self.timeout = timeout
        self.retry_count = retry_count


class ValidationException(TTSException):
    """
    Exception raised for input validation errors.

    This includes invalid parameters, missing required fields, and
    data format issues.

    Attributes:
        field: Name of the offending field, if known
        value: The rejected value, if provided
    """

    def __init__(
        self, message: str, field: Optional[str] = None, value: Optional[Any] = None, **kwargs: Any
    ) -> None:
        super().__init__(message, **kwargs)
        self.field = field
        self.value = value

    def __str__(self) -> str:
        if self.field:
            return f"Validation error for '{self.field}': {self.message}"
        return f"Validation error: {self.message}"


class RateLimitException(APIException):
    """
    Exception raised when API rate limits are exceeded.

    Always carries HTTP status 429.

    Attributes:
        retry_after: Seconds to wait before retrying (if provided by server)
        limit: Rate limit that was exceeded
        remaining: Remaining requests in current window
    """

    def __init__(
        self,
        message: str = "Rate limit exceeded",
        retry_after: Optional[int] = None,
        limit: Optional[int] = None,
        remaining: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, status_code=429, **kwargs)
        self.retry_after = retry_after
        self.limit = limit
        self.remaining = remaining

    def __str__(self) -> str:
        msg = super().__str__()
        if self.retry_after:
            msg += f" (retry after {self.retry_after}s)"
        return msg


class AuthenticationException(APIException):
    """
    Exception raised for authentication and authorization errors.

    This includes invalid API keys, expired tokens, and insufficient
    permissions. Always carries HTTP status 401.
    """

    def __init__(self, message: str = "Authentication failed", **kwargs: Any) -> None:
        super().__init__(message, status_code=401, **kwargs)


class ServiceUnavailableException(APIException):
    """
    Exception raised when the TTS service is temporarily unavailable.

    This includes server maintenance, overload conditions, and
    temporary service outages. Always carries HTTP status 503.

    Attributes:
        retry_after: Seconds to wait before retrying (if provided by server)
    """

    def __init__(
        self,
        message: str = "Service temporarily unavailable",
        retry_after: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, status_code=503, **kwargs)
        self.retry_after = retry_after


class QuotaExceededException(APIException):
    """
    Exception raised when usage quotas are exceeded.

    This includes monthly limits, character limits, and other
    usage-based restrictions. Always carries HTTP status 402.

    Attributes:
        quota_type: Kind of quota that was exhausted, if known
        limit: The quota limit, if known
        used: Amount already consumed, if known
    """

    def __init__(
        self,
        message: str = "Usage quota exceeded",
        quota_type: Optional[str] = None,
        limit: Optional[int] = None,
        used: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, status_code=402, **kwargs)
        self.quota_type = quota_type
        self.limit = limit
        self.used = used


class AudioProcessingException(TTSException):
    """
    Exception raised for audio processing errors.

    This includes format conversion issues, audio generation failures,
    and output processing problems.

    Attributes:
        audio_format: Audio format involved in the failure, if known
    """

    def __init__(self, message: str, audio_format: Optional[str] = None, **kwargs: Any) -> None:
        super().__init__(message, **kwargs)
        self.audio_format = audio_format


def create_exception_from_response(
    status_code: int,
    response_data: Optional[Dict[str, Any]],
    default_message: str = "API request failed",
) -> APIException:
    """
    Create appropriate exception from API response.

    The message is taken from ``response_data["error"]["message"]`` when
    ``error`` is a mapping, or from ``response_data["error"]`` itself when it
    is a non-empty string; otherwise ``default_message`` is used. A payload
    that is not a dict (for example ``None`` from an unparsable body) is
    treated as empty instead of raising ``AttributeError``.

    Args:
        status_code: HTTP status code
        response_data: Response data from API
        default_message: Default message if none in response

    Returns:
        APIException: Appropriate exception instance (401, 402, 429 and 503
        map to their dedicated subclasses; anything else to ``APIException``)
    """
    # Error bodies are not guaranteed to be well-formed; never let the
    # exception factory itself blow up on an unexpected payload shape.
    if not isinstance(response_data, dict):
        response_data = {}

    error = response_data.get("error")
    if isinstance(error, dict):
        message = error.get("message", default_message)
    elif isinstance(error, str) and error:
        message = error
    else:
        message = default_message

    if status_code == 401:
        return AuthenticationException(message, response_data=response_data)
    elif status_code == 402:
        return QuotaExceededException(message, response_data=response_data)
    elif status_code == 429:
        retry_after = response_data.get("retry_after")
        return RateLimitException(message, retry_after=retry_after, response_data=response_data)
    elif status_code == 503:
        retry_after = response_data.get("retry_after")
        return ServiceUnavailableException(
            message,
            retry_after=retry_after,
            response_data=response_data,
        )
    else:
        return APIException(message, status_code=status_code, response_data=response_data)
class LanguageManager:
    """Manages language detection, translation loading, and text translation."""

    def __init__(self, app=None, translations_dir: str = "translations"):
        """
        Initialize the LanguageManager.

        Args:
            app: Flask application instance; when given, ``init_app`` is called immediately
            translations_dir: Directory containing translation files, relative to this module
        """
        self.translations_dir = translations_dir
        self.translations: Dict[str, Dict[str, Any]] = {}
        self.supported_languages = ["en", "zh"]
        self.default_language = "en"

        if app is not None:
            self.init_app(app)

    def init_app(self, app):
        """Initialize the Flask application with i18n support."""
        app.config.setdefault("LANGUAGES", self.supported_languages)
        app.config.setdefault("DEFAULT_LANGUAGE", self.default_language)

        # Load translations
        self.load_translations()

        # Register template functions
        app.jinja_env.globals["_"] = self.translate
        app.jinja_env.globals["get_locale"] = self.get_locale
        app.jinja_env.globals["get_supported_languages"] = self.get_supported_languages

        # Store reference to this instance
        app.language_manager = self

    def load_translations(self):
        """Load all translation files from the translations directory.

        Missing directories/files and unreadable JSON are reported on stdout
        and skipped; they never raise.
        """
        translations_path = os.path.join(os.path.dirname(__file__), self.translations_dir)

        if not os.path.exists(translations_path):
            print(f"Warning: Translations directory not found: {translations_path}")
            return

        for lang_code in self.supported_languages:
            file_path = os.path.join(translations_path, f"{lang_code}.json")

            if os.path.exists(file_path):
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        self.translations[lang_code] = json.load(f)
                    print(f"Info: Loaded translations for language: {lang_code}")
                except Exception as e:
                    # Best effort: one broken file must not prevent the
                    # remaining languages from loading.
                    print(f"Error: Failed to load translations for {lang_code}: {e}")
            else:
                print(f"Warning: Translation file not found: {file_path}")

    @staticmethod
    def _parse_accept_language(header: str) -> list:
        """
        Parse an Accept-Language header into primary language codes.

        Args:
            header: Raw header value, e.g. ``"zh-CN,zh;q=0.9,en;q=0.8"``

        Returns:
            Lower-cased primary subtags (``'zh-CN'`` -> ``'zh'``) ordered by
            descending ``q`` weight; ties keep header order. Entries with
            ``q=0`` (explicitly not acceptable) are dropped. A missing or
            unparsable ``q`` counts as 1.0.
        """
        ranked = []
        for position, item in enumerate(header.split(",")):
            tag, _sep, params = item.partition(";")
            code = tag.split("-")[0].strip().lower()
            if not code:
                continue

            quality = 1.0
            for param in params.split(";"):
                name, _eq, value = param.partition("=")
                if name.strip().lower() == "q":
                    try:
                        quality = min(max(float(value), 0.0), 1.0)
                    except ValueError:
                        quality = 1.0  # malformed weight: treat as unweighted

            if quality > 0:
                # Negated weight + position gives "highest q first, then
                # header order" with a plain ascending sort.
                ranked.append((-quality, position, code))

        ranked.sort()
        return [code for _quality, _position, code in ranked]

    def get_locale(self) -> str:
        """
        Get the current locale based on user preference, session, or browser settings.

        Resolution order: ``?lang=`` URL parameter, the language stored in the
        session, the browser's Accept-Language header (honouring ``q``
        weights), then the default language. A language chosen via URL
        parameter or header is written to the session.

        Returns:
            Language code (e.g., 'en', 'zh')
        """
        # 1. Check URL parameter (for language switching)
        if "lang" in request.args:
            lang = request.args.get("lang")
            if lang in self.supported_languages:
                session["language"] = lang
                return lang

        # 2. Check session (user's previous choice)
        if "language" in session:
            lang = session["language"]
            if lang in self.supported_languages:
                return lang

        # 3. Check browser's Accept-Language header, most preferred first
        accept_language = request.headers.get("Accept-Language")
        if accept_language:
            for lang_code in self._parse_accept_language(accept_language):
                if lang_code in self.supported_languages:
                    session["language"] = lang_code
                    return lang_code

        # 4. Fall back to default language
        return self.default_language

    def set_locale(self, lang_code: str) -> bool:
        """
        Set the current locale.

        Args:
            lang_code: Language code to set

        Returns:
            True if successful, False if language not supported
        """
        if lang_code in self.supported_languages:
            session["language"] = lang_code
            return True
        return False

    def translate(self, key: str, **kwargs) -> str:
        """
        Translate a text key to the current locale.

        Args:
            key: Translation key in dot notation (e.g., 'nav.home')
            **kwargs: Variables for string formatting

        Returns:
            Translated text or the key if translation not found
        """
        locale = self.get_locale()

        # Get translation for current locale
        translation = self._get_nested_value(self.translations.get(locale, {}), key)

        # Fall back to default language if not found
        if translation is None and locale != self.default_language:
            translation = self._get_nested_value(
                self.translations.get(self.default_language, {}), key
            )

        # Fall back to key if still not found
        if translation is None:
            translation = key

        # Format with variables if provided
        if kwargs and isinstance(translation, str):
            try:
                translation = translation.format(**kwargs)
            except (KeyError, ValueError):
                pass  # Deliberate: show the unformatted text rather than fail the page

        return translation

    def _get_nested_value(self, data: Dict[str, Any], key: str) -> Optional[str]:
        """
        Get a nested value from a dictionary using dot notation.

        Args:
            data: Dictionary to search in
            key: Dot-separated key (e.g., 'nav.home')

        Returns:
            The value if the full path exists and is a string, None otherwise
        """
        keys = key.split(".")
        current = data

        for k in keys:
            if isinstance(current, dict) and k in current:
                current = current[k]
            else:
                return None

        # Intermediate nodes (dicts) and non-string leaves are not translations.
        return current if isinstance(current, str) else None

    def get_supported_languages(self) -> Dict[str, str]:
        """
        Get a dictionary of supported languages with their display names.

        Returns:
            Dictionary mapping language codes to display names
        """
        return {"en": "English", "zh": "中文"}

    def get_language_info(self, lang_code: str) -> Dict[str, str]:
        """
        Get information about a specific language.

        Args:
            lang_code: Language code

        Returns:
            Dictionary with ``name`` and ``native`` entries; unknown codes
            fall back to the upper-cased code for both
        """
        language_names = {
            "en": {"name": "English", "native": "English"},
            "zh": {"name": "Chinese", "native": "中文"},
        }

        return language_names.get(
            lang_code, {"name": lang_code.upper(), "native": lang_code.upper()}
        )


# Global instance
language_manager = LanguageManager()


def init_i18n(app):
    """Initialize i18n support for the Flask application."""
    language_manager.init_app(app)
    return language_manager


# Template helper functions
def _(key: str, **kwargs) -> str:
    """Shorthand translation function for use in templates and code."""
    return language_manager.translate(key, **kwargs)


def get_locale() -> str:
    """Get the current locale."""
    return language_manager.get_locale()


def set_locale(lang_code: str) -> bool:
    """Set the current locale."""
    return language_manager.set_locale(lang_code)
apply parameters 25 | if (typeof value === 'string') { 26 | return formatString(value, params); 27 | } 28 | 29 | // Fallback to key 30 | return key; 31 | } 32 | 33 | // Format string with parameters 34 | function formatString(str, params) { 35 | return str.replace(/\{(\w+)\}/g, (match, key) => { 36 | return params.hasOwnProperty(key) ? params[key] : match; 37 | }); 38 | } 39 | 40 | // Load translations from server 41 | async function loadTranslations() { 42 | try { 43 | const response = await fetch(`/api/translations/${window.currentLocale}`); 44 | if (response.ok) { 45 | window.i18nData = await response.json(); 46 | } 47 | } catch (error) { 48 | console.warn('Failed to load translations:', error); 49 | } 50 | } 51 | 52 | // Sample texts for different languages 53 | const sampleTexts = { 54 | en: { 55 | welcome: "Welcome to TTSFM! This is a free text-to-speech service that converts your text into high-quality audio using advanced AI technology.", 56 | story: "Once upon a time, in a digital world far away, there lived a small Python package that could transform any text into beautiful speech. This package was called TTSFM, and it brought joy to developers everywhere.", 57 | technical: "TTSFM is a Python client for text-to-speech APIs that provides both synchronous and asynchronous interfaces. It supports multiple voices and audio formats, making it perfect for various applications.", 58 | multilingual: "TTSFM supports multiple languages and voices, allowing you to create diverse audio content for global audiences. The service is completely free and requires no API keys.", 59 | long: "This is a longer text sample designed to test the auto-combine feature of TTSFM. When text exceeds the maximum length limit, TTSFM automatically splits it into smaller chunks, generates audio for each chunk, and then seamlessly combines them into a single audio file. 
This process is completely transparent to the user and ensures that you can convert text of any length without worrying about technical limitations. The resulting audio maintains consistent quality and natural flow throughout the entire content." 60 | }, 61 | zh: { 62 | welcome: "欢迎使用TTSFM!这是一个免费的文本转语音服务,使用先进的AI技术将您的文本转换为高质量音频。", 63 | story: "很久很久以前,在一个遥远的数字世界里,住着一个小小的Python包,它能够将任何文本转换成美妙的语音。这个包叫做TTSFM,它为世界各地的开发者带来了快乐。", 64 | technical: "TTSFM是一个用于文本转语音API的Python客户端,提供同步和异步接口。它支持多种声音和音频格式,非常适合各种应用。", 65 | multilingual: "TTSFM支持多种语言和声音,让您能够为全球受众创建多样化的音频内容。该服务完全免费,无需API密钥。", 66 | long: "这是一个较长的文本示例,用于测试TTSFM的自动合并功能。当文本超过最大长度限制时,TTSFM会自动将其分割成较小的片段,为每个片段生成音频,然后无缝地将它们合并成一个音频文件。这个过程对用户完全透明,确保您可以转换任何长度的文本,而无需担心技术限制。生成的音频在整个内容中保持一致的质量和自然的流畅性。" 67 | } 68 | }; 69 | 70 | // Get sample text for current locale 71 | function getSampleText(type) { 72 | const locale = window.currentLocale; 73 | const texts = sampleTexts[locale] || sampleTexts.en; 74 | return texts[type] || texts.welcome; 75 | } 76 | 77 | // Error messages 78 | const errorMessages = { 79 | en: { 80 | empty_text: "Please enter some text to convert.", 81 | generation_failed: "Failed to generate speech. Please try again.", 82 | network_error: "Network error. Please check your connection and try again.", 83 | invalid_format: "Invalid audio format selected.", 84 | invalid_voice: "Invalid voice selected.", 85 | text_too_long: "Text is too long. Please reduce the length or enable auto-combine.", 86 | server_error: "Server error. Please try again later." 
87 | }, 88 | zh: { 89 | empty_text: "请输入要转换的文本。", 90 | generation_failed: "语音生成失败。请重试。", 91 | network_error: "网络错误。请检查您的连接并重试。", 92 | invalid_format: "选择的音频格式无效。", 93 | invalid_voice: "选择的声音无效。", 94 | text_too_long: "文本太长。请减少长度或启用自动合并。", 95 | server_error: "服务器错误。请稍后重试。" 96 | } 97 | }; 98 | 99 | // Success messages 100 | const successMessages = { 101 | en: { 102 | generation_complete: "Speech generated successfully!", 103 | text_copied: "Text copied to clipboard!", 104 | download_started: "Download started!" 105 | }, 106 | zh: { 107 | generation_complete: "语音生成成功!", 108 | text_copied: "文本已复制到剪贴板!", 109 | download_started: "下载已开始!" 110 | } 111 | }; 112 | 113 | // Get error message 114 | function getErrorMessage(key) { 115 | const locale = window.currentLocale; 116 | const messages = errorMessages[locale] || errorMessages.en; 117 | return messages[key] || key; 118 | } 119 | 120 | // Get success message 121 | function getSuccessMessage(key) { 122 | const locale = window.currentLocale; 123 | const messages = successMessages[locale] || successMessages.en; 124 | return messages[key] || key; 125 | } 126 | 127 | // Format file size 128 | function formatFileSize(bytes) { 129 | if (bytes === 0) return '0 Bytes'; 130 | 131 | const k = 1024; 132 | const sizes = window.currentLocale === 'zh' 133 | ? ['字节', 'KB', 'MB', 'GB'] 134 | : ['Bytes', 'KB', 'MB', 'GB']; 135 | 136 | const i = Math.floor(Math.log(bytes) / Math.log(k)); 137 | return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; 138 | } 139 | 140 | // Format duration 141 | function formatDuration(seconds) { 142 | if (isNaN(seconds) || seconds < 0) { 143 | return window.currentLocale === 'zh' ? '未知' : 'Unknown'; 144 | } 145 | 146 | const minutes = Math.floor(seconds / 60); 147 | const remainingSeconds = Math.floor(seconds % 60); 148 | 149 | if (minutes > 0) { 150 | return window.currentLocale === 'zh' 151 | ? 
/**
 * Refresh the locale-dependent UI text: the generate button label and the
 * unit suffix shown next to the character counter.
 *
 * Reads window.currentLocale; 'zh' selects Chinese, anything else English.
 * Elements that are not present on the page are skipped.
 */
function updateUIText() {
    const isZh = window.currentLocale === 'zh';

    // Only relabel the button while it is enabled.
    // NOTE(review): presumably a disabled button is showing progress text — confirm.
    const generateBtn = document.getElementById('generate-btn');
    if (generateBtn && !generateBtn.disabled) {
        generateBtn.innerHTML = isZh
            ? '生成语音'
            : 'Generate Speech';
    }

    // Update the counter label.
    const charCountElement = document.querySelector('#char-count');
    const parent = charCountElement ? charCountElement.parentElement : null;
    if (parent) {
        const suffix = isZh ? ' 字符' : ' characters';

        // Build the label from DOM nodes rather than an HTML string: the
        // count is inserted as text, so no manual escaping is needed, and
        // the #char-count element itself is kept in the document so later
        // lookups of it still succeed.
        parent.textContent = '';
        parent.appendChild(charCountElement);
        parent.appendChild(document.createTextNode(suffix));
    }
}
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional, Union

# Upper bound on the characters accepted in a single request. Larger
# ``max_length`` values are clamped to this in ``TTSRequest.__post_init__``.
_MAX_INPUT_LENGTH = 1000


class Voice(str, Enum):
    """Available voice options for TTS generation."""

    ALLOY = "alloy"
    ASH = "ash"
    BALLAD = "ballad"
    CORAL = "coral"
    ECHO = "echo"
    FABLE = "fable"
    NOVA = "nova"
    ONYX = "onyx"
    SAGE = "sage"
    SHIMMER = "shimmer"
    VERSE = "verse"


class AudioFormat(str, Enum):
    """Supported audio output formats."""

    MP3 = "mp3"
    WAV = "wav"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
    PCM = "pcm"


@dataclass
class TTSRequest:
    """
    Request model for TTS generation.

    Attributes:
        input: Text to convert to speech
        voice: Voice to use for generation (enum member or its string value,
            case-insensitive)
        response_format: Audio format for output (enum member or its string
            value, case-insensitive)
        instructions: Optional instructions for voice modulation
        model: Model to use (for OpenAI compatibility, usually ignored)
        speed: Speech speed, 0.25 to 4.0 (for OpenAI compatibility, usually ignored)
        max_length: Maximum allowed text length (default: 1000 characters;
            larger values are clamped to 1000)
        validate_length: Whether to validate text length (default: True)

    Raises:
        ValueError: If any field fails validation in ``__post_init__``.
    """

    input: str
    voice: Union[Voice, str] = Voice.ALLOY
    response_format: Union[AudioFormat, str] = AudioFormat.MP3
    instructions: Optional[str] = None
    model: Optional[str] = None
    speed: Optional[float] = None
    max_length: int = _MAX_INPUT_LENGTH
    validate_length: bool = True

    def __post_init__(self) -> None:
        """Validate and normalize fields after initialization."""
        # Clamp rather than reject an oversized limit.
        if self.max_length > _MAX_INPUT_LENGTH:
            self.max_length = _MAX_INPUT_LENGTH

        # Ensure voice is a valid Voice enum
        if isinstance(self.voice, str):
            try:
                self.voice = Voice(self.voice.lower())
            except ValueError as exc:
                raise ValueError(
                    f"Invalid voice: {self.voice}. Must be one of {list(Voice)}"
                ) from exc

        # Ensure response_format is a valid AudioFormat enum
        if isinstance(self.response_format, str):
            try:
                self.response_format = AudioFormat(self.response_format.lower())
            except ValueError as exc:
                raise ValueError(
                    f"Invalid format: {self.response_format}. Must be one of {list(AudioFormat)}"
                ) from exc

        # Validate input text
        if not self.input or not self.input.strip():
            raise ValueError("Input text cannot be empty")

        # Validate max_length before it is used: with a non-positive limit the
        # length check below would otherwise report "text is too long".
        if self.max_length <= 0:
            raise ValueError("max_length must be a positive integer")

        # Validate text length if enabled
        if self.validate_length:
            text_length = len(self.input)
            if text_length > self.max_length:
                raise ValueError(
                    f"Input text is too long ({text_length} characters). "
                    f"Maximum allowed length is {self.max_length} characters. "
                    f"Consider splitting your text into smaller chunks or disable "
                    f"length validation with validate_length=False."
                )

        # Validate speed if provided. The chained comparison also rejects NaN,
        # which compares false against both bounds.
        if self.speed is not None and not 0.25 <= self.speed <= 4.0:
            raise ValueError("Speed must be between 0.25 and 4.0")

    def to_dict(self) -> Dict[str, Any]:
        """Convert request to dictionary for API calls.

        Only ``input``, ``voice`` and ``response_format`` are always present;
        ``instructions`` and ``model`` are included when truthy and ``speed``
        when it is not None.
        """
        data: Dict[str, Any] = {
            "input": self.input,
            "voice": self.voice.value if isinstance(self.voice, Voice) else self.voice,
            "response_format": (
                self.response_format.value
                if isinstance(self.response_format, AudioFormat)
                else self.response_format
            ),
        }

        if self.instructions:
            data["instructions"] = self.instructions

        if self.model:
            data["model"] = self.model

        if self.speed is not None:
            data["speed"] = self.speed

        return data


@dataclass
class TTSResponse:
    """
    Response model for TTS generation.

    Attributes:
        audio_data: Generated audio as bytes
        content_type: MIME type of the audio data
        format: Audio format used
        size: Size of audio data in bytes (required; not derived here)
        duration: Estimated duration in seconds (if available)
        metadata: Additional response metadata
    """

    audio_data: bytes
    content_type: str
    format: AudioFormat
    size: int
    duration: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self) -> None:
        """Post-init hook; currently computes no derived fields.

        ``size`` is a required field, so it is whatever the caller supplied.
        """

    def save_to_file(self, filename: str) -> str:
        """
        Save audio data to a file.

        The extension always reflects the format actually returned. A
        filename that already ends with it (compared case-insensitively) is
        used unchanged; otherwise one known audio extension is stripped, if
        present, and the correct one appended. Missing parent directories
        are created.

        Args:
            filename: Target filename (extension will be added if missing)

        Returns:
            str: Final filename used
        """
        import os

        # Use the actual returned format for the extension, not any requested format.
        # Tolerate a plain string in case the caller did not pass an enum member.
        format_name = (
            self.format.value if isinstance(self.format, AudioFormat) else str(self.format)
        )
        expected_extension = f".{format_name}".lower()

        def has_extension(name: str, extension: str) -> bool:
            # Case-insensitive so "CLIP.MP3" is not turned into "CLIP.MP3.mp3".
            return name[-len(extension):].lower() == extension

        if has_extension(filename, expected_extension):
            final_filename = filename
        else:
            base_name = filename
            # Remove one known audio extension if present
            for audio_format in AudioFormat:
                known_extension = f".{audio_format.value}"
                if has_extension(filename, known_extension):
                    base_name = filename[: -len(known_extension)]
                    break
            final_filename = f"{base_name}{expected_extension}"

        # Create directory if it doesn't exist ("." when there is no directory part)
        os.makedirs(os.path.dirname(final_filename) or ".", exist_ok=True)

        # Write audio data
        with open(final_filename, "wb") as f:
            f.write(self.audio_data)

        return final_filename


@dataclass
class TTSError:
    """
    Error information from TTS API.

    Attributes:
        code: Error code
        message: Human-readable error message
        type: Error type/category
        details: Additional error details
        timestamp: When the error occurred (defaults to the naive local
            ``datetime.now()`` at construction)
    """

    code: str
    message: str
    type: Optional[str] = None
    details: Optional[Dict[str, Any]] = None
    timestamp: Optional[datetime] = None

    def __post_init__(self) -> None:
        """Set timestamp if not provided."""
        if self.timestamp is None:
            self.timestamp = datetime.now()


@dataclass
class APIError(TTSError):
    """API-specific error information."""

    status_code: int = 500
    headers: Optional[Dict[str, str]] = None


@dataclass
class NetworkError(TTSError):
    """Network-related error information."""

    timeout: Optional[float] = None
    retry_count: int = 0


@dataclass
class ValidationError(TTSError):
    """Validation error information."""

    field: Optional[str] = None
    value: Optional[Any] = None


# Content type mappings for audio formats
CONTENT_TYPE_MAP: Dict[AudioFormat, str] = {
    AudioFormat.MP3: "audio/mpeg",
    AudioFormat.OPUS: "audio/opus",
    AudioFormat.AAC: "audio/aac",
    AudioFormat.FLAC: "audio/flac",
    AudioFormat.WAV: "audio/wav",
    AudioFormat.PCM: "audio/pcm",
}

# Reverse mapping for content type to format
FORMAT_FROM_CONTENT_TYPE: Dict[str, AudioFormat] = {v: k for k, v in CONTENT_TYPE_MAP.items()}


def get_content_type(format: Union[AudioFormat, str]) -> str:
    """Get MIME content type for audio format.

    Args:
        format: Enum member or its string value (case-insensitive).

    Returns:
        The MIME type from ``CONTENT_TYPE_MAP``.

    Raises:
        ValueError: If ``format`` is a string that names no ``AudioFormat``.
    """
    if isinstance(format, str):
        format = AudioFormat(format.lower())
    # Every AudioFormat member has an entry, so the default is only a safety net.
    return CONTENT_TYPE_MAP.get(format, "audio/mpeg")


def get_format_from_content_type(content_type: str) -> AudioFormat:
    """Get audio format from MIME content type.

    Matching ignores case, surrounding whitespace and any ``; param=value``
    suffix, so ``"Audio/WAV; charset=binary"`` resolves to WAV.

    Args:
        content_type: Value of a Content-Type header (may be empty or None).

    Returns:
        The matching format, or ``AudioFormat.MP3`` when nothing matches.
    """
    media_type = (content_type or "").split(";", 1)[0].strip().lower()
    return FORMAT_FROM_CONTENT_TYPE.get(media_type, AudioFormat.MP3)
"无需API密钥或注册。使用免费的openai.fm服务。", 67 | "feature_voices_title": "11种声音", 68 | "feature_voices_desc": "提供所有OpenAI兼容的声音,适用于不同使用场景。", 69 | "feature_formats_title": "6种音频格式", 70 | "feature_formats_desc": "支持MP3、WAV、OPUS、AAC、FLAC和PCM格式,适用于任何应用。", 71 | "feature_docker_title": "Docker就绪", 72 | "feature_docker_desc": "一键部署,包含Web界面和API端点。", 73 | "feature_openai_title": "OpenAI兼容", 74 | "feature_openai_desc": "OpenAI TTS API的直接替代品,支持长文本自动合并。", 75 | "feature_async_title": "异步和同步", 76 | "feature_async_desc": "提供asyncio和同步客户端,最大化灵活性。", 77 | "quick_start_title": "快速开始", 78 | "installation_title": "安装", 79 | "installation_code": "pip install ttsfm", 80 | "usage_title": "基本用法", 81 | "docker_title": "Docker部署", 82 | "docker_desc": "运行带有Web界面的TTSFM:", 83 | "api_title": "OpenAI兼容API", 84 | "api_desc": "与OpenAI Python客户端一起使用:", 85 | "footer_copyright": "© 2024 dbcccc" 86 | }, 87 | "playground": { 88 | "title": "交互式TTS试用平台", 89 | "subtitle": "实时测试不同的声音和音频格式", 90 | "text_input_label": "要转换的文本", 91 | "text_input_placeholder": "输入您想要转换为语音的文本...", 92 | "voice_label": "声音", 93 | "format_label": "音频格式", 94 | "instructions_label": "声音指令(可选)", 95 | "instructions_placeholder": "语音生成的额外指令...", 96 | "character_count": "字符", 97 | "max_length_warning": "文本超过最大长度。将自动分割并合并。", 98 | "generate_speech": "生成语音", 99 | "generating": "生成中...", 100 | "download_audio": "下载音频", 101 | "audio_player_title": "生成的音频", 102 | "file_size": "文件大小", 103 | "duration": "时长", 104 | "format": "格式", 105 | "voice": "声音", 106 | "chunks_combined": "合并片段", 107 | "random_text": "随机文本", 108 | "clear_text": "清除文本", 109 | "max_length_description": "每个请求的最大字符数(默认:1000)", 110 | "enable_length_validation": "启用长度验证", 111 | "auto_combine_long_text": "自动合并长文本", 112 | "auto_combine_tooltip": "自动分割长文本并将音频片段合并为单个文件", 113 | "auto_combine_description": "自动处理超过限制的文本", 114 | "instructions_description": "为声音调制提供可选指令", 115 | "api_key_optional": "API密钥(可选)", 116 | "api_key_placeholder": "如果需要,请输入您的API密钥", 117 | "api_key_description": 
"仅在服务器启用API密钥保护时需要", 118 | "sample_texts": { 119 | "welcome": "欢迎使用TTSFM!这是一个免费的文本转语音服务,使用先进的AI技术将您的文本转换为高质量音频。", 120 | "story": "很久很久以前,在一个遥远的数字世界里,住着一个小小的Python包,它能够将任何文本转换成美妙的语音。这个包叫做TTSFM,它为世界各地的开发者带来了快乐。", 121 | "technical": "TTSFM是一个用于文本转语音API的Python客户端,提供同步和异步接口。它支持多种声音和音频格式,非常适合各种应用。", 122 | "multilingual": "TTSFM支持多种语言和声音,让您能够为全球受众创建多样化的音频内容。该服务完全免费,无需API密钥。", 123 | "long": "这是一个较长的文本示例,用于测试TTSFM的自动合并功能。当文本超过最大长度限制时,TTSFM会自动将其分割成较小的片段,为每个片段生成音频,然后无缝地将它们合并成一个音频文件。这个过程对用户完全透明,确保您可以转换任何长度的文本,而无需担心技术限制。生成的音频在整个内容中保持一致的质量和自然的流畅性。" 124 | }, 125 | "error_messages": { 126 | "empty_text": "请输入要转换的文本。", 127 | "generation_failed": "语音生成失败。请重试。", 128 | "network_error": "网络错误。请检查您的连接并重试。", 129 | "invalid_format": "选择的音频格式无效。", 130 | "invalid_voice": "选择的声音无效。", 131 | "text_too_long": "文本太长。请减少长度或启用自动合并。", 132 | "server_error": "服务器错误。请稍后重试。" 133 | }, 134 | "success_messages": { 135 | "generation_complete": "语音生成成功!", 136 | "text_copied": "文本已复制到剪贴板!", 137 | "download_started": "下载已开始!" 138 | }, 139 | "speed_label": "播放速度", 140 | "speed_description": "调整音频播放速度,从 0.25x(较慢)到 4.0x(较快)。默认为 1.0x(正常速度)。", 141 | "speed": "速度", 142 | "chunks": "片段", 143 | "format_description": "选择音频输出格式。转换格式需要 ffmpeg。", 144 | "enable_websocket_streaming": "启用 WebSocket 流式传输", 145 | "realtime_audio_chunks": "(实时音频片段)", 146 | "streaming_progress": "流式传输进度", 147 | "stream_speech": "流式生成语音", 148 | "streaming_complete": "流式传输完成", 149 | "streaming_ready": "流式传输就绪", 150 | "streaming_active": "流式传输中...", 151 | "streaming_offline": "流式传输离线", 152 | "chunks_label": "片段:", 153 | "total_size_label": "总大小:", 154 | "time_label": "时间:", 155 | "format_label_colon": "格式:", 156 | "connection_error": "连接错误", 157 | "chunks_heading": "片段", 158 | "data_heading": "数据", 159 | "time_heading": "时间", 160 | "chunk_title": "片段" 161 | }, 162 | "docs": { 163 | "title": "API文档", 164 | "subtitle": "TTSFM文本转语音API的完整参考。免费、简单且强大。", 165 | "contents": "目录", 166 | "overview": "概述", 167 | "authentication": "身份验证", 168 | 
"text_validation": "文本验证", 169 | "endpoints": "API端点", 170 | "voices": "声音", 171 | "formats": "音频格式", 172 | "generate": "生成语音", 173 | "combined": "合并音频", 174 | "status": "状态和健康检查", 175 | "errors": "错误处理", 176 | "examples": "代码示例", 177 | "python_package": "Python包", 178 | "overview_title": "概述", 179 | "overview_desc": "TTSFM API提供现代的、OpenAI兼容的文本转语音生成接口。它支持多种声音、音频格式,并包含高级功能,如文本长度验证和智能自动合并功能。", 180 | "base_url": "基础URL:", 181 | "key_features": "主要特性", 182 | "feature_voices": "11种不同的声音选项 - 从alloy、echo、nova等中选择", 183 | "feature_formats": "多种音频格式 - 支持MP3、WAV、OPUS、AAC、FLAC、PCM", 184 | "feature_openai": "OpenAI兼容性 - OpenAI TTS API的直接替代品", 185 | "feature_auto_combine": "自动合并功能 - 自动处理长文本(>1000字符),通过分割和合并音频", 186 | "feature_validation": "文本长度验证 - 智能验证,可配置限制", 187 | "feature_monitoring": "实时监控 - 状态端点和健康检查", 188 | "new_version": "v3.3.1新功能:", 189 | "new_version_desc": "运行时镜像现已内置 ffmpeg,MP3 自动合并可立即使用;默认长文本上限调整为 1000 字符,保证播报行为一致。", 190 | "authentication_title": "身份验证", 191 | "authentication_desc": "目前,API支持可选的API密钥身份验证。如果已配置,请在请求头中包含您的API密钥。", 192 | "text_validation_title": "文本长度验证", 193 | "text_validation_desc": "TTSFM包含内置的文本长度验证,以确保与TTS模型的兼容性。默认最大长度为1000个字符,但可以自定义。", 194 | "important": "重要:", 195 | "text_validation_warning": "超过最大长度的文本将被拒绝,除非禁用验证或将文本分割成块。", 196 | "validation_options": "验证选项", 197 | "max_length_option": "允许的最大字符数(默认:1000)", 198 | "validate_length_option": "启用/禁用验证(默认:true)", 199 | "preserve_words_option": "分块时避免分割单词(默认:true)", 200 | "endpoints_title": "API端点", 201 | "get_voices_desc": "获取可用声音列表。", 202 | "get_formats_desc": "获取支持的音频格式列表。", 203 | "validate_text_desc": "验证文本长度并获取分割建议。", 204 | "generate_speech_desc": "从文本生成语音。", 205 | "response_example": "响应示例:", 206 | "request_body": "请求体:", 207 | "parameters": "参数:", 208 | "text_param": "要转换为语音的文本", 209 | "voice_param": "声音ID(默认:\"alloy\")", 210 | "format_param": "音频格式(默认:\"mp3\")", 211 | "instructions_param": "声音调制指令", 212 | "max_length_param": "最大文本长度(默认:1000)", 213 | "validate_length_param": "启用验证(默认:true)", 
214 | "response": "响应:", 215 | "response_audio": "返回带有适当Content-Type头的音频文件。", 216 | "response_combined_audio": "返回包含所有块无缝合并的单个音频文件。", 217 | "required": "必需", 218 | "optional": "可选", 219 | "python_package_title": "Python包", 220 | "long_text_support": "长文本支持", 221 | "long_text_desc": "TTSFM Python包包含内置的长文本分割功能,为需要精细控制的开发者提供支持:", 222 | "developer_features": "开发者功能:", 223 | "manual_splitting": "手动分割:对高级用例的文本分块进行完全控制", 224 | "word_preservation": "单词保护:维护单词边界以获得自然语音", 225 | "separate_files": "单独文件:每个块保存为单独的音频文件", 226 | "cli_support": "CLI支持:使用`--split-long-text`标志进行命令行使用", 227 | "note": "注意:", 228 | "auto_combine_note": "对于Web用户,建议使用`/v1/audio/speech`中的自动合并功能,因为它会自动处理长文本并返回单个无缝音频文件。", 229 | "combined_audio_desc": "从长文本生成单个合并的音频文件。自动将文本分割成块,为每个块生成语音,并将它们合并成一个无缝的音频文件。", 230 | "response_headers": "响应头:", 231 | "chunks_combined_header": "合并的块数", 232 | "original_text_length_header": "原始文本长度(字符数)", 233 | "audio_size_header": "最终音频文件大小(字节)", 234 | "openai_compatible_desc": "增强的OpenAI兼容端点,具有自动合并功能。在需要时自动处理长文本,通过分割和合并音频块。", 235 | "enhanced_parameters": "增强参数:", 236 | "auto_combine_param": "自动分割长文本并将音频块合并为单个文件", 237 | "auto_combine_false": "如果文本超过max_length则返回错误(标准OpenAI行为)", 238 | "max_length_chunk_param": "分割时每个块的最大字符数", 239 | "auto_combine_header": "是否启用了自动合并(true/false)", 240 | "chunks_combined_response": "合并的音频块数(短文本为1)", 241 | "original_text_response": "原始文本长度(用于长文本处理)", 242 | "audio_format_header": "响应的音频格式", 243 | "audio_size_response": "音频文件大小(字节)", 244 | "short_text_comment": "短文本(正常工作)", 245 | "long_text_auto_comment": "带自动合并的长文本(默认)", 246 | "long_text_no_auto_comment": "不带自动合并的长文本(将出错)", 247 | "audio_combination": "音频合并:", 248 | "audio_combination_desc": "在可用时使用高级音频处理(PyDub),在不同环境中具有智能回退。支持所有音频格式。", 249 | "use_cases": "使用场景:", 250 | "use_case_articles": "长文章:将博客文章或文章转换为单个音频文件", 251 | "use_case_audiobooks": "有声书:将章节生成为单个音频文件", 252 | "use_case_podcasts": "播客:从脚本创建播客剧集", 253 | "use_case_education": "教育内容:将学习材料转换为音频", 254 | "example_usage": "使用示例:", 255 | 
#!/usr/bin/env python3
"""
Command-line interface for TTSFM.

This module provides a command-line interface for the TTSFM package,
allowing users to generate speech from text using various options.
"""

import argparse
import os
import sys
from pathlib import Path

from .client import TTSClient
from .exceptions import APIException, NetworkException, TTSException
from .models import AudioFormat, TTSResponse, Voice

# Lookup tables derived from the enums so the CLI cannot drift from the models.
_VOICES_BY_NAME = {voice.value: voice for voice in Voice}
_FORMATS_BY_NAME = {audio_format.value: audio_format for audio_format in AudioFormat}


def create_parser() -> argparse.ArgumentParser:
    """Create and configure the argument parser."""
    parser = argparse.ArgumentParser(
        prog="ttsfm",
        description="TTSFM - Text-to-Speech API Client",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  ttsfm "Hello, world!" --output hello.mp3
  ttsfm "Hello, world!" --voice nova --format wav --output hello.wav
  ttsfm "Hello, world!" --url http://localhost:7000 --output hello.mp3
  ttsfm --text-file input.txt --output speech.mp3
        """,
    )

    # Text input options (mutually exclusive)
    text_group = parser.add_mutually_exclusive_group(required=True)
    text_group.add_argument("text", nargs="?", help="Text to convert to speech")
    text_group.add_argument("--text-file", "-f", type=str, help="Read text from file")

    # Output options
    parser.add_argument("--output", "-o", type=str, required=True, help="Output file path")

    # TTS options
    parser.add_argument(
        "--voice",
        "-v",
        type=str,
        default="alloy",
        choices=list(_VOICES_BY_NAME),
        help="Voice to use for speech generation (default: alloy)",
    )

    parser.add_argument(
        "--format",
        type=str,
        default="mp3",
        choices=list(_FORMATS_BY_NAME),
        help="Audio format (default: mp3)",
    )

    parser.add_argument(
        "--speed", type=float, default=1.0, help="Speech speed (0.25 to 4.0, default: 1.0)"
    )

    # Client options
    parser.add_argument(
        "--url",
        "-u",
        type=str,
        default="http://localhost:7000",
        help="TTS service URL (default: http://localhost:7000)",
    )

    parser.add_argument("--api-key", "-k", type=str, help="API key for authentication")

    parser.add_argument(
        "--timeout", type=float, default=30.0, help="Request timeout in seconds (default: 30.0)"
    )

    parser.add_argument(
        "--retries", type=int, default=3, help="Maximum number of retries (default: 3)"
    )

    # Text length validation options
    parser.add_argument(
        "--max-length",
        type=int,
        default=1000,
        help="Maximum text length in characters (default: 1000)",
    )

    parser.add_argument(
        "--no-length-validation", action="store_true", help="Disable text length validation"
    )

    parser.add_argument(
        "--split-long-text", action="store_true", help="Automatically split long text into chunks"
    )

    parser.add_argument(
        "--auto-combine",
        action="store_true",
        help=(
            "Combine long-text chunks into a single audio file "
            "(requires pydub for non-WAV formats)"
        ),
    )

    # Other options
    parser.add_argument("--verbose", "-V", action="store_true", help="Enable verbose output")

    parser.add_argument("--version", action="version", version=f"%(prog)s {get_version()}")

    return parser


def get_version() -> str:
    """Get the package version, or "unknown" if it cannot be imported."""
    try:
        from . import __version__

        return __version__
    except ImportError:
        return "unknown"


def read_text_file(file_path: str) -> str:
    """Read UTF-8 text from a file and strip surrounding whitespace.

    Prints an error to stderr and exits with status 1 if the file is missing,
    unreadable, or not valid UTF-8.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
        sys.exit(1)
    except (OSError, ValueError) as e:
        # OSError: permissions, directories, I/O failures.
        # ValueError: decode errors (UnicodeDecodeError) and malformed paths.
        print(f"Error reading file '{file_path}': {e}", file=sys.stderr)
        sys.exit(1)


def validate_speed(speed: float) -> float:
    """Validate and return the speed parameter; exit with status 1 if out of range."""
    if not 0.25 <= speed <= 4.0:
        print("Error: Speed must be between 0.25 and 4.0", file=sys.stderr)
        sys.exit(1)
    return speed


def get_voice_enum(voice_str: str) -> Voice:
    """Convert voice string to Voice enum.

    Raises:
        KeyError: If the (lower-cased) name is not a known voice.
    """
    return _VOICES_BY_NAME[voice_str.lower()]


def get_format_enum(format_str: str) -> AudioFormat:
    """Convert format string to AudioFormat enum.

    Raises:
        KeyError: If the (lower-cased) name is not a known format.
    """
    return _FORMATS_BY_NAME[format_str.lower()]


def handle_long_text(
    args: argparse.Namespace,
    text: str,
    voice: Voice,
    audio_format: AudioFormat,
    speed: float,
) -> None:
    """Split long text into chunks and write the resulting audio.

    When the client returns a single combined ``TTSResponse`` it is saved via
    ``save_to_file``. Otherwise each chunk is written to
    ``<output base>_partNNN<ext>``, or to ``args.output`` if there is only one
    chunk. Any failure is reported on stderr and exits with status 1.
    """
    try:
        client = TTSClient(
            base_url=args.url, api_key=args.api_key, timeout=args.timeout, max_retries=args.retries
        )

        # Use the long text method
        responses = client.generate_speech_long_text(
            text=text,
            voice=voice,
            response_format=audio_format,
            speed=speed,
            max_length=args.max_length,
            preserve_words=True,
            auto_combine=args.auto_combine,
        )

        if not responses:
            print("Error: No valid text chunks found after processing.", file=sys.stderr)
            sys.exit(1)

        if isinstance(responses, TTSResponse):
            # save_to_file may correct the extension to the format actually
            # returned, so report the path it really wrote.
            saved_path = responses.save_to_file(args.output)
            print(f"Generated combined audio: {saved_path}")
            return

        total = len(responses)
        print(f"Generated {total} audio chunks")

        base_name, ext = os.path.splitext(args.output)

        for i, response in enumerate(responses, 1):
            if args.verbose:
                print(f"Saving chunk {i}/{total}...")

            if total == 1:
                output_file = args.output
            else:
                output_file = f"{base_name}_part{i:03d}{ext}"

            with open(output_file, "wb") as f:
                f.write(response.audio_data)

            print(f"Generated: {output_file}")

        if total > 1:
            print(f"\nGenerated {total} audio files from long text.")
            print(f"Files: {base_name}_part001{ext} to {base_name}_part{total:03d}{ext}")

    except Exception as e:
        # Top-level CLI boundary: report and exit instead of dumping a traceback
        # (sys.exit above raises SystemExit, which is not caught here).
        print(f"Error processing long text: {e}", file=sys.stderr)
        if args.verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)


def main() -> None:
    """Main CLI entry point."""
    parser = create_parser()
    args = parser.parse_args()

    # Get text input. Compare against None so an explicit empty positional ("")
    # is reported as missing text instead of falling through to
    # read_text_file(None).
    if args.text is not None:
        text = args.text
    else:
        text = read_text_file(args.text_file)

    if not text:
        print("Error: No text provided.", file=sys.stderr)
        sys.exit(1)

    # Validate parameters
    speed = validate_speed(args.speed)
    voice = get_voice_enum(args.voice)
    audio_format = get_format_enum(args.format)

    if args.max_length <= 0:
        print("Error: --max-length must be a positive integer", file=sys.stderr)
        sys.exit(1)

    if args.auto_combine and not args.split_long_text:
        # --auto-combine is only passed to the long-text path.
        print(
            "Warning: --auto-combine has no effect without --split-long-text.",
            file=sys.stderr,
        )

    # Create output directory if needed
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Check text length and handle accordingly
    text_length = len(text)
    validate_length = not args.no_length_validation

    if args.verbose:
        print(f"Text: {text[:50]}{'...' if len(text) > 50 else ''}")
        print(f"Text length: {text_length} characters")
        print(f"Max length: {args.max_length}")
        print(f"Length validation: {'enabled' if validate_length else 'disabled'}")
        print(f"Voice: {args.voice}")
        print(f"Format: {args.format}")
        print(f"Speed: {speed}")
        print(f"URL: {args.url}")
        print(f"Output: {args.output}")
        print()

    # Handle long text
    if text_length > args.max_length:
        if args.split_long_text:
            print(f"Text is {text_length} characters, splitting into chunks...")
            handle_long_text(args, text, voice, audio_format, speed)
            return
        elif validate_length:
            print(
                f"Error: Text is too long ({text_length} characters). "
                f"Maximum allowed is {args.max_length} characters.",
                file=sys.stderr,
            )
            print(
                "Use --split-long-text to automatically split the text, "
                "or --no-length-validation to disable this check.",
                file=sys.stderr,
            )
            sys.exit(1)

    try:
        client = TTSClient(
            base_url=args.url, api_key=args.api_key, timeout=args.timeout, max_retries=args.retries
        )

        if args.verbose:
            print("Generating speech...")

        # Generate speech
        response = client.generate_speech(
            text=text,
            voice=voice,
            response_format=audio_format,
            speed=speed,
            max_length=args.max_length,
            validate_length=validate_length,
        )

        # Save to exactly the path the user asked for
        with open(args.output, "wb") as f:
            f.write(response.audio_data)

        print(f"Speech generated successfully: {args.output}")

    except NetworkException as e:
        print(f"Network error: {e}", file=sys.stderr)
        sys.exit(1)
    except APIException as e:
        print(f"API error: {e}", file=sys.stderr)
        sys.exit(1)
    except TTSException as e:
        print(f"TTS error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary so the CLI never exits with a raw traceback
        # unless --verbose asks for one.
        print(f"Unexpected error: {e}", file=sys.stderr)
        if args.verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
29 | """ 30 | 31 | def __init__(self, socketio: SocketIO, client_factory: Callable[[], TTSClient]): 32 | self.socketio = socketio 33 | self._client_factory = client_factory 34 | self.active_sessions: Dict[str, Dict[str, Any]] = {} 35 | self._tasks: Dict[str, Dict[str, Any]] = {} 36 | 37 | # Register WebSocket events 38 | self._register_events() 39 | 40 | def _register_events(self): 41 | """Register all WebSocket event handlers.""" 42 | 43 | @self.socketio.on("connect") 44 | def handle_connect(): 45 | """Handle new WebSocket connection.""" 46 | session_id = request.sid 47 | self.active_sessions[session_id] = { 48 | "connected_at": datetime.now(), 49 | "request_count": 0, 50 | "last_request": None, 51 | } 52 | self._tasks[session_id] = {} 53 | logger.info(f"WebSocket client connected: {session_id}") 54 | logger.info(f"Active sessions: {len(self.active_sessions)}") 55 | emit("connected", {"session_id": session_id, "status": "ready"}) 56 | 57 | @self.socketio.on("disconnect") 58 | def handle_disconnect(): 59 | """Handle WebSocket disconnection.""" 60 | session_id = request.sid 61 | if session_id in self.active_sessions: 62 | del self.active_sessions[session_id] 63 | self._cancel_all_tasks(session_id) 64 | logger.info(f"WebSocket client disconnected: {session_id}") 65 | 66 | @self.socketio.on("generate_stream") 67 | def handle_generate_stream(data): 68 | """ 69 | Handle streaming TTS generation request. 
70 | 71 | Expected data format: 72 | { 73 | 'text': str, 74 | 'voice': str, 75 | 'format': str, 76 | 'chunk_size': int (optional, default 1024 chars), 77 | 'instructions': str (optional, voice modulation instructions) 78 | } 79 | """ 80 | session_id = request.sid 81 | request_id = data.get("request_id", str(uuid.uuid4())) 82 | 83 | # Update session info 84 | if session_id in self.active_sessions: 85 | self.active_sessions[session_id]["request_count"] += 1 86 | self.active_sessions[session_id]["last_request"] = datetime.now() 87 | 88 | # Emit acknowledgment 89 | emit("stream_started", {"request_id": request_id, "timestamp": time.time()}) 90 | 91 | # Start async generation 92 | task = self.socketio.start_background_task( 93 | self._generate_stream, session_id, request_id, data 94 | ) 95 | self._store_task(session_id, request_id, task) 96 | 97 | @self.socketio.on("cancel_stream") 98 | def handle_cancel_stream(data): 99 | """Handle stream cancellation request.""" 100 | request_id = data.get("request_id") 101 | session_id = request.sid 102 | 103 | if not request_id: 104 | return 105 | 106 | cancelled = self._cancel_task(session_id, request_id) 107 | if cancelled: 108 | logger.info(f"Stream cancellation requested: {request_id}") 109 | else: 110 | logger.info(f"Stream cancellation requested for unknown request: {request_id}") 111 | 112 | emit("stream_cancelled", {"request_id": request_id, "cancelled": cancelled}) 113 | 114 | @self.socketio.on("ping") 115 | def handle_ping(data): 116 | """Handle ping request for connection testing.""" 117 | session_id = request.sid 118 | logger.debug(f"Ping received from {session_id}") 119 | emit("pong", {"timestamp": time.time(), "data": data}) 120 | 121 | def _generate_stream(self, session_id: str, request_id: str, data: Dict[str, Any]): 122 | """ 123 | Generate TTS audio in chunks and stream to client. 124 | 125 | This is where the magic happens. And by magic, I mean 126 | chunking text and pretending it's real-time. 
127 | """ 128 | client = self._client_factory() 129 | 130 | try: 131 | # Extract parameters 132 | text = data.get("text", "") 133 | voice = data.get("voice", "alloy") 134 | format_str = data.get("format", "mp3") 135 | chunk_size = data.get("chunk_size", 1024) 136 | instructions = data.get("instructions", None) # Voice instructions support! 137 | 138 | if not text: 139 | self._emit_error(session_id, request_id, "No text provided") 140 | return 141 | 142 | # Convert string parameters to enums 143 | try: 144 | voice_enum = Voice(voice.lower()) 145 | format_enum = AudioFormat(format_str.lower()) 146 | except ValueError as e: 147 | self._emit_error(session_id, request_id, f"Invalid parameter: {str(e)}") 148 | return 149 | 150 | # Split text into chunks for "streaming" effect 151 | chunks = split_text_by_length(text, chunk_size, preserve_words=True) 152 | total_chunks = len(chunks) 153 | 154 | logger.info(f"Starting stream generation: {request_id} with {total_chunks} chunks") 155 | 156 | # Emit initial progress 157 | self.socketio.emit( 158 | "stream_progress", 159 | { 160 | "request_id": request_id, 161 | "progress": 0, 162 | "total_chunks": total_chunks, 163 | "status": "processing", 164 | }, 165 | room=session_id, 166 | ) 167 | 168 | # Process each chunk 169 | for i, chunk in enumerate(chunks): 170 | # Check if client is still connected 171 | if session_id not in self.active_sessions: 172 | logger.warning(f"Client disconnected during generation: {session_id}") 173 | break 174 | 175 | if not self._is_task_active(session_id, request_id): 176 | logger.info(f"Stream generation cancelled: {request_id}") 177 | break 178 | 179 | try: 180 | # Generate audio for chunk 181 | start_time = time.time() 182 | response = client.generate_speech( 183 | text=chunk, 184 | voice=voice_enum, 185 | response_format=format_enum, 186 | instructions=instructions, # Pass voice instructions! 
187 | validate_length=False, # We already chunked it 188 | ) 189 | generation_time = time.time() - start_time 190 | 191 | # Emit chunk data 192 | encoded_audio = base64.b64encode(response.audio_data).decode("ascii") 193 | chunk_data = { 194 | "request_id": request_id, 195 | "chunk_index": i, 196 | "total_chunks": total_chunks, 197 | "audio_data": encoded_audio, 198 | "encoding": "base64", 199 | "byte_length": len(response.audio_data), 200 | "format": response.format.value, 201 | "requested_format": format_enum.value, 202 | "duration": response.duration, 203 | "generation_time": generation_time, 204 | "chunk_text": chunk[:50] + "..." if len(chunk) > 50 else chunk, 205 | } 206 | 207 | self.socketio.emit("audio_chunk", chunk_data, room=session_id) 208 | 209 | # Emit progress update 210 | progress = int(((i + 1) / total_chunks) * 100) 211 | self.socketio.emit( 212 | "stream_progress", 213 | { 214 | "request_id": request_id, 215 | "progress": progress, 216 | "total_chunks": total_chunks, 217 | "chunks_completed": i + 1, 218 | "status": "processing", 219 | }, 220 | room=session_id, 221 | ) 222 | 223 | # Small delay to prevent overwhelming the client 224 | # (and to make it feel more "real-time") 225 | self.socketio.sleep(0.1) 226 | 227 | except Exception as e: 228 | logger.error(f"Error generating chunk {i}: {str(e)}") 229 | self._emit_error( 230 | session_id, request_id, f"Chunk {i} generation failed: {str(e)}" 231 | ) 232 | # Continue with next chunk instead of failing completely 233 | continue 234 | 235 | # Emit completion 236 | self.socketio.emit( 237 | "stream_complete", 238 | { 239 | "request_id": request_id, 240 | "total_chunks": total_chunks, 241 | "status": "completed", 242 | "timestamp": time.time(), 243 | }, 244 | room=session_id, 245 | ) 246 | 247 | logger.info(f"Stream generation completed: {request_id}") 248 | 249 | except Exception as e: 250 | logger.error(f"Stream generation failed: {str(e)}") 251 | self._emit_error(session_id, request_id, str(e)) 252 | 
finally: 253 | try: 254 | client.close() 255 | except Exception as exc: # pragma: no cover - defensive cleanup 256 | logger.debug("Failed to close TTS client cleanly: %s", exc) 257 | self._remove_task(session_id, request_id) 258 | 259 | def _emit_error(self, session_id: str, request_id: str, error_message: str): 260 | """Emit error to specific session.""" 261 | self.socketio.emit( 262 | "stream_error", 263 | {"request_id": request_id, "error": error_message, "timestamp": time.time()}, 264 | room=session_id, 265 | ) 266 | 267 | def _store_task(self, session_id: str, request_id: str, task: Any) -> None: 268 | self._tasks.setdefault(session_id, {})[request_id] = task 269 | 270 | def _remove_task(self, session_id: str, request_id: str) -> None: 271 | tasks = self._tasks.get(session_id) 272 | if not tasks: 273 | return 274 | tasks.pop(request_id, None) 275 | if not tasks: 276 | self._tasks.pop(session_id, None) 277 | 278 | def _cancel_task(self, session_id: str, request_id: str) -> bool: 279 | tasks = self._tasks.get(session_id) 280 | if not tasks: 281 | return False 282 | task = tasks.pop(request_id, None) 283 | if not task: 284 | if not tasks: 285 | self._tasks.pop(session_id, None) 286 | return False 287 | 288 | self._invoke_task_cancel(task) 289 | if not tasks: 290 | self._tasks.pop(session_id, None) 291 | return True 292 | 293 | def _cancel_all_tasks(self, session_id: str) -> None: 294 | tasks = self._tasks.pop(session_id, {}) 295 | for task in tasks.values(): 296 | self._invoke_task_cancel(task) 297 | 298 | def _invoke_task_cancel(self, task: Any) -> None: 299 | try: 300 | cancel = getattr(task, "cancel", None) 301 | if callable(cancel): 302 | cancel() 303 | return 304 | 305 | kill = getattr(task, "kill", None) 306 | if callable(kill): # pragma: no cover - eventlet specific 307 | kill() 308 | except Exception as exc: # pragma: no cover - defensive logging 309 | logger.debug("Failed to cancel background task cleanly: %s", exc) 310 | 311 | def _is_task_active(self, 
session_id: str, request_id: str) -> bool: 312 | tasks = self._tasks.get(session_id) 313 | if not tasks: 314 | return False 315 | return request_id in tasks 316 | 317 | def get_active_sessions_count(self) -> int: 318 | """Get count of active WebSocket sessions.""" 319 | return len(self.active_sessions) 320 | 321 | def get_session_info(self, session_id: str) -> Optional[Dict[str, Any]]: 322 | """Get information about a specific session.""" 323 | return self.active_sessions.get(session_id) 324 | -------------------------------------------------------------------------------- /ttsfm-web/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | {% block title %}TTSFM - {{ _('nav.home') }}{% endblock %} 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | {% block extra_css %}{% endblock %} 45 | 46 | 47 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 148 | 149 | 150 |
151 | {% block content %}{% endblock %} 152 |
153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 338 | 339 | {% block extra_js %}{% endblock %} 340 | 341 | 342 | 343 | -------------------------------------------------------------------------------- /ttsfm-web/translations/en.json: -------------------------------------------------------------------------------- 1 | { 2 | "nav": { 3 | "home": "Home", 4 | "playground": "Playground", 5 | "documentation": "Documentation", 6 | "github": "GitHub", 7 | "status_checking": "Checking...", 8 | "status_online": "Online", 9 | "status_offline": "Offline" 10 | }, 11 | "common": { 12 | "loading": "Loading...", 13 | "error": "Error", 14 | "success": "Success", 15 | "warning": "Warning", 16 | "info": "Info", 17 | "close": "Close", 18 | "save": "Save", 19 | "cancel": "Cancel", 20 | "confirm": "Confirm", 21 | "download": "Download", 22 | "upload": "Upload", 23 | "generate": "Generate", 24 | "play": "Play", 25 | "stop": "Stop", 26 | "pause": "Pause", 27 | "resume": "Resume", 28 | "clear": "Clear", 29 | "reset": "Reset", 30 | "copy": "Copy", 31 | "copied": "Copied!", 32 | "language": "Language", 33 | "english": "English", 34 | "chinese": "中文", 35 | "validate": "Validate", 36 | "options": "Options", 37 | "max_length": "Max Length", 38 | "tip": "Tip", 39 | "choose_voice": "Choose from available voices", 40 | "select_format": "Select your preferred audio format", 41 | "loading_voices": "Loading voices...", 42 | "loading_formats": "Loading formats...", 43 | "ctrl_enter_tip": "Use Ctrl+Enter to generate", 44 | "auto_combine_enabled": "Auto-combine enabled", 45 | "demo": "Demo", 46 | "clear_text": "Clear text", 47 | "tip_ctrl_enter": "Tip: Use Ctrl+Enter to generate", 48 | "ready": "Ready", 49 | "replay_audio": "Replay audio", 50 | "share_audio": "Share audio", 51 | "browser_no_audio_support": "Your browser does not support the audio element.", 52 | "generating_speech": "Generating speech...", 53 | "streaming": "Streaming", 54 | "chars": "chars", 55 | "generated": "Generated" 56 | }, 
57 | "home": { 58 | "title": "Free Text-to-Speech for Python", 59 | "subtitle": "Generate high-quality speech from text using the free openai.fm service. No API keys, no registration - just install and start creating audio.", 60 | "try_demo": "Try Demo", 61 | "documentation": "Documentation", 62 | "github": "GitHub", 63 | "features_title": "Key Features", 64 | "features_subtitle": "Simple, free, and powerful text-to-speech for Python developers.", 65 | "feature_free_title": "Completely Free", 66 | "feature_free_desc": "No API keys or registration required. Uses the free openai.fm service.", 67 | "feature_voices_title": "11 Voices", 68 | "feature_voices_desc": "All OpenAI-compatible voices available for different use cases.", 69 | "feature_formats_title": "6 Audio Formats", 70 | "feature_formats_desc": "MP3, WAV, OPUS, AAC, FLAC, and PCM support for any application.", 71 | "feature_docker_title": "Docker Ready", 72 | "feature_docker_desc": "One-command deployment with web interface and API endpoints.", 73 | "feature_openai_title": "OpenAI Compatible", 74 | "feature_openai_desc": "Drop-in replacement for OpenAI's TTS API with auto-combine for long text.", 75 | "feature_async_title": "Async & Sync", 76 | "feature_async_desc": "Both asyncio and synchronous clients for maximum flexibility.", 77 | "quick_start_title": "Quick Start", 78 | "installation_title": "Installation", 79 | "installation_code": "pip install ttsfm", 80 | "usage_title": "Basic Usage", 81 | "docker_title": "Docker Deployment", 82 | "docker_desc": "Run TTSFM with web interface:", 83 | "api_title": "OpenAI-Compatible API", 84 | "api_desc": "Use with OpenAI Python client:", 85 | "footer_copyright": "© 2024 dbcccc" 86 | }, 87 | "playground": { 88 | "title": "Interactive TTS Playground", 89 | "subtitle": "Test different voices and audio formats in real-time", 90 | "text_input_label": "Text to Convert", 91 | "text_input_placeholder": "Enter the text you want to convert to speech...", 92 | "voice_label": 
"Voice", 93 | "format_label": "Audio Format", 94 | "instructions_label": "Voice Instructions (Optional)", 95 | "instructions_placeholder": "Additional instructions for voice generation...", 96 | "character_count": "characters", 97 | "max_length_warning": "Text exceeds maximum length. It will be automatically split and combined.", 98 | "generate_speech": "Generate Speech", 99 | "generating": "Generating...", 100 | "download_audio": "Download Audio", 101 | "audio_player_title": "Generated Audio", 102 | "file_size": "File Size", 103 | "duration": "Duration", 104 | "format": "Format", 105 | "voice": "Voice", 106 | "chunks_combined": "Chunks Combined", 107 | "random_text": "Random Text", 108 | "clear_text": "Clear Text", 109 | "max_length_description": "Maximum characters per request (default: 1000)", 110 | "enable_length_validation": "Enable length validation", 111 | "auto_combine_long_text": "Auto-combine long text", 112 | "auto_combine_tooltip": "Automatically split long text and combine audio chunks into a single file", 113 | "auto_combine_description": "Automatically handles text longer than the limit", 114 | "instructions_description": "Provide optional instructions for voice modulation", 115 | "api_key_optional": "API Key (Optional)", 116 | "api_key_placeholder": "Enter your API key if required", 117 | "api_key_description": "Only required if API key protection is enabled on the server", 118 | "sample_texts": { 119 | "welcome": "Welcome to TTSFM! This is a free text-to-speech service that converts your text into high-quality audio using advanced AI technology.", 120 | "story": "Once upon a time, in a digital world far away, there lived a small Python package that could transform any text into beautiful speech. This package was called TTSFM, and it brought joy to developers everywhere.", 121 | "technical": "TTSFM is a Python client for text-to-speech APIs that provides both synchronous and asynchronous interfaces. 
It supports multiple voices and audio formats, making it perfect for various applications.", 122 | "multilingual": "TTSFM supports multiple languages and voices, allowing you to create diverse audio content for global audiences. The service is completely free and requires no API keys.", 123 | "long": "This is a longer text sample designed to test the auto-combine feature of TTSFM. When text exceeds the maximum length limit, TTSFM automatically splits it into smaller chunks, generates audio for each chunk, and then seamlessly combines them into a single audio file. This process is completely transparent to the user and ensures that you can convert text of any length without worrying about technical limitations. The resulting audio maintains consistent quality and natural flow throughout the entire content." 124 | }, 125 | "error_messages": { 126 | "empty_text": "Please enter some text to convert.", 127 | "generation_failed": "Failed to generate speech. Please try again.", 128 | "network_error": "Network error. Please check your connection and try again.", 129 | "invalid_format": "Invalid audio format selected.", 130 | "invalid_voice": "Invalid voice selected.", 131 | "text_too_long": "Text is too long. Please reduce the length or enable auto-combine.", 132 | "server_error": "Server error. Please try again later." 133 | }, 134 | "success_messages": { 135 | "generation_complete": "Speech generated successfully!", 136 | "text_copied": "Text copied to clipboard!", 137 | "download_started": "Download started!" 138 | }, 139 | "speed_label": "Playback Speed", 140 | "speed_description": "Adjust audio playback speed from 0.25x (slower) to 4.0x (faster). Default is 1.0x (normal speed).", 141 | "speed": "Speed", 142 | "chunks": "Chunks", 143 | "format_description": "Choose audio output format. 
Converted formats require ffmpeg.", 144 | "enable_websocket_streaming": "Enable WebSocket Streaming", 145 | "realtime_audio_chunks": "(Real-time audio chunks)", 146 | "streaming_progress": "Streaming Progress", 147 | "stream_speech": "Stream Speech", 148 | "streaming_complete": "Streaming Complete", 149 | "streaming_ready": "Streaming Ready", 150 | "streaming_active": "Streaming...", 151 | "streaming_offline": "Streaming Offline", 152 | "chunks_label": "Chunks:", 153 | "total_size_label": "Total Size:", 154 | "time_label": "Time:", 155 | "format_label_colon": "Format:", 156 | "connection_error": "Connection Error", 157 | "chunks_heading": "Chunks", 158 | "data_heading": "Data", 159 | "time_heading": "Time", 160 | "chunk_title": "Chunk" 161 | }, 162 | "docs": { 163 | "title": "API Documentation", 164 | "subtitle": "Complete reference for the TTSFM Text-to-Speech API. Free, simple, and powerful.", 165 | "contents": "Contents", 166 | "overview": "Overview", 167 | "authentication": "Authentication", 168 | "text_validation": "Text Validation", 169 | "endpoints": "API Endpoints", 170 | "voices": "Voices", 171 | "formats": "Audio Formats", 172 | "generate": "Generate Speech", 173 | "combined": "Combined Audio", 174 | "status": "Status & Health", 175 | "errors": "Error Handling", 176 | "examples": "Code Examples", 177 | "python_package": "Python Package", 178 | "overview_title": "Overview", 179 | "overview_desc": "The TTSFM API provides a modern, OpenAI-compatible interface for text-to-speech generation. 
It supports multiple voices, audio formats, and includes advanced features like text length validation and intelligent auto-combine functionality.", 180 | "base_url": "Base URL:", 181 | "key_features": "Key Features", 182 | "feature_voices": "11 different voice options - Choose from alloy, echo, nova, and more", 183 | "feature_formats": "Multiple audio formats - MP3, WAV, OPUS, AAC, FLAC, PCM support", 184 | "feature_openai": "OpenAI compatibility - Drop-in replacement for OpenAI's TTS API", 185 | "feature_auto_combine": "Auto-combine feature - Automatically handles long text (>1000 chars) by splitting and combining audio", 186 | "feature_validation": "Text length validation - Smart validation with configurable limits", 187 | "feature_monitoring": "Real-time monitoring - Status endpoints and health checks", 188 | "new_version": "New in v3.3.4:", 189 | "new_version_desc": "Runtime images now ship with ffmpeg so MP3 auto-combine succeeds immediately, and the default long-text limit is trimmed to 1000 characters for predictable playback.", 190 | "authentication_title": "Authentication", 191 | "authentication_desc": "Currently, the API supports optional API key authentication. If configured, include your API key in the request headers.", 192 | "text_validation_title": "Text Length Validation", 193 | "text_validation_desc": "TTSFM includes built-in text length validation to ensure compatibility with TTS models. 
The default maximum length is 1000 characters, but this can be customized.", 194 | "important": "Important:", 195 | "text_validation_warning": "Text exceeding the maximum length will be rejected unless validation is disabled or the text is split into chunks.", 196 | "validation_options": "Validation Options", 197 | "max_length_option": "Maximum allowed characters (default: 1000)", 198 | "validate_length_option": "Enable/disable validation (default: true)", 199 | "preserve_words_option": "Avoid splitting words when chunking (default: true)", 200 | "endpoints_title": "API Endpoints", 201 | "get_voices_desc": "Get list of available voices.", 202 | "get_formats_desc": "Get list of supported audio formats.", 203 | "validate_text_desc": "Validate text length and get splitting suggestions.", 204 | "generate_speech_desc": "Generate speech from text.", 205 | "response_example": "Response Example:", 206 | "request_body": "Request Body:", 207 | "parameters": "Parameters:", 208 | "text_param": "Text to convert to speech", 209 | "voice_param": "Voice ID (default: \"alloy\")", 210 | "format_param": "Audio format (default: \"mp3\")", 211 | "instructions_param": "Voice modulation instructions", 212 | "max_length_param": "Maximum text length (default: 1000)", 213 | "validate_length_param": "Enable validation (default: true)", 214 | "response": "Response:", 215 | "response_audio": "Returns audio file with appropriate Content-Type header.", 216 | "response_combined_audio": "Returns a single audio file containing all chunks combined seamlessly.", 217 | "required": "required", 218 | "optional": "optional", 219 | "python_package_title": "Python Package", 220 | "long_text_support": "Long Text Support", 221 | "long_text_desc": "The TTSFM Python package includes built-in long text splitting functionality for developers who need fine-grained control:", 222 | "developer_features": "Developer Features:", 223 | "manual_splitting": "Manual Splitting: Full control over text chunking for advanced 
use cases", 224 | "word_preservation": "Word Preservation: Maintains word boundaries for natural speech", 225 | "separate_files": "Separate Files: Each chunk saved as individual audio file", 226 | "cli_support": "CLI Support: Use `--split-long-text` flag for command-line usage", 227 | "note": "Note:", 228 | "auto_combine_note": "For web users, the auto-combine feature in `/v1/audio/speech` is recommended as it automatically handles long text and returns a single seamless audio file.", 229 | "combined_audio_desc": "Generate a single combined audio file from long text. Automatically splits text into chunks, generates speech for each chunk, and combines them into one seamless audio file.", 230 | "response_headers": "Response Headers:", 231 | "chunks_combined_header": "Number of chunks that were combined", 232 | "original_text_length_header": "Original text length in characters", 233 | "audio_size_header": "Final audio file size in bytes", 234 | "openai_compatible_desc": "Enhanced OpenAI-compatible endpoint with auto-combine feature. 
Automatically handles long text by splitting and combining audio chunks when needed.", 235 | "enhanced_parameters": "Enhanced Parameters:", 236 | "auto_combine_param": "Automatically split long text and combine audio chunks into a single file", 237 | "auto_combine_false": "Return error if text exceeds max_length (standard OpenAI behavior)", 238 | "max_length_chunk_param": "Maximum characters per chunk when splitting", 239 | "auto_combine_header": "Whether auto-combine was enabled (true/false)", 240 | "chunks_combined_response": "Number of audio chunks combined (1 for short text)", 241 | "original_text_response": "Original text length (for long text processing)", 242 | "audio_format_header": "Audio format of the response", 243 | "audio_size_response": "Audio file size in bytes", 244 | "short_text_comment": "Short text (works normally)", 245 | "long_text_auto_comment": "Long text with auto-combine (default)", 246 | "long_text_no_auto_comment": "Long text without auto-combine (will error)", 247 | "audio_combination": "Audio Combination:", 248 | "audio_combination_desc": "Uses advanced audio processing (PyDub) when available, with intelligent fallbacks for different environments. 
Supports all audio formats.", 249 | "use_cases": "Use Cases:", 250 | "use_case_articles": "Long Articles: Convert blog posts or articles to single audio files", 251 | "use_case_audiobooks": "Audiobooks: Generate chapters as single audio files", 252 | "use_case_podcasts": "Podcasts: Create podcast episodes from scripts", 253 | "use_case_education": "Educational Content: Convert learning materials to audio", 254 | "example_usage": "Example Usage:", 255 | "python_example_comment": "Python example", 256 | "operations": "Operational Notes", 257 | "operational_title": "Operational Notes", 258 | "operational_limit": "Requests above 1000 characters are automatically split when auto_combine is enabled; disable validation to manage chunking yourself.", 259 | "operational_fallback": "MP3 requests return MP3. OPUS, AAC, FLAC, WAV, and PCM map to WAV for reliable playback.", 260 | "operational_backend": "Audio comes from the third-party openai.fm service; availability may change without notice—add graceful fallbacks.", 261 | "operational_ffmpeg": "The Docker image bundles ffmpeg so combined MP3 responses work immediately without extra setup." 262 | } 263 | } --------------------------------------------------------------------------------