├── .flake8
├── requirements.txt
├── ttsfm-web
    ├── run.py
    ├── requirements.txt
    ├── static
    │   └── js
    │   │   ├── api-client.js
    │   │   └── i18n.js
    ├── templates
    │   ├── index.html
    │   └── base.html
    ├── i18n.py
    ├── translations
    │   ├── zh.json
    │   └── en.json
    └── websocket_handler.py
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report.md
    └── workflows
    │   ├── release.yml
    │   ├── docker-build-full.yml
    │   └── docker-build-slim.yml
├── .env.example
├── LICENSE
├── docs
    ├── architecture.md
    ├── docker-workflows.md
    ├── v3.4-dual-image-implementation.md
    └── websocket-streaming.md
├── CONTRIBUTING.md
├── Dockerfile
├── .gitignore
├── tests
    ├── test_utils.py
    ├── test_web_app.py
    ├── test_clients.py
    └── test_audio_processing.py
├── ttsfm
    ├── capabilities.py
    ├── audio.py
    ├── __init__.py
    ├── audio_processing.py
    ├── exceptions.py
    ├── models.py
    └── cli.py
├── README.zh.md
├── pyproject.toml
├── scripts
    └── test_websocket.py
└── README.md


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | extend-ignore = E203,W503,E501
4 | exclude = .venv,build,dist,ttsfm.egg-info
5 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies for the TTSFM package
2 | requests>=2.25.0
3 | aiohttp>=3.8.0
4 | python-dotenv>=1.0.1
5 | 


--------------------------------------------------------------------------------
/ttsfm-web/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | Run script for TTSFM web application with proper eventlet initialization
 4 | """
 5 | 
 6 | import eventlet
 7 | 
 8 | eventlet.monkey_patch()
 9 | # Imported only after monkey_patch(): eventlet wants the stdlib patched before app loads.
10 | from app import DEBUG, HOST, PORT, app, socketio  # noqa: E402
11 | 
12 | if __name__ == "__main__":
13 |     print(f"Starting TTSFM with WebSocket support on {HOST}:{PORT}")
14 |     socketio.run(app, host=HOST, port=PORT, debug=DEBUG, allow_unsafe_werkzeug=True)
15 | 


--------------------------------------------------------------------------------
/ttsfm-web/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Web application dependencies
 2 | argon2-cffi>=23.1.0
 3 | flask>=2.0.0
 4 | flask-cors>=3.0.10
 5 | flask-socketio>=5.3.0
 6 | python-socketio>=5.10.0
 7 | eventlet>=0.33.3
 8 | waitress>=3.0.0
 9 | python-dotenv>=1.0.0
10 | 
11 | # Audio processing (optional, for combining audio files)
12 | # If not installed, will fall back to simple concatenation for WAV files
13 | pydub>=0.25.0
14 | 
15 | # TTSFM package (install from local directory or PyPI)
16 | # For local development: pip install -e ../
17 | # For Docker/production: installed via pyproject.toml[web] dependencies
18 | 
19 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
 1 | # TTSFM Environment Configuration
 2 | 
 3 | # Server Configuration
 4 | HOST=0.0.0.0
 5 | PORT=8000
 6 | 
 7 | # SSL Configuration
 8 | VERIFY_SSL=true
 9 | 
10 | # Flask Configuration
11 | FLASK_ENV=production
12 | FLASK_APP=app.py
13 | DEBUG=false
14 | 
15 | # API Key Protection (Optional)
16 | # Set REQUIRE_API_KEY=true to enable API key authentication
17 | REQUIRE_API_KEY=false
18 | 
19 | # Set your API key here when protection is enabled
20 | # This key will be required for all TTS generation requests
21 | TTSFM_API_KEY=your-secret-api-key-here
22 | 
23 | # Example usage:
24 | # 1. Set REQUIRE_API_KEY=true
25 | # 2. Set TTSFM_API_KEY to your desired secret key
26 | # 3. Restart the application
27 | # 4. All TTS requests will now require the API key in:
28 | #    - Authorization header (Bearer token) - OpenAI compatible
29 | #    - X-API-Key header
30 | #    - api_key query parameter
31 | #    - api_key in JSON body
32 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 dbcccc
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ttsfm-web/static/js/api-client.js:
--------------------------------------------------------------------------------
 1 | const CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
 2 | const cache = new Map(); // url -> { data, expiresAt }; a null expiresAt never expires
 3 | 
 4 | function shouldUseCache(entry) {
 5 |   if (!entry) {
 6 |     return false;
 7 |   }
 8 |   if (entry.expiresAt === null) {
 9 |     return true;
10 |   }
11 |   return Date.now() < entry.expiresAt;
12 | }
13 | 
14 | async function fetchWithCache(url, { signal, refresh = false } = {}) {
15 |   if (!refresh) {
16 |     const cached = cache.get(url);
17 |     if (shouldUseCache(cached)) {
18 |       return cached.data;
19 |     }
20 |   }
21 | 
22 |   const response = await fetch(url, { signal });
23 |   if (!response.ok) {
24 |     throw new Error(`Request to ${url} failed with status ${response.status}`);
25 |   }
26 |   const data = await response.json();
27 |   cache.set(url, { data, expiresAt: Date.now() + CACHE_TTL_MS });
28 |   return data;
29 | }
30 | 
31 | export function clearCache(urlPrefix) {
32 |   if (!urlPrefix) {
33 |     cache.clear();
34 |     return;
35 |   }
36 |   for (const key of Array.from(cache.keys())) {
37 |     if (key.startsWith(urlPrefix)) {
38 |       cache.delete(key);
39 |     }
40 |   }
41 | }
42 | 
43 | export function fetchVoices(options = {}) {
44 |   return fetchWithCache('/api/voices', options); // cached GET; options: { signal, refresh }
45 | }
46 | 
47 | export function fetchFormats(options = {}) {
48 |   return fetchWithCache('/api/formats', options); // cached GET; options: { signal, refresh }
49 | }
50 | 
51 | export function primeCache(url, data, ttlMs = CACHE_TTL_MS) {
52 |   cache.set(url, { data, expiresAt: ttlMs === null ? null : Date.now() + ttlMs }); // null ttlMs: never expires
53 | }
54 | 


--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
 1 | # TTSFM Architecture Overview
 2 | 
 3 | ```
 4 | +----------------+       +---------------------+       +----------------------+
 5 | | Frontend (JS)  | <---> | Flask REST Endpoints| <---> | OpenAI.fm upstream   |
 6 | | Playground UI  |       | /api/* + /v1/audio  |       | reverse-engineered   |
 7 | +----------------+       +---------------------+       +----------------------+
 8 |         |                               ^
 9 |         v                               |
10 | +----------------+       +--------------------+
11 | | Socket.IO WS   | <---> | WebSocket Handler  |
12 | | streaming UI   |       | (background tasks) |
13 | +----------------+       +--------------------+
14 | ```
15 | 
16 | - **Synchronous Client (`TTSClient`)** – Used by both REST endpoints and the WebSocket handler. Each request gets an isolated client instance, preventing shared session races.
17 | - **Async Client (`AsyncTTSClient`)** – Available to external consumers that want fully asynchronous workflows.
18 | - **Utilities** – Shared helpers handle sanitisation, deterministic headers, and text splitting for both HTTP and WebSocket flows.
19 | 
20 | The repo ships with a Docker image that bundles the Flask app, Socket.IO server, and static assets. A per-request TTS client ensures concurrency safety; outgoing prompt tuning is opt-in through the `use_default_prompt` flag.
21 | 
22 | For more implementation details see:
23 | 
24 | - `ttsfm-web/app.py` – Flask routes, streaming combination logic, API key security.
25 | - `ttsfm-web/websocket_handler.py` – Background task orchestration and streaming chunk delivery.
26 | - `ttsfm/utils.py` – Sanitisation, deterministic headers, and text chunk helpers.
27 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to TTSFM
 2 | 
 3 | Thanks for your interest in improving TTSFM! This document outlines the local development workflow and quality gates that every pull request must satisfy.
 4 | 
 5 | ## 1. Set Up Your Environment
 6 | 
 7 | ```bash
 8 | # Clone and create a virtual environment of your choice
 9 | python -m venv .venv
10 | source .venv/bin/activate  # Windows: .venv\Scripts\activate
11 | 
12 | # Install the package with all tooling and web extras
13 | pip install -e ".[web,dev]"
14 | ```
15 | 
16 | ## 2. Run the Test Suite
17 | 
18 | ```bash
19 | pytest
20 | ```
21 | 
22 | Add new tests alongside your changes—patches without coverage for new behaviour will be sent back for revision.
23 | 
24 | ## 3. Lint and Type-Check
25 | 
26 | We keep the codebase consistent and catch regressions early with these checks:
27 | 
28 | ```bash
29 | black --check ttsfm ttsfm-web tests
30 | flake8 ttsfm ttsfm-web
31 | mypy ttsfm
32 | ```
33 | 
34 | Format your code with `black` and resolve lint/type errors before opening a pull request.
35 | 
36 | ## 4. Web UI Smoke Tests
37 | 
38 | If you touch the Flask app or frontend assets, run the web server locally and exercise the basic flows (text input, long-form combine, WebSocket streaming). For asynchronous features, open two browser tabs and confirm cancellation works.
39 | 
40 | ## 5. Commit & Pull Request Guidelines
41 | 
42 | - Keep commits focused; squash trivial fixups before submitting.
43 | - Describe _why_ a change is needed in the PR description.
44 | - Link to an issue if one exists.
45 | - Document behaviour changes in `CHANGELOG.md` when relevant.
46 | 
47 | Questions or ideas? Open a discussion thread or drop by the issue tracker—we’re happy to help.
48 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Build argument to control image variant (full or slim)
 2 | ARG VARIANT=full
 3 | 
 4 | FROM python:3.11-slim AS builder
 5 | 
 6 | WORKDIR /app
 7 | 
 8 | ENV PYTHONDONTWRITEBYTECODE=1 \
 9 |     PYTHONUNBUFFERED=1
10 | 
11 | RUN apt-get update \
12 |     && apt-get install -y --no-install-recommends build-essential \
13 |     && rm -rf /var/lib/apt/lists/*
14 | 
15 | COPY pyproject.toml ./
16 | COPY README.md ./
17 | COPY requirements.txt ./
18 | COPY ttsfm/ ./ttsfm/
19 | 
20 | ARG VERSION=0.0.0
21 | ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}
22 | 
23 | # Best-effort cleanups are parenthesised so "|| true" cannot hide a failed pip install.
24 | RUN pip install --no-cache-dir --upgrade pip \
25 |     && pip install --no-cache-dir --prefix /install .[web] \
26 |     && (find /install -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true) \
27 |     && find /install -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \
28 |     && (find /install -type d -name 'tests' -exec rm -rf {} + 2>/dev/null || true) \
29 |     && (find /install -type d -name 'test' -exec rm -rf {} + 2>/dev/null || true) \
30 |     && (find /install -name '*.dist-info' -type d -exec sh -c 'rm -f "$1"/RECORD "$1"/INSTALLER' sh {} \; 2>/dev/null || true)
31 | 
32 | FROM python:3.11-slim
33 | 
34 | # Re-declare ARG after FROM to make it available in this stage
35 | ARG VARIANT=full
36 | 
37 | ENV PYTHONDONTWRITEBYTECODE=1 \
38 |     PYTHONUNBUFFERED=1 \
39 |     PORT=8000 \
40 |     TTSFM_VARIANT=${VARIANT}
41 | 
42 | WORKDIR /app
43 | 
44 | # Conditional ffmpeg installation based on variant
45 | # Full variant: includes ffmpeg for MP3 combining, speed adjustment, and format conversion
46 | # Slim variant: minimal image without ffmpeg (WAV-only auto-combine, no speed adjustment)
47 | RUN apt-get update \
48 |     && if [ "$VARIANT" = "full" ]; then \
49 |          apt-get install -y --no-install-recommends ffmpeg; \
50 |        fi \
51 |     && rm -rf /var/lib/apt/lists/* \
52 |     && useradd --create-home --shell /usr/sbin/nologin ttsfm
53 | 
54 | COPY --from=builder /install /usr/local
55 | ENV PATH="/usr/local/bin:$PATH"
56 | 
57 | COPY --chown=ttsfm:ttsfm ttsfm-web/ ./ttsfm-web/
58 | 
59 | USER ttsfm
60 | 
61 | EXPOSE 8000
62 | 
63 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
64 |     CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health', timeout=5)"]
65 | 
66 | WORKDIR /app/ttsfm-web
67 | CMD ["python", "run.py"]
68 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Python
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | *.so
  6 | .Python
  7 | build/
  8 | develop-eggs/
  9 | dist/
 10 | downloads/
 11 | eggs/
 12 | .eggs/
 13 | lib/
 14 | lib64/
 15 | parts/
 16 | sdist/
 17 | var/
 18 | wheels/
 19 | *.egg-info/
 20 | .installed.cfg
 21 | *.egg
 22 | MANIFEST
 23 | 
 24 | # Virtual Environment
 25 | venv/
 26 | env/
 27 | ENV/
 28 | .venv/
 29 | 
 30 | # Environment variables
 31 | .env
 32 | .env.local
 33 | .env.production
 34 | 
 35 | # IDE
 36 | .idea/
 37 | .vscode/
 38 | *.swp
 39 | *.swo
 40 | .spyderproject
 41 | .spyproject
 42 | 
 43 | # OS
 44 | .DS_Store
 45 | .DS_Store?
 46 | ._*
 47 | .Spotlight-V100
 48 | .Trashes
 49 | ehthumbs.db
 50 | Thumbs.db
 51 | 
 52 | # Generated audio files (for testing)
 53 | *.mp3
 54 | *.wav
 55 | *.opus
 56 | *.aac
 57 | *.flac
 58 | *.pcm
 59 | test_output.*
 60 | output.*
 61 | hello.*
 62 | speech.*
 63 | 
 64 | # Logs
 65 | *.log
 66 | logs/
 67 | .pytest_cache/
 68 | 
 69 | # Temporary files
 70 | tmp/
 71 | temp/
 72 | .tmp/
 73 | 
 74 | # Coverage reports
 75 | htmlcov/
 76 | .coverage
 77 | .coverage.*
 78 | coverage.xml
 79 | *.cover
 80 | .hypothesis/
 81 | 
 82 | # Documentation builds
 83 | docs/_build/
 84 | site/
 85 | 
 86 | # Package builds
 87 | *.tar.gz
 88 | *.whl
 89 | dist/
 90 | build/
 91 | 
 92 | # MyPy
 93 | .mypy_cache/
 94 | .dmypy.json
 95 | dmypy.json
 96 | 
 97 | # Jupyter Notebook
 98 | .ipynb_checkpoints
 99 | 
100 | # pyenv
101 | .python-version
102 | 
103 | # pipenv
104 | Pipfile.lock
105 | 
106 | # PEP 582
107 | __pypackages__/
108 | 
109 | # Celery
110 | celerybeat-schedule
111 | celerybeat.pid
112 | 
113 | # SageMath parsed files
114 | *.sage.py
115 | 
116 | # Rope project settings
117 | .ropeproject
118 | 
119 | # mkdocs documentation
120 | /site
121 | 
122 | # Pyre type checker
123 | .pyre/
124 | 
125 | # Additional exclusions for GitHub
126 | 
127 | # API Keys and Secrets
128 | config.json
129 | secrets.json
130 | .secrets
131 | api_keys.txt
132 | 
133 | # Database files
134 | *.db
135 | *.sqlite
136 | *.sqlite3
137 | 
138 | # Backup files
139 | *.bak
140 | *.backup
141 | *~
142 | 
143 | # Node.js (if using any JS tools)
144 | node_modules/
145 | npm-debug.log*
146 | yarn-debug.log*
147 | yarn-error.log*
148 | 
149 | # Docker
150 | .dockerignore
151 | Dockerfile.dev
152 | docker-compose.override.yml
153 | 
154 | # Local configuration
155 | local_settings.py
156 | local_config.py
157 | 
158 | # Claude
159 | .claude/
160 | VERSION_BUMP_GUIDE.md
161 | scripts/test_audio_generation.py
162 | /artifacts
163 | test.py
164 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | 
 3 | import pytest
 4 | 
 5 | import ttsfm.utils as utils
 6 | 
 7 | 
 8 | def test_split_text_preserves_sentence_punctuation():
 9 |     text = "First sentence! Second question? Final statement."
10 |     chunks = utils.split_text_by_length(text, max_length=15)
11 | 
12 |     assert chunks[0].endswith("!"), chunks
13 |     assert any(chunk.endswith("?") for chunk in chunks), chunks
14 |     assert chunks[-1].endswith("."), chunks
15 | 
16 | 
17 | def test_split_text_handles_oversized_sentence():
18 |     long_sentence = " ".join(["word"] * 600)
19 |     chunks = utils.split_text_by_length(long_sentence, max_length=120)
20 | 
21 |     assert all(len(chunk) <= 120 for chunk in chunks)
22 |     assert sum(len(chunk.split()) for chunk in chunks) == 600
23 | 
24 | 
25 | def test_split_text_handles_extremely_long_word():
26 |     max_length = 50
27 |     painful_word = "a" * 140
28 |     text = f"start {painful_word} end"
29 | 
30 |     chunks = utils.split_text_by_length(text, max_length=max_length)
31 | 
32 |     assert any(painful_word[:max_length] in chunk for chunk in chunks)
33 |     assert all(len(chunk) <= max_length for chunk in chunks)
34 | 
35 | 
36 | def test_sanitize_text_retains_ampersands():
37 |     text = "R&D and Fish & Chips &amp; Co. <b>Bold</b>"
38 |     sanitized = utils.sanitize_text(text)
39 | 
40 |     assert "R&D" in sanitized
41 |     assert "Fish & Chips" in sanitized
42 |     assert "Bold" in sanitized
43 |     assert "<" not in sanitized
44 | 
45 | 
46 | def test_header_generation_deterministic_upgrade_flag(monkeypatch):
47 |     module = importlib.reload(utils)
48 | 
49 |     headers_first = module.get_realistic_headers()
50 |     headers_second = module.get_realistic_headers()
51 | 
52 |     assert "Upgrade-Insecure-Requests" in headers_first
53 |     assert "Upgrade-Insecure-Requests" not in headers_second
54 |     assert headers_first["Accept-Language"] != headers_second["Accept-Language"]
55 | 
56 | 
57 | @pytest.mark.asyncio
58 | async def test_async_batch_propagates_original_exception(monkeypatch):
59 |     from ttsfm.async_client import AsyncTTSClient
60 |     from ttsfm.exceptions import NetworkException
61 |     from ttsfm.models import TTSRequest, Voice
62 | 
63 |     client = AsyncTTSClient()
64 | 
65 |     async def fail_request(_request):
66 |         raise NetworkException("boom")
67 | 
68 |     monkeypatch.setattr(client, "_make_request", fail_request)
69 | 
70 |     request = TTSRequest(input="hello", voice=Voice.ALLOY)
71 | 
72 |     with pytest.raises(NetworkException):
73 |         await client.generate_speech_batch([request])
74 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
  1 | name: Release and Publish
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - 'v*'  # Triggers on version tags like v1.0.0, v3.0.1, etc.
  7 | 
  8 | permissions:
  9 |   contents: write
 10 |   id-token: write
 11 | 
 12 | jobs:
 13 |   release-and-publish:
 14 |     runs-on: ubuntu-latest
 15 | 
 16 |     steps:
 17 |     - uses: actions/checkout@v4
 18 | 
 19 |     - name: Set up Python
 20 |       uses: actions/setup-python@v4
 21 |       with:
 22 |         python-version: '3.11'
 23 | 
 24 |     - name: Install dependencies
 25 |       run: |
 26 |         python -m pip install --upgrade pip
 27 |         pip install build twine
 28 |         pip install '.[web,dev]'
 29 | 
 30 |     - name: Run linters and type checks
 31 |       run: |
 32 |         flake8 ttsfm ttsfm-web
 33 |         mypy ttsfm
 34 |         black --check ttsfm ttsfm-web tests
 35 | 
 36 |     - name: Run tests
 37 |       run: pytest
 38 | 
 39 |     - name: Test package install and import
 40 |       run: |
 41 |         python -c "import ttsfm; print('TTSFM imported successfully')"
 42 |         python -c "from ttsfm import TTSClient; print('TTSClient imported successfully')"
 43 |         python -m ttsfm.cli --help > /dev/null
 44 |         echo 'CLI smoke test passed'
 45 | 
 46 |     - name: Build package
 47 |       run: |
 48 |         python -m build
 49 |         echo "Package built successfully"
 50 |         ls -la dist/
 51 | 
 52 |     - name: Check package
 53 |       run: |
 54 |         twine check dist/*
 55 |         echo "Package validation passed"
 56 | 
 57 |     - name: Publish to PyPI
 58 |       uses: pypa/gh-action-pypi-publish@release/v1
 59 |       with:
 60 |         attestations: true
 61 |         skip-existing: true
 62 | 
 63 |     - name: Extract version (strip leading v)
 64 |       id: ver
 65 |       run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"
 66 | 
 67 |     - name: Create GitHub Release
 68 |       uses: softprops/action-gh-release@v1
 69 |       with:
 70 |         body: |
 71 |           ## TTSFM ${{ github.ref_name }}
 72 | 
 73 |           New release of TTSFM - Free Text-to-Speech API with OpenAI compatibility.
 74 | 
 75 |           ### Installation
 76 |           ```bash
 77 |           pip install ttsfm==${{ steps.ver.outputs.version }}
 78 |           ```
 79 | 
 80 |           ### Quick Start
 81 |           ```python
 82 |           from ttsfm import TTSClient
 83 | 
 84 |           client = TTSClient()
 85 |           response = client.generate_speech("Hello from TTSFM!")
 86 |           response.save_to_file("hello")
 87 |           ```
 88 | 
 89 |           ### Docker
 90 |           ```bash
 91 |           docker run -p 8000:8000 dbcccc/ttsfm:latest
 92 |           ```
 93 | 
 94 |           ### Features
 95 |           - Completely free (uses openai.fm service)
 96 |           - OpenAI-compatible API
 97 |           - 11 voices available
 98 |           - 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM)
 99 |           - Async and sync clients
100 |           - Web interface included
101 |           - CLI tool available
102 | 
103 |           ### Documentation
104 |           See [README](https://github.com/dbccccccc/ttsfm#readme) for full documentation.
105 |         draft: false
106 |         prerelease: ${{ contains(github.ref_name, '-' ) }}
107 | 
108 | 


--------------------------------------------------------------------------------
/tests/test_web_app.py:
--------------------------------------------------------------------------------
  1 | import importlib
  2 | import importlib.util
  3 | import sys
  4 | from pathlib import Path
  5 | 
  6 | import pytest
  7 | 
  8 | WEB_DIR = Path(__file__).resolve().parents[1] / "ttsfm-web"
  9 | MODULE_NAME = "ttsfm_web.app"
 10 | 
 11 | 
 12 | def load_web_app(monkeypatch, **env):
 13 |     for key, value in env.items():
 14 |         if value is None:
 15 |             monkeypatch.delenv(key, raising=False)
 16 |         else:
 17 |             monkeypatch.setenv(key, value)
 18 | 
 19 |     sys.modules.pop(MODULE_NAME, None)
 20 |     sys.modules.pop("ttsfm_web", None)
 21 |     sys.modules.pop("websocket_handler", None)
 22 | 
 23 |     web_dir_str = str(WEB_DIR)
 24 |     if web_dir_str not in sys.path:
 25 |         sys.path.insert(0, web_dir_str)
 26 | 
 27 |     pkg_spec = importlib.util.spec_from_loader("ttsfm_web", loader=None)
 28 |     pkg = importlib.util.module_from_spec(pkg_spec)
 29 |     pkg.__path__ = [web_dir_str]  # type: ignore[attr-defined]
 30 |     sys.modules.setdefault("ttsfm_web", pkg)
 31 | 
 32 |     spec = importlib.util.spec_from_file_location(MODULE_NAME, WEB_DIR / "app.py")
 33 |     module = importlib.util.module_from_spec(spec)
 34 |     assert spec and spec.loader
 35 |     spec.loader.exec_module(module)  # type: ignore[attr-defined]
 36 |     return module
 37 | 
 38 | 
 39 | def test_voices_endpoint_returns_data(monkeypatch):
 40 |     module = load_web_app(monkeypatch, REQUIRE_API_KEY="false", TTSFM_API_KEY=None)
 41 |     client = module.app.test_client()
 42 |     response = client.get("/api/voices")
 43 |     assert response.status_code == 200
 44 |     payload = response.get_json()
 45 |     assert payload["count"] == len(payload["voices"])
 46 | 
 47 | 
 48 | def test_combine_audio_chunks_uses_format_hint(monkeypatch):
 49 |     """Chunks with a non-MP3 hint ("opus") are expected to be decoded and exported as WAV."""
 50 |     load_web_app(monkeypatch, REQUIRE_API_KEY="false", TTSFM_API_KEY=None)
 51 |     from ttsfm import audio as audio_module
 52 | 
 53 |     class DummySegment:
 54 |         def __init__(self, tag: str):
 55 |             self.tag = tag
 56 | 
 57 |         def __iadd__(self, other: "DummySegment"):
 58 |             self.tag += other.tag
 59 |             return self
 60 | 
 61 |         def export(self, buffer, format: str):
 62 |             buffer.write(f"{format}:{self.tag}".encode())
 63 | 
 64 |     class DummyAudioSegment:
 65 |         formats = []  # loader names, in call order
 66 | 
 67 |         @classmethod
 68 |         def from_mp3(cls, buffer):
 69 |             cls.formats.append("mp3")
 70 |             return DummySegment("mp3")
 71 | 
 72 |         @classmethod
 73 |         def from_wav(cls, buffer):
 74 |             cls.formats.append("wav")
 75 |             return DummySegment("wav")
 76 | 
 77 |     monkeypatch.setattr(audio_module, "AudioSegment", DummyAudioSegment)
 78 |     # The fake export writes "<format>:<joined tags>", exposing both loader and export format.
 79 |     output = audio_module.combine_audio_chunks([b"one", b"two"], "opus")
 80 | 
 81 |     assert output == b"wav:wavwav"
 82 |     assert DummyAudioSegment.formats == ["wav", "wav"]
 83 | 
 84 | 
 85 | @pytest.mark.parametrize(
 86 |     "header_name, header_value",
 87 |     [
 88 |         ("Authorization", "Bearer super-secret"),
 89 |         ("X-API-Key", "super-secret"),
 90 |     ],
 91 | )
 92 | def test_api_key_hash_verification(monkeypatch, header_name, header_value):
 93 |     module = load_web_app(monkeypatch, REQUIRE_API_KEY="true", TTSFM_API_KEY="super-secret")
 94 |     client = module.app.test_client()
 95 | 
 96 |     denied = client.post("/api/validate-text", json={"text": "hello"})
 97 |     assert denied.status_code == 401
 98 | 
 99 |     headers = {header_name: header_value}
100 |     response = client.post("/api/validate-text", json={"text": "hello"}, headers=headers)
101 |     assert response.status_code == 200
102 | 


--------------------------------------------------------------------------------
/tests/test_clients.py:
--------------------------------------------------------------------------------
  1 | import types
  2 | 
  3 | import pytest
  4 | 
  5 | from ttsfm.async_client import AsyncTTSClient
  6 | from ttsfm.client import TTSClient
  7 | from ttsfm.models import AudioFormat, TTSResponse
  8 | 
  9 | 
 10 | def _mk_response(data: bytes) -> TTSResponse:
 11 |     return TTSResponse(
 12 |         audio_data=data,
 13 |         content_type="audio/mpeg",
 14 |         format=AudioFormat.MP3,
 15 |         size=len(data),
 16 |     )
 17 | 
 18 | 
 19 | class _DummyResponse:
 20 |     def __init__(self, content_type: str, content: bytes, url: str = "https://example.test/audio"):
 21 |         self.status_code = 200
 22 |         self.headers = {"content-type": content_type}
 23 |         self.content = content
 24 |         self.url = url
 25 |         self.text = ""
 26 | 
 27 |     def json(self):  # pragma: no cover - not used on success path
 28 |         return {}
 29 | 
 30 | 
 31 | def test_sync_request_normalizes_non_mp3_format(monkeypatch):
 32 |     client = TTSClient()
 33 |     captured = {}
 34 | 
 35 |     def fake_post(self, url, data=None, headers=None, timeout=None, verify=None):
 36 |         captured["data"] = data
 37 |         return _DummyResponse("audio/wav", b"RIFF" + b"\x00" * 64, url)
 38 | 
 39 |     monkeypatch.setattr(client.session, "post", types.MethodType(fake_post, client.session))
 40 | 
 41 |     response = client.generate_speech(text="hello", voice="alloy", response_format=AudioFormat.FLAC)
 42 | 
 43 |     assert captured["data"]["response_format"] == "wav"
 44 |     assert response.format is AudioFormat.WAV
 45 | 
 46 | 
 47 | def test_sync_request_preserves_mp3_format(monkeypatch):
 48 |     client = TTSClient()
 49 |     captured = {}
 50 | 
 51 |     def fake_post(self, url, data=None, headers=None, timeout=None, verify=None):
 52 |         captured["data"] = data
 53 |         return _DummyResponse("audio/mpeg", b"ID3" + b"\x00" * 64, url)
 54 | 
 55 |     monkeypatch.setattr(client.session, "post", types.MethodType(fake_post, client.session))
 56 | 
 57 |     response = client.generate_speech(text="hello", voice="alloy", response_format=AudioFormat.MP3)
 58 | 
 59 |     assert captured["data"]["response_format"] == "mp3"
 60 |     assert response.format is AudioFormat.MP3
 61 | 
 62 | 
 63 | def test_sync_long_text_auto_combine(monkeypatch):
 64 |     client = TTSClient()
 65 | 
 66 |     monkeypatch.setattr(
 67 |         client,
 68 |         "generate_speech_batch",
 69 |         lambda **kwargs: [_mk_response(b"one"), _mk_response(b"two")],
 70 |     )
 71 | 
 72 |     combined_flag = {}
 73 | 
 74 |     def fake_combine(responses):
 75 |         combined_flag["called"] = True
 76 |         return _mk_response(b"onetwo")
 77 | 
 78 |     monkeypatch.setattr("ttsfm.client.combine_responses", fake_combine)
 79 | 
 80 |     result = client.generate_speech_long_text(
 81 |         text="dummy",
 82 |         auto_combine=True,
 83 |     )
 84 | 
 85 |     assert combined_flag["called"] is True
 86 |     assert isinstance(result, TTSResponse)
 87 |     assert result.audio_data == b"onetwo"
 88 | 
 89 | 
 90 | def test_sync_long_text_returns_list_without_auto_combine(monkeypatch):
 91 |     client = TTSClient()
 92 | 
 93 |     responses = [_mk_response(b"one")]
 94 |     monkeypatch.setattr(client, "generate_speech_batch", lambda **_: responses)
 95 | 
 96 |     result = client.generate_speech_long_text(text="dummy", auto_combine=False)
 97 | 
 98 |     assert result is responses
 99 | 
100 | 
@pytest.mark.asyncio
async def test_async_long_text_auto_combine(monkeypatch):
    """The async client merges batch chunks into one ``TTSResponse`` when asked to."""
    client = AsyncTTSClient()

    async def _stub_batch(**kwargs):
        # Pretend the text was split into two chunks.
        return [_mk_response(b"one"), _mk_response(b"two")]

    def _stub_combine(responses):
        return _mk_response(b"onetwo")

    monkeypatch.setattr(client, "generate_speech_batch", _stub_batch)
    monkeypatch.setattr("ttsfm.async_client.combine_responses", _stub_combine)

    merged = await client.generate_speech_long_text(text="dummy", auto_combine=True)

    assert isinstance(merged, TTSResponse)
    assert merged.audio_data == b"onetwo"
122 | 


--------------------------------------------------------------------------------
/ttsfm/capabilities.py:
--------------------------------------------------------------------------------
  1 | """System capabilities detection for TTSFM.
  2 | 
  3 | This module provides runtime detection of available features based on
  4 | system dependencies (primarily ffmpeg availability).
  5 | """
  6 | 
  7 | from __future__ import annotations
  8 | 
  9 | import shutil
 10 | from typing import Dict, List
 11 | 
 12 | 
 13 | class SystemCapabilities:
 14 |     """Detect and report system capabilities.
 15 | 
 16 |     This class checks for the availability of optional dependencies
 17 |     (like ffmpeg) and reports which features are available in the
 18 |     current environment.
 19 |     """
 20 | 
 21 |     def __init__(self) -> None:
 22 |         """Initialize capabilities detection."""
 23 |         self.ffmpeg_available = shutil.which("ffmpeg") is not None
 24 | 
 25 |     def get_capabilities(self) -> Dict:
 26 |         """Get complete system capabilities report.
 27 | 
 28 |         Returns:
 29 |             Dict containing:
 30 |                 - ffmpeg_available: bool
 31 |                 - image_variant: "full" or "slim"
 32 |                 - features: dict of feature availability
 33 |                 - supported_formats: list of supported audio formats
 34 |         """
 35 |         return {
 36 |             "ffmpeg_available": self.ffmpeg_available,
 37 |             "image_variant": "full" if self.ffmpeg_available else "slim",
 38 |             "features": {
 39 |                 "speed_adjustment": self.ffmpeg_available,
 40 |                 "format_conversion": self.ffmpeg_available,
 41 |                 "mp3_auto_combine": self.ffmpeg_available,
 42 |                 "basic_formats": True,  # MP3, WAV always available
 43 |             },
 44 |             "supported_formats": self.get_supported_formats(),
 45 |         }
 46 | 
 47 |     def get_supported_formats(self) -> List[str]:
 48 |         """Get list of supported audio formats.
 49 | 
 50 |         Returns:
 51 |             List of format names (e.g., ["mp3", "wav", "opus", ...])
 52 |         """
 53 |         basic = ["mp3", "wav"]
 54 |         if self.ffmpeg_available:
 55 |             return basic + ["opus", "aac", "flac", "pcm"]
 56 |         return basic
 57 | 
 58 |     def requires_ffmpeg(self, feature: str) -> bool:
 59 |         """Check if a feature requires ffmpeg.
 60 | 
 61 |         Args:
 62 |             feature: Feature name or format name to check
 63 | 
 64 |         Returns:
 65 |             True if the feature requires ffmpeg, False otherwise
 66 |         """
 67 |         ffmpeg_features = {
 68 |             "speed_adjustment",
 69 |             "format_conversion",
 70 |             "mp3_auto_combine",
 71 |             "opus",
 72 |             "aac",
 73 |             "flac",
 74 |             "pcm",
 75 |         }
 76 |         return feature.lower() in ffmpeg_features
 77 | 
 78 |     def check_feature_available(self, feature: str) -> bool:
 79 |         """Check if a specific feature is available.
 80 | 
 81 |         Args:
 82 |             feature: Feature name to check
 83 | 
 84 |         Returns:
 85 |             True if feature is available, False otherwise
 86 |         """
 87 |         if not self.requires_ffmpeg(feature):
 88 |             return True
 89 |         return self.ffmpeg_available
 90 | 
 91 |     def get_unavailable_reason(self, feature: str) -> str | None:
 92 |         """Get reason why a feature is unavailable.
 93 | 
 94 |         Args:
 95 |             feature: Feature name to check
 96 | 
 97 |         Returns:
 98 |             Error message if unavailable, None if available
 99 |         """
100 |         if self.check_feature_available(feature):
101 |             return None
102 | 
103 |         return (
104 |             f"Feature '{feature}' requires ffmpeg. "
105 |             "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
106 |         )
107 | 
108 | 
# Shared instance, created on the first call to get_capabilities()
_capabilities_instance: SystemCapabilities | None = None


def get_capabilities() -> SystemCapabilities:
    """Return the shared ``SystemCapabilities`` instance, creating it on first use.

    Returns:
        The module-wide SystemCapabilities instance
    """
    global _capabilities_instance
    if _capabilities_instance is not None:
        return _capabilities_instance
    _capabilities_instance = SystemCapabilities()
    return _capabilities_instance
123 | 


--------------------------------------------------------------------------------
/README.zh.md:
--------------------------------------------------------------------------------
  1 | # TTSFM - 文本转语音 API 客户端
  2 | 
  3 | > **⚠️ 告示：由于 openai.fm 体验网站已关闭，本项目已无法使用。**
  4 | 
  5 | > **Language / 语言**: [English](README.md) | [中文](README.zh.md)
  6 | 
  7 | [![Docker Pulls](https://img.shields.io/docker/pulls/dbcccc/ttsfm?style=flat-square&logo=docker)](https://hub.docker.com/r/dbcccc/ttsfm)
  8 | [![GitHub Stars](https://img.shields.io/github/stars/dbccccccc/ttsfm?style=social)](https://github.com/dbccccccc/ttsfm)
  9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square)](https://opensource.org/licenses/MIT)
 10 | ![ghcr pulls](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fghcr-badge.elias.eu.org%2Fapi%2Fdbccccccc%2Fttsfm%2Fttsfm&query=downloadCount&label=ghcr+pulls&logo=github)
 11 | 
 12 | ## Star History
 13 | 
 14 | [![Star History Chart](https://api.star-history.com/svg?repos=dbccccccc/ttsfm&type=Date)](https://www.star-history.com/#dbccccccc/ttsfm&Date)
 15 | 
 16 | ## 概述
 17 | 
TTSFM 是一个免费的、兼容 OpenAI 的文本转语音 API 服务，提供将文本转换为自然语音的完整解决方案，底层使用 OpenAI 的 GPT-4o mini TTS 模型。基于 openai.fm 后端构建，提供强大的 Python SDK、RESTful API 接口以及直观的网页 Playground，方便测试和集成。
 19 | 
 20 | **TTSFM 的功能：**
 21 | - 🎤 **多种语音选择**：11 种兼容 OpenAI 的语音（alloy、ash、ballad、coral、echo、fable、nova、onyx、sage、shimmer、verse）
 22 | - 🎵 **灵活的音频格式**：支持 6 种音频格式（MP3、WAV、OPUS、AAC、FLAC、PCM）
 23 | - ⚡ **语速控制**：0.25x 到 4.0x 的播放速度调节，适应不同使用场景
 24 | - 📝 **长文本支持**：自动文本分割和音频合并，支持任意长度内容
 25 | - 🔄 **实时流式传输**：WebSocket 支持流式音频生成
 26 | - 🐍 **Python SDK**：易用的同步和异步客户端
 27 | - 🌐 **网页 Playground**：交互式网页界面，方便测试和实验
 28 | - 🐳 **Docker 就绪**：预构建的 Docker 镜像，即刻部署
 29 | - 🔍 **智能检测**：自动功能检测和友好的错误提示
 30 | - 🤖 **OpenAI 兼容**：可直接替代 OpenAI 的 TTS API
 31 | 
 32 | **v3.4.0 版本的主要特性：**
 33 | - 🎯 镜像变体检测（完整版 vs 精简版 Docker 镜像）
 34 | - 🔍 运行时功能 API，检查特性可用性
 35 | - ⚡ 基于 ffmpeg 的语速调节
 36 | - 🎵 所有 6 种音频格式的真实格式转换
 37 | - 📊 增强的错误处理，提供清晰、可操作的错误信息
 38 | - 🐳 针对不同使用场景优化的双镜像版本
 39 | 
 40 | > **⚠️ 免责声明**：本项目仅用于**学习和研究目的**。这是对 openai.fm 服务的逆向工程实现，不应用于商业用途或生产环境。用户需自行确保遵守适用的法律法规和服务条款。
 41 | 
 42 | ## 安装
 43 | 
 44 | ### Python 包
 45 | 
 46 | ```bash
 47 | pip install ttsfm        # 核心客户端
 48 | pip install ttsfm[web]   # 核心客户端 + Web/服务端依赖
 49 | ```
 50 | 
 51 | ### Docker 镜像
 52 | 
 53 | TTSFM 提供两种 Docker 镜像变体以满足不同需求：
 54 | 
 55 | #### 完整版（推荐）
 56 | ```bash
 57 | docker run -p 8000:8000 dbcccc/ttsfm:latest
 58 | ```
 59 | 
 60 | **包含 ffmpeg，支持高级功能：**
 61 | - ✅ 所有 6 种音频格式（MP3、WAV、OPUS、AAC、FLAC、PCM）
 62 | - ✅ 语速调节（0.25x - 4.0x）
 63 | - ✅ 使用 ffmpeg 进行格式转换
 64 | - ✅ 长文本 MP3 自动合并
 65 | - ✅ 长文本 WAV 自动合并
 66 | 
 67 | #### 精简版
 68 | ```bash
 69 | docker run -p 8000:8000 dbcccc/ttsfm:slim
 70 | ```
 71 | 
 72 | **不含 ffmpeg 的最小化镜像：**
 73 | - ✅ 基础 TTS 功能
 74 | - ✅ 2 种音频格式（仅 MP3、WAV）
 75 | - ✅ 长文本 WAV 自动合并
 76 | - ❌ 不支持语速调节
 77 | - ❌ 不支持格式转换
 78 | - ❌ 不支持 MP3 自动合并
 79 | 
 80 | 容器默认开放网页 Playground（`http://localhost:8000`）以及兼容 OpenAI 的 `/v1/audio/speech` 接口。
 81 | 
 82 | **检查可用功能：**
 83 | ```bash
 84 | curl http://localhost:8000/api/capabilities
 85 | ```
 86 | 
 87 | ## 快速开始
 88 | 
 89 | ### Python 客户端
 90 | 
 91 | ```python
 92 | from ttsfm import TTSClient, AudioFormat, Voice
 93 | 
 94 | client = TTSClient()
 95 | 
 96 | # 基础用法
 97 | response = client.generate_speech(
 98 |     text="来自 TTSFM 的问候！",
 99 |     voice=Voice.ALLOY,
100 |     response_format=AudioFormat.MP3,
101 | )
102 | response.save_to_file("hello")  # -> hello.mp3
103 | 
104 | # 使用语速调节（需要 ffmpeg）
105 | response = client.generate_speech(
106 |     text="这段语音会更快！",
107 |     voice=Voice.NOVA,
108 |     response_format=AudioFormat.MP3,
109 |     speed=1.5,  # 1.5 倍速（范围：0.25 - 4.0）
110 | )
111 | response.save_to_file("fast")  # -> fast.mp3
112 | ```
113 | 
114 | ### 命令行
115 | 
116 | ```bash
117 | ttsfm "你好，世界" --voice nova --format mp3 --output hello.mp3
118 | ```
119 | 
120 | ### REST API（兼容 OpenAI）
121 | 
122 | ```bash
123 | # 基础请求
124 | curl -X POST http://localhost:8000/v1/audio/speech \
125 |   -H "Content-Type: application/json" \
126 |   -d '{
127 |     "model": "tts-1",
128 |     "input": "你好，世界",
129 |     "voice": "alloy",
130 |     "response_format": "mp3"
131 |   }' --output speech.mp3
132 | 
133 | # 使用语速调节（需要完整版镜像）
134 | curl -X POST http://localhost:8000/v1/audio/speech \
135 |   -H "Content-Type: application/json" \
136 |   -d '{
137 |     "model": "tts-1",
138 |     "input": "你好，世界",
139 |     "voice": "alloy",
140 |     "response_format": "mp3",
141 |     "speed": 1.5
142 |   }' --output speech_fast.mp3
143 | ```
144 | 
145 | **可用语音：** alloy、ash、ballad、coral、echo、fable、nova、onyx、sage、shimmer、verse
146 | **可用格式：** mp3、wav（始终可用）+ opus、aac、flac、pcm（仅完整版镜像）
147 | **语速范围：** 0.25 - 4.0（需要完整版镜像）
148 | 
149 | ## 了解更多
150 | 
151 | - 在 [Web 文档](http://localhost:8000/docs)（或 `ttsfm-web/templates/docs.html`）查看完整接口说明与运行注意事项。
152 | - 查看 [架构概览](docs/architecture.md) 了解组件间的关系。
153 | - 欢迎参与贡献，流程说明请见 [CONTRIBUTING.md](CONTRIBUTING.md)。
154 | 
155 | ## 许可证
156 | 
157 | TTSFM 采用 [MIT 许可证](LICENSE) 发布。
158 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
  3 | build-backend = "setuptools.build_meta"
  4 | 
  5 | [project]
  6 | name = "ttsfm"
  7 | dynamic = ["version"]
  8 | description = "Text-to-Speech API Client with OpenAI compatibility"
  9 | readme = "README.md"
 10 | license = "MIT"
 11 | authors = [
 12 |     {name = "dbcccc", email = "120614547+dbccccccc@users.noreply.github.com"}
 13 | ]
 14 | maintainers = [
 15 |     {name = "dbcccc", email = "120614547+dbccccccc@users.noreply.github.com"}
 16 | ]
 17 | classifiers = [
 18 |     "Development Status :: 4 - Beta",
 19 |     "Intended Audience :: Developers",
 20 | 
 21 |     "Operating System :: OS Independent",
 22 |     "Programming Language :: Python :: 3",
 23 |     "Programming Language :: Python :: 3.8",
 24 |     "Programming Language :: Python :: 3.9",
 25 |     "Programming Language :: Python :: 3.10",
 26 |     "Programming Language :: Python :: 3.11",
 27 |     "Programming Language :: Python :: 3.12",
 28 |     "Topic :: Multimedia :: Sound/Audio :: Speech",
 29 |     "Topic :: Software Development :: Libraries :: Python Modules",
 30 |     "Topic :: Internet :: WWW/HTTP :: Dynamic Content",
 31 | ]
 32 | keywords = [
 33 |     "tts",
 34 |     "text-to-speech", 
 35 |     "speech-synthesis",
 36 |     "openai",
 37 |     "api-client",
 38 |     "audio",
 39 |     "voice",
 40 |     "speech"
 41 | ]
 42 | requires-python = ">=3.8"
 43 | dependencies = [
 44 |     "requests>=2.25.0",
 45 |     "aiohttp>=3.8.0",
 46 |     "python-dotenv>=1.0.1",
 47 | ]
 48 | 
 49 | [project.optional-dependencies]
 50 | dev = [
 51 |     "pytest>=6.0",
 52 |     "pytest-asyncio>=0.18.0",
 53 |     "pytest-cov>=2.0",
 54 |     "black>=22.0",
 55 |     "isort>=5.0",
 56 |     "flake8>=4.0",
 57 |     "mypy>=0.900",
 58 |     "pre-commit>=2.0",
 59 | ]
 60 | docs = [
 61 |     "sphinx>=4.0",
 62 |     "sphinx-rtd-theme>=1.0",
 63 |     "myst-parser>=0.17",
 64 | ]
 65 | web = [
 66 |     "flask>=2.0.0",
 67 |     "flask-cors>=3.0.10",
 68 |     "flask-socketio>=5.3.0",
 69 |     "python-socketio>=5.10.0",
 70 |     "eventlet>=0.33.3",
 71 |     "waitress>=3.0.0",
 72 |     "pydub>=0.25.0",
 73 |     "argon2-cffi>=23.1.0",
 74 | ]
 75 | 
 76 | [project.urls]
 77 | Homepage = "https://github.com/dbccccccc/ttsfm"
 78 | Documentation = "https://github.com/dbccccccc/ttsfm/blob/main/docs/"
 79 | Repository = "https://github.com/dbccccccc/ttsfm"
 80 | "Bug Tracker" = "https://github.com/dbccccccc/ttsfm/issues"
 81 | 
 82 | [project.scripts]
 83 | ttsfm = "ttsfm.cli:main"
 84 | 
 85 | [tool.setuptools_scm]
 86 | version_scheme = "no-guess-dev"
 87 | local_scheme = "no-local-version"
 88 | 
 89 | fallback_version = "3.4.2"
 90 | [tool.setuptools]
 91 | packages = ["ttsfm"]
 92 | 
 93 | [tool.setuptools.package-data]
 94 | ttsfm = ["py.typed"]
 95 | 
 96 | [tool.black]
 97 | line-length = 100
 98 | target-version = ['py38']
 99 | include = '\\.pyi?$'
100 | extend-exclude = '''
101 | /( 
102 |   # directories
103 |   \.eggs
104 |   | \.git
105 |   | \.hg
106 |   | \.mypy_cache
107 |   | \.tox
108 |   | \.venv
109 |   | build
110 |   | dist
111 | )/
112 | '''
113 | 
114 | [tool.isort]
115 | profile = "black"
116 | line_length = 100
117 | multi_line_output = 3
118 | include_trailing_comma = true
119 | force_grid_wrap = 0
120 | use_parentheses = true
121 | ensure_newline_before_comments = true
122 | 
123 | [tool.mypy]
124 | python_version = "3.9"
125 | warn_return_any = false
126 | warn_unused_configs = true
127 | disallow_untyped_defs = false
128 | disallow_incomplete_defs = false
129 | check_untyped_defs = true
130 | disallow_untyped_decorators = false
131 | no_implicit_optional = false
132 | warn_redundant_casts = true
133 | warn_unused_ignores = false
134 | warn_no_return = true
135 | warn_unreachable = false
136 | strict_equality = true
137 | 
138 | [[tool.mypy.overrides]]
139 | module = "requests.*"
140 | ignore_missing_imports = true
141 | 
142 | [[tool.mypy.overrides]]
143 | module = "pydub.*"
144 | ignore_missing_imports = true
145 | 
146 | [[tool.mypy.overrides]]
147 | module = "fake_useragent.*"
148 | ignore_missing_imports = true
149 | 
150 | [tool.pytest.ini_options]
151 | minversion = "6.0"
152 | addopts = "-ra -q --strict-markers --strict-config"
153 | testpaths = ["tests"]
154 | python_files = ["test_*.py", "*_test.py"]
155 | python_classes = ["Test*"]
156 | python_functions = ["test_*"]
157 | markers = [
158 |     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
159 |     "integration: marks tests as integration tests",
160 |     "unit: marks tests as unit tests",
161 | ]
162 | 
163 | [tool.coverage.run]
164 | source = ["ttsfm"]
165 | omit = [
166 |     "*/tests/*",
167 |     "*/test_*",
168 |     "setup.py",
169 | ]
170 | 
171 | [tool.coverage.report]
172 | exclude_lines = [
173 |     "pragma: no cover",
174 |     "def __repr__",
175 |     "if self.debug:",
176 |     "if settings.DEBUG",
177 |     "raise AssertionError",
178 |     "raise NotImplementedError",
179 |     "if 0:",
180 |     "if __name__ == .__main__.:",
181 |     "class .*\\bProtocol\\):",
182 |     "@(abc\\.)?abstractmethod",
183 | ]
184 | 
185 | 


--------------------------------------------------------------------------------
/tests/test_audio_processing.py:
--------------------------------------------------------------------------------
  1 | """Tests for audio processing functionality."""
  2 | 
  3 | import pytest
  4 | import shutil
  5 | from ttsfm.audio_processing import adjust_audio_speed, _build_atempo_filter_chain
  6 | 
  7 | 
class TestAudioProcessing:
    """Tests for ``adjust_audio_speed`` and its ``_build_atempo_filter_chain`` helper."""

    def test_build_atempo_filter_chain_normal_range(self):
        """Speeds from 0.5 to 2.0 inclusive produce a single ``atempo`` filter."""
        # Both boundaries (0.5 and 2.0) are expected to stay single-filter.
        assert _build_atempo_filter_chain(1.0) == "atempo=1.0"
        assert _build_atempo_filter_chain(1.5) == "atempo=1.5"
        assert _build_atempo_filter_chain(0.5) == "atempo=0.5"
        assert _build_atempo_filter_chain(2.0) == "atempo=2.0"

    def test_build_atempo_filter_chain_high_speed(self):
        """Speeds above 2.0 produce a comma-separated chain containing ``atempo=2.0``."""
        # Only the presence of a 2.0 stage and of a separator is checked,
        # not the exact composition of the chain.
        result = _build_atempo_filter_chain(4.0)
        assert "atempo=2.0" in result
        assert "," in result  # Multiple filters chained

    def test_build_atempo_filter_chain_low_speed(self):
        """Speeds below 0.5 produce a comma-separated chain containing ``atempo=0.5``."""
        # Only the presence of a 0.5 stage and of a separator is checked,
        # not the exact composition of the chain.
        result = _build_atempo_filter_chain(0.25)
        assert "atempo=0.5" in result
        assert "," in result  # Multiple filters chained

    def test_adjust_audio_speed_validation(self):
        """Out-of-range speeds are rejected with ``ValueError``."""
        dummy_audio = b"dummy audio data"

        # Speed too low
        # NOTE: ``match`` is a regular expression, so each "." matches any character.
        with pytest.raises(ValueError, match="Speed must be between 0.25 and 4.0"):
            adjust_audio_speed(dummy_audio, speed=0.1)

        # Speed too high
        with pytest.raises(ValueError, match="Speed must be between 0.25 and 4.0"):
            adjust_audio_speed(dummy_audio, speed=5.0)

    def test_adjust_audio_speed_no_change(self):
        """``speed=1.0`` hands back the input bytes unchanged."""
        dummy_audio = b"dummy audio data"
        result = adjust_audio_speed(dummy_audio, speed=1.0)
        assert result == dummy_audio

    @pytest.mark.skipif(not shutil.which("ffmpeg"), reason="ffmpeg not available")
    def test_adjust_audio_speed_requires_ffmpeg(self):
        """Placeholder for an ffmpeg-backed speed adjustment test."""
        # NOTE(review): the body is empty, so this test passes trivially
        # whenever ffmpeg is installed and is skipped otherwise. It asserts
        # nothing about adjust_audio_speed; replace it with a real check or
        # remove it.
        pass

    def test_adjust_audio_speed_no_ffmpeg(self, monkeypatch):
        """A non-1.0 speed raises ``RuntimeError`` when ffmpeg cannot be found."""
        # Patch shutil.which itself so every lookup reports "not found";
        # monkeypatch restores the real function after the test.
        monkeypatch.setattr("shutil.which", lambda x: None)

        dummy_audio = b"dummy audio data"
        with pytest.raises(RuntimeError, match="Speed adjustment requires ffmpeg"):
            adjust_audio_speed(dummy_audio, speed=1.5)
 66 | 
 67 | 
 68 | class TestFFmpegDetection:
 69 |     """Test ffmpeg detection in audio module."""
 70 | 
 71 |     def test_ffmpeg_detection(self):
 72 |         """Test that FFMPEG_AVAILABLE is set correctly."""
 73 |         from ttsfm.audio import FFMPEG_AVAILABLE
 74 | 
 75 |         # Should be a boolean
 76 |         assert isinstance(FFMPEG_AVAILABLE, bool)
 77 | 
 78 |         # Should match actual ffmpeg availability
 79 |         expected = shutil.which("ffmpeg") is not None
 80 |         assert FFMPEG_AVAILABLE == expected
 81 | 
 82 | 
class TestAudioCombineWithFFmpeg:
    """Tests for ``combine_audio_chunks`` when pydub and/or ffmpeg are unavailable."""

    def test_combine_mp3_without_ffmpeg(self, monkeypatch):
        """Combining MP3 chunks raises ``AudioProcessingException`` without pydub/ffmpeg."""
        # Simulate an environment with neither pydub nor ffmpeg by patching
        # the module-level names that combine_audio_chunks consults.
        import ttsfm.audio

        monkeypatch.setattr(ttsfm.audio, "AudioSegment", None)
        monkeypatch.setattr(ttsfm.audio, "FFMPEG_AVAILABLE", False)

        from ttsfm.audio import combine_audio_chunks
        from ttsfm.exceptions import AudioProcessingException

        # The payload is never decoded: the exception is expected first.
        chunks = [b"chunk1", b"chunk2"]
        # ``match`` uses a regex search, so this substring of the message suffices.
        with pytest.raises(AudioProcessingException, match="MP3 audio requires pydub and ffmpeg"):
            combine_audio_chunks(chunks, format_type="mp3")

    def test_combine_wav_without_ffmpeg(self, monkeypatch):
        """Combining WAV chunks still returns bytes when pydub is unavailable."""
        # Only pydub is masked here; FFMPEG_AVAILABLE is left at its real value.
        import ttsfm.audio

        monkeypatch.setattr(ttsfm.audio, "AudioSegment", None)

        from ttsfm.audio import combine_audio_chunks

        # Each chunk is 48 bytes: "RIFF", 40 zero bytes, then "data".
        # These are not valid WAV files; the test only checks the return type.
        chunks = [b"RIFF" + b"\x00" * 40 + b"data", b"RIFF" + b"\x00" * 40 + b"data"]

        # Should not raise error for WAV
        result = combine_audio_chunks(chunks, format_type="wav")
        assert isinstance(result, bytes)
117 | 


--------------------------------------------------------------------------------
/ttsfm/audio.py:
--------------------------------------------------------------------------------
  1 | """Audio helper utilities shared across TTSFM components."""
  2 | 
  3 | from __future__ import annotations
  4 | 
  5 | import io
  6 | import logging
  7 | import shutil
  8 | from typing import Iterable, List, Sequence
  9 | 
 10 | from .exceptions import AudioProcessingException
 11 | from .models import TTSResponse
 12 | 
 13 | logger = logging.getLogger(__name__)
 14 | 
 15 | 
 16 | try:  # Optional dependency for non-WAV combining
 17 |     from pydub import AudioSegment
 18 | except ImportError:  # pragma: no cover - optional dependency
 19 |     AudioSegment = None
 20 | 
 21 | 
# Probe PATH for an ffmpeg executable once, when this module is imported.
FFMPEG_AVAILABLE = shutil.which("ffmpeg") is not None

# Audio format names recognised for export.
# NOTE(review): not referenced by the functions visible in this module — confirm its consumers.
SUPPORTED_EXPORT_FORMATS = {"mp3", "wav", "aac", "flac", "opus", "pcm"}
 26 | 
 27 | 
 28 | def combine_audio_chunks(audio_chunks: Iterable[bytes], format_type: str = "mp3") -> bytes:
 29 |     """Combine multiple audio chunks into a single audio file.
 30 | 
 31 |     Args:
 32 |         audio_chunks: Iterable of raw audio byte strings
 33 |         format_type: Requested output format
 34 | 
 35 |     Returns:
 36 |         Combined audio data as bytes
 37 | 
 38 |     Raises:
 39 |         RuntimeError: If non-WAV combining is requested without pydub/ffmpeg available
 40 |     """
 41 | 
 42 |     chunks_list = list(audio_chunks)
 43 |     if not chunks_list:
 44 |         return b""
 45 | 
 46 |     fmt = format_type.lower()
 47 | 
 48 |     # Check for pydub availability (which requires ffmpeg for MP3)
 49 |     if AudioSegment is None:
 50 |         if fmt == "mp3":
 51 |             raise AudioProcessingException(
 52 |                 "Combining MP3 audio requires pydub and ffmpeg. "
 53 |                 "Install ttsfm[web] and use the full Docker image (dbcccc/ttsfm:latest) "
 54 |                 "instead of the slim variant.",
 55 |                 audio_format="mp3",
 56 |             )
 57 |         return _simple_wav_concatenation(chunks_list)
 58 | 
 59 |     # Check for ffmpeg availability when using pydub
 60 |     if not FFMPEG_AVAILABLE and fmt == "mp3":
 61 |         raise AudioProcessingException(
 62 |             "MP3 auto-combine requires ffmpeg. "
 63 |             "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant, "
 64 |             "or disable auto_combine and handle chunks separately.",
 65 |             audio_format="mp3",
 66 |         )
 67 | 
 68 |     audio_segments = []
 69 |     for chunk in chunks_list:
 70 |         buffer = io.BytesIO(chunk)
 71 |         if fmt == "mp3":
 72 |             segment = AudioSegment.from_mp3(buffer)
 73 |         else:
 74 |             segment = AudioSegment.from_wav(buffer)
 75 |         audio_segments.append(segment)
 76 | 
 77 |     combined = audio_segments[0]
 78 |     for segment in audio_segments[1:]:
 79 |         combined += segment
 80 | 
 81 |     output_buffer = io.BytesIO()
 82 |     export_format = "mp3" if fmt == "mp3" else "wav"
 83 |     combined.export(output_buffer, format=export_format)
 84 |     return output_buffer.getvalue()
 85 | 
 86 | 
 87 | def _simple_wav_concatenation(wav_chunks: List[bytes]) -> bytes:
 88 |     """Simple WAV concatenation fallback that avoids external deps."""
 89 |     if not wav_chunks:
 90 |         return b""
 91 | 
 92 |     if len(wav_chunks) == 1:
 93 |         return wav_chunks[0]
 94 | 
 95 |     try:
 96 |         first_wav = wav_chunks[0]
 97 |         if len(first_wav) < 44:
 98 |             return b"".join(wav_chunks)
 99 | 
100 |         header = bytearray(first_wav[:44])
101 |         audio_data = first_wav[44:]
102 | 
103 |         for wav_chunk in wav_chunks[1:]:
104 |             if len(wav_chunk) > 44:
105 |                 audio_data += wav_chunk[44:]
106 | 
107 |         total_size = len(header) + len(audio_data) - 8
108 |         header[4:8] = total_size.to_bytes(4, byteorder="little")
109 | 
110 |         data_size = len(audio_data)
111 |         header[40:44] = data_size.to_bytes(4, byteorder="little")
112 | 
113 |         return bytes(header) + audio_data
114 |     except Exception as exc:
115 |         logger.error("Error in simple WAV concatenation: %s", exc)
116 |         return b"".join(wav_chunks)
117 | 
118 | 
def combine_responses(responses: Sequence["TTSResponse"]) -> "TTSResponse":
    """Merge several ``TTSResponse`` objects into one.

    The format, content type and base metadata come from the first response.
    Audio is merged through ``combine_audio_chunks``. The duration is the sum
    of the known durations, or the first response's duration when none is known.

    Raises:
        ValueError: If ``responses`` is empty
    """

    items = list(responses)
    if not items:
        raise ValueError("No responses provided for combination")

    head = items[0]
    fmt = head.format

    merged_audio = combine_audio_chunks((item.audio_data for item in items), fmt.value)

    # Sum what is known; with no known duration fall back to the first response's value.
    durations = [item.duration for item in items]
    if all(value is None for value in durations):
        duration = head.duration
    else:
        duration = sum(value for value in durations if value)

    # Copy so the first response's metadata mapping is not mutated.
    metadata = dict(head.metadata or {})
    metadata["chunks_combined"] = len(items)
    metadata["auto_combined"] = True

    return TTSResponse(
        audio_data=merged_audio,
        content_type=head.content_type,
        format=fmt,
        size=len(merged_audio),
        duration=duration,
        metadata=metadata,
    )
151 | 


--------------------------------------------------------------------------------
/scripts/test_websocket.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Test WebSocket connection to TTSFM server.
  4 | 
  5 | This script tests the WebSocket functionality by connecting to the server
  6 | and performing a simple TTS generation request.
  7 | """
  8 | 
  9 | import time
 10 | import socketio
 11 | 
# Socket.IO client shared by all handlers below; verbose logging is enabled
# for both the Socket.IO and the Engine.IO layers.
sio = socketio.Client(logger=True, engineio_logger=True)

# Module-level state written by the event handlers and polled by test_connection()
connected = False  # set by on_connect, cleared by on_disconnect
stream_complete = False  # set by on_stream_complete
chunks_received = 0  # incremented by on_audio_chunk
 19 | 
 20 | 
@sio.on('connect')
def on_connect():
    """Mark the client as connected and print the session ID."""
    global connected
    connected = True  # polled by test_connection() while waiting for the connection
    print('\n✅ Connected to WebSocket server!')
    print(f'Session ID: {sio.sid}')
 28 |     
 29 | 
@sio.on('connected')
def on_session_ready(data):
    """Print the payload of the server's ``connected`` event.

    NOTE(review): ``connected`` is presumably an application-level event
    emitted by the TTSFM server — confirm against the server handler.
    """
    print(f'\n✅ Session established: {data}')
 34 | 
 35 | 
@sio.on('disconnect')
def on_disconnect():
    """Mark the client as disconnected and report it."""
    global connected
    connected = False  # also stops the ``finally`` block in test_connection() from disconnecting again
    print('\n❌ Disconnected from WebSocket server')
 42 | 
 43 | 
@sio.on('connect_error')
def on_connect_error(data):
    """Print the payload of a ``connect_error`` event; no state is changed."""
    print(f'\n❌ Connection error: {data}')
 48 | 
 49 | 
@sio.on('pong')
def on_pong(data):
    """Print the payload of a ``pong`` event (test_connection() emits ``ping`` beforehand)."""
    print(f'\n✅ Pong received: {data}')
 54 | 
 55 | 
@sio.on('stream_started')
def on_stream_started(data):
    """Print the payload of a ``stream_started`` event; no state is changed."""
    print(f'\n✅ Stream started: {data}')
 60 | 
 61 | 
 62 | @sio.on('stream_progress')
 63 | def on_stream_progress(data):
 64 |     """Handle stream progress event."""
 65 |     progress = data.get('progress', 0)
 66 |     status = data.get('status', 'unknown')
 67 |     print(f'📊 Progress: {progress}% - Status: {status}')
 68 | 
 69 | 
 70 | @sio.on('audio_chunk')
 71 | def on_audio_chunk(data):
 72 |     """Handle audio chunk event."""
 73 |     global chunks_received
 74 |     chunks_received += 1
 75 |     chunk_index = data.get('chunk_index', 0)
 76 |     total_chunks = data.get('total_chunks', 0)
 77 |     print(f'🎵 Received audio chunk {chunk_index + 1}/{total_chunks}')
 78 | 
 79 | 
@sio.on('stream_complete')
def on_stream_complete(data):
    """Flag the stream as finished and print the chunk count."""
    global stream_complete
    stream_complete = True  # ends the wait loop in test_connection()
    print(f'\n✅ Stream complete: {data}')
    print(f'Total chunks received: {chunks_received}')
 87 | 
 88 | 
@sio.on('stream_error')
def on_stream_error(data):
    """Print the payload of a ``stream_error`` event.

    NOTE(review): no flag is set here, so test_connection() keeps waiting
    until its timeout expires even after the server has reported an error.
    """
    print(f'\n❌ Stream error: {data}')
 93 | 
 94 | 
 95 | def test_connection(url='http://localhost:8000'):
 96 |     """Test WebSocket connection."""
 97 |     print(f'🔌 Connecting to {url}...')
 98 |     
 99 |     try:
100 |         # Connect to the server
101 |         sio.connect(url, transports=['polling', 'websocket'])
102 |         
103 |         # Wait for connection
104 |         timeout = 10
105 |         start_time = time.time()
106 |         while not connected and (time.time() - start_time) < timeout:
107 |             time.sleep(0.1)
108 |         
109 |         if not connected:
110 |             print('❌ Failed to connect within timeout')
111 |             return False
112 |         
113 |         # Test ping/pong
114 |         print('\n📡 Testing ping/pong...')
115 |         sio.emit('ping', {'timestamp': time.time()})
116 |         time.sleep(1)
117 |         
118 |         # Test TTS generation
119 |         print('\n🎤 Testing TTS generation...')
120 |         request_data = {
121 |             'request_id': f'test_{int(time.time())}',
122 |             'text': 'Hello, this is a WebSocket test!',
123 |             'voice': 'alloy',
124 |             'format': 'mp3',
125 |             'chunk_size': 512
126 |         }
127 |         
128 |         sio.emit('generate_stream', request_data)
129 |         
130 |         # Wait for stream to complete
131 |         timeout = 30
132 |         start_time = time.time()
133 |         while not stream_complete and (time.time() - start_time) < timeout:
134 |             time.sleep(0.1)
135 |         
136 |         if stream_complete:
137 |             print('\n✅ WebSocket test completed successfully!')
138 |             return True
139 |         else:
140 |             print('\n⚠️  Stream did not complete within timeout')
141 |             return False
142 |         
143 |     except Exception as e:
144 |         print(f'\n❌ Error during test: {e}')
145 |         import traceback
146 |         traceback.print_exc()
147 |         return False
148 |     
149 |     finally:
150 |         # Disconnect
151 |         if connected:
152 |             print('\n🔌 Disconnecting...')
153 |             sio.disconnect()
154 |             time.sleep(1)
155 | 
156 | 
if __name__ == '__main__':
    import sys

    # The server URL is the optional first command-line argument.
    cli_args = sys.argv[1:]
    url = cli_args[0] if cli_args else 'http://localhost:8000'

    banner = '=' * 60
    print(banner)
    print('TTSFM WebSocket Connection Test')
    print(banner)

    success = test_connection(url)

    print('\n' + banner)
    # Map the boolean outcome to the final message and process exit status.
    verdict, exit_code = (
        ('✅ All tests passed!', 0) if success else ('❌ Some tests failed', 1)
    )
    print(verdict)
    sys.exit(exit_code)
176 | 
177 | 


--------------------------------------------------------------------------------
/docs/docker-workflows.md:
--------------------------------------------------------------------------------
  1 | # Docker Build Workflows
  2 | 
  3 | ## Overview
  4 | 
  5 | Starting with v3.4.0, TTSFM uses **separate GitHub Actions workflows** for building the full and slim Docker image variants. This provides better clarity, easier debugging, and independent execution.
  6 | 
  7 | ## Workflow Files
  8 | 
  9 | ### 1. `.github/workflows/docker-build-full.yml`
 10 | 
 11 | **Purpose**: Builds the full variant with ffmpeg support
 12 | 
 13 | **Triggers**:
 14 | - Push to `main` branch
 15 | - Pull requests to `main` branch
 16 | - Release published
 17 | 
 18 | **Image Tags** (on release):
 19 | - `dbcccc/ttsfm:vX.X.X`
 20 | - `dbcccc/ttsfm:latest` (only for stable releases, not pre-releases)
 21 | - `ghcr.io/dbccccccc/ttsfm:vX.X.X`
 22 | - `ghcr.io/dbccccccc/ttsfm:latest` (only for stable releases)
 23 | 
 24 | **Features**:
 25 | - ✅ ffmpeg included
 26 | - ✅ MP3 auto-combine
 27 | - ✅ Speed adjustment (0.25x - 4.0x)
 28 | - ✅ Format conversion
 29 | - ✅ Multi-platform builds (linux/amd64, linux/arm64)
 30 | - ✅ Smoke test on PR/push
 31 | - ✅ GitHub Actions cache (scope: `full`)
 32 | 
 33 | ---
 34 | 
 35 | ### 2. `.github/workflows/docker-build-slim.yml`
 36 | 
 37 | **Purpose**: Builds the slim variant without ffmpeg
 38 | 
 39 | **Triggers**:
 40 | - Push to `main` branch
 41 | - Pull requests to `main` branch
 42 | - Release published
 43 | 
 44 | **Image Tags** (on release):
 45 | - `dbcccc/ttsfm:vX.X.X-slim`
 46 | - `dbcccc/ttsfm:vX.X-slim` (only for stable releases, not pre-releases)
 47 | - `ghcr.io/dbccccccc/ttsfm:vX.X.X-slim`
 48 | - `ghcr.io/dbccccccc/ttsfm:vX.X-slim` (only for stable releases)
 49 | 
 50 | **Features**:
 51 | - ✅ No ffmpeg (smaller image)
 52 | - ✅ Basic TTS (MP3/WAV)
 53 | - ✅ WAV auto-combine (simple concatenation)
 54 | - ❌ No MP3 auto-combine
 55 | - ❌ No speed adjustment
 56 | - ❌ No format conversion
 57 | - ✅ Multi-platform builds (linux/amd64, linux/arm64)
 58 | - ✅ Smoke test on PR/push (port 8001)
 59 | - ✅ GitHub Actions cache (scope: `slim`)
 60 | 
 61 | ---
 62 | 
 63 | ## Build Behavior
 64 | 
 65 | ### On Pull Request or Push to Main
 66 | 
 67 | Both workflows run in parallel:
 68 | - Build for `linux/amd64` only (faster)
 69 | - Images are **not pushed** to registries
 70 | - Images are loaded locally for smoke testing
 71 | - Temporary tags: `ghcr.io/dbccccccc/ttsfm:ci-{RUN_ID}-full` and `ci-{RUN_ID}-slim`
 72 | 
 73 | ### On Release Published
 74 | 
 75 | Both workflows run in parallel:
 76 | - Build for `linux/amd64` and `linux/arm64` (multi-platform)
 77 | - Images are **pushed** to Docker Hub and GitHub Container Registry
 78 | - No local loading (images go directly to registries)
 79 | - Production tags based on release version
 80 | 
 81 | ### Pre-release vs Stable Release
 82 | 
 83 | **Pre-release** (e.g., `v3.4.0-alpha1`):
 84 | - Full variant: `vX.X.X` only (no `latest` tag)
 85 | - Slim variant: `vX.X.X-slim` only (no `vX.X-slim` tag)
 86 | 
 87 | **Stable release** (e.g., `v3.4.0`):
 88 | - Full variant: `vX.X.X` + `latest`
 89 | - Slim variant: `vX.X.X-slim` + `vX.X-slim`
 90 | 
 91 | ---
 92 | 
 93 | ## Advantages of Separate Workflows
 94 | 
 95 | 1. **Clarity**: Each workflow has a single, clear purpose
 96 | 2. **Easier debugging**: When a build fails, you immediately know which variant failed
 97 | 3. **Independent execution**: Can trigger/retry builds independently
 98 | 4. **Simpler logic**: No complex conditionals or fallback logic
 99 | 5. **Better visibility**: GitHub Actions UI shows them as separate workflow runs
100 | 6. **Parallel execution**: Both variants build truly in parallel
101 | 7. **Independent caching**: Each variant has its own cache scope
102 | 
103 | ---
104 | 
105 | ## Monitoring Builds
106 | 
107 | ### GitHub Actions UI
108 | 
109 | When you create a release, you'll see **two separate workflow runs**:
110 | - ✅ Docker Build and Push (Full)
111 | - ✅ Docker Build and Push (Slim)
112 | 
113 | Each can succeed or fail independently.
114 | 
115 | ### Checking Build Status
116 | 
117 | **Via GitHub UI**:
118 | 1. Go to repository → Actions tab
119 | 2. Look for the two workflow runs
120 | 3. Click on each to see detailed logs
121 | 
122 | **Via GitHub CLI**:
123 | ```bash
124 | # Check latest workflow runs
125 | gh run list --workflow=docker-build-full.yml
126 | gh run list --workflow=docker-build-slim.yml
127 | ```
128 | 
129 | ---
130 | 
131 | ## Troubleshooting
132 | 
133 | ### Slim variant not building
134 | 
135 | 1. Check if the workflow file exists: `.github/workflows/docker-build-slim.yml`
136 | 2. Check the Actions tab for the "Docker Build and Push (Slim)" workflow
137 | 3. Look for error messages in the workflow logs
138 | 4. Verify Docker Hub and GitHub Container Registry credentials
139 | 
140 | ### Images not pushed to registry
141 | 
142 | 1. Verify the event is a "release published" (not draft)
143 | 2. Check Docker Hub credentials in repository secrets:
144 |    - `DOCKERHUB_USERNAME`
145 |    - `DOCKERHUB_TOKEN`
146 | 3. Check GitHub Container Registry permissions (automatic via `GITHUB_TOKEN`)
147 | 
148 | ### Smoke test failing
149 | 
150 | 1. Check the smoke test logs in the workflow run
151 | 2. Verify the health endpoint is working: `/api/health`
152 | 3. For slim variant, ensure it's using port 8001 (not 8000)
153 | 
154 | ---
155 | 
156 | ## Future Enhancements
157 | 
158 | Potential improvements for the workflows:
159 | 
160 | 1. **Matrix builds**: Use a single workflow with matrix strategy
161 | 2. **Reusable workflows**: Extract common steps into a reusable workflow
162 | 3. **Build notifications**: Send notifications on build success/failure
163 | 4. **Image scanning**: Add security scanning with Trivy or Snyk
164 | 5. **Performance metrics**: Track and report build times and image sizes
165 | 
166 | 


--------------------------------------------------------------------------------
/docs/v3.4-dual-image-implementation.md:
--------------------------------------------------------------------------------
  1 | # TTSFM v3.4.x Dual-Image Implementation
  2 | 
  3 | ## Overview
  4 | 
  5 | Starting with v3.4.0-alpha1, TTSFM provides two Docker image variants to balance functionality and image size:
  6 | 
  7 | 1. **Full variant** (`dbcccc/ttsfm:latest`, `dbcccc/ttsfm:v3.4.0-alpha1`)
  8 |    - Includes ffmpeg for advanced audio processing
  9 |    - Supports all features including speed adjustment and format conversion
 10 | 
 11 | 2. **Slim variant** (`dbcccc/ttsfm:v3.4.0-alpha1-slim`)
 12 |    - Minimal image without ffmpeg
 13 |    - Basic TTS functionality only
 14 | 
 15 | ## Implementation Details
 16 | 
 17 | ### 1. Dockerfile Changes
 18 | 
 19 | The Dockerfile now accepts a `VARIANT` build argument:
 20 | 
 21 | ```dockerfile
 22 | ARG VARIANT=full  # Can be 'full' or 'slim'
 23 | ```
 24 | 
 25 | - **Full variant**: Installs ffmpeg in the runtime stage
 26 | - **Slim variant**: Skips ffmpeg installation
 27 | 
 28 | ### 2. GitHub Actions Workflow
 29 | 
 30 | Two workflows, `.github/workflows/docker-build-full.yml` and `.github/workflows/docker-build-slim.yml`, now build the variants:
 31 | 
 32 | - **Full image tags**: `vX.X.X`, `latest`
 33 | - **Slim image tags**: `vX.X.X-slim`
 34 | 
 35 | Both variants are built for `linux/amd64` and `linux/arm64` platforms on release.
 36 | 
 37 | ### 3. Runtime Feature Detection
 38 | 
 39 | `ttsfm/audio.py` now includes runtime detection:
 40 | 
 41 | ```python
 42 | import shutil
 43 | FFMPEG_AVAILABLE = shutil.which("ffmpeg") is not None
 44 | ```
 45 | 
 46 | Functions that require ffmpeg provide helpful error messages when it's not available.
 47 | 
 48 | ### 4. Speed Adjustment Feature
 49 | 
 50 | New module `ttsfm/audio_processing.py` provides:
 51 | 
 52 | - `adjust_audio_speed()`: Adjust playback speed using ffmpeg (0.25x - 4.0x)
 53 | - `convert_audio_format()`: Convert between audio formats using ffmpeg
 54 | 
 55 | Both sync (`TTSClient`) and async (`AsyncTTSClient`) clients now support the `speed` parameter:
 56 | 
 57 | ```python
 58 | response = client.generate_speech(
 59 |     text="Hello!",
 60 |     voice=Voice.ALLOY,
 61 |     speed=1.5,  # 1.5x faster
 62 | )
 63 | ```
 64 | 
 65 | Speed adjustment is applied post-generation using ffmpeg's `atempo` filter.
 66 | 
 67 | ## Feature Matrix
 68 | 
 69 | | Feature | Full Image | Slim Image | Python Package |
 70 | |---------|-----------|------------|----------------|
 71 | | Basic TTS (MP3/WAV) | ✅ | ✅ | ✅ |
 72 | | WAV auto-combine | ✅ | ✅ (simple) | ✅ (simple) |
 73 | | MP3 auto-combine | ✅ | ❌ | ✅ (with pydub) |
 74 | | Speed adjustment | ✅ | ❌ | ✅ (with ffmpeg) |
 75 | | Format conversion | ✅ | ❌ | ✅ (with ffmpeg) |
 76 | 
 77 | ## Usage Examples
 78 | 
 79 | ### Full Image (Recommended)
 80 | 
 81 | ```bash
 82 | # Pull and run full image
 83 | docker run -p 8000:8000 dbcccc/ttsfm:latest
 84 | 
 85 | # Use speed adjustment
 86 | curl -X POST http://localhost:8000/v1/audio/speech \
 87 |   -H "Content-Type: application/json" \
 88 |   -d '{"input":"Hello!","voice":"alloy","speed":1.5}' \
 89 |   --output fast.mp3
 90 | ```
 91 | 
 92 | ### Slim Image (Minimal)
 93 | 
 94 | ```bash
 95 | # Pull and run slim image
 96 | docker run -p 8000:8000 dbcccc/ttsfm:v3.4.0-alpha1-slim
 97 | 
 98 | # Basic TTS works fine
 99 | curl -X POST http://localhost:8000/v1/audio/speech \
100 |   -H "Content-Type: application/json" \
101 |   -d '{"input":"Hello!","voice":"alloy"}' \
102 |   --output speech.mp3
103 | 
104 | # Speed parameter will be ignored (no error, just logged warning)
105 | ```
106 | 
107 | ### Python Package
108 | 
109 | ```python
110 | from ttsfm import TTSClient, Voice
111 | 
112 | client = TTSClient()
113 | 
114 | # Speed adjustment requires ffmpeg installed on system
115 | response = client.generate_speech(
116 |     text="This will be faster!",
117 |     voice=Voice.NOVA,
118 |     speed=1.5,
119 | )
120 | response.save_to_file("fast.mp3")
121 | ```
122 | 
123 | ## Error Handling
124 | 
125 | When ffmpeg-dependent features are used without ffmpeg:
126 | 
127 | ```python
128 | # Graceful degradation with helpful error messages
129 | RuntimeError: "Speed adjustment requires ffmpeg. 
130 | Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
131 | ```
132 | 
133 | ## Migration Guide
134 | 
135 | ### From v3.3.x to v3.4.x
136 | 
137 | **No breaking changes** - existing code continues to work:
138 | 
139 | 1. **Docker users**:
140 |    - `dbcccc/ttsfm:latest` now includes speed adjustment
141 |    - Use `dbcccc/ttsfm:v3.4.0-alpha1-slim` for minimal image
142 | 
143 | 2. **Python package users**:
144 |    - Speed parameter now functional (requires ffmpeg)
145 |    - Install ffmpeg: `apt-get install ffmpeg` (Linux) or `brew install ffmpeg` (Mac)
146 | 
147 | 3. **API users**:
148 |    - Speed parameter now works in `/v1/audio/speech` endpoint
149 |    - Response metadata includes `speed_applied: true/false`
150 | 
151 | ## Technical Notes
152 | 
153 | ### Speed Adjustment Implementation
154 | 
155 | - Uses ffmpeg's `atempo` filter for speed adjustment
156 | - Supports 0.25x to 4.0x range (OpenAI TTS API compatible)
157 | - Chains multiple `atempo` filters for speeds outside 0.5-2.0 range
158 | - Adjusts estimated duration based on speed multiplier
159 | - Runs in thread pool for async client to avoid blocking
160 | 
161 | ### Build Optimization
162 | 
163 | - Shared builder stage for both variants
164 | - Separate cache scopes (`scope=full`, `scope=slim`) for efficient caching
165 | - Multi-platform builds only on release (saves CI time)
166 | 
167 | ## Future Enhancements
168 | 
169 | Potential additions for future versions:
170 | 
171 | 1. **Additional format support**: Real AAC, FLAC, OPUS output (currently mapped to WAV)
172 | 2. **Audio effects**: Pitch adjustment, noise reduction
173 | 3. **Streaming support**: Real-time audio streaming with speed adjustment
174 | 4. **Ultra-slim variant**: Alpine-based image (~50MB) with no Python web server
175 | 
176 | ## References
177 | 
178 | - [OpenAI TTS API Documentation](https://platform.openai.com/docs/guides/text-to-speech)
179 | - [ffmpeg atempo filter](https://ffmpeg.org/ffmpeg-filters.html#atempo)
180 | - [Docker multi-stage builds](https://docs.docker.com/build/building/multi-stage/)
181 | 
182 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # TTSFM - Text-to-Speech API Client
  2 | 
  3 | > **⚠️ NOTICE: This project is no longer functional as the openai.fm demo website has been shut down.**
  4 | 
  5 | > **Language / 语言**: [English](README.md) | [中文](README.zh.md)
  6 | 
  7 | [![Docker Pulls](https://img.shields.io/docker/pulls/dbcccc/ttsfm?style=flat-square&logo=docker)](https://hub.docker.com/r/dbcccc/ttsfm)
  8 | [![GitHub Stars](https://img.shields.io/github/stars/dbccccccc/ttsfm?style=social)](https://github.com/dbccccccc/ttsfm)
  9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square)](https://opensource.org/licenses/MIT)
 10 | ![ghcr pulls](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fghcr-badge.elias.eu.org%2Fapi%2Fdbccccccc%2Fttsfm%2Fttsfm&query=downloadCount&label=ghcr+pulls&logo=github)
 11 | 
 12 | ## Star History
 13 | 
 14 | [![Star History Chart](https://api.star-history.com/svg?repos=dbccccccc/ttsfm&type=Date)](https://www.star-history.com/#dbccccccc/ttsfm&Date)
 15 | 
 16 | ## Overview
 17 | 
 18 | TTSFM is a free, OpenAI-compatible text-to-speech API service that provides a complete solution for converting text to natural-sounding speech based on OpenAI's GPT-4o mini TTS. Built on top of the openai.fm backend, it offers a powerful Python SDK, RESTful API endpoints, and an intuitive web playground for easy testing and integration.
 19 | 
 20 | **What TTSFM Can Do:**
 21 | - 🎤 **Multiple Voices**: Choose from 11 OpenAI-compatible voices (alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse)
 22 | - 🎵 **Flexible Audio Formats**: Support for 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM)
 23 | - ⚡ **Speed Control**: Adjust playback speed from 0.25x to 4.0x for different use cases
 24 | - 📝 **Long Text Support**: Automatic text splitting and audio combining for content of any length
 25 | - 🔄 **Real-time Streaming**: WebSocket support for streaming audio generation
 26 | - 🐍 **Python SDK**: Easy-to-use synchronous and asynchronous clients
 27 | - 🌐 **Web Playground**: Interactive web interface for testing and experimentation
 28 | - 🐳 **Docker Ready**: Pre-built Docker images for instant deployment
 29 | - 🔍 **Smart Detection**: Automatic capability detection and helpful error messages
 30 | - 🤖 **OpenAI Compatible**: Drop-in replacement for OpenAI's TTS API
 31 | 
 32 | **Key Features in v3.4.0:**
 33 | - 🎯 Image variant detection (full vs slim Docker images)
 34 | - 🔍 Runtime capabilities API for feature availability checking
 35 | - ⚡ Speed adjustment with ffmpeg-based audio processing
 36 | - 🎵 Real format conversion for all 6 audio formats
 37 | - 📊 Enhanced error handling with clear, actionable messages
 38 | - 🐳 Dual Docker images optimized for different use cases
 39 | 
 40 | > **⚠️ Disclaimer**: This project is intended for **educational and research purposes only**. It is a reverse-engineered implementation of the openai.fm service and should not be used for commercial purposes or in production environments. Users are responsible for ensuring compliance with applicable laws and terms of service.
 41 | 
 42 | ## Installation
 43 | 
 44 | ### Python package
 45 | 
 46 | ```bash
 47 | pip install ttsfm        # core client
 48 | pip install ttsfm[web]   # core client + web/server dependencies
 49 | ```
 50 | 
 51 | ### Docker image
 52 | 
 53 | TTSFM offers two Docker image variants to suit different needs:
 54 | 
 55 | #### Full variant (recommended)
 56 | ```bash
 57 | docker run -p 8000:8000 dbcccc/ttsfm:latest
 58 | ```
 59 | 
 60 | **Includes ffmpeg for advanced features:**
 61 | - ✅ All 6 audio formats (MP3, WAV, OPUS, AAC, FLAC, PCM)
 62 | - ✅ Speed adjustment (0.25x - 4.0x)
 63 | - ✅ Format conversion with ffmpeg
 64 | - ✅ MP3 auto-combine for long text
 65 | - ✅ WAV auto-combine for long text
 66 | 
 67 | #### Slim variant - ~100MB
 68 | ```bash
 69 | docker run -p 8000:8000 dbcccc/ttsfm:slim
 70 | ```
 71 | 
 72 | **Minimal image without ffmpeg:**
 73 | - ✅ Basic TTS functionality
 74 | - ✅ 2 audio formats (MP3, WAV only)
 75 | - ✅ WAV auto-combine for long text
 76 | - ❌ No speed adjustment
 77 | - ❌ No format conversion
 78 | - ❌ No MP3 auto-combine
 79 | 
 80 | The container exposes the web playground at `http://localhost:8000` and an OpenAI-compatible endpoint at `/v1/audio/speech`.
 81 | 
 82 | **Check available features:**
 83 | ```bash
 84 | curl http://localhost:8000/api/capabilities
 85 | ```
 86 | 
 87 | ## Quick start
 88 | 
 89 | ### Python client
 90 | 
 91 | ```python
 92 | from ttsfm import TTSClient, AudioFormat, Voice
 93 | 
 94 | client = TTSClient()
 95 | 
 96 | # Basic usage
 97 | response = client.generate_speech(
 98 |     text="Hello from TTSFM!",
 99 |     voice=Voice.ALLOY,
100 |     response_format=AudioFormat.MP3,
101 | )
102 | response.save_to_file("hello")  # -> hello.mp3
103 | 
104 | # With speed adjustment (requires ffmpeg)
105 | response = client.generate_speech(
106 |     text="This will be faster!",
107 |     voice=Voice.NOVA,
108 |     response_format=AudioFormat.MP3,
109 |     speed=1.5,  # 1.5x speed (0.25 - 4.0)
110 | )
111 | response.save_to_file("fast")  # -> fast.mp3
112 | ```
113 | 
114 | ### CLI
115 | 
116 | ```bash
117 | ttsfm "Hello, world" --voice nova --format mp3 --output hello.mp3
118 | ```
119 | 
120 | ### REST API (OpenAI-compatible)
121 | 
122 | ```bash
123 | # Basic request
124 | curl -X POST http://localhost:8000/v1/audio/speech \
125 |   -H "Content-Type: application/json" \
126 |   -d '{
127 |     "model": "tts-1",
128 |     "input": "Hello world!",
129 |     "voice": "alloy",
130 |     "response_format": "mp3"
131 |   }' --output speech.mp3
132 | 
133 | # With speed adjustment (requires full image)
134 | curl -X POST http://localhost:8000/v1/audio/speech \
135 |   -H "Content-Type: application/json" \
136 |   -d '{
137 |     "model": "tts-1",
138 |     "input": "Hello world!",
139 |     "voice": "alloy",
140 |     "response_format": "mp3",
141 |     "speed": 1.5
142 |   }' --output speech_fast.mp3
143 | ```
144 | 
145 | - **Available voices:** alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse
146 | - **Available formats:** mp3, wav (always) + opus, aac, flac, pcm (full image only)
147 | - **Speed range:** 0.25 - 4.0 (requires full image)
148 | 
149 | ## Learn more
150 | 
151 | - Browse the full API reference and operational notes in the [web documentation](http://localhost:8000/docs) (or see `ttsfm-web/templates/docs.html`).
152 | - Read the [architecture overview](docs/architecture.md) for component diagrams.
153 | - Contributions are welcome—see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
154 | 
155 | ## License
156 | 
157 | TTSFM is released under the [MIT License](LICENSE).
158 | 


--------------------------------------------------------------------------------
/docs/websocket-streaming.md:
--------------------------------------------------------------------------------
  1 | # 🚀 WebSocket Streaming for TTSFM
  2 | 
  3 | Real-time audio streaming for text-to-speech generation using WebSockets.
  4 | 
  5 | ## Overview
  6 | 
  7 | The WebSocket streaming feature provides:
  8 | - **Real-time audio chunk delivery** as they're generated
  9 | - **Progress tracking** with live updates
 10 | - **Lower perceived latency** - start receiving audio before complete generation
 11 | - **Cancellable operations** - stop mid-generation if needed
 12 | 
 13 | ## Quick Start
 14 | 
 15 | ### 1. Docker Deployment (Recommended)
 16 | 
 17 | ```bash
 18 | # Build with WebSocket support
 19 | docker build -t ttsfm-websocket .
 20 | 
 21 | # Run with WebSocket enabled
 22 | docker run -p 8000:8000 \
 23 |   -e DEBUG=false \
 24 |   ttsfm-websocket
 25 | ```
 26 | 
 27 | ### 2. Test WebSocket Connection
 28 | 
 29 | Visit `http://localhost:8000/websocket-demo` for an interactive demo.
 30 | 
 31 | ### 3. Client Usage
 32 | 
 33 | ```javascript
 34 | // Initialize WebSocket client
 35 | const client = new WebSocketTTSClient({
 36 |     socketUrl: 'http://localhost:8000',
 37 |     debug: true
 38 | });
 39 | 
 40 | // Generate speech with streaming
 41 | const result = await client.generateSpeech('Hello, WebSocket world!', {
 42 |     voice: 'alloy',
 43 |     format: 'mp3',
 44 |     onProgress: (progress) => {
 45 |         console.log(`Progress: ${progress.progress}%`);
 46 |     },
 47 |     onChunk: (chunk) => {
 48 |         console.log(`Received chunk ${chunk.chunkIndex + 1}`);
 49 |         // Process audio chunk in real-time
 50 |     },
 51 |     onComplete: (result) => {
 52 |         console.log('Generation complete!');
 53 |         // Play or download the combined audio
 54 |     }
 55 | });
 56 | ```
 57 | 
 58 | ## API Reference
 59 | 
 60 | ### WebSocket Events
 61 | 
 62 | #### Client → Server
 63 | 
 64 | **`generate_stream`**
 65 | ```javascript
 66 | {
 67 |     text: string,          // Text to convert
 68 |     voice: string,         // Voice ID (alloy, echo, etc.)
 69 |     format: string,        // Audio format (mp3, wav, opus)
 70 |     chunk_size: number     // Optional, default 1024
 71 | }
 72 | ```
 73 | 
 74 | **`cancel_stream`**
 75 | ```javascript
 76 | {
 77 |     request_id: string     // Request ID to cancel
 78 | }
 79 | ```
 80 | 
 81 | #### Server → Client
 82 | 
 83 | **`stream_started`**
 84 | ```javascript
 85 | {
 86 |     request_id: string,
 87 |     timestamp: number
 88 | }
 89 | ```
 90 | 
 91 | **`audio_chunk`**
 92 | ```javascript
 93 | {
 94 |     request_id: string,
 95 |     chunk_index: number,
 96 |     total_chunks: number,
 97 |     audio_data: string,    // Hex-encoded audio data
 98 |     format: string,
 99 |     duration: number,
100 |     generation_time: number,
101 |     chunk_text: string     // Preview of chunk text
102 | }
103 | ```
104 | 
105 | **`stream_progress`**
106 | ```javascript
107 | {
108 |     request_id: string,
109 |     progress: number,      // 0-100
110 |     total_chunks: number,
111 |     chunks_completed: number,
112 |     status: string
113 | }
114 | ```
115 | 
116 | **`stream_complete`**
117 | ```javascript
118 | {
119 |     request_id: string,
120 |     total_chunks: number,
121 |     status: 'completed',
122 |     timestamp: number
123 | }
124 | ```
125 | 
126 | **`stream_error`**
127 | ```javascript
128 | {
129 |     request_id: string,
130 |     error: string,
131 |     timestamp: number
132 | }
133 | ```
134 | 
135 | ## Performance Considerations
136 | 
137 | 1. **Chunk Size**: Smaller chunks (512-1024 chars) provide more frequent updates but increase overhead
138 | 2. **Network Latency**: WebSocket reduces latency compared to HTTP polling
139 | 3. **Audio Buffering**: Client should buffer chunks for smooth playback
140 | 4. **Concurrent Streams**: Server supports multiple concurrent streaming sessions
141 | 
142 | ## Browser Support
143 | 
144 | - Chrome/Edge: Full support
145 | - Firefox: Full support
146 | - Safari: Full support (iOS 11.3+)
147 | - IE11: Not supported (use polling fallback)
148 | 
149 | ## Troubleshooting
150 | 
151 | ### Connection Issues
152 | ```javascript
153 | // Check WebSocket status
154 | fetch('/api/websocket/status')
155 |     .then(res => res.json())
156 |     .then(data => console.log('WebSocket status:', data));
157 | ```
158 | 
159 | ### Debug Mode
160 | ```javascript
161 | const client = new WebSocketTTSClient({
162 |     debug: true  // Enable console logging
163 | });
164 | ```
165 | 
166 | ### Common Issues
167 | 
168 | 1. **"WebSocket connection failed"**
169 |    - Check if port 8000 is accessible
170 |    - Ensure eventlet is installed: `pip install "eventlet>=0.33.3"` (quote the requirement so the shell does not treat `>` as a redirect)
171 |    - Try polling transport as fallback
172 | 
173 | 2. **"Chunks arriving out of order"**
174 |    - Client automatically sorts chunks by index
175 |    - Check network stability
176 | 
177 | 3. **"Audio playback stuttering"**
178 |    - Increase chunk size for better buffering
179 |    - Check client-side audio buffer implementation
180 | 
181 | ## Advanced Usage
182 | 
183 | ### Custom Chunk Processing
184 | ```javascript
185 | client.generateSpeech(text, {
186 |     onChunk: async (chunk) => {
187 |         // Custom processing per chunk
188 |         const processed = await processAudioChunk(chunk.audioData);
189 |         audioQueue.push(processed);
190 |         
191 |         // Start playback after first chunk
192 |         if (chunk.chunkIndex === 0) {
193 |             startStreamingPlayback(audioQueue);
194 |         }
195 |     }
196 | });
197 | ```
198 | 
199 | ### Progress Visualization
200 | ```javascript
201 | client.generateSpeech(text, {
202 |     onProgress: (progress) => {
203 |         // Update UI progress bar
204 |         progressBar.style.width = `${progress.progress}%`;
205 |         statusText.textContent = `Processing chunk ${progress.chunksCompleted}/${progress.totalChunks}`;
206 |     }
207 | });
208 | ```
209 | 
210 | ## Security
211 | 
212 | - WebSocket connections respect API key authentication if enabled
213 | - CORS is configured for cross-origin requests
214 | - SSL/TLS recommended for production deployments
215 | 
216 | ## Deployment Notes
217 | 
218 | For production deployment with your existing setup:
219 | 
220 | ```bash
221 | # Build new image with WebSocket support
222 | docker build -t ttsfm-websocket:latest .
223 | 
224 | # Deploy to your server (192.168.1.150)
225 | docker stop ttsfm-container
226 | docker rm ttsfm-container
227 | docker run -d \
228 |   --name ttsfm-container \
229 |   -p 8000:8000 \
230 |   -e REQUIRE_API_KEY=true \
231 |   -e TTSFM_API_KEY=your-secret-key \
232 |   -e DEBUG=false \
233 |   ttsfm-websocket:latest
234 | ```
235 | 
236 | ## Performance Metrics
237 | 
238 | Based on testing with openai.fm backend:
239 | - First chunk delivery: ~0.5-1s
240 | - Streaming overhead: ~10-15% vs batch processing
241 | - Concurrent connections: 100+ (limited by server resources)
242 | - Memory usage: ~50MB per active stream
243 | 
244 | *Built by a grumpy senior engineer who thinks HTTP was good enough*


--------------------------------------------------------------------------------
/ttsfm/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | TTSFM - Text-to-Speech for Free using OpenAI.fm
  3 | 
  4 | A Python library for generating high-quality text-to-speech audio using the free OpenAI.fm service.
  5 | Supports multiple voices and audio formats with a simple, intuitive API.
  6 | 
  7 | Example:
  8 |     >>> from ttsfm import TTSClient, Voice, AudioFormat
  9 |     >>>
 10 |     >>> client = TTSClient()
 11 |     >>>
 12 |     >>> # Generate MP3 audio
 13 |     >>> mp3_response = client.generate_speech(
 14 |     ...     text="Hello, world!",
 15 |     ...     voice=Voice.ALLOY,
 16 |     ...     response_format=AudioFormat.MP3
 17 |     ... )
 18 |     >>> mp3_response.save_to_file("hello")  # Saves as hello.mp3
 19 |     >>>
 20 |     >>> # Generate WAV audio
 21 |     >>> wav_response = client.generate_speech(
 22 |     ...     text="High quality audio",
 23 |     ...     voice=Voice.NOVA,
 24 |     ...     response_format=AudioFormat.WAV
 25 |     ... )
 26 |     >>> wav_response.save_to_file("audio")  # Saves as audio.wav
 27 |     >>>
 28 |     >>> # Generate OPUS audio
 29 |     >>> opus_response = client.generate_speech(
 30 |     ...     text="Compressed audio",
 31 |     ...     voice=Voice.ECHO,
 32 |     ...     response_format=AudioFormat.OPUS
 33 |     ... )
 34 |     >>> opus_response.save_to_file("compressed")  # Saves as compressed.wav
 35 | """
 36 | 
 37 | from typing import Optional
 38 | 
 39 | from .async_client import AsyncTTSClient
 40 | from .audio import combine_audio_chunks, combine_responses
 41 | from .client import TTSClient
 42 | from .exceptions import (
 43 |     APIException,
 44 |     AudioProcessingException,
 45 |     AuthenticationException,
 46 |     NetworkException,
 47 |     QuotaExceededException,
 48 |     RateLimitException,
 49 |     ServiceUnavailableException,
 50 |     TTSException,
 51 |     ValidationException,
 52 | )
 53 | from .models import (
 54 |     APIError,
 55 |     AudioFormat,
 56 |     NetworkError,
 57 |     TTSError,
 58 |     TTSRequest,
 59 |     TTSResponse,
 60 |     ValidationError,
 61 |     Voice,
 62 | )
 63 | from .utils import split_text_by_length, validate_text_length
 64 | 
# Package metadata exposed as module attributes.
__version__ = "3.4.2"
__author__ = "dbcccc"
__email__ = "120614547+dbccccccc@users.noreply.github.com"
__description__ = "Text-to-Speech API Client with OpenAI compatibility"
__url__ = "https://github.com/dbccccccc/ttsfm"

# Default client instance for convenience.
# Starts as None; set_default_client() assigns it, and generate_speech()
# raises TTSException while it is still None.
default_client: Optional[TTSClient] = None
 73 | 
 74 | 
 75 | def create_client(base_url: Optional[str] = None, api_key: Optional[str] = None, **kwargs) -> TTSClient:  # type: ignore[misc]
 76 |     """
 77 |     Create a new TTS client instance.
 78 | 
 79 |     Args:
 80 |         base_url: Base URL for the TTS service
 81 |         api_key: API key for authentication (if required)
 82 |         **kwargs: Additional client configuration
 83 | 
 84 |     Returns:
 85 |         TTSClient: Configured client instance
 86 |     """
 87 |     client_kwargs = kwargs.copy()
 88 |     if base_url is not None:
 89 |         client_kwargs["base_url"] = base_url
 90 |     if api_key is not None:
 91 |         client_kwargs["api_key"] = api_key
 92 |     return TTSClient(**client_kwargs)
 93 | 
 94 | 
 95 | def create_async_client(base_url: Optional[str] = None, api_key: Optional[str] = None, **kwargs) -> AsyncTTSClient:  # type: ignore[misc]
 96 |     """
 97 |     Create a new async TTS client instance.
 98 | 
 99 |     Args:
100 |         base_url: Base URL for the TTS service
101 |         api_key: API key for authentication (if required)
102 |         **kwargs: Additional client configuration
103 | 
104 |     Returns:
105 |         AsyncTTSClient: Configured async client instance
106 |     """
107 |     client_kwargs = kwargs.copy()
108 |     if base_url is not None:
109 |         client_kwargs["base_url"] = base_url
110 |     if api_key is not None:
111 |         client_kwargs["api_key"] = api_key
112 |     return AsyncTTSClient(**client_kwargs)
113 | 
114 | 
def set_default_client(client: TTSClient) -> None:
    """
    Set the default client instance for convenience functions.

    Rebinds the module-level ``default_client``, which generate_speech() and
    generate_speech_long_text() delegate to.

    Args:
        client: Client instance to install as the module-wide default
    """
    # Rebind the module global rather than creating a function-local name.
    global default_client
    default_client = client
119 | 
120 | 
def generate_speech(text: str, voice: str = "alloy", **kwargs) -> TTSResponse:  # type: ignore[misc]
    """
    Convenience function to generate speech using the default client.

    Delegates to ``default_client.generate_speech``. A default client must
    have been installed with set_default_client() beforehand; create_client()
    alone only builds a client and does not register it.

    Args:
        text: Text to convert to speech
        voice: Voice to use for generation
        **kwargs: Additional generation parameters

    Returns:
        TTSResponse: Generated audio response

    Raises:
        TTSException: If no default client is set or generation fails
    """
    if default_client is None:
        # Point the caller at the function that actually sets the default.
        raise TTSException(
            "No default client set. Call set_default_client(create_client()) first."
        )

    return default_client.generate_speech(text=text, voice=voice, **kwargs)
140 | 
141 | 
def generate_speech_long_text(text: str, voice: str = "alloy", **kwargs):  # type: ignore[no-untyped-def]
    """
    Convenience function to generate speech from long text using the default client.

    Delegates to ``default_client.generate_speech_long_text``, which handles
    splitting the text into chunks. A default client must have been installed
    with set_default_client() beforehand; create_client() alone only builds a
    client and does not register it.

    Args:
        text: Text to convert to speech (can be longer than 1000 characters)
        voice: Voice to use for generation
        **kwargs: Additional generation parameters (max_length, preserve_words, etc.)

    Returns:
        list: List of TTSResponse objects for each chunk

    Raises:
        TTSException: If no default client is set or generation fails
    """
    if default_client is None:
        # Point the caller at the function that actually sets the default.
        raise TTSException(
            "No default client set. Call set_default_client(create_client()) first."
        )

    return default_client.generate_speech_long_text(text=text, voice=voice, **kwargs)
163 | 
164 | 
# Export all public components.
# This list defines what `from ttsfm import *` exposes.
__all__ = [
    # Main classes
    "TTSClient",
    "AsyncTTSClient",
    # Models
    "TTSRequest",
    "TTSResponse",
    "Voice",
    "AudioFormat",
    "TTSError",
    "APIError",
    "NetworkError",
    "ValidationError",
    # Exceptions
    "TTSException",
    "APIException",
    "NetworkException",
    "ValidationException",
    "RateLimitException",
    "AuthenticationException",
    "ServiceUnavailableException",
    "QuotaExceededException",
    "AudioProcessingException",
    # Factory functions and default-client convenience helpers
    "create_client",
    "create_async_client",
    "set_default_client",
    "generate_speech",
    "generate_speech_long_text",
    # Utility functions
    "validate_text_length",
    "split_text_by_length",
    "combine_audio_chunks",
    "combine_responses",
    # Package metadata
    "__version__",
    "__author__",
    "__email__",
    "__description__",
    "__url__",
]
207 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-build-full.yml:
--------------------------------------------------------------------------------
  1 | name: Docker Build and Push (Full)
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [main]
  6 |   pull_request:
  7 |     branches: [main]
  8 |   release:
  9 |     types: [published]
 10 | 
 11 | env:
 12 |   REGISTRY_DOCKERHUB: docker.io
 13 |   REGISTRY_GHCR: ghcr.io
 14 |   IMAGE_NAME: ${{ github.repository }}
 15 |   DOCKERHUB_NAMESPACE: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_USERNAME || github.repository_owner }}
 16 | 
 17 | jobs:
 18 |   build-and-push-full:
 19 |     runs-on: ubuntu-latest
 20 |     permissions:
 21 |       contents: read
 22 |       packages: write
 23 |     steps:
 24 |       - name: Checkout repository
 25 |         uses: actions/checkout@v4
 26 | 
 27 |       - name: Determine build settings
 28 |         id: build-config
 29 |         env:
 30 |           EVENT_NAME: ${{ github.event_name }}
 31 |           EVENT_ACTION: ${{ github.event.action }}
 32 |         run: |
 33 |           if [ "$EVENT_NAME" = "release" ] && [ "$EVENT_ACTION" = "published" ]; then
 34 |             echo "push=true" >> "$GITHUB_OUTPUT"
 35 |             echo "platforms=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
 36 |             echo "load=false" >> "$GITHUB_OUTPUT"
 37 |           else
 38 |             echo "push=false" >> "$GITHUB_OUTPUT"
 39 |             echo "platforms=linux/amd64" >> "$GITHUB_OUTPUT"
 40 |             echo "load=true" >> "$GITHUB_OUTPUT"
 41 |           fi
 42 | 
 43 |       - name: Derive image version
 44 |         id: version
 45 |         env:
 46 |           EVENT_NAME: ${{ github.event_name }}
 47 |           TAG_NAME: ${{ github.event.release.tag_name }}
 48 |           REF_NAME: ${{ github.ref_name }}
 49 |           GITHUB_SHA: ${{ github.sha }}
 50 |         run: |
 51 |           version=""
 52 |           if [ "$EVENT_NAME" = "release" ] && [ -n "$TAG_NAME" ]; then
 53 |             version="$TAG_NAME"
 54 |           elif [ -n "$REF_NAME" ]; then
 55 |             version="$REF_NAME"
 56 |           fi
 57 |           version="${version##*/}"
 58 |           if [ "${version#v}" != "$version" ]; then
 59 |             version="${version#v}"
 60 |           fi
 61 |           if [ -z "$version" ]; then
 62 |             version="${GITHUB_SHA:0:12}"
 63 |           fi
 64 |           if ! echo "$version" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+'; then
 65 |             safe_branch=$(printf %s "$version" | tr -c 'A-Za-z0-9' '-')
 66 |             safe_branch=${safe_branch%-}
 67 |             if [ -z "$safe_branch" ]; then
 68 |               safe_branch="sha-${GITHUB_SHA:0:12}"
 69 |             fi
 70 |             version="0.0.0+${safe_branch}"
 71 |           fi
 72 |           echo "version=$version" >> "$GITHUB_OUTPUT"
 73 | 
 74 |       - name: Set up QEMU
 75 |         if: steps.build-config.outputs.platforms == 'linux/amd64,linux/arm64'
 76 |         uses: docker/setup-qemu-action@v3
 77 | 
 78 |       - name: Set up Docker Buildx
 79 |         uses: docker/setup-buildx-action@v3
 80 |         with:
 81 |           driver: docker-container
 82 | 
 83 |       - name: Login to Docker Hub
 84 |         if: steps.build-config.outputs.push == 'true'
 85 |         uses: docker/login-action@v3
 86 |         with:
 87 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
 88 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
 89 | 
 90 |       - name: Login to GitHub Container Registry
 91 |         if: steps.build-config.outputs.push == 'true'
 92 |         uses: docker/login-action@v3
 93 |         with:
 94 |           registry: ${{ env.REGISTRY_GHCR }}
 95 |           username: ${{ github.actor }}
 96 |           password: ${{ secrets.GITHUB_TOKEN }}
 97 | 
 98 |       - name: Extract metadata
 99 |         id: meta
100 |         if: steps.build-config.outputs.push == 'true'
101 |         uses: docker/metadata-action@v5
102 |         with:
103 |           images: |
104 |             ${{ env.DOCKERHUB_NAMESPACE }}/ttsfm
105 |             ${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}
106 |           tags: |
107 |             type=semver,pattern=v{{version}}
108 |             type=raw,value=latest,enable=${{ github.event.release.prerelease == false }}
109 |           labels: |
110 |             org.opencontainers.image.source=${{ github.repositoryUrl }}
111 |             org.opencontainers.image.description=Free TTS API server compatible with OpenAI's TTS API format using openai.fm (full variant with ffmpeg)
112 |             org.opencontainers.image.licenses=MIT
113 |             org.opencontainers.image.title=TTSFM - Free TTS API Server (Full)
114 |             org.opencontainers.image.vendor=dbcccc
115 |           flavor: |
116 |             latest=auto
117 | 
118 |       - name: Set local image metadata
119 |         id: meta-local
120 |         if: steps.build-config.outputs.push != 'true'
121 |         run: |
122 |           echo "tags=${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}:ci-${GITHUB_RUN_ID}-full" >> "$GITHUB_OUTPUT"
123 |           echo "labels=org.opencontainers.image.source=${{ github.repositoryUrl }}" >> "$GITHUB_OUTPUT"
124 | 
125 |       - name: Build and push image
126 |         id: build-and-push
127 |         uses: docker/build-push-action@v5
128 |         with:
129 |           context: .
130 |           platforms: ${{ steps.build-config.outputs.platforms }}
131 |           push: ${{ steps.build-config.outputs.push == 'true' }}
132 |           load: ${{ steps.build-config.outputs.load == 'true' }}
133 |           tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}
134 |           labels: ${{ steps.meta.outputs.labels || steps.meta-local.outputs.labels }}
135 |           cache-from: type=gha,scope=full
136 |           cache-to: type=gha,mode=max,scope=full
137 |           build-args: |
138 |             VERSION=${{ steps.version.outputs.version }}
139 |             VARIANT=full
140 | 
141 |       - name: Smoke test image
142 |         if: steps.build-config.outputs.load == 'true'
143 |         run: |
144 |           set -euo pipefail
145 |           IMAGE="${{ steps.meta-local.outputs.tags }}"
146 |           echo "Running smoke test for full image: $IMAGE"
147 |           docker rm -f ttsfm-smoke >/dev/null 2>&1 || true
148 |           docker run -d --name ttsfm-smoke -p 127.0.0.1:8000:8000 "$IMAGE"
149 |           success=""
150 |           for attempt in $(seq 1 10); do
151 |             if curl --fail --silent --max-time 5 http://127.0.0.1:8000/api/health > /tmp/ttsfm-health.json; then
152 |               success="yes"
153 |               cat /tmp/ttsfm-health.json
154 |               break
155 |             fi
156 |             sleep 3
157 |           done
158 |           docker logs ttsfm-smoke || true
159 |           docker rm -f ttsfm-smoke >/dev/null 2>&1 || true
160 |           if [ -z "$success" ]; then
161 |             echo "Container health check failed" >&2
162 |             exit 1
163 |           fi
164 | 
165 |       - name: Show image info
166 |         run: |
167 |           echo "Variant: full"
168 |           echo "Push enabled: ${{ steps.build-config.outputs.push }}"
169 |           echo "Image tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}"
170 |           echo "Image digest: ${{ steps.build-and-push.outputs.digest }}"
171 | 
172 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-build-slim.yml:
--------------------------------------------------------------------------------
  1 | name: Docker Build and Push (Slim)
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [main]
  6 |   pull_request:
  7 |     branches: [main]
  8 |   release:
  9 |     types: [published]
 10 | 
 11 | env:
 12 |   REGISTRY_DOCKERHUB: docker.io
 13 |   REGISTRY_GHCR: ghcr.io
 14 |   IMAGE_NAME: ${{ github.repository }}
 15 |   DOCKERHUB_NAMESPACE: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_USERNAME || github.repository_owner }}
 16 | 
 17 | jobs:
 18 |   build-and-push-slim:
 19 |     runs-on: ubuntu-latest
 20 |     permissions:
 21 |       contents: read
 22 |       packages: write
 23 |     steps:
 24 |       - name: Checkout repository
 25 |         uses: actions/checkout@v4
 26 | 
 27 |       - name: Determine build settings
 28 |         id: build-config
 29 |         env:
 30 |           EVENT_NAME: ${{ github.event_name }}
 31 |           EVENT_ACTION: ${{ github.event.action }}
 32 |         run: |
 33 |           if [ "$EVENT_NAME" = "release" ] && [ "$EVENT_ACTION" = "published" ]; then
 34 |             echo "push=true" >> "$GITHUB_OUTPUT"
 35 |             echo "platforms=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
 36 |             echo "load=false" >> "$GITHUB_OUTPUT"
 37 |           else
 38 |             echo "push=false" >> "$GITHUB_OUTPUT"
 39 |             echo "platforms=linux/amd64" >> "$GITHUB_OUTPUT"
 40 |             echo "load=true" >> "$GITHUB_OUTPUT"
 41 |           fi
 42 | 
 43 |       - name: Derive image version
 44 |         id: version
 45 |         env:
 46 |           EVENT_NAME: ${{ github.event_name }}
 47 |           TAG_NAME: ${{ github.event.release.tag_name }}
 48 |           REF_NAME: ${{ github.ref_name }}
 49 |           GITHUB_SHA: ${{ github.sha }}
 50 |         run: |
 51 |           version=""
 52 |           if [ "$EVENT_NAME" = "release" ] && [ -n "$TAG_NAME" ]; then
 53 |             version="$TAG_NAME"
 54 |           elif [ -n "$REF_NAME" ]; then
 55 |             version="$REF_NAME"
 56 |           fi
 57 |           version="${version##*/}"
 58 |           if [ "${version#v}" != "$version" ]; then
 59 |             version="${version#v}"
 60 |           fi
 61 |           if [ -z "$version" ]; then
 62 |             version="${GITHUB_SHA:0:12}"
 63 |           fi
 64 |           if ! echo "$version" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+'; then
 65 |             safe_branch=$(printf %s "$version" | tr -c 'A-Za-z0-9' '-')
 66 |             safe_branch=${safe_branch%-}
 67 |             if [ -z "$safe_branch" ]; then
 68 |               safe_branch="sha-${GITHUB_SHA:0:12}"
 69 |             fi
 70 |             version="0.0.0+${safe_branch}"
 71 |           fi
 72 |           echo "version=$version" >> "$GITHUB_OUTPUT"
 73 | 
 74 |       - name: Set up QEMU
 75 |         if: steps.build-config.outputs.platforms == 'linux/amd64,linux/arm64'
 76 |         uses: docker/setup-qemu-action@v3
 77 | 
 78 |       - name: Set up Docker Buildx
 79 |         uses: docker/setup-buildx-action@v3
 80 |         with:
 81 |           driver: docker-container
 82 | 
 83 |       - name: Login to Docker Hub
 84 |         if: steps.build-config.outputs.push == 'true'
 85 |         uses: docker/login-action@v3
 86 |         with:
 87 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
 88 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
 89 | 
 90 |       - name: Login to GitHub Container Registry
 91 |         if: steps.build-config.outputs.push == 'true'
 92 |         uses: docker/login-action@v3
 93 |         with:
 94 |           registry: ${{ env.REGISTRY_GHCR }}
 95 |           username: ${{ github.actor }}
 96 |           password: ${{ secrets.GITHUB_TOKEN }}
 97 | 
 98 |       - name: Extract metadata
 99 |         id: meta
100 |         if: steps.build-config.outputs.push == 'true'
101 |         uses: docker/metadata-action@v5
102 |         with:
103 |           images: |
104 |             ${{ env.DOCKERHUB_NAMESPACE }}/ttsfm
105 |             ${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}
106 |           tags: |
107 |             type=semver,pattern=v{{version}},suffix=-slim
108 |             type=raw,value=slim,enable=${{ !contains(github.ref, 'alpha') && !contains(github.ref, 'beta') }}
109 |           labels: |
110 |             org.opencontainers.image.source=${{ github.repositoryUrl }}
111 |             org.opencontainers.image.description=Free TTS API server compatible with OpenAI's TTS API format using openai.fm (slim variant without ffmpeg)
112 |             org.opencontainers.image.licenses=MIT
113 |             org.opencontainers.image.title=TTSFM - Free TTS API Server (Slim)
114 |             org.opencontainers.image.vendor=dbcccc
115 | 
116 |       - name: Set local image metadata
117 |         id: meta-local
118 |         if: steps.build-config.outputs.push != 'true'
119 |         run: |
120 |           echo "tags=${{ env.REGISTRY_GHCR }}/${{ env.IMAGE_NAME }}:ci-${GITHUB_RUN_ID}-slim" >> "$GITHUB_OUTPUT"
121 |           echo "labels=org.opencontainers.image.source=${{ github.repositoryUrl }}" >> "$GITHUB_OUTPUT"
122 | 
123 |       - name: Build and push image
124 |         id: build-and-push
125 |         uses: docker/build-push-action@v5
126 |         with:
127 |           context: .
128 |           platforms: ${{ steps.build-config.outputs.platforms }}
129 |           push: ${{ steps.build-config.outputs.push == 'true' }}
130 |           load: ${{ steps.build-config.outputs.load == 'true' }}
131 |           tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}
132 |           labels: ${{ steps.meta.outputs.labels || steps.meta-local.outputs.labels }}
133 |           cache-from: type=gha,scope=slim
134 |           cache-to: type=gha,mode=max,scope=slim
135 |           build-args: |
136 |             VERSION=${{ steps.version.outputs.version }}
137 |             VARIANT=slim
138 | 
139 |       - name: Smoke test image
140 |         if: steps.build-config.outputs.load == 'true'
141 |         run: |
142 |           set -euo pipefail
143 |           IMAGE="${{ steps.meta-local.outputs.tags }}"
144 |           echo "Running smoke test for slim image: $IMAGE"
145 |           docker rm -f ttsfm-smoke-slim >/dev/null 2>&1 || true
146 |           docker run -d --name ttsfm-smoke-slim -p 127.0.0.1:8001:8000 "$IMAGE"
147 |           success=""
148 |           for attempt in $(seq 1 10); do
149 |             if curl --fail --silent --max-time 5 http://127.0.0.1:8001/api/health > /tmp/ttsfm-health-slim.json; then
150 |               success="yes"
151 |               cat /tmp/ttsfm-health-slim.json
152 |               break
153 |             fi
154 |             sleep 3
155 |           done
156 |           docker logs ttsfm-smoke-slim || true
157 |           docker rm -f ttsfm-smoke-slim >/dev/null 2>&1 || true
158 |           if [ -z "$success" ]; then
159 |             echo "Container health check failed" >&2
160 |             exit 1
161 |           fi
162 | 
163 |       - name: Show image info
164 |         run: |
165 |           echo "Variant: slim"
166 |           echo "Push enabled: ${{ steps.build-config.outputs.push }}"
167 |           echo "Image tags: ${{ steps.meta.outputs.tags || steps.meta-local.outputs.tags }}"
168 |           echo "Image digest: ${{ steps.build-and-push.outputs.digest }}"
169 | 
170 | 


--------------------------------------------------------------------------------
/ttsfm-web/templates/index.html:
--------------------------------------------------------------------------------
  1 | {% extends "base.html" %}
  2 | 
  3 | {% block title %}TTSFM - {{ _('home.title') }}{% endblock %}
  4 | 
  5 | {% block content %}
  6 | <!-- Hero Section -->
  7 | <section class="hero-section">
  8 |     <div class="container">
  9 |         <div class="row align-items-center min-vh-75">
 10 |             <div class="col-lg-8 mx-auto text-center">
 11 |                 <div class="hero-content">
 12 |                     <div class="badge bg-primary text-white mb-3 px-3 py-2">
 13 |                         <i class="fas fa-code me-2"></i>Python Package
 14 |                     </div>
 15 |                     <h1 class="display-4 fw-bold mb-4">
 16 |                         {{ _('home.title') }}
 17 |                     </h1>
 18 |                     <p class="lead mb-4">
 19 |                         {{ _('home.subtitle') }}
 20 |                     </p>
 21 |                     <div class="d-flex flex-wrap gap-3 justify-content-center">
 22 |                         <a href="{{ url_for('playground') }}" class="btn btn-primary btn-lg">
 23 |                             <i class="fas fa-play me-2"></i>{{ _('home.try_demo') }}
 24 |                         </a>
 25 |                         <a href="{{ url_for('docs') }}" class="btn btn-outline-secondary btn-lg">
 26 |                             <i class="fas fa-book me-2"></i>{{ _('home.documentation') }}
 27 |                         </a>
 28 |                         <a href="https://github.com/dbccccccc/ttsfm" class="btn btn-outline-secondary btn-lg" target="_blank" rel="noopener noreferrer">
 29 |                             <i class="fab fa-github me-2"></i>{{ _('home.github') }}
 30 |                         </a>
 31 |                     </div>
 32 |                 </div>
 33 |             </div>
 34 |         </div>
 35 |     </div>
 36 | </section>
 37 | 
 38 | <!-- Features Section -->
 39 | <section class="py-5" style="background-color: #f8fafc;">
 40 |     <div class="container">
 41 |         <div class="row">
 42 |             <div class="col-12 text-center mb-5">
 43 |                 <h2 class="fw-bold mb-4">{{ _('home.features_title') }}</h2>
 44 |                 <p class="lead text-muted">
 45 |                     {{ _('home.features_subtitle') }}
 46 |                 </p>
 47 |             </div>
 48 |         </div>
 49 | 
 50 |         <div class="row g-4">
 51 |             <div class="col-lg-3">
 52 |                 <div class="text-center">
 53 |                     <div class="feature-icon text-white rounded-circle d-inline-flex align-items-center justify-content-center mb-3" style="width: 4rem; height: 4rem; background: linear-gradient(135deg, #4f46e5 0%, #6366f1 100%);">
 54 |                         <i class="fas fa-key"></i>
 55 |                     </div>
 56 |                     <h5 class="fw-bold">{{ _('home.feature_free_title') }}</h5>
 57 |                     <p class="text-muted">{{ _('home.feature_free_desc') }}</p>
 58 |                 </div>
 59 |             </div>
 60 | 
 61 |             <div class="col-lg-3">
 62 |                 <div class="text-center">
 63 |                     <div class="feature-icon text-white rounded-circle d-inline-flex align-items-center justify-content-center mb-3" style="width: 4rem; height: 4rem; background: linear-gradient(135deg, #f59e0b 0%, #fbbf24 100%);">
 64 |                         <i class="fas fa-magic"></i>
 65 |                     </div>
 66 |                     <h5 class="fw-bold">{{ _('home.feature_openai_title') }}</h5>
 67 |                     <p class="text-muted">{{ _('home.feature_openai_desc') }}</p>
 68 |                 </div>
 69 |             </div>
 70 | 
 71 |             <div class="col-lg-3">
 72 |                 <div class="text-center">
 73 |                     <div class="feature-icon text-white rounded-circle d-inline-flex align-items-center justify-content-center mb-3" style="width: 4rem; height: 4rem; background: linear-gradient(135deg, #059669 0%, #10b981 100%);">
 74 |                         <i class="fas fa-bolt"></i>
 75 |                     </div>
 76 |                     <h5 class="fw-bold">{{ _('home.feature_async_title') }}</h5>
 77 |                     <p class="text-muted">{{ _('home.feature_async_desc') }}</p>
 78 |                 </div>
 79 |             </div>
 80 | 
 81 |             <div class="col-lg-3">
 82 |                 <div class="text-center">
 83 |                     <div class="feature-icon text-white rounded-circle d-inline-flex align-items-center justify-content-center mb-3" style="width: 4rem; height: 4rem; background: linear-gradient(135deg, #6b7280 0%, #9ca3af 100%);">
 84 |                         <i class="fas fa-microphone-alt"></i>
 85 |                     </div>
 86 |                     <h5 class="fw-bold">{{ _('home.feature_voices_title') }} & {{ _('home.feature_formats_title') }}</h5>
 87 |                     <p class="text-muted">{{ _('home.feature_voices_desc') }} {{ _('home.feature_formats_desc') }}</p>
 88 |                 </div>
 89 |             </div>
 90 |         </div>
 91 |     </div>
 92 | </section>
 93 | 
 94 | <!-- Quick Start Section -->
 95 | <section class="py-5">
 96 |     <div class="container">
 97 |         <div class="row">
 98 |             <div class="col-12 text-center mb-5">
 99 |                 <h2 class="fw-bold mb-4">{{ _('home.quick_start_title') }}</h2>
100 |                 <p class="lead text-muted">
101 |                     {{ _('home.subtitle') }}
102 |                 </p>
103 |             </div>
104 |         </div>
105 | 
106 |         <div class="row g-4">
107 |             <div class="col-lg-6">
108 |                 <div class="card h-100">
109 |                     <div class="card-body">
110 |                         <h5 class="card-title">
111 |                             <i class="fas fa-download me-2 text-primary"></i>{{ _('home.installation_title') }}
112 |                         </h5>
113 |                         <pre class="bg-light p-3 rounded"><code>{{ _('home.installation_code') }}</code></pre>
114 |                         <small class="text-muted">Requires Python 3.8+</small>
115 |                     </div>
116 |                 </div>
117 |             </div>
118 | 
119 |             <div class="col-lg-6">
120 |                 <div class="card h-100">
121 |                     <div class="card-body">
122 |                         <h5 class="card-title">
123 |                             <i class="fas fa-play me-2 text-success"></i>{{ _('home.usage_title') }}
124 |                         </h5>
125 |                         <pre class="bg-light p-3 rounded"><code>from ttsfm import TTSClient, Voice, AudioFormat
126 | 
127 | client = TTSClient()
128 | response = client.generate_speech(
129 |     text="Hello, world!",
130 |     voice=Voice.ALLOY,
131 |     response_format=AudioFormat.MP3
132 | )
133 | response.save_to_file("hello")</code></pre>
134 |                         <small class="text-muted">No API keys required</small>
135 |                     </div>
136 |                 </div>
137 |             </div>
138 |         </div>
139 | 
140 |         <div class="row mt-4">
141 |             <div class="col-12 text-center">
142 |                 <div class="d-flex justify-content-center gap-3 flex-wrap">
143 |                     <a href="{{ url_for('playground') }}" class="btn btn-primary">
144 |                         <i class="fas fa-play me-2"></i>{{ _('home.try_demo') }}
145 |                     </a>
146 |                     <a href="{{ url_for('docs') }}" class="btn btn-outline-primary">
147 |                         <i class="fas fa-book me-2"></i>{{ _('home.documentation') }}
148 |                     </a>
149 |                 </div>
150 |             </div>
151 |         </div>
152 |     </div>
153 | </section>
154 | 
155 | 
156 | {% endblock %}
157 | 


--------------------------------------------------------------------------------
/ttsfm/audio_processing.py:
--------------------------------------------------------------------------------
  1 | """Audio processing utilities using ffmpeg for advanced features."""
  2 | 
  3 | from __future__ import annotations
  4 | 
  5 | import logging
  6 | import subprocess
  7 | import tempfile
  8 | from pathlib import Path
  9 | from typing import Optional
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | def adjust_audio_speed(
 15 |     audio_data: bytes,
 16 |     speed: float,
 17 |     input_format: str = "mp3",
 18 |     output_format: str = "mp3",
 19 | ) -> bytes:
 20 |     """
 21 |     Adjust audio playback speed using ffmpeg.
 22 | 
 23 |     Args:
 24 |         audio_data: Input audio data as bytes
 25 |         speed: Speed multiplier (0.25 to 4.0). 1.0 = normal speed, 2.0 = 2x faster
 26 |         input_format: Input audio format (mp3, wav, etc.)
 27 |         output_format: Output audio format (mp3, wav, etc.)
 28 | 
 29 |     Returns:
 30 |         Processed audio data as bytes
 31 | 
 32 |     Raises:
 33 |         RuntimeError: If ffmpeg is not available or processing fails
 34 |         ValueError: If speed is out of valid range
 35 |     """
 36 |     # Validate speed range (OpenAI TTS API supports 0.25 to 4.0)
 37 |     if not 0.25 <= speed <= 4.0:
 38 |         raise ValueError(f"Speed must be between 0.25 and 4.0, got {speed}")
 39 | 
 40 |     # If speed is 1.0, no processing needed
 41 |     if speed == 1.0:
 42 |         return audio_data
 43 | 
 44 |     # Check ffmpeg availability
 45 |     import shutil
 46 | 
 47 |     if not shutil.which("ffmpeg"):
 48 |         raise RuntimeError(
 49 |             "Speed adjustment requires ffmpeg. "
 50 |             "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
 51 |         )
 52 | 
 53 |     try:
 54 |         # Create temporary files for input and output
 55 |         with tempfile.TemporaryDirectory() as tmpdir:
 56 |             tmp_path = Path(tmpdir)
 57 |             input_file = tmp_path / f"input.{input_format}"
 58 |             output_file = tmp_path / f"output.{output_format}"
 59 | 
 60 |             # Write input audio to temp file
 61 |             input_file.write_bytes(audio_data)
 62 | 
 63 |             # Build ffmpeg command
 64 |             # For speed adjustment, we use the atempo filter
 65 |             # atempo only supports 0.5-2.0 range, so we may need to chain filters
 66 |             atempo_filters = _build_atempo_filter_chain(speed)
 67 | 
 68 |             cmd = [
 69 |                 "ffmpeg",
 70 |                 "-i",
 71 |                 str(input_file),
 72 |                 "-filter:a",
 73 |                 atempo_filters,
 74 |                 "-y",  # Overwrite output file
 75 |                 "-loglevel",
 76 |                 "error",  # Only show errors
 77 |                 str(output_file),
 78 |             ]
 79 | 
 80 |             # Run ffmpeg
 81 |             result = subprocess.run(
 82 |                 cmd,
 83 |                 capture_output=True,
 84 |                 text=True,
 85 |                 timeout=30,
 86 |             )
 87 | 
 88 |             if result.returncode != 0:
 89 |                 logger.error(f"ffmpeg error: {result.stderr}")
 90 |                 raise RuntimeError(f"ffmpeg processing failed: {result.stderr}")
 91 | 
 92 |             # Read processed audio
 93 |             return output_file.read_bytes()
 94 | 
 95 |     except subprocess.TimeoutExpired:
 96 |         raise RuntimeError("Audio processing timed out")
 97 |     except Exception as e:
 98 |         logger.error(f"Error adjusting audio speed: {e}")
 99 |         raise RuntimeError(f"Failed to adjust audio speed: {e}")
100 | 
101 | 
def _build_atempo_filter_chain(speed: float) -> str:
    """
    Build atempo filter chain for ffmpeg.

    Each atempo stage is kept within the 0.5-2.0 range, so speeds outside
    that range are expressed as several chained stages whose factors
    multiply to the requested speed (e.g. 4.0 -> "atempo=2.0,atempo=2.0").

    Args:
        speed: Target speed multiplier; must be positive and finite

    Returns:
        Filter string for ffmpeg

    Raises:
        ValueError: If speed is zero, negative, infinite or NaN
    """
    # Guard against inputs for which the loops below would never terminate
    # (0, negatives, inf). NaN fails the comparison and is rejected too.
    if not 0 < speed < float("inf"):
        raise ValueError(f"Speed must be a positive finite number, got {speed}")

    if 0.5 <= speed <= 2.0:
        return f"atempo={speed}"

    # For speeds outside 0.5-2.0, chain multiple atempo filters
    filters = []
    remaining_speed = speed

    # Peel off full 2x stages until the remainder fits a single stage.
    while remaining_speed > 2.0:
        filters.append("atempo=2.0")
        remaining_speed /= 2.0

    # Likewise peel off 0.5x stages for slow-downs.
    while remaining_speed < 0.5:
        filters.append("atempo=0.5")
        remaining_speed /= 0.5

    # A remainder of exactly 1.0 would be a no-op stage, so skip it.
    if remaining_speed != 1.0:
        filters.append(f"atempo={remaining_speed}")

    return ",".join(filters)
134 | 
135 | 
def convert_audio_format(
    audio_data: bytes,
    input_format: str,
    output_format: str,
    bitrate: Optional[str] = None,
) -> bytes:
    """
    Convert audio from one format to another using ffmpeg.

    The input bytes are written to a temporary file, ffmpeg is run on it with
    a 30 second timeout, and the resulting output file is read back.

    Args:
        audio_data: Input audio data as bytes
        input_format: Input audio format (mp3, wav, opus, aac, flac, pcm)
        output_format: Output audio format (mp3, wav, opus, aac, flac, pcm)
        bitrate: Optional bitrate for output (e.g., "128k", "192k")

    Returns:
        Converted audio data as bytes

    Raises:
        RuntimeError: If ffmpeg is not available or conversion fails
    """
    # Check ffmpeg availability
    import shutil

    if not shutil.which("ffmpeg"):
        raise RuntimeError(
            "Format conversion requires ffmpeg. "
            "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant."
        )

    # Accept enum members as well as plain strings: formatting a str-mixin
    # Enum in an f-string may yield "AudioFormat.MP3" rather than "mp3",
    # which would produce a bogus temp-file extension.
    input_format = str(getattr(input_format, "value", input_format))
    output_format = str(getattr(output_format, "value", output_format))

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            input_file = tmp_path / f"input.{input_format}"
            output_file = tmp_path / f"output.{output_format}"

            # Write input audio to temp file
            input_file.write_bytes(audio_data)

            # Build ffmpeg command
            cmd = [
                "ffmpeg",
                "-i",
                str(input_file),
                "-y",  # Overwrite output file
                "-loglevel",
                "error",
            ]

            # Add bitrate if specified
            if bitrate:
                cmd.extend(["-b:a", bitrate])

            # Add output format-specific options
            if output_format == "opus":
                cmd.extend(["-c:a", "libopus"])
            elif output_format == "aac":
                cmd.extend(["-c:a", "aac"])
            elif output_format == "flac":
                cmd.extend(["-c:a", "flac"])
            elif output_format == "pcm":
                # Raw PCM has no container, so the muxer is named explicitly
                cmd.extend(["-f", "s16le", "-acodec", "pcm_s16le"])

            cmd.append(str(output_file))

            # Run ffmpeg (argument list, no shell)
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=30,
            )

            if result.returncode != 0:
                logger.error(f"ffmpeg error: {result.stderr}")
                raise RuntimeError(f"ffmpeg conversion failed: {result.stderr}")

            # Read converted audio before the temporary directory is removed
            return output_file.read_bytes()

    except subprocess.TimeoutExpired as e:
        # Chain the cause so the original timeout details stay in the traceback
        raise RuntimeError("Audio conversion timed out") from e
    except Exception as e:
        logger.error(f"Error converting audio format: {e}")
        raise RuntimeError(f"Failed to convert audio format: {e}") from e
221 | 


--------------------------------------------------------------------------------
/ttsfm/exceptions.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Exception classes for the TTSFM package.
  3 | 
  4 | This module defines the exception hierarchy used throughout the package
  5 | for consistent error handling and reporting.
  6 | """
  7 | 
  8 | from typing import Any, Dict, Optional
  9 | 
 10 | 
 11 | class TTSException(Exception):
 12 |     """
 13 |     Base exception class for all TTSFM-related errors.
 14 | 
 15 |     Attributes:
 16 |         message: Human-readable error message
 17 |         code: Error code for programmatic handling
 18 |         details: Additional error details
 19 |     """
 20 | 
 21 |     def __init__(
 22 |         self, message: str, code: Optional[str] = None, details: Optional[Dict[str, Any]] = None
 23 |     ):
 24 |         super().__init__(message)
 25 |         self.message = message
 26 |         self.code = code or self.__class__.__name__
 27 |         self.details = details or {}
 28 | 
 29 |     def __str__(self) -> str:
 30 |         if self.code:
 31 |             return f"[{self.code}] {self.message}"
 32 |         return self.message
 33 | 
 34 |     def __repr__(self) -> str:
 35 |         return f"{self.__class__.__name__}(message='{self.message}', code='{self.code}')"
 36 | 
 37 | 
 38 | class APIException(TTSException):
 39 |     """
 40 |     Exception raised for API-related errors.
 41 | 
 42 |     This includes HTTP errors, invalid responses, and server-side issues.
 43 |     """
 44 | 
 45 |     def __init__(
 46 |         self,
 47 |         message: str,
 48 |         status_code: Optional[int] = None,
 49 |         response_data: Optional[Dict[str, Any]] = None,
 50 |         **kwargs: Any,
 51 |     ) -> None:
 52 |         super().__init__(message, **kwargs)
 53 |         self.status_code = status_code
 54 |         self.response_data = response_data or {}
 55 | 
 56 |     def __str__(self) -> str:
 57 |         if self.status_code:
 58 |             return f"[HTTP {self.status_code}] {self.message}"
 59 |         return super().__str__()
 60 | 
 61 | 
 62 | class NetworkException(TTSException):
 63 |     """
 64 |     Exception raised for network-related errors.
 65 | 
 66 |     This includes connection timeouts, DNS resolution failures, and other
 67 |     network connectivity issues.
 68 |     """
 69 | 
 70 |     def __init__(
 71 |         self, message: str, timeout: Optional[float] = None, retry_count: int = 0, **kwargs: Any
 72 |     ) -> None:
 73 |         super().__init__(message, **kwargs)
 74 |         self.timeout = timeout
 75 |         self.retry_count = retry_count
 76 | 
 77 | 
 78 | class ValidationException(TTSException):
 79 |     """
 80 |     Exception raised for input validation errors.
 81 | 
 82 |     This includes invalid parameters, missing required fields, and
 83 |     data format issues.
 84 |     """
 85 | 
 86 |     def __init__(
 87 |         self, message: str, field: Optional[str] = None, value: Optional[Any] = None, **kwargs: Any
 88 |     ) -> None:
 89 |         super().__init__(message, **kwargs)
 90 |         self.field = field
 91 |         self.value = value
 92 | 
 93 |     def __str__(self) -> str:
 94 |         if self.field:
 95 |             return f"Validation error for '{self.field}': {self.message}"
 96 |         return f"Validation error: {self.message}"
 97 | 
 98 | 
 99 | class RateLimitException(APIException):
100 |     """
101 |     Exception raised when API rate limits are exceeded.
102 | 
103 |     Attributes:
104 |         retry_after: Seconds to wait before retrying (if provided by server)
105 |         limit: Rate limit that was exceeded
106 |         remaining: Remaining requests in current window
107 |     """
108 | 
109 |     def __init__(
110 |         self,
111 |         message: str = "Rate limit exceeded",
112 |         retry_after: Optional[int] = None,
113 |         limit: Optional[int] = None,
114 |         remaining: Optional[int] = None,
115 |         **kwargs: Any,
116 |     ) -> None:
117 |         super().__init__(message, status_code=429, **kwargs)
118 |         self.retry_after = retry_after
119 |         self.limit = limit
120 |         self.remaining = remaining
121 | 
122 |     def __str__(self) -> str:
123 |         msg = super().__str__()
124 |         if self.retry_after:
125 |             msg += f" (retry after {self.retry_after}s)"
126 |         return msg
127 | 
128 | 
class AuthenticationException(APIException):
    """
    Exception raised for authentication and authorization errors.

    This includes invalid API keys, expired tokens, and insufficient
    permissions. The HTTP status code is always set to 401.
    """

    def __init__(self, message: str = "Authentication failed", **kwargs: Any) -> None:
        # status_code is fixed here, so callers must not pass it in kwargs
        super().__init__(message, status_code=401, **kwargs)
139 | 
140 | 
class ServiceUnavailableException(APIException):
    """
    Error raised when the TTS service cannot currently serve requests (HTTP 503).

    Covers maintenance windows, overload and temporary outages.

    Attributes:
        retry_after: Seconds to wait before retrying, when known
    """

    def __init__(
        self,
        message: str = "Service temporarily unavailable",
        retry_after: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        self.retry_after = retry_after
        # The status code is fixed to 503 for this exception type
        super().__init__(message, status_code=503, **kwargs)
157 | 
158 | 
class QuotaExceededException(APIException):
    """
    Error raised when a usage quota has been exhausted (HTTP 402).

    Covers monthly limits, character limits and other usage-based caps.

    Attributes:
        quota_type: Which quota was exceeded, when known
        limit: The quota ceiling
        used: The amount consumed
    """

    def __init__(
        self,
        message: str = "Usage quota exceeded",
        quota_type: Optional[str] = None,
        limit: Optional[int] = None,
        used: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        # The status code is fixed to 402 for this exception type
        super().__init__(message, status_code=402, **kwargs)
        self.used = used
        self.limit = limit
        self.quota_type = quota_type
179 | 
180 | 
class AudioProcessingException(TTSException):
    """
    Error raised when audio processing fails.

    Covers format conversion issues, audio generation failures and
    problems while producing output.

    Attributes:
        audio_format: The audio format involved, when known
    """

    def __init__(self, message: str, audio_format: Optional[str] = None, **kwargs: Any) -> None:
        self.audio_format = audio_format
        super().__init__(message, **kwargs)
192 | 
193 | 
def create_exception_from_response(
    status_code: int, response_data: Dict[str, Any], default_message: str = "API request failed"
) -> APIException:
    """
    Create appropriate exception from API response.

    The message is taken from ``response_data["error"]["message"]`` when the
    error entry is a dict, from ``response_data["error"]`` itself when it is a
    non-empty string, and from ``default_message`` otherwise.

    Args:
        status_code: HTTP status code
        response_data: Response data from API
        default_message: Default message if none in response

    Returns:
        APIException: Appropriate exception instance (401 -> authentication,
        402 -> quota, 429 -> rate limit, 503 -> service unavailable,
        anything else -> plain APIException)
    """
    # Only look things up in a real dict; a None or non-dict payload must not
    # crash error reporting with an AttributeError.
    data = response_data if isinstance(response_data, dict) else {}

    error = data.get("error")
    if isinstance(error, dict):
        message = error.get("message", default_message)
    elif isinstance(error, str) and error:
        # Some payloads carry the error as a bare string
        message = error
    else:
        message = default_message

    if status_code == 401:
        return AuthenticationException(message, response_data=response_data)
    elif status_code == 402:
        return QuotaExceededException(message, response_data=response_data)
    elif status_code == 429:
        retry_after = data.get("retry_after")
        return RateLimitException(message, retry_after=retry_after, response_data=response_data)
    elif status_code == 503:
        retry_after = data.get("retry_after")
        return ServiceUnavailableException(
            message,
            retry_after=retry_after,
            response_data=response_data,
        )
    else:
        return APIException(message, status_code=status_code, response_data=response_data)
226 | 


--------------------------------------------------------------------------------
/ttsfm-web/i18n.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Internationalization (i18n) support for TTSFM Web Application
  3 | 
  4 | This module provides multi-language support for the Flask web application,
  5 | including language detection, translation management, and template functions.
  6 | """
  7 | 
  8 | import json
  9 | import os
 10 | from typing import Any, Dict, Optional
 11 | 
 12 | from flask import request, session
 13 | 
 14 | 
 15 | class LanguageManager:
 16 |     """Manages language detection, translation loading, and text translation."""
 17 | 
 18 |     def __init__(self, app=None, translations_dir: str = "translations"):
 19 |         """
 20 |         Initialize the LanguageManager.
 21 | 
 22 |         Args:
 23 |             app: Flask application instance
 24 |             translations_dir: Directory containing translation files
 25 |         """
 26 |         self.translations_dir = translations_dir
 27 |         self.translations: Dict[str, Dict[str, Any]] = {}
 28 |         self.supported_languages = ["en", "zh"]
 29 |         self.default_language = "en"
 30 | 
 31 |         if app is not None:
 32 |             self.init_app(app)
 33 | 
 34 |     def init_app(self, app):
 35 |         """Initialize the Flask application with i18n support."""
 36 |         app.config.setdefault("LANGUAGES", self.supported_languages)
 37 |         app.config.setdefault("DEFAULT_LANGUAGE", self.default_language)
 38 | 
 39 |         # Load translations
 40 |         self.load_translations()
 41 | 
 42 |         # Register template functions
 43 |         app.jinja_env.globals["_"] = self.translate
 44 |         app.jinja_env.globals["get_locale"] = self.get_locale
 45 |         app.jinja_env.globals["get_supported_languages"] = self.get_supported_languages
 46 | 
 47 |         # Store reference to this instance
 48 |         app.language_manager = self
 49 | 
 50 |     def load_translations(self):
 51 |         """Load all translation files from the translations directory."""
 52 |         translations_path = os.path.join(os.path.dirname(__file__), self.translations_dir)
 53 | 
 54 |         if not os.path.exists(translations_path):
 55 |             print(f"Warning: Translations directory not found: {translations_path}")
 56 |             return
 57 | 
 58 |         for lang_code in self.supported_languages:
 59 |             file_path = os.path.join(translations_path, f"{lang_code}.json")
 60 | 
 61 |             if os.path.exists(file_path):
 62 |                 try:
 63 |                     with open(file_path, "r", encoding="utf-8") as f:
 64 |                         self.translations[lang_code] = json.load(f)
 65 |                     print(f"Info: Loaded translations for language: {lang_code}")
 66 |                 except Exception as e:
 67 |                     print(f"Error: Failed to load translations for {lang_code}: {e}")
 68 |             else:
 69 |                 print(f"Warning: Translation file not found: {file_path}")
 70 | 
 71 |     def get_locale(self) -> str:
 72 |         """
 73 |         Get the current locale based on user preference, session, or browser settings.
 74 | 
 75 |         Returns:
 76 |             Language code (e.g., 'en', 'zh')
 77 |         """
 78 |         # 1. Check URL parameter (for language switching)
 79 |         if "lang" in request.args:
 80 |             lang = request.args.get("lang")
 81 |             if lang in self.supported_languages:
 82 |                 session["language"] = lang
 83 |                 return lang
 84 | 
 85 |         # 2. Check session (user's previous choice)
 86 |         if "language" in session:
 87 |             lang = session["language"]
 88 |             if lang in self.supported_languages:
 89 |                 return lang
 90 | 
 91 |         # 3. Check browser's Accept-Language header
 92 |         if request.headers.get("Accept-Language"):
 93 |             browser_langs = request.headers.get("Accept-Language").split(",")
 94 |             for browser_lang in browser_langs:
 95 |                 # Extract language code (e.g., 'zh-CN' -> 'zh')
 96 |                 lang_code = browser_lang.split(";")[0].split("-")[0].strip().lower()
 97 |                 if lang_code in self.supported_languages:
 98 |                     session["language"] = lang_code
 99 |                     return lang_code
100 | 
101 |         # 4. Fall back to default language
102 |         return self.default_language
103 | 
104 |     def set_locale(self, lang_code: str) -> bool:
105 |         """
106 |         Set the current locale.
107 | 
108 |         Args:
109 |             lang_code: Language code to set
110 | 
111 |         Returns:
112 |             True if successful, False if language not supported
113 |         """
114 |         if lang_code in self.supported_languages:
115 |             session["language"] = lang_code
116 |             return True
117 |         return False
118 | 
119 |     def translate(self, key: str, **kwargs) -> str:
120 |         """
121 |         Translate a text key to the current locale.
122 | 
123 |         Args:
124 |             key: Translation key in dot notation (e.g., 'nav.home')
125 |             **kwargs: Variables for string formatting
126 | 
127 |         Returns:
128 |             Translated text or the key if translation not found
129 |         """
130 |         locale = self.get_locale()
131 | 
132 |         # Get translation for current locale
133 |         translation = self._get_nested_value(self.translations.get(locale, {}), key)
134 | 
135 |         # Fall back to default language if not found
136 |         if translation is None and locale != self.default_language:
137 |             translation = self._get_nested_value(
138 |                 self.translations.get(self.default_language, {}), key
139 |             )
140 | 
141 |         # Fall back to key if still not found
142 |         if translation is None:
143 |             translation = key
144 | 
145 |         # Format with variables if provided
146 |         if kwargs and isinstance(translation, str):
147 |             try:
148 |                 translation = translation.format(**kwargs)
149 |             except (KeyError, ValueError):
150 |                 pass  # Ignore formatting errors
151 | 
152 |         return translation
153 | 
154 |     def _get_nested_value(self, data: Dict[str, Any], key: str) -> Optional[str]:
155 |         """
156 |         Get a nested value from a dictionary using dot notation.
157 | 
158 |         Args:
159 |             data: Dictionary to search in
160 |             key: Dot-separated key (e.g., 'nav.home')
161 | 
162 |         Returns:
163 |             Value if found, None otherwise
164 |         """
165 |         keys = key.split(".")
166 |         current = data
167 | 
168 |         for k in keys:
169 |             if isinstance(current, dict) and k in current:
170 |                 current = current[k]
171 |             else:
172 |                 return None
173 | 
174 |         return current if isinstance(current, str) else None
175 | 
176 |     def get_supported_languages(self) -> Dict[str, str]:
177 |         """
178 |         Get a dictionary of supported languages with their display names.
179 | 
180 |         Returns:
181 |             Dictionary mapping language codes to display names
182 |         """
183 |         return {"en": "English", "zh": "中文"}
184 | 
185 |     def get_language_info(self, lang_code: str) -> Dict[str, str]:
186 |         """
187 |         Get information about a specific language.
188 | 
189 |         Args:
190 |             lang_code: Language code
191 | 
192 |         Returns:
193 |             Dictionary with language information
194 |         """
195 |         language_names = {
196 |             "en": {"name": "English", "native": "English"},
197 |             "zh": {"name": "Chinese", "native": "中文"},
198 |         }
199 | 
200 |         return language_names.get(
201 |             lang_code, {"name": lang_code.upper(), "native": lang_code.upper()}
202 |         )
203 | 
204 | 
# Global instance shared by the module-level helper functions below.
# It is created without an app, so no translations are loaded until
# init_app() is called on it.
language_manager = LanguageManager()
207 | 
208 | 
def init_i18n(app):
    """Attach the module-level language manager to ``app`` and return that manager."""
    manager = language_manager
    manager.init_app(app)
    return manager
213 | 
214 | 
215 | # Template helper functions
def _(key: str, **kwargs) -> str:
    """Translate ``key`` via the module-level language manager, forwarding format variables."""
    translated = language_manager.translate(key, **kwargs)
    return translated
219 | 
220 | 
def get_locale() -> str:
    """Return the locale resolved by the module-level language manager."""
    current = language_manager.get_locale()
    return current
224 | 
225 | 
def set_locale(lang_code: str) -> bool:
    """Ask the module-level language manager to switch to ``lang_code``; return its result."""
    accepted = language_manager.set_locale(lang_code)
    return accepted
229 | 


--------------------------------------------------------------------------------
/ttsfm-web/static/js/i18n.js:
--------------------------------------------------------------------------------
  1 | // JavaScript Internationalization Support for TTSFM
  2 | 
// Translation data - this will be populated by the server.
// An object already present on window is kept; otherwise start empty.
window.i18nData = window.i18nData || {};

// Current locale, taken from the <html lang="..."> attribute, defaulting to 'en'
window.currentLocale = document.documentElement.lang || 'en';
  8 | 
  9 | // Translation function
 10 | function _(key, params = {}) {
 11 |     const keys = key.split('.');
 12 |     let value = window.i18nData;
 13 |     
 14 |     // Navigate through the nested object
 15 |     for (const k of keys) {
 16 |         if (value && typeof value === 'object' && k in value) {
 17 |             value = value[k];
 18 |         } else {
 19 |             // Fallback to key if translation not found
 20 |             return key;
 21 |         }
 22 |     }
 23 |     
 24 |     // If we found a string, apply parameters
 25 |     if (typeof value === 'string') {
 26 |         return formatString(value, params);
 27 |     }
 28 |     
 29 |     // Fallback to key
 30 |     return key;
 31 | }
 32 | 
 33 | // Format string with parameters
 34 | function formatString(str, params) {
 35 |     return str.replace(/\{(\w+)\}/g, (match, key) => {
 36 |         return params.hasOwnProperty(key) ? params[key] : match;
 37 |     });
 38 | }
 39 | 
 40 | // Load translations from server
 41 | async function loadTranslations() {
 42 |     try {
 43 |         const response = await fetch(`/api/translations/${window.currentLocale}`);
 44 |         if (response.ok) {
 45 |             window.i18nData = await response.json();
 46 |         }
 47 |     } catch (error) {
 48 |         console.warn('Failed to load translations:', error);
 49 |     }
 50 | }
 51 | 
// Sample texts for different languages.
// Keyed first by locale ('en', 'zh'), then by sample type
// (welcome, story, technical, multilingual, long).
const sampleTexts = {
    en: {
        welcome: "Welcome to TTSFM! This is a free text-to-speech service that converts your text into high-quality audio using advanced AI technology.",
        story: "Once upon a time, in a digital world far away, there lived a small Python package that could transform any text into beautiful speech. This package was called TTSFM, and it brought joy to developers everywhere.",
        technical: "TTSFM is a Python client for text-to-speech APIs that provides both synchronous and asynchronous interfaces. It supports multiple voices and audio formats, making it perfect for various applications.",
        multilingual: "TTSFM supports multiple languages and voices, allowing you to create diverse audio content for global audiences. The service is completely free and requires no API keys.",
        long: "This is a longer text sample designed to test the auto-combine feature of TTSFM. When text exceeds the maximum length limit, TTSFM automatically splits it into smaller chunks, generates audio for each chunk, and then seamlessly combines them into a single audio file. This process is completely transparent to the user and ensures that you can convert text of any length without worrying about technical limitations. The resulting audio maintains consistent quality and natural flow throughout the entire content."
    },
    zh: {
        welcome: "欢迎使用TTSFM！这是一个免费的文本转语音服务，使用先进的AI技术将您的文本转换为高质量音频。",
        story: "很久很久以前，在一个遥远的数字世界里，住着一个小小的Python包，它能够将任何文本转换成美妙的语音。这个包叫做TTSFM，它为世界各地的开发者带来了快乐。",
        technical: "TTSFM是一个用于文本转语音API的Python客户端，提供同步和异步接口。它支持多种声音和音频格式，非常适合各种应用。",
        multilingual: "TTSFM支持多种语言和声音，让您能够为全球受众创建多样化的音频内容。该服务完全免费，无需API密钥。",
        long: "这是一个较长的文本示例，用于测试TTSFM的自动合并功能。当文本超过最大长度限制时，TTSFM会自动将其分割成较小的片段，为每个片段生成音频，然后无缝地将它们合并成一个音频文件。这个过程对用户完全透明，确保您可以转换任何长度的文本，而无需担心技术限制。生成的音频在整个内容中保持一致的质量和自然的流畅性。"
    }
};
 69 | 
 70 | // Get sample text for current locale
 71 | function getSampleText(type) {
 72 |     const locale = window.currentLocale;
 73 |     const texts = sampleTexts[locale] || sampleTexts.en;
 74 |     return texts[type] || texts.welcome;
 75 | }
 76 | 
// Error messages, keyed by locale and then by message id.
// Looked up through getErrorMessage().
const errorMessages = {
    en: {
        empty_text: "Please enter some text to convert.",
        generation_failed: "Failed to generate speech. Please try again.",
        network_error: "Network error. Please check your connection and try again.",
        invalid_format: "Invalid audio format selected.",
        invalid_voice: "Invalid voice selected.",
        text_too_long: "Text is too long. Please reduce the length or enable auto-combine.",
        server_error: "Server error. Please try again later."
    },
    zh: {
        empty_text: "请输入要转换的文本。",
        generation_failed: "语音生成失败。请重试。",
        network_error: "网络错误。请检查您的连接并重试。",
        invalid_format: "选择的音频格式无效。",
        invalid_voice: "选择的声音无效。",
        text_too_long: "文本太长。请减少长度或启用自动合并。",
        server_error: "服务器错误。请稍后重试。"
    }
};
 98 | 
// Success messages, keyed by locale and then by message id.
// Looked up through getSuccessMessage().
const successMessages = {
    en: {
        generation_complete: "Speech generated successfully!",
        text_copied: "Text copied to clipboard!",
        download_started: "Download started!"
    },
    zh: {
        generation_complete: "语音生成成功！",
        text_copied: "文本已复制到剪贴板！",
        download_started: "下载已开始！"
    }
};
112 | 
/**
 * Return the localized error message for a message id.
 * Unknown locales use the English table; unknown ids are returned unchanged.
 *
 * @param {string} key - Message id, e.g. 'network_error'
 * @returns {string}
 */
function getErrorMessage(key) {
    const table = errorMessages[window.currentLocale] || errorMessages.en;
    const text = table[key];
    return text || key;
}
119 | 
/**
 * Return the localized success message for a message id.
 * Unknown locales use the English table; unknown ids are returned unchanged.
 *
 * @param {string} key - Message id, e.g. 'text_copied'
 * @returns {string}
 */
function getSuccessMessage(key) {
    const table = successMessages[window.currentLocale] || successMessages.en;
    const text = table[key];
    return text || key;
}
126 | 
/**
 * Format a byte count as a human-readable size in the current locale.
 *
 * Values are scaled by 1024 and capped at the largest known unit (GB), so
 * very large inputs read e.g. '1024 GB'. Zero, negative and non-numeric
 * input is reported as zero bytes.
 *
 * @param {number} bytes - Size in bytes
 * @returns {string} e.g. '1.5 KB'
 */
function formatFileSize(bytes) {
    const k = 1024;
    const sizes = window.currentLocale === 'zh'
        ? ['字节', 'KB', 'MB', 'GB']
        : ['Bytes', 'KB', 'MB', 'GB'];

    const value = Number(bytes);
    // Math.log is undefined for <= 0 and NaN; report those as zero, using the
    // localized unit so the zero case matches every other result
    if (!Number.isFinite(value) || value <= 0) {
        return '0 ' + sizes[0];
    }

    // Clamp the unit index: fractional byte counts would give -1 and values
    // of 1024^4 or more would run past the end of the unit list
    const rawIndex = Math.floor(Math.log(value) / Math.log(k));
    const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);

    // parseFloat drops trailing zeros left by toFixed (1.50 -> 1.5)
    return parseFloat((value / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
139 | 
/**
 * Format a duration in seconds as minutes and seconds in the current locale.
 *
 * @param {number} seconds - Duration in seconds
 * @returns {string} e.g. '1m 15s' / '1分15秒'; the localized word for
 *     'Unknown' when the input is negative, NaN or infinite
 */
function formatDuration(seconds) {
    const isZh = window.currentLocale === 'zh';
    const total = Number(seconds);

    // Number.isFinite also rejects Infinity, which isNaN() lets through and
    // which would otherwise render as "Infinitym NaNs"
    if (!Number.isFinite(total) || total < 0) {
        return isZh ? '未知' : 'Unknown';
    }

    const minutes = Math.floor(total / 60);
    const remainingSeconds = Math.floor(total % 60);

    // The minutes part is omitted for durations under one minute
    if (minutes > 0) {
        return isZh
            ? `${minutes}分${remainingSeconds}秒`
            : `${minutes}m ${remainingSeconds}s`;
    }
    return isZh
        ? `${remainingSeconds}秒`
        : `${remainingSeconds}s`;
}
159 | 
/**
 * Refresh locale-dependent UI text: the generate button label (only while
 * the button is enabled) and the character counter caption.
 */
function updateUIText() {
    const isZh = window.currentLocale === 'zh';

    // Button label
    const generateBtn = document.getElementById('generate-btn');
    if (generateBtn && !generateBtn.disabled) {
        generateBtn.innerHTML = isZh
            ? '<i class="fas fa-magic me-2"></i>生成语音'
            : '<i class="fas fa-magic me-2"></i>Generate Speech';
    }

    // Character counter caption: rebuilt inside the counter's parent element
    const counter = document.querySelector('#char-count');
    if (!counter) {
        return;
    }
    const parent = counter.parentElement;
    if (!parent) {
        return;
    }

    // Escape HTML characters to prevent XSS, since the text goes through innerHTML
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#x27;'
    };
    const escapedCount = counter.textContent.replace(/[&<>"']/g, (ch) => entities[ch]);

    const prefix = `<i class="fas fa-keyboard me-1"></i><span id="char-count">${escapedCount}</span>`;
    parent.innerHTML = isZh ? `${prefix} 字符` : `${prefix} characters`;
}
189 | 
/**
 * Set up i18n for the page: start loading translations, refresh the UI text
 * once, and react to 'languageChanged' events by switching the locale,
 * reloading translations and refreshing the UI text again.
 */
function initI18n() {
    // Kick off the initial load; the first UI refresh does not wait for it
    loadTranslations();
    updateUIText();

    const onLanguageChanged = (event) => {
        window.currentLocale = event.detail.locale;
        loadTranslations().then(() => updateUIText());
    };
    document.addEventListener('languageChanged', onLanguageChanged);
}
206 | 
// Export functions for global use (other scripts call these via window)
window._ = _;
window.getSampleText = getSampleText;
window.getErrorMessage = getErrorMessage;
window.getSuccessMessage = getSuccessMessage;
window.formatFileSize = formatFileSize;
window.formatDuration = formatDuration;
window.initI18n = initI18n;

// Auto-initialize when DOM is ready: wait for DOMContentLoaded if the
// document is still loading, otherwise initialize immediately
if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', initI18n);
} else {
    initI18n();
}
222 | 


--------------------------------------------------------------------------------
/ttsfm/models.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Data models and types for the TTSFM package.
  3 | 
  4 | This module defines the core data structures used throughout the package,
  5 | including request/response models, enums, and error types.
  6 | """
  7 | 
  8 | from dataclasses import dataclass
  9 | from datetime import datetime
 10 | from enum import Enum
 11 | from typing import Any, Dict, Optional, Union
 12 | 
 13 | 
class Voice(str, Enum):
    """
    Available voice options for TTS generation.

    Members are also ``str`` instances whose value is the lowercase
    voice name.
    """

    ALLOY = "alloy"
    ASH = "ash"
    BALLAD = "ballad"
    CORAL = "coral"
    ECHO = "echo"
    FABLE = "fable"
    NOVA = "nova"
    ONYX = "onyx"
    SAGE = "sage"
    SHIMMER = "shimmer"
    VERSE = "verse"
 28 | 
 29 | 
class AudioFormat(str, Enum):
    """
    Supported audio output formats.

    Members are also ``str`` instances whose value is the lowercase
    format name.
    """

    MP3 = "mp3"
    WAV = "wav"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
    PCM = "pcm"
 39 | 
 40 | 
 41 | @dataclass
 42 | class TTSRequest:
 43 |     """
 44 |     Request model for TTS generation.
 45 | 
 46 |     Attributes:
 47 |         input: Text to convert to speech
 48 |         voice: Voice to use for generation
 49 |         response_format: Audio format for output
 50 |         instructions: Optional instructions for voice modulation
 51 |         model: Model to use (for OpenAI compatibility, usually ignored)
 52 |         speed: Speech speed (for OpenAI compatibility, usually ignored)
 53 |         max_length: Maximum allowed text length (default: 1000 characters)
 54 |         validate_length: Whether to validate text length (default: True)
 55 |     """
 56 | 
 57 |     input: str
 58 |     voice: Union[Voice, str] = Voice.ALLOY
 59 |     response_format: Union[AudioFormat, str] = AudioFormat.MP3
 60 |     instructions: Optional[str] = None
 61 |     model: Optional[str] = None
 62 |     speed: Optional[float] = None
 63 |     max_length: int = 1000
 64 |     validate_length: bool = True
 65 | 
 66 |     def __post_init__(self) -> None:
 67 |         """Validate and normalize fields after initialization."""
 68 |         if self.max_length > 1000:
 69 |             self.max_length = 1000
 70 |         # Ensure voice is a valid Voice enum
 71 |         if isinstance(self.voice, str):
 72 |             try:
 73 |                 self.voice = Voice(self.voice.lower())
 74 |             except ValueError:
 75 |                 raise ValueError(f"Invalid voice: {self.voice}. Must be one of {list(Voice)}")
 76 | 
 77 |         # Ensure response_format is a valid AudioFormat enum
 78 |         if isinstance(self.response_format, str):
 79 |             try:
 80 |                 self.response_format = AudioFormat(self.response_format.lower())
 81 |             except ValueError:
 82 |                 raise ValueError(
 83 |                     f"Invalid format: {self.response_format}. Must be one of {list(AudioFormat)}"
 84 |                 )
 85 | 
 86 |         # Validate input text
 87 |         if not self.input or not self.input.strip():
 88 |             raise ValueError("Input text cannot be empty")
 89 | 
 90 |         # Validate text length if enabled
 91 |         if self.validate_length:
 92 |             text_length = len(self.input)
 93 |             if text_length > self.max_length:
 94 |                 raise ValueError(
 95 |                     f"Input text is too long ({text_length} characters). "
 96 |                     f"Maximum allowed length is {self.max_length} characters. "
 97 |                     f"Consider splitting your text into smaller chunks or disable "
 98 |                     f"length validation with validate_length=False."
 99 |                 )
100 | 
101 |         # Validate max_length parameter
102 |         if self.max_length <= 0:
103 |             raise ValueError("max_length must be a positive integer")
104 | 
105 |         # Validate speed if provided
106 |         if self.speed is not None and (self.speed < 0.25 or self.speed > 4.0):
107 |             raise ValueError("Speed must be between 0.25 and 4.0")
108 | 
109 |     def to_dict(self) -> Dict[str, Any]:
110 |         """Convert request to dictionary for API calls."""
111 |         data: Dict[str, Any] = {
112 |             "input": self.input,
113 |             "voice": self.voice.value if isinstance(self.voice, Voice) else self.voice,
114 |             "response_format": (
115 |                 self.response_format.value
116 |                 if isinstance(self.response_format, AudioFormat)
117 |                 else self.response_format
118 |             ),
119 |         }
120 | 
121 |         if self.instructions:
122 |             data["instructions"] = self.instructions
123 | 
124 |         if self.model:
125 |             data["model"] = self.model
126 | 
127 |         if self.speed is not None:
128 |             data["speed"] = self.speed
129 | 
130 |         return data
131 | 
132 | 
@dataclass
class TTSResponse:
    """
    Response model for TTS generation.

    Attributes:
        audio_data: Generated audio as bytes
        content_type: MIME type of the audio data
        format: Audio format used
        size: Size of audio data in bytes
        duration: Estimated duration in seconds (if available)
        metadata: Additional response metadata
    """

    audio_data: bytes
    content_type: str
    format: AudioFormat
    size: int
    duration: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self) -> None:
        """Post-initialization hook; currently a no-op (fields are stored as given)."""
        return None

    def save_to_file(self, filename: str) -> str:
        """
        Write the audio bytes to disk, forcing the extension to match ``format``.

        A filename that already ends with the extension of the returned format
        is used unchanged. Otherwise one trailing known audio extension (if
        any) is dropped and the correct extension is appended. Missing parent
        directories are created.

        Args:
            filename: Target filename (extension will be added if missing)

        Returns:
            str: Final filename used
        """
        import os

        # The extension follows the format actually returned, not the one requested.
        suffix = f".{self.format.value}"
        target = filename

        if not target.endswith(suffix):
            known_suffixes = (".mp3", ".wav", ".opus", ".aac", ".flac", ".pcm")
            # Strip at most one recognised audio extension before appending ours.
            stale = next((ext for ext in known_suffixes if target.endswith(ext)), None)
            if stale is not None:
                target = target[: -len(stale)]
            target = f"{target}{suffix}"

        # A bare filename has no directory part; fall back to the current directory.
        parent_dir = os.path.dirname(target) or "."
        os.makedirs(parent_dir, exist_ok=True)

        with open(target, "wb") as sink:
            sink.write(self.audio_data)

        return target
198 | 
199 | 
@dataclass
class TTSError:
    """
    Error information from TTS API.

    Attributes:
        code: Error code
        message: Human-readable error message
        type: Error type/category
        details: Additional error details
        timestamp: When the error occurred (filled in with the current local
            time at construction when omitted)
    """

    code: str
    message: str
    type: Optional[str] = None
    details: Optional[Dict[str, Any]] = None
    timestamp: Optional[datetime] = None

    def __post_init__(self) -> None:
        """Stamp the error with the current time unless a timestamp was supplied."""
        if self.timestamp is not None:
            return
        self.timestamp = datetime.now()
223 | 
224 | 
@dataclass
class APIError(TTSError):
    """API-specific error information.

    Adds response-level fields on top of the common ``TTSError`` fields.
    """

    # Status code reported for the failed call (presumably the HTTP status); defaults to 500.
    status_code: int = 500
    # Header name -> value mapping attached to the error, if any.
    headers: Optional[Dict[str, str]] = None
231 | 
232 | 
@dataclass
class NetworkError(TTSError):
    """Network-related error information.

    Adds connection-level fields on top of the common ``TTSError`` fields.
    """

    # Timeout value associated with the failure, if known (presumably seconds - confirm with callers).
    timeout: Optional[float] = None
    # Number of retries recorded for the failed operation; defaults to 0.
    retry_count: int = 0
239 | 
240 | 
@dataclass
class ValidationError(TTSError):
    """Validation error information.

    Adds the offending field and value on top of the common ``TTSError`` fields.
    """

    # Name of the field that failed validation, if known.
    field: Optional[str] = None
    # The rejected value, if captured.
    value: Optional[Any] = None
247 | 
248 | 
# MIME content type used for each supported audio format.
CONTENT_TYPE_MAP: Dict[AudioFormat, str] = {
    AudioFormat.MP3: "audio/mpeg",
    AudioFormat.OPUS: "audio/opus",
    AudioFormat.AAC: "audio/aac",
    AudioFormat.FLAC: "audio/flac",
    AudioFormat.WAV: "audio/wav",
    AudioFormat.PCM: "audio/pcm",
}

# Inverse lookup (MIME content type -> audio format), derived from the map above
# so the two can never disagree.
FORMAT_FROM_CONTENT_TYPE: Dict[str, AudioFormat] = {
    mime_type: audio_format for audio_format, mime_type in CONTENT_TYPE_MAP.items()
}
261 | 
262 | 
def get_content_type(format: Union[AudioFormat, str]) -> str:
    """Return the MIME content type for an audio format.

    Args:
        format: An ``AudioFormat`` member or a format name (case-insensitive).

    Returns:
        The mapped content type, or ``"audio/mpeg"`` if the format has no entry.

    Raises:
        ValueError: If a string is given that is not a known format name.
    """
    # Strings are normalised to an enum member first (unknown names raise here).
    audio_format = AudioFormat(format.lower()) if isinstance(format, str) else format
    return CONTENT_TYPE_MAP.get(audio_format, "audio/mpeg")
268 | 
269 | 
def get_format_from_content_type(content_type: str) -> AudioFormat:
    """Get audio format from MIME content type.

    The lookup ignores letter case, surrounding whitespace and any media-type
    parameters, so header values such as ``"audio/wav; charset=binary"`` or
    ``"Audio/WAV"`` resolve to the same format as ``"audio/wav"``.

    Args:
        content_type: Content-Type value, optionally with parameters.

    Returns:
        The matching ``AudioFormat``; ``AudioFormat.MP3`` when the value is
        empty or not recognised.
    """
    if not content_type:
        return AudioFormat.MP3

    # Keep only the "type/subtype" part; parameters follow the first ';'.
    media_type = content_type.split(";", 1)[0].strip().lower()
    return FORMAT_FROM_CONTENT_TYPE.get(media_type, AudioFormat.MP3)
273 | 


--------------------------------------------------------------------------------
/ttsfm-web/translations/zh.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nav": {
  3 |     "home": "首页",
  4 |     "playground": "试用平台",
  5 |     "documentation": "文档",
  6 |     "github": "GitHub",
  7 |     "status_checking": "检查中...",
  8 |     "status_online": "在线",
  9 |     "status_offline": "离线"
 10 |   },
 11 |   "common": {
 12 |     "loading": "加载中...",
 13 |     "error": "错误",
 14 |     "success": "成功",
 15 |     "warning": "警告",
 16 |     "info": "信息",
 17 |     "close": "关闭",
 18 |     "save": "保存",
 19 |     "cancel": "取消",
 20 |     "confirm": "确认",
 21 |     "download": "下载",
 22 |     "upload": "上传",
 23 |     "generate": "生成",
 24 |     "play": "播放",
 25 |     "stop": "停止",
 26 |     "pause": "暂停",
 27 |     "resume": "继续",
 28 |     "clear": "清除",
 29 |     "reset": "重置",
 30 |     "copy": "复制",
 31 |     "copied": "已复制！",
 32 |     "language": "语言",
 33 |     "english": "English",
 34 |     "chinese": "中文",
 35 |     "validate": "验证",
 36 |     "options": "选项",
 37 |     "max_length": "最大长度",
 38 |     "tip": "提示",
 39 |     "choose_voice": "从可用声音中选择",
 40 |     "select_format": "选择您偏好的音频格式",
 41 |     "loading_voices": "加载声音中...",
 42 |     "loading_formats": "加载格式中...",
 43 |     "ctrl_enter_tip": "使用 Ctrl+Enter 生成",
 44 |     "auto_combine_enabled": "自动合并已启用",
 45 |     "demo": "演示",
 46 |     "clear_text": "清除文本",
 47 |     "tip_ctrl_enter": "提示：使用 Ctrl+Enter 生成",
 48 |     "ready": "就绪",
 49 |     "replay_audio": "重播音频",
 50 |     "share_audio": "分享音频",
 51 |     "browser_no_audio_support": "您的浏览器不支持音频元素。",
 52 |     "generating_speech": "生成语音中...",
 53 |     "streaming": "流式传输",
 54 |     "chars": "字符",
 55 |     "generated": "已生成"
 56 |   },
 57 |   "home": {
 58 |     "title": "免费的Python文本转语音",
 59 |     "subtitle": "使用免费的openai.fm服务从文本生成高质量语音。无需API密钥，无需注册 - 只需安装即可开始创建音频。",
 60 |     "try_demo": "试用演示",
 61 |     "documentation": "文档",
 62 |     "github": "GitHub",
 63 |     "features_title": "主要特性",
 64 |     "features_subtitle": "简单、免费且强大的Python开发者文本转语音工具。",
 65 |     "feature_free_title": "完全免费",
 66 |     "feature_free_desc": "无需API密钥或注册。使用免费的openai.fm服务。",
 67 |     "feature_voices_title": "11种声音",
 68 |     "feature_voices_desc": "提供所有OpenAI兼容的声音，适用于不同使用场景。",
 69 |     "feature_formats_title": "6种音频格式",
 70 |     "feature_formats_desc": "支持MP3、WAV、OPUS、AAC、FLAC和PCM格式，适用于任何应用。",
 71 |     "feature_docker_title": "Docker就绪",
 72 |     "feature_docker_desc": "一键部署，包含Web界面和API端点。",
 73 |     "feature_openai_title": "OpenAI兼容",
 74 |     "feature_openai_desc": "OpenAI TTS API的直接替代品，支持长文本自动合并。",
 75 |     "feature_async_title": "异步和同步",
 76 |     "feature_async_desc": "提供asyncio和同步客户端，最大化灵活性。",
 77 |     "quick_start_title": "快速开始",
 78 |     "installation_title": "安装",
 79 |     "installation_code": "pip install ttsfm",
 80 |     "usage_title": "基本用法",
 81 |     "docker_title": "Docker部署",
 82 |     "docker_desc": "运行带有Web界面的TTSFM：",
 83 |     "api_title": "OpenAI兼容API",
 84 |     "api_desc": "与OpenAI Python客户端一起使用：",
 85 |     "footer_copyright": "© 2024 dbcccc"
 86 |   },
 87 |   "playground": {
 88 |     "title": "交互式TTS试用平台",
 89 |     "subtitle": "实时测试不同的声音和音频格式",
 90 |     "text_input_label": "要转换的文本",
 91 |     "text_input_placeholder": "输入您想要转换为语音的文本...",
 92 |     "voice_label": "声音",
 93 |     "format_label": "音频格式",
 94 |     "instructions_label": "声音指令（可选）",
 95 |     "instructions_placeholder": "语音生成的额外指令...",
 96 |     "character_count": "字符",
 97 |     "max_length_warning": "文本超过最大长度。将自动分割并合并。",
 98 |     "generate_speech": "生成语音",
 99 |     "generating": "生成中...",
100 |     "download_audio": "下载音频",
101 |     "audio_player_title": "生成的音频",
102 |     "file_size": "文件大小",
103 |     "duration": "时长",
104 |     "format": "格式",
105 |     "voice": "声音",
106 |     "chunks_combined": "合并片段",
107 |     "random_text": "随机文本",
108 |     "clear_text": "清除文本",
109 |     "max_length_description": "每个请求的最大字符数（默认：1000）",
110 |     "enable_length_validation": "启用长度验证",
111 |     "auto_combine_long_text": "自动合并长文本",
112 |     "auto_combine_tooltip": "自动分割长文本并将音频片段合并为单个文件",
113 |     "auto_combine_description": "自动处理超过限制的文本",
114 |     "instructions_description": "为声音调制提供可选指令",
115 |     "api_key_optional": "API密钥（可选）",
116 |     "api_key_placeholder": "如果需要，请输入您的API密钥",
117 |     "api_key_description": "仅在服务器启用API密钥保护时需要",
118 |     "sample_texts": {
119 |       "welcome": "欢迎使用TTSFM！这是一个免费的文本转语音服务，使用先进的AI技术将您的文本转换为高质量音频。",
120 |       "story": "很久很久以前，在一个遥远的数字世界里，住着一个小小的Python包，它能够将任何文本转换成美妙的语音。这个包叫做TTSFM，它为世界各地的开发者带来了快乐。",
121 |       "technical": "TTSFM是一个用于文本转语音API的Python客户端，提供同步和异步接口。它支持多种声音和音频格式，非常适合各种应用。",
122 |       "multilingual": "TTSFM支持多种语言和声音，让您能够为全球受众创建多样化的音频内容。该服务完全免费，无需API密钥。",
123 |       "long": "这是一个较长的文本示例，用于测试TTSFM的自动合并功能。当文本超过最大长度限制时，TTSFM会自动将其分割成较小的片段，为每个片段生成音频，然后无缝地将它们合并成一个音频文件。这个过程对用户完全透明，确保您可以转换任何长度的文本，而无需担心技术限制。生成的音频在整个内容中保持一致的质量和自然的流畅性。"
124 |     },
125 |     "error_messages": {
126 |       "empty_text": "请输入要转换的文本。",
127 |       "generation_failed": "语音生成失败。请重试。",
128 |       "network_error": "网络错误。请检查您的连接并重试。",
129 |       "invalid_format": "选择的音频格式无效。",
130 |       "invalid_voice": "选择的声音无效。",
131 |       "text_too_long": "文本太长。请减少长度或启用自动合并。",
132 |       "server_error": "服务器错误。请稍后重试。"
133 |     },
134 |     "success_messages": {
135 |       "generation_complete": "语音生成成功！",
136 |       "text_copied": "文本已复制到剪贴板！",
137 |       "download_started": "下载已开始！"
138 |     },
139 |     "speed_label": "播放速度",
140 |     "speed_description": "调整音频播放速度，从 0.25x（较慢）到 4.0x（较快）。默认为 1.0x（正常速度）。",
141 |     "speed": "速度",
142 |     "chunks": "片段",
143 |     "format_description": "选择音频输出格式。转换格式需要 ffmpeg。",
144 |     "enable_websocket_streaming": "启用 WebSocket 流式传输",
145 |     "realtime_audio_chunks": "（实时音频片段）",
146 |     "streaming_progress": "流式传输进度",
147 |     "stream_speech": "流式生成语音",
148 |     "streaming_complete": "流式传输完成",
149 |     "streaming_ready": "流式传输就绪",
150 |     "streaming_active": "流式传输中...",
151 |     "streaming_offline": "流式传输离线",
152 |     "chunks_label": "片段：",
153 |     "total_size_label": "总大小：",
154 |     "time_label": "时间：",
155 |     "format_label_colon": "格式：",
156 |     "connection_error": "连接错误",
157 |     "chunks_heading": "片段",
158 |     "data_heading": "数据",
159 |     "time_heading": "时间",
160 |     "chunk_title": "片段"
161 |   },
162 |   "docs": {
163 |     "title": "API文档",
164 |     "subtitle": "TTSFM文本转语音API的完整参考。免费、简单且强大。",
165 |     "contents": "目录",
166 |     "overview": "概述",
167 |     "authentication": "身份验证",
168 |     "text_validation": "文本验证",
169 |     "endpoints": "API端点",
170 |     "voices": "声音",
171 |     "formats": "音频格式",
172 |     "generate": "生成语音",
173 |     "combined": "合并音频",
174 |     "status": "状态和健康检查",
175 |     "errors": "错误处理",
176 |     "examples": "代码示例",
177 |     "python_package": "Python包",
178 |     "overview_title": "概述",
179 |     "overview_desc": "TTSFM API提供现代的、OpenAI兼容的文本转语音生成接口。它支持多种声音、音频格式，并包含高级功能，如文本长度验证和智能自动合并功能。",
180 |     "base_url": "基础URL：",
181 |     "key_features": "主要特性",
182 |     "feature_voices": "11种不同的声音选项 - 从alloy、echo、nova等中选择",
183 |     "feature_formats": "多种音频格式 - 支持MP3、WAV、OPUS、AAC、FLAC、PCM",
184 |     "feature_openai": "OpenAI兼容性 - OpenAI TTS API的直接替代品",
185 |     "feature_auto_combine": "自动合并功能 - 自动处理长文本（>1000字符），通过分割和合并音频",
186 |     "feature_validation": "文本长度验证 - 智能验证，可配置限制",
187 |     "feature_monitoring": "实时监控 - 状态端点和健康检查",
188 |     "new_version": "v3.3.1新功能：",
189 |     "new_version_desc": "运行时镜像现已内置 ffmpeg，MP3 自动合并可立即使用；默认长文本上限调整为 1000 字符，保证播报行为一致。",
190 |     "authentication_title": "身份验证",
191 |     "authentication_desc": "目前，API支持可选的API密钥身份验证。如果已配置，请在请求头中包含您的API密钥。",
192 |     "text_validation_title": "文本长度验证",
193 |     "text_validation_desc": "TTSFM包含内置的文本长度验证，以确保与TTS模型的兼容性。默认最大长度为1000个字符，但可以自定义。",
194 |     "important": "重要：",
195 |     "text_validation_warning": "超过最大长度的文本将被拒绝，除非禁用验证或将文本分割成块。",
196 |     "validation_options": "验证选项",
197 |     "max_length_option": "允许的最大字符数（默认：1000）",
198 |     "validate_length_option": "启用/禁用验证（默认：true）",
199 |     "preserve_words_option": "分块时避免分割单词（默认：true）",
200 |     "endpoints_title": "API端点",
201 |     "get_voices_desc": "获取可用声音列表。",
202 |     "get_formats_desc": "获取支持的音频格式列表。",
203 |     "validate_text_desc": "验证文本长度并获取分割建议。",
204 |     "generate_speech_desc": "从文本生成语音。",
205 |     "response_example": "响应示例：",
206 |     "request_body": "请求体：",
207 |     "parameters": "参数：",
208 |     "text_param": "要转换为语音的文本",
209 |     "voice_param": "声音ID（默认：\"alloy\"）",
210 |     "format_param": "音频格式（默认：\"mp3\"）",
211 |     "instructions_param": "声音调制指令",
212 |     "max_length_param": "最大文本长度（默认：1000）",
213 |     "validate_length_param": "启用验证（默认：true）",
214 |     "response": "响应：",
215 |     "response_audio": "返回带有适当Content-Type头的音频文件。",
216 |     "response_combined_audio": "返回包含所有块无缝合并的单个音频文件。",
217 |     "required": "必需",
218 |     "optional": "可选",
219 |     "python_package_title": "Python包",
220 |     "long_text_support": "长文本支持",
221 |     "long_text_desc": "TTSFM Python包包含内置的长文本分割功能，为需要精细控制的开发者提供支持：",
222 |     "developer_features": "开发者功能：",
223 |     "manual_splitting": "手动分割：对高级用例的文本分块进行完全控制",
224 |     "word_preservation": "单词保护：维护单词边界以获得自然语音",
225 |     "separate_files": "单独文件：每个块保存为单独的音频文件",
226 |     "cli_support": "CLI支持：使用`--split-long-text`标志进行命令行使用",
227 |     "note": "注意：",
228 |     "auto_combine_note": "对于Web用户，建议使用`/v1/audio/speech`中的自动合并功能，因为它会自动处理长文本并返回单个无缝音频文件。",
229 |     "combined_audio_desc": "从长文本生成单个合并的音频文件。自动将文本分割成块，为每个块生成语音，并将它们合并成一个无缝的音频文件。",
230 |     "response_headers": "响应头：",
231 |     "chunks_combined_header": "合并的块数",
232 |     "original_text_length_header": "原始文本长度（字符数）",
233 |     "audio_size_header": "最终音频文件大小（字节）",
234 |     "openai_compatible_desc": "增强的OpenAI兼容端点，具有自动合并功能。在需要时自动处理长文本，通过分割和合并音频块。",
235 |     "enhanced_parameters": "增强参数：",
236 |     "auto_combine_param": "自动分割长文本并将音频块合并为单个文件",
237 |     "auto_combine_false": "如果文本超过max_length则返回错误（标准OpenAI行为）",
238 |     "max_length_chunk_param": "分割时每个块的最大字符数",
239 |     "auto_combine_header": "是否启用了自动合并（true/false）",
240 |     "chunks_combined_response": "合并的音频块数（短文本为1）",
241 |     "original_text_response": "原始文本长度（用于长文本处理）",
242 |     "audio_format_header": "响应的音频格式",
243 |     "audio_size_response": "音频文件大小（字节）",
244 |     "short_text_comment": "短文本（正常工作）",
245 |     "long_text_auto_comment": "带自动合并的长文本（默认）",
246 |     "long_text_no_auto_comment": "不带自动合并的长文本（将出错）",
247 |     "audio_combination": "音频合并：",
248 |     "audio_combination_desc": "在可用时使用高级音频处理（PyDub），在不同环境中具有智能回退。支持所有音频格式。",
249 |     "use_cases": "使用场景：",
250 |     "use_case_articles": "长文章：将博客文章或文章转换为单个音频文件",
251 |     "use_case_audiobooks": "有声书：将章节生成为单个音频文件",
252 |     "use_case_podcasts": "播客：从脚本创建播客剧集",
253 |     "use_case_education": "教育内容：将学习材料转换为音频",
254 |     "example_usage": "使用示例：",
255 |     "python_example_comment": "Python示例",
256 |     "operations": "运行须知",
257 |     "operational_title": "运行须知",
258 |     "operational_limit": "启用 auto_combine 时，超过 1000 字符的请求会自动拆分；若需自行控制分块，可关闭校验。",
259 |     "operational_fallback": "MP3 请求保持 MP3，OPUS/AAC/FLAC/WAV/PCM 等格式会回退为 WAV，确保播放稳定。",
260 |     "operational_backend": "语音由第三方 openai.fm 提供，服务可能波动，请在业务中预留降级策略。",
261 |     "operational_ffmpeg": "Docker 镜像已内置 ffmpeg，无需额外配置即可完成 MP3 自动合并。"
262 |   }
263 | }


--------------------------------------------------------------------------------
/ttsfm/cli.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Command-line interface for TTSFM.
  4 | 
  5 | This module provides a command-line interface for the TTSFM package,
  6 | allowing users to generate speech from text using various options.
  7 | """
  8 | 
  9 | import argparse
 10 | import os
 11 | import sys
 12 | from pathlib import Path
 13 | 
 14 | from .client import TTSClient
 15 | from .exceptions import APIException, NetworkException, TTSException
 16 | from .models import AudioFormat, TTSResponse, Voice
 17 | 
 18 | 
 19 | def create_parser() -> argparse.ArgumentParser:
 20 |     """Create and configure the argument parser."""
 21 |     parser = argparse.ArgumentParser(
 22 |         prog="ttsfm",
 23 |         description="TTSFM - Text-to-Speech API Client",
 24 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 25 |         epilog="""
 26 | Examples:
 27 |   ttsfm "Hello, world!" --output hello.mp3
 28 |   ttsfm "Hello, world!" --voice nova --format wav --output hello.wav
 29 |   ttsfm "Hello, world!" --url http://localhost:7000 --output hello.mp3
 30 |   ttsfm --text-file input.txt --output speech.mp3
 31 |         """,
 32 |     )
 33 | 
 34 |     # Text input options (mutually exclusive)
 35 |     text_group = parser.add_mutually_exclusive_group(required=True)
 36 |     text_group.add_argument("text", nargs="?", help="Text to convert to speech")
 37 |     text_group.add_argument("--text-file", "-f", type=str, help="Read text from file")
 38 | 
 39 |     # Output options
 40 |     parser.add_argument("--output", "-o", type=str, required=True, help="Output file path")
 41 | 
 42 |     # TTS options
 43 |     parser.add_argument(
 44 |         "--voice",
 45 |         "-v",
 46 |         type=str,
 47 |         default="alloy",
 48 |         choices=[
 49 |             "alloy",
 50 |             "ash",
 51 |             "ballad",
 52 |             "coral",
 53 |             "echo",
 54 |             "fable",
 55 |             "nova",
 56 |             "onyx",
 57 |             "sage",
 58 |             "shimmer",
 59 |             "verse",
 60 |         ],
 61 |         help="Voice to use for speech generation (default: alloy)",
 62 |     )
 63 | 
 64 |     parser.add_argument(
 65 |         "--format",
 66 |         type=str,
 67 |         default="mp3",
 68 |         choices=["mp3", "opus", "aac", "flac", "wav", "pcm"],
 69 |         help="Audio format (default: mp3)",
 70 |     )
 71 | 
 72 |     parser.add_argument(
 73 |         "--speed", type=float, default=1.0, help="Speech speed (0.25 to 4.0, default: 1.0)"
 74 |     )
 75 | 
 76 |     # Client options
 77 |     parser.add_argument(
 78 |         "--url",
 79 |         "-u",
 80 |         type=str,
 81 |         default="http://localhost:7000",
 82 |         help="TTS service URL (default: http://localhost:7000)",
 83 |     )
 84 | 
 85 |     parser.add_argument("--api-key", "-k", type=str, help="API key for authentication")
 86 | 
 87 |     parser.add_argument(
 88 |         "--timeout", type=float, default=30.0, help="Request timeout in seconds (default: 30.0)"
 89 |     )
 90 | 
 91 |     parser.add_argument(
 92 |         "--retries", type=int, default=3, help="Maximum number of retries (default: 3)"
 93 |     )
 94 | 
 95 |     # Text length validation options
 96 |     parser.add_argument(
 97 |         "--max-length",
 98 |         type=int,
 99 |         default=1000,
100 |         help="Maximum text length in characters (default: 1000)",
101 |     )
102 | 
103 |     parser.add_argument(
104 |         "--no-length-validation", action="store_true", help="Disable text length validation"
105 |     )
106 | 
107 |     parser.add_argument(
108 |         "--split-long-text", action="store_true", help="Automatically split long text into chunks"
109 |     )
110 | 
111 |     parser.add_argument(
112 |         "--auto-combine",
113 |         action="store_true",
114 |         help=(
115 |             "Combine long-text chunks into a single audio file "
116 |             "(requires pydub for non-WAV formats)"
117 |         ),
118 |     )
119 | 
120 |     # Other options
121 |     parser.add_argument("--verbose", "-V", action="store_true", help="Enable verbose output")
122 | 
123 |     parser.add_argument("--version", action="version", version=f"%(prog)s {get_version()}")
124 | 
125 |     return parser
126 | 
127 | 
def get_version() -> str:
    """Return the package's ``__version__``, or ``"unknown"`` if it cannot be imported."""
    try:
        from . import __version__ as package_version
    except ImportError:
        return "unknown"
    return package_version
136 | 
137 | 
def read_text_file(file_path: str) -> str:
    """Return the UTF-8 contents of ``file_path`` with surrounding whitespace removed.

    On any failure an error message is printed to stderr and the process
    exits with status 1; a missing file gets its own dedicated message.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as source:
            contents = source.read()
    except FileNotFoundError:
        failure = f"Error: File '{file_path}' not found."
    except Exception as exc:
        failure = f"Error reading file '{file_path}': {exc}"
    else:
        return contents.strip()

    # Only reached when one of the handlers above recorded a failure.
    print(failure, file=sys.stderr)
    sys.exit(1)
149 | 
150 | 
def validate_speed(speed: float) -> float:
    """Return ``speed`` unchanged if it lies within 0.25-4.0 (inclusive).

    Otherwise print an error to stderr and exit with status 1.
    """
    if 0.25 <= speed <= 4.0:
        return speed

    print("Error: Speed must be between 0.25 and 4.0", file=sys.stderr)
    sys.exit(1)
157 | 
158 | 
def get_voice_enum(voice_str: str) -> Voice:
    """Map a voice name (case-insensitive) to its ``Voice`` member.

    Raises:
        KeyError: If the name does not correspond to any voice.
    """
    # Enum values are the lowercase voice names, so they double as lookup keys.
    by_name = {member.value: member for member in Voice}
    return by_name[voice_str.lower()]
175 | 
176 | 
def get_format_enum(format_str: str) -> AudioFormat:
    """Map a format name (case-insensitive) to its ``AudioFormat`` member.

    Raises:
        KeyError: If the name does not correspond to any audio format.
    """
    # Enum values are the lowercase format names, so they double as lookup keys.
    by_name = {member.value: member for member in AudioFormat}
    return by_name[format_str.lower()]
188 | 
189 | 
def handle_long_text(  # type: ignore[no-untyped-def]
    args,
    text: str,
    voice: Voice,
    audio_format: AudioFormat,
    speed: float,
) -> None:
    """Generate speech for text longer than the limit and write the result to disk.

    The client's long-text method is called with ``args.max_length`` and
    ``args.auto_combine``. If it returns a single ``TTSResponse`` the combined
    audio is saved via ``save_to_file``; otherwise each returned chunk is
    written to ``<output>_partNNN<ext>`` (or to ``args.output`` itself when
    there is only one chunk). Any exception is reported on stderr and the
    process exits with status 1.
    """
    try:
        tts_client = TTSClient(
            base_url=args.url,
            api_key=args.api_key,
            timeout=args.timeout,
            max_retries=args.retries,
        )

        result = tts_client.generate_speech_long_text(
            text=text,
            voice=voice,
            response_format=audio_format,
            speed=speed,
            max_length=args.max_length,
            preserve_words=True,
            auto_combine=args.auto_combine,
        )

        if not result:
            print("Error: No valid text chunks found after processing.", file=sys.stderr)
            sys.exit(1)

        # A single response object means the chunks were already combined.
        if isinstance(result, TTSResponse):
            result.save_to_file(args.output)
            print(f"Generated combined audio: {args.output}")
            return

        total = len(result)
        print(f"Generated {total} audio chunks")

        stem, suffix = os.path.splitext(args.output)

        for index, chunk in enumerate(result, 1):
            if args.verbose:
                print(f"Saving chunk {index}/{total}...")

            # A lone chunk keeps the requested name; several get numbered names.
            target = args.output if total == 1 else f"{stem}_part{index:03d}{suffix}"

            with open(target, "wb") as sink:
                sink.write(chunk.audio_data)

            print(f"Generated: {target}")

        if total > 1:
            print(f"\nGenerated {total} audio files from long text.")
            print(f"Files: {stem}_part001{suffix} to {stem}_part{total:03d}{suffix}")

    except Exception as e:
        print(f"Error processing long text: {e}", file=sys.stderr)
        if args.verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
253 | 
254 | 
def main() -> None:
    """Main CLI entry point.

    Parses the command line, resolves the input text (positional argument or
    ``--text-file``), validates the options, and then either delegates to
    ``handle_long_text`` (text over ``--max-length`` with ``--split-long-text``)
    or generates a single audio file at ``--output``. All failures are
    reported on stderr and end the process with exit status 1.
    """
    parser = create_parser()
    args = parser.parse_args()

    # Get text input. Select the source by whether --text-file was given rather
    # than by the truthiness of the positional text: an explicitly empty
    # positional ("") must yield "No text provided", not an attempt to open
    # a file named None.
    if args.text_file is not None:
        text = read_text_file(args.text_file)
    else:
        text = args.text

    if not text:
        print("Error: No text provided.", file=sys.stderr)
        sys.exit(1)

    # Validate parameters
    speed = validate_speed(args.speed)
    voice = get_voice_enum(args.voice)
    audio_format = get_format_enum(args.format)

    # Create output directory if needed
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Check text length and handle accordingly
    text_length = len(text)
    validate_length = not args.no_length_validation

    if args.verbose:
        print(f"Text: {text[:50]}{'...' if len(text) > 50 else ''}")
        print(f"Text length: {text_length} characters")
        print(f"Max length: {args.max_length}")
        print(f"Length validation: {'enabled' if validate_length else 'disabled'}")
        print(f"Voice: {args.voice}")
        print(f"Format: {args.format}")
        print(f"Speed: {speed}")
        print(f"URL: {args.url}")
        print(f"Output: {args.output}")
        print()

    # Handle long text
    if text_length > args.max_length:
        if args.split_long_text:
            print(f"Text is {text_length} characters, splitting into chunks...")
            return handle_long_text(args, text, voice, audio_format, speed)
        elif validate_length:
            print(
                f"Error: Text is too long ({text_length} characters). "
                f"Maximum allowed is {args.max_length} characters.",
                file=sys.stderr,
            )
            print(
                "Use --split-long-text to automatically split the text, "
                "or --no-length-validation to disable this check.",
                file=sys.stderr,
            )
            sys.exit(1)

    # Create client
    try:
        client = TTSClient(
            base_url=args.url, api_key=args.api_key, timeout=args.timeout, max_retries=args.retries
        )

        if args.verbose:
            print("Generating speech...")

        # Generate speech
        response = client.generate_speech(
            text=text,
            voice=voice,
            response_format=audio_format,
            speed=speed,
            max_length=args.max_length,
            validate_length=validate_length,
        )

        # Save to file
        with open(args.output, "wb") as f:
            f.write(response.audio_data)

        print(f"Speech generated successfully: {args.output}")

    except NetworkException as e:
        print(f"Network error: {e}", file=sys.stderr)
        sys.exit(1)
    except APIException as e:
        print(f"API error: {e}", file=sys.stderr)
        sys.exit(1)
    except TTSException as e:
        print(f"TTS error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary: report, optionally show the traceback, and exit.
        print(f"Unexpected error: {e}", file=sys.stderr)
        if args.verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
354 | 
355 | 
# Run the CLI entry point only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
358 | 


--------------------------------------------------------------------------------
/ttsfm-web/websocket_handler.py:
--------------------------------------------------------------------------------
  1 | """
  2 | WebSocket handler for real-time TTS streaming.
  3 | 
  4 | Because apparently waiting 2 seconds for audio generation is too much for modern users.
  5 | At least this will make it FEEL faster.
  6 | """
  7 | 
  8 | import base64
  9 | import logging
 10 | import time
 11 | import uuid
 12 | from datetime import datetime
 13 | from typing import Any, Callable, Dict, Optional
 14 | 
 15 | from flask import request
 16 | from flask_socketio import SocketIO, emit
 17 | 
 18 | from ttsfm import AudioFormat, TTSClient, Voice
 19 | from ttsfm.utils import split_text_by_length
 20 | 
 21 | logger = logging.getLogger(__name__)
 22 | 
 23 | 
 24 | class WebSocketTTSHandler:
 25 |     """
 26 |     Handles WebSocket connections for streaming TTS generation.
 27 | 
 28 |     Because your users can't wait 2 seconds for a complete response.
 29 |     """
 30 | 
 31 |     def __init__(self, socketio: SocketIO, client_factory: Callable[[], TTSClient]):
 32 |         self.socketio = socketio
 33 |         self._client_factory = client_factory
 34 |         self.active_sessions: Dict[str, Dict[str, Any]] = {}
 35 |         self._tasks: Dict[str, Dict[str, Any]] = {}
 36 | 
 37 |         # Register WebSocket events
 38 |         self._register_events()
 39 | 
 40 |     def _register_events(self):
 41 |         """Register all WebSocket event handlers."""
 42 | 
 43 |         @self.socketio.on("connect")
 44 |         def handle_connect():
 45 |             """Handle new WebSocket connection."""
 46 |             session_id = request.sid
 47 |             self.active_sessions[session_id] = {
 48 |                 "connected_at": datetime.now(),
 49 |                 "request_count": 0,
 50 |                 "last_request": None,
 51 |             }
 52 |             self._tasks[session_id] = {}
 53 |             logger.info(f"WebSocket client connected: {session_id}")
 54 |             logger.info(f"Active sessions: {len(self.active_sessions)}")
 55 |             emit("connected", {"session_id": session_id, "status": "ready"})
 56 | 
 57 |         @self.socketio.on("disconnect")
 58 |         def handle_disconnect():
 59 |             """Handle WebSocket disconnection."""
 60 |             session_id = request.sid
 61 |             if session_id in self.active_sessions:
 62 |                 del self.active_sessions[session_id]
 63 |             self._cancel_all_tasks(session_id)
 64 |             logger.info(f"WebSocket client disconnected: {session_id}")
 65 | 
 66 |         @self.socketio.on("generate_stream")
 67 |         def handle_generate_stream(data):
 68 |             """
 69 |             Handle streaming TTS generation request.
 70 | 
 71 |             Expected data format:
 72 |             {
 73 |                 'text': str,
 74 |                 'voice': str,
 75 |                 'format': str,
 76 |                 'chunk_size': int (optional, default 1024 chars),
 77 |                 'instructions': str (optional, voice modulation instructions)
 78 |             }
 79 |             """
 80 |             session_id = request.sid
 81 |             request_id = data.get("request_id", str(uuid.uuid4()))
 82 | 
 83 |             # Update session info
 84 |             if session_id in self.active_sessions:
 85 |                 self.active_sessions[session_id]["request_count"] += 1
 86 |                 self.active_sessions[session_id]["last_request"] = datetime.now()
 87 | 
 88 |             # Emit acknowledgment
 89 |             emit("stream_started", {"request_id": request_id, "timestamp": time.time()})
 90 | 
 91 |             # Start async generation
 92 |             task = self.socketio.start_background_task(
 93 |                 self._generate_stream, session_id, request_id, data
 94 |             )
 95 |             self._store_task(session_id, request_id, task)
 96 | 
 97 |         @self.socketio.on("cancel_stream")
 98 |         def handle_cancel_stream(data):
 99 |             """Handle stream cancellation request."""
100 |             request_id = data.get("request_id")
101 |             session_id = request.sid
102 | 
103 |             if not request_id:
104 |                 return
105 | 
106 |             cancelled = self._cancel_task(session_id, request_id)
107 |             if cancelled:
108 |                 logger.info(f"Stream cancellation requested: {request_id}")
109 |             else:
110 |                 logger.info(f"Stream cancellation requested for unknown request: {request_id}")
111 | 
112 |             emit("stream_cancelled", {"request_id": request_id, "cancelled": cancelled})
113 | 
114 |         @self.socketio.on("ping")
115 |         def handle_ping(data):
116 |             """Handle ping request for connection testing."""
117 |             session_id = request.sid
118 |             logger.debug(f"Ping received from {session_id}")
119 |             emit("pong", {"timestamp": time.time(), "data": data})
120 | 
121 |     def _generate_stream(self, session_id: str, request_id: str, data: Dict[str, Any]):
122 |         """
123 |         Generate TTS audio in chunks and stream to client.
124 | 
125 |         This is where the magic happens. And by magic, I mean
126 |         chunking text and pretending it's real-time.
127 |         """
128 |         client = self._client_factory()
129 | 
130 |         try:
131 |             # Extract parameters
132 |             text = data.get("text", "")
133 |             voice = data.get("voice", "alloy")
134 |             format_str = data.get("format", "mp3")
135 |             chunk_size = data.get("chunk_size", 1024)
136 |             instructions = data.get("instructions", None)  # Voice instructions support!
137 | 
138 |             if not text:
139 |                 self._emit_error(session_id, request_id, "No text provided")
140 |                 return
141 | 
142 |             # Convert string parameters to enums
143 |             try:
144 |                 voice_enum = Voice(voice.lower())
145 |                 format_enum = AudioFormat(format_str.lower())
146 |             except ValueError as e:
147 |                 self._emit_error(session_id, request_id, f"Invalid parameter: {str(e)}")
148 |                 return
149 | 
150 |             # Split text into chunks for "streaming" effect
151 |             chunks = split_text_by_length(text, chunk_size, preserve_words=True)
152 |             total_chunks = len(chunks)
153 | 
154 |             logger.info(f"Starting stream generation: {request_id} with {total_chunks} chunks")
155 | 
156 |             # Emit initial progress
157 |             self.socketio.emit(
158 |                 "stream_progress",
159 |                 {
160 |                     "request_id": request_id,
161 |                     "progress": 0,
162 |                     "total_chunks": total_chunks,
163 |                     "status": "processing",
164 |                 },
165 |                 room=session_id,
166 |             )
167 | 
168 |             # Process each chunk
169 |             for i, chunk in enumerate(chunks):
170 |                 # Check if client is still connected
171 |                 if session_id not in self.active_sessions:
172 |                     logger.warning(f"Client disconnected during generation: {session_id}")
173 |                     break
174 | 
175 |                 if not self._is_task_active(session_id, request_id):
176 |                     logger.info(f"Stream generation cancelled: {request_id}")
177 |                     break
178 | 
179 |                 try:
180 |                     # Generate audio for chunk
181 |                     start_time = time.time()
182 |                     response = client.generate_speech(
183 |                         text=chunk,
184 |                         voice=voice_enum,
185 |                         response_format=format_enum,
186 |                         instructions=instructions,  # Pass voice instructions!
187 |                         validate_length=False,  # We already chunked it
188 |                     )
189 |                     generation_time = time.time() - start_time
190 | 
191 |                     # Emit chunk data
192 |                     encoded_audio = base64.b64encode(response.audio_data).decode("ascii")
193 |                     chunk_data = {
194 |                         "request_id": request_id,
195 |                         "chunk_index": i,
196 |                         "total_chunks": total_chunks,
197 |                         "audio_data": encoded_audio,
198 |                         "encoding": "base64",
199 |                         "byte_length": len(response.audio_data),
200 |                         "format": response.format.value,
201 |                         "requested_format": format_enum.value,
202 |                         "duration": response.duration,
203 |                         "generation_time": generation_time,
204 |                         "chunk_text": chunk[:50] + "..." if len(chunk) > 50 else chunk,
205 |                     }
206 | 
207 |                     self.socketio.emit("audio_chunk", chunk_data, room=session_id)
208 | 
209 |                     # Emit progress update
210 |                     progress = int(((i + 1) / total_chunks) * 100)
211 |                     self.socketio.emit(
212 |                         "stream_progress",
213 |                         {
214 |                             "request_id": request_id,
215 |                             "progress": progress,
216 |                             "total_chunks": total_chunks,
217 |                             "chunks_completed": i + 1,
218 |                             "status": "processing",
219 |                         },
220 |                         room=session_id,
221 |                     )
222 | 
223 |                     # Small delay to prevent overwhelming the client
224 |                     # (and to make it feel more "real-time")
225 |                     self.socketio.sleep(0.1)
226 | 
227 |                 except Exception as e:
228 |                     logger.error(f"Error generating chunk {i}: {str(e)}")
229 |                     self._emit_error(
230 |                         session_id, request_id, f"Chunk {i} generation failed: {str(e)}"
231 |                     )
232 |                     # Continue with next chunk instead of failing completely
233 |                     continue
234 | 
235 |             # Emit completion
236 |             self.socketio.emit(
237 |                 "stream_complete",
238 |                 {
239 |                     "request_id": request_id,
240 |                     "total_chunks": total_chunks,
241 |                     "status": "completed",
242 |                     "timestamp": time.time(),
243 |                 },
244 |                 room=session_id,
245 |             )
246 | 
247 |             logger.info(f"Stream generation completed: {request_id}")
248 | 
249 |         except Exception as e:
250 |             logger.error(f"Stream generation failed: {str(e)}")
251 |             self._emit_error(session_id, request_id, str(e))
252 |         finally:
253 |             try:
254 |                 client.close()
255 |             except Exception as exc:  # pragma: no cover - defensive cleanup
256 |                 logger.debug("Failed to close TTS client cleanly: %s", exc)
257 |             self._remove_task(session_id, request_id)
258 | 
259 |     def _emit_error(self, session_id: str, request_id: str, error_message: str):
260 |         """Emit error to specific session."""
261 |         self.socketio.emit(
262 |             "stream_error",
263 |             {"request_id": request_id, "error": error_message, "timestamp": time.time()},
264 |             room=session_id,
265 |         )
266 | 
267 |     def _store_task(self, session_id: str, request_id: str, task: Any) -> None:
268 |         self._tasks.setdefault(session_id, {})[request_id] = task
269 | 
270 |     def _remove_task(self, session_id: str, request_id: str) -> None:
271 |         tasks = self._tasks.get(session_id)
272 |         if not tasks:
273 |             return
274 |         tasks.pop(request_id, None)
275 |         if not tasks:
276 |             self._tasks.pop(session_id, None)
277 | 
278 |     def _cancel_task(self, session_id: str, request_id: str) -> bool:
279 |         tasks = self._tasks.get(session_id)
280 |         if not tasks:
281 |             return False
282 |         task = tasks.pop(request_id, None)
283 |         if not task:
284 |             if not tasks:
285 |                 self._tasks.pop(session_id, None)
286 |             return False
287 | 
288 |         self._invoke_task_cancel(task)
289 |         if not tasks:
290 |             self._tasks.pop(session_id, None)
291 |         return True
292 | 
293 |     def _cancel_all_tasks(self, session_id: str) -> None:
294 |         tasks = self._tasks.pop(session_id, {})
295 |         for task in tasks.values():
296 |             self._invoke_task_cancel(task)
297 | 
298 |     def _invoke_task_cancel(self, task: Any) -> None:
299 |         try:
300 |             cancel = getattr(task, "cancel", None)
301 |             if callable(cancel):
302 |                 cancel()
303 |                 return
304 | 
305 |             kill = getattr(task, "kill", None)
306 |             if callable(kill):  # pragma: no cover - eventlet specific
307 |                 kill()
308 |         except Exception as exc:  # pragma: no cover - defensive logging
309 |             logger.debug("Failed to cancel background task cleanly: %s", exc)
310 | 
311 |     def _is_task_active(self, session_id: str, request_id: str) -> bool:
312 |         tasks = self._tasks.get(session_id)
313 |         if not tasks:
314 |             return False
315 |         return request_id in tasks
316 | 
317 |     def get_active_sessions_count(self) -> int:
318 |         """Get count of active WebSocket sessions."""
319 |         return len(self.active_sessions)
320 | 
321 |     def get_session_info(self, session_id: str) -> Optional[Dict[str, Any]]:
322 |         """Get information about a specific session."""
323 |         return self.active_sessions.get(session_id)
324 | 


--------------------------------------------------------------------------------
/ttsfm-web/templates/base.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="{{ get_locale() }}">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6 |     <title>{% block title %}TTSFM - {{ _('nav.home') }}{% endblock %}</title>
  7 |     
  8 |     <!-- Bootstrap CSS -->
  9 |     <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
 10 |     
 11 |     <!-- Font Awesome -->
 12 |     <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
 13 | 
 14 |     <!-- Google Fonts -->
 15 |     <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
 16 | 
 17 |     <!-- Custom CSS -->
 18 |     <link href="{{ url_for('static', filename='css/style.css') }}" rel="stylesheet">
 19 | 
 20 |     <!-- Additional Performance Optimizations -->
 21 |     <link rel="preconnect" href="https://fonts.googleapis.com">
 22 |     <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
 23 | 
 24 |     <!-- Favicon -->
 25 |     <link rel="icon" type="image/svg+xml" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 100 100'><text y='.9em' font-size='90'>🎤</text></svg>">
 26 | 
 27 |     <!-- Meta tags for better SEO and social sharing -->
 28 |     <meta name="description" content="TTSFM - A Python client for text-to-speech APIs. Simple to use with support for multiple voices and audio formats.">
 29 |     <meta name="keywords" content="text-to-speech, TTS, python, API, voice synthesis, audio generation">
 30 |     <meta name="author" content="TTSFM">
 31 | 
 32 |     <!-- Open Graph / Facebook -->
 33 |     <meta property="og:type" content="website">
 34 |     <meta property="og:url" content="{{ request.url }}">
 35 |     <meta property="og:title" content="{% block og_title %}TTSFM - Python Text-to-Speech Client{% endblock %}">
 36 |     <meta property="og:description" content="A Python client for text-to-speech APIs. Simple to use with support for multiple voices and audio formats.">
 37 | 
 38 |     <!-- Twitter -->
 39 |     <meta property="twitter:card" content="summary">
 40 |     <meta property="twitter:url" content="{{ request.url }}">
 41 |     <meta property="twitter:title" content="{% block twitter_title %}TTSFM - Python Text-to-Speech Client{% endblock %}">
 42 |     <meta property="twitter:description" content="A Python client for text-to-speech APIs. Simple to use with support for multiple voices and audio formats.">
 43 |     
 44 |     {% block extra_css %}{% endblock %}
 45 | 
 46 |     <!-- Language button styling -->
 47 |     <style>
 48 |         /* Language dropdown button styling */
 49 |         #languageDropdown {
 50 |             border-color: #6c757d;
 51 |             color: #6c757d;
 52 |             transition: all 0.2s ease-in-out;
 53 |             font-size: 0.875rem;
 54 |         }
 55 | 
 56 |         #languageDropdown:hover {
 57 |             border-color: #495057;
 58 |             color: #495057;
 59 |             background-color: #f8f9fa;
 60 |         }
 61 | 
 62 |         #languageDropdown:focus {
 63 |             box-shadow: 0 0 0 0.2rem rgba(108, 117, 125, 0.25);
 64 |         }
 65 | 
 66 |         /* Responsive language button */
 67 |         @media (max-width: 576px) {
 68 |             #languageDropdown {
 69 |                 font-size: 0.75rem;
 70 |                 padding: 0.25rem 0.5rem;
 71 |             }
 72 |         }
 73 | 
 74 |         /* Ensure consistent button heights */
 75 |         .navbar-nav .btn {
 76 |             display: inline-flex;
 77 |             align-items: center;
 78 |         }
 79 |     </style>
 80 | </head>
 81 | <body>
 82 |     <!-- Skip to content link for accessibility -->
 83 |     <a href="#main-content" class="skip-link">Skip to main content</a>
 84 | 
 85 |     <!-- Clean Navigation -->
 86 |     <nav class="navbar navbar-expand-lg fixed-top" style="background-color: rgba(255, 255, 255, 0.95); backdrop-filter: blur(10px); border-bottom: 1px solid #e5e7eb;">
 87 |         <div class="container">
 88 |             <a class="navbar-brand" href="{{ url_for('index') }}">
 89 |                 <i class="fas fa-microphone-alt me-2"></i>
 90 |                 <span class="fw-bold">TTSFM</span>
 91 |                 <span class="badge bg-primary ms-2 small">v3.4.2</span>
 92 |             </a>
 93 | 
 94 |             <button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
 95 |                 <span class="navbar-toggler-icon"></span>
 96 |             </button>
 97 | 
 98 |             <div class="collapse navbar-collapse" id="navbarNav">
 99 |                 <ul class="navbar-nav me-auto">
100 |                     <li class="nav-item">
101 |                         <a class="nav-link" href="{{ url_for('index') }}" aria-label="{{ _('nav.home') }}">
102 |                             <i class="fas fa-home me-1"></i>{{ _('nav.home') }}
103 |                         </a>
104 |                     </li>
105 |                     <li class="nav-item">
106 |                         <a class="nav-link" href="{{ url_for('playground') }}" aria-label="{{ _('nav.playground') }}">
107 |                             <i class="fas fa-play me-1"></i>{{ _('nav.playground') }}
108 |                         </a>
109 |                     </li>
110 |                     <li class="nav-item">
111 |                         <a class="nav-link" href="{{ url_for('docs') }}" aria-label="{{ _('nav.documentation') }}">
112 |                             <i class="fas fa-book me-1"></i>{{ _('nav.documentation') }}
113 |                         </a>
114 |                     </li>
115 |                 </ul>
116 | 
117 |                 <ul class="navbar-nav">
118 |                     <li class="nav-item">
119 |                         <span class="navbar-text d-flex align-items-center">
120 |                             <span id="status-indicator" class="status-indicator status-offline" aria-hidden="true"></span>
121 |                             <span id="status-text" class="small">{{ _('nav.status_checking') }}</span>
122 |                         </span>
123 |                     </li>
124 |                     <li class="nav-item dropdown ms-3">
125 |                         <button class="btn btn-outline-secondary btn-sm dropdown-toggle" type="button" id="languageDropdown" data-bs-toggle="dropdown" aria-expanded="false" title="{{ _('common.language') }}">
126 |                             {% if get_locale() == 'zh' %}🇨🇳 中文{% else %}🇺🇸 English{% endif %}
127 |                         </button>
128 |                         <ul class="dropdown-menu" aria-labelledby="languageDropdown">
129 |                             {% for lang_code, lang_name in get_supported_languages().items() %}
130 |                             <li>
131 |                                 <a class="dropdown-item{% if get_locale() == lang_code %} active{% endif %}"
132 |                                    href="{{ url_for('set_language', lang_code=lang_code) }}">
133 |                                     {% if lang_code == 'en' %}🇺🇸{% elif lang_code == 'zh' %}🇨🇳{% endif %} {{ lang_name }}
134 |                                 </a>
135 |                             </li>
136 |                             {% endfor %}
137 |                         </ul>
138 |                     </li>
139 |                     <li class="nav-item ms-3">
140 |                         <a class="btn btn-outline-primary btn-sm" href="https://github.com/dbccccccc/ttsfm" target="_blank" rel="noopener noreferrer" aria-label="{{ _('nav.github') }}">
141 |                             <i class="fab fa-github me-1"></i>{{ _('nav.github') }}
142 |                         </a>
143 |                     </li>
144 |                 </ul>
145 |             </div>
146 |         </div>
147 |     </nav>
148 | 
149 |     <!-- Main Content -->
150 |     <main id="main-content" style="padding-top: 76px; min-height: calc(100vh - 76px);">
151 |         {% block content %}{% endblock %}
152 |     </main>
153 | 
154 |     <!-- Bootstrap JS -->
155 |     <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
156 | 
157 |     <!-- Internationalization Support -->
158 |     <script src="{{ url_for('static', filename='js/i18n.js') }}"></script>
159 | 
160 |     <!-- Enhanced Common JavaScript -->
161 |     <script>
162 |         // Enhanced service status checking
163 |         async function checkStatus() {
164 |             try {
165 |                 const response = await fetch('/api/health');
166 |                 const data = await response.json();
167 | 
168 |                 const indicator = document.getElementById('status-indicator');
169 |                 const text = document.getElementById('status-text');
170 | 
171 |                 if (response.ok && data.status === 'healthy') {
172 |                     indicator.className = 'status-indicator status-online';
173 |                     text.textContent = '{{ _("nav.status_online") }}';
174 |                 } else {
175 |                     indicator.className = 'status-indicator status-offline';
176 |                     text.textContent = '{{ _("nav.status_offline") }}';
177 |                 }
178 |             } catch (error) {
179 |                 const indicator = document.getElementById('status-indicator');
180 |                 const text = document.getElementById('status-text');
181 |                 indicator.className = 'status-indicator status-offline';
182 |                 text.textContent = '{{ _("nav.status_offline") }}';
183 |             }
184 |         }
185 | 
186 |         // Enhanced page initialization
187 |         document.addEventListener('DOMContentLoaded', function() {
188 |             // Check status immediately and periodically
189 |             checkStatus();
190 |             setInterval(checkStatus, 30000); // Check every 30 seconds
191 | 
192 |             // Initialize tooltips
193 |             if (typeof bootstrap !== 'undefined') {
194 |                 const tooltipTriggerList = [].slice.call(document.querySelectorAll('[data-bs-toggle="tooltip"]'));
195 |                 tooltipTriggerList.map(function (tooltipTriggerEl) {
196 |                     return new bootstrap.Tooltip(tooltipTriggerEl);
197 |                 });
198 |             }
199 | 
200 |             // Add smooth scrolling for anchor links
201 |             document.querySelectorAll('a[href^="#"]').forEach(anchor => {
202 |                 anchor.addEventListener('click', function (e) {
203 |                     const target = document.querySelector(this.getAttribute('href'));
204 |                     if (target) {
205 |                         e.preventDefault();
206 |                         target.scrollIntoView({
207 |                             behavior: 'smooth',
208 |                             block: 'start'
209 |                         });
210 |                     }
211 |                 });
212 |             });
213 | 
214 |             // Add fade-in animation to main content
215 |             const mainContent = document.querySelector('main');
216 |             if (mainContent) {
217 |                 mainContent.classList.add('fade-in');
218 |             }
219 | 
220 |             // Add loading states to external links
221 |             document.querySelectorAll('a[target="_blank"]').forEach(link => {
222 |                 link.addEventListener('click', function() {
223 |                     this.style.opacity = '0.7';
224 |                     setTimeout(() => {
225 |                         this.style.opacity = '1';
226 |                     }, 1000);
227 |                 });
228 |             });
229 |         });
230 | 
231 |         // Enhanced utility function to show loading state
232 |         function setLoading(button, loading) {
233 |             if (loading) {
234 |                 button.classList.add('loading');
235 |                 button.disabled = true;
236 |                 button.style.cursor = 'wait';
237 |             } else {
238 |                 button.classList.remove('loading');
239 |                 button.disabled = false;
240 |                 button.style.cursor = 'pointer';
241 |             }
242 |         }
243 | 
244 |         // Enhanced utility function to show alerts
245 |         function showAlert(message, type = 'info', duration = 5000) {
246 |             const alertDiv = document.createElement('div');
247 |             alertDiv.className = `alert alert-${type} alert-dismissible fade show fade-in`;
248 |             alertDiv.style.position = 'relative';
249 |             alertDiv.style.zIndex = '1050';
250 |             alertDiv.innerHTML = `
251 |                 <i class="fas fa-${getAlertIcon(type)} me-2"></i>
252 |                 ${message}
253 |                 <button type="button" class="btn-close" data-bs-dismiss="alert" aria-label="Close"></button>
254 |             `;
255 | 
256 |             // Find the best container to insert the alert
257 |             const container = document.querySelector('main .container') || document.querySelector('.container') || document.body;
258 |             if (container) {
259 |                 container.insertBefore(alertDiv, container.firstChild);
260 | 
261 |                 // Auto-dismiss after specified duration
262 |                 setTimeout(() => {
263 |                     if (alertDiv.parentNode) {
264 |                         alertDiv.classList.remove('show');
265 |                         setTimeout(() => {
266 |                             if (alertDiv.parentNode) {
267 |                                 alertDiv.remove();
268 |                             }
269 |                         }, 150);
270 |                     }
271 |                 }, duration);
272 | 
273 |                 // Scroll to alert if it's not visible
274 |                 alertDiv.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
275 |             }
276 |         }
277 | 
278 |         // Helper function to get appropriate icon for alert type
279 |         function getAlertIcon(type) {
280 |             const icons = {
281 |                 'success': 'check-circle',
282 |                 'danger': 'exclamation-triangle',
283 |                 'warning': 'exclamation-triangle',
284 |                 'info': 'info-circle',
285 |                 'primary': 'info-circle'
286 |             };
287 |             return icons[type] || 'info-circle';
288 |         }
289 | 
290 |         // Enhanced error handling for fetch requests
291 |         async function safeFetch(url, options = {}) {
292 |             try {
293 |                 const response = await fetch(url, options);
294 |                 if (!response.ok) {
295 |                     throw new Error(`HTTP ${response.status}: ${response.statusText}`);
296 |                 }
297 |                 return response;
298 |             } catch (error) {
299 |                 console.error('Fetch error:', error);
300 |                 showAlert(`Network error: ${error.message}`, 'danger');
301 |                 throw error;
302 |             }
303 |         }
304 | 
305 |         // Performance monitoring
306 |         window.addEventListener('load', function() {
307 |             // Log page load time
308 |             const loadTime = performance.now();
309 |             console.log(`Page loaded in ${Math.round(loadTime)}ms`);
310 | 
311 |             // Check for slow loading resources
312 |             if (loadTime > 3000) {
313 |                 console.warn('Page load time is slow. Consider optimizing resources.');
314 |             }
315 |         });
316 | 
317 |         // Keyboard shortcuts
318 |         document.addEventListener('keydown', function(e) {
319 |             // Alt + H for home
320 |             if (e.altKey && e.key === 'h') {
321 |                 e.preventDefault();
322 |                 window.location.href = '{{ url_for("index") }}';
323 |             }
324 | 
325 |             // Alt + P for playground
326 |             if (e.altKey && e.key === 'p') {
327 |                 e.preventDefault();
328 |                 window.location.href = '{{ url_for("playground") }}';
329 |             }
330 | 
331 |             // Alt + D for docs
332 |             if (e.altKey && e.key === 'd') {
333 |                 e.preventDefault();
334 |                 window.location.href = '{{ url_for("docs") }}';
335 |             }
336 |         });
337 |     </script>
338 |     
339 |     {% block extra_js %}{% endblock %}
340 | </body>
341 | </html>
342 | 
343 | 


--------------------------------------------------------------------------------
/ttsfm-web/translations/en.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nav": {
  3 |     "home": "Home",
  4 |     "playground": "Playground",
  5 |     "documentation": "Documentation",
  6 |     "github": "GitHub",
  7 |     "status_checking": "Checking...",
  8 |     "status_online": "Online",
  9 |     "status_offline": "Offline"
 10 |   },
 11 |   "common": {
 12 |     "loading": "Loading...",
 13 |     "error": "Error",
 14 |     "success": "Success",
 15 |     "warning": "Warning",
 16 |     "info": "Info",
 17 |     "close": "Close",
 18 |     "save": "Save",
 19 |     "cancel": "Cancel",
 20 |     "confirm": "Confirm",
 21 |     "download": "Download",
 22 |     "upload": "Upload",
 23 |     "generate": "Generate",
 24 |     "play": "Play",
 25 |     "stop": "Stop",
 26 |     "pause": "Pause",
 27 |     "resume": "Resume",
 28 |     "clear": "Clear",
 29 |     "reset": "Reset",
 30 |     "copy": "Copy",
 31 |     "copied": "Copied!",
 32 |     "language": "Language",
 33 |     "english": "English",
 34 |     "chinese": "中文",
 35 |     "validate": "Validate",
 36 |     "options": "Options",
 37 |     "max_length": "Max Length",
 38 |     "tip": "Tip",
 39 |     "choose_voice": "Choose from available voices",
 40 |     "select_format": "Select your preferred audio format",
 41 |     "loading_voices": "Loading voices...",
 42 |     "loading_formats": "Loading formats...",
 43 |     "ctrl_enter_tip": "Use Ctrl+Enter to generate",
 44 |     "auto_combine_enabled": "Auto-combine enabled",
 45 |     "demo": "Demo",
 46 |     "clear_text": "Clear text",
 47 |     "tip_ctrl_enter": "Tip: Use Ctrl+Enter to generate",
 48 |     "ready": "Ready",
 49 |     "replay_audio": "Replay audio",
 50 |     "share_audio": "Share audio",
 51 |     "browser_no_audio_support": "Your browser does not support the audio element.",
 52 |     "generating_speech": "Generating speech...",
 53 |     "streaming": "Streaming",
 54 |     "chars": "chars",
 55 |     "generated": "Generated"
 56 |   },
 57 |   "home": {
 58 |     "title": "Free Text-to-Speech for Python",
 59 |     "subtitle": "Generate high-quality speech from text using the free openai.fm service. No API keys, no registration - just install and start creating audio.",
 60 |     "try_demo": "Try Demo",
 61 |     "documentation": "Documentation",
 62 |     "github": "GitHub",
 63 |     "features_title": "Key Features",
 64 |     "features_subtitle": "Simple, free, and powerful text-to-speech for Python developers.",
 65 |     "feature_free_title": "Completely Free",
 66 |     "feature_free_desc": "No API keys or registration required. Uses the free openai.fm service.",
 67 |     "feature_voices_title": "11 Voices",
 68 |     "feature_voices_desc": "All OpenAI-compatible voices available for different use cases.",
 69 |     "feature_formats_title": "6 Audio Formats",
 70 |     "feature_formats_desc": "MP3, WAV, OPUS, AAC, FLAC, and PCM support for any application.",
 71 |     "feature_docker_title": "Docker Ready",
 72 |     "feature_docker_desc": "One-command deployment with web interface and API endpoints.",
 73 |     "feature_openai_title": "OpenAI Compatible",
 74 |     "feature_openai_desc": "Drop-in replacement for OpenAI's TTS API with auto-combine for long text.",
 75 |     "feature_async_title": "Async & Sync",
 76 |     "feature_async_desc": "Both asyncio and synchronous clients for maximum flexibility.",
 77 |     "quick_start_title": "Quick Start",
 78 |     "installation_title": "Installation",
 79 |     "installation_code": "pip install ttsfm",
 80 |     "usage_title": "Basic Usage",
 81 |     "docker_title": "Docker Deployment",
 82 |     "docker_desc": "Run TTSFM with web interface:",
 83 |     "api_title": "OpenAI-Compatible API",
 84 |     "api_desc": "Use with OpenAI Python client:",
 85 |     "footer_copyright": "© 2024 dbcccc"
 86 |   },
 87 |   "playground": {
 88 |     "title": "Interactive TTS Playground",
 89 |     "subtitle": "Test different voices and audio formats in real-time",
 90 |     "text_input_label": "Text to Convert",
 91 |     "text_input_placeholder": "Enter the text you want to convert to speech...",
 92 |     "voice_label": "Voice",
 93 |     "format_label": "Audio Format",
 94 |     "instructions_label": "Voice Instructions (Optional)",
 95 |     "instructions_placeholder": "Additional instructions for voice generation...",
 96 |     "character_count": "characters",
 97 |     "max_length_warning": "Text exceeds maximum length. It will be automatically split and combined.",
 98 |     "generate_speech": "Generate Speech",
 99 |     "generating": "Generating...",
100 |     "download_audio": "Download Audio",
101 |     "audio_player_title": "Generated Audio",
102 |     "file_size": "File Size",
103 |     "duration": "Duration",
104 |     "format": "Format",
105 |     "voice": "Voice",
106 |     "chunks_combined": "Chunks Combined",
107 |     "random_text": "Random Text",
108 |     "clear_text": "Clear Text",
109 |     "max_length_description": "Maximum characters per request (default: 1000)",
110 |     "enable_length_validation": "Enable length validation",
111 |     "auto_combine_long_text": "Auto-combine long text",
112 |     "auto_combine_tooltip": "Automatically split long text and combine audio chunks into a single file",
113 |     "auto_combine_description": "Automatically handles text longer than the limit",
114 |     "instructions_description": "Provide optional instructions for voice modulation",
115 |     "api_key_optional": "API Key (Optional)",
116 |     "api_key_placeholder": "Enter your API key if required",
117 |     "api_key_description": "Only required if API key protection is enabled on the server",
118 |     "sample_texts": {
119 |       "welcome": "Welcome to TTSFM! This is a free text-to-speech service that converts your text into high-quality audio using advanced AI technology.",
120 |       "story": "Once upon a time, in a digital world far away, there lived a small Python package that could transform any text into beautiful speech. This package was called TTSFM, and it brought joy to developers everywhere.",
121 |       "technical": "TTSFM is a Python client for text-to-speech APIs that provides both synchronous and asynchronous interfaces. It supports multiple voices and audio formats, making it perfect for various applications.",
122 |       "multilingual": "TTSFM supports multiple languages and voices, allowing you to create diverse audio content for global audiences. The service is completely free and requires no API keys.",
123 |       "long": "This is a longer text sample designed to test the auto-combine feature of TTSFM. When text exceeds the maximum length limit, TTSFM automatically splits it into smaller chunks, generates audio for each chunk, and then seamlessly combines them into a single audio file. This process is completely transparent to the user and ensures that you can convert text of any length without worrying about technical limitations. The resulting audio maintains consistent quality and natural flow throughout the entire content."
124 |     },
125 |     "error_messages": {
126 |       "empty_text": "Please enter some text to convert.",
127 |       "generation_failed": "Failed to generate speech. Please try again.",
128 |       "network_error": "Network error. Please check your connection and try again.",
129 |       "invalid_format": "Invalid audio format selected.",
130 |       "invalid_voice": "Invalid voice selected.",
131 |       "text_too_long": "Text is too long. Please reduce the length or enable auto-combine.",
132 |       "server_error": "Server error. Please try again later."
133 |     },
134 |     "success_messages": {
135 |       "generation_complete": "Speech generated successfully!",
136 |       "text_copied": "Text copied to clipboard!",
137 |       "download_started": "Download started!"
138 |     },
139 |     "speed_label": "Playback Speed",
140 |     "speed_description": "Adjust audio playback speed from 0.25x (slower) to 4.0x (faster). Default is 1.0x (normal speed).",
141 |     "speed": "Speed",
142 |     "chunks": "Chunks",
143 |     "format_description": "Choose audio output format. Converted formats require ffmpeg.",
144 |     "enable_websocket_streaming": "Enable WebSocket Streaming",
145 |     "realtime_audio_chunks": "(Real-time audio chunks)",
146 |     "streaming_progress": "Streaming Progress",
147 |     "stream_speech": "Stream Speech",
148 |     "streaming_complete": "Streaming Complete",
149 |     "streaming_ready": "Streaming Ready",
150 |     "streaming_active": "Streaming...",
151 |     "streaming_offline": "Streaming Offline",
152 |     "chunks_label": "Chunks:",
153 |     "total_size_label": "Total Size:",
154 |     "time_label": "Time:",
155 |     "format_label_colon": "Format:",
156 |     "connection_error": "Connection Error",
157 |     "chunks_heading": "Chunks",
158 |     "data_heading": "Data",
159 |     "time_heading": "Time",
160 |     "chunk_title": "Chunk"
161 |   },
162 |   "docs": {
163 |     "title": "API Documentation",
164 |     "subtitle": "Complete reference for the TTSFM Text-to-Speech API. Free, simple, and powerful.",
165 |     "contents": "Contents",
166 |     "overview": "Overview",
167 |     "authentication": "Authentication",
168 |     "text_validation": "Text Validation",
169 |     "endpoints": "API Endpoints",
170 |     "voices": "Voices",
171 |     "formats": "Audio Formats",
172 |     "generate": "Generate Speech",
173 |     "combined": "Combined Audio",
174 |     "status": "Status & Health",
175 |     "errors": "Error Handling",
176 |     "examples": "Code Examples",
177 |     "python_package": "Python Package",
178 |     "overview_title": "Overview",
179 |     "overview_desc": "The TTSFM API provides a modern, OpenAI-compatible interface for text-to-speech generation. It supports multiple voices, audio formats, and includes advanced features like text length validation and intelligent auto-combine functionality.",
180 |     "base_url": "Base URL:",
181 |     "key_features": "Key Features",
182 |     "feature_voices": "11 different voice options - Choose from alloy, echo, nova, and more",
183 |     "feature_formats": "Multiple audio formats - MP3, WAV, OPUS, AAC, FLAC, PCM support",
184 |     "feature_openai": "OpenAI compatibility - Drop-in replacement for OpenAI's TTS API",
185 |     "feature_auto_combine": "Auto-combine feature - Automatically handles long text (>1000 chars) by splitting and combining audio",
186 |     "feature_validation": "Text length validation - Smart validation with configurable limits",
187 |     "feature_monitoring": "Real-time monitoring - Status endpoints and health checks",
188 |     "new_version": "New in v3.3.4:",
189 |     "new_version_desc": "Runtime images now ship with ffmpeg so MP3 auto-combine succeeds immediately, and the default long-text limit is trimmed to 1000 characters for predictable playback.",
190 |     "authentication_title": "Authentication",
191 |     "authentication_desc": "Currently, the API supports optional API key authentication. If configured, include your API key in the request headers.",
192 |     "text_validation_title": "Text Length Validation",
193 |     "text_validation_desc": "TTSFM includes built-in text length validation to ensure compatibility with TTS models. The default maximum length is 1000 characters, but this can be customized.",
194 |     "important": "Important:",
195 |     "text_validation_warning": "Text exceeding the maximum length will be rejected unless validation is disabled or the text is split into chunks.",
196 |     "validation_options": "Validation Options",
197 |     "max_length_option": "Maximum allowed characters (default: 1000)",
198 |     "validate_length_option": "Enable/disable validation (default: true)",
199 |     "preserve_words_option": "Avoid splitting words when chunking (default: true)",
200 |     "endpoints_title": "API Endpoints",
201 |     "get_voices_desc": "Get list of available voices.",
202 |     "get_formats_desc": "Get list of supported audio formats.",
203 |     "validate_text_desc": "Validate text length and get splitting suggestions.",
204 |     "generate_speech_desc": "Generate speech from text.",
205 |     "response_example": "Response Example:",
206 |     "request_body": "Request Body:",
207 |     "parameters": "Parameters:",
208 |     "text_param": "Text to convert to speech",
209 |     "voice_param": "Voice ID (default: \"alloy\")",
210 |     "format_param": "Audio format (default: \"mp3\")",
211 |     "instructions_param": "Voice modulation instructions",
212 |     "max_length_param": "Maximum text length (default: 1000)",
213 |     "validate_length_param": "Enable validation (default: true)",
214 |     "response": "Response:",
215 |     "response_audio": "Returns audio file with appropriate Content-Type header.",
216 |     "response_combined_audio": "Returns a single audio file containing all chunks combined seamlessly.",
217 |     "required": "required",
218 |     "optional": "optional",
219 |     "python_package_title": "Python Package",
220 |     "long_text_support": "Long Text Support",
221 |     "long_text_desc": "The TTSFM Python package includes built-in long text splitting functionality for developers who need fine-grained control:",
222 |     "developer_features": "Developer Features:",
223 |     "manual_splitting": "Manual Splitting: Full control over text chunking for advanced use cases",
224 |     "word_preservation": "Word Preservation: Maintains word boundaries for natural speech",
225 |     "separate_files": "Separate Files: Each chunk saved as individual audio file",
226 |     "cli_support": "CLI Support: Use `--split-long-text` flag for command-line usage",
227 |     "note": "Note:",
228 |     "auto_combine_note": "For web users, the auto-combine feature in `/v1/audio/speech` is recommended as it automatically handles long text and returns a single seamless audio file.",
229 |     "combined_audio_desc": "Generate a single combined audio file from long text. Automatically splits text into chunks, generates speech for each chunk, and combines them into one seamless audio file.",
230 |     "response_headers": "Response Headers:",
231 |     "chunks_combined_header": "Number of chunks that were combined",
232 |     "original_text_length_header": "Original text length in characters",
233 |     "audio_size_header": "Final audio file size in bytes",
234 |     "openai_compatible_desc": "Enhanced OpenAI-compatible endpoint with auto-combine feature. Automatically handles long text by splitting and combining audio chunks when needed.",
235 |     "enhanced_parameters": "Enhanced Parameters:",
236 |     "auto_combine_param": "Automatically split long text and combine audio chunks into a single file",
237 |     "auto_combine_false": "Return error if text exceeds max_length (standard OpenAI behavior)",
238 |     "max_length_chunk_param": "Maximum characters per chunk when splitting",
239 |     "auto_combine_header": "Whether auto-combine was enabled (true/false)",
240 |     "chunks_combined_response": "Number of audio chunks combined (1 for short text)",
241 |     "original_text_response": "Original text length (for long text processing)",
242 |     "audio_format_header": "Audio format of the response",
243 |     "audio_size_response": "Audio file size in bytes",
244 |     "short_text_comment": "Short text (works normally)",
245 |     "long_text_auto_comment": "Long text with auto-combine (default)",
246 |     "long_text_no_auto_comment": "Long text without auto-combine (will error)",
247 |     "audio_combination": "Audio Combination:",
248 |     "audio_combination_desc": "Uses advanced audio processing (PyDub) when available, with intelligent fallbacks for different environments. Supports all audio formats.",
249 |     "use_cases": "Use Cases:",
250 |     "use_case_articles": "Long Articles: Convert blog posts or articles to single audio files",
251 |     "use_case_audiobooks": "Audiobooks: Generate chapters as single audio files",
252 |     "use_case_podcasts": "Podcasts: Create podcast episodes from scripts",
253 |     "use_case_education": "Educational Content: Convert learning materials to audio",
254 |     "example_usage": "Example Usage:",
255 |     "python_example_comment": "Python example",
256 |     "operations": "Operational Notes",
257 |     "operational_title": "Operational Notes",
258 |     "operational_limit": "Requests above 1000 characters are automatically split when auto_combine is enabled; disable validation to manage chunking yourself.",
259 |     "operational_fallback": "MP3 requests return MP3. OPUS, AAC, FLAC, WAV, and PCM map to WAV for reliable playback.",
260 |     "operational_backend": "Audio comes from the third-party openai.fm service; availability may change without notice—add graceful fallbacks.",
261 |     "operational_ffmpeg": "The Docker image bundles ffmpeg so combined MP3 responses work immediately without extra setup."
262 |   }
263 | }


--------------------------------------------------------------------------------