├── .coveragerc ├── .dockerignore ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .python-version ├── .ruff.toml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── VERSION ├── api ├── __init__.py ├── src │ ├── builds │ │ └── v1_0 │ │ │ └── config.json │ ├── core │ │ ├── __init__.py │ │ ├── config.py │ │ ├── don_quixote.txt │ │ ├── model_config.py │ │ ├── openai_mappings.json │ │ └── paths.py │ ├── inference │ │ ├── __init__.py │ │ ├── base.py │ │ ├── kokoro_v1.py │ │ ├── model_manager.py │ │ └── voice_manager.py │ ├── main.py │ ├── models │ │ └── v1_0 │ │ │ └── config.json │ ├── routers │ │ ├── __init__.py │ │ ├── debug.py │ │ ├── development.py │ │ ├── openai_compatible.py │ │ └── web_player.py │ ├── services │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── streaming_audio_writer.py │ │ ├── temp_manager.py │ │ ├── text_processing │ │ │ ├── __init__.py │ │ │ ├── normalizer.py │ │ │ ├── phonemizer.py │ │ │ ├── text_processor.py │ │ │ └── vocabulary.py │ │ └── tts_service.py │ ├── structures │ │ ├── __init__.py │ │ ├── custom_responses.py │ │ ├── model_schemas.py │ │ ├── schemas.py │ │ └── text_schemas.py │ └── voices │ │ └── v1_0 │ │ ├── af_alloy.pt │ │ ├── af_aoede.pt │ │ ├── af_bella.pt │ │ ├── af_heart.pt │ │ ├── af_jadzia.pt │ │ ├── af_jessica.pt │ │ ├── af_kore.pt │ │ ├── af_nicole.pt │ │ ├── af_nova.pt │ │ ├── af_river.pt │ │ ├── af_sarah.pt │ │ ├── af_sky.pt │ │ ├── af_v0.pt │ │ ├── af_v0bella.pt │ │ ├── af_v0irulan.pt │ │ ├── af_v0nicole.pt │ │ ├── af_v0sarah.pt │ │ ├── af_v0sky.pt │ │ ├── am_adam.pt │ │ ├── am_echo.pt │ │ ├── am_eric.pt │ │ ├── am_fenrir.pt │ │ ├── am_liam.pt │ │ ├── am_michael.pt │ │ ├── am_onyx.pt │ │ ├── am_puck.pt │ │ ├── am_santa.pt │ │ ├── am_v0adam.pt │ │ ├── am_v0gurney.pt │ │ ├── am_v0michael.pt │ │ ├── bf_alice.pt │ │ ├── bf_emma.pt │ │ ├── bf_lily.pt │ │ ├── bf_v0emma.pt │ │ ├── 
bf_v0isabella.pt │ │ ├── bm_daniel.pt │ │ ├── bm_fable.pt │ │ ├── bm_george.pt │ │ ├── bm_lewis.pt │ │ ├── bm_v0george.pt │ │ ├── bm_v0lewis.pt │ │ ├── ef_dora.pt │ │ ├── em_alex.pt │ │ ├── em_santa.pt │ │ ├── ff_siwis.pt │ │ ├── hf_alpha.pt │ │ ├── hf_beta.pt │ │ ├── hm_omega.pt │ │ ├── hm_psi.pt │ │ ├── if_sara.pt │ │ ├── im_nicola.pt │ │ ├── jf_alpha.pt │ │ ├── jf_gongitsune.pt │ │ ├── jf_nezumi.pt │ │ ├── jf_tebukuro.pt │ │ ├── jm_kumo.pt │ │ ├── pf_dora.pt │ │ ├── pm_alex.pt │ │ ├── pm_santa.pt │ │ ├── zf_xiaobei.pt │ │ ├── zf_xiaoni.pt │ │ ├── zf_xiaoxiao.pt │ │ ├── zf_xiaoyi.pt │ │ ├── zm_yunjian.pt │ │ ├── zm_yunxi.pt │ │ ├── zm_yunxia.pt │ │ └── zm_yunyang.pt └── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_audio_service.py │ ├── test_data │ ├── generate_test_data.py │ └── test_audio.npy │ ├── test_development.py │ ├── test_kokoro_v1.py │ ├── test_normalizer.py │ ├── test_openai_endpoints.py │ ├── test_paths.py │ ├── test_text_processor.py │ └── test_tts_service.py ├── assets ├── cpu_first_token_timeline_stream_openai.png ├── docs-screenshot.png ├── format_comparison.png ├── gpu_first_token_latency_direct.png ├── gpu_first_token_latency_openai.png ├── gpu_first_token_timeline_direct.png ├── gpu_first_token_timeline_openai.png ├── gpu_processing_time.png ├── gpu_realtime_factor.png ├── gpu_total_time_latency_direct.png ├── gpu_total_time_latency_openai.png ├── voice_analysis.png └── webui-screenshot.png ├── charts └── kokoro-fastapi │ ├── .helmignore │ ├── Chart.yaml │ ├── examples │ ├── aks-tls-values.yaml │ └── gpu-operator-values.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── hpa.yaml │ ├── ingress.yaml │ ├── kokoro-tts-deployment.yaml │ ├── kokoro-tts-service.yaml │ ├── serviceaccount.yaml │ └── tests │ │ └── test-connection.yaml │ └── values.yaml ├── debug.http ├── dev ├── Test Phon.py ├── Test Threads.py ├── Test copy 2.py ├── Test copy.py ├── Test money.py ├── Test num.py └── Test.py ├── docker-bake.hcl ├── docker ├── build.sh 
├── cpu │ ├── .dockerignore │ ├── Dockerfile │ └── docker-compose.yml ├── gpu │ ├── .dockerignore │ ├── Dockerfile │ └── docker-compose.yml └── scripts │ ├── download_model.py │ ├── download_model.sh │ └── entrypoint.sh ├── docs ├── architecture │ ├── espeak_setup_fix.md │ └── streaming_audio_writer_analysis.md ├── requirements.in └── requirements.txt ├── examples ├── __init__.py ├── assorted_checks │ ├── __init__.py │ ├── benchmarks │ │ ├── __init__.py │ │ ├── benchmark_first_token.py │ │ ├── benchmark_first_token_stream_unified.py │ │ ├── benchmark_tts_rtf.py │ │ ├── depr_benchmark_tts.py │ │ ├── lib │ │ │ ├── __init__.py │ │ │ ├── shared_benchmark_utils.py │ │ │ ├── shared_plotting.py │ │ │ ├── shared_utils.py │ │ │ └── stream_utils.py │ │ ├── output_data │ │ │ ├── cpu_benchmark_results_rtf.json │ │ │ ├── cpu_benchmark_stats_rtf.txt │ │ │ ├── first_token_benchmark_stream.json │ │ │ ├── first_token_benchmark_stream_openai.json │ │ │ ├── gpu_benchmark_results_rtf.json │ │ │ └── gpu_benchmark_stats_rtf.txt │ │ ├── output_plots │ │ │ ├── cpu_processing_time_rtf.png │ │ │ ├── cpu_realtime_factor_rtf.png │ │ │ ├── cpu_system_usage_rtf.png │ │ │ ├── first_token_latency_stream.png │ │ │ ├── first_token_latency_stream_openai.png │ │ │ ├── first_token_timeline_stream.png │ │ │ ├── first_token_timeline_stream_openai.png │ │ │ ├── gpu_processing_time_rtf.png │ │ │ ├── gpu_realtime_factor_rtf.png │ │ │ ├── gpu_system_usage_rtf.png │ │ │ ├── total_time_latency_stream.png │ │ │ └── total_time_latency_stream_openai.png │ │ └── the_time_machine_hg_wells.txt │ ├── generate_readme_plots.py │ ├── test_combinations │ │ ├── test_analyze_combined_voices.py │ │ └── test_download_voice.py │ ├── test_formats │ │ └── test_audio_formats.py │ ├── test_normalizer.py │ ├── test_openai │ │ └── test_openai_tts.py │ ├── test_voices │ │ ├── analyze_voice_dimensions.py │ │ ├── test_all_voices.py │ │ └── trim_voice_dimensions.py │ ├── validate_wav.py │ └── validate_wavs.py ├── audio_analysis.png 
├── captioned_speech_example.py ├── openai_streaming_audio.py ├── phoneme_examples │ ├── examples │ │ └── phoneme_examples │ │ │ └── output │ │ │ └── phoneme_test.wav │ ├── generate_phonemes.py │ └── test_phoneme_generation.py ├── requirements.txt ├── simul_file_test.py ├── simul_openai_streaming_audio.py ├── simul_speaker_test.py ├── speech.mp3 ├── stream_tts_playback.py ├── streaming_refactor │ ├── benchmark_unified_streaming.py │ └── test_unified_streaming.py └── voice_samples │ ├── speech_af.mp3 │ ├── speech_af_bella.mp3 │ ├── speech_af_nicole.mp3 │ ├── speech_af_sarah.mp3 │ ├── speech_am_adam.mp3 │ ├── speech_am_michael.mp3 │ ├── speech_bf_emma.mp3 │ ├── speech_bf_isabella.mp3 │ ├── speech_bm_george.mp3 │ └── speech_bm_lewis.mp3 ├── githubbanner.png ├── pyproject.toml ├── pytest.ini ├── scripts ├── fix_misaki.py ├── update_badges.py └── update_version.py ├── start-cpu.ps1 ├── start-cpu.sh ├── start-gpu.ps1 ├── start-gpu.sh ├── start-gpu_mac.sh ├── ui ├── Dockerfile ├── GUIBanner.png ├── GradioScreenShot.png ├── app.py ├── data │ └── inputs │ │ └── test_timemachine.txt ├── depr_tests │ ├── conftest.py │ ├── test_api.py │ ├── test_components.py │ ├── test_files.py │ ├── test_handlers.py │ ├── test_input.py │ └── test_interface.py └── lib │ ├── __init__.py │ ├── api.py │ ├── components │ ├── __init__.py │ ├── input.py │ ├── model.py │ └── output.py │ ├── config.py │ ├── files.py │ ├── handlers.py │ └── interface.py └── web ├── favicon.svg ├── index.html ├── siriwave.js ├── src ├── App.js ├── components │ ├── PlayerControls.js │ ├── TextEditor.js │ ├── VoiceSelector.js │ └── WaveVisualizer.js ├── services │ ├── AudioService.js │ └── VoiceService.js └── state │ └── PlayerState.js └── styles ├── badges.css ├── base.css ├── controls.css ├── forms.css ├── header.css ├── layout.css ├── player.css └── responsive.css /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | api 4 | ui 5 | omit = 6 | 
Kokoro-82M/* 7 | MagicMock/* 8 | test_*.py 9 | examples/* 10 | src/builds/* 11 | 12 | [report] 13 | exclude_lines = 14 | pragma: no cover 15 | def __repr__ 16 | raise NotImplementedError 17 | if __name__ == .__main__.: 18 | pass 19 | raise ImportError 20 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | 4 | # Python 5 | __pycache__ 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | .Python 10 | *.py[cod] 11 | *$py.class 12 | .pytest_cache 13 | .coverage 14 | .coveragerc 15 | 16 | # Environment 17 | # .env 18 | .venv 19 | env/ 20 | venv/ 21 | ENV/ 22 | 23 | # IDE 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | 29 | # Project specific 30 | examples/ 31 | Kokoro-82M/ 32 | ui/ 33 | tests/ 34 | *.md 35 | *.txt 36 | !requirements.txt 37 | 38 | # Docker 39 | Dockerfile* 40 | docker-compose* 41 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.py text eol=lf 4 | *.sh text eol=lf 5 | *.yml text eol=lf -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single 
IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: remsky 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Screenshots or console output** 14 | If applicable, add screenshots to help explain your problem. When doing so, please ensure you have the first command that triggered the trace and/or the command that started up your build included, otherwise it is difficult to diagnose. 15 | 16 | **Branch / Deployment used** 17 | Let us know if it's the master branch, or the stable branch indicated in the readme, as well as if you're running it locally, in the cloud, via the docker compose (cpu or gpu), or direct docker run commands. Please include the exact commands used to run in the latter cases. 18 | 19 | **Operating System** 20 | Include the platform, version numbers of your docker, etc. Whether it's GPU (Nvidia or other) or CPU, Mac, Linux, Windows, etc. 21 | 22 | **Additional context** 23 | Add any other context about the problem here. 
24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the feature you'd like** 11 | A clear and concise description of what you want to happen. Is it a quality of life improvement, something new entirely? 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. Consider whether it could be submitted as PR, or you'd need a hand to do so 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [ "master", "pre-release" ] 5 | pull_request: 6 | branches: [ "master", "pre-release" ] 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: ["3.10"] 13 | fail-fast: false 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | # Match Dockerfile dependencies 19 | - name: Install Dependencies 20 | run: | 21 | sudo apt-get update 22 | sudo apt-get install -y --no-install-recommends \ 23 | espeak-ng \ 24 | git \ 25 | libsndfile1 \ 26 | curl \ 27 | ffmpeg 28 | 29 | - name: Install uv 30 | uses: astral-sh/setup-uv@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | enable-cache: true 34 | - name: Install dependencies 35 | run: | 36 | uv pip install -e .[test,cpu] 37 | - name: Run Tests 38 | run: | 39 | uv run pytest api/tests/ --asyncio-mode=auto --cov=api --cov-report=term-missing 40 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Create Release and Publish Docker Images 2 | 3 | on: 4 | push: 5 | branches: 6 | - release # Trigger when commits are pushed to the release branch (e.g., after merging master) 7 | paths-ignore: 8 | - '**.md' 9 | - 'docs/**' 10 | 11 | jobs: 12 | prepare-release: 13 | runs-on: ubuntu-latest 14 | outputs: 15 | version: ${{ steps.get-version.outputs.version }} 16 | version_tag: ${{ steps.get-version.outputs.version_tag }} 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Get version from VERSION file 22 | id: get-version 23 | run: | 24 | VERSION_PLAIN=$(cat VERSION) 25 | echo "version=${VERSION_PLAIN}" >> $GITHUB_OUTPUT 26 | echo "version_tag=v${VERSION_PLAIN}" >> $GITHUB_OUTPUT # Add 'v' prefix for tag 27 | 28 | build-images: 29 | needs: prepare-release 30 | runs-on: ubuntu-latest 31 | permissions: 32 | packages: write # Needed to push images to GHCR 33 | env: 34 | DOCKER_BUILDKIT: 1 35 | BUILDKIT_STEP_LOG_MAX_SIZE: 10485760 36 | # This environment variable will override the VERSION variable in docker-bake.hcl 37 | VERSION: ${{ needs.prepare-release.outputs.version_tag }} # Use tag version (vX.Y.Z) for bake 38 | steps: 39 | - name: Checkout repository 40 | uses: actions/checkout@v4 41 | with: 42 | fetch-depth: 0 # Needed to check for existing tags 43 | 44 | - name: Check if tag already exists 45 | run: | 46 | TAG_NAME="${{ needs.prepare-release.outputs.version_tag }}" 47 | echo "Checking for existing tag: $TAG_NAME" 48 | # Fetch tags explicitly just in case checkout didn't get them all 49 | git fetch --tags 50 | if git rev-parse "$TAG_NAME" >/dev/null 2>&1; then 51 | echo "::error::Tag $TAG_NAME already exists. Please increment the version in the VERSION file." 52 | exit 1 53 | else 54 | echo "Tag $TAG_NAME does not exist. 
Proceeding with release." 55 | fi 56 | 57 | - name: Free disk space # Optional: Keep as needed for large builds 58 | run: | 59 | echo "Listing current disk space" 60 | df -h 61 | echo "Cleaning up disk space..." 62 | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache 63 | docker system prune -af 64 | echo "Disk space after cleanup" 65 | df -h 66 | 67 | - name: Set up QEMU 68 | uses: docker/setup-qemu-action@v3 # Use v3 69 | 70 | - name: Set up Docker Buildx 71 | uses: docker/setup-buildx-action@v3 # Use v3 72 | with: 73 | driver-opts: | 74 | image=moby/buildkit:latest 75 | network=host 76 | 77 | - name: Log in to GitHub Container Registry 78 | uses: docker/login-action@v3 # Use v3 79 | with: 80 | registry: ghcr.io 81 | username: ${{ github.actor }} 82 | password: ${{ secrets.GITHUB_TOKEN }} 83 | 84 | - name: Build and push images using Docker Bake 85 | run: | 86 | echo "Building and pushing images for version ${{ needs.prepare-release.outputs.version_tag }}" 87 | # The VERSION env var above sets the tag for the bake file targets 88 | docker buildx bake --push 89 | 90 | create-release: 91 | needs: [prepare-release, build-images] 92 | runs-on: ubuntu-latest 93 | permissions: 94 | contents: write # Needed to create releases 95 | steps: 96 | - name: Checkout repository 97 | uses: actions/checkout@v4 98 | with: 99 | fetch-depth: 0 # Fetch all history for release notes generation 100 | 101 | - name: Create GitHub Release 102 | uses: softprops/action-gh-release@v2 # Use v2 103 | with: 104 | tag_name: ${{ needs.prepare-release.outputs.version_tag }} # Use vX.Y.Z tag 105 | name: Release ${{ needs.prepare-release.outputs.version_tag }} 106 | generate_release_notes: true # Auto-generate release notes 107 | draft: false # Publish immediately 108 | prerelease: false # Mark as a stable release 109 | env: 110 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 111 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | 4 | # Python 5 | __pycache__/ 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | *.py[cod] 10 | *$py.class 11 | .Python 12 | .pytest_cache 13 | .coverage 14 | .coveragerc 15 | 16 | # Python package build artifacts 17 | *.egg-info/ 18 | *.egg 19 | dist/ 20 | build/ 21 | *.onnx 22 | *.pth 23 | # Environment 24 | # .env 25 | .venv/ 26 | env/ 27 | venv/ 28 | ENV/ 29 | 30 | # IDE 31 | .idea/ 32 | .vscode/ 33 | *.swp 34 | *.swo 35 | 36 | # Project specific 37 | # Model files 38 | 39 | *.pth 40 | *.tar* 41 | 42 | 43 | # Other project files 44 | .env 45 | Kokoro-82M/ 46 | ui/data/ 47 | EXTERNAL_UV_DOCUMENTATION* 48 | app 49 | api/temp_files/ 50 | 51 | # Docker 52 | Dockerfile* 53 | docker-compose* 54 | examples/ebook_test/chapter_to_audio.py 55 | examples/ebook_test/chapters_to_audio.py 56 | examples/ebook_test/parse_epub.py 57 | api/src/voices/af_jadzia.pt 58 | examples/assorted_checks/test_combinations/output/* 59 | examples/assorted_checks/test_openai/output/* 60 | 61 | 62 | # Audio files 63 | examples/*.wav 64 | examples/*.pcm 65 | examples/*.mp3 66 | examples/*.flac 67 | examples/*.acc 68 | examples/*.ogg 69 | examples/speech.mp3 70 | examples/phoneme_examples/output/*.wav 71 | examples/assorted_checks/benchmarks/output_audio/* 72 | uv.lock 73 | 74 | # Mac MPS virtualenv for dual testing 75 | .venv-mps 76 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /.ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 88 2 | 3 | exclude = ["examples"] 4 | 5 | [lint] 6 | select = ["I"] 7 | 8 | [lint.isort] 9 | combine-as-imports = true 10 | force-wrap-aliases = true 11 | 
split-on-trailing-comma = true 12 | section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Kokoro-FastAPI 2 | 3 | Always appreciate community involvement in making this project better. 4 | 5 | ## Development Setup 6 | 7 | We use `uv` for managing Python environments and dependencies, and `ruff` for linting and formatting. 8 | 9 | 1. **Clone the repository:** 10 | ```bash 11 | git clone https://github.com/remsky/Kokoro-FastAPI.git 12 | cd Kokoro-FastAPI 13 | ``` 14 | 15 | 2. **Install `uv`:** 16 | Follow the instructions on the [official `uv` documentation](https://docs.astral.sh/uv/install/). 17 | 18 | 3. **Create a virtual environment and install dependencies:** 19 | It's recommended to use a virtual environment. `uv` can create one for you. Install the base dependencies along with the `test` and `cpu` extras (needed for running tests locally). 20 | ```bash 21 | # Create and activate a virtual environment (e.g., named .venv) 22 | uv venv 23 | source .venv/bin/activate # On Linux/macOS 24 | # .venv\Scripts\activate # On Windows 25 | 26 | # Install dependencies including test requirements 27 | uv pip install -e ".[test,cpu]" 28 | ``` 29 | *Note: If you have an NVIDIA GPU and want to test GPU-specific features locally, you can install `.[test,gpu]` instead, ensuring you have the correct CUDA toolkit installed.* 30 | 31 | *Note: If running via uv locally, you will have to install espeak and handle any pathing issues that arise. The Docker images handle this automatically* 32 | 33 | 4. **Install `ruff` (if not already installed globally):** 34 | While `ruff` might be included via dependencies, installing it explicitly ensures you have it available. 
35 | ```bash 36 | uv pip install ruff 37 | ``` 38 | 39 | ## Running Tests 40 | 41 | Before submitting changes, please ensure all tests pass as this is an automated requirement. The tests are run using `pytest`. 42 | ```bash 43 | # Make sure your virtual environment is activated 44 | uv run pytest 45 | ``` 46 | *Note: The CI workflow runs tests using `uv run pytest api/tests/ --asyncio-mode=auto --cov=api --cov-report=term-missing`. Running `uv run pytest` locally should cover the essential checks.* 47 | 48 | ## Testing with Docker Compose 49 | 50 | In addition to local `pytest` runs, test your changes using Docker Compose to ensure they work correctly within the containerized environment. If you aren't able to test on CUDA hardware, make note so it can be tested by another maintainer 51 | 52 | ```bash 53 | 54 | docker compose -f docker/cpu/docker-compose.yml up --build 55 | + 56 | docker compose -f docker/gpu/docker-compose.yml up --build 57 | ``` 58 | This command will build the Docker images (if they've changed) and start the services defined in the respective compose file. Verify the application starts correctly and test the relevant functionality. 59 | 60 | ## Code Formatting and Linting 61 | 62 | We use `ruff` to maintain code quality and consistency. Please format and lint your code before committing. 63 | 64 | 1. **Format the code:** 65 | ```bash 66 | # Make sure your virtual environment is activated 67 | ruff format . 68 | ``` 69 | 70 | 2. **Lint the code (and apply automatic fixes):** 71 | ```bash 72 | # Make sure your virtual environment is activated 73 | ruff check . --fix 74 | ``` 75 | Review any changes made by `--fix` and address any remaining linting errors manually. 76 | 77 | ## Submitting Changes 78 | 79 | 0. Clone the repo 80 | 1. Create a new branch for your feature or bug fix. 81 | 2. Make your changes, following setup, testing, and formatting guidelines above. 82 | 3. Please try to keep your changes in line with the current design, and modular. 
Large-scale changes will take longer to review and integrate, and have less chance of being approved outright. 83 | 4. Push your branch to your fork. 84 | 5. Open a Pull Request against the `master` branch of the main repository. 85 | 86 | Thank you for contributing! 87 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.3.0 2 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | # Make api directory a Python package 2 | -------------------------------------------------------------------------------- /api/src/builds/v1_0/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "istftnet": { 3 | "upsample_kernel_sizes": [ 4 | 20, 5 | 12 6 | ], 7 | "upsample_rates": [ 8 | 10, 9 | 6 10 | ], 11 | "gen_istft_hop_size": 5, 12 | "gen_istft_n_fft": 20, 13 | "resblock_dilation_sizes": [ 14 | [ 15 | 1, 16 | 3, 17 | 5 18 | ], 19 | [ 20 | 1, 21 | 3, 22 | 5 23 | ], 24 | [ 25 | 1, 26 | 3, 27 | 5 28 | ] 29 | ], 30 | "resblock_kernel_sizes": [ 31 | 3, 32 | 7, 33 | 11 34 | ], 35 | "upsample_initial_channel": 512 36 | }, 37 | "dim_in": 64, 38 | "dropout": 0.2, 39 | "hidden_dim": 512, 40 | "max_conv_dim": 512, 41 | "max_dur": 50, 42 | "multispeaker": true, 43 | "n_layer": 3, 44 | "n_mels": 80, 45 | "n_token": 178, 46 | "style_dim": 128, 47 | "text_encoder_kernel_size": 5, 48 | "plbert": { 49 | "hidden_size": 768, 50 | "num_attention_heads": 12, 51 | "intermediate_size": 2048, 52 | "max_position_embeddings": 512, 53 | "num_hidden_layers": 12, 54 | "dropout": 0.1 55 | }, 56 | "vocab": { 57 | ";": 1, 58 | ":": 2, 59 | ",": 3, 60 | ".": 4, 61 | "!": 5, 62 | "?": 6, 63 | "—": 9, 64 | "…": 10, 65 | "\"": 11, 66 | "(": 12, 67 | ")": 13, 68 | "“": 14, 69 | "”": 15, 70 | " ": 16, 71 | "̃": 
17, 72 | "ʣ": 18, 73 | "ʥ": 19, 74 | "ʦ": 20, 75 | "ʨ": 21, 76 | "ᵝ": 22, 77 | "ꭧ": 23, 78 | "A": 24, 79 | "I": 25, 80 | "O": 31, 81 | "Q": 33, 82 | "S": 35, 83 | "T": 36, 84 | "W": 39, 85 | "Y": 41, 86 | "ᵊ": 42, 87 | "a": 43, 88 | "b": 44, 89 | "c": 45, 90 | "d": 46, 91 | "e": 47, 92 | "f": 48, 93 | "h": 50, 94 | "i": 51, 95 | "j": 52, 96 | "k": 53, 97 | "l": 54, 98 | "m": 55, 99 | "n": 56, 100 | "o": 57, 101 | "p": 58, 102 | "q": 59, 103 | "r": 60, 104 | "s": 61, 105 | "t": 62, 106 | "u": 63, 107 | "v": 64, 108 | "w": 65, 109 | "x": 66, 110 | "y": 67, 111 | "z": 68, 112 | "ɑ": 69, 113 | "ɐ": 70, 114 | "ɒ": 71, 115 | "æ": 72, 116 | "β": 75, 117 | "ɔ": 76, 118 | "ɕ": 77, 119 | "ç": 78, 120 | "ɖ": 80, 121 | "ð": 81, 122 | "ʤ": 82, 123 | "ə": 83, 124 | "ɚ": 85, 125 | "ɛ": 86, 126 | "ɜ": 87, 127 | "ɟ": 90, 128 | "ɡ": 92, 129 | "ɥ": 99, 130 | "ɨ": 101, 131 | "ɪ": 102, 132 | "ʝ": 103, 133 | "ɯ": 110, 134 | "ɰ": 111, 135 | "ŋ": 112, 136 | "ɳ": 113, 137 | "ɲ": 114, 138 | "ɴ": 115, 139 | "ø": 116, 140 | "ɸ": 118, 141 | "θ": 119, 142 | "œ": 120, 143 | "ɹ": 123, 144 | "ɾ": 125, 145 | "ɻ": 126, 146 | "ʁ": 128, 147 | "ɽ": 129, 148 | "ʂ": 130, 149 | "ʃ": 131, 150 | "ʈ": 132, 151 | "ʧ": 133, 152 | "ʊ": 135, 153 | "ʋ": 136, 154 | "ʌ": 138, 155 | "ɣ": 139, 156 | "ɤ": 140, 157 | "χ": 142, 158 | "ʎ": 143, 159 | "ʒ": 147, 160 | "ʔ": 148, 161 | "ˈ": 156, 162 | "ˌ": 157, 163 | "ː": 158, 164 | "ʰ": 162, 165 | "ʲ": 164, 166 | "↓": 169, 167 | "→": 171, 168 | "↗": 172, 169 | "↘": 173, 170 | "ᵻ": 177 171 | } 172 | } -------------------------------------------------------------------------------- /api/src/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import settings 2 | 3 | __all__ = ["settings"] 4 | -------------------------------------------------------------------------------- /api/src/core/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
pydantic_settings import BaseSettings 3 | 4 | 5 | class Settings(BaseSettings): 6 | # API Settings 7 | api_title: str = "Kokoro TTS API" 8 | api_description: str = "API for text-to-speech generation using Kokoro" 9 | api_version: str = "1.0.0" 10 | host: str = "0.0.0.0" 11 | port: int = 8880 12 | 13 | # Application Settings 14 | output_dir: str = "output" 15 | output_dir_size_limit_mb: float = 500.0 # Maximum size of output directory in MB 16 | default_voice: str = "af_heart" 17 | default_voice_code: str | None = ( 18 | None # If set, overrides the first letter of voice name, though api call param still takes precedence 19 | ) 20 | use_gpu: bool = True # Whether to use GPU acceleration if available 21 | device_type: str | None = ( 22 | None # Will be auto-detected if None, can be "cuda", "mps", or "cpu" 23 | ) 24 | allow_local_voice_saving: bool = ( 25 | False # Whether to allow saving combined voices locally 26 | ) 27 | 28 | # Container absolute paths 29 | model_dir: str = "/app/api/src/models" # Absolute path in container 30 | voices_dir: str = "/app/api/src/voices/v1_0" # Absolute path in container 31 | 32 | # Audio Settings 33 | sample_rate: int = 24000 34 | # Text Processing Settings 35 | target_min_tokens: int = 175 # Target minimum tokens per chunk 36 | target_max_tokens: int = 250 # Target maximum tokens per chunk 37 | absolute_max_tokens: int = 450 # Absolute maximum tokens per chunk 38 | advanced_text_normalization: bool = True # Preprocesses the text before misaki 39 | voice_weight_normalization: bool = ( 40 | True # Normalize the voice weights so they add up to 1 41 | ) 42 | 43 | gap_trim_ms: int = ( 44 | 1 # Base amount to trim from streaming chunk ends in milliseconds 45 | ) 46 | dynamic_gap_trim_padding_ms: int = 410 # Padding to add to dynamic gap trim 47 | dynamic_gap_trim_padding_char_multiplier: dict[str, float] = { 48 | ".": 1, 49 | "!": 0.9, 50 | "?": 1, 51 | ",": 0.8, 52 | } 53 | 54 | # Web Player Settings 55 | enable_web_player: bool = True # 
Whether to serve the web player UI 56 | web_player_path: str = "web" # Path to web player static files 57 | cors_origins: list[str] = ["*"] # CORS origins for web player 58 | cors_enabled: bool = True # Whether to enable CORS 59 | 60 | # Temp File Settings for WEB Ui 61 | temp_file_dir: str = "api/temp_files" # Directory for temporary audio files (relative to project root) 62 | max_temp_dir_size_mb: int = 2048 # Maximum size of temp directory (2GB) 63 | max_temp_dir_age_hours: int = 1 # Remove temp files older than 1 hour 64 | max_temp_dir_count: int = 3 # Maximum number of temp files to keep 65 | 66 | class Config: 67 | env_file = ".env" 68 | 69 | def get_device(self) -> str: 70 | """Get the appropriate device based on settings and availability""" 71 | if not self.use_gpu: 72 | return "cpu" 73 | 74 | if self.device_type: 75 | return self.device_type 76 | 77 | # Auto-detect device 78 | if torch.backends.mps.is_available(): 79 | return "mps" 80 | elif torch.cuda.is_available(): 81 | return "cuda" 82 | return "cpu" 83 | 84 | 85 | settings = Settings() 86 | -------------------------------------------------------------------------------- /api/src/core/don_quixote.txt: -------------------------------------------------------------------------------- 1 | In a village of La Mancha, the name of which I have no desire to call 2 | to mind, there lived not long since one of those gentlemen that keep a 3 | lance in the lance-rack, an old buckler, a lean hack, and a greyhound 4 | for coursing. An olla of rather more beef than mutton, a salad on most 5 | nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so 6 | extra on Sundays, made away with three-quarters of his income. The rest 7 | of it went in a doublet of fine cloth and velvet breeches and shoes to 8 | match for holidays, while on week-days he made a brave figure in his 9 | best homespun. 
-------------------------------------------------------------------------------- /api/src/core/model_config.py: -------------------------------------------------------------------------------- 1 | """Model configuration for Kokoro V1. 2 | 3 | This module provides model-specific configuration settings that complement the application-level 4 | settings in config.py. While config.py handles general application settings (API, paths, etc.), 5 | this module focuses on memory management and model file paths. 6 | """ 7 | 8 | from pydantic import BaseModel, Field 9 | 10 | 11 | class KokoroV1Config(BaseModel): 12 | """Kokoro V1 configuration.""" 13 | 14 | languages: list[str] = ["en"] 15 | 16 | class Config: 17 | frozen = True 18 | 19 | 20 | class PyTorchConfig(BaseModel): 21 | """PyTorch backend configuration.""" 22 | 23 | memory_threshold: float = Field(0.8, description="Memory threshold for cleanup") 24 | retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors") 25 | 26 | class Config: 27 | frozen = True 28 | 29 | 30 | class ModelConfig(BaseModel): 31 | """Kokoro V1 model configuration.""" 32 | 33 | # General settings 34 | cache_voices: bool = Field(True, description="Whether to cache voice tensors") 35 | voice_cache_size: int = Field(2, description="Maximum number of cached voices") 36 | 37 | # Model filename 38 | pytorch_kokoro_v1_file: str = Field( 39 | "v1_0/kokoro-v1_0.pth", description="PyTorch Kokoro V1 model filename" 40 | ) 41 | 42 | # Backend config 43 | pytorch_gpu: PyTorchConfig = Field(default_factory=PyTorchConfig) 44 | 45 | class Config: 46 | frozen = True 47 | 48 | 49 | # Global instance 50 | model_config = ModelConfig() 51 | -------------------------------------------------------------------------------- /api/src/core/openai_mappings.json: -------------------------------------------------------------------------------- 1 | { 2 | "models": { 3 | "tts-1": "kokoro-v1_0", 4 | "tts-1-hd": "kokoro-v1_0", 5 | "kokoro": "kokoro-v1_0" 6 | }, 7 
| "voices": { 8 | "alloy": "am_v0adam", 9 | "ash": "af_v0nicole", 10 | "coral": "bf_v0emma", 11 | "echo": "af_v0bella", 12 | "fable": "af_sarah", 13 | "onyx": "bm_george", 14 | "nova": "bf_isabella", 15 | "sage": "am_michael", 16 | "shimmer": "af_sky" 17 | } 18 | } -------------------------------------------------------------------------------- /api/src/inference/__init__.py: -------------------------------------------------------------------------------- 1 | """Model inference package.""" 2 | 3 | from .base import BaseModelBackend 4 | from .kokoro_v1 import KokoroV1 5 | from .model_manager import ModelManager, get_manager 6 | 7 | __all__ = [ 8 | "BaseModelBackend", 9 | "ModelManager", 10 | "get_manager", 11 | "KokoroV1", 12 | ] 13 | -------------------------------------------------------------------------------- /api/src/inference/base.py: -------------------------------------------------------------------------------- 1 | """Base interface for Kokoro inference.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import AsyncGenerator, List, Optional, Tuple, Union 5 | 6 | import numpy as np 7 | import torch 8 | 9 | 10 | class AudioChunk: 11 | """Class for audio chunks returned by model backends""" 12 | 13 | def __init__( 14 | self, 15 | audio: np.ndarray, 16 | word_timestamps: Optional[List] = [], 17 | output: Optional[Union[bytes, np.ndarray]] = b"", 18 | ): 19 | self.audio = audio 20 | self.word_timestamps = word_timestamps 21 | self.output = output 22 | 23 | @staticmethod 24 | def combine(audio_chunk_list: List): 25 | output = AudioChunk( 26 | audio_chunk_list[0].audio, audio_chunk_list[0].word_timestamps 27 | ) 28 | 29 | for audio_chunk in audio_chunk_list[1:]: 30 | output.audio = np.concatenate( 31 | (output.audio, audio_chunk.audio), dtype=np.int16 32 | ) 33 | if output.word_timestamps is not None: 34 | output.word_timestamps += audio_chunk.word_timestamps 35 | 36 | return output 37 | 38 | 39 | class ModelBackend(ABC): 40 | """Abstract base class 
class ModelBackend(ABC):
    """Abstract interface every model inference backend must implement."""

    @abstractmethod
    async def load_model(self, path: str) -> None:
        """Load model from path.

        Args:
            path: Path to model file

        Raises:
            RuntimeError: If model loading fails
        """
        pass

    @abstractmethod
    async def generate(
        self,
        text: str,
        voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
        speed: float = 1.0,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Generate audio from text as an async stream.

        Args:
            text: Input text to synthesize
            voice: Either a voice path, or a (name, tensor-or-path) tuple
            speed: Playback speed multiplier (1.0 = normal)

        Yields:
            AudioChunk objects as generation progresses

        Raises:
            RuntimeError: If generation fails
        """
        pass

    @abstractmethod
    def unload(self) -> None:
        """Unload model and free resources."""
        pass

    @property
    @abstractmethod
    def is_loaded(self) -> bool:
        """Check if model is loaded.

        Returns:
            True if model is loaded, False otherwise
        """
        pass

    @property
    @abstractmethod
    def device(self) -> str:
        """Get device model is running on.

        Returns:
            Device string ('cpu' or 'cuda')
        """
        pass
class BaseModelBackend(ModelBackend):
    """Shared state and lifecycle handling for concrete model backends."""

    def __init__(self):
        """Start in the unloaded state, pinned to the CPU by default."""
        self._model: Optional[torch.nn.Module] = None
        self._device: str = "cpu"

    @property
    def device(self) -> str:
        """Device identifier this backend runs on ('cpu' or 'cuda')."""
        return self._device

    @property
    def is_loaded(self) -> bool:
        """Whether a model is currently held by this backend."""
        return self._model is not None

    def unload(self) -> None:
        """Drop the model reference and reclaim GPU memory if applicable."""
        if self._model is None:
            return
        del self._model
        self._model = None
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
class VoiceManager:
    """Manages voice loading and caching with controlled resource usage."""

    # Singleton instance, created lazily by get_manager()
    _instance = None

    def __init__(self):
        """Initialize voice manager.

        Strictly respects settings.use_gpu via settings.get_device().
        """
        self._device = settings.get_device()
        # NOTE(review): this cache only ever grows — nothing evicts entries.
        # Confirm against the voice_cache_size setting if memory matters.
        self._voices: Dict[str, torch.Tensor] = {}

    async def get_voice_path(self, voice_name: str) -> str:
        """Get path to voice file.

        Args:
            voice_name: Name of voice

        Returns:
            Path to voice file

        Raises:
            RuntimeError: If voice not found
        """
        return await paths.get_voice_path(voice_name)

    async def load_voice(
        self, voice_name: str, device: Optional[str] = None
    ) -> torch.Tensor:
        """Load voice tensor and cache it under its name.

        Args:
            voice_name: Name of voice to load
            device: Optional override for target device

        Returns:
            Voice tensor

        Raises:
            RuntimeError: If voice not found or loading fails
        """
        try:
            voice_path = await self.get_voice_path(voice_name)
            target_device = device or self._device
            voice = await paths.load_voice_tensor(voice_path, target_device)
            self._voices[voice_name] = voice
            return voice
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in tracebacks.
            raise RuntimeError(f"Failed to load voice {voice_name}: {e}") from e

    async def combine_voices(
        self, voices: List[str], device: Optional[str] = None
    ) -> torch.Tensor:
        """Combine multiple voices by element-wise averaging.

        Args:
            voices: List of voice names to combine (at least two)
            device: Optional override for target device

        Returns:
            Combined voice tensor (mean over the stacked voice tensors)

        Raises:
            ValueError: If fewer than two voices are given
            RuntimeError: If any voice not found
        """
        if len(voices) < 2:
            raise ValueError("Need at least 2 voices to combine")

        target_device = device or self._device
        voice_tensors = []
        for name in voices:
            voice = await self.load_voice(name, target_device)
            voice_tensors.append(voice)

        combined = torch.mean(torch.stack(voice_tensors), dim=0)
        return combined

    async def list_voices(self) -> List[str]:
        """List available voice names.

        Returns:
            List of voice names
        """
        return await paths.list_voices()

    def cache_info(self) -> "dict[str, int | str]":
        """Get cache statistics.

        Returns:
            Dict with the number of cached voices ("loaded_voices") and the
            manager's device string ("device").
        """
        # Annotation fixed: the previous Dict[str, int] was wrong — the
        # "device" entry is a string. Kept as a string annotation so no new
        # typing import is required at runtime.
        return {"loaded_voices": len(self._voices), "device": self._device}
async def get_manager() -> VoiceManager:
    """Return the process-wide VoiceManager, creating it on first use.

    Returns:
        The singleton VoiceManager instance.
    """
    instance = VoiceManager._instance
    if instance is None:
        instance = VoiceManager()
        VoiceManager._instance = instance
    return instance
"m": 55, 77 | "n": 56, 78 | "o": 57, 79 | "p": 58, 80 | "q": 59, 81 | "r": 60, 82 | "s": 61, 83 | "t": 62, 84 | "u": 63, 85 | "v": 64, 86 | "w": 65, 87 | "x": 66, 88 | "y": 67, 89 | "z": 68, 90 | "ɑ": 69, 91 | "ɐ": 70, 92 | "ɒ": 71, 93 | "æ": 72, 94 | "β": 75, 95 | "ɔ": 76, 96 | "ɕ": 77, 97 | "ç": 78, 98 | "ɖ": 80, 99 | "ð": 81, 100 | "ʤ": 82, 101 | "ə": 83, 102 | "ɚ": 85, 103 | "ɛ": 86, 104 | "ɜ": 87, 105 | "ɟ": 90, 106 | "ɡ": 92, 107 | "ɥ": 99, 108 | "ɨ": 101, 109 | "ɪ": 102, 110 | "ʝ": 103, 111 | "ɯ": 110, 112 | "ɰ": 111, 113 | "ŋ": 112, 114 | "ɳ": 113, 115 | "ɲ": 114, 116 | "ɴ": 115, 117 | "ø": 116, 118 | "ɸ": 118, 119 | "θ": 119, 120 | "œ": 120, 121 | "ɹ": 123, 122 | "ɾ": 125, 123 | "ɻ": 126, 124 | "ʁ": 128, 125 | "ɽ": 129, 126 | "ʂ": 130, 127 | "ʃ": 131, 128 | "ʈ": 132, 129 | "ʧ": 133, 130 | "ʊ": 135, 131 | "ʋ": 136, 132 | "ʌ": 138, 133 | "ɣ": 139, 134 | "ɤ": 140, 135 | "χ": 142, 136 | "ʎ": 143, 137 | "ʒ": 147, 138 | "ʔ": 148, 139 | "ˈ": 156, 140 | "ˌ": 157, 141 | "ː": 158, 142 | "ʰ": 162, 143 | "ʲ": 164, 144 | "↓": 169, 145 | "→": 171, 146 | "↗": 172, 147 | "↘": 173, 148 | "ᵻ": 177 149 | } 150 | } -------------------------------------------------------------------------------- /api/src/routers/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /api/src/routers/web_player.py: -------------------------------------------------------------------------------- 1 | """Web player router with async file serving.""" 2 | 3 | from fastapi import APIRouter, HTTPException 4 | from fastapi.responses import Response 5 | from loguru import logger 6 | 7 | from ..core.config import settings 8 | from ..core.paths import get_content_type, get_web_file_path, read_bytes 9 | 10 | router = APIRouter( 11 | tags=["Web Player"], 12 | responses={404: {"description": "Not found"}}, 13 | ) 14 | 15 | 16 | @router.get("/{filename:path}") 17 | async def 
@router.get("/{filename:path}")
async def serve_web_file(filename: str):
    """Serve web player static files asynchronously.

    Args:
        filename: Path component of the request; empty or "/" maps to
            index.html.

    Raises:
        HTTPException: 404 when the player is disabled or the file is
            missing, 500 on any other failure.
    """
    if not settings.enable_web_player:
        raise HTTPException(status_code=404, detail="Web player is disabled")

    try:
        # Default to index.html for root path
        if filename in ("", "/"):
            filename = "index.html"

        # Get file path
        file_path = await get_web_file_path(filename)

        # Read file content
        content = await read_bytes(file_path)

        # Get content type
        content_type = await get_content_type(file_path)

        return Response(
            content=content,
            media_type=content_type,
            headers={
                "Cache-Control": "no-cache",  # Prevent caching during development
            },
        )

    except RuntimeError as e:
        # get_web_file_path/read_bytes signal "not found" via RuntimeError.
        # Both log f-strings previously interpolated nothing; include the
        # filename so failures are diagnosable.
        logger.warning(f"Web file not found: {filename}")
        raise HTTPException(status_code=404, detail=str(e))
    except Exception as e:
        logger.error(f"Error serving web file {filename}: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
"tokenize", 18 | "process_text", 19 | "process_text_chunk", 20 | "smart_split", 21 | ] 22 | -------------------------------------------------------------------------------- /api/src/services/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from abc import ABC, abstractmethod 3 | 4 | import phonemizer 5 | 6 | from .normalizer import normalize_text 7 | from ...structures.schemas import NormalizationOptions 8 | 9 | phonemizers = {} 10 | 11 | 12 | class PhonemizerBackend(ABC): 13 | """Abstract base class for phonemization backends""" 14 | 15 | @abstractmethod 16 | def phonemize(self, text: str) -> str: 17 | """Convert text to phonemes 18 | 19 | Args: 20 | text: Text to convert to phonemes 21 | 22 | Returns: 23 | Phonemized text 24 | """ 25 | pass 26 | 27 | 28 | class EspeakBackend(PhonemizerBackend): 29 | """Espeak-based phonemizer implementation""" 30 | 31 | def __init__(self, language: str): 32 | """Initialize espeak backend 33 | 34 | Args: 35 | language: Language code ('en-us' or 'en-gb') 36 | """ 37 | self.backend = phonemizer.backend.EspeakBackend( 38 | language=language, preserve_punctuation=True, with_stress=True 39 | ) 40 | 41 | self.language = language 42 | 43 | def phonemize(self, text: str) -> str: 44 | """Convert text to phonemes using espeak 45 | 46 | Args: 47 | text: Text to convert to phonemes 48 | 49 | Returns: 50 | Phonemized text 51 | """ 52 | # Phonemize text 53 | ps = self.backend.phonemize([text]) 54 | ps = ps[0] if ps else "" 55 | 56 | # Handle special cases 57 | ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ") 58 | ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l") 59 | ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps) 60 | ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»"" ]|$)', "z", ps) 61 | 62 | # Language-specific rules 63 | if self.language == "en-us": 64 | ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps) 65 | 66 | return 
def get_vocab():
    """Build the character → token-id mapping used by the tokenizer.

    A symbol's token id is simply its position in the fixed ordering:
    pad, punctuation, ASCII letters, then IPA symbols.
    """
    pad = "$"
    punctuation = ';:,.!?¡¿—…"«»"" '
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # Enumerating the concatenated string reproduces the positional ids.
    ordered_symbols = pad + punctuation + letters + letters_ipa
    return {symbol: index for index, symbol in enumerate(ordered_symbols)}
def tokenize(phonemes: str) -> list[int]:
    """Convert a phoneme string into token IDs.

    Symbols not present in the vocabulary are silently skipped.

    Args:
        phonemes: String of phonemes to tokenize

    Returns:
        List of token IDs
    """
    # Strip first: leading/trailing spaces would tokenize to the space id
    # and can cause audio artifacts.
    stripped = phonemes.strip()
    return [VOCAB[symbol] for symbol in stripped if symbol in VOCAB]


def decode_tokens(tokens: list[int]) -> str:
    """Convert token IDs back into the phoneme string they encode.

    Args:
        tokens: List of token IDs

    Returns:
        String of phonemes
    """
    # Invert the vocabulary mapping for this call.
    symbol_for = {token_id: symbol for symbol, token_id in VOCAB.items()}
    return "".join(symbol_for[token_id] for token_id in tokens)
class JSONStreamingResponse(StreamingResponse, JSONResponse):
    """StreamingResponse that renders each streamed item as one JSON line."""

    def __init__(
        self,
        content: Iterable | AsyncIterable,
        status_code: int = 200,
        headers: dict[str, str] | None = None,
        media_type: str | None = None,
        background: BackgroundTask | None = None,
    ) -> None:
        """Set up streaming of *content*, serialized item by item.

        Args:
            content: Sync or async iterable of items; pydantic models are
                converted via model_dump() before serialization.
            status_code: HTTP status code for the response.
            headers: Optional extra response headers.
            media_type: Optional content-type override.
            background: Optional task to run after the response completes.
        """
        if isinstance(content, AsyncIterable):
            self._content_iterable: AsyncIterable = content
        else:
            # Sync iterables are pumped in a thread pool so iteration
            # cannot block the event loop.
            self._content_iterable = iterate_in_threadpool(content)

        async def body_iterator() -> AsyncIterable[bytes]:
            async for content_ in self._content_iterable:
                if isinstance(content_, BaseModel):
                    content_ = content_.model_dump()
                yield self.render(content_)

        # Mirrors StreamingResponse.__init__ manually (neither parent's
        # __init__ is called) so both parents' attributes are populated.
        self.body_iterator = body_iterator()
        self.status_code = status_code
        if media_type is not None:
            self.media_type = media_type
        self.background = background
        self.init_headers(headers)

    def render(self, content: typing.Any) -> bytes:
        """Serialize one item to compact JSON followed by a newline."""
        return (
            json.dumps(
                content,
                ensure_ascii=False,
                allow_nan=False,
                indent=None,
                separators=(",", ":"),
            )
            + "\n"
        ).encode("utf-8")
class PhonemeRequest(BaseModel):
    """Request for text → phoneme conversion."""

    text: str
    language: str = "a"  # Default to American English


class PhonemeResponse(BaseModel):
    """Phoneme conversion result: the phoneme string and its token IDs."""

    phonemes: str
    tokens: list[int]


class StitchOptions(BaseModel):
    """Options for stitching audio chunks together"""

    gap_method: str = Field(
        default="static_trim",
        description="Method to handle gaps between chunks. Currently only 'static_trim' supported.",
    )
    trim_ms: int = Field(
        default=0,
        ge=0,
        description="Milliseconds to trim from chunk boundaries when using static_trim",
    )

    @field_validator("gap_method")
    @classmethod
    def validate_gap_method(cls, v: str) -> str:
        """Reject any gap method other than the supported 'static_trim'."""
        if v != "static_trim":
            raise ValueError("Currently only 'static_trim' gap method is supported")
        return v


class GenerateFromPhonemesRequest(BaseModel):
    """Simple request for phoneme-to-speech generation"""

    phonemes: str = Field(..., description="Phoneme string to synthesize")
    voice: str = Field(..., description="Voice ID to use for generation")
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_bella.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_heart.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_heart.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_jadzia.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_jadzia.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_jessica.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_jessica.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_kore.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_kore.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_nicole.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_nicole.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_nova.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_nova.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_river.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_river.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_sarah.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_sarah.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_sky.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_sky.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0bella.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0bella.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0irulan.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0irulan.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0nicole.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0nicole.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0sarah.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0sarah.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0sky.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0sky.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_adam.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_adam.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_echo.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_echo.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_eric.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_eric.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_fenrir.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_fenrir.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_liam.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_liam.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_michael.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_michael.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_onyx.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_onyx.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_puck.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_puck.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_santa.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_santa.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_v0adam.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_v0adam.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_v0gurney.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_v0gurney.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_v0michael.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_v0michael.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_alice.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_alice.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_emma.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_emma.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_lily.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_lily.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_v0emma.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_v0emma.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_v0isabella.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_v0isabella.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_daniel.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_daniel.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_fable.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_fable.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_george.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_george.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_lewis.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_lewis.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_v0george.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_v0george.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_v0lewis.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_v0lewis.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/ef_dora.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/ef_dora.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/em_alex.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/em_alex.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/em_santa.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/em_santa.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/ff_siwis.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/ff_siwis.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hf_alpha.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hf_alpha.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hf_beta.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hf_beta.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hm_omega.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hm_omega.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hm_psi.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hm_psi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/if_sara.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/if_sara.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/im_nicola.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/im_nicola.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_alpha.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_alpha.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_gongitsune.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_gongitsune.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_nezumi.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_nezumi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_tebukuro.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_tebukuro.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jm_kumo.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jm_kumo.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/pf_dora.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/pf_dora.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/pm_alex.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/pm_alex.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/pm_santa.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/pm_santa.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaobei.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaobei.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaoni.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaoni.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaoxiao.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaoxiao.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaoyi.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaoyi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunjian.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunjian.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunxi.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunxi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunxia.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunxia.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunyang.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunyang.pt -------------------------------------------------------------------------------- /api/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Make tests directory a Python package 2 | -------------------------------------------------------------------------------- /api/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from 
unittest.mock import AsyncMock, MagicMock, patch 4 | 5 | import numpy as np 6 | import pytest 7 | import pytest_asyncio 8 | import torch 9 | 10 | from api.src.inference.model_manager import ModelManager 11 | from api.src.inference.voice_manager import VoiceManager 12 | from api.src.services.tts_service import TTSService 13 | from api.src.structures.model_schemas import VoiceConfig 14 | 15 | 16 | @pytest.fixture 17 | def mock_voice_tensor(): 18 | """Load a real voice tensor for testing.""" 19 | voice_path = os.path.join( 20 | os.path.dirname(os.path.dirname(__file__)), "src/voices/af_bella.pt" 21 | ) 22 | return torch.load(voice_path, map_location="cpu", weights_only=False) 23 | 24 | 25 | @pytest.fixture 26 | def mock_audio_output(): 27 | """Load pre-generated test audio for consistent testing.""" 28 | test_audio_path = os.path.join( 29 | os.path.dirname(__file__), "test_data/test_audio.npy" 30 | ) 31 | return np.load(test_audio_path) # Return as numpy array instead of bytes 32 | 33 | 34 | @pytest_asyncio.fixture 35 | async def mock_model_manager(mock_audio_output): 36 | """Mock model manager for testing.""" 37 | manager = AsyncMock(spec=ModelManager) 38 | manager.get_backend = MagicMock() 39 | 40 | async def mock_generate(*args, **kwargs): 41 | # Simulate successful audio generation 42 | return np.random.rand(24000).astype(np.float32) # 1 second of random audio data 43 | 44 | manager.generate = AsyncMock(side_effect=mock_generate) 45 | return manager 46 | 47 | 48 | @pytest_asyncio.fixture 49 | async def mock_voice_manager(mock_voice_tensor): 50 | """Mock voice manager for testing.""" 51 | manager = AsyncMock(spec=VoiceManager) 52 | manager.get_voice_path = MagicMock(return_value="/mock/path/voice.pt") 53 | manager.load_voice = AsyncMock(return_value=mock_voice_tensor) 54 | manager.list_voices = AsyncMock(return_value=["voice1", "voice2"]) 55 | manager.combine_voices = AsyncMock(return_value="voice1_voice2") 56 | return manager 57 | 58 | 59 | @pytest_asyncio.fixture 
60 | async def tts_service(mock_model_manager, mock_voice_manager): 61 | """Get mocked TTS service instance.""" 62 | service = TTSService() 63 | service.model_manager = mock_model_manager 64 | service._voice_manager = mock_voice_manager 65 | return service 66 | 67 | 68 | @pytest.fixture 69 | def test_voice(): 70 | """Return a test voice name.""" 71 | return "voice1" 72 | -------------------------------------------------------------------------------- /api/tests/test_data/generate_test_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | 6 | def generate_test_audio(): 7 | """Generate test audio data - 1 second of 440Hz tone""" 8 | # Create 1 second of silence at 24kHz 9 | audio = np.zeros(24000, dtype=np.float32) 10 | 11 | # Add a simple sine wave to make it non-zero 12 | t = np.linspace(0, 1, 24000) 13 | audio += 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz tone at half amplitude 14 | 15 | # Create test_data directory if it doesn't exist 16 | os.makedirs("api/tests/test_data", exist_ok=True) 17 | 18 | # Save the test audio 19 | np.save("api/tests/test_data/test_audio.npy", audio) 20 | 21 | 22 | if __name__ == "__main__": 23 | generate_test_audio() 24 | -------------------------------------------------------------------------------- /api/tests/test_data/test_audio.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/tests/test_data/test_audio.npy -------------------------------------------------------------------------------- /api/tests/test_development.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from unittest.mock import MagicMock, patch 4 | 5 | import pytest 6 | import requests 7 | 8 | 9 | def test_generate_captioned_speech(): 10 | """Test the generate_captioned_speech 
function with mocked responses""" 11 | # Mock the API responses 12 | mock_audio_response = MagicMock() 13 | mock_audio_response.status_code = 200 14 | 15 | mock_timestamps_response = MagicMock() 16 | mock_timestamps_response.status_code = 200 17 | mock_timestamps_response.content = json.dumps( 18 | { 19 | "audio": base64.b64encode(b"mock audio data").decode("utf-8"), 20 | "timestamps": [{"word": "test", "start_time": 0.0, "end_time": 1.0}], 21 | } 22 | ) 23 | 24 | # Patch the HTTP requests 25 | with patch("requests.post", return_value=mock_timestamps_response): 26 | # Import here to avoid module-level import issues 27 | from examples.captioned_speech_example import generate_captioned_speech 28 | 29 | # Test the function 30 | audio, timestamps = generate_captioned_speech("test text") 31 | 32 | # Verify we got both audio and timestamps 33 | assert audio == b"mock audio data" 34 | assert timestamps == [{"word": "test", "start_time": 0.0, "end_time": 1.0}] 35 | -------------------------------------------------------------------------------- /assets/cpu_first_token_timeline_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/cpu_first_token_timeline_stream_openai.png -------------------------------------------------------------------------------- /assets/docs-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/docs-screenshot.png -------------------------------------------------------------------------------- /assets/format_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/format_comparison.png 
-------------------------------------------------------------------------------- /assets/gpu_first_token_latency_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_latency_direct.png -------------------------------------------------------------------------------- /assets/gpu_first_token_latency_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_latency_openai.png -------------------------------------------------------------------------------- /assets/gpu_first_token_timeline_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_timeline_direct.png -------------------------------------------------------------------------------- /assets/gpu_first_token_timeline_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_timeline_openai.png -------------------------------------------------------------------------------- /assets/gpu_processing_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_processing_time.png -------------------------------------------------------------------------------- /assets/gpu_realtime_factor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_realtime_factor.png -------------------------------------------------------------------------------- /assets/gpu_total_time_latency_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_total_time_latency_direct.png -------------------------------------------------------------------------------- /assets/gpu_total_time_latency_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_total_time_latency_openai.png -------------------------------------------------------------------------------- /assets/voice_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/voice_analysis.png -------------------------------------------------------------------------------- /assets/webui-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/webui-screenshot.png -------------------------------------------------------------------------------- /charts/kokoro-fastapi/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: kokoro-fastapi 3 | description: A Helm chart for deploying the Kokoro FastAPI TTS service to Kubernetes 4 | type: application 5 | version: 0.3.0 6 | appVersion: "0.3.0" 7 | 8 | keywords: 9 | - tts 10 | - fastapi 11 | - gpu 12 | - kokoro 13 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/examples/aks-tls-values.yaml: -------------------------------------------------------------------------------- 1 | # Tested on 2 | # - Azure AKS with GPU node pool with Nvidia GPU operator 3 | # - This setup uses 1 ingress and load balances between 2 replicas, enabling simultaneous requests 4 | # 5 | # Azure CLI command to create a GPU node pool: 6 | # az aks nodepool add \ 7 | # --resource-group $AZ_RESOURCE_GROUP \ 8 | # --cluster-name $CLUSTER_NAME \ 9 | # --name t4gpus \ 10 | # --node-vm-size Standard_NC4as_T4_v3 \ 11 | # --node-count 2 \ 12 | # --enable-cluster-autoscaler \ 13 | # --min-count 1 \ 14 | # --max-count 2 \ 15 | # --priority Spot \ 16 | # --eviction-policy Delete \ 17 | # --spot-max-price -1 \ 18 | # --node-taints "sku=gpu:NoSchedule,kubernetes.azure.com/scalesetpriority=spot:NoSchedule" \ 19 | # --skip-gpu-driver-install 20 | 21 | kokoroTTS: 22 | replicaCount: 8 23 | port: 8880 24 | tag: v0.2.0 25 | pullPolicy: IfNotPresent 26 | 27 | # Azure specific settings for spot t4 GPU nodes with Nvidia GPU operator 28 | tolerations: 29 | - key: "kubernetes.azure.com/scalesetpriority" 30 | operator: Equal 31 | value: 
"spot" 32 | effect: NoSchedule 33 | - key: "sku" 34 | operator: Equal 35 | value: "gpu" 36 | effect: NoSchedule 37 | 38 | ingress: 39 | enabled: true 40 | className: "nginx" 41 | annotations: 42 | # Requires cert-manager and external-dns to be in the cluster for TLS and DNS 43 | cert-manager.io/cluster-issuer: letsencrypt-prod 44 | external-dns.alpha.kubernetes.io/hostname: your-external-dns-enabled-hostname 45 | external-dns.alpha.kubernetes.io/cloudflare-proxied: "false" 46 | hosts: 47 | - host: your-external-dns-enabled-hostname 48 | paths: 49 | - path: / 50 | pathType: Prefix 51 | tls: 52 | - secretName: kokoro-fastapi-tls 53 | hosts: 54 | - your-external-dns-enabled-hostname -------------------------------------------------------------------------------- /charts/kokoro-fastapi/examples/gpu-operator-values.yaml: -------------------------------------------------------------------------------- 1 | # Follow the official NVIDIA GPU Operator documentation 2 | # to install the GPU operator with these settings: 3 | # https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html 4 | # 5 | # This example is for a Nvidia T4 16gb GPU node pool with only 1 GPU on each node on Azure AKS. 6 | # It uses time-slicing to share the a and claim to the system that 1 GPU is 4 GPUs. 7 | # So each pod has access to a smaller gpu with 4gb of memory. 
8 | # 9 | devicePlugin: # Remove this if you dont want to use time-slicing 10 | config: 11 | create: true 12 | name: "time-slicing-config" 13 | default: "any" 14 | data: 15 | any: |- 16 | version: v1 17 | flags: 18 | migStrategy: none 19 | sharing: 20 | timeSlicing: 21 | resources: 22 | - name: nvidia.com/gpu 23 | replicas: 4 24 | 25 | daemonsets: 26 | tolerations: 27 | - key: "sku" 28 | operator: Equal 29 | value: "gpu" 30 | effect: NoSchedule 31 | - key: "kubernetes.azure.com/scalesetpriority" 32 | operator: Equal 33 | value: "spot" 34 | effect: NoSchedule 35 | 36 | node-feature-discovery: 37 | master: 38 | tolerations: 39 | - key: "sku" 40 | operator: Equal 41 | value: "gpu" 42 | effect: NoSchedule 43 | - key: "kubernetes.azure.com/scalesetpriority" 44 | operator: Equal 45 | value: "spot" 46 | effect: NoSchedule 47 | worker: 48 | tolerations: 49 | - key: "sku" 50 | operator: Equal 51 | value: "gpu" 52 | effect: NoSchedule 53 | - key: "kubernetes.azure.com/scalesetpriority" 54 | operator: Equal 55 | value: "spot" 56 | effect: NoSchedule -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if .Values.ingress.enabled }} 3 | {{- range $host := .Values.ingress.hosts }} 4 | {{- range .paths }} 5 | http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} 6 | {{- end }} 7 | {{- end }} 8 | {{- else if contains "NodePort" .Values.service.type }} 9 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kokoro-fastapi.fullname" . 
}}) 10 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 11 | echo http://$NODE_IP:$NODE_PORT 12 | {{- else if contains "LoadBalancer" .Values.service.type }} 13 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 14 | You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kokoro-fastapi.fullname" . }}' 15 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kokoro-fastapi.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 16 | echo http://$SERVICE_IP:{{ .Values.kokoroTTS.port }} 17 | {{- else if contains "ClusterIP" .Values.service.type }} 18 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kokoro-fastapi.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 19 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") 20 | echo "Visit http://127.0.0.1:8880 to use your application" 21 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8880:$CONTAINER_PORT 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "kokoro-fastapi.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}} 13 | {{- define "kokoro-fastapi.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "kokoro-fastapi.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "kokoro-fastapi.labels" -}} 37 | helm.sh/chart: {{ include "kokoro-fastapi.chart" . }} 38 | {{ include "kokoro-fastapi.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "kokoro-fastapi.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "kokoro-fastapi.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "kokoro-fastapi.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "kokoro-fastapi.fullname" .) 
.Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/hpa.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.autoscaling.enabled }} 2 | apiVersion: autoscaling/v2beta1 3 | kind: HorizontalPodAutoscaler 4 | metadata: 5 | name: {{ include "kokoro-fastapi.fullname" . }} 6 | labels: 7 | {{- include "kokoro-fastapi.labels" . | nindent 4 }} 8 | spec: 9 | scaleTargetRef: 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | name: {{ include "kokoro-fastapi.fullname" . }} 13 | minReplicas: {{ .Values.autoscaling.minReplicas }} 14 | maxReplicas: {{ .Values.autoscaling.maxReplicas }} 15 | metrics: 16 | {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} 17 | - type: Resource 18 | resource: 19 | name: cpu 20 | targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} 21 | {{- end }} 22 | {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} 23 | - type: Resource 24 | resource: 25 | name: memory 26 | targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} 27 | {{- end }} 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | apiVersion: networking.k8s.io/v1 3 | kind: Ingress 4 | metadata: 5 | name: {{ include "kokoro-fastapi.fullname" . }} 6 | labels: 7 | {{- include "kokoro-fastapi.labels" . | nindent 4 }} 8 | {{- with .Values.ingress.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | {{- with .Values.ingress.className }} 14 | ingressClassName: {{ . 
}}
  {{- end }}
  {{- if .Values.ingress.tls }}
  tls:
    {{- range .Values.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    {{- range .Values.ingress.hosts }}
    - host: {{ .host | quote }}
      http:
        paths:
          {{- range .paths }}
          - path: {{ .path }}
            {{- with .pathType }}
            pathType: {{ . }}
            {{- end }}
            backend:
              service:
                # Routes to the chart's Service, which carries the
                # "-kokoro-tts-service" suffix (see kokoro-tts-service.yaml).
                name: {{ include "kokoro-fastapi.fullname" $ }}-kokoro-tts-service
                port:
                  number: {{ $.Values.kokoroTTS.port }}
          {{- end }}
    {{- end }}
{{- end }}
-------- charts/kokoro-fastapi/templates/kokoro-tts-deployment.yaml --------
apiVersion: apps/v1
kind: Deployment
metadata:
  # NOTE: the Deployment name carries a "-kokoro-tts" suffix; anything that
  # targets it by name (e.g. an HPA scaleTargetRef) must include the suffix.
  name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts
  labels:
    {{- include "kokoro-fastapi.labels" . | nindent 4 }}
spec:
  # Replica count is managed by the HPA when autoscaling is enabled, so it
  # is only rendered here for the static case.
  {{- if not .Values.autoscaling.enabled }}
  replicas: {{ .Values.kokoroTTS.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "kokoro-fastapi.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "kokoro-fastapi.selectorLabels" . | nindent 8 }}
    spec:
      {{- with .Values.kokoroTTS.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "kokoro-fastapi.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      # Intentionally empty; kept as an explicit extension point.
      initContainers: []
      containers:
        - name: kokoro-tts
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ .Values.kokoroTTS.repository }}:{{ .Values.kokoroTTS.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.kokoroTTS.pullPolicy }}
          env:
            - name: PYTHONPATH
              value: "/app:/app/api"
            # NOTE(review): USE_GPU is hard-coded "true" even though the image
            # repository is configurable — confirm intent for CPU deployments.
            - name: USE_GPU
              value: "true"
            - name: PYTHONUNBUFFERED
              value: "1"
          ports:
            - name: kokoro-tts-http
              containerPort: {{ .Values.kokoroTTS.port | default 8880 }}
              protocol: TCP
          # Both probes hit the same /health endpoint by named port.
          livenessProbe:
            httpGet:
              path: /health
              port: kokoro-tts-http
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /health
              port: kokoro-tts-http
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
          resources:
            {{- toYaml .Values.kokoroTTS.resources | nindent 12 }}
          # Intentionally empty; kept as explicit extension points.
          volumeMounts: []
      volumes: []
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
-------- charts/kokoro-fastapi/templates/kokoro-tts-service.yaml --------
apiVersion: v1
kind: Service
metadata:
  name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service
  labels:
    {{- include "kokoro-fastapi.labels" .
| nindent 4 }}
spec:
  type: {{ .Values.service.type }}
  ports:
    - port: {{ .Values.kokoroTTS.port }}
      targetPort: kokoro-tts-http
      protocol: TCP
      name: kokoro-tts-http
  selector:
    {{- include "kokoro-fastapi.selectorLabels" . | nindent 4 }}
-------- charts/kokoro-fastapi/templates/serviceaccount.yaml --------
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "kokoro-fastapi.serviceAccountName" . }}
  labels:
    {{- include "kokoro-fastapi.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
{{- end }}
-------- charts/kokoro-fastapi/templates/tests/test-connection.yaml --------
apiVersion: v1
kind: Pod
metadata:
  name: "{{ include "kokoro-fastapi.fullname" . }}-test-connection"
  labels:
    {{- include "kokoro-fastapi.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": test
spec:
  containers:
    - name: wget
      image: busybox
      command: ['wget']
      # Fix: target the Service this chart actually creates. The Service is
      # named "<fullname>-kokoro-tts-service" (kokoro-tts-service.yaml), so
      # wget'ing plain "<fullname>" never resolved and `helm test` always
      # failed.
      args: ['{{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service:{{ .Values.kokoroTTS.port }}']
  restartPolicy: Never
-------- charts/kokoro-fastapi/values.yaml --------
# Default values for kokoro-fastapi.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
4 | kokoroTTS: 5 | replicaCount: 1 6 | # The name of the deployment repository 7 | repository: "ghcr.io/remsky/kokoro-fastapi-gpu" 8 | imagePullSecrets: [] # Set if using a private image or getting rate limited 9 | tag: "latest" 10 | pullPolicy: Always 11 | port: 8880 12 | resources: 13 | limits: 14 | nvidia.com/gpu: 1 15 | requests: 16 | nvidia.com/gpu: 1 17 | 18 | nameOverride: "" 19 | fullnameOverride: "" 20 | 21 | serviceAccount: 22 | # Specifies whether a service account should be created 23 | create: true 24 | # Annotations to add to the service account 25 | annotations: {} 26 | # The name of the service account to use. 27 | # If not set and create is true, a name is generated using the fullname template 28 | name: "" 29 | 30 | podAnnotations: {} 31 | 32 | podSecurityContext: {} 33 | # fsGroup: 2000 34 | 35 | securityContext: {} 36 | # capabilities: 37 | # drop: 38 | # - ALL 39 | # readOnlyRootFilesystem: true 40 | # runAsNonRoot: true 41 | # runAsUser: 1000 42 | 43 | service: 44 | type: ClusterIP 45 | 46 | ingress: 47 | enabled: false 48 | className: "nginx" 49 | annotations: {} 50 | # cert-manager.io/cluster-issuer: letsencrypt-prod 51 | # external-dns.alpha.kubernetes.io/hostname: kokoro.example.com 52 | # external-dns.alpha.kubernetes.io/cloudflare-proxied: "false" 53 | hosts: 54 | - host: kokoro.example.com 55 | paths: 56 | - path: / 57 | pathType: Prefix 58 | 59 | tls: [] 60 | # - secretName: kokoro-fastapi-tls 61 | # hosts: 62 | # - kokoro.example.com 63 | 64 | autoscaling: 65 | enabled: false 66 | minReplicas: 1 67 | maxReplicas: 100 68 | targetCPUUtilizationPercentage: 80 69 | # targetMemoryUtilizationPercentage: 80 70 | 71 | nodeSelector: {} 72 | 73 | tolerations: [] 74 | 75 | affinity: {} 76 | -------------------------------------------------------------------------------- /debug.http: -------------------------------------------------------------------------------- 1 | ### Get Thread Information 2 | GET http://localhost:8880/debug/threads 3 | 
Accept: application/json 4 | 5 | ### Get Storage Information 6 | GET http://localhost:8880/debug/storage 7 | Accept: application/json 8 | 9 | ### Get System Information 10 | GET http://localhost:8880/debug/system 11 | Accept: application/json 12 | 13 | ### Get Session Pool Status 14 | # Shows active ONNX sessions, CUDA stream usage, and session ages 15 | # Useful for debugging resource exhaustion issues 16 | GET http://localhost:8880/debug/session_pools 17 | Accept: application/json 18 | 19 | ### List Available Models 20 | # Returns list of all available models in OpenAI format 21 | # Response includes tts-1, tts-1-hd, and kokoro models 22 | GET http://localhost:8880/v1/models 23 | Accept: application/json 24 | 25 | ### Get Specific Model 26 | # Returns same model list as above for compatibility 27 | # Works with any model name (e.g., tts-1, tts-1-hd, kokoro) 28 | GET http://localhost:8880/v1/models/tts-1 29 | Accept: application/json -------------------------------------------------------------------------------- /dev/Test Phon.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | 4 | import pydub 5 | import requests 6 | 7 | def generate_audio_from_phonemes(phonemes: str, voice: str = "af_bella"): 8 | """Generate audio from phonemes""" 9 | response = requests.post( 10 | "http://localhost:8880/dev/generate_from_phonemes", 11 | json={"phonemes": phonemes, "voice": voice}, 12 | headers={"Accept": "audio/wav"} 13 | ) 14 | if response.status_code != 200: 15 | print(f"Error: {response.text}") 16 | return None 17 | return response.content 18 | 19 | 20 | 21 | 22 | with open(f"outputnostreammoney.wav", "wb") as f: 23 | f.write(generate_audio_from_phonemes(r"mɪsəki ɪz ɐn ɪkspˌɛɹəmˈɛntᵊl ʤˈitəpˈi ˈɛnʤən dəzˈInd tə pˈWəɹ fjˈuʧəɹ vˈɜɹʒənz ʌv kəkˈɔɹO mˈɑdᵊlz.")) -------------------------------------------------------------------------------- /dev/Test copy 2.py: 
-------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | 4 | import pydub 5 | import requests 6 | 7 | text = """Running on localhost:7860""" 8 | 9 | 10 | Type = "wav" 11 | response = requests.post( 12 | "http://localhost:8880/dev/captioned_speech", 13 | json={ 14 | "model": "kokoro", 15 | "input": text, 16 | "voice": "af_heart+af_sky", 17 | "speed": 1.0, 18 | "response_format": Type, 19 | "stream": True, 20 | }, 21 | stream=True, 22 | ) 23 | 24 | f = open(f"outputstream.{Type}", "wb") 25 | for chunk in response.iter_lines(decode_unicode=True): 26 | if chunk: 27 | temp_json = json.loads(chunk) 28 | if temp_json["timestamps"] != []: 29 | chunk_json = temp_json 30 | 31 | # Decode base 64 stream to bytes 32 | chunk_audio = base64.b64decode(temp_json["audio"].encode("utf-8")) 33 | 34 | # Process streaming chunks 35 | f.write(chunk_audio) 36 | 37 | # Print word level timestamps 38 | print(chunk_json["timestamps"]) 39 | -------------------------------------------------------------------------------- /dev/Test money.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | 4 | import requests 5 | 6 | text = """奶酪芝士很浓郁!臭豆腐芝士有争议?陈年奶酪价格昂贵。""" 7 | 8 | 9 | Type = "wav" 10 | 11 | response = requests.post( 12 | "http://localhost:8880/v1/audio/speech", 13 | json={ 14 | "model": "kokoro", 15 | "input": text, 16 | "voice": "zf_xiaobei", 17 | "speed": 1.0, 18 | "response_format": Type, 19 | "stream": False, 20 | }, 21 | stream=True, 22 | ) 23 | 24 | with open(f"outputnostreammoney.{Type}", "wb") as f: 25 | f.write(response.content) 26 | -------------------------------------------------------------------------------- /dev/Test num.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | from text_to_num import text2num 5 | from torch import mul 6 | 7 | INFLECT_ENGINE = inflect.engine() 8 | 9 
| 10 | def conditional_int(number: float, threshold: float = 0.00001): 11 | if abs(round(number) - number) < threshold: 12 | return int(round(number)) 13 | return number 14 | 15 | 16 | def handle_money(m: re.Match[str]) -> str: 17 | """Convert money expressions to spoken form""" 18 | 19 | bill = "dollar" if m.group(2) == "$" else "pound" 20 | coin = "cent" if m.group(2) == "$" else "pence" 21 | number = m.group(3) 22 | 23 | multiplier = m.group(4) 24 | try: 25 | number = float(number) 26 | except: 27 | return m.group() 28 | 29 | if m.group(1) == "-": 30 | number *= -1 31 | 32 | if number % 1 == 0 or multiplier != "": 33 | text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}" 34 | else: 35 | sub_number = int(str(number).split(".")[-1].ljust(2, "0")) 36 | 37 | text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}" 38 | 39 | return text_number 40 | 41 | 42 | text = re.sub( 43 | r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b", 44 | handle_money, 45 | "he administration has offered up a platter of repression for more than a year and is still slated to lose -$5.3 billion", 46 | ) 47 | print(text) 48 | -------------------------------------------------------------------------------- /docker-bake.hcl: -------------------------------------------------------------------------------- 1 | # Variables for reuse 2 | variable "VERSION" { 3 | default = "latest" 4 | } 5 | 6 | variable "REGISTRY" { 7 | default = "ghcr.io" 8 | } 9 | 10 | variable "OWNER" { 11 | default = "remsky" 12 | } 13 | 14 | variable "REPO" { 15 | default = "kokoro-fastapi" 16 | } 17 | 18 | variable "DOWNLOAD_MODEL" { 19 | default = "true" 20 | } 21 | 22 | # Common settings shared between targets 23 | target "_common" { 24 | context = "." 
  args = {
    DEBIAN_FRONTEND = "noninteractive"
    # NOTE(review): this build arg only takes effect if the Dockerfiles
    # declare `ARG DOWNLOAD_MODEL` — confirm both docker/*/Dockerfile do.
    DOWNLOAD_MODEL = "${DOWNLOAD_MODEL}"
  }
}

# Base settings for CPU builds
target "_cpu_base" {
  inherits = ["_common"]
  dockerfile = "docker/cpu/Dockerfile"
}

# Base settings for GPU builds
target "_gpu_base" {
  inherits = ["_common"]
  dockerfile = "docker/gpu/Dockerfile"
}

# CPU target with multi-platform support
target "cpu" {
  inherits = ["_cpu_base"]
  platforms = ["linux/amd64", "linux/arm64"]
  tags = [
    "${REGISTRY}/${OWNER}/${REPO}-cpu:${VERSION}",
    "${REGISTRY}/${OWNER}/${REPO}-cpu:latest"
  ]
}

# GPU target with multi-platform support
target "gpu" {
  inherits = ["_gpu_base"]
  platforms = ["linux/amd64", "linux/arm64"]
  tags = [
    "${REGISTRY}/${OWNER}/${REPO}-gpu:${VERSION}",
    "${REGISTRY}/${OWNER}/${REPO}-gpu:latest"
  ]
}

# Default group to build both CPU and GPU versions
group "default" {
  targets = ["cpu", "gpu"]
}

# Development targets for faster local builds
target "cpu-dev" {
  inherits = ["_cpu_base"]
  # No multi-platform for dev builds
  tags = ["${REGISTRY}/${OWNER}/${REPO}-cpu:dev"]
}

target "gpu-dev" {
  inherits = ["_gpu_base"]
  # No multi-platform for dev builds
  tags = ["${REGISTRY}/${OWNER}/${REPO}-gpu:dev"]
}

group "dev" {
  targets = ["cpu-dev", "gpu-dev"]
}
-------- docker/build.sh --------
#!/bin/bash
set -e

# Get version from argument or use default
VERSION=${1:-"latest"}

# Build both CPU and GPU images using docker buildx bake
echo "Building CPU and GPU images..."
VERSION=$VERSION docker buildx bake --push

echo "Build complete!"
echo "Created images with version: $VERSION"
-------- docker/cpu/.dockerignore --------
# Version control
.git

# Python
__pycache__
*.pyc
*.pyo
*.pyd
.Python
*.py[cod]
*$py.class
.pytest_cache
.coverage
.coveragerc

# Environment
# .env
.venv
env/
venv/
ENV/

# IDE
.idea
.vscode
*.swp
*.swo

# Project specific
examples/
Kokoro-82M/
ui/
tests/
*.md
*.txt
!requirements.txt

# Docker
Dockerfile*
docker-compose*
-------- docker/cpu/Dockerfile --------
FROM python:3.10-slim

# System deps: espeak-ng for phonemization, libsndfile/ffmpeg for audio,
# git/g++ for building native Python packages.
RUN apt-get update && apt-get install -y \
    espeak-ng \
    espeak-ng-data \
    git \
    libsndfile1 \
    curl \
    ffmpeg \
    g++ \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/espeak-ng-data \
    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/ && \
    mv /root/.local/bin/uvx /usr/local/bin/

# Create non-root user and set up directories and permissions
RUN useradd -m -u 1000 appuser && \
    mkdir -p /app/api/src/models/v1_0 && \
    chown -R appuser:appuser /app

USER appuser
WORKDIR /app

# Copy dependency files
COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

# Install Rust (required to build sudachipy and pyopenjtalk-plus)
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/home/appuser/.cargo/bin:$PATH"

# Install dependencies.
# Fix: the cache mount must live under appuser's HOME and be owned by uid
# 1000 — the previous /root/.cache/uv target was never used because this
# stage runs as appuser, so every build re-downloaded all wheels.
RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1000,gid=1000 \
    uv venv --python 3.10 && \
    uv sync --extra cpu

# Copy project files including models
COPY --chown=appuser:appuser api ./api
COPY --chown=appuser:appuser web ./web
COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app:/app/api \
    PATH="/app/.venv/bin:$PATH" \
    UV_LINK_MODE=copy \
    USE_GPU=false \
    PHONEMIZER_ESPEAK_PATH=/usr/bin \
    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

# Fix: honour the DOWNLOAD_MODEL build arg declared in docker-bake.hcl; the
# previous hard-coded `ENV DOWNLOAD_MODEL=true` silently ignored it. The ENV
# re-export keeps the value visible to entrypoint.sh at runtime, as before.
ARG DOWNLOAD_MODEL=true
ENV DOWNLOAD_MODEL=${DOWNLOAD_MODEL}
# Download model if enabled
RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
    python download_model.py --output api/src/models/v1_0; \
    fi

ENV DEVICE="cpu"
# Run FastAPI server through entrypoint.sh
CMD ["./entrypoint.sh"]
-------- docker/cpu/docker-compose.yml --------
name: kokoro-fastapi-cpu
services:
  kokoro-tts:
    build:
      context: ../..
6 | dockerfile: docker/cpu/Dockerfile 7 | volumes: 8 | - ../../api:/app/api 9 | ports: 10 | - "8880:8880" 11 | environment: 12 | - PYTHONPATH=/app:/app/api 13 | # ONNX Optimization Settings for vectorized operations 14 | - ONNX_NUM_THREADS=8 # Maximize core usage for vectorized ops 15 | - ONNX_INTER_OP_THREADS=4 # Higher inter-op for parallel matrix operations 16 | - ONNX_EXECUTION_MODE=parallel 17 | - ONNX_OPTIMIZATION_LEVEL=all 18 | - ONNX_MEMORY_PATTERN=true 19 | - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo 20 | 21 | # # Gradio UI service [Comment out everything below if you don't need it] 22 | # gradio-ui: 23 | # image: ghcr.io/remsky/kokoro-fastapi-ui:v${VERSION} 24 | # # Uncomment below (and comment out above) to build from source instead of using the released image 25 | # build: 26 | # context: ../../ui 27 | # ports: 28 | # - "7860:7860" 29 | # volumes: 30 | # - ../../ui/data:/app/ui/data 31 | # - ../../ui/app.py:/app/app.py # Mount app.py for hot reload 32 | # environment: 33 | # - GRADIO_WATCH=True # Enable hot reloading 34 | # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered 35 | # - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view 36 | # - API_HOST=kokoro-tts # Set TTS service URL 37 | # - API_PORT=8880 # Set TTS service PORT 38 | -------------------------------------------------------------------------------- /docker/gpu/.dockerignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | 4 | # Python 5 | __pycache__ 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | .Python 10 | *.py[cod] 11 | *$py.class 12 | .pytest_cache 13 | .coverage 14 | .coveragerc 15 | 16 | # Environment 17 | # .env 18 | .venv* 19 | env/ 20 | venv/ 21 | ENV/ 22 | 23 | # IDE 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | 29 | # Project specific 30 | examples/ 31 | Kokoro-82M/ 32 | ui/ 33 | tests/ 34 | *.md 35 | *.txt 36 | !requirements.txt 37 | 38 | # Docker 39 | Dockerfile* 40 | 
docker-compose*
-------- docker/gpu/Dockerfile --------
# Fix: removed `--platform=$BUILDPLATFORM`. This is a *runtime* image built
# for linux/amd64 and linux/arm64 (see docker-bake.hcl); pinning FROM to the
# build host's platform produced wrong-architecture GPU images on cross
# builds instead of letting buildx use TARGETPLATFORM.
FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
# Set non-interactive frontend
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
# NOTE(review): Ubuntu 24.04 ships python3.12; the apt package "python3.10"
# may not be installable from the default repos — confirm base/python pairing.
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-venv \
    espeak-ng \
    espeak-ng-data \
    git \
    libsndfile1 \
    curl \
    ffmpeg \
    g++ \
    && apt-get clean && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/espeak-ng-data \
    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/ && \
    mv /root/.local/bin/uvx /usr/local/bin/

# Create non-root user and set up directories and permissions
RUN useradd -m -u 1001 appuser && \
    mkdir -p /app/api/src/models/v1_0 && \
    chown -R appuser:appuser /app

USER appuser
WORKDIR /app

# Copy dependency files
COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

# Install dependencies with GPU extras.
# Fix: cache mount placed under appuser's HOME and owned by uid 1001; the
# previous /root/.cache/uv target was never used because uv runs as appuser.
RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1001,gid=1001 \
    uv venv --python 3.10 && \
    uv sync --extra gpu

# Copy project files including models
COPY --chown=appuser:appuser api ./api
COPY --chown=appuser:appuser web ./web
COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh


# Set all environment variables in one go
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app:/app/api \
    PATH="/app/.venv/bin:$PATH" \
    UV_LINK_MODE=copy \
    USE_GPU=true

# Fix: honour the DOWNLOAD_MODEL build arg from docker-bake.hcl (previously a
# hard-coded ENV that ignored the arg); ENV keeps it visible at runtime.
ARG DOWNLOAD_MODEL=true
ENV DOWNLOAD_MODEL=${DOWNLOAD_MODEL}
# Download model if enabled
RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
    python download_model.py --output api/src/models/v1_0; \
    fi

ENV DEVICE="gpu"
# Run FastAPI server through entrypoint.sh
CMD ["./entrypoint.sh"]
-------- docker/gpu/docker-compose.yml --------
name: kokoro-tts-gpu
services:
  kokoro-tts:
    # image: ghcr.io/remsky/kokoro-fastapi-gpu:v${VERSION}
    build:
      context: ../..
      dockerfile: docker/gpu/Dockerfile
    volumes:
      - ../../api:/app/api
    user: "1001:1001" # Ensure container runs as UID 1001 (appuser)
    ports:
      - "8880:8880"
    environment:
      - PYTHONPATH=/app:/app/api
      - USE_GPU=true
      - PYTHONUNBUFFERED=1
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
-------- docker/scripts/download_model.py: --------
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Download and prepare Kokoro v1.0 model.""" 3 | 4 | import json 5 | import os 6 | from pathlib import Path 7 | from urllib.request import urlretrieve 8 | 9 | from loguru import logger 10 | 11 | 12 | def verify_files(model_path: str, config_path: str) -> bool: 13 | """Verify that model files exist and are valid. 14 | 15 | Args: 16 | model_path: Path to model file 17 | config_path: Path to config file 18 | 19 | Returns: 20 | True if files exist and are valid 21 | """ 22 | try: 23 | # Check files exist 24 | if not os.path.exists(model_path): 25 | return False 26 | if not os.path.exists(config_path): 27 | return False 28 | 29 | # Verify config file is valid JSON 30 | with open(config_path) as f: 31 | config = json.load(f) 32 | 33 | # Check model file size (should be non-zero) 34 | if os.path.getsize(model_path) == 0: 35 | return False 36 | 37 | return True 38 | except Exception: 39 | return False 40 | 41 | 42 | def download_model(output_dir: str) -> None: 43 | """Download model files from GitHub release. 
44 | 45 | Args: 46 | output_dir: Directory to save model files 47 | """ 48 | try: 49 | # Create output directory 50 | os.makedirs(output_dir, exist_ok=True) 51 | 52 | # Define file paths 53 | model_file = "kokoro-v1_0.pth" 54 | config_file = "config.json" 55 | model_path = os.path.join(output_dir, model_file) 56 | config_path = os.path.join(output_dir, config_file) 57 | 58 | # Check if files already exist and are valid 59 | if verify_files(model_path, config_path): 60 | logger.info("Model files already exist and are valid") 61 | return 62 | 63 | logger.info("Downloading Kokoro v1.0 model files") 64 | 65 | # GitHub release URLs (to be updated with v0.2.0 release) 66 | base_url = "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.4" 67 | model_url = f"{base_url}/{model_file}" 68 | config_url = f"{base_url}/{config_file}" 69 | 70 | # Download files 71 | logger.info("Downloading model file...") 72 | urlretrieve(model_url, model_path) 73 | 74 | logger.info("Downloading config file...") 75 | urlretrieve(config_url, config_path) 76 | 77 | # Verify downloaded files 78 | if not verify_files(model_path, config_path): 79 | raise RuntimeError("Failed to verify downloaded files") 80 | 81 | logger.info(f"✓ Model files prepared in {output_dir}") 82 | 83 | except Exception as e: 84 | logger.error(f"Failed to download model: {e}") 85 | raise 86 | 87 | 88 | def main(): 89 | """Main entry point.""" 90 | import argparse 91 | 92 | parser = argparse.ArgumentParser(description="Download Kokoro v1.0 model") 93 | parser.add_argument( 94 | "--output", required=True, help="Output directory for model files" 95 | ) 96 | 97 | args = parser.parse_args() 98 | download_model(args.output) 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /docker/scripts/download_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Find project 
root by looking for api directory
find_project_root() {
    local current_dir="$PWD"
    local max_steps=5
    local steps=0

    while [ $steps -lt $max_steps ]; do
        if [ -d "$current_dir/api" ]; then
            echo "$current_dir"
            return 0
        fi
        current_dir="$(dirname "$current_dir")"
        ((steps++))
    done

    echo "Error: Could not find project root (no api directory found)" >&2
    exit 1
}

# Function to verify files exist and are valid (requires jq for the JSON check)
verify_files() {
    local model_path="$1"
    local config_path="$2"

    # Check files exist
    if [ ! -f "$model_path" ] || [ ! -f "$config_path" ]; then
        return 1
    fi

    # Check files are not empty
    if [ ! -s "$model_path" ] || [ ! -s "$config_path" ]; then
        return 1
    fi

    # Try to parse config.json
    if ! jq . "$config_path" >/dev/null 2>&1; then
        return 1
    fi

    return 0
}

# Function to download a file
download_file() {
    local url="$1"
    local output_path="$2"
    local filename=$(basename "$output_path")

    echo "Downloading $filename..."
    mkdir -p "$(dirname "$output_path")"
    if curl -L "$url" -o "$output_path"; then
        echo "Successfully downloaded $filename"
        return 0
    else
        echo "Error downloading $filename" >&2
        return 1
    fi
}

# Find project root and ensure models directory exists
PROJECT_ROOT=$(find_project_root)
if [ $? -ne 0 ]; then
    exit 1
fi

MODEL_DIR="$PROJECT_ROOT/api/src/models/v1_0"
echo "Model directory: $MODEL_DIR"
mkdir -p "$MODEL_DIR"

# Define file paths
MODEL_FILE="kokoro-v1_0.pth"
CONFIG_FILE="config.json"
MODEL_PATH="$MODEL_DIR/$MODEL_FILE"
CONFIG_PATH="$MODEL_DIR/$CONFIG_FILE"

# Check if files already exist and are valid
if verify_files "$MODEL_PATH" "$CONFIG_PATH"; then
    echo "Model files already exist and are valid"
    exit 0
fi

# Define URLs.
# Fix: release tag was "v1.4", which disagrees with download_model.py
# (v0.1.4) and pointed at a non-existent release.
BASE_URL="https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.4"
MODEL_URL="$BASE_URL/$MODEL_FILE"
CONFIG_URL="$BASE_URL/$CONFIG_FILE"

# Download files
success=true

if ! download_file "$MODEL_URL" "$MODEL_PATH"; then
    success=false
fi

if ! download_file "$CONFIG_URL" "$CONFIG_PATH"; then
    success=false
fi

# Verify downloaded files
if [ "$success" = true ] && verify_files "$MODEL_PATH" "$CONFIG_PATH"; then
    echo "✓ Model files prepared in $MODEL_DIR"
    exit 0
else
    echo "Failed to download or verify model files" >&2
    exit 1
fi
-------- docker/scripts/entrypoint.sh --------
#!/bin/bash
set -e

if [ "$DOWNLOAD_MODEL" = "true" ]; then
    python download_model.py --output api/src/models/v1_0
fi

# Fix: quote $DEVICE so an unset/empty value fails loudly instead of being
# word-split away.
exec uv run --extra "$DEVICE" --no-sync python -m uvicorn api.src.main:app --host 0.0.0.0 --port 8880 --log-level debug
-------- docs/architecture/espeak_setup_fix.md --------
# ESpeak-NG Setup Fix

## Issue Description

Users are reporting two distinct errors:

1.
Missing espeak-ng-data/phontab file: 8 | ``` 9 | Error processing file '/home/runner/work/espeakng-loader/espeakng-loader/espeak-ng/_dynamic/share/espeak-ng-data/phontab': No such file or directory. 10 | ``` 11 | 12 | 2. Invalid pipeline state: 13 | ``` 14 | Error generating speech: The object is in an invalid state. 15 | ``` 16 | 17 | ## Root Cause Analysis 18 | 19 | ### 1. ESpeak-NG Data Issue 20 | 21 | The dependency chain has changed: 22 | ``` 23 | Before: 24 | kokoro-fastapi (phonemizer 3.3.0) -> kokoro -> misaki -> phonemizer 25 | 26 | After: 27 | kokoro-fastapi -> kokoro -> misaki -> phonemizer-fork + espeakng-loader 28 | ``` 29 | 30 | The issue arises because: 31 | 1. misaki now uses espeakng-loader to manage espeak paths 32 | 2. espeakng-loader looks for data in its package directory 33 | 3. We have a direct dependency on phonemizer 3.3.0 that conflicts 34 | 35 | ### 2. Pipeline State Issue 36 | The "invalid state" error occurs due to device mismatch in pipeline creation. 37 | 38 | ## Solution 39 | 40 | ### 1. For ESpeak-NG Data 41 | 42 | Update dependencies and environment: 43 | 44 | 1. Remove direct phonemizer dependency: 45 | ```diff 46 | - "phonemizer==3.3.0", # Remove this 47 | ``` 48 | 49 | 2. Let misaki handle phonemizer-fork and espeakng-loader 50 | 51 | 3. Set environment variable in Dockerfile: 52 | ```dockerfile 53 | ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \ 54 | PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \ 55 | ESPEAK_DATA_PATH=/usr/share/espeak-ng-data # Add this 56 | ``` 57 | 58 | This approach: 59 | - Works with misaki's new dependencies 60 | - Maintains our working espeak setup 61 | - Avoids complex file copying or path manipulation 62 | 63 | ### 2. For Pipeline State 64 | 65 | Use kokoro_v1's pipeline management: 66 | ```python 67 | # Instead of creating pipelines directly: 68 | # pipeline = KPipeline(...) 
69 | 70 | # Use backend's pipeline management: 71 | pipeline = backend._get_pipeline(pipeline_lang_code) 72 | ``` 73 | 74 | ## Implementation Steps 75 | 76 | 1. Update pyproject.toml: 77 | - Remove direct phonemizer dependency 78 | - Keep misaki dependency as is 79 | 80 | 2. Update Dockerfiles: 81 | - Add ESPEAK_DATA_PATH environment variable 82 | - Keep existing espeak-ng setup 83 | 84 | 3. Update tts_service.py: 85 | - Use backend's pipeline management 86 | - Add proper error handling 87 | 88 | ## Testing 89 | 90 | 1. Test espeak-ng functionality: 91 | ```bash 92 | # Verify environment variables 93 | echo $ESPEAK_DATA_PATH 94 | echo $PHONEMIZER_ESPEAK_DATA 95 | 96 | # Check data directory 97 | ls /usr/share/espeak-ng-data 98 | ``` 99 | 100 | 2. Test pipeline state: 101 | - Test on both CPU and GPU 102 | - Verify no invalid state errors 103 | - Test with different voice models 104 | 105 | ## Success Criteria 106 | 107 | 1. No espeak-ng-data/phontab file errors 108 | 2. No invalid state errors 109 | 3. Consistent behavior across platforms 110 | 4. Successful CI/CD pipeline runs 111 | 112 | ## Future Considerations 113 | 114 | 1. Potential PR to misaki: 115 | - Add fallback mechanism if espeakng-loader fails 116 | - Make path configuration more flexible 117 | - Add better error messages 118 | 119 | 2. 
Environment Variable Documentation: 120 | - Document ESPEAK_DATA_PATH requirement 121 | - Explain interaction with espeakng-loader 122 | - Provide platform-specific setup instructions 123 | 124 | ## Notes 125 | 126 | - This solution works with misaki's new dependencies while maintaining our setup 127 | - Environment variable approach is simpler than file copying 128 | - May want to contribute improvements back to misaki later -------------------------------------------------------------------------------- /docs/requirements.in: -------------------------------------------------------------------------------- 1 | # Primarily for reference, as Dockerfile refer 2 | # Core dependencies 3 | fastapi==0.115.6 4 | uvicorn==0.34.0 5 | pydantic==2.10.4 6 | pydantic-settings==2.7.0 7 | python-dotenv==1.0.1 8 | sqlalchemy==2.0.27 9 | 10 | # ML/DL 11 | transformers==4.47.1 12 | numpy>=1.26.0 # Version managed by PyTorch dependencies 13 | scipy==1.14.1 14 | onnxruntime==1.20.1 15 | 16 | # Audio processing 17 | soundfile==0.13.0 18 | 19 | # Text processing 20 | phonemizer==3.3.0 21 | regex==2024.11.6 22 | 23 | # Utilities 24 | aiofiles==23.2.1 # Last version before Windows path handling changes 25 | tqdm==4.67.1 26 | requests==2.32.3 27 | munch==4.0.0 28 | tiktoken===0.8.0 29 | loguru==0.7.3 30 | 31 | # Testing 32 | pytest==8.0.0 33 | httpx==0.26.0 34 | pytest-asyncio==0.23.5 35 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/lib/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_data/cpu_benchmark_results_rtf.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": [ 3 | { 4 | "tokens": 100, 5 | "processing_time": 0.98, 6 | "output_length": 28.975, 7 | "rtf": 0.03, 8 | "elapsed_time": 1.02255 9 | }, 10 | { 11 | "tokens": 200, 12 | "processing_time": 1.79, 13 | "output_length": 58.45, 14 | "rtf": 0.03, 15 | "elapsed_time": 2.84766 16 | }, 17 | { 18 | "tokens": 300, 19 | "processing_time": 2.1, 20 | "output_length": 86.75, 21 | "rtf": 0.02, 22 | "elapsed_time": 4.98201 23 | }, 24 | { 25 | "tokens": 400, 26 | "processing_time": 2.66, 27 | "output_length": 113.5, 28 | "rtf": 0.02, 29 | "elapsed_time": 7.67743 30 | }, 31 | { 32 | "tokens": 500, 33 | "processing_time": 3.13, 34 | "output_length": 140.225, 35 | "rtf": 0.02, 36 | "elapsed_time": 10.84279 37 | } 38 | ], 39 | "system_metrics": [ 40 | { 41 | "timestamp": "2025-01-30T05:03:26.422469", 42 | "cpu_percent": 0.0, 43 | "ram_percent": 18.5, 44 | "ram_used_gb": 
5.2551727294921875, 45 | "gpu_memory_used": 1988.0, 46 | "relative_time": 0.14498639106750488 47 | }, 48 | { 49 | "timestamp": "2025-01-30T05:03:27.568319", 50 | "cpu_percent": 13.42, 51 | "ram_percent": 18.6, 52 | "ram_used_gb": 5.267307281494141, 53 | "gpu_memory_used": 2025.0, 54 | "relative_time": 1.1970372200012207 55 | }, 56 | { 57 | "timestamp": "2025-01-30T05:03:28.620098", 58 | "cpu_percent": 12.89, 59 | "ram_percent": 18.6, 60 | "ram_used_gb": 5.267337799072266, 61 | "gpu_memory_used": 3071.0, 62 | "relative_time": 2.254074811935425 63 | }, 64 | { 65 | "timestamp": "2025-01-30T05:03:29.677030", 66 | "cpu_percent": 12.43, 67 | "ram_percent": 18.6, 68 | "ram_used_gb": 5.29168701171875, 69 | "gpu_memory_used": 2555.0, 70 | "relative_time": 3.306957244873047 71 | }, 72 | { 73 | "timestamp": "2025-01-30T05:03:30.729971", 74 | "cpu_percent": 12.47, 75 | "ram_percent": 18.6, 76 | "ram_used_gb": 5.292213439941406, 77 | "gpu_memory_used": 3345.0, 78 | "relative_time": 4.3373119831085205 79 | }, 80 | { 81 | "timestamp": "2025-01-30T05:03:31.760463", 82 | "cpu_percent": 13.71, 83 | "ram_percent": 18.7, 84 | "ram_used_gb": 5.30987548828125, 85 | "gpu_memory_used": 2549.0, 86 | "relative_time": 5.368744850158691 87 | }, 88 | { 89 | "timestamp": "2025-01-30T05:03:32.791904", 90 | "cpu_percent": 12.16, 91 | "ram_percent": 18.7, 92 | "ram_used_gb": 5.308803558349609, 93 | "gpu_memory_used": 3358.0, 94 | "relative_time": 6.418949842453003 95 | }, 96 | { 97 | "timestamp": "2025-01-30T05:03:33.842039", 98 | "cpu_percent": 11.5, 99 | "ram_percent": 18.7, 100 | "ram_used_gb": 5.309070587158203, 101 | "gpu_memory_used": 3349.0, 102 | "relative_time": 7.4437031745910645 103 | }, 104 | { 105 | "timestamp": "2025-01-30T05:03:34.866692", 106 | "cpu_percent": 15.38, 107 | "ram_percent": 18.7, 108 | "ram_used_gb": 5.2960205078125, 109 | "gpu_memory_used": 3034.0, 110 | "relative_time": 8.472418069839478 111 | }, 112 | { 113 | "timestamp": "2025-01-30T05:03:35.895656", 114 | 
"cpu_percent": 13.44, 115 | "ram_percent": 18.7, 116 | "ram_used_gb": 5.294971466064453, 117 | "gpu_memory_used": 3315.0, 118 | "relative_time": 9.498533248901367 119 | }, 120 | { 121 | "timestamp": "2025-01-30T05:03:36.921589", 122 | "cpu_percent": 12.64, 123 | "ram_percent": 18.7, 124 | "ram_used_gb": 5.297389984130859, 125 | "gpu_memory_used": 3314.0, 126 | "relative_time": 10.565555095672607 127 | }, 128 | { 129 | "timestamp": "2025-01-30T05:03:37.994149", 130 | "cpu_percent": 8.32, 131 | "ram_percent": 18.7, 132 | "ram_used_gb": 5.305477142333984, 133 | "gpu_memory_used": 1958.0, 134 | "relative_time": 11.616873502731323 135 | } 136 | ], 137 | "test_duration": 14.051392793655396 138 | } -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_data/cpu_benchmark_stats_rtf.txt: -------------------------------------------------------------------------------- 1 | === Benchmark Statistics (with correct RTF) === 2 | 3 | Total tokens processed: 1500 4 | Total audio generated (s): 427.90 5 | Total test duration (s): 10.84 6 | Average processing rate (tokens/s): 133.35 7 | Average RTF: 0.02 8 | Average Real Time Speed: 41.67 9 | 10 | === Per-chunk Stats === 11 | 12 | Average chunk size (tokens): 300.00 13 | Min chunk size (tokens): 100 14 | Max chunk size (tokens): 500 15 | Average processing time (s): 2.13 16 | Average output length (s): 85.58 17 | 18 | === Performance Ranges === 19 | 20 | Processing rate range (tokens/s): 102.04 - 159.74 21 | RTF range: 0.02x - 0.03x 22 | Real Time Speed range: 33.33x - 50.00x 23 | 24 | -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_data/gpu_benchmark_stats_rtf.txt: -------------------------------------------------------------------------------- 1 | === Benchmark Statistics (with correct RTF) === 2 | 3 | Total tokens processed: 3150 4 | Total audio generated (s): 895.98 5 | Total test 
duration (s): 23.54 6 | Average processing rate (tokens/s): 133.43 7 | Average RTF: 0.03 8 | Average Real Time Speed: 35.29 9 | 10 | === Per-chunk Stats === 11 | 12 | Average chunk size (tokens): 525.00 13 | Min chunk size (tokens): 150 14 | Max chunk size (tokens): 900 15 | Average processing time (s): 3.88 16 | Average output length (s): 149.33 17 | 18 | === Performance Ranges === 19 | 20 | Processing rate range (tokens/s): 127.12 - 147.93 21 | RTF range: 0.02x - 0.03x 22 | Real Time Speed range: 33.33x - 50.00x 23 | 24 | -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/cpu_processing_time_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/cpu_processing_time_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/cpu_realtime_factor_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/cpu_realtime_factor_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/cpu_system_usage_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/cpu_system_usage_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream_openai.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream_openai.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/gpu_processing_time_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/gpu_processing_time_rtf.png 
-------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/gpu_realtime_factor_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/gpu_realtime_factor_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/gpu_system_usage_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/gpu_system_usage_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream_openai.png -------------------------------------------------------------------------------- /examples/assorted_checks/test_combinations/test_download_voice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from pathlib import Path 4 | import requests 5 | 
def download_combined_voice(voice1: str, voice2: str, weights: tuple[float, float] | None = None) -> str:
    """Download a combined voice file from the local Kokoro API.

    Args:
        voice1: First voice name (e.g. "af_bella").
        voice2: Second voice name.
        weights: Optional tuple of weights (w1, w2). If not provided, the
            server combines the voices with equal weights.

    Returns:
        Path (as a string) to the downloaded .pt file.

    Raises:
        Exception: If the combine request does not return HTTP 200.
    """
    print(f"\nDownloading combined voice: {voice1} + {voice2}")

    # Voice spec syntax: "a+b" for equal weights, "a(w1)+b(w2)" for explicit ones.
    if weights:
        voice_str = f"{voice1}({weights[0]})+{voice2}({weights[1]})"
    else:
        voice_str = f"{voice1}+{voice2}"

    # Make the request to combine voices. The timeout keeps the script from
    # hanging forever when the server is unreachable (requests has no
    # default timeout).
    response = requests.post(
        "http://localhost:8880/v1/audio/voices/combine",
        json=voice_str,
        timeout=60,
    )

    if response.status_code != 200:
        raise Exception(f"Failed to combine voices: {response.text}")

    # Save the returned tensor bytes as a .pt file next to this script.
    output_path = output_dir / f"{voice_str}.pt"
    with open(output_path, "wb") as f:
        f.write(response.content)

    print(f"Saved combined voice to {output_path}")
    return str(output_path)
def test_format(
    format: str, text: str = "The quick brown fox jumped over the lazy dog."
):
    """Request speech from the local endpoint in *format* and save it under output/."""
    out_file = output_dir / f"speech_{format}.{format}"
    print(f"\nTesting {format} format...")
    print(f"Making request to {client.base_url}/audio/speech...")

    try:
        resp = client.audio.speech.create(
            model="tts-1", voice="af_heart", input=text, response_format=format
        )
        print("Got response, saving to file...")
        out_file.write_bytes(resp.content)
        print(f"Success! Saved to: {out_file}")
    except Exception as e:
        # The endpoint rejects unsupported formats; report and continue.
        print(f"Error: {str(e)}")
def analyze_voice_file(file_path):
    """Load a voice tensor and log its shape plus summary statistics.

    Returns:
        The tensor's shape on success, or None when the file cannot be read.
    """
    try:
        voice = torch.load(file_path, map_location="cpu")
        stats = [
            f"\nAnalyzing {os.path.basename(file_path)}:",
            f"Shape: {voice.shape}",
            f"Mean: {voice.mean().item():.4f}",
            f"Std: {voice.std().item():.4f}",
            f"Min: {voice.min().item():.4f}",
            f"Max: {voice.max().item():.4f}",
        ]
        for line in stats:
            logger.info(line)
        return voice.shape
    except Exception as e:
        logger.error(f"Error analyzing {file_path}: {e}")
        return None
"v1_0") 26 | 27 | logger.info(f"Scanning voices in: {voices_dir}") 28 | 29 | # Track shapes for comparison 30 | shapes = {} 31 | 32 | # Analyze each .pt file 33 | for file in os.listdir(voices_dir): 34 | if file.endswith('.pt'): 35 | file_path = os.path.join(voices_dir, file) 36 | shape = analyze_voice_file(file_path) 37 | if shape: 38 | shapes[file] = shape 39 | 40 | # Report findings 41 | logger.info("\nShape Analysis:") 42 | shape_groups = {} 43 | for file, shape in shapes.items(): 44 | if shape not in shape_groups: 45 | shape_groups[shape] = [] 46 | shape_groups[shape].append(file) 47 | 48 | for shape, files in shape_groups.items(): 49 | logger.info(f"\nShape {shape}:") 50 | for file in files: 51 | logger.info(f" - {file}") 52 | 53 | if __name__ == "__main__": 54 | main() -------------------------------------------------------------------------------- /examples/assorted_checks/test_voices/test_all_voices.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import openai 4 | import requests 5 | 6 | SAMPLE_TEXT = """ 7 | That is the germ of my great discovery. But you are wrong to say that we cannot move about in Time. For instance, if I am recalling an incident very vividly I go back to the instant of its occurrence: I become absent-minded, as you say. I jump back for a moment. 
8 | """ 9 | 10 | # Configure OpenAI client to use our local endpoint 11 | client = openai.OpenAI( 12 | timeout=60, 13 | api_key="notneeded", # API key not required for our endpoint 14 | base_url="http://localhost:8880/v1", # Point to our local server with v1 prefix 15 | ) 16 | 17 | # Create output directory if it doesn't exist 18 | output_dir = Path(__file__).parent / "output" 19 | output_dir.mkdir(exist_ok=True) 20 | 21 | 22 | def test_voice(voice: str): 23 | speech_file = output_dir / f"speech_{voice}.mp3" 24 | print(f"\nTesting voice: {voice}") 25 | print(f"Making request to {client.base_url}/audio/speech...") 26 | 27 | try: 28 | response = client.audio.speech.create( 29 | model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format="mp3" 30 | ) 31 | 32 | print("Got response, saving to file...") 33 | with open(speech_file, "wb") as f: 34 | f.write(response.content) 35 | print(f"Success! Saved to: {speech_file}") 36 | 37 | except Exception as e: 38 | print(f"Error with voice {voice}: {str(e)}") 39 | 40 | 41 | # First, get list of available voices using requests 42 | print("Getting list of available voices...") 43 | try: 44 | # Convert base_url to string and ensure no double slashes 45 | base_url = str(client.base_url).rstrip("/") 46 | response = requests.get(f"{base_url}/audio/voices") 47 | if response.status_code != 200: 48 | raise Exception(f"Failed to get voices: {response.text}") 49 | data = response.json() 50 | if "voices" not in data: 51 | raise Exception(f"Unexpected response format: {data}") 52 | voices = data["voices"] 53 | print(f"Found {len(voices)} voices: {', '.join(voices)}") 54 | 55 | # Test each voice 56 | for voice in voices: 57 | test_voice(voice) 58 | 59 | except Exception as e: 60 | print(f"Error getting voices: {str(e)}") 61 | -------------------------------------------------------------------------------- /examples/assorted_checks/test_voices/trim_voice_dimensions.py: 
def analyze_voice_content(tensor):
    """Log how variance is distributed along the tensor's first dimension.

    Returns the per-row variance, computed across the remaining dimensions.
    """
    row_var = torch.var(tensor, dim=(1, 2))  # Variance across features
    logger.info("Variance distribution:")
    logger.info(f"First 5 rows variance: {row_var[:5].mean().item():.6f}")
    logger.info(f"Last 5 rows variance: {row_var[-5:].mean().item():.6f}")
    return row_var

def trim_voice_tensor(tensor):
    """Trim a 511x1x256 tensor to 510x1x256 by removing the row with least impact."""
    if tensor.shape[0] != 511:
        raise ValueError(f"Expected tensor with first dimension 511, got {tensor.shape[0]}")

    # Per-row variance tells us which end carries less information.
    row_var = analyze_voice_content(tensor)

    head_var = row_var[:5].mean().item()
    tail_var = row_var[-5:].mean().item()

    # Drop a row from whichever end has the lower variance.
    if tail_var < head_var:
        logger.info("Trimming last row (lower variance at end)")
        return tensor[:-1]
    logger.info("Trimming first row (lower variance at start)")
    return tensor[1:]
logger.info(f"Created backup at {backup_path}") 50 | 51 | # Trim tensor 52 | trimmed = trim_voice_tensor(tensor) 53 | logger.info(f"New shape: {trimmed.shape}") 54 | 55 | # Save trimmed tensor 56 | torch.save(trimmed, file_path) 57 | logger.info(f"Saved trimmed tensor to {file_path}") 58 | 59 | return True 60 | except Exception as e: 61 | logger.error(f"Error processing {file_path}: {e}") 62 | return False 63 | 64 | def main(): 65 | """Process voice files in the voices directory.""" 66 | # Get the project root directory 67 | current_dir = os.path.dirname(os.path.abspath(__file__)) 68 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) 69 | voices_dir = os.path.join(project_root, "api", "src", "voices", "v1_0") 70 | 71 | logger.info(f"Processing voices in: {voices_dir}") 72 | 73 | processed = 0 74 | for file in os.listdir(voices_dir): 75 | if file.endswith('.pt') and not file.endswith('.backup'): 76 | file_path = os.path.join(voices_dir, file) 77 | if process_voice_file(file_path): 78 | processed += 1 79 | 80 | logger.info(f"\nProcessed {processed} voice files") 81 | logger.info("Backups created with .backup extension") 82 | logger.info("To restore backups if needed, remove .backup extension to replace trimmed files") 83 | 84 | if __name__ == "__main__": 85 | main() -------------------------------------------------------------------------------- /examples/assorted_checks/validate_wavs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | from validate_wav import validate_tts 5 | 6 | 7 | def print_validation_result(result: dict, rel_path: Path): 8 | """Print full validation details for a single file.""" 9 | print(f"\nValidating: {rel_path}") 10 | if "error" in result: 11 | print(f"Error: {result['error']}") 12 | else: 13 | print(f"Duration: {result['duration']}") 14 | print(f"Sample Rate: {result['sample_rate']} Hz") 15 | print(f"Peak Amplitude: 
def validate_directory(directory: str):
    """Validate every wav/mp3 under *directory*, printing details then a summary."""
    dir_path = Path(directory)

    # Collect audio files recursively; mp3s are validated alongside wavs.
    found = list(dir_path.rglob("*.wav"))
    found.extend(dir_path.rglob("*.mp3"))  # Also check mp3s
    found = sorted(found)

    if not found:
        print(f"No .wav or .mp3 files found in {directory}")
        return

    print(f"Found {len(found)} files in {directory}")
    print("=" * 80)

    # Keep (relative path, result) pairs so the summary can be printed after
    # the detailed per-file output.
    results = []
    for audio_file in found:
        outcome = validate_tts(str(audio_file))
        rel = audio_file.relative_to(dir_path)
        print_validation_result(outcome, rel)
        results.append((rel, outcome))
        print("=" * 80)

    # Summary: one line per file, showing the first issue when it failed.
    print("\nSUMMARY:")
    for rel, outcome in results:
        if "error" in outcome:
            print(f"{rel}: ERROR - {outcome['error']}")
        elif outcome["issues"]:
            issues = outcome["issues"]
            first_issue = issues[0].replace("WARNING: ", "")
            if len(issues) > 1:
                print(
                    f"{rel}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
                )
            else:
                print(f"{rel}: FAIL - {first_issue}")
        else:
            print(f"{rel}: PASS")
def main() -> None:
    """Play a streamed sample through the speakers, then save one to disk."""
    stream_to_speakers()

    # Create text-to-speech audio file
    request = dict(
        model="kokoro",
        voice="af_bella",
        input="the quick brown fox jumped over the lazy dogs",
    )
    with openai.audio.speech.with_streaming_response.create(**request) as resp:
        resp.stream_to_file(speech_file_path)
36 | input="""I see skies of blue and clouds of white 37 | The bright blessed days, the dark sacred nights 38 | And I think to myself 39 | What a wonderful world""", 40 | ) as response: 41 | print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") 42 | for chunk in response.iter_bytes(chunk_size=1024): 43 | player_stream.write(chunk) 44 | 45 | print(f"Done in {int((time.time() - start_time) * 1000)}ms.") 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /examples/phoneme_examples/examples/phoneme_examples/output/phoneme_test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/phoneme_examples/examples/phoneme_examples/output/phoneme_test.wav -------------------------------------------------------------------------------- /examples/phoneme_examples/test_phoneme_generation.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import json 4 | 5 | def main(): 6 | # Test phoneme string 7 | phonemes = "hˈɛloʊ wˈɜrld" # "Hello world" in phonemes 8 | 9 | try: 10 | print("\nTesting phoneme generation via API...") 11 | 12 | # Create request payload 13 | payload = { 14 | "phonemes": phonemes, 15 | "voice": "af_bella" # Using bella voice 16 | } 17 | 18 | # Make request to the API endpoint 19 | response = requests.post( 20 | "http://localhost:8880/dev/generate_from_phonemes", 21 | json=payload, 22 | stream=True # Enable streaming for audio data 23 | ) 24 | 25 | # Check if request was successful 26 | if response.status_code == 200: 27 | # Create output directory if it doesn't exist 28 | os.makedirs("examples/phoneme_examples/output", exist_ok=True) 29 | 30 | # Save the audio response 31 | output_path = 'examples/phoneme_examples/output/phoneme_test.wav' 32 | with 
open(output_path, 'wb') as f: 33 | for chunk in response.iter_content(chunk_size=8192): 34 | if chunk: 35 | f.write(chunk) 36 | 37 | print(f"\nAudio saved to: {output_path}") 38 | print("\nPhoneme test completed successfully!") 39 | print(f"\nInput phonemes: {phonemes}") 40 | else: 41 | print(f"Error: API request failed with status code {response.status_code}") 42 | print(f"Response: {response.text}") 43 | 44 | except Exception as e: 45 | print(f"An error occurred: {str(e)}") 46 | 47 | if __name__ == "__main__": 48 | main() -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | openai>=1.0.0 2 | pyaudio>=0.2.13 3 | -------------------------------------------------------------------------------- /examples/simul_file_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rye run python 2 | import asyncio 3 | import time 4 | from pathlib import Path 5 | from openai import AsyncOpenAI 6 | 7 | # Initialize async client 8 | openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local") 9 | 10 | async def save_to_file(text: str, file_id: int) -> None: 11 | """Save TTS output to file asynchronously""" 12 | speech_file_path = Path(__file__).parent / f"speech_{file_id}.mp3" 13 | 14 | start_time = time.time() 15 | print(f"Starting file {file_id}") 16 | 17 | try: 18 | # Use streaming endpoint with mp3 format 19 | async with openai.audio.speech.with_streaming_response.create( 20 | model="kokoro", 21 | voice="af_bella", 22 | input=text, 23 | response_format="mp3" 24 | ) as response: 25 | print(f"File {file_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms") 26 | 27 | # Open file in binary write mode 28 | with open(speech_file_path, 'wb') as f: 29 | async for chunk in response.iter_bytes(): 30 | f.write(chunk) 31 | 32 | print(f"File {file_id} 
completed in {int((time.time() - start_time) * 1000)}ms") 33 | except Exception as e: 34 | print(f"Error processing file {file_id}: {e}") 35 | 36 | async def main() -> None: 37 | # Different text samples for variety 38 | texts = [ 39 | "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white", 40 | "I see skies of blue and clouds of white. I see skies of blue and clouds of white", 41 | ] 42 | 43 | # Create tasks for saving to files 44 | file_tasks = [ 45 | save_to_file(text, i) 46 | for i, text in enumerate(texts) 47 | ] 48 | 49 | # Run file tasks concurrently 50 | await asyncio.gather(*file_tasks) 51 | 52 | if __name__ == "__main__": 53 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/simul_openai_streaming_audio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rye run python 2 | import asyncio 3 | import time 4 | from pathlib import Path 5 | import pyaudio 6 | from openai import AsyncOpenAI 7 | 8 | # Initialize async client 9 | openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local") 10 | 11 | # Create a shared PyAudio instance 12 | p = pyaudio.PyAudio() 13 | 14 | async def stream_to_speakers(text: str, stream_id: int) -> None: 15 | """Stream TTS audio to speakers asynchronously""" 16 | player_stream = p.open( 17 | format=pyaudio.paInt16, 18 | channels=1, 19 | rate=24000, 20 | output=True 21 | ) 22 | 23 | start_time = time.time() 24 | print(f"Starting stream {stream_id}") 25 | 26 | try: 27 | async with openai.audio.speech.with_streaming_response.create( 28 | model="kokoro", 29 | voice="af_bella", 30 | response_format="pcm", 31 | input=text 32 | ) as response: 33 | print(f"Stream {stream_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms") 34 | 35 | async for chunk in response.iter_bytes(chunk_size=1024): 36 | player_stream.write(chunk) 37 | # Small sleep to allow 
other coroutines to run 38 | await asyncio.sleep(0.001) 39 | 40 | print(f"Stream {stream_id} completed in {int((time.time() - start_time) * 1000)}ms") 41 | 42 | finally: 43 | player_stream.stop_stream() 44 | player_stream.close() 45 | 46 | async def save_to_file(text: str, file_id: int) -> None: 47 | """Save TTS output to file asynchronously""" 48 | speech_file_path = Path(__file__).parent / f"speech_{file_id}.mp3" 49 | 50 | async with openai.audio.speech.with_streaming_response.create( 51 | model="kokoro", 52 | voice="af_bella", 53 | input=text 54 | ) as response: 55 | # Open file in binary write mode 56 | with open(speech_file_path, 'wb') as f: 57 | async for chunk in response.iter_bytes(): 58 | f.write(chunk) 59 | print(f"File {file_id} saved to {speech_file_path}") 60 | 61 | async def main() -> None: 62 | # Different text samples for variety 63 | texts = [ 64 | "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white", 65 | "I see skies of blue and clouds of white. 
I see skies of blue and clouds of white", 66 | ] 67 | 68 | # Create tasks for streaming to speakers 69 | speaker_tasks = [ 70 | stream_to_speakers(text, i) 71 | for i, text in enumerate(texts) 72 | ] 73 | 74 | # Create tasks for saving to files 75 | file_tasks = [ 76 | save_to_file(text, i) 77 | for i, text in enumerate(texts) 78 | ] 79 | 80 | # Combine all tasks 81 | all_tasks = speaker_tasks + file_tasks 82 | 83 | # Run all tasks concurrently 84 | try: 85 | await asyncio.gather(*all_tasks) 86 | finally: 87 | # Clean up PyAudio 88 | p.terminate() 89 | 90 | if __name__ == "__main__": 91 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/simul_speaker_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rye run python 2 | import asyncio 3 | import time 4 | import pyaudio 5 | from openai import AsyncOpenAI 6 | 7 | # Initialize async client 8 | openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local") 9 | 10 | # Create a shared PyAudio instance 11 | p = pyaudio.PyAudio() 12 | 13 | async def stream_to_speakers(text: str, stream_id: int) -> None: 14 | """Stream TTS audio to speakers asynchronously""" 15 | player_stream = p.open( 16 | format=pyaudio.paInt16, 17 | channels=1, 18 | rate=24000, 19 | output=True 20 | ) 21 | 22 | start_time = time.time() 23 | print(f"Starting stream {stream_id}") 24 | 25 | try: 26 | async with openai.audio.speech.with_streaming_response.create( 27 | model="kokoro", 28 | voice="af_bella", 29 | response_format="pcm", 30 | input=text 31 | ) as response: 32 | print(f"Stream {stream_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms") 33 | 34 | async for chunk in response.iter_bytes(chunk_size=1024): 35 | player_stream.write(chunk) 36 | # Small sleep to allow other coroutines to run 37 | await asyncio.sleep(0.001) 38 | 39 | print(f"Stream {stream_id} completed in {int((time.time() - 
start_time) * 1000)}ms") 40 | 41 | finally: 42 | player_stream.stop_stream() 43 | player_stream.close() 44 | 45 | async def main() -> None: 46 | # Different text samples for variety 47 | texts = [ 48 | "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white", 49 | "I see skies of blue and clouds of white. I see skies of blue and clouds of white", 50 | ] 51 | 52 | # Create tasks for streaming to speakers 53 | speaker_tasks = [ 54 | stream_to_speakers(text, i) 55 | for i, text in enumerate(texts) 56 | ] 57 | 58 | # Run speaker tasks concurrently 59 | try: 60 | await asyncio.gather(*speaker_tasks) 61 | finally: 62 | # Clean up PyAudio 63 | p.terminate() 64 | 65 | if __name__ == "__main__": 66 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/speech.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/speech.mp3 -------------------------------------------------------------------------------- /examples/streaming_refactor/test_unified_streaming.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Test script for unified streaming implementation""" 3 | 4 | import asyncio 5 | import time 6 | from pathlib import Path 7 | 8 | from openai import OpenAI 9 | 10 | # Initialize OpenAI client 11 | client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed") 12 | 13 | async def test_streaming_to_file(): 14 | """Test streaming to file""" 15 | print("\nTesting streaming to file...") 16 | speech_file = Path(__file__).parent / "stream_output.mp3" 17 | 18 | start_time = time.time() 19 | with client.audio.speech.with_streaming_response.create( 20 | model="kokoro", 21 | voice="af_bella", 22 | input="Testing unified streaming implementation with a short phrase.", 23 | ) as 
response: 24 | response.stream_to_file(speech_file) 25 | 26 | print(f"Streaming to file completed in {(time.time() - start_time):.2f}s") 27 | print(f"Output saved to: {speech_file}") 28 | 29 | async def test_streaming_chunks(): 30 | """Test streaming chunks for real-time playback""" 31 | print("\nTesting chunk streaming...") 32 | 33 | start_time = time.time() 34 | chunk_count = 0 35 | total_bytes = 0 36 | 37 | with client.audio.speech.with_streaming_response.create( 38 | model="kokoro", 39 | voice="af_bella", 40 | response_format="pcm", 41 | input="""This is a longer text to test chunk streaming. 42 | We want to verify that the unified streaming implementation 43 | works efficiently for both small and large inputs.""", 44 | ) as response: 45 | print(f"Time to first byte: {(time.time() - start_time):.3f}s") 46 | 47 | for chunk in response.iter_bytes(chunk_size=1024): 48 | chunk_count += 1 49 | total_bytes += len(chunk) 50 | # In real usage, this would go to audio playback 51 | # For testing, we just count chunks and bytes 52 | 53 | total_time = time.time() - start_time 54 | print(f"Received {chunk_count} chunks, {total_bytes} bytes") 55 | print(f"Total streaming time: {total_time:.2f}s") 56 | print(f"Average throughput: {total_bytes/total_time/1024:.1f} KB/s") 57 | 58 | async def main(): 59 | """Run all tests""" 60 | print("Starting unified streaming tests...") 61 | 62 | # Test both streaming modes 63 | await test_streaming_to_file() 64 | await test_streaming_chunks() 65 | 66 | print("\nAll tests completed!") 67 | 68 | if __name__ == "__main__": 69 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/voice_samples/speech_af.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af.mp3 
-------------------------------------------------------------------------------- /examples/voice_samples/speech_af_bella.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af_bella.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_af_nicole.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af_nicole.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_af_sarah.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af_sarah.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_am_adam.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_am_adam.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_am_michael.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_am_michael.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bf_emma.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bf_emma.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bf_isabella.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bf_isabella.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bm_george.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bm_george.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bm_lewis.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bm_lewis.mp3 -------------------------------------------------------------------------------- /githubbanner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/githubbanner.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "kokoro-fastapi" 3 | version = "0.3.0" 4 | description = "FastAPI TTS Service" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | # Core dependencies 9 | "fastapi==0.115.6", 10 | "uvicorn==0.34.0", 11 | "click>=8.0.0", 12 | "pydantic==2.10.4", 13 | "pydantic-settings==2.7.0", 14 | 
"python-dotenv==1.0.1", 15 | "sqlalchemy==2.0.27", 16 | # ML/DL Base 17 | "numpy>=1.26.0", 18 | "scipy==1.14.1", 19 | # Audio processing 20 | "soundfile==0.13.0", 21 | "regex==2024.11.6", 22 | # Utilities 23 | "aiofiles==23.2.1", 24 | "tqdm==4.67.1", 25 | "requests==2.32.3", 26 | "munch==4.0.0", 27 | "tiktoken==0.8.0", 28 | "loguru==0.7.3", 29 | "openai>=1.59.6", 30 | "pydub>=0.25.1", 31 | "matplotlib>=3.10.0", 32 | "mutagen>=1.47.0", 33 | "psutil>=6.1.1", 34 | "espeakng-loader==0.2.4", 35 | "kokoro==0.9.2", 36 | "misaki[en,ja,ko,zh]==0.9.3", 37 | "spacy==3.8.5", 38 | "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", 39 | "inflect>=7.5.0", 40 | "phonemizer-fork>=3.3.2", 41 | "av>=14.2.0", 42 | "text2num>=2.5.1", 43 | ] 44 | 45 | [project.optional-dependencies] 46 | gpu = [ 47 | "torch==2.6.0+cu124", 48 | ] 49 | cpu = [ 50 | "torch==2.6.0", 51 | ] 52 | test = [ 53 | "pytest==8.3.5", 54 | "pytest-cov==6.0.0", 55 | "httpx==0.26.0", 56 | "pytest-asyncio==0.25.3", 57 | "tomli>=2.0.1", 58 | "jinja2>=3.1.6" 59 | ] 60 | 61 | [tool.uv] 62 | conflicts = [ 63 | [ 64 | { extra = "cpu" }, 65 | { extra = "gpu" }, 66 | ], 67 | ] 68 | 69 | [tool.uv.sources] 70 | torch = [ 71 | { index = "pytorch-cpu", extra = "cpu" }, 72 | { index = "pytorch-cuda", extra = "gpu" }, 73 | ] 74 | 75 | [[tool.uv.index]] 76 | name = "pytorch-cpu" 77 | url = "https://download.pytorch.org/whl/cpu" 78 | explicit = true 79 | 80 | [[tool.uv.index]] 81 | name = "pytorch-cuda" 82 | url = "https://download.pytorch.org/whl/cu124" 83 | explicit = true 84 | 85 | [build-system] 86 | requires = ["setuptools>=61.0"] 87 | build-backend = "setuptools.build_meta" 88 | 89 | [tool.setuptools] 90 | package-dir = {"" = "api/src"} 91 | packages.find = {where = ["api/src"], namespaces = true} 92 | 93 | [tool.pytest.ini_options] 94 | testpaths = ["api/tests", "ui/tests"] 95 | python_files = ["test_*.py"] 96 | addopts = "--cov=api --cov=ui 
--cov-report=term-missing --cov-config=.coveragerc --full-trace" 97 | asyncio_mode = "auto" 98 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = api/tests 3 | python_files = test_*.py 4 | addopts = -v --tb=short --cov=api --cov-report=term-missing --cov-config=.coveragerc 5 | pythonpath = . 6 | -------------------------------------------------------------------------------- /scripts/fix_misaki.py: -------------------------------------------------------------------------------- 1 | """ 2 | Patch for misaki package to fix the EspeakWrapper.set_data_path issue. 3 | """ 4 | 5 | import importlib.util 6 | import os 7 | import sys 8 | 9 | # Find the misaki package 10 | try: 11 | import misaki 12 | 13 | misaki_path = os.path.dirname(misaki.__file__) 14 | print(f"Found misaki package at: {misaki_path}") 15 | except ImportError: 16 | print("Misaki package not found. 
Make sure it's installed.") 17 | sys.exit(1) 18 | 19 | # Path to the espeak.py file 20 | espeak_file = os.path.join(misaki_path, "espeak.py") 21 | 22 | if not os.path.exists(espeak_file): 23 | print(f"Could not find {espeak_file}") 24 | sys.exit(1) 25 | 26 | # Read the current content 27 | with open(espeak_file, "r") as f: 28 | content = f.read() 29 | 30 | # Check if the problematic line exists 31 | if "EspeakWrapper.set_data_path(espeakng_loader.get_data_path())" in content: 32 | # Replace the problematic line 33 | new_content = content.replace( 34 | "EspeakWrapper.set_data_path(espeakng_loader.get_data_path())", 35 | "# Fixed line to use data_path attribute instead of set_data_path method\n" 36 | "EspeakWrapper.data_path = espeakng_loader.get_data_path()", 37 | ) 38 | 39 | # Write the modified content back 40 | with open(espeak_file, "w") as f: 41 | f.write(new_content) 42 | 43 | print(f"Successfully patched {espeak_file}") 44 | else: 45 | print(f"The problematic line was not found in {espeak_file}") 46 | print("The file may have already been patched or the issue is different.") 47 | -------------------------------------------------------------------------------- /start-cpu.ps1: -------------------------------------------------------------------------------- 1 | $env:PHONEMIZER_ESPEAK_LIBRARY="C:\Program Files\eSpeak NG\libespeak-ng.dll" 2 | $env:PYTHONUTF8=1 3 | $Env:PROJECT_ROOT="$pwd" 4 | $Env:USE_GPU="false" 5 | $Env:USE_ONNX="false" 6 | $Env:PYTHONPATH="$Env:PROJECT_ROOT;$Env:PROJECT_ROOT/api" 7 | $Env:MODEL_DIR="src/models" 8 | $Env:VOICES_DIR="src/voices/v1_0" 9 | $Env:WEB_PLAYER_PATH="$Env:PROJECT_ROOT/web" 10 | 11 | uv pip install -e ".[cpu]" 12 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 13 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 -------------------------------------------------------------------------------- /start-cpu.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get project root directory 4 | PROJECT_ROOT=$(pwd) 5 | 6 | # Set environment variables 7 | export USE_GPU=false 8 | export USE_ONNX=false 9 | export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api 10 | export MODEL_DIR=src/models 11 | export VOICES_DIR=src/voices/v1_0 12 | export WEB_PLAYER_PATH=$PROJECT_ROOT/web 13 | # Set the espeak-ng data path to your location 14 | export ESPEAK_DATA_PATH=/usr/lib/x86_64-linux-gnu/espeak-ng-data 15 | 16 | # Run FastAPI with CPU extras using uv run 17 | # Note: espeak may still require manual installation, 18 | uv pip install -e ".[cpu]" 19 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 20 | 21 | # Apply the misaki patch to fix possible EspeakWrapper issue in older versions 22 | # echo "Applying misaki patch..." 23 | # python scripts/fix_misaki.py 24 | 25 | # Start the server 26 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 27 | -------------------------------------------------------------------------------- /start-gpu.ps1: -------------------------------------------------------------------------------- 1 | $env:PHONEMIZER_ESPEAK_LIBRARY="C:\Program Files\eSpeak NG\libespeak-ng.dll" 2 | $env:PYTHONUTF8=1 3 | $Env:PROJECT_ROOT="$pwd" 4 | $Env:USE_GPU="true" 5 | $Env:USE_ONNX="false" 6 | $Env:PYTHONPATH="$Env:PROJECT_ROOT;$Env:PROJECT_ROOT/api" 7 | $Env:MODEL_DIR="src/models" 8 | $Env:VOICES_DIR="src/voices/v1_0" 9 | $Env:WEB_PLAYER_PATH="$Env:PROJECT_ROOT/web" 10 | 11 | uv pip install -e ".[gpu]" 12 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 13 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 -------------------------------------------------------------------------------- /start-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get 
project root directory 4 | PROJECT_ROOT=$(pwd) 5 | 6 | # Set environment variables 7 | export USE_GPU=true 8 | export USE_ONNX=false 9 | export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api 10 | export MODEL_DIR=src/models 11 | export VOICES_DIR=src/voices/v1_0 12 | export WEB_PLAYER_PATH=$PROJECT_ROOT/web 13 | 14 | # Run FastAPI with GPU extras using uv run 15 | # Note: espeak may still require manual installation, 16 | uv pip install -e ".[gpu]" 17 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 18 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 19 | -------------------------------------------------------------------------------- /start-gpu_mac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get project root directory 4 | PROJECT_ROOT=$(pwd) 5 | 6 | # Set other environment variables 7 | export USE_GPU=true 8 | export USE_ONNX=false 9 | export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api 10 | export MODEL_DIR=src/models 11 | export VOICES_DIR=src/voices/v1_0 12 | export WEB_PLAYER_PATH=$PROJECT_ROOT/web 13 | 14 | export DEVICE_TYPE=mps 15 | # Enable MPS fallback for unsupported operations 16 | export PYTORCH_ENABLE_MPS_FALLBACK=1 17 | 18 | # Run FastAPI with GPU extras using uv run 19 | uv pip install -e . 20 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 21 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 22 | -------------------------------------------------------------------------------- /ui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | WORKDIR /app/ui 4 | 5 | # Install dependencies 6 | RUN pip install gradio==5.9.1 requests==2.32.3 7 | 8 | # Create necessary directories 9 | RUN mkdir -p data/inputs data/outputs 10 | 11 | # Copy the application files 12 | COPY . . 
13 | 14 | ENV API_HOST=kokoro-tts 15 | ENV API_PORT=8880 16 | 17 | # Run the Gradio app 18 | CMD ["python", "app.py"] 19 | -------------------------------------------------------------------------------- /ui/GUIBanner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/ui/GUIBanner.png -------------------------------------------------------------------------------- /ui/GradioScreenShot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/ui/GradioScreenShot.png -------------------------------------------------------------------------------- /ui/app.py: -------------------------------------------------------------------------------- 1 | from lib.interface import create_interface 2 | 3 | if __name__ == "__main__": 4 | demo = create_interface() 5 | demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True) 6 | -------------------------------------------------------------------------------- /ui/depr_tests/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import AsyncMock, Mock 2 | 3 | import pytest 4 | 5 | from api.src.services.tts_service import TTSService 6 | 7 | 8 | @pytest.fixture 9 | async def mock_model_manager(): 10 | """Mock model manager for UI tests""" 11 | manager = AsyncMock() 12 | manager.get_backend = Mock(return_value=Mock(device="cpu")) 13 | return manager 14 | 15 | 16 | @pytest.fixture 17 | async def mock_voice_manager(): 18 | """Mock voice manager for UI tests""" 19 | manager = AsyncMock() 20 | manager.list_voices = AsyncMock(return_value=["af_heart", "bm_lewis", "af_sarah"]) 21 | return manager 22 | 23 | 24 | @pytest.fixture 25 | async def mock_tts_service(mock_model_manager, mock_voice_manager): 26 | """Mock 
TTSService for UI tests""" 27 | service = AsyncMock() 28 | service.model_manager = mock_model_manager 29 | service._voice_manager = mock_voice_manager 30 | return service 31 | 32 | 33 | @pytest.fixture(autouse=True) 34 | async def setup_mocks( 35 | monkeypatch, mock_model_manager, mock_voice_manager, mock_tts_service 36 | ): 37 | """Setup global mocks for UI tests""" 38 | 39 | async def mock_get_model(): 40 | return mock_model_manager 41 | 42 | async def mock_get_voice(): 43 | return mock_voice_manager 44 | 45 | async def mock_create_service(): 46 | return mock_tts_service 47 | 48 | monkeypatch.setattr("api.src.inference.model_manager.get_manager", mock_get_model) 49 | monkeypatch.setattr("api.src.inference.voice_manager.get_manager", mock_get_voice) 50 | monkeypatch.setattr( 51 | "api.src.services.tts_service.TTSService.create", mock_create_service 52 | ) 53 | -------------------------------------------------------------------------------- /ui/depr_tests/test_handlers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Drop all tests for now. The Gradio event system is too complex to test properly. 3 | We'll need to find a better way to test the UI functionality. 
4 | """ 5 | -------------------------------------------------------------------------------- /ui/depr_tests/test_input.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import pytest 3 | 4 | from ui.lib.components.input import create_input_column 5 | 6 | 7 | def test_create_input_column_structure(): 8 | """Test that create_input_column returns the expected structure""" 9 | column, components = create_input_column() 10 | 11 | # Test the return types 12 | assert isinstance(column, gr.Column) 13 | assert isinstance(components, dict) 14 | 15 | # Test that all expected components are present 16 | expected_components = { 17 | "tabs", 18 | "text_input", 19 | "file_select", 20 | "file_upload", 21 | "file_preview", 22 | "text_submit", 23 | "file_submit", 24 | "clear_files", 25 | } 26 | assert set(components.keys()) == expected_components 27 | 28 | # Test component types 29 | assert isinstance(components["tabs"], gr.Tabs) 30 | assert isinstance(components["text_input"], gr.Textbox) 31 | assert isinstance(components["file_select"], gr.Dropdown) 32 | assert isinstance(components["file_upload"], gr.File) 33 | assert isinstance(components["file_preview"], gr.Textbox) 34 | assert isinstance(components["text_submit"], gr.Button) 35 | assert isinstance(components["file_submit"], gr.Button) 36 | assert isinstance(components["clear_files"], gr.Button) 37 | 38 | 39 | def test_text_input_configuration(): 40 | """Test the text input component configuration""" 41 | _, components = create_input_column() 42 | text_input = components["text_input"] 43 | 44 | assert text_input.label == "Text to speak" 45 | assert text_input.placeholder == "Enter text here..." 
46 | assert text_input.lines == 4 47 | 48 | 49 | def test_file_upload_configuration(): 50 | """Test the file upload component configuration""" 51 | _, components = create_input_column() 52 | file_upload = components["file_upload"] 53 | 54 | assert file_upload.label == "Upload Text File (.txt)" 55 | assert file_upload.file_types == [".txt"] 56 | 57 | 58 | def test_button_configurations(): 59 | """Test the button configurations""" 60 | _, components = create_input_column() 61 | 62 | # Test text submit button 63 | assert components["text_submit"].value == "Generate Speech" 64 | assert components["text_submit"].variant == "primary" 65 | assert components["text_submit"].size == "lg" 66 | 67 | # Test file submit button 68 | assert components["file_submit"].value == "Generate Speech" 69 | assert components["file_submit"].variant == "primary" 70 | assert components["file_submit"].size == "lg" 71 | 72 | # Test clear files button 73 | assert components["clear_files"].value == "Clear Files" 74 | assert components["clear_files"].variant == "secondary" 75 | assert components["clear_files"].size == "lg" 76 | -------------------------------------------------------------------------------- /ui/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/ui/lib/__init__.py -------------------------------------------------------------------------------- /ui/lib/api.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from typing import List, Optional, Tuple 4 | 5 | import requests 6 | 7 | from .config import API_URL, OUTPUTS_DIR 8 | 9 | 10 | def check_api_status() -> Tuple[bool, List[str]]: 11 | """Check TTS service status and get available voices.""" 12 | try: 13 | # Use a longer timeout during startup 14 | response = requests.get( 15 | f"{API_URL}/v1/audio/voices", 16 | 
timeout=30, # Increased timeout for initial startup period 17 | ) 18 | response.raise_for_status() 19 | voices = response.json().get("voices", []) 20 | if voices: 21 | return True, voices 22 | print("No voices found in response") 23 | return False, [] 24 | except requests.exceptions.Timeout: 25 | print("API request timed out (waiting for service startup)") 26 | return False, [] 27 | except requests.exceptions.ConnectionError as e: 28 | print(f"Connection error (service may be starting up): {str(e)}") 29 | return False, [] 30 | except requests.exceptions.RequestException as e: 31 | print(f"API request failed: {str(e)}") 32 | return False, [] 33 | except Exception as e: 34 | print(f"Unexpected error checking API status: {str(e)}") 35 | return False, [] 36 | 37 | 38 | def text_to_speech( 39 | text: str, voice_id: str | list, format: str, speed: float 40 | ) -> Optional[str]: 41 | """Generate speech from text using TTS API.""" 42 | if not text.strip(): 43 | return None 44 | 45 | # Handle multiple voices 46 | voice_str = voice_id if isinstance(voice_id, str) else "+".join(voice_id) 47 | 48 | # Create output filename 49 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 50 | output_filename = f"output_{timestamp}_voice-{voice_str}_speed-{speed}.{format}" 51 | output_path = os.path.join(OUTPUTS_DIR, output_filename) 52 | 53 | try: 54 | response = requests.post( 55 | f"{API_URL}/v1/audio/speech", 56 | json={ 57 | "model": "kokoro", 58 | "input": text, 59 | "voice": voice_str, 60 | "response_format": format, 61 | "speed": float(speed), 62 | }, 63 | headers={"Content-Type": "application/json"}, 64 | timeout=300, # Longer timeout for speech generation 65 | ) 66 | response.raise_for_status() 67 | 68 | with open(output_path, "wb") as f: 69 | f.write(response.content) 70 | return output_path 71 | 72 | except requests.exceptions.Timeout: 73 | print("Speech generation request timed out") 74 | return None 75 | except requests.exceptions.RequestException as e: 76 | 
print(f"Speech generation request failed: {str(e)}") 77 | return None 78 | except Exception as e: 79 | print(f"Unexpected error generating speech: {str(e)}") 80 | return None 81 | 82 | 83 | def get_status_html(is_available: bool) -> str: 84 | """Generate HTML for status indicator.""" 85 | color = "green" if is_available else "red" 86 | status = "Available" if is_available else "Unavailable" 87 | return f""" 88 |
89 |
90 | TTS Service: {status} 91 |
92 | """ 93 | -------------------------------------------------------------------------------- /ui/lib/components/__init__.py: -------------------------------------------------------------------------------- 1 | from .input import create_input_column 2 | from .model import create_model_column 3 | from .output import create_output_column 4 | 5 | __all__ = ["create_input_column", "create_model_column", "create_output_column"] 6 | -------------------------------------------------------------------------------- /ui/lib/components/input.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import gradio as gr 4 | 5 | from .. import files 6 | 7 | 8 | def create_input_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]: 9 | """Create the input column with text input and file handling.""" 10 | with gr.Column(scale=1) as col: 11 | text_input = gr.Textbox( 12 | label="Text to speak", placeholder="Enter text here...", lines=4 13 | ) 14 | 15 | # Always show file upload but handle differently based on disable_local_saving 16 | file_upload = gr.File(label="Upload Text File (.txt)", file_types=[".txt"]) 17 | 18 | if not disable_local_saving: 19 | # Show full interface with tabs when saving is enabled 20 | with gr.Tabs() as tabs: 21 | # Set first tab as selected by default 22 | tabs.selected = 0 23 | # Direct Input Tab 24 | with gr.TabItem("Direct Input"): 25 | text_submit_direct = gr.Button( 26 | "Generate Speech", variant="primary", size="lg" 27 | ) 28 | 29 | # File Input Tab 30 | with gr.TabItem("From File"): 31 | # Existing files dropdown 32 | input_files_list = gr.Dropdown( 33 | label="Select Existing File", 34 | choices=files.list_input_files(), 35 | value=None, 36 | ) 37 | 38 | file_preview = gr.Textbox( 39 | label="File Content Preview", interactive=False, lines=4 40 | ) 41 | 42 | with gr.Row(): 43 | file_submit = gr.Button( 44 | "Generate Speech", variant="primary", size="lg" 45 | ) 46 | 
clear_files = gr.Button( 47 | "Clear Files", variant="secondary", size="lg" 48 | ) 49 | else: 50 | # Just show the generate button when saving is disabled 51 | text_submit_direct = gr.Button( 52 | "Generate Speech", variant="primary", size="lg" 53 | ) 54 | tabs = None 55 | input_files_list = None 56 | file_preview = None 57 | file_submit = None 58 | clear_files = None 59 | 60 | # Initialize components based on disable_local_saving 61 | if disable_local_saving: 62 | components = { 63 | "tabs": None, 64 | "text_input": text_input, 65 | "text_submit": text_submit_direct, 66 | "file_select": None, 67 | "file_upload": file_upload, # Keep file upload even when saving is disabled 68 | "file_preview": None, 69 | "file_submit": None, 70 | "clear_files": None, 71 | } 72 | else: 73 | components = { 74 | "tabs": tabs, 75 | "text_input": text_input, 76 | "text_submit": text_submit_direct, 77 | "file_select": input_files_list, 78 | "file_upload": file_upload, 79 | "file_preview": file_preview, 80 | "file_submit": file_submit, 81 | "clear_files": clear_files, 82 | } 83 | 84 | return col, components 85 | -------------------------------------------------------------------------------- /ui/lib/components/model.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import gradio as gr 4 | 5 | from .. 
def create_model_column(voice_ids: Optional[list] = None) -> Tuple[gr.Column, dict]:
    """Create the model settings column."""
    if voice_ids is None:
        voice_ids = []

    with gr.Column(scale=1) as col:
        gr.Markdown("### Model Settings")

        # Status button starts in waiting state
        service_status = gr.Button(
            "⌛ TTS Service: Waiting for Service...", variant="secondary"
        )

        voice_dropdown = gr.Dropdown(
            choices=voice_ids,
            label="Voice(s)",
            value=voice_ids[0] if voice_ids else None,
            interactive=True,
            multiselect=True,
        )
        format_dropdown = gr.Dropdown(
            choices=config.AUDIO_FORMATS, label="Audio Format", value="mp3"
        )
        speed_slider = gr.Slider(
            minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"
        )

    return col, {
        "status_btn": service_status,
        "voice": voice_dropdown,
        "format": format_dropdown,
        "speed": speed_slider,
    }
def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]:
    """Create the output column with audio player and file list."""
    show_files = not disable_local_saving

    with gr.Column(scale=1) as col:
        gr.Markdown("### Latest Output")
        latest_audio = gr.Audio(
            label="Generated Speech",
            type="filepath",
            waveform_options={"waveform_color": "#4C87AB"},
        )

        # File-related components are created but hidden when saving is disabled
        gr.Markdown("### Generated Files", visible=show_files)
        previous_outputs = gr.Dropdown(
            label="Previous Outputs",
            choices=files.list_output_files() if show_files else [],
            value=None,
            allow_custom_value=True,
            visible=show_files,
        )

        play_button = gr.Button(
            "▶️ Play Selected",
            size="sm",
            visible=show_files,
        )

        chosen_audio = gr.Audio(
            label="Selected Output",
            type="filepath",
            visible=False,  # Always initially hidden
        )

        delete_button = gr.Button(
            "⚠️ Delete All Previously Generated Output Audio 🗑️",
            size="sm",
            variant="secondary",
            visible=show_files,
        )

    return col, {
        "audio_output": latest_audio,
        "output_files": previous_outputs,
        "play_btn": play_button,
        "selected_audio": chosen_audio,
        "clear_outputs": delete_button,
    }
exist 13 | 14 | os.makedirs(INPUTS_DIR, exist_ok=True) 15 | os.makedirs(OUTPUTS_DIR, exist_ok=True) 16 | 17 | # Audio formats 18 | AUDIO_FORMATS = ["mp3", "wav", "opus", "flac"] 19 | 20 | # UI Theme 21 | THEME = "monochrome" 22 | CSS = """ 23 | .gradio-container { 24 | max-width: 1000px; 25 | margin: auto; 26 | } 27 | 28 | .banner-container { 29 | background: transparent !important; 30 | border: none !important; 31 | box-shadow: none !important; 32 | margin-bottom: 2rem; 33 | } 34 | 35 | .banner-container img { 36 | width: 100%; 37 | max-width: 600px; 38 | border-radius: 10px; 39 | margin: 20px auto; 40 | display: block; 41 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); 42 | } 43 | """ 44 | -------------------------------------------------------------------------------- /web/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 11 | 12 | 13 | 17 | 21 | 22 | 23 | 28 | 34 | 35 | 40 | 46 | 47 | -------------------------------------------------------------------------------- /web/siriwave.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | function SiriWave(opt) { 3 | opt = opt || {}; 4 | 5 | this.phase = 0; 6 | this.run = false; 7 | 8 | // UI vars 9 | this.ratio = opt.ratio || window.devicePixelRatio || 1; 10 | this.width = this.ratio * (opt.width || 320); 11 | this.width_2 = this.width / 2; 12 | this.width_4 = this.width / 4; 13 | this.height = this.ratio * (opt.height || 50); 14 | this.height_2 = this.height / 2; 15 | this.MAX = (this.height_2) - 4; 16 | 17 | // Constructor opt 18 | this.amplitude = opt.amplitude || 1; 19 | this.speed = opt.speed || 0.2; 20 | this.frequency = opt.frequency || 6; 21 | this.color = (function hex2rgb(hex){ 22 | var shorthandRegex = /^#?([a-f\d])([a-f\d])([a-f\d])$/i; 23 | hex = hex.replace(shorthandRegex, function(m,r,g,b) { return r + r + g + g + b + b; }); 24 | var result = /^#?([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})$/i.exec(hex); 
// Wave-math and canvas helpers shared by all SiriWave instances.

// Memoized global attenuation function: 4^4 / (4 + x^4)^4.
SiriWave.prototype._GATF_cache = {};
SiriWave.prototype._globAttFunc = function(x) {
  var cache = SiriWave.prototype._GATF_cache;
  if (cache[x] == null) {
    cache[x] = Math.pow(4 / (4 + Math.pow(x, 4)), 4);
  }
  return cache[x];
};

// Map the abstract coordinate i in [-2, 2] to a canvas x position.
SiriWave.prototype._xpos = function(i) {
  return this.width_2 + i * this.width_4;
};

// Vertical position: attenuated sine around the canvas midline.
SiriWave.prototype._ypos = function(i, attenuation) {
  var scaled = (this.MAX * this.amplitude) / attenuation;
  return this.height_2 + this._globAttFunc(i) * scaled * Math.sin(this.frequency * i - this.phase);
};

// Stroke one wave curve at the given attenuation/color/width.
SiriWave.prototype._drawLine = function(attenuation, color, width) {
  var ctx = this.ctx;
  ctx.moveTo(0, 0);
  ctx.beginPath();
  ctx.strokeStyle = color;
  ctx.lineWidth = width || 1;

  // Sample the curve from -2 to 2 in 0.01 steps; pin the tails flat.
  // (Accumulating form kept so the float sequence matches exactly.)
  var t = -2;
  while ((t += 0.01) <= 2) {
    var y = this._ypos(t, attenuation);
    if (Math.abs(t) >= 1.90) y = this.height_2;
    ctx.lineTo(this._xpos(t), y);
  }

  ctx.stroke();
};

// Erase the previous frame without disturbing the backdrop.
SiriWave.prototype._clear = function() {
  var ctx = this.ctx;
  ctx.globalCompositeOperation = 'destination-out';
  ctx.fillRect(0, 0, this.width, this.height);
  ctx.globalCompositeOperation = 'source-over';
};
88 | SiriWave.prototype._draw = function() { 89 | if (this.run === false) return; 90 | 91 | this.phase = (this.phase + Math.PI*this.speed) % (2*Math.PI); 92 | 93 | this._clear(); 94 | this._drawLine(-2, 'rgba(' + this.color + ',0.1)'); 95 | this._drawLine(-6, 'rgba(' + this.color + ',0.2)'); 96 | this._drawLine(4, 'rgba(' + this.color + ',0.4)'); 97 | this._drawLine(2, 'rgba(' + this.color + ',0.6)'); 98 | this._drawLine(1, 'rgba(' + this.color + ',1)', 1.5); 99 | 100 | if (window.requestAnimationFrame) { 101 | requestAnimationFrame(this._draw.bind(this)); 102 | return; 103 | }; 104 | setTimeout(this._draw.bind(this), 20); 105 | }; 106 | 107 | SiriWave.prototype.start = function() { 108 | this.phase = 0; 109 | this.run = true; 110 | this._draw(); 111 | }; 112 | 113 | SiriWave.prototype.stop = function() { 114 | this.phase = 0; 115 | this.run = false; 116 | }; 117 | 118 | SiriWave.prototype.setSpeed = function(v) { 119 | this.speed = v; 120 | }; 121 | 122 | SiriWave.prototype.setNoise = SiriWave.prototype.setAmplitude = function(v) { 123 | this.amplitude = Math.max(Math.min(v, 1), 0); 124 | }; 125 | 126 | if (typeof define === 'function' && define.amd) { 127 | define(function(){ return SiriWave; }); 128 | return; 129 | }; 130 | window.SiriWave = SiriWave; 131 | })(); -------------------------------------------------------------------------------- /web/src/components/WaveVisualizer.js: -------------------------------------------------------------------------------- 1 | export class WaveVisualizer { 2 | constructor(playerState) { 3 | this.playerState = playerState; 4 | this.wave = null; 5 | this.progressBar = null; 6 | this.container = document.getElementById('wave-container'); 7 | 8 | this.setupWave(); 9 | this.setupProgressBar(); 10 | this.setupStateSubscription(); 11 | } 12 | 13 | setupWave() { 14 | this.wave = new SiriWave({ 15 | container: this.container, 16 | style: 'ios9', 17 | width: this.container.clientWidth, 18 | height: 100, // Increased height 19 | 
autostart: false, 20 | amplitude: 1, 21 | speed: 0.1 22 | }); 23 | 24 | // Handle window resize 25 | window.addEventListener('resize', () => { 26 | if (this.wave) { 27 | this.wave.width = this.container.clientWidth; 28 | } 29 | }); 30 | } 31 | 32 | setupProgressBar() { 33 | this.progressBar = document.createElement('progress'); 34 | this.progressBar.max = 100; 35 | this.progressBar.value = 0; 36 | this.progressBar.className = 'generation-progress'; 37 | // Insert inside wave-container at the bottom 38 | this.container.appendChild(this.progressBar); 39 | this.progressBar.style.display = 'none'; 40 | } 41 | 42 | setupStateSubscription() { 43 | this.playerState.subscribe(state => { 44 | // Handle generation progress 45 | if (state.isGenerating) { 46 | this.progressBar.style.display = 'block'; 47 | this.progressBar.value = state.progress; 48 | } else if (state.progress >= 100) { 49 | // Hide progress bar after completion 50 | setTimeout(() => { 51 | this.progressBar.style.display = 'none'; 52 | this.progressBar.value = 0; 53 | }, 500); 54 | } 55 | 56 | // Only animate when playing, stop otherwise 57 | if (state.isPlaying) { 58 | this.wave.start(); 59 | } else { 60 | this.wave.stop(); 61 | } 62 | }); 63 | } 64 | 65 | updateProgress(receivedChunks, totalChunks) { 66 | if (!totalChunks) return; 67 | 68 | // Calculate progress percentage based on chunks 69 | const progress = Math.min((receivedChunks / totalChunks) * 100, 99); 70 | 71 | // Always update on 0 progress or when progress increases 72 | if (receivedChunks === 0 || progress > this.progressBar.value) { 73 | this.progressBar.style.display = 'block'; 74 | this.progressBar.value = progress; 75 | this.playerState.setProgress(receivedChunks, totalChunks); 76 | } 77 | } 78 | 79 | cleanup() { 80 | if (this.wave) { 81 | this.wave.stop(); 82 | this.wave.dispose(); 83 | this.wave = null; 84 | } 85 | 86 | if (this.progressBar) { 87 | this.progressBar.style.display = 'none'; 88 | this.progressBar.value = 0; 89 | if 
// Tracks the catalogue of available voices and the user's weighted selection.
export class VoiceService {
    constructor() {
        this.availableVoices = [];
        this.selectedVoices = new Map(); // voice name -> numeric weight
    }

    // Fetch the voice list from the API; auto-selects a first voice if
    // nothing is selected yet. Throws on any failure.
    async loadVoices() {
        try {
            const response = await fetch('/v1/audio/voices');
            if (!response.ok) {
                const error = await response.json();
                throw new Error(error.detail?.message || 'Failed to load voices');
            }

            const data = await response.json();
            if (!data.voices?.length) {
                throw new Error('No voices available');
            }

            this.availableVoices = data.voices;

            // Select the first non-blank voice when none is selected
            if (this.selectedVoices.size === 0) {
                const firstVoice = this.availableVoices.find(v => v && v.trim());
                if (firstVoice) {
                    this.addVoice(firstVoice);
                }
            }

            return this.availableVoices;
        } catch (error) {
            console.error('Failed to load voices:', error);
            throw error;
        }
    }

    getAvailableVoices() {
        return this.availableVoices;
    }

    getSelectedVoices() {
        return [...this.selectedVoices.keys()];
    }

    getSelectedVoiceWeights() {
        return [...this.selectedVoices.entries()].map(([voice, weight]) => ({
            voice,
            weight
        }));
    }

    // Serialize the selection: a lone voice at weight 1 is the bare name,
    // otherwise "voice(weight)" entries joined with "+".
    getSelectedVoiceString() {
        const entries = [...this.selectedVoices.entries()];

        if (entries.length === 1 && entries[0][1] === 1) {
            return entries[0][0];
        }

        return entries
            .map(([voice, weight]) => `${voice}(${weight})`)
            .join('+');
    }

    // Returns false for unknown voices; non-numeric weights fall back to 1.
    addVoice(voice, weight = 1) {
        if (!this.availableVoices.includes(voice)) {
            return false;
        }
        this.selectedVoices.set(voice, parseFloat(weight) || 1);
        return true;
    }

    updateWeight(voice, weight) {
        if (!this.selectedVoices.has(voice)) {
            return false;
        }
        this.selectedVoices.set(voice, parseFloat(weight) || 1);
        return true;
    }

    removeVoice(voice) {
        return this.selectedVoices.delete(voice);
    }

    clearSelectedVoices() {
        this.selectedVoices.clear();
    }

    // Case-insensitive substring filter over the available voices.
    filterVoices(searchTerm) {
        if (!searchTerm) {
            return this.availableVoices;
        }
        const needle = searchTerm.toLowerCase();
        return this.availableVoices.filter(voice =>
            voice.toLowerCase().includes(needle)
        );
    }

    hasSelectedVoices() {
        return this.selectedVoices.size > 0;
    }
}

export default VoiceService;
setState(updates) { 26 | this.state = { 27 | ...this.state, 28 | ...updates 29 | }; 30 | this.notify(); 31 | } 32 | 33 | setPlaying(isPlaying) { 34 | this.setState({ isPlaying }); 35 | } 36 | 37 | setGenerating(isGenerating) { 38 | this.setState({ isGenerating }); 39 | } 40 | 41 | setProgress(loaded, total) { 42 | const progress = total > 0 ? (loaded / total) * 100 : 0; 43 | this.setState({ progress }); 44 | } 45 | 46 | setTime(currentTime, duration) { 47 | this.setState({ currentTime, duration }); 48 | } 49 | 50 | setVolume(volume) { 51 | this.setState({ volume }); 52 | } 53 | 54 | setSpeed(speed) { 55 | this.setState({ speed }); 56 | } 57 | 58 | setError(error) { 59 | this.setState({ error }); 60 | } 61 | 62 | clearError() { 63 | this.setState({ error: null }); 64 | } 65 | 66 | reset() { 67 | // Keep current speed setting but reset everything else 68 | const currentSpeed = this.state.speed; 69 | const currentVolume = this.state.volume; 70 | 71 | this.setState({ 72 | isPlaying: false, 73 | isGenerating: false, 74 | currentTime: 0, 75 | duration: 0, 76 | progress: 0, 77 | error: null, 78 | speed: currentSpeed, 79 | volume: currentVolume 80 | }); 81 | } 82 | 83 | getState() { 84 | return { ...this.state }; 85 | } 86 | } 87 | 88 | export default PlayerState; -------------------------------------------------------------------------------- /web/styles/badges.css: -------------------------------------------------------------------------------- 1 | .badges-container { 2 | position: fixed; 3 | top: 0; 4 | left: 0; 5 | right: 0; 6 | padding: clamp(0.75rem, 1.5vh, 1rem) clamp(1rem, 2vw, 2rem); 7 | display: flex; 8 | justify-content: space-between; 9 | align-items: center; 10 | z-index: 100; 11 | background: rgba(15, 23, 42, 0.95); 12 | backdrop-filter: blur(12px); 13 | border-bottom: 1px solid rgba(99, 102, 241, 0.2); 14 | min-height: clamp(3.5rem, 6vh, 4.5rem); 15 | box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 16 | 0 2px 4px -1px rgba(0, 0, 0, 0.06); 17 | } 18 | 19 | 
.badge { 20 | height: clamp(24px, 3vh, 28px); 21 | display: flex; 22 | align-items: center; 23 | transition: opacity 0.2s ease; 24 | flex-shrink: 0; 25 | } 26 | 27 | .logo-container { 28 | display: flex; 29 | align-items: center; 30 | gap: clamp(0.5rem, 1vw, 1rem); 31 | margin: 0 auto; 32 | transform: translateX(-50%); 33 | left: 50%; 34 | position: absolute; 35 | } 36 | 37 | @media (max-width: 768px) { 38 | .badges-container { 39 | padding: 0.75rem; 40 | flex-wrap: wrap; 41 | justify-content: center; 42 | gap: 0.75rem; 43 | min-height: clamp(4rem, 8vh, 5rem); 44 | } 45 | 46 | .badge { 47 | height: 24px; 48 | } 49 | 50 | .badge iframe { 51 | height: 24px !important; 52 | max-width: 100%; 53 | } 54 | 55 | .logo-container { 56 | position: static; 57 | transform: none; 58 | margin: 0; 59 | order: -1; 60 | width: 100%; 61 | justify-content: center; 62 | margin-bottom: 0.5rem; 63 | } 64 | } 65 | 66 | .badge iframe { 67 | height: 28px !important; 68 | } 69 | 70 | .badge:hover { 71 | opacity: 0.9; 72 | } 73 | 74 | .badge img { 75 | height: 100%; 76 | border-radius: 4px; 77 | } 78 | -------------------------------------------------------------------------------- /web/styles/base.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --bg-color: #0f172a; 3 | --fg-color: #6366f1; 4 | --surface: rgba(30, 41, 59, 1); 5 | --text: #f8fafc; 6 | --text-light: #cbd5e1; 7 | --border: rgba(148, 163, 184, 0.2); 8 | --error: #ef4444; 9 | --success: #22c55e; 10 | --font-family: 'Inter', system-ui, sans-serif; 11 | } 12 | 13 | html { 14 | width: 100%; 15 | height: 100%; 16 | overflow-x: hidden; 17 | } 18 | 19 | * { 20 | margin: 0; 21 | padding: 0; 22 | box-sizing: border-box; 23 | } 24 | 25 | body { 26 | font-family: var(--font-family); 27 | line-height: 1.6; 28 | color: var(--text); 29 | background: var(--bg-color); 30 | min-height: 100vh; 31 | position: relative; 32 | padding: 0; 33 | width: 100%; 34 | max-width: 100vw; 35 | overflow-x: 
hidden; 36 | } 37 | 38 | .overlay { 39 | position: fixed; 40 | inset: 0; 41 | background: 42 | radial-gradient(circle at top right, 43 | var(--fg-color) 0%, 44 | var(--bg-color) 100%); 45 | pointer-events: none; 46 | z-index: 0; 47 | } 48 | 49 | .grid-overlay { 50 | position: fixed; 51 | inset: 0; 52 | background-image: 53 | repeating-linear-gradient(0deg, 54 | rgba(255,255,255,0.03) 0px, 55 | rgba(255,255,255,0.03) 1px, 56 | transparent 1px, 57 | transparent 20px), 58 | repeating-linear-gradient(90deg, 59 | rgba(255,255,255,0.03) 0px, 60 | rgba(255,255,255,0.03) 1px, 61 | transparent 1px, 62 | transparent 20px); 63 | pointer-events: none; 64 | z-index: 0; 65 | } 66 | 67 | .container { 68 | width: 100%; 69 | max-width: min(1400px, 98vw); 70 | margin: 0 auto; 71 | display: flex; 72 | flex-direction: column; 73 | box-sizing: border-box; 74 | padding: clamp(5rem, 8vh, 7rem) clamp(0.75rem, 2vw, 2rem) 2rem; 75 | min-height: 100vh; 76 | } 77 | 78 | @media (max-width: 768px) { 79 | .container { 80 | padding-top: clamp(6rem, 10vh, 8rem); 81 | padding-left: 0.75rem; 82 | padding-right: 0.75rem; 83 | } 84 | } 85 | 86 | main { 87 | display: flex; 88 | flex-direction: column; 89 | gap: clamp(1rem, 2vh, 2rem); 90 | min-width: 0; 91 | width: 100%; 92 | position: relative; 93 | flex: 1; 94 | } 95 | 96 | .status { 97 | padding: 0.75rem 1rem; 98 | border-radius: 0.5rem; 99 | margin-bottom: 1rem; 100 | transition: all 0.3s ease; 101 | opacity: 0; 102 | font-weight: 500; 103 | text-align: center; 104 | } 105 | 106 | .status.info { 107 | background: rgba(99, 102, 241, 0.1); 108 | border: 1px solid rgba(99, 102, 241, 0.2); 109 | opacity: 1; 110 | } 111 | 112 | .status.error { 113 | background: rgba(239, 68, 68, 0.1); 114 | border: 1px solid rgba(239, 68, 68, 0.2); 115 | opacity: 1; 116 | } 117 | 118 | .status.success { 119 | background: rgba(34, 197, 94, 0.1); 120 | border: 1px solid rgba(34, 197, 94, 0.2); 121 | opacity: 1; 122 | } 123 | 
/* Three complete web-player stylesheets follow in this flattened dump:
   - header.css: .logo-container flex row; gradient-grid h1 title with layered
     text-shadow outline; decorative animated coffee .cup (with .handle and
     flex-spaced .steam pseudo-elements) driven by the `steam` and `float`
     keyframes, each running 3 iterations then holding (fill-mode: forwards).
   - layout.css: main as a two-column grid (editor + fixed 320px controls rail),
     scrollable .controls panel with thin custom scrollbars (both the
     standard scrollbar-width/color properties and the -webkit-scrollbar
     fallback), shared card styling for the control sections and
     .player-container, and a <=768px fallback collapsing to one column.
   - responsive.css: breakpoints at 1200px (tighter container/padding),
     1023px (clamp()-scaled logo cup/handle/steam), and 768px (stacked
     controls, full-width selects, compact player/download controls).
   NOTE(review): main is styled in both the base sheet above and layout.css —
   presumably layout.css loads later and wins; confirm against the page's
   <link> order. */
-------------------------------------------------------------------------------- /web/styles/header.css: -------------------------------------------------------------------------------- 1 | .logo-container { 2 | display: flex; 3 | align-items: center; 4 | gap: 0.75rem; 5 | } 6 | 7 | h1 { 8 | font-size: 1.75rem; 9 | font-weight: 700; 10 | margin: 0; 11 | line-height: 1; 12 | background: linear-gradient(rgba(255,255,255,0.1) 1px, transparent 1px), 13 | linear-gradient(90deg, rgba(255,255,255,0.1) 1px, transparent 1px); 14 | background-size: 5px 5px; 15 | -webkit-background-clip: text; 16 | background-clip: text; 17 | color: var(--text); 18 | text-shadow: 19 | -1px -1px 0 rgba(0,0,0,0.5), 20 | 1px -1px 0 rgba(0,0,0,0.5), 21 | -1px 1px 0 rgba(0,0,0,0.5), 22 | 1px 1px 0 rgba(0,0,0,0.5), 23 | 2px 2px var(--fg-color); 24 | } 25 | 26 | @media (max-width: 768px) { 27 | .logo-container { 28 | gap: 0.5rem; 29 | } 30 | 31 | h1 { 32 | font-size: 1.5rem; 33 | } 34 | } 35 | 36 | .cup { 37 | width: 16px; 38 | height: 20px; 39 | border: 2px solid var(--text); 40 | border-radius: 0 0 8px 8px; 41 | position: relative; 42 | animation: float 3s ease-in-out; 43 | animation-iteration-count: 3; 44 | animation-fill-mode: forwards; 45 | } 46 | 47 | .handle { 48 | width: 6px; 49 | height: 10px; 50 | border: 2px solid var(--text); 51 | border-radius: 0 4px 4px 0; 52 | position: absolute; 53 | right: -6px; 54 | top: 4px; 55 | } 56 | 57 | .steam { 58 | position: absolute; 59 | top: -6px; 60 | left: 2px; 61 | right: 2px; 62 | height: 6px; 63 | display: flex; 64 | justify-content: space-between; 65 | } 66 | 67 | .steam::before, 68 | .steam::after { 69 | content: ""; 70 | width: 3px; 71 | height: 100%; 72 | background: rgba(255,255,255,0.7); 73 | border-radius: 3px; 74 | animation: steam 2s; 75 | animation-iteration-count: 3; 76 | animation-fill-mode: forwards; 77 | } 78 | 79 | @keyframes steam { 80 | to { 81 | transform: translateY(-6px) scale(1.3); 82 | opacity: 0; 83 | } 84 | } 85 | 86 | 
@keyframes float { 87 | 50% { 88 | transform: translateY(-2px); 89 | } 90 | 100% { 91 | transform: translateY(0); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /web/styles/layout.css: -------------------------------------------------------------------------------- 1 | /* Main Layout */ 2 | main { 3 | display: grid; 4 | grid-template-columns: 1fr 320px; 5 | gap: 1rem; 6 | width: 80%; 7 | margin: 0 auto; 8 | min-width: 0; 9 | height: calc(100vh - 8rem); 10 | } 11 | 12 | /* Main Column */ 13 | .main-column { 14 | display: flex; 15 | flex-direction: column; 16 | gap: 1rem; 17 | min-height: min-content; 18 | height: auto; 19 | overflow-y: auto; 20 | } 21 | 22 | /* Text Editor Container */ 23 | .text-editor { 24 | min-height: 400px; 25 | height: auto; 26 | overflow: auto; 27 | background: rgba(15, 23, 42, 0.3); 28 | border: 1px solid var(--border); 29 | border-radius: 0.5rem; 30 | padding: 0.75rem; 31 | } 32 | 33 | /* Controls Panel */ 34 | .controls { 35 | display: flex; 36 | flex-direction: column; 37 | gap: 1rem; 38 | width: 100%; 39 | height: 100%; 40 | overflow-y: auto; 41 | scrollbar-width: thin; 42 | scrollbar-color: rgba(99, 102, 241, 0.2) transparent; 43 | } 44 | 45 | .controls::-webkit-scrollbar { 46 | width: 6px; 47 | } 48 | 49 | .controls::-webkit-scrollbar-track { 50 | background: transparent; 51 | } 52 | 53 | .controls::-webkit-scrollbar-thumb { 54 | background-color: rgba(99, 102, 241, 0.2); 55 | border-radius: 3px; 56 | } 57 | 58 | /* Controls Sections */ 59 | .voice-select-container, 60 | .speed-control, 61 | .button-group { 62 | width: 100%; 63 | background: rgba(15, 23, 42, 0.3); 64 | border: 1px solid var(--border); 65 | border-radius: 0.5rem; 66 | padding: 0.75rem; 67 | } 68 | 69 | /* Player Container */ 70 | .player-container { 71 | background: rgba(15, 23, 42, 0.3); 72 | border: 1px solid var(--border); 73 | border-radius: 0.5rem; 74 | padding: 0.75rem; 75 | } 76 | 77 | /* Responsive Layout */ 
78 | @media (max-width: 768px) { 79 | main { 80 | grid-template-columns: 1fr; 81 | gap: 0.5rem; 82 | width: 95%; 83 | height: auto; 84 | } 85 | 86 | .text-editor { 87 | min-height: 300px; 88 | } 89 | 90 | .controls { 91 | max-height: none; 92 | overflow: visible; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /web/styles/responsive.css: -------------------------------------------------------------------------------- 1 | @media (max-width: 1200px) { 2 | .container { 3 | max-width: 100%; 4 | } 5 | 6 | main { 7 | gap: 1rem; 8 | } 9 | 10 | .text-editor, 11 | .controls { 12 | padding: 1rem; 13 | } 14 | } 15 | 16 | @media (max-width: 1023px) { 17 | h1 { 18 | font-size: clamp(1.5rem, 4vw, 2rem); 19 | } 20 | 21 | .cup { 22 | width: clamp(20px, 3vw, 30px); 23 | height: clamp(25px, 4vw, 40px); 24 | } 25 | 26 | .handle { 27 | width: clamp(8px, 1.5vw, 12px); 28 | height: clamp(15px, 2.5vw, 20px); 29 | right: clamp(-8px, -1.5vw, -12px); 30 | top: clamp(6px, 1vw, 8px); 31 | } 32 | 33 | .steam { 34 | top: clamp(-8px, -1.5vw, -12px); 35 | } 36 | 37 | .steam::before, 38 | .steam::after { 39 | width: clamp(4px, 0.75vw, 6px); 40 | } 41 | } 42 | 43 | @media (max-width: 768px) { 44 | .container { 45 | padding-left: 0.5rem; 46 | padding-right: 0.5rem; 47 | } 48 | 49 | .text-editor, 50 | .controls { 51 | padding: 0.75rem; 52 | } 53 | 54 | .voice-select-container { 55 | flex-direction: column; 56 | align-items: stretch; 57 | } 58 | 59 | .options { 60 | flex-direction: column; 61 | gap: 0.75rem; 62 | } 63 | 64 | .button-group { 65 | flex-direction: column; 66 | } 67 | 68 | .generation-options { 69 | flex-direction: column; 70 | align-items: stretch; 71 | gap: 0.5rem; 72 | } 73 | 74 | .format-select { 75 | width: 100%; 76 | } 77 | 78 | .player-container { 79 | padding: 0.75rem; 80 | } 81 | 82 | .player-controls { 83 | padding: 0.5rem; 84 | gap: 0.5rem; 85 | } 86 | 87 | .volume-control { 88 | gap: 0.25rem; 89 | } 90 | 91 | .volume-slider { 
92 | width: 60px; 93 | } 94 | 95 | .wave-container { 96 | height: 32px; 97 | } 98 | 99 | .download-button { 100 | top: 0.5rem; 101 | right: 0.5rem; 102 | width: 26px; 103 | height: 26px; 104 | } 105 | 106 | .download-icon { 107 | width: 26px; 108 | height: 26px; 109 | } 110 | } 111 | --------------------------------------------------------------------------------