├── .coveragerc ├── .dockerignore ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .python-version ├── .ruff.toml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── VERSION ├── api ├── __init__.py ├── src │ ├── builds │ │ └── v1_0 │ │ │ └── config.json │ ├── core │ │ ├── __init__.py │ │ ├── config.py │ │ ├── don_quixote.txt │ │ ├── model_config.py │ │ ├── openai_mappings.json │ │ └── paths.py │ ├── inference │ │ ├── __init__.py │ │ ├── base.py │ │ ├── kokoro_v1.py │ │ ├── model_manager.py │ │ └── voice_manager.py │ ├── main.py │ ├── models │ │ └── v1_0 │ │ │ └── config.json │ ├── routers │ │ ├── __init__.py │ │ ├── debug.py │ │ ├── development.py │ │ ├── openai_compatible.py │ │ └── web_player.py │ ├── services │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── streaming_audio_writer.py │ │ ├── temp_manager.py │ │ ├── text_processing │ │ │ ├── __init__.py │ │ │ ├── normalizer.py │ │ │ ├── phonemizer.py │ │ │ ├── text_processor.py │ │ │ └── vocabulary.py │ │ └── tts_service.py │ ├── structures │ │ ├── __init__.py │ │ ├── custom_responses.py │ │ ├── model_schemas.py │ │ ├── schemas.py │ │ └── text_schemas.py │ └── voices │ │ └── v1_0 │ │ ├── af_alloy.pt │ │ ├── af_aoede.pt │ │ ├── af_bella.pt │ │ ├── af_heart.pt │ │ ├── af_jadzia.pt │ │ ├── af_jessica.pt │ │ ├── af_kore.pt │ │ ├── af_nicole.pt │ │ ├── af_nova.pt │ │ ├── af_river.pt │ │ ├── af_sarah.pt │ │ ├── af_sky.pt │ │ ├── af_v0.pt │ │ ├── af_v0bella.pt │ │ ├── af_v0irulan.pt │ │ ├── af_v0nicole.pt │ │ ├── af_v0sarah.pt │ │ ├── af_v0sky.pt │ │ ├── am_adam.pt │ │ ├── am_echo.pt │ │ ├── am_eric.pt │ │ ├── am_fenrir.pt │ │ ├── am_liam.pt │ │ ├── am_michael.pt │ │ ├── am_onyx.pt │ │ ├── am_puck.pt │ │ ├── am_santa.pt │ │ ├── am_v0adam.pt │ │ ├── am_v0gurney.pt │ │ ├── am_v0michael.pt │ │ ├── bf_alice.pt │ │ ├── bf_emma.pt │ │ ├── bf_lily.pt │ │ ├── bf_v0emma.pt │ │ ├── 
bf_v0isabella.pt │ │ ├── bm_daniel.pt │ │ ├── bm_fable.pt │ │ ├── bm_george.pt │ │ ├── bm_lewis.pt │ │ ├── bm_v0george.pt │ │ ├── bm_v0lewis.pt │ │ ├── ef_dora.pt │ │ ├── em_alex.pt │ │ ├── em_santa.pt │ │ ├── ff_siwis.pt │ │ ├── hf_alpha.pt │ │ ├── hf_beta.pt │ │ ├── hm_omega.pt │ │ ├── hm_psi.pt │ │ ├── if_sara.pt │ │ ├── im_nicola.pt │ │ ├── jf_alpha.pt │ │ ├── jf_gongitsune.pt │ │ ├── jf_nezumi.pt │ │ ├── jf_tebukuro.pt │ │ ├── jm_kumo.pt │ │ ├── pf_dora.pt │ │ ├── pm_alex.pt │ │ ├── pm_santa.pt │ │ ├── zf_xiaobei.pt │ │ ├── zf_xiaoni.pt │ │ ├── zf_xiaoxiao.pt │ │ ├── zf_xiaoyi.pt │ │ ├── zm_yunjian.pt │ │ ├── zm_yunxi.pt │ │ ├── zm_yunxia.pt │ │ └── zm_yunyang.pt └── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_audio_service.py │ ├── test_data │ ├── generate_test_data.py │ └── test_audio.npy │ ├── test_development.py │ ├── test_kokoro_v1.py │ ├── test_normalizer.py │ ├── test_openai_endpoints.py │ ├── test_paths.py │ ├── test_text_processor.py │ └── test_tts_service.py ├── assets ├── cpu_first_token_timeline_stream_openai.png ├── docs-screenshot.png ├── format_comparison.png ├── gpu_first_token_latency_direct.png ├── gpu_first_token_latency_openai.png ├── gpu_first_token_timeline_direct.png ├── gpu_first_token_timeline_openai.png ├── gpu_processing_time.png ├── gpu_realtime_factor.png ├── gpu_total_time_latency_direct.png ├── gpu_total_time_latency_openai.png ├── voice_analysis.png └── webui-screenshot.png ├── charts └── kokoro-fastapi │ ├── .helmignore │ ├── Chart.yaml │ ├── examples │ ├── aks-tls-values.yaml │ └── gpu-operator-values.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── hpa.yaml │ ├── ingress.yaml │ ├── kokoro-tts-deployment.yaml │ ├── kokoro-tts-service.yaml │ ├── serviceaccount.yaml │ └── tests │ │ └── test-connection.yaml │ └── values.yaml ├── debug.http ├── dev ├── Test Phon.py ├── Test Threads.py ├── Test copy 2.py ├── Test copy.py ├── Test money.py ├── Test num.py └── Test.py ├── docker-bake.hcl ├── docker ├── build.sh 
├── cpu │ ├── .dockerignore │ ├── Dockerfile │ └── docker-compose.yml ├── gpu │ ├── .dockerignore │ ├── Dockerfile │ └── docker-compose.yml └── scripts │ ├── download_model.py │ ├── download_model.sh │ └── entrypoint.sh ├── docs ├── architecture │ ├── espeak_setup_fix.md │ └── streaming_audio_writer_analysis.md ├── requirements.in └── requirements.txt ├── examples ├── __init__.py ├── assorted_checks │ ├── __init__.py │ ├── benchmarks │ │ ├── __init__.py │ │ ├── benchmark_first_token.py │ │ ├── benchmark_first_token_stream_unified.py │ │ ├── benchmark_tts_rtf.py │ │ ├── depr_benchmark_tts.py │ │ ├── lib │ │ │ ├── __init__.py │ │ │ ├── shared_benchmark_utils.py │ │ │ ├── shared_plotting.py │ │ │ ├── shared_utils.py │ │ │ └── stream_utils.py │ │ ├── output_data │ │ │ ├── cpu_benchmark_results_rtf.json │ │ │ ├── cpu_benchmark_stats_rtf.txt │ │ │ ├── first_token_benchmark_stream.json │ │ │ ├── first_token_benchmark_stream_openai.json │ │ │ ├── gpu_benchmark_results_rtf.json │ │ │ └── gpu_benchmark_stats_rtf.txt │ │ ├── output_plots │ │ │ ├── cpu_processing_time_rtf.png │ │ │ ├── cpu_realtime_factor_rtf.png │ │ │ ├── cpu_system_usage_rtf.png │ │ │ ├── first_token_latency_stream.png │ │ │ ├── first_token_latency_stream_openai.png │ │ │ ├── first_token_timeline_stream.png │ │ │ ├── first_token_timeline_stream_openai.png │ │ │ ├── gpu_processing_time_rtf.png │ │ │ ├── gpu_realtime_factor_rtf.png │ │ │ ├── gpu_system_usage_rtf.png │ │ │ ├── total_time_latency_stream.png │ │ │ └── total_time_latency_stream_openai.png │ │ └── the_time_machine_hg_wells.txt │ ├── generate_readme_plots.py │ ├── test_combinations │ │ ├── test_analyze_combined_voices.py │ │ └── test_download_voice.py │ ├── test_formats │ │ └── test_audio_formats.py │ ├── test_normalizer.py │ ├── test_openai │ │ └── test_openai_tts.py │ ├── test_voices │ │ ├── analyze_voice_dimensions.py │ │ ├── test_all_voices.py │ │ └── trim_voice_dimensions.py │ ├── validate_wav.py │ └── validate_wavs.py ├── audio_analysis.png 
├── captioned_speech_example.py ├── openai_streaming_audio.py ├── phoneme_examples │ ├── examples │ │ └── phoneme_examples │ │ │ └── output │ │ │ └── phoneme_test.wav │ ├── generate_phonemes.py │ └── test_phoneme_generation.py ├── requirements.txt ├── simul_file_test.py ├── simul_openai_streaming_audio.py ├── simul_speaker_test.py ├── speech.mp3 ├── stream_tts_playback.py ├── streaming_refactor │ ├── benchmark_unified_streaming.py │ └── test_unified_streaming.py └── voice_samples │ ├── speech_af.mp3 │ ├── speech_af_bella.mp3 │ ├── speech_af_nicole.mp3 │ ├── speech_af_sarah.mp3 │ ├── speech_am_adam.mp3 │ ├── speech_am_michael.mp3 │ ├── speech_bf_emma.mp3 │ ├── speech_bf_isabella.mp3 │ ├── speech_bm_george.mp3 │ └── speech_bm_lewis.mp3 ├── githubbanner.png ├── pyproject.toml ├── pytest.ini ├── scripts ├── fix_misaki.py ├── update_badges.py └── update_version.py ├── start-cpu.ps1 ├── start-cpu.sh ├── start-gpu.ps1 ├── start-gpu.sh ├── start-gpu_mac.sh ├── ui ├── Dockerfile ├── GUIBanner.png ├── GradioScreenShot.png ├── app.py ├── data │ └── inputs │ │ └── test_timemachine.txt ├── depr_tests │ ├── conftest.py │ ├── test_api.py │ ├── test_components.py │ ├── test_files.py │ ├── test_handlers.py │ ├── test_input.py │ └── test_interface.py └── lib │ ├── __init__.py │ ├── api.py │ ├── components │ ├── __init__.py │ ├── input.py │ ├── model.py │ └── output.py │ ├── config.py │ ├── files.py │ ├── handlers.py │ └── interface.py └── web ├── favicon.svg ├── index.html ├── siriwave.js ├── src ├── App.js ├── components │ ├── PlayerControls.js │ ├── TextEditor.js │ ├── VoiceSelector.js │ └── WaveVisualizer.js ├── services │ ├── AudioService.js │ └── VoiceService.js └── state │ └── PlayerState.js └── styles ├── badges.css ├── base.css ├── controls.css ├── forms.css ├── header.css ├── layout.css ├── player.css └── responsive.css /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | api 4 | ui 5 | omit = 6 | 
Kokoro-82M/* 7 | MagicMock/* 8 | test_*.py 9 | examples/* 10 | src/builds/* 11 | 12 | [report] 13 | exclude_lines = 14 | pragma: no cover 15 | def __repr__ 16 | raise NotImplementedError 17 | if __name__ == .__main__.: 18 | pass 19 | raise ImportError 20 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | 4 | # Python 5 | __pycache__ 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | .Python 10 | *.py[cod] 11 | *$py.class 12 | .pytest_cache 13 | .coverage 14 | .coveragerc 15 | 16 | # Environment 17 | # .env 18 | .venv 19 | env/ 20 | venv/ 21 | ENV/ 22 | 23 | # IDE 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | 29 | # Project specific 30 | examples/ 31 | Kokoro-82M/ 32 | ui/ 33 | tests/ 34 | *.md 35 | *.txt 36 | !requirements.txt 37 | 38 | # Docker 39 | Dockerfile* 40 | docker-compose* 41 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.py text eol=lf 4 | *.sh text eol=lf 5 | *.yml text eol=lf -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single 
IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: remsky 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Screenshots or console output** 14 | If applicable, add screenshots to help explain your problem. When doing so, please ensure you have the first command that triggered the trace and/or the command that started up your build included, otherwise it is difficult to diagnose. 15 | 16 | **Branch / Deployment used** 17 | Let us know if it's the master branch, or the stable branch indicated in the readme, as well as if you're running it locally, in the cloud, via the docker compose (cpu or gpu), or direct docker run commands. Please include the exact commands used to run in the latter cases. 18 | 19 | **Operating System** 20 | Include the platform, version numbers of your docker, etc. Whether it's GPU (Nvidia or other) or CPU, Mac, Linux, Windows, etc. 21 | 22 | **Additional context** 23 | Add any other context about the problem here. 
24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the feature you'd like** 11 | A clear and concise description of what you want to happen. Is it a quality of life improvement, something new entirely? 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. Consider whether it could be submitted as PR, or you'd need a hand to do so 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [ "master", "pre-release" ] 5 | pull_request: 6 | branches: [ "master", "pre-release" ] 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: ["3.10"] 13 | fail-fast: false 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | # Match Dockerfile dependencies 19 | - name: Install Dependencies 20 | run: | 21 | sudo apt-get update 22 | sudo apt-get install -y --no-install-recommends \ 23 | espeak-ng \ 24 | git \ 25 | libsndfile1 \ 26 | curl \ 27 | ffmpeg 28 | 29 | - name: Install uv 30 | uses: astral-sh/setup-uv@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | enable-cache: true 34 | - name: Install dependencies 35 | run: | 36 | uv pip install -e .[test,cpu] 37 | - name: Run Tests 38 | run: | 39 | uv run pytest api/tests/ --asyncio-mode=auto --cov=api --cov-report=term-missing 40 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Create Release and Publish Docker Images 2 | 3 | on: 4 | push: 5 | branches: 6 | - release # Trigger when commits are pushed to the release branch (e.g., after merging master) 7 | paths-ignore: 8 | - '**.md' 9 | - 'docs/**' 10 | 11 | jobs: 12 | prepare-release: 13 | runs-on: ubuntu-latest 14 | outputs: 15 | version: ${{ steps.get-version.outputs.version }} 16 | version_tag: ${{ steps.get-version.outputs.version_tag }} 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Get version from VERSION file 22 | id: get-version 23 | run: | 24 | VERSION_PLAIN=$(cat VERSION) 25 | echo "version=${VERSION_PLAIN}" >> $GITHUB_OUTPUT 26 | echo "version_tag=v${VERSION_PLAIN}" >> $GITHUB_OUTPUT # Add 'v' prefix for tag 27 | 28 | build-images: 29 | needs: prepare-release 30 | runs-on: ubuntu-latest 31 | permissions: 32 | packages: write # Needed to push images to GHCR 33 | env: 34 | DOCKER_BUILDKIT: 1 35 | BUILDKIT_STEP_LOG_MAX_SIZE: 10485760 36 | # This environment variable will override the VERSION variable in docker-bake.hcl 37 | VERSION: ${{ needs.prepare-release.outputs.version_tag }} # Use tag version (vX.Y.Z) for bake 38 | steps: 39 | - name: Checkout repository 40 | uses: actions/checkout@v4 41 | with: 42 | fetch-depth: 0 # Needed to check for existing tags 43 | 44 | - name: Check if tag already exists 45 | run: | 46 | TAG_NAME="${{ needs.prepare-release.outputs.version_tag }}" 47 | echo "Checking for existing tag: $TAG_NAME" 48 | # Fetch tags explicitly just in case checkout didn't get them all 49 | git fetch --tags 50 | if git rev-parse "$TAG_NAME" >/dev/null 2>&1; then 51 | echo "::error::Tag $TAG_NAME already exists. Please increment the version in the VERSION file." 52 | exit 1 53 | else 54 | echo "Tag $TAG_NAME does not exist. 
Proceeding with release." 55 | fi 56 | 57 | - name: Free disk space # Optional: Keep as needed for large builds 58 | run: | 59 | echo "Listing current disk space" 60 | df -h 61 | echo "Cleaning up disk space..." 62 | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache 63 | docker system prune -af 64 | echo "Disk space after cleanup" 65 | df -h 66 | 67 | - name: Set up QEMU 68 | uses: docker/setup-qemu-action@v3 # Use v3 69 | 70 | - name: Set up Docker Buildx 71 | uses: docker/setup-buildx-action@v3 # Use v3 72 | with: 73 | driver-opts: | 74 | image=moby/buildkit:latest 75 | network=host 76 | 77 | - name: Log in to GitHub Container Registry 78 | uses: docker/login-action@v3 # Use v3 79 | with: 80 | registry: ghcr.io 81 | username: ${{ github.actor }} 82 | password: ${{ secrets.GITHUB_TOKEN }} 83 | 84 | - name: Build and push images using Docker Bake 85 | run: | 86 | echo "Building and pushing images for version ${{ needs.prepare-release.outputs.version_tag }}" 87 | # The VERSION env var above sets the tag for the bake file targets 88 | docker buildx bake --push 89 | 90 | create-release: 91 | needs: [prepare-release, build-images] 92 | runs-on: ubuntu-latest 93 | permissions: 94 | contents: write # Needed to create releases 95 | steps: 96 | - name: Checkout repository 97 | uses: actions/checkout@v4 98 | with: 99 | fetch-depth: 0 # Fetch all history for release notes generation 100 | 101 | - name: Create GitHub Release 102 | uses: softprops/action-gh-release@v2 # Use v2 103 | with: 104 | tag_name: ${{ needs.prepare-release.outputs.version_tag }} # Use vX.Y.Z tag 105 | name: Release ${{ needs.prepare-release.outputs.version_tag }} 106 | generate_release_notes: true # Auto-generate release notes 107 | draft: false # Publish immediately 108 | prerelease: false # Mark as a stable release 109 | env: 110 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 111 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | 4 | # Python 5 | __pycache__/ 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | *.py[cod] 10 | *$py.class 11 | .Python 12 | .pytest_cache 13 | .coverage 14 | .coveragerc 15 | 16 | # Python package build artifacts 17 | *.egg-info/ 18 | *.egg 19 | dist/ 20 | build/ 21 | *.onnx 22 | *.pth 23 | # Environment 24 | # .env 25 | .venv/ 26 | env/ 27 | venv/ 28 | ENV/ 29 | 30 | # IDE 31 | .idea/ 32 | .vscode/ 33 | *.swp 34 | *.swo 35 | 36 | # Project specific 37 | # Model files 38 | 39 | *.pth 40 | *.tar* 41 | 42 | 43 | # Other project files 44 | .env 45 | Kokoro-82M/ 46 | ui/data/ 47 | EXTERNAL_UV_DOCUMENTATION* 48 | app 49 | api/temp_files/ 50 | 51 | # Docker 52 | Dockerfile* 53 | docker-compose* 54 | examples/ebook_test/chapter_to_audio.py 55 | examples/ebook_test/chapters_to_audio.py 56 | examples/ebook_test/parse_epub.py 57 | api/src/voices/af_jadzia.pt 58 | examples/assorted_checks/test_combinations/output/* 59 | examples/assorted_checks/test_openai/output/* 60 | 61 | 62 | # Audio files 63 | examples/*.wav 64 | examples/*.pcm 65 | examples/*.mp3 66 | examples/*.flac 67 | examples/*.acc 68 | examples/*.ogg 69 | examples/speech.mp3 70 | examples/phoneme_examples/output/*.wav 71 | examples/assorted_checks/benchmarks/output_audio/* 72 | uv.lock 73 | 74 | # Mac MPS virtualenv for dual testing 75 | .venv-mps 76 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /.ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 88 2 | 3 | exclude = ["examples"] 4 | 5 | [lint] 6 | select = ["I"] 7 | 8 | [lint.isort] 9 | combine-as-imports = true 10 | force-wrap-aliases = true 11 | 
split-on-trailing-comma = true 12 | section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Kokoro-FastAPI 2 | 3 | Always appreciate community involvement in making this project better. 4 | 5 | ## Development Setup 6 | 7 | We use `uv` for managing Python environments and dependencies, and `ruff` for linting and formatting. 8 | 9 | 1. **Clone the repository:** 10 | ```bash 11 | git clone https://github.com/remsky/Kokoro-FastAPI.git 12 | cd Kokoro-FastAPI 13 | ``` 14 | 15 | 2. **Install `uv`:** 16 | Follow the instructions on the [official `uv` documentation](https://docs.astral.sh/uv/install/). 17 | 18 | 3. **Create a virtual environment and install dependencies:** 19 | It's recommended to use a virtual environment. `uv` can create one for you. Install the base dependencies along with the `test` and `cpu` extras (needed for running tests locally). 20 | ```bash 21 | # Create and activate a virtual environment (e.g., named .venv) 22 | uv venv 23 | source .venv/bin/activate # On Linux/macOS 24 | # .venv\Scripts\activate # On Windows 25 | 26 | # Install dependencies including test requirements 27 | uv pip install -e ".[test,cpu]" 28 | ``` 29 | *Note: If you have an NVIDIA GPU and want to test GPU-specific features locally, you can install `.[test,gpu]` instead, ensuring you have the correct CUDA toolkit installed.* 30 | 31 | *Note: If running via uv locally, you will have to install espeak and handle any pathing issues that arise. The Docker images handle this automatically* 32 | 33 | 4. **Install `ruff` (if not already installed globally):** 34 | While `ruff` might be included via dependencies, installing it explicitly ensures you have it available. 
35 | ```bash 36 | uv pip install ruff 37 | ``` 38 | 39 | ## Running Tests 40 | 41 | Before submitting changes, please ensure all tests pass as this is an automated requirement. The tests are run using `pytest`. 42 | ```bash 43 | # Make sure your virtual environment is activated 44 | uv run pytest 45 | ``` 46 | *Note: The CI workflow runs tests using `uv run pytest api/tests/ --asyncio-mode=auto --cov=api --cov-report=term-missing`. Running `uv run pytest` locally should cover the essential checks.* 47 | 48 | ## Testing with Docker Compose 49 | 50 | In addition to local `pytest` runs, test your changes using Docker Compose to ensure they work correctly within the containerized environment. If you aren't able to test on CUDA hardware, make note so it can be tested by another maintainer 51 | 52 | ```bash 53 | 54 | docker compose -f docker/cpu/docker-compose.yml up --build 55 | + 56 | docker compose -f docker/gpu/docker-compose.yml up --build 57 | ``` 58 | This command will build the Docker images (if they've changed) and start the services defined in the respective compose file. Verify the application starts correctly and test the relevant functionality. 59 | 60 | ## Code Formatting and Linting 61 | 62 | We use `ruff` to maintain code quality and consistency. Please format and lint your code before committing. 63 | 64 | 1. **Format the code:** 65 | ```bash 66 | # Make sure your virtual environment is activated 67 | ruff format . 68 | ``` 69 | 70 | 2. **Lint the code (and apply automatic fixes):** 71 | ```bash 72 | # Make sure your virtual environment is activated 73 | ruff check . --fix 74 | ``` 75 | Review any changes made by `--fix` and address any remaining linting errors manually. 76 | 77 | ## Submitting Changes 78 | 79 | 0. Clone the repo 80 | 1. Create a new branch for your feature or bug fix. 81 | 2. Make your changes, following setup, testing, and formatting guidelines above. 82 | 3. Please try to keep your changes in line with the current design, and modular. 
Large-scale changes will take longer to review and integrate, and have less chance of being approved outright. 83 | 4. Push your branch to your fork. 84 | 5. Open a Pull Request against the `master` branch of the main repository. 85 | 86 | Thank you for contributing! 87 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.3.0 2 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | # Make api directory a Python package 2 | -------------------------------------------------------------------------------- /api/src/builds/v1_0/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "istftnet": { 3 | "upsample_kernel_sizes": [ 4 | 20, 5 | 12 6 | ], 7 | "upsample_rates": [ 8 | 10, 9 | 6 10 | ], 11 | "gen_istft_hop_size": 5, 12 | "gen_istft_n_fft": 20, 13 | "resblock_dilation_sizes": [ 14 | [ 15 | 1, 16 | 3, 17 | 5 18 | ], 19 | [ 20 | 1, 21 | 3, 22 | 5 23 | ], 24 | [ 25 | 1, 26 | 3, 27 | 5 28 | ] 29 | ], 30 | "resblock_kernel_sizes": [ 31 | 3, 32 | 7, 33 | 11 34 | ], 35 | "upsample_initial_channel": 512 36 | }, 37 | "dim_in": 64, 38 | "dropout": 0.2, 39 | "hidden_dim": 512, 40 | "max_conv_dim": 512, 41 | "max_dur": 50, 42 | "multispeaker": true, 43 | "n_layer": 3, 44 | "n_mels": 80, 45 | "n_token": 178, 46 | "style_dim": 128, 47 | "text_encoder_kernel_size": 5, 48 | "plbert": { 49 | "hidden_size": 768, 50 | "num_attention_heads": 12, 51 | "intermediate_size": 2048, 52 | "max_position_embeddings": 512, 53 | "num_hidden_layers": 12, 54 | "dropout": 0.1 55 | }, 56 | "vocab": { 57 | ";": 1, 58 | ":": 2, 59 | ",": 3, 60 | ".": 4, 61 | "!": 5, 62 | "?": 6, 63 | "—": 9, 64 | "…": 10, 65 | "\"": 11, 66 | "(": 12, 67 | ")": 13, 68 | "“": 14, 69 | "”": 15, 70 | " ": 16, 71 | "̃": 
17, 72 | "ʣ": 18, 73 | "ʥ": 19, 74 | "ʦ": 20, 75 | "ʨ": 21, 76 | "ᵝ": 22, 77 | "ꭧ": 23, 78 | "A": 24, 79 | "I": 25, 80 | "O": 31, 81 | "Q": 33, 82 | "S": 35, 83 | "T": 36, 84 | "W": 39, 85 | "Y": 41, 86 | "ᵊ": 42, 87 | "a": 43, 88 | "b": 44, 89 | "c": 45, 90 | "d": 46, 91 | "e": 47, 92 | "f": 48, 93 | "h": 50, 94 | "i": 51, 95 | "j": 52, 96 | "k": 53, 97 | "l": 54, 98 | "m": 55, 99 | "n": 56, 100 | "o": 57, 101 | "p": 58, 102 | "q": 59, 103 | "r": 60, 104 | "s": 61, 105 | "t": 62, 106 | "u": 63, 107 | "v": 64, 108 | "w": 65, 109 | "x": 66, 110 | "y": 67, 111 | "z": 68, 112 | "ɑ": 69, 113 | "ɐ": 70, 114 | "ɒ": 71, 115 | "æ": 72, 116 | "β": 75, 117 | "ɔ": 76, 118 | "ɕ": 77, 119 | "ç": 78, 120 | "ɖ": 80, 121 | "ð": 81, 122 | "ʤ": 82, 123 | "ə": 83, 124 | "ɚ": 85, 125 | "ɛ": 86, 126 | "ɜ": 87, 127 | "ɟ": 90, 128 | "ɡ": 92, 129 | "ɥ": 99, 130 | "ɨ": 101, 131 | "ɪ": 102, 132 | "ʝ": 103, 133 | "ɯ": 110, 134 | "ɰ": 111, 135 | "ŋ": 112, 136 | "ɳ": 113, 137 | "ɲ": 114, 138 | "ɴ": 115, 139 | "ø": 116, 140 | "ɸ": 118, 141 | "θ": 119, 142 | "œ": 120, 143 | "ɹ": 123, 144 | "ɾ": 125, 145 | "ɻ": 126, 146 | "ʁ": 128, 147 | "ɽ": 129, 148 | "ʂ": 130, 149 | "ʃ": 131, 150 | "ʈ": 132, 151 | "ʧ": 133, 152 | "ʊ": 135, 153 | "ʋ": 136, 154 | "ʌ": 138, 155 | "ɣ": 139, 156 | "ɤ": 140, 157 | "χ": 142, 158 | "ʎ": 143, 159 | "ʒ": 147, 160 | "ʔ": 148, 161 | "ˈ": 156, 162 | "ˌ": 157, 163 | "ː": 158, 164 | "ʰ": 162, 165 | "ʲ": 164, 166 | "↓": 169, 167 | "→": 171, 168 | "↗": 172, 169 | "↘": 173, 170 | "ᵻ": 177 171 | } 172 | } -------------------------------------------------------------------------------- /api/src/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import settings 2 | 3 | __all__ = ["settings"] 4 | -------------------------------------------------------------------------------- /api/src/core/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
pydantic_settings import BaseSettings 3 | 4 | 5 | class Settings(BaseSettings): 6 | # API Settings 7 | api_title: str = "Kokoro TTS API" 8 | api_description: str = "API for text-to-speech generation using Kokoro" 9 | api_version: str = "1.0.0" 10 | host: str = "0.0.0.0" 11 | port: int = 8880 12 | 13 | # Application Settings 14 | output_dir: str = "output" 15 | output_dir_size_limit_mb: float = 500.0 # Maximum size of output directory in MB 16 | default_voice: str = "af_heart" 17 | default_voice_code: str | None = ( 18 | None # If set, overrides the first letter of voice name, though api call param still takes precedence 19 | ) 20 | use_gpu: bool = True # Whether to use GPU acceleration if available 21 | device_type: str | None = ( 22 | None # Will be auto-detected if None, can be "cuda", "mps", or "cpu" 23 | ) 24 | allow_local_voice_saving: bool = ( 25 | False # Whether to allow saving combined voices locally 26 | ) 27 | 28 | # Container absolute paths 29 | model_dir: str = "/app/api/src/models" # Absolute path in container 30 | voices_dir: str = "/app/api/src/voices/v1_0" # Absolute path in container 31 | 32 | # Audio Settings 33 | sample_rate: int = 24000 34 | # Text Processing Settings 35 | target_min_tokens: int = 175 # Target minimum tokens per chunk 36 | target_max_tokens: int = 250 # Target maximum tokens per chunk 37 | absolute_max_tokens: int = 450 # Absolute maximum tokens per chunk 38 | advanced_text_normalization: bool = True # Preprocesses the text before misaki 39 | voice_weight_normalization: bool = ( 40 | True # Normalize the voice weights so they add up to 1 41 | ) 42 | 43 | gap_trim_ms: int = ( 44 | 1 # Base amount to trim from streaming chunk ends in milliseconds 45 | ) 46 | dynamic_gap_trim_padding_ms: int = 410 # Padding to add to dynamic gap trim 47 | dynamic_gap_trim_padding_char_multiplier: dict[str, float] = { 48 | ".": 1, 49 | "!": 0.9, 50 | "?": 1, 51 | ",": 0.8, 52 | } 53 | 54 | # Web Player Settings 55 | enable_web_player: bool = True # 
Whether to serve the web player UI 56 | web_player_path: str = "web" # Path to web player static files 57 | cors_origins: list[str] = ["*"] # CORS origins for web player 58 | cors_enabled: bool = True # Whether to enable CORS 59 | 60 | # Temp File Settings for WEB Ui 61 | temp_file_dir: str = "api/temp_files" # Directory for temporary audio files (relative to project root) 62 | max_temp_dir_size_mb: int = 2048 # Maximum size of temp directory (2GB) 63 | max_temp_dir_age_hours: int = 1 # Remove temp files older than 1 hour 64 | max_temp_dir_count: int = 3 # Maximum number of temp files to keep 65 | 66 | class Config: 67 | env_file = ".env" 68 | 69 | def get_device(self) -> str: 70 | """Get the appropriate device based on settings and availability""" 71 | if not self.use_gpu: 72 | return "cpu" 73 | 74 | if self.device_type: 75 | return self.device_type 76 | 77 | # Auto-detect device 78 | if torch.backends.mps.is_available(): 79 | return "mps" 80 | elif torch.cuda.is_available(): 81 | return "cuda" 82 | return "cpu" 83 | 84 | 85 | settings = Settings() 86 | -------------------------------------------------------------------------------- /api/src/core/don_quixote.txt: -------------------------------------------------------------------------------- 1 | In a village of La Mancha, the name of which I have no desire to call 2 | to mind, there lived not long since one of those gentlemen that keep a 3 | lance in the lance-rack, an old buckler, a lean hack, and a greyhound 4 | for coursing. An olla of rather more beef than mutton, a salad on most 5 | nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so 6 | extra on Sundays, made away with three-quarters of his income. The rest 7 | of it went in a doublet of fine cloth and velvet breeches and shoes to 8 | match for holidays, while on week-days he made a brave figure in his 9 | best homespun. 
-------------------------------------------------------------------------------- /api/src/core/model_config.py: -------------------------------------------------------------------------------- 1 | """Model configuration for Kokoro V1. 2 | 3 | This module provides model-specific configuration settings that complement the application-level 4 | settings in config.py. While config.py handles general application settings (API, paths, etc.), 5 | this module focuses on memory management and model file paths. 6 | """ 7 | 8 | from pydantic import BaseModel, Field 9 | 10 | 11 | class KokoroV1Config(BaseModel): 12 | """Kokoro V1 configuration.""" 13 | 14 | languages: list[str] = ["en"] 15 | 16 | class Config: 17 | frozen = True 18 | 19 | 20 | class PyTorchConfig(BaseModel): 21 | """PyTorch backend configuration.""" 22 | 23 | memory_threshold: float = Field(0.8, description="Memory threshold for cleanup") 24 | retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors") 25 | 26 | class Config: 27 | frozen = True 28 | 29 | 30 | class ModelConfig(BaseModel): 31 | """Kokoro V1 model configuration.""" 32 | 33 | # General settings 34 | cache_voices: bool = Field(True, description="Whether to cache voice tensors") 35 | voice_cache_size: int = Field(2, description="Maximum number of cached voices") 36 | 37 | # Model filename 38 | pytorch_kokoro_v1_file: str = Field( 39 | "v1_0/kokoro-v1_0.pth", description="PyTorch Kokoro V1 model filename" 40 | ) 41 | 42 | # Backend config 43 | pytorch_gpu: PyTorchConfig = Field(default_factory=PyTorchConfig) 44 | 45 | class Config: 46 | frozen = True 47 | 48 | 49 | # Global instance 50 | model_config = ModelConfig() 51 | -------------------------------------------------------------------------------- /api/src/core/openai_mappings.json: -------------------------------------------------------------------------------- 1 | { 2 | "models": { 3 | "tts-1": "kokoro-v1_0", 4 | "tts-1-hd": "kokoro-v1_0", 5 | "kokoro": "kokoro-v1_0" 6 | }, 7 
| "voices": { 8 | "alloy": "am_v0adam", 9 | "ash": "af_v0nicole", 10 | "coral": "bf_v0emma", 11 | "echo": "af_v0bella", 12 | "fable": "af_sarah", 13 | "onyx": "bm_george", 14 | "nova": "bf_isabella", 15 | "sage": "am_michael", 16 | "shimmer": "af_sky" 17 | } 18 | } -------------------------------------------------------------------------------- /api/src/inference/__init__.py: -------------------------------------------------------------------------------- 1 | """Model inference package.""" 2 | 3 | from .base import BaseModelBackend 4 | from .kokoro_v1 import KokoroV1 5 | from .model_manager import ModelManager, get_manager 6 | 7 | __all__ = [ 8 | "BaseModelBackend", 9 | "ModelManager", 10 | "get_manager", 11 | "KokoroV1", 12 | ] 13 | -------------------------------------------------------------------------------- /api/src/inference/base.py: -------------------------------------------------------------------------------- 1 | """Base interface for Kokoro inference.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import AsyncGenerator, List, Optional, Tuple, Union 5 | 6 | import numpy as np 7 | import torch 8 | 9 | 10 | class AudioChunk: 11 | """Class for audio chunks returned by model backends""" 12 | 13 | def __init__( 14 | self, 15 | audio: np.ndarray, 16 | word_timestamps: Optional[List] = [], 17 | output: Optional[Union[bytes, np.ndarray]] = b"", 18 | ): 19 | self.audio = audio 20 | self.word_timestamps = word_timestamps 21 | self.output = output 22 | 23 | @staticmethod 24 | def combine(audio_chunk_list: List): 25 | output = AudioChunk( 26 | audio_chunk_list[0].audio, audio_chunk_list[0].word_timestamps 27 | ) 28 | 29 | for audio_chunk in audio_chunk_list[1:]: 30 | output.audio = np.concatenate( 31 | (output.audio, audio_chunk.audio), dtype=np.int16 32 | ) 33 | if output.word_timestamps is not None: 34 | output.word_timestamps += audio_chunk.word_timestamps 35 | 36 | return output 37 | 38 | 39 | class ModelBackend(ABC): 40 | """Abstract base class 
class ModelBackend(ABC):
    """Abstract interface every model inference backend must implement."""

    @abstractmethod
    async def load_model(self, path: str) -> None:
        """Load model from path.

        Args:
            path: Path to model file

        Raises:
            RuntimeError: If model loading fails
        """
        pass

    @abstractmethod
    async def generate(
        self,
        text: str,
        voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
        speed: float = 1.0,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Generate audio from text as an async stream.

        Args:
            text: Input text to synthesize
            voice: Either a voice path, or a (name, tensor-or-path) tuple
            speed: Playback speed multiplier (1.0 = normal)

        Yields:
            AudioChunk objects as generation progresses

        Raises:
            RuntimeError: If generation fails
        """
        pass

    @abstractmethod
    def unload(self) -> None:
        """Unload model and free resources."""
        pass

    @property
    @abstractmethod
    def is_loaded(self) -> bool:
        """Check if model is loaded.

        Returns:
            True if model is loaded, False otherwise
        """
        pass

    @property
    @abstractmethod
    def device(self) -> str:
        """Get device model is running on.

        Returns:
            Device string ('cpu' or 'cuda')
        """
        pass
class BaseModelBackend(ModelBackend):
    """Shared state and lifecycle handling for concrete model backends."""

    def __init__(self):
        """Start in the unloaded state, pinned to the CPU by default."""
        self._model: Optional[torch.nn.Module] = None
        self._device: str = "cpu"

    @property
    def device(self) -> str:
        """Device identifier this backend runs on ('cpu' or 'cuda')."""
        return self._device

    @property
    def is_loaded(self) -> bool:
        """Whether a model is currently held by this backend."""
        return self._model is not None

    def unload(self) -> None:
        """Drop the model reference and reclaim GPU memory if applicable."""
        if self._model is None:
            return
        del self._model
        self._model = None
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
class VoiceManager:
    """Manages voice loading and caching with controlled resource usage."""

    # Singleton instance, created lazily by get_manager()
    _instance = None

    def __init__(self):
        """Initialize voice manager.

        Strictly respects settings.use_gpu via settings.get_device().
        """
        self._device = settings.get_device()
        # NOTE(review): this cache only ever grows — nothing evicts entries.
        # Confirm against the voice_cache_size setting if memory matters.
        self._voices: Dict[str, torch.Tensor] = {}

    async def get_voice_path(self, voice_name: str) -> str:
        """Get path to voice file.

        Args:
            voice_name: Name of voice

        Returns:
            Path to voice file

        Raises:
            RuntimeError: If voice not found
        """
        return await paths.get_voice_path(voice_name)

    async def load_voice(
        self, voice_name: str, device: Optional[str] = None
    ) -> torch.Tensor:
        """Load voice tensor and cache it under its name.

        Args:
            voice_name: Name of voice to load
            device: Optional override for target device

        Returns:
            Voice tensor

        Raises:
            RuntimeError: If voice not found or loading fails
        """
        try:
            voice_path = await self.get_voice_path(voice_name)
            target_device = device or self._device
            voice = await paths.load_voice_tensor(voice_path, target_device)
            self._voices[voice_name] = voice
            return voice
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in tracebacks.
            raise RuntimeError(f"Failed to load voice {voice_name}: {e}") from e

    async def combine_voices(
        self, voices: List[str], device: Optional[str] = None
    ) -> torch.Tensor:
        """Combine multiple voices by element-wise averaging.

        Args:
            voices: List of voice names to combine (at least two)
            device: Optional override for target device

        Returns:
            Combined voice tensor (mean over the stacked voice tensors)

        Raises:
            ValueError: If fewer than two voices are given
            RuntimeError: If any voice not found
        """
        if len(voices) < 2:
            raise ValueError("Need at least 2 voices to combine")

        target_device = device or self._device
        voice_tensors = []
        for name in voices:
            voice = await self.load_voice(name, target_device)
            voice_tensors.append(voice)

        combined = torch.mean(torch.stack(voice_tensors), dim=0)
        return combined

    async def list_voices(self) -> List[str]:
        """List available voice names.

        Returns:
            List of voice names
        """
        return await paths.list_voices()

    def cache_info(self) -> "dict[str, int | str]":
        """Get cache statistics.

        Returns:
            Dict with the number of cached voices ("loaded_voices") and the
            manager's device string ("device").
        """
        # Annotation fixed: the previous Dict[str, int] was wrong — the
        # "device" entry is a string. Kept as a string annotation so no new
        # typing import is required at runtime.
        return {"loaded_voices": len(self._voices), "device": self._device}
async def get_manager() -> VoiceManager:
    """Return the process-wide VoiceManager, creating it on first use.

    Returns:
        The singleton VoiceManager instance.
    """
    instance = VoiceManager._instance
    if instance is None:
        instance = VoiceManager()
        VoiceManager._instance = instance
    return instance
"m": 55, 77 | "n": 56, 78 | "o": 57, 79 | "p": 58, 80 | "q": 59, 81 | "r": 60, 82 | "s": 61, 83 | "t": 62, 84 | "u": 63, 85 | "v": 64, 86 | "w": 65, 87 | "x": 66, 88 | "y": 67, 89 | "z": 68, 90 | "ɑ": 69, 91 | "ɐ": 70, 92 | "ɒ": 71, 93 | "æ": 72, 94 | "β": 75, 95 | "ɔ": 76, 96 | "ɕ": 77, 97 | "ç": 78, 98 | "ɖ": 80, 99 | "ð": 81, 100 | "ʤ": 82, 101 | "ə": 83, 102 | "ɚ": 85, 103 | "ɛ": 86, 104 | "ɜ": 87, 105 | "ɟ": 90, 106 | "ɡ": 92, 107 | "ɥ": 99, 108 | "ɨ": 101, 109 | "ɪ": 102, 110 | "ʝ": 103, 111 | "ɯ": 110, 112 | "ɰ": 111, 113 | "ŋ": 112, 114 | "ɳ": 113, 115 | "ɲ": 114, 116 | "ɴ": 115, 117 | "ø": 116, 118 | "ɸ": 118, 119 | "θ": 119, 120 | "œ": 120, 121 | "ɹ": 123, 122 | "ɾ": 125, 123 | "ɻ": 126, 124 | "ʁ": 128, 125 | "ɽ": 129, 126 | "ʂ": 130, 127 | "ʃ": 131, 128 | "ʈ": 132, 129 | "ʧ": 133, 130 | "ʊ": 135, 131 | "ʋ": 136, 132 | "ʌ": 138, 133 | "ɣ": 139, 134 | "ɤ": 140, 135 | "χ": 142, 136 | "ʎ": 143, 137 | "ʒ": 147, 138 | "ʔ": 148, 139 | "ˈ": 156, 140 | "ˌ": 157, 141 | "ː": 158, 142 | "ʰ": 162, 143 | "ʲ": 164, 144 | "↓": 169, 145 | "→": 171, 146 | "↗": 172, 147 | "↘": 173, 148 | "ᵻ": 177 149 | } 150 | } -------------------------------------------------------------------------------- /api/src/routers/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /api/src/routers/web_player.py: -------------------------------------------------------------------------------- 1 | """Web player router with async file serving.""" 2 | 3 | from fastapi import APIRouter, HTTPException 4 | from fastapi.responses import Response 5 | from loguru import logger 6 | 7 | from ..core.config import settings 8 | from ..core.paths import get_content_type, get_web_file_path, read_bytes 9 | 10 | router = APIRouter( 11 | tags=["Web Player"], 12 | responses={404: {"description": "Not found"}}, 13 | ) 14 | 15 | 16 | @router.get("/{filename:path}") 17 | async def 
@router.get("/{filename:path}")
async def serve_web_file(filename: str):
    """Serve web player static files asynchronously.

    Args:
        filename: Path component of the request; empty or "/" maps to
            index.html.

    Raises:
        HTTPException: 404 when the player is disabled or the file is
            missing, 500 on any other failure.
    """
    if not settings.enable_web_player:
        raise HTTPException(status_code=404, detail="Web player is disabled")

    try:
        # Default to index.html for root path
        if filename in ("", "/"):
            filename = "index.html"

        # Get file path
        file_path = await get_web_file_path(filename)

        # Read file content
        content = await read_bytes(file_path)

        # Get content type
        content_type = await get_content_type(file_path)

        return Response(
            content=content,
            media_type=content_type,
            headers={
                "Cache-Control": "no-cache",  # Prevent caching during development
            },
        )

    except RuntimeError as e:
        # get_web_file_path/read_bytes signal "not found" via RuntimeError.
        # Both log f-strings previously interpolated nothing; include the
        # filename so failures are diagnosable.
        logger.warning(f"Web file not found: {filename}")
        raise HTTPException(status_code=404, detail=str(e))
    except Exception as e:
        logger.error(f"Error serving web file {filename}: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
"tokenize", 18 | "process_text", 19 | "process_text_chunk", 20 | "smart_split", 21 | ] 22 | -------------------------------------------------------------------------------- /api/src/services/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from abc import ABC, abstractmethod 3 | 4 | import phonemizer 5 | 6 | from .normalizer import normalize_text 7 | from ...structures.schemas import NormalizationOptions 8 | 9 | phonemizers = {} 10 | 11 | 12 | class PhonemizerBackend(ABC): 13 | """Abstract base class for phonemization backends""" 14 | 15 | @abstractmethod 16 | def phonemize(self, text: str) -> str: 17 | """Convert text to phonemes 18 | 19 | Args: 20 | text: Text to convert to phonemes 21 | 22 | Returns: 23 | Phonemized text 24 | """ 25 | pass 26 | 27 | 28 | class EspeakBackend(PhonemizerBackend): 29 | """Espeak-based phonemizer implementation""" 30 | 31 | def __init__(self, language: str): 32 | """Initialize espeak backend 33 | 34 | Args: 35 | language: Language code ('en-us' or 'en-gb') 36 | """ 37 | self.backend = phonemizer.backend.EspeakBackend( 38 | language=language, preserve_punctuation=True, with_stress=True 39 | ) 40 | 41 | self.language = language 42 | 43 | def phonemize(self, text: str) -> str: 44 | """Convert text to phonemes using espeak 45 | 46 | Args: 47 | text: Text to convert to phonemes 48 | 49 | Returns: 50 | Phonemized text 51 | """ 52 | # Phonemize text 53 | ps = self.backend.phonemize([text]) 54 | ps = ps[0] if ps else "" 55 | 56 | # Handle special cases 57 | ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ") 58 | ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l") 59 | ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps) 60 | ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»"" ]|$)', "z", ps) 61 | 62 | # Language-specific rules 63 | if self.language == "en-us": 64 | ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps) 65 | 66 | return 
def get_vocab():
    """Build the character → token-id mapping used by the tokenizer.

    A symbol's token id is simply its position in the fixed ordering:
    pad, punctuation, ASCII letters, then IPA symbols.
    """
    pad = "$"
    punctuation = ';:,.!?¡¿—…"«»"" '
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # Enumerating the concatenated string reproduces the positional ids.
    ordered_symbols = pad + punctuation + letters + letters_ipa
    return {symbol: index for index, symbol in enumerate(ordered_symbols)}
def tokenize(phonemes: str) -> list[int]:
    """Convert a phoneme string into token IDs.

    Symbols not present in the vocabulary are silently skipped.

    Args:
        phonemes: String of phonemes to tokenize

    Returns:
        List of token IDs
    """
    # Strip first: leading/trailing spaces would tokenize to the space id
    # and can cause audio artifacts.
    stripped = phonemes.strip()
    return [VOCAB[symbol] for symbol in stripped if symbol in VOCAB]


def decode_tokens(tokens: list[int]) -> str:
    """Convert token IDs back into the phoneme string they encode.

    Args:
        tokens: List of token IDs

    Returns:
        String of phonemes
    """
    # Invert the vocabulary mapping for this call.
    symbol_for = {token_id: symbol for symbol, token_id in VOCAB.items()}
    return "".join(symbol_for[token_id] for token_id in tokens)
class JSONStreamingResponse(StreamingResponse, JSONResponse):
    """StreamingResponse that renders each streamed item as one JSON line."""

    def __init__(
        self,
        content: Iterable | AsyncIterable,
        status_code: int = 200,
        headers: dict[str, str] | None = None,
        media_type: str | None = None,
        background: BackgroundTask | None = None,
    ) -> None:
        """Set up streaming of *content*, serialized item by item.

        Args:
            content: Sync or async iterable of items; pydantic models are
                converted via model_dump() before serialization.
            status_code: HTTP status code for the response.
            headers: Optional extra response headers.
            media_type: Optional content-type override.
            background: Optional task to run after the response completes.
        """
        if isinstance(content, AsyncIterable):
            self._content_iterable: AsyncIterable = content
        else:
            # Sync iterables are pumped in a thread pool so iteration
            # cannot block the event loop.
            self._content_iterable = iterate_in_threadpool(content)

        async def body_iterator() -> AsyncIterable[bytes]:
            async for content_ in self._content_iterable:
                if isinstance(content_, BaseModel):
                    content_ = content_.model_dump()
                yield self.render(content_)

        # Mirrors StreamingResponse.__init__ manually (neither parent's
        # __init__ is called) so both parents' attributes are populated.
        self.body_iterator = body_iterator()
        self.status_code = status_code
        if media_type is not None:
            self.media_type = media_type
        self.background = background
        self.init_headers(headers)

    def render(self, content: typing.Any) -> bytes:
        """Serialize one item to compact JSON followed by a newline."""
        return (
            json.dumps(
                content,
                ensure_ascii=False,
                allow_nan=False,
                indent=None,
                separators=(",", ":"),
            )
            + "\n"
        ).encode("utf-8")
class PhonemeRequest(BaseModel):
    """Request for text → phoneme conversion."""

    text: str
    language: str = "a"  # Default to American English


class PhonemeResponse(BaseModel):
    """Phoneme conversion result: the phoneme string and its token IDs."""

    phonemes: str
    tokens: list[int]


class StitchOptions(BaseModel):
    """Options for stitching audio chunks together"""

    gap_method: str = Field(
        default="static_trim",
        description="Method to handle gaps between chunks. Currently only 'static_trim' supported.",
    )
    trim_ms: int = Field(
        default=0,
        ge=0,
        description="Milliseconds to trim from chunk boundaries when using static_trim",
    )

    @field_validator("gap_method")
    @classmethod
    def validate_gap_method(cls, v: str) -> str:
        """Reject any gap method other than the supported 'static_trim'."""
        if v != "static_trim":
            raise ValueError("Currently only 'static_trim' gap method is supported")
        return v


class GenerateFromPhonemesRequest(BaseModel):
    """Simple request for phoneme-to-speech generation"""

    phonemes: str = Field(..., description="Phoneme string to synthesize")
    voice: str = Field(..., description="Voice ID to use for generation")
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_bella.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_heart.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_heart.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_jadzia.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_jadzia.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_jessica.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_jessica.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_kore.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_kore.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_nicole.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_nicole.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_nova.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_nova.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_river.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_river.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_sarah.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_sarah.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_sky.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_sky.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0bella.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0bella.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0irulan.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0irulan.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0nicole.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0nicole.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0sarah.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0sarah.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/af_v0sky.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/af_v0sky.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_adam.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_adam.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_echo.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_echo.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_eric.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_eric.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_fenrir.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_fenrir.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_liam.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_liam.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_michael.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_michael.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_onyx.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_onyx.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_puck.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_puck.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_santa.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_santa.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_v0adam.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_v0adam.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_v0gurney.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_v0gurney.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/am_v0michael.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/am_v0michael.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_alice.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_alice.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_emma.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_emma.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_lily.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_lily.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_v0emma.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_v0emma.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bf_v0isabella.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bf_v0isabella.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_daniel.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_daniel.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_fable.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_fable.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_george.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_george.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_lewis.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_lewis.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_v0george.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_v0george.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/bm_v0lewis.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/bm_v0lewis.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/ef_dora.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/ef_dora.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/em_alex.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/em_alex.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/em_santa.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/em_santa.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/ff_siwis.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/ff_siwis.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hf_alpha.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hf_alpha.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hf_beta.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hf_beta.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hm_omega.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hm_omega.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/hm_psi.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/hm_psi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/if_sara.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/if_sara.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/im_nicola.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/im_nicola.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_alpha.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_alpha.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_gongitsune.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_gongitsune.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_nezumi.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_nezumi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jf_tebukuro.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jf_tebukuro.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/jm_kumo.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/jm_kumo.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/pf_dora.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/pf_dora.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/pm_alex.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/pm_alex.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/pm_santa.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/pm_santa.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaobei.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaobei.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaoni.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaoni.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaoxiao.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaoxiao.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zf_xiaoyi.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zf_xiaoyi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunjian.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunjian.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunxi.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunxi.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunxia.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunxia.pt -------------------------------------------------------------------------------- /api/src/voices/v1_0/zm_yunyang.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/src/voices/v1_0/zm_yunyang.pt -------------------------------------------------------------------------------- /api/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Make tests directory a Python package 2 | -------------------------------------------------------------------------------- /api/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from 
unittest.mock import AsyncMock, MagicMock, patch 4 | 5 | import numpy as np 6 | import pytest 7 | import pytest_asyncio 8 | import torch 9 | 10 | from api.src.inference.model_manager import ModelManager 11 | from api.src.inference.voice_manager import VoiceManager 12 | from api.src.services.tts_service import TTSService 13 | from api.src.structures.model_schemas import VoiceConfig 14 | 15 | 16 | @pytest.fixture 17 | def mock_voice_tensor(): 18 | """Load a real voice tensor for testing.""" 19 | voice_path = os.path.join( 20 | os.path.dirname(os.path.dirname(__file__)), "src/voices/af_bella.pt" 21 | ) 22 | return torch.load(voice_path, map_location="cpu", weights_only=False) 23 | 24 | 25 | @pytest.fixture 26 | def mock_audio_output(): 27 | """Load pre-generated test audio for consistent testing.""" 28 | test_audio_path = os.path.join( 29 | os.path.dirname(__file__), "test_data/test_audio.npy" 30 | ) 31 | return np.load(test_audio_path) # Return as numpy array instead of bytes 32 | 33 | 34 | @pytest_asyncio.fixture 35 | async def mock_model_manager(mock_audio_output): 36 | """Mock model manager for testing.""" 37 | manager = AsyncMock(spec=ModelManager) 38 | manager.get_backend = MagicMock() 39 | 40 | async def mock_generate(*args, **kwargs): 41 | # Simulate successful audio generation 42 | return np.random.rand(24000).astype(np.float32) # 1 second of random audio data 43 | 44 | manager.generate = AsyncMock(side_effect=mock_generate) 45 | return manager 46 | 47 | 48 | @pytest_asyncio.fixture 49 | async def mock_voice_manager(mock_voice_tensor): 50 | """Mock voice manager for testing.""" 51 | manager = AsyncMock(spec=VoiceManager) 52 | manager.get_voice_path = MagicMock(return_value="/mock/path/voice.pt") 53 | manager.load_voice = AsyncMock(return_value=mock_voice_tensor) 54 | manager.list_voices = AsyncMock(return_value=["voice1", "voice2"]) 55 | manager.combine_voices = AsyncMock(return_value="voice1_voice2") 56 | return manager 57 | 58 | 59 | @pytest_asyncio.fixture 
60 | async def tts_service(mock_model_manager, mock_voice_manager): 61 | """Get mocked TTS service instance.""" 62 | service = TTSService() 63 | service.model_manager = mock_model_manager 64 | service._voice_manager = mock_voice_manager 65 | return service 66 | 67 | 68 | @pytest.fixture 69 | def test_voice(): 70 | """Return a test voice name.""" 71 | return "voice1" 72 | -------------------------------------------------------------------------------- /api/tests/test_data/generate_test_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | 6 | def generate_test_audio(): 7 | """Generate test audio data - 1 second of 440Hz tone""" 8 | # Create 1 second of silence at 24kHz 9 | audio = np.zeros(24000, dtype=np.float32) 10 | 11 | # Add a simple sine wave to make it non-zero 12 | t = np.linspace(0, 1, 24000) 13 | audio += 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz tone at half amplitude 14 | 15 | # Create test_data directory if it doesn't exist 16 | os.makedirs("api/tests/test_data", exist_ok=True) 17 | 18 | # Save the test audio 19 | np.save("api/tests/test_data/test_audio.npy", audio) 20 | 21 | 22 | if __name__ == "__main__": 23 | generate_test_audio() 24 | -------------------------------------------------------------------------------- /api/tests/test_data/test_audio.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/api/tests/test_data/test_audio.npy -------------------------------------------------------------------------------- /api/tests/test_development.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from unittest.mock import MagicMock, patch 4 | 5 | import pytest 6 | import requests 7 | 8 | 9 | def test_generate_captioned_speech(): 10 | """Test the generate_captioned_speech 
function with mocked responses""" 11 | # Mock the API responses 12 | mock_audio_response = MagicMock() 13 | mock_audio_response.status_code = 200 14 | 15 | mock_timestamps_response = MagicMock() 16 | mock_timestamps_response.status_code = 200 17 | mock_timestamps_response.content = json.dumps( 18 | { 19 | "audio": base64.b64encode(b"mock audio data").decode("utf-8"), 20 | "timestamps": [{"word": "test", "start_time": 0.0, "end_time": 1.0}], 21 | } 22 | ) 23 | 24 | # Patch the HTTP requests 25 | with patch("requests.post", return_value=mock_timestamps_response): 26 | # Import here to avoid module-level import issues 27 | from examples.captioned_speech_example import generate_captioned_speech 28 | 29 | # Test the function 30 | audio, timestamps = generate_captioned_speech("test text") 31 | 32 | # Verify we got both audio and timestamps 33 | assert audio == b"mock audio data" 34 | assert timestamps == [{"word": "test", "start_time": 0.0, "end_time": 1.0}] 35 | -------------------------------------------------------------------------------- /assets/cpu_first_token_timeline_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/cpu_first_token_timeline_stream_openai.png -------------------------------------------------------------------------------- /assets/docs-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/docs-screenshot.png -------------------------------------------------------------------------------- /assets/format_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/format_comparison.png 
-------------------------------------------------------------------------------- /assets/gpu_first_token_latency_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_latency_direct.png -------------------------------------------------------------------------------- /assets/gpu_first_token_latency_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_latency_openai.png -------------------------------------------------------------------------------- /assets/gpu_first_token_timeline_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_timeline_direct.png -------------------------------------------------------------------------------- /assets/gpu_first_token_timeline_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_first_token_timeline_openai.png -------------------------------------------------------------------------------- /assets/gpu_processing_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_processing_time.png -------------------------------------------------------------------------------- /assets/gpu_realtime_factor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_realtime_factor.png -------------------------------------------------------------------------------- /assets/gpu_total_time_latency_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_total_time_latency_direct.png -------------------------------------------------------------------------------- /assets/gpu_total_time_latency_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/gpu_total_time_latency_openai.png -------------------------------------------------------------------------------- /assets/voice_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/voice_analysis.png -------------------------------------------------------------------------------- /assets/webui-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/assets/webui-screenshot.png -------------------------------------------------------------------------------- /charts/kokoro-fastapi/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: kokoro-fastapi 3 | description: A Helm chart for deploying the Kokoro FastAPI TTS service to Kubernetes 4 | type: application 5 | version: 0.3.0 6 | appVersion: "0.3.0" 7 | 8 | keywords: 9 | - tts 10 | - fastapi 11 | - gpu 12 | - kokoro 13 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/examples/aks-tls-values.yaml: -------------------------------------------------------------------------------- 1 | # Tested on 2 | # - Azure AKS with GPU node pool with Nvidia GPU operator 3 | # - This setup uses 1 ingress and load balances between 2 replicas, enabling simultaneous requests 4 | # 5 | # Azure CLI command to create a GPU node pool: 6 | # az aks nodepool add \ 7 | # --resource-group $AZ_RESOURCE_GROUP \ 8 | # --cluster-name $CLUSTER_NAME \ 9 | # --name t4gpus \ 10 | # --node-vm-size Standard_NC4as_T4_v3 \ 11 | # --node-count 2 \ 12 | # --enable-cluster-autoscaler \ 13 | # --min-count 1 \ 14 | # --max-count 2 \ 15 | # --priority Spot \ 16 | # --eviction-policy Delete \ 17 | # --spot-max-price -1 \ 18 | # --node-taints "sku=gpu:NoSchedule,kubernetes.azure.com/scalesetpriority=spot:NoSchedule" \ 19 | # --skip-gpu-driver-install 20 | 21 | kokoroTTS: 22 | replicaCount: 8 23 | port: 8880 24 | tag: v0.2.0 25 | pullPolicy: IfNotPresent 26 | 27 | # Azure specific settings for spot t4 GPU nodes with Nvidia GPU operator 28 | tolerations: 29 | - key: "kubernetes.azure.com/scalesetpriority" 30 | operator: Equal 31 | value: 
"spot" 32 | effect: NoSchedule 33 | - key: "sku" 34 | operator: Equal 35 | value: "gpu" 36 | effect: NoSchedule 37 | 38 | ingress: 39 | enabled: true 40 | className: "nginx" 41 | annotations: 42 | # Requires cert-manager and external-dns to be in the cluster for TLS and DNS 43 | cert-manager.io/cluster-issuer: letsencrypt-prod 44 | external-dns.alpha.kubernetes.io/hostname: your-external-dns-enabled-hostname 45 | external-dns.alpha.kubernetes.io/cloudflare-proxied: "false" 46 | hosts: 47 | - host: your-external-dns-enabled-hostname 48 | paths: 49 | - path: / 50 | pathType: Prefix 51 | tls: 52 | - secretName: kokoro-fastapi-tls 53 | hosts: 54 | - your-external-dns-enabled-hostname -------------------------------------------------------------------------------- /charts/kokoro-fastapi/examples/gpu-operator-values.yaml: -------------------------------------------------------------------------------- 1 | # Follow the official NVIDIA GPU Operator documentation 2 | # to install the GPU operator with these settings: 3 | # https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html 4 | # 5 | # This example is for a Nvidia T4 16gb GPU node pool with only 1 GPU on each node on Azure AKS. 6 | # It uses time-slicing to share the a and claim to the system that 1 GPU is 4 GPUs. 7 | # So each pod has access to a smaller gpu with 4gb of memory. 
8 | # 9 | devicePlugin: # Remove this if you dont want to use time-slicing 10 | config: 11 | create: true 12 | name: "time-slicing-config" 13 | default: "any" 14 | data: 15 | any: |- 16 | version: v1 17 | flags: 18 | migStrategy: none 19 | sharing: 20 | timeSlicing: 21 | resources: 22 | - name: nvidia.com/gpu 23 | replicas: 4 24 | 25 | daemonsets: 26 | tolerations: 27 | - key: "sku" 28 | operator: Equal 29 | value: "gpu" 30 | effect: NoSchedule 31 | - key: "kubernetes.azure.com/scalesetpriority" 32 | operator: Equal 33 | value: "spot" 34 | effect: NoSchedule 35 | 36 | node-feature-discovery: 37 | master: 38 | tolerations: 39 | - key: "sku" 40 | operator: Equal 41 | value: "gpu" 42 | effect: NoSchedule 43 | - key: "kubernetes.azure.com/scalesetpriority" 44 | operator: Equal 45 | value: "spot" 46 | effect: NoSchedule 47 | worker: 48 | tolerations: 49 | - key: "sku" 50 | operator: Equal 51 | value: "gpu" 52 | effect: NoSchedule 53 | - key: "kubernetes.azure.com/scalesetpriority" 54 | operator: Equal 55 | value: "spot" 56 | effect: NoSchedule -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if .Values.ingress.enabled }} 3 | {{- range $host := .Values.ingress.hosts }} 4 | {{- range .paths }} 5 | http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} 6 | {{- end }} 7 | {{- end }} 8 | {{- else if contains "NodePort" .Values.service.type }} 9 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kokoro-fastapi.fullname" . 
}}) 10 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 11 | echo http://$NODE_IP:$NODE_PORT 12 | {{- else if contains "LoadBalancer" .Values.service.type }} 13 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 14 | You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kokoro-fastapi.fullname" . }}' 15 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kokoro-fastapi.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 16 | echo http://$SERVICE_IP:{{ .Values.kokoroTTS.port }} 17 | {{- else if contains "ClusterIP" .Values.service.type }} 18 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kokoro-fastapi.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 19 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") 20 | echo "Visit http://127.0.0.1:8880 to use your application" 21 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8880:$CONTAINER_PORT 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "kokoro-fastapi.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}} 13 | {{- define "kokoro-fastapi.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "kokoro-fastapi.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "kokoro-fastapi.labels" -}} 37 | helm.sh/chart: {{ include "kokoro-fastapi.chart" . }} 38 | {{ include "kokoro-fastapi.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "kokoro-fastapi.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "kokoro-fastapi.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "kokoro-fastapi.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "kokoro-fastapi.fullname" .) 
.Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/hpa.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.autoscaling.enabled }} 2 | apiVersion: autoscaling/v2beta1 3 | kind: HorizontalPodAutoscaler 4 | metadata: 5 | name: {{ include "kokoro-fastapi.fullname" . }} 6 | labels: 7 | {{- include "kokoro-fastapi.labels" . | nindent 4 }} 8 | spec: 9 | scaleTargetRef: 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | name: {{ include "kokoro-fastapi.fullname" . }} 13 | minReplicas: {{ .Values.autoscaling.minReplicas }} 14 | maxReplicas: {{ .Values.autoscaling.maxReplicas }} 15 | metrics: 16 | {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} 17 | - type: Resource 18 | resource: 19 | name: cpu 20 | targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} 21 | {{- end }} 22 | {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} 23 | - type: Resource 24 | resource: 25 | name: memory 26 | targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} 27 | {{- end }} 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /charts/kokoro-fastapi/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | apiVersion: networking.k8s.io/v1 3 | kind: Ingress 4 | metadata: 5 | name: {{ include "kokoro-fastapi.fullname" . }} 6 | labels: 7 | {{- include "kokoro-fastapi.labels" . | nindent 4 }} 8 | {{- with .Values.ingress.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | {{- with .Values.ingress.className }} 14 | ingressClassName: {{ . 
}}
  {{- end }}
  {{- if .Values.ingress.tls }}
  tls:
    {{- range .Values.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    {{- range .Values.ingress.hosts }}
    - host: {{ .host | quote }}
      http:
        paths:
          {{- range .paths }}
          - path: {{ .path }}
            {{- with .pathType }}
            pathType: {{ . }}
            {{- end }}
            backend:
              service:
                # Routes to the chart's Service, which carries the
                # "-kokoro-tts-service" suffix (see kokoro-tts-service.yaml).
                name: {{ include "kokoro-fastapi.fullname" $ }}-kokoro-tts-service
                port:
                  number: {{ $.Values.kokoroTTS.port }}
          {{- end }}
    {{- end }}
{{- end }}
-------- charts/kokoro-fastapi/templates/kokoro-tts-deployment.yaml --------
apiVersion: apps/v1
kind: Deployment
metadata:
  # NOTE: the Deployment name carries a "-kokoro-tts" suffix; anything that
  # targets it by name (e.g. an HPA scaleTargetRef) must include the suffix.
  name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts
  labels:
    {{- include "kokoro-fastapi.labels" . | nindent 4 }}
spec:
  # Replica count is managed by the HPA when autoscaling is enabled, so it
  # is only rendered here for the static case.
  {{- if not .Values.autoscaling.enabled }}
  replicas: {{ .Values.kokoroTTS.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "kokoro-fastapi.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "kokoro-fastapi.selectorLabels" . | nindent 8 }}
    spec:
      {{- with .Values.kokoroTTS.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "kokoro-fastapi.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      # Intentionally empty; kept as an explicit extension point.
      initContainers: []
      containers:
        - name: kokoro-tts
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ .Values.kokoroTTS.repository }}:{{ .Values.kokoroTTS.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.kokoroTTS.pullPolicy }}
          env:
            - name: PYTHONPATH
              value: "/app:/app/api"
            # NOTE(review): USE_GPU is hard-coded "true" even though the image
            # repository is configurable — confirm intent for CPU deployments.
            - name: USE_GPU
              value: "true"
            - name: PYTHONUNBUFFERED
              value: "1"
          ports:
            - name: kokoro-tts-http
              containerPort: {{ .Values.kokoroTTS.port | default 8880 }}
              protocol: TCP
          # Both probes hit the same /health endpoint by named port.
          livenessProbe:
            httpGet:
              path: /health
              port: kokoro-tts-http
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /health
              port: kokoro-tts-http
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
          resources:
            {{- toYaml .Values.kokoroTTS.resources | nindent 12 }}
          # Intentionally empty; kept as explicit extension points.
          volumeMounts: []
      volumes: []
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
-------- charts/kokoro-fastapi/templates/kokoro-tts-service.yaml --------
apiVersion: v1
kind: Service
metadata:
  name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service
  labels:
    {{- include "kokoro-fastapi.labels" .
| nindent 4 }}
spec:
  type: {{ .Values.service.type }}
  ports:
    - port: {{ .Values.kokoroTTS.port }}
      targetPort: kokoro-tts-http
      protocol: TCP
      name: kokoro-tts-http
  selector:
    {{- include "kokoro-fastapi.selectorLabels" . | nindent 4 }}
-------- charts/kokoro-fastapi/templates/serviceaccount.yaml --------
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "kokoro-fastapi.serviceAccountName" . }}
  labels:
    {{- include "kokoro-fastapi.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
{{- end }}
-------- charts/kokoro-fastapi/templates/tests/test-connection.yaml --------
apiVersion: v1
kind: Pod
metadata:
  name: "{{ include "kokoro-fastapi.fullname" . }}-test-connection"
  labels:
    {{- include "kokoro-fastapi.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": test
spec:
  containers:
    - name: wget
      image: busybox
      command: ['wget']
      # Fix: target the Service this chart actually creates. The Service is
      # named "<fullname>-kokoro-tts-service" (kokoro-tts-service.yaml), so
      # wget'ing plain "<fullname>" never resolved and `helm test` always
      # failed.
      args: ['{{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service:{{ .Values.kokoroTTS.port }}']
  restartPolicy: Never
-------- charts/kokoro-fastapi/values.yaml --------
# Default values for kokoro-fastapi.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
4 | kokoroTTS: 5 | replicaCount: 1 6 | # The name of the deployment repository 7 | repository: "ghcr.io/remsky/kokoro-fastapi-gpu" 8 | imagePullSecrets: [] # Set if using a private image or getting rate limited 9 | tag: "latest" 10 | pullPolicy: Always 11 | port: 8880 12 | resources: 13 | limits: 14 | nvidia.com/gpu: 1 15 | requests: 16 | nvidia.com/gpu: 1 17 | 18 | nameOverride: "" 19 | fullnameOverride: "" 20 | 21 | serviceAccount: 22 | # Specifies whether a service account should be created 23 | create: true 24 | # Annotations to add to the service account 25 | annotations: {} 26 | # The name of the service account to use. 27 | # If not set and create is true, a name is generated using the fullname template 28 | name: "" 29 | 30 | podAnnotations: {} 31 | 32 | podSecurityContext: {} 33 | # fsGroup: 2000 34 | 35 | securityContext: {} 36 | # capabilities: 37 | # drop: 38 | # - ALL 39 | # readOnlyRootFilesystem: true 40 | # runAsNonRoot: true 41 | # runAsUser: 1000 42 | 43 | service: 44 | type: ClusterIP 45 | 46 | ingress: 47 | enabled: false 48 | className: "nginx" 49 | annotations: {} 50 | # cert-manager.io/cluster-issuer: letsencrypt-prod 51 | # external-dns.alpha.kubernetes.io/hostname: kokoro.example.com 52 | # external-dns.alpha.kubernetes.io/cloudflare-proxied: "false" 53 | hosts: 54 | - host: kokoro.example.com 55 | paths: 56 | - path: / 57 | pathType: Prefix 58 | 59 | tls: [] 60 | # - secretName: kokoro-fastapi-tls 61 | # hosts: 62 | # - kokoro.example.com 63 | 64 | autoscaling: 65 | enabled: false 66 | minReplicas: 1 67 | maxReplicas: 100 68 | targetCPUUtilizationPercentage: 80 69 | # targetMemoryUtilizationPercentage: 80 70 | 71 | nodeSelector: {} 72 | 73 | tolerations: [] 74 | 75 | affinity: {} 76 | -------------------------------------------------------------------------------- /debug.http: -------------------------------------------------------------------------------- 1 | ### Get Thread Information 2 | GET http://localhost:8880/debug/threads 3 | 
Accept: application/json 4 | 5 | ### Get Storage Information 6 | GET http://localhost:8880/debug/storage 7 | Accept: application/json 8 | 9 | ### Get System Information 10 | GET http://localhost:8880/debug/system 11 | Accept: application/json 12 | 13 | ### Get Session Pool Status 14 | # Shows active ONNX sessions, CUDA stream usage, and session ages 15 | # Useful for debugging resource exhaustion issues 16 | GET http://localhost:8880/debug/session_pools 17 | Accept: application/json 18 | 19 | ### List Available Models 20 | # Returns list of all available models in OpenAI format 21 | # Response includes tts-1, tts-1-hd, and kokoro models 22 | GET http://localhost:8880/v1/models 23 | Accept: application/json 24 | 25 | ### Get Specific Model 26 | # Returns same model list as above for compatibility 27 | # Works with any model name (e.g., tts-1, tts-1-hd, kokoro) 28 | GET http://localhost:8880/v1/models/tts-1 29 | Accept: application/json -------------------------------------------------------------------------------- /dev/Test Phon.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | 4 | import pydub 5 | import requests 6 | 7 | def generate_audio_from_phonemes(phonemes: str, voice: str = "af_bella"): 8 | """Generate audio from phonemes""" 9 | response = requests.post( 10 | "http://localhost:8880/dev/generate_from_phonemes", 11 | json={"phonemes": phonemes, "voice": voice}, 12 | headers={"Accept": "audio/wav"} 13 | ) 14 | if response.status_code != 200: 15 | print(f"Error: {response.text}") 16 | return None 17 | return response.content 18 | 19 | 20 | 21 | 22 | with open(f"outputnostreammoney.wav", "wb") as f: 23 | f.write(generate_audio_from_phonemes(r"mɪsəki ɪz ɐn ɪkspˌɛɹəmˈɛntᵊl ʤˈitəpˈi ˈɛnʤən dəzˈInd tə pˈWəɹ fjˈuʧəɹ vˈɜɹʒənz ʌv kəkˈɔɹO mˈɑdᵊlz.")) -------------------------------------------------------------------------------- /dev/Test copy 2.py: 
-------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | 4 | import pydub 5 | import requests 6 | 7 | text = """Running on localhost:7860""" 8 | 9 | 10 | Type = "wav" 11 | response = requests.post( 12 | "http://localhost:8880/dev/captioned_speech", 13 | json={ 14 | "model": "kokoro", 15 | "input": text, 16 | "voice": "af_heart+af_sky", 17 | "speed": 1.0, 18 | "response_format": Type, 19 | "stream": True, 20 | }, 21 | stream=True, 22 | ) 23 | 24 | f = open(f"outputstream.{Type}", "wb") 25 | for chunk in response.iter_lines(decode_unicode=True): 26 | if chunk: 27 | temp_json = json.loads(chunk) 28 | if temp_json["timestamps"] != []: 29 | chunk_json = temp_json 30 | 31 | # Decode base 64 stream to bytes 32 | chunk_audio = base64.b64decode(temp_json["audio"].encode("utf-8")) 33 | 34 | # Process streaming chunks 35 | f.write(chunk_audio) 36 | 37 | # Print word level timestamps 38 | print(chunk_json["timestamps"]) 39 | -------------------------------------------------------------------------------- /dev/Test money.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | 4 | import requests 5 | 6 | text = """奶酪芝士很浓郁!臭豆腐芝士有争议?陈年奶酪价格昂贵。""" 7 | 8 | 9 | Type = "wav" 10 | 11 | response = requests.post( 12 | "http://localhost:8880/v1/audio/speech", 13 | json={ 14 | "model": "kokoro", 15 | "input": text, 16 | "voice": "zf_xiaobei", 17 | "speed": 1.0, 18 | "response_format": Type, 19 | "stream": False, 20 | }, 21 | stream=True, 22 | ) 23 | 24 | with open(f"outputnostreammoney.{Type}", "wb") as f: 25 | f.write(response.content) 26 | -------------------------------------------------------------------------------- /dev/Test num.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | from text_to_num import text2num 5 | from torch import mul 6 | 7 | INFLECT_ENGINE = inflect.engine() 8 | 9 
| 10 | def conditional_int(number: float, threshold: float = 0.00001): 11 | if abs(round(number) - number) < threshold: 12 | return int(round(number)) 13 | return number 14 | 15 | 16 | def handle_money(m: re.Match[str]) -> str: 17 | """Convert money expressions to spoken form""" 18 | 19 | bill = "dollar" if m.group(2) == "$" else "pound" 20 | coin = "cent" if m.group(2) == "$" else "pence" 21 | number = m.group(3) 22 | 23 | multiplier = m.group(4) 24 | try: 25 | number = float(number) 26 | except: 27 | return m.group() 28 | 29 | if m.group(1) == "-": 30 | number *= -1 31 | 32 | if number % 1 == 0 or multiplier != "": 33 | text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}" 34 | else: 35 | sub_number = int(str(number).split(".")[-1].ljust(2, "0")) 36 | 37 | text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}" 38 | 39 | return text_number 40 | 41 | 42 | text = re.sub( 43 | r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b", 44 | handle_money, 45 | "he administration has offered up a platter of repression for more than a year and is still slated to lose -$5.3 billion", 46 | ) 47 | print(text) 48 | -------------------------------------------------------------------------------- /docker-bake.hcl: -------------------------------------------------------------------------------- 1 | # Variables for reuse 2 | variable "VERSION" { 3 | default = "latest" 4 | } 5 | 6 | variable "REGISTRY" { 7 | default = "ghcr.io" 8 | } 9 | 10 | variable "OWNER" { 11 | default = "remsky" 12 | } 13 | 14 | variable "REPO" { 15 | default = "kokoro-fastapi" 16 | } 17 | 18 | variable "DOWNLOAD_MODEL" { 19 | default = "true" 20 | } 21 | 22 | # Common settings shared between targets 23 | target "_common" { 24 | context = "." 
  args = {
    DEBIAN_FRONTEND = "noninteractive"
    # NOTE(review): this build arg only takes effect if the Dockerfiles
    # declare `ARG DOWNLOAD_MODEL` — confirm both docker/*/Dockerfile do.
    DOWNLOAD_MODEL = "${DOWNLOAD_MODEL}"
  }
}

# Base settings for CPU builds
target "_cpu_base" {
  inherits = ["_common"]
  dockerfile = "docker/cpu/Dockerfile"
}

# Base settings for GPU builds
target "_gpu_base" {
  inherits = ["_common"]
  dockerfile = "docker/gpu/Dockerfile"
}

# CPU target with multi-platform support
target "cpu" {
  inherits = ["_cpu_base"]
  platforms = ["linux/amd64", "linux/arm64"]
  tags = [
    "${REGISTRY}/${OWNER}/${REPO}-cpu:${VERSION}",
    "${REGISTRY}/${OWNER}/${REPO}-cpu:latest"
  ]
}

# GPU target with multi-platform support
target "gpu" {
  inherits = ["_gpu_base"]
  platforms = ["linux/amd64", "linux/arm64"]
  tags = [
    "${REGISTRY}/${OWNER}/${REPO}-gpu:${VERSION}",
    "${REGISTRY}/${OWNER}/${REPO}-gpu:latest"
  ]
}

# Default group to build both CPU and GPU versions
group "default" {
  targets = ["cpu", "gpu"]
}

# Development targets for faster local builds
target "cpu-dev" {
  inherits = ["_cpu_base"]
  # No multi-platform for dev builds
  tags = ["${REGISTRY}/${OWNER}/${REPO}-cpu:dev"]
}

target "gpu-dev" {
  inherits = ["_gpu_base"]
  # No multi-platform for dev builds
  tags = ["${REGISTRY}/${OWNER}/${REPO}-gpu:dev"]
}

group "dev" {
  targets = ["cpu-dev", "gpu-dev"]
}
-------- docker/build.sh --------
#!/bin/bash
set -e

# Get version from argument or use default
VERSION=${1:-"latest"}

# Build both CPU and GPU images using docker buildx bake
echo "Building CPU and GPU images..."
VERSION=$VERSION docker buildx bake --push

echo "Build complete!"
echo "Created images with version: $VERSION"
-------- docker/cpu/.dockerignore --------
# Version control
.git

# Python
__pycache__
*.pyc
*.pyo
*.pyd
.Python
*.py[cod]
*$py.class
.pytest_cache
.coverage
.coveragerc

# Environment
# .env
.venv
env/
venv/
ENV/

# IDE
.idea
.vscode
*.swp
*.swo

# Project specific
examples/
Kokoro-82M/
ui/
tests/
*.md
*.txt
!requirements.txt

# Docker
Dockerfile*
docker-compose*
-------- docker/cpu/Dockerfile --------
FROM python:3.10-slim

# System deps: espeak-ng for phonemization, libsndfile/ffmpeg for audio,
# git/g++ for building native Python packages.
RUN apt-get update && apt-get install -y \
    espeak-ng \
    espeak-ng-data \
    git \
    libsndfile1 \
    curl \
    ffmpeg \
    g++ \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/espeak-ng-data \
    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/ && \
    mv /root/.local/bin/uvx /usr/local/bin/

# Create non-root user and set up directories and permissions
RUN useradd -m -u 1000 appuser && \
    mkdir -p /app/api/src/models/v1_0 && \
    chown -R appuser:appuser /app

USER appuser
WORKDIR /app

# Copy dependency files
COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

# Install Rust (required to build sudachipy and pyopenjtalk-plus)
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/home/appuser/.cargo/bin:$PATH"

# Install dependencies.
# Fix: the cache mount must live under appuser's HOME and be owned by uid
# 1000 — the previous /root/.cache/uv target was never used because this
# stage runs as appuser, so every build re-downloaded all wheels.
RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1000,gid=1000 \
    uv venv --python 3.10 && \
    uv sync --extra cpu

# Copy project files including models
COPY --chown=appuser:appuser api ./api
COPY --chown=appuser:appuser web ./web
COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app:/app/api \
    PATH="/app/.venv/bin:$PATH" \
    UV_LINK_MODE=copy \
    USE_GPU=false \
    PHONEMIZER_ESPEAK_PATH=/usr/bin \
    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

# Fix: honour the DOWNLOAD_MODEL build arg declared in docker-bake.hcl; the
# previous hard-coded `ENV DOWNLOAD_MODEL=true` silently ignored it. The ENV
# re-export keeps the value visible to entrypoint.sh at runtime, as before.
ARG DOWNLOAD_MODEL=true
ENV DOWNLOAD_MODEL=${DOWNLOAD_MODEL}
# Download model if enabled
RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
    python download_model.py --output api/src/models/v1_0; \
    fi

ENV DEVICE="cpu"
# Run FastAPI server through entrypoint.sh
CMD ["./entrypoint.sh"]
-------- docker/cpu/docker-compose.yml --------
name: kokoro-fastapi-cpu
services:
  kokoro-tts:
    build:
      context: ../..
6 | dockerfile: docker/cpu/Dockerfile 7 | volumes: 8 | - ../../api:/app/api 9 | ports: 10 | - "8880:8880" 11 | environment: 12 | - PYTHONPATH=/app:/app/api 13 | # ONNX Optimization Settings for vectorized operations 14 | - ONNX_NUM_THREADS=8 # Maximize core usage for vectorized ops 15 | - ONNX_INTER_OP_THREADS=4 # Higher inter-op for parallel matrix operations 16 | - ONNX_EXECUTION_MODE=parallel 17 | - ONNX_OPTIMIZATION_LEVEL=all 18 | - ONNX_MEMORY_PATTERN=true 19 | - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo 20 | 21 | # # Gradio UI service [Comment out everything below if you don't need it] 22 | # gradio-ui: 23 | # image: ghcr.io/remsky/kokoro-fastapi-ui:v${VERSION} 24 | # # Uncomment below (and comment out above) to build from source instead of using the released image 25 | # build: 26 | # context: ../../ui 27 | # ports: 28 | # - "7860:7860" 29 | # volumes: 30 | # - ../../ui/data:/app/ui/data 31 | # - ../../ui/app.py:/app/app.py # Mount app.py for hot reload 32 | # environment: 33 | # - GRADIO_WATCH=True # Enable hot reloading 34 | # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered 35 | # - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view 36 | # - API_HOST=kokoro-tts # Set TTS service URL 37 | # - API_PORT=8880 # Set TTS service PORT 38 | -------------------------------------------------------------------------------- /docker/gpu/.dockerignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | 4 | # Python 5 | __pycache__ 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | .Python 10 | *.py[cod] 11 | *$py.class 12 | .pytest_cache 13 | .coverage 14 | .coveragerc 15 | 16 | # Environment 17 | # .env 18 | .venv* 19 | env/ 20 | venv/ 21 | ENV/ 22 | 23 | # IDE 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | 29 | # Project specific 30 | examples/ 31 | Kokoro-82M/ 32 | ui/ 33 | tests/ 34 | *.md 35 | *.txt 36 | !requirements.txt 37 | 38 | # Docker 39 | Dockerfile* 40 | 
docker-compose*
-------- docker/gpu/Dockerfile --------
# Fix: removed `--platform=$BUILDPLATFORM`. This is a *runtime* image built
# for linux/amd64 and linux/arm64 (see docker-bake.hcl); pinning FROM to the
# build host's platform produced wrong-architecture GPU images on cross
# builds instead of letting buildx use TARGETPLATFORM.
FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
# Set non-interactive frontend
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
# NOTE(review): Ubuntu 24.04 ships python3.12; the apt package "python3.10"
# may not be installable from the default repos — confirm base/python pairing.
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-venv \
    espeak-ng \
    espeak-ng-data \
    git \
    libsndfile1 \
    curl \
    ffmpeg \
    g++ \
    && apt-get clean && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/espeak-ng-data \
    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/ && \
    mv /root/.local/bin/uvx /usr/local/bin/

# Create non-root user and set up directories and permissions
RUN useradd -m -u 1001 appuser && \
    mkdir -p /app/api/src/models/v1_0 && \
    chown -R appuser:appuser /app

USER appuser
WORKDIR /app

# Copy dependency files
COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

# Install dependencies with GPU extras.
# Fix: cache mount placed under appuser's HOME and owned by uid 1001; the
# previous /root/.cache/uv target was never used because uv runs as appuser.
RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1001,gid=1001 \
    uv venv --python 3.10 && \
    uv sync --extra gpu

# Copy project files including models
COPY --chown=appuser:appuser api ./api
COPY --chown=appuser:appuser web ./web
COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh


# Set all environment variables in one go
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app:/app/api \
    PATH="/app/.venv/bin:$PATH" \
    UV_LINK_MODE=copy \
    USE_GPU=true

# Fix: honour the DOWNLOAD_MODEL build arg from docker-bake.hcl (previously a
# hard-coded ENV that ignored the arg); ENV keeps it visible at runtime.
ARG DOWNLOAD_MODEL=true
ENV DOWNLOAD_MODEL=${DOWNLOAD_MODEL}
# Download model if enabled
RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
    python download_model.py --output api/src/models/v1_0; \
    fi

ENV DEVICE="gpu"
# Run FastAPI server through entrypoint.sh
CMD ["./entrypoint.sh"]
-------- docker/gpu/docker-compose.yml --------
name: kokoro-tts-gpu
services:
  kokoro-tts:
    # image: ghcr.io/remsky/kokoro-fastapi-gpu:v${VERSION}
    build:
      context: ../..
      dockerfile: docker/gpu/Dockerfile
    volumes:
      - ../../api:/app/api
    user: "1001:1001" # Ensure container runs as UID 1001 (appuser)
    ports:
      - "8880:8880"
    environment:
      - PYTHONPATH=/app:/app/api
      - USE_GPU=true
      - PYTHONUNBUFFERED=1
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
-------- docker/scripts/download_model.py: --------
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Download and prepare Kokoro v1.0 model.""" 3 | 4 | import json 5 | import os 6 | from pathlib import Path 7 | from urllib.request import urlretrieve 8 | 9 | from loguru import logger 10 | 11 | 12 | def verify_files(model_path: str, config_path: str) -> bool: 13 | """Verify that model files exist and are valid. 14 | 15 | Args: 16 | model_path: Path to model file 17 | config_path: Path to config file 18 | 19 | Returns: 20 | True if files exist and are valid 21 | """ 22 | try: 23 | # Check files exist 24 | if not os.path.exists(model_path): 25 | return False 26 | if not os.path.exists(config_path): 27 | return False 28 | 29 | # Verify config file is valid JSON 30 | with open(config_path) as f: 31 | config = json.load(f) 32 | 33 | # Check model file size (should be non-zero) 34 | if os.path.getsize(model_path) == 0: 35 | return False 36 | 37 | return True 38 | except Exception: 39 | return False 40 | 41 | 42 | def download_model(output_dir: str) -> None: 43 | """Download model files from GitHub release. 
44 | 45 | Args: 46 | output_dir: Directory to save model files 47 | """ 48 | try: 49 | # Create output directory 50 | os.makedirs(output_dir, exist_ok=True) 51 | 52 | # Define file paths 53 | model_file = "kokoro-v1_0.pth" 54 | config_file = "config.json" 55 | model_path = os.path.join(output_dir, model_file) 56 | config_path = os.path.join(output_dir, config_file) 57 | 58 | # Check if files already exist and are valid 59 | if verify_files(model_path, config_path): 60 | logger.info("Model files already exist and are valid") 61 | return 62 | 63 | logger.info("Downloading Kokoro v1.0 model files") 64 | 65 | # GitHub release URLs (to be updated with v0.2.0 release) 66 | base_url = "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.4" 67 | model_url = f"{base_url}/{model_file}" 68 | config_url = f"{base_url}/{config_file}" 69 | 70 | # Download files 71 | logger.info("Downloading model file...") 72 | urlretrieve(model_url, model_path) 73 | 74 | logger.info("Downloading config file...") 75 | urlretrieve(config_url, config_path) 76 | 77 | # Verify downloaded files 78 | if not verify_files(model_path, config_path): 79 | raise RuntimeError("Failed to verify downloaded files") 80 | 81 | logger.info(f"✓ Model files prepared in {output_dir}") 82 | 83 | except Exception as e: 84 | logger.error(f"Failed to download model: {e}") 85 | raise 86 | 87 | 88 | def main(): 89 | """Main entry point.""" 90 | import argparse 91 | 92 | parser = argparse.ArgumentParser(description="Download Kokoro v1.0 model") 93 | parser.add_argument( 94 | "--output", required=True, help="Output directory for model files" 95 | ) 96 | 97 | args = parser.parse_args() 98 | download_model(args.output) 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /docker/scripts/download_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Find project 
root by looking for api directory
find_project_root() {
    local current_dir="$PWD"
    local max_steps=5
    local steps=0

    while [ $steps -lt $max_steps ]; do
        if [ -d "$current_dir/api" ]; then
            echo "$current_dir"
            return 0
        fi
        current_dir="$(dirname "$current_dir")"
        ((steps++))
    done

    echo "Error: Could not find project root (no api directory found)" >&2
    exit 1
}

# Function to verify files exist and are valid (requires jq for the JSON check)
verify_files() {
    local model_path="$1"
    local config_path="$2"

    # Check files exist
    if [ ! -f "$model_path" ] || [ ! -f "$config_path" ]; then
        return 1
    fi

    # Check files are not empty
    if [ ! -s "$model_path" ] || [ ! -s "$config_path" ]; then
        return 1
    fi

    # Try to parse config.json
    if ! jq . "$config_path" >/dev/null 2>&1; then
        return 1
    fi

    return 0
}

# Function to download a file
download_file() {
    local url="$1"
    local output_path="$2"
    local filename=$(basename "$output_path")

    echo "Downloading $filename..."
    mkdir -p "$(dirname "$output_path")"
    if curl -L "$url" -o "$output_path"; then
        echo "Successfully downloaded $filename"
        return 0
    else
        echo "Error downloading $filename" >&2
        return 1
    fi
}

# Find project root and ensure models directory exists
PROJECT_ROOT=$(find_project_root)
if [ $? -ne 0 ]; then
    exit 1
fi

MODEL_DIR="$PROJECT_ROOT/api/src/models/v1_0"
echo "Model directory: $MODEL_DIR"
mkdir -p "$MODEL_DIR"

# Define file paths
MODEL_FILE="kokoro-v1_0.pth"
CONFIG_FILE="config.json"
MODEL_PATH="$MODEL_DIR/$MODEL_FILE"
CONFIG_PATH="$MODEL_DIR/$CONFIG_FILE"

# Check if files already exist and are valid
if verify_files "$MODEL_PATH" "$CONFIG_PATH"; then
    echo "Model files already exist and are valid"
    exit 0
fi

# Define URLs.
# Fix: release tag was "v1.4", which disagrees with download_model.py
# (v0.1.4) and pointed at a non-existent release.
BASE_URL="https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.4"
MODEL_URL="$BASE_URL/$MODEL_FILE"
CONFIG_URL="$BASE_URL/$CONFIG_FILE"

# Download files
success=true

if ! download_file "$MODEL_URL" "$MODEL_PATH"; then
    success=false
fi

if ! download_file "$CONFIG_URL" "$CONFIG_PATH"; then
    success=false
fi

# Verify downloaded files
if [ "$success" = true ] && verify_files "$MODEL_PATH" "$CONFIG_PATH"; then
    echo "✓ Model files prepared in $MODEL_DIR"
    exit 0
else
    echo "Failed to download or verify model files" >&2
    exit 1
fi
-------- docker/scripts/entrypoint.sh --------
#!/bin/bash
set -e

if [ "$DOWNLOAD_MODEL" = "true" ]; then
    python download_model.py --output api/src/models/v1_0
fi

# Fix: quote $DEVICE so an unset/empty value fails loudly instead of being
# word-split away.
exec uv run --extra "$DEVICE" --no-sync python -m uvicorn api.src.main:app --host 0.0.0.0 --port 8880 --log-level debug
-------- docs/architecture/espeak_setup_fix.md --------
# ESpeak-NG Setup Fix

## Issue Description

Users are reporting two distinct errors:

1.
Missing espeak-ng-data/phontab file: 8 | ``` 9 | Error processing file '/home/runner/work/espeakng-loader/espeakng-loader/espeak-ng/_dynamic/share/espeak-ng-data/phontab': No such file or directory. 10 | ``` 11 | 12 | 2. Invalid pipeline state: 13 | ``` 14 | Error generating speech: The object is in an invalid state. 15 | ``` 16 | 17 | ## Root Cause Analysis 18 | 19 | ### 1. ESpeak-NG Data Issue 20 | 21 | The dependency chain has changed: 22 | ``` 23 | Before: 24 | kokoro-fastapi (phonemizer 3.3.0) -> kokoro -> misaki -> phonemizer 25 | 26 | After: 27 | kokoro-fastapi -> kokoro -> misaki -> phonemizer-fork + espeakng-loader 28 | ``` 29 | 30 | The issue arises because: 31 | 1. misaki now uses espeakng-loader to manage espeak paths 32 | 2. espeakng-loader looks for data in its package directory 33 | 3. We have a direct dependency on phonemizer 3.3.0 that conflicts 34 | 35 | ### 2. Pipeline State Issue 36 | The "invalid state" error occurs due to device mismatch in pipeline creation. 37 | 38 | ## Solution 39 | 40 | ### 1. For ESpeak-NG Data 41 | 42 | Update dependencies and environment: 43 | 44 | 1. Remove direct phonemizer dependency: 45 | ```diff 46 | - "phonemizer==3.3.0", # Remove this 47 | ``` 48 | 49 | 2. Let misaki handle phonemizer-fork and espeakng-loader 50 | 51 | 3. Set environment variable in Dockerfile: 52 | ```dockerfile 53 | ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \ 54 | PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \ 55 | ESPEAK_DATA_PATH=/usr/share/espeak-ng-data # Add this 56 | ``` 57 | 58 | This approach: 59 | - Works with misaki's new dependencies 60 | - Maintains our working espeak setup 61 | - Avoids complex file copying or path manipulation 62 | 63 | ### 2. For Pipeline State 64 | 65 | Use kokoro_v1's pipeline management: 66 | ```python 67 | # Instead of creating pipelines directly: 68 | # pipeline = KPipeline(...) 
69 | 70 | # Use backend's pipeline management: 71 | pipeline = backend._get_pipeline(pipeline_lang_code) 72 | ``` 73 | 74 | ## Implementation Steps 75 | 76 | 1. Update pyproject.toml: 77 | - Remove direct phonemizer dependency 78 | - Keep misaki dependency as is 79 | 80 | 2. Update Dockerfiles: 81 | - Add ESPEAK_DATA_PATH environment variable 82 | - Keep existing espeak-ng setup 83 | 84 | 3. Update tts_service.py: 85 | - Use backend's pipeline management 86 | - Add proper error handling 87 | 88 | ## Testing 89 | 90 | 1. Test espeak-ng functionality: 91 | ```bash 92 | # Verify environment variables 93 | echo $ESPEAK_DATA_PATH 94 | echo $PHONEMIZER_ESPEAK_DATA 95 | 96 | # Check data directory 97 | ls /usr/share/espeak-ng-data 98 | ``` 99 | 100 | 2. Test pipeline state: 101 | - Test on both CPU and GPU 102 | - Verify no invalid state errors 103 | - Test with different voice models 104 | 105 | ## Success Criteria 106 | 107 | 1. No espeak-ng-data/phontab file errors 108 | 2. No invalid state errors 109 | 3. Consistent behavior across platforms 110 | 4. Successful CI/CD pipeline runs 111 | 112 | ## Future Considerations 113 | 114 | 1. Potential PR to misaki: 115 | - Add fallback mechanism if espeakng-loader fails 116 | - Make path configuration more flexible 117 | - Add better error messages 118 | 119 | 2. 
Environment Variable Documentation: 120 | - Document ESPEAK_DATA_PATH requirement 121 | - Explain interaction with espeakng-loader 122 | - Provide platform-specific setup instructions 123 | 124 | ## Notes 125 | 126 | - This solution works with misaki's new dependencies while maintaining our setup 127 | - Environment variable approach is simpler than file copying 128 | - May want to contribute improvements back to misaki later -------------------------------------------------------------------------------- /docs/requirements.in: -------------------------------------------------------------------------------- 1 | # Primarily for reference, as Dockerfile refer 2 | # Core dependencies 3 | fastapi==0.115.6 4 | uvicorn==0.34.0 5 | pydantic==2.10.4 6 | pydantic-settings==2.7.0 7 | python-dotenv==1.0.1 8 | sqlalchemy==2.0.27 9 | 10 | # ML/DL 11 | transformers==4.47.1 12 | numpy>=1.26.0 # Version managed by PyTorch dependencies 13 | scipy==1.14.1 14 | onnxruntime==1.20.1 15 | 16 | # Audio processing 17 | soundfile==0.13.0 18 | 19 | # Text processing 20 | phonemizer==3.3.0 21 | regex==2024.11.6 22 | 23 | # Utilities 24 | aiofiles==23.2.1 # Last version before Windows path handling changes 25 | tqdm==4.67.1 26 | requests==2.32.3 27 | munch==4.0.0 28 | tiktoken===0.8.0 29 | loguru==0.7.3 30 | 31 | # Testing 32 | pytest==8.0.0 33 | httpx==0.26.0 34 | pytest-asyncio==0.23.5 35 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/lib/__init__.py -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_data/cpu_benchmark_results_rtf.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": [ 3 | { 4 | "tokens": 100, 5 | "processing_time": 0.98, 6 | "output_length": 28.975, 7 | "rtf": 0.03, 8 | "elapsed_time": 1.02255 9 | }, 10 | { 11 | "tokens": 200, 12 | "processing_time": 1.79, 13 | "output_length": 58.45, 14 | "rtf": 0.03, 15 | "elapsed_time": 2.84766 16 | }, 17 | { 18 | "tokens": 300, 19 | "processing_time": 2.1, 20 | "output_length": 86.75, 21 | "rtf": 0.02, 22 | "elapsed_time": 4.98201 23 | }, 24 | { 25 | "tokens": 400, 26 | "processing_time": 2.66, 27 | "output_length": 113.5, 28 | "rtf": 0.02, 29 | "elapsed_time": 7.67743 30 | }, 31 | { 32 | "tokens": 500, 33 | "processing_time": 3.13, 34 | "output_length": 140.225, 35 | "rtf": 0.02, 36 | "elapsed_time": 10.84279 37 | } 38 | ], 39 | "system_metrics": [ 40 | { 41 | "timestamp": "2025-01-30T05:03:26.422469", 42 | "cpu_percent": 0.0, 43 | "ram_percent": 18.5, 44 | "ram_used_gb": 
5.2551727294921875, 45 | "gpu_memory_used": 1988.0, 46 | "relative_time": 0.14498639106750488 47 | }, 48 | { 49 | "timestamp": "2025-01-30T05:03:27.568319", 50 | "cpu_percent": 13.42, 51 | "ram_percent": 18.6, 52 | "ram_used_gb": 5.267307281494141, 53 | "gpu_memory_used": 2025.0, 54 | "relative_time": 1.1970372200012207 55 | }, 56 | { 57 | "timestamp": "2025-01-30T05:03:28.620098", 58 | "cpu_percent": 12.89, 59 | "ram_percent": 18.6, 60 | "ram_used_gb": 5.267337799072266, 61 | "gpu_memory_used": 3071.0, 62 | "relative_time": 2.254074811935425 63 | }, 64 | { 65 | "timestamp": "2025-01-30T05:03:29.677030", 66 | "cpu_percent": 12.43, 67 | "ram_percent": 18.6, 68 | "ram_used_gb": 5.29168701171875, 69 | "gpu_memory_used": 2555.0, 70 | "relative_time": 3.306957244873047 71 | }, 72 | { 73 | "timestamp": "2025-01-30T05:03:30.729971", 74 | "cpu_percent": 12.47, 75 | "ram_percent": 18.6, 76 | "ram_used_gb": 5.292213439941406, 77 | "gpu_memory_used": 3345.0, 78 | "relative_time": 4.3373119831085205 79 | }, 80 | { 81 | "timestamp": "2025-01-30T05:03:31.760463", 82 | "cpu_percent": 13.71, 83 | "ram_percent": 18.7, 84 | "ram_used_gb": 5.30987548828125, 85 | "gpu_memory_used": 2549.0, 86 | "relative_time": 5.368744850158691 87 | }, 88 | { 89 | "timestamp": "2025-01-30T05:03:32.791904", 90 | "cpu_percent": 12.16, 91 | "ram_percent": 18.7, 92 | "ram_used_gb": 5.308803558349609, 93 | "gpu_memory_used": 3358.0, 94 | "relative_time": 6.418949842453003 95 | }, 96 | { 97 | "timestamp": "2025-01-30T05:03:33.842039", 98 | "cpu_percent": 11.5, 99 | "ram_percent": 18.7, 100 | "ram_used_gb": 5.309070587158203, 101 | "gpu_memory_used": 3349.0, 102 | "relative_time": 7.4437031745910645 103 | }, 104 | { 105 | "timestamp": "2025-01-30T05:03:34.866692", 106 | "cpu_percent": 15.38, 107 | "ram_percent": 18.7, 108 | "ram_used_gb": 5.2960205078125, 109 | "gpu_memory_used": 3034.0, 110 | "relative_time": 8.472418069839478 111 | }, 112 | { 113 | "timestamp": "2025-01-30T05:03:35.895656", 114 | 
"cpu_percent": 13.44, 115 | "ram_percent": 18.7, 116 | "ram_used_gb": 5.294971466064453, 117 | "gpu_memory_used": 3315.0, 118 | "relative_time": 9.498533248901367 119 | }, 120 | { 121 | "timestamp": "2025-01-30T05:03:36.921589", 122 | "cpu_percent": 12.64, 123 | "ram_percent": 18.7, 124 | "ram_used_gb": 5.297389984130859, 125 | "gpu_memory_used": 3314.0, 126 | "relative_time": 10.565555095672607 127 | }, 128 | { 129 | "timestamp": "2025-01-30T05:03:37.994149", 130 | "cpu_percent": 8.32, 131 | "ram_percent": 18.7, 132 | "ram_used_gb": 5.305477142333984, 133 | "gpu_memory_used": 1958.0, 134 | "relative_time": 11.616873502731323 135 | } 136 | ], 137 | "test_duration": 14.051392793655396 138 | } -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_data/cpu_benchmark_stats_rtf.txt: -------------------------------------------------------------------------------- 1 | === Benchmark Statistics (with correct RTF) === 2 | 3 | Total tokens processed: 1500 4 | Total audio generated (s): 427.90 5 | Total test duration (s): 10.84 6 | Average processing rate (tokens/s): 133.35 7 | Average RTF: 0.02 8 | Average Real Time Speed: 41.67 9 | 10 | === Per-chunk Stats === 11 | 12 | Average chunk size (tokens): 300.00 13 | Min chunk size (tokens): 100 14 | Max chunk size (tokens): 500 15 | Average processing time (s): 2.13 16 | Average output length (s): 85.58 17 | 18 | === Performance Ranges === 19 | 20 | Processing rate range (tokens/s): 102.04 - 159.74 21 | RTF range: 0.02x - 0.03x 22 | Real Time Speed range: 33.33x - 50.00x 23 | 24 | -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_data/gpu_benchmark_stats_rtf.txt: -------------------------------------------------------------------------------- 1 | === Benchmark Statistics (with correct RTF) === 2 | 3 | Total tokens processed: 3150 4 | Total audio generated (s): 895.98 5 | Total test 
duration (s): 23.54 6 | Average processing rate (tokens/s): 133.43 7 | Average RTF: 0.03 8 | Average Real Time Speed: 35.29 9 | 10 | === Per-chunk Stats === 11 | 12 | Average chunk size (tokens): 525.00 13 | Min chunk size (tokens): 150 14 | Max chunk size (tokens): 900 15 | Average processing time (s): 3.88 16 | Average output length (s): 149.33 17 | 18 | === Performance Ranges === 19 | 20 | Processing rate range (tokens/s): 127.12 - 147.93 21 | RTF range: 0.02x - 0.03x 22 | Real Time Speed range: 33.33x - 50.00x 23 | 24 | -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/cpu_processing_time_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/cpu_processing_time_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/cpu_realtime_factor_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/cpu_realtime_factor_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/cpu_system_usage_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/cpu_system_usage_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_latency_stream_openai.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/first_token_timeline_stream_openai.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/gpu_processing_time_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/gpu_processing_time_rtf.png 
-------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/gpu_realtime_factor_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/gpu_realtime_factor_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/gpu_system_usage_rtf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/gpu_system_usage_rtf.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream.png -------------------------------------------------------------------------------- /examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream_openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/assorted_checks/benchmarks/output_plots/total_time_latency_stream_openai.png -------------------------------------------------------------------------------- /examples/assorted_checks/test_combinations/test_download_voice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from pathlib import Path 4 | import requests 5 | 
def download_combined_voice(voice1: str, voice2: str, weights: tuple[float, float] | None = None) -> str:
    """Download a combined voice file from the local Kokoro API.

    Args:
        voice1: First voice name (e.g. "af_bella").
        voice2: Second voice name.
        weights: Optional tuple of weights (w1, w2). If not provided, the
            server combines the voices with equal weights.

    Returns:
        Path (as a string) to the downloaded .pt file.

    Raises:
        Exception: If the combine request does not return HTTP 200.
    """
    print(f"\nDownloading combined voice: {voice1} + {voice2}")

    # Voice spec syntax: "a+b" for equal weights, "a(w1)+b(w2)" for explicit ones.
    if weights:
        voice_str = f"{voice1}({weights[0]})+{voice2}({weights[1]})"
    else:
        voice_str = f"{voice1}+{voice2}"

    # Make the request to combine voices. The timeout keeps the script from
    # hanging forever when the server is unreachable (requests has no
    # default timeout).
    response = requests.post(
        "http://localhost:8880/v1/audio/voices/combine",
        json=voice_str,
        timeout=60,
    )

    if response.status_code != 200:
        raise Exception(f"Failed to combine voices: {response.text}")

    # Save the returned tensor bytes as a .pt file next to this script.
    output_path = output_dir / f"{voice_str}.pt"
    with open(output_path, "wb") as f:
        f.write(response.content)

    print(f"Saved combined voice to {output_path}")
    return str(output_path)
def test_format(
    format: str, text: str = "The quick brown fox jumped over the lazy dog."
):
    """Request speech from the local endpoint in *format* and save it under output/."""
    out_file = output_dir / f"speech_{format}.{format}"
    print(f"\nTesting {format} format...")
    print(f"Making request to {client.base_url}/audio/speech...")

    try:
        resp = client.audio.speech.create(
            model="tts-1", voice="af_heart", input=text, response_format=format
        )
        print("Got response, saving to file...")
        out_file.write_bytes(resp.content)
        print(f"Success! Saved to: {out_file}")
    except Exception as e:
        # The endpoint rejects unsupported formats; report and continue.
        print(f"Error: {str(e)}")
def analyze_voice_file(file_path):
    """Load a voice tensor and log its shape plus summary statistics.

    Returns:
        The tensor's shape on success, or None when the file cannot be read.
    """
    try:
        voice = torch.load(file_path, map_location="cpu")
        stats = [
            f"\nAnalyzing {os.path.basename(file_path)}:",
            f"Shape: {voice.shape}",
            f"Mean: {voice.mean().item():.4f}",
            f"Std: {voice.std().item():.4f}",
            f"Min: {voice.min().item():.4f}",
            f"Max: {voice.max().item():.4f}",
        ]
        for line in stats:
            logger.info(line)
        return voice.shape
    except Exception as e:
        logger.error(f"Error analyzing {file_path}: {e}")
        return None
"v1_0") 26 | 27 | logger.info(f"Scanning voices in: {voices_dir}") 28 | 29 | # Track shapes for comparison 30 | shapes = {} 31 | 32 | # Analyze each .pt file 33 | for file in os.listdir(voices_dir): 34 | if file.endswith('.pt'): 35 | file_path = os.path.join(voices_dir, file) 36 | shape = analyze_voice_file(file_path) 37 | if shape: 38 | shapes[file] = shape 39 | 40 | # Report findings 41 | logger.info("\nShape Analysis:") 42 | shape_groups = {} 43 | for file, shape in shapes.items(): 44 | if shape not in shape_groups: 45 | shape_groups[shape] = [] 46 | shape_groups[shape].append(file) 47 | 48 | for shape, files in shape_groups.items(): 49 | logger.info(f"\nShape {shape}:") 50 | for file in files: 51 | logger.info(f" - {file}") 52 | 53 | if __name__ == "__main__": 54 | main() -------------------------------------------------------------------------------- /examples/assorted_checks/test_voices/test_all_voices.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import openai 4 | import requests 5 | 6 | SAMPLE_TEXT = """ 7 | That is the germ of my great discovery. But you are wrong to say that we cannot move about in Time. For instance, if I am recalling an incident very vividly I go back to the instant of its occurrence: I become absent-minded, as you say. I jump back for a moment. 
8 | """ 9 | 10 | # Configure OpenAI client to use our local endpoint 11 | client = openai.OpenAI( 12 | timeout=60, 13 | api_key="notneeded", # API key not required for our endpoint 14 | base_url="http://localhost:8880/v1", # Point to our local server with v1 prefix 15 | ) 16 | 17 | # Create output directory if it doesn't exist 18 | output_dir = Path(__file__).parent / "output" 19 | output_dir.mkdir(exist_ok=True) 20 | 21 | 22 | def test_voice(voice: str): 23 | speech_file = output_dir / f"speech_{voice}.mp3" 24 | print(f"\nTesting voice: {voice}") 25 | print(f"Making request to {client.base_url}/audio/speech...") 26 | 27 | try: 28 | response = client.audio.speech.create( 29 | model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format="mp3" 30 | ) 31 | 32 | print("Got response, saving to file...") 33 | with open(speech_file, "wb") as f: 34 | f.write(response.content) 35 | print(f"Success! Saved to: {speech_file}") 36 | 37 | except Exception as e: 38 | print(f"Error with voice {voice}: {str(e)}") 39 | 40 | 41 | # First, get list of available voices using requests 42 | print("Getting list of available voices...") 43 | try: 44 | # Convert base_url to string and ensure no double slashes 45 | base_url = str(client.base_url).rstrip("/") 46 | response = requests.get(f"{base_url}/audio/voices") 47 | if response.status_code != 200: 48 | raise Exception(f"Failed to get voices: {response.text}") 49 | data = response.json() 50 | if "voices" not in data: 51 | raise Exception(f"Unexpected response format: {data}") 52 | voices = data["voices"] 53 | print(f"Found {len(voices)} voices: {', '.join(voices)}") 54 | 55 | # Test each voice 56 | for voice in voices: 57 | test_voice(voice) 58 | 59 | except Exception as e: 60 | print(f"Error getting voices: {str(e)}") 61 | -------------------------------------------------------------------------------- /examples/assorted_checks/test_voices/trim_voice_dimensions.py: 
def analyze_voice_content(tensor):
    """Log how variance is distributed along the tensor's first dimension.

    Returns the per-row variance, computed across the remaining dimensions.
    """
    row_var = torch.var(tensor, dim=(1, 2))  # Variance across features
    logger.info("Variance distribution:")
    logger.info(f"First 5 rows variance: {row_var[:5].mean().item():.6f}")
    logger.info(f"Last 5 rows variance: {row_var[-5:].mean().item():.6f}")
    return row_var

def trim_voice_tensor(tensor):
    """Trim a 511x1x256 tensor to 510x1x256 by removing the row with least impact."""
    if tensor.shape[0] != 511:
        raise ValueError(f"Expected tensor with first dimension 511, got {tensor.shape[0]}")

    # Per-row variance tells us which end carries less information.
    row_var = analyze_voice_content(tensor)

    head_var = row_var[:5].mean().item()
    tail_var = row_var[-5:].mean().item()

    # Drop a row from whichever end has the lower variance.
    if tail_var < head_var:
        logger.info("Trimming last row (lower variance at end)")
        return tensor[:-1]
    logger.info("Trimming first row (lower variance at start)")
    return tensor[1:]
logger.info(f"Created backup at {backup_path}") 50 | 51 | # Trim tensor 52 | trimmed = trim_voice_tensor(tensor) 53 | logger.info(f"New shape: {trimmed.shape}") 54 | 55 | # Save trimmed tensor 56 | torch.save(trimmed, file_path) 57 | logger.info(f"Saved trimmed tensor to {file_path}") 58 | 59 | return True 60 | except Exception as e: 61 | logger.error(f"Error processing {file_path}: {e}") 62 | return False 63 | 64 | def main(): 65 | """Process voice files in the voices directory.""" 66 | # Get the project root directory 67 | current_dir = os.path.dirname(os.path.abspath(__file__)) 68 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) 69 | voices_dir = os.path.join(project_root, "api", "src", "voices", "v1_0") 70 | 71 | logger.info(f"Processing voices in: {voices_dir}") 72 | 73 | processed = 0 74 | for file in os.listdir(voices_dir): 75 | if file.endswith('.pt') and not file.endswith('.backup'): 76 | file_path = os.path.join(voices_dir, file) 77 | if process_voice_file(file_path): 78 | processed += 1 79 | 80 | logger.info(f"\nProcessed {processed} voice files") 81 | logger.info("Backups created with .backup extension") 82 | logger.info("To restore backups if needed, remove .backup extension to replace trimmed files") 83 | 84 | if __name__ == "__main__": 85 | main() -------------------------------------------------------------------------------- /examples/assorted_checks/validate_wavs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | from validate_wav import validate_tts 5 | 6 | 7 | def print_validation_result(result: dict, rel_path: Path): 8 | """Print full validation details for a single file.""" 9 | print(f"\nValidating: {rel_path}") 10 | if "error" in result: 11 | print(f"Error: {result['error']}") 12 | else: 13 | print(f"Duration: {result['duration']}") 14 | print(f"Sample Rate: {result['sample_rate']} Hz") 15 | print(f"Peak Amplitude: 
def validate_directory(directory: str):
    """Validate every wav/mp3 under *directory*, printing details then a summary."""
    dir_path = Path(directory)

    # Collect audio files recursively; mp3s are validated alongside wavs.
    found = list(dir_path.rglob("*.wav"))
    found.extend(dir_path.rglob("*.mp3"))  # Also check mp3s
    found = sorted(found)

    if not found:
        print(f"No .wav or .mp3 files found in {directory}")
        return

    print(f"Found {len(found)} files in {directory}")
    print("=" * 80)

    # Keep (relative path, result) pairs so the summary can be printed after
    # the detailed per-file output.
    results = []
    for audio_file in found:
        outcome = validate_tts(str(audio_file))
        rel = audio_file.relative_to(dir_path)
        print_validation_result(outcome, rel)
        results.append((rel, outcome))
        print("=" * 80)

    # Summary: one line per file, showing the first issue when it failed.
    print("\nSUMMARY:")
    for rel, outcome in results:
        if "error" in outcome:
            print(f"{rel}: ERROR - {outcome['error']}")
        elif outcome["issues"]:
            issues = outcome["issues"]
            first_issue = issues[0].replace("WARNING: ", "")
            if len(issues) > 1:
                print(
                    f"{rel}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
                )
            else:
                print(f"{rel}: FAIL - {first_issue}")
        else:
            print(f"{rel}: PASS")
def main() -> None:
    """Play a streamed sample through the speakers, then save one to disk."""
    stream_to_speakers()

    # Create text-to-speech audio file
    request = dict(
        model="kokoro",
        voice="af_bella",
        input="the quick brown fox jumped over the lazy dogs",
    )
    with openai.audio.speech.with_streaming_response.create(**request) as resp:
        resp.stream_to_file(speech_file_path)
36 | input="""I see skies of blue and clouds of white 37 | The bright blessed days, the dark sacred nights 38 | And I think to myself 39 | What a wonderful world""", 40 | ) as response: 41 | print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") 42 | for chunk in response.iter_bytes(chunk_size=1024): 43 | player_stream.write(chunk) 44 | 45 | print(f"Done in {int((time.time() - start_time) * 1000)}ms.") 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /examples/phoneme_examples/examples/phoneme_examples/output/phoneme_test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/phoneme_examples/examples/phoneme_examples/output/phoneme_test.wav -------------------------------------------------------------------------------- /examples/phoneme_examples/test_phoneme_generation.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import json 4 | 5 | def main(): 6 | # Test phoneme string 7 | phonemes = "hˈɛloʊ wˈɜrld" # "Hello world" in phonemes 8 | 9 | try: 10 | print("\nTesting phoneme generation via API...") 11 | 12 | # Create request payload 13 | payload = { 14 | "phonemes": phonemes, 15 | "voice": "af_bella" # Using bella voice 16 | } 17 | 18 | # Make request to the API endpoint 19 | response = requests.post( 20 | "http://localhost:8880/dev/generate_from_phonemes", 21 | json=payload, 22 | stream=True # Enable streaming for audio data 23 | ) 24 | 25 | # Check if request was successful 26 | if response.status_code == 200: 27 | # Create output directory if it doesn't exist 28 | os.makedirs("examples/phoneme_examples/output", exist_ok=True) 29 | 30 | # Save the audio response 31 | output_path = 'examples/phoneme_examples/output/phoneme_test.wav' 32 | with 
open(output_path, 'wb') as f: 33 | for chunk in response.iter_content(chunk_size=8192): 34 | if chunk: 35 | f.write(chunk) 36 | 37 | print(f"\nAudio saved to: {output_path}") 38 | print("\nPhoneme test completed successfully!") 39 | print(f"\nInput phonemes: {phonemes}") 40 | else: 41 | print(f"Error: API request failed with status code {response.status_code}") 42 | print(f"Response: {response.text}") 43 | 44 | except Exception as e: 45 | print(f"An error occurred: {str(e)}") 46 | 47 | if __name__ == "__main__": 48 | main() -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | openai>=1.0.0 2 | pyaudio>=0.2.13 3 | -------------------------------------------------------------------------------- /examples/simul_file_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rye run python 2 | import asyncio 3 | import time 4 | from pathlib import Path 5 | from openai import AsyncOpenAI 6 | 7 | # Initialize async client 8 | openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local") 9 | 10 | async def save_to_file(text: str, file_id: int) -> None: 11 | """Save TTS output to file asynchronously""" 12 | speech_file_path = Path(__file__).parent / f"speech_{file_id}.mp3" 13 | 14 | start_time = time.time() 15 | print(f"Starting file {file_id}") 16 | 17 | try: 18 | # Use streaming endpoint with mp3 format 19 | async with openai.audio.speech.with_streaming_response.create( 20 | model="kokoro", 21 | voice="af_bella", 22 | input=text, 23 | response_format="mp3" 24 | ) as response: 25 | print(f"File {file_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms") 26 | 27 | # Open file in binary write mode 28 | with open(speech_file_path, 'wb') as f: 29 | async for chunk in response.iter_bytes(): 30 | f.write(chunk) 31 | 32 | print(f"File {file_id} 
completed in {int((time.time() - start_time) * 1000)}ms") 33 | except Exception as e: 34 | print(f"Error processing file {file_id}: {e}") 35 | 36 | async def main() -> None: 37 | # Different text samples for variety 38 | texts = [ 39 | "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white", 40 | "I see skies of blue and clouds of white. I see skies of blue and clouds of white", 41 | ] 42 | 43 | # Create tasks for saving to files 44 | file_tasks = [ 45 | save_to_file(text, i) 46 | for i, text in enumerate(texts) 47 | ] 48 | 49 | # Run file tasks concurrently 50 | await asyncio.gather(*file_tasks) 51 | 52 | if __name__ == "__main__": 53 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/simul_openai_streaming_audio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rye run python 2 | import asyncio 3 | import time 4 | from pathlib import Path 5 | import pyaudio 6 | from openai import AsyncOpenAI 7 | 8 | # Initialize async client 9 | openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local") 10 | 11 | # Create a shared PyAudio instance 12 | p = pyaudio.PyAudio() 13 | 14 | async def stream_to_speakers(text: str, stream_id: int) -> None: 15 | """Stream TTS audio to speakers asynchronously""" 16 | player_stream = p.open( 17 | format=pyaudio.paInt16, 18 | channels=1, 19 | rate=24000, 20 | output=True 21 | ) 22 | 23 | start_time = time.time() 24 | print(f"Starting stream {stream_id}") 25 | 26 | try: 27 | async with openai.audio.speech.with_streaming_response.create( 28 | model="kokoro", 29 | voice="af_bella", 30 | response_format="pcm", 31 | input=text 32 | ) as response: 33 | print(f"Stream {stream_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms") 34 | 35 | async for chunk in response.iter_bytes(chunk_size=1024): 36 | player_stream.write(chunk) 37 | # Small sleep to allow 
other coroutines to run 38 | await asyncio.sleep(0.001) 39 | 40 | print(f"Stream {stream_id} completed in {int((time.time() - start_time) * 1000)}ms") 41 | 42 | finally: 43 | player_stream.stop_stream() 44 | player_stream.close() 45 | 46 | async def save_to_file(text: str, file_id: int) -> None: 47 | """Save TTS output to file asynchronously""" 48 | speech_file_path = Path(__file__).parent / f"speech_{file_id}.mp3" 49 | 50 | async with openai.audio.speech.with_streaming_response.create( 51 | model="kokoro", 52 | voice="af_bella", 53 | input=text 54 | ) as response: 55 | # Open file in binary write mode 56 | with open(speech_file_path, 'wb') as f: 57 | async for chunk in response.iter_bytes(): 58 | f.write(chunk) 59 | print(f"File {file_id} saved to {speech_file_path}") 60 | 61 | async def main() -> None: 62 | # Different text samples for variety 63 | texts = [ 64 | "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white", 65 | "I see skies of blue and clouds of white. 
I see skies of blue and clouds of white", 66 | ] 67 | 68 | # Create tasks for streaming to speakers 69 | speaker_tasks = [ 70 | stream_to_speakers(text, i) 71 | for i, text in enumerate(texts) 72 | ] 73 | 74 | # Create tasks for saving to files 75 | file_tasks = [ 76 | save_to_file(text, i) 77 | for i, text in enumerate(texts) 78 | ] 79 | 80 | # Combine all tasks 81 | all_tasks = speaker_tasks + file_tasks 82 | 83 | # Run all tasks concurrently 84 | try: 85 | await asyncio.gather(*all_tasks) 86 | finally: 87 | # Clean up PyAudio 88 | p.terminate() 89 | 90 | if __name__ == "__main__": 91 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/simul_speaker_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rye run python 2 | import asyncio 3 | import time 4 | import pyaudio 5 | from openai import AsyncOpenAI 6 | 7 | # Initialize async client 8 | openai = AsyncOpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local") 9 | 10 | # Create a shared PyAudio instance 11 | p = pyaudio.PyAudio() 12 | 13 | async def stream_to_speakers(text: str, stream_id: int) -> None: 14 | """Stream TTS audio to speakers asynchronously""" 15 | player_stream = p.open( 16 | format=pyaudio.paInt16, 17 | channels=1, 18 | rate=24000, 19 | output=True 20 | ) 21 | 22 | start_time = time.time() 23 | print(f"Starting stream {stream_id}") 24 | 25 | try: 26 | async with openai.audio.speech.with_streaming_response.create( 27 | model="kokoro", 28 | voice="af_bella", 29 | response_format="pcm", 30 | input=text 31 | ) as response: 32 | print(f"Stream {stream_id} - Time to first byte: {int((time.time() - start_time) * 1000)}ms") 33 | 34 | async for chunk in response.iter_bytes(chunk_size=1024): 35 | player_stream.write(chunk) 36 | # Small sleep to allow other coroutines to run 37 | await asyncio.sleep(0.001) 38 | 39 | print(f"Stream {stream_id} completed in {int((time.time() - 
start_time) * 1000)}ms") 40 | 41 | finally: 42 | player_stream.stop_stream() 43 | player_stream.close() 44 | 45 | async def main() -> None: 46 | # Different text samples for variety 47 | texts = [ 48 | "The quick brown fox jumped over the lazy dogs. I see skies of blue and clouds of white", 49 | "I see skies of blue and clouds of white. I see skies of blue and clouds of white", 50 | ] 51 | 52 | # Create tasks for streaming to speakers 53 | speaker_tasks = [ 54 | stream_to_speakers(text, i) 55 | for i, text in enumerate(texts) 56 | ] 57 | 58 | # Run speaker tasks concurrently 59 | try: 60 | await asyncio.gather(*speaker_tasks) 61 | finally: 62 | # Clean up PyAudio 63 | p.terminate() 64 | 65 | if __name__ == "__main__": 66 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/speech.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/speech.mp3 -------------------------------------------------------------------------------- /examples/streaming_refactor/test_unified_streaming.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Test script for unified streaming implementation""" 3 | 4 | import asyncio 5 | import time 6 | from pathlib import Path 7 | 8 | from openai import OpenAI 9 | 10 | # Initialize OpenAI client 11 | client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed") 12 | 13 | async def test_streaming_to_file(): 14 | """Test streaming to file""" 15 | print("\nTesting streaming to file...") 16 | speech_file = Path(__file__).parent / "stream_output.mp3" 17 | 18 | start_time = time.time() 19 | with client.audio.speech.with_streaming_response.create( 20 | model="kokoro", 21 | voice="af_bella", 22 | input="Testing unified streaming implementation with a short phrase.", 23 | ) as 
response: 24 | response.stream_to_file(speech_file) 25 | 26 | print(f"Streaming to file completed in {(time.time() - start_time):.2f}s") 27 | print(f"Output saved to: {speech_file}") 28 | 29 | async def test_streaming_chunks(): 30 | """Test streaming chunks for real-time playback""" 31 | print("\nTesting chunk streaming...") 32 | 33 | start_time = time.time() 34 | chunk_count = 0 35 | total_bytes = 0 36 | 37 | with client.audio.speech.with_streaming_response.create( 38 | model="kokoro", 39 | voice="af_bella", 40 | response_format="pcm", 41 | input="""This is a longer text to test chunk streaming. 42 | We want to verify that the unified streaming implementation 43 | works efficiently for both small and large inputs.""", 44 | ) as response: 45 | print(f"Time to first byte: {(time.time() - start_time):.3f}s") 46 | 47 | for chunk in response.iter_bytes(chunk_size=1024): 48 | chunk_count += 1 49 | total_bytes += len(chunk) 50 | # In real usage, this would go to audio playback 51 | # For testing, we just count chunks and bytes 52 | 53 | total_time = time.time() - start_time 54 | print(f"Received {chunk_count} chunks, {total_bytes} bytes") 55 | print(f"Total streaming time: {total_time:.2f}s") 56 | print(f"Average throughput: {total_bytes/total_time/1024:.1f} KB/s") 57 | 58 | async def main(): 59 | """Run all tests""" 60 | print("Starting unified streaming tests...") 61 | 62 | # Test both streaming modes 63 | await test_streaming_to_file() 64 | await test_streaming_chunks() 65 | 66 | print("\nAll tests completed!") 67 | 68 | if __name__ == "__main__": 69 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/voice_samples/speech_af.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af.mp3 
-------------------------------------------------------------------------------- /examples/voice_samples/speech_af_bella.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af_bella.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_af_nicole.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af_nicole.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_af_sarah.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_af_sarah.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_am_adam.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_am_adam.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_am_michael.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_am_michael.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bf_emma.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bf_emma.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bf_isabella.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bf_isabella.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bm_george.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bm_george.mp3 -------------------------------------------------------------------------------- /examples/voice_samples/speech_bm_lewis.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/examples/voice_samples/speech_bm_lewis.mp3 -------------------------------------------------------------------------------- /githubbanner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/githubbanner.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "kokoro-fastapi" 3 | version = "0.3.0" 4 | description = "FastAPI TTS Service" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | # Core dependencies 9 | "fastapi==0.115.6", 10 | "uvicorn==0.34.0", 11 | "click>=8.0.0", 12 | "pydantic==2.10.4", 13 | "pydantic-settings==2.7.0", 14 | 
"python-dotenv==1.0.1", 15 | "sqlalchemy==2.0.27", 16 | # ML/DL Base 17 | "numpy>=1.26.0", 18 | "scipy==1.14.1", 19 | # Audio processing 20 | "soundfile==0.13.0", 21 | "regex==2024.11.6", 22 | # Utilities 23 | "aiofiles==23.2.1", 24 | "tqdm==4.67.1", 25 | "requests==2.32.3", 26 | "munch==4.0.0", 27 | "tiktoken==0.8.0", 28 | "loguru==0.7.3", 29 | "openai>=1.59.6", 30 | "pydub>=0.25.1", 31 | "matplotlib>=3.10.0", 32 | "mutagen>=1.47.0", 33 | "psutil>=6.1.1", 34 | "espeakng-loader==0.2.4", 35 | "kokoro==0.9.2", 36 | "misaki[en,ja,ko,zh]==0.9.3", 37 | "spacy==3.8.5", 38 | "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", 39 | "inflect>=7.5.0", 40 | "phonemizer-fork>=3.3.2", 41 | "av>=14.2.0", 42 | "text2num>=2.5.1", 43 | ] 44 | 45 | [project.optional-dependencies] 46 | gpu = [ 47 | "torch==2.6.0+cu124", 48 | ] 49 | cpu = [ 50 | "torch==2.6.0", 51 | ] 52 | test = [ 53 | "pytest==8.3.5", 54 | "pytest-cov==6.0.0", 55 | "httpx==0.26.0", 56 | "pytest-asyncio==0.25.3", 57 | "tomli>=2.0.1", 58 | "jinja2>=3.1.6" 59 | ] 60 | 61 | [tool.uv] 62 | conflicts = [ 63 | [ 64 | { extra = "cpu" }, 65 | { extra = "gpu" }, 66 | ], 67 | ] 68 | 69 | [tool.uv.sources] 70 | torch = [ 71 | { index = "pytorch-cpu", extra = "cpu" }, 72 | { index = "pytorch-cuda", extra = "gpu" }, 73 | ] 74 | 75 | [[tool.uv.index]] 76 | name = "pytorch-cpu" 77 | url = "https://download.pytorch.org/whl/cpu" 78 | explicit = true 79 | 80 | [[tool.uv.index]] 81 | name = "pytorch-cuda" 82 | url = "https://download.pytorch.org/whl/cu124" 83 | explicit = true 84 | 85 | [build-system] 86 | requires = ["setuptools>=61.0"] 87 | build-backend = "setuptools.build_meta" 88 | 89 | [tool.setuptools] 90 | package-dir = {"" = "api/src"} 91 | packages.find = {where = ["api/src"], namespaces = true} 92 | 93 | [tool.pytest.ini_options] 94 | testpaths = ["api/tests", "ui/tests"] 95 | python_files = ["test_*.py"] 96 | addopts = "--cov=api --cov=ui 
--cov-report=term-missing --cov-config=.coveragerc --full-trace" 97 | asyncio_mode = "auto" 98 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = api/tests 3 | python_files = test_*.py 4 | addopts = -v --tb=short --cov=api --cov-report=term-missing --cov-config=.coveragerc 5 | pythonpath = . 6 | -------------------------------------------------------------------------------- /scripts/fix_misaki.py: -------------------------------------------------------------------------------- 1 | """ 2 | Patch for misaki package to fix the EspeakWrapper.set_data_path issue. 3 | """ 4 | 5 | import importlib.util 6 | import os 7 | import sys 8 | 9 | # Find the misaki package 10 | try: 11 | import misaki 12 | 13 | misaki_path = os.path.dirname(misaki.__file__) 14 | print(f"Found misaki package at: {misaki_path}") 15 | except ImportError: 16 | print("Misaki package not found. 
Make sure it's installed.") 17 | sys.exit(1) 18 | 19 | # Path to the espeak.py file 20 | espeak_file = os.path.join(misaki_path, "espeak.py") 21 | 22 | if not os.path.exists(espeak_file): 23 | print(f"Could not find {espeak_file}") 24 | sys.exit(1) 25 | 26 | # Read the current content 27 | with open(espeak_file, "r") as f: 28 | content = f.read() 29 | 30 | # Check if the problematic line exists 31 | if "EspeakWrapper.set_data_path(espeakng_loader.get_data_path())" in content: 32 | # Replace the problematic line 33 | new_content = content.replace( 34 | "EspeakWrapper.set_data_path(espeakng_loader.get_data_path())", 35 | "# Fixed line to use data_path attribute instead of set_data_path method\n" 36 | "EspeakWrapper.data_path = espeakng_loader.get_data_path()", 37 | ) 38 | 39 | # Write the modified content back 40 | with open(espeak_file, "w") as f: 41 | f.write(new_content) 42 | 43 | print(f"Successfully patched {espeak_file}") 44 | else: 45 | print(f"The problematic line was not found in {espeak_file}") 46 | print("The file may have already been patched or the issue is different.") 47 | -------------------------------------------------------------------------------- /start-cpu.ps1: -------------------------------------------------------------------------------- 1 | $env:PHONEMIZER_ESPEAK_LIBRARY="C:\Program Files\eSpeak NG\libespeak-ng.dll" 2 | $env:PYTHONUTF8=1 3 | $Env:PROJECT_ROOT="$pwd" 4 | $Env:USE_GPU="false" 5 | $Env:USE_ONNX="false" 6 | $Env:PYTHONPATH="$Env:PROJECT_ROOT;$Env:PROJECT_ROOT/api" 7 | $Env:MODEL_DIR="src/models" 8 | $Env:VOICES_DIR="src/voices/v1_0" 9 | $Env:WEB_PLAYER_PATH="$Env:PROJECT_ROOT/web" 10 | 11 | uv pip install -e ".[cpu]" 12 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 13 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 -------------------------------------------------------------------------------- /start-cpu.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get project root directory 4 | PROJECT_ROOT=$(pwd) 5 | 6 | # Set environment variables 7 | export USE_GPU=false 8 | export USE_ONNX=false 9 | export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api 10 | export MODEL_DIR=src/models 11 | export VOICES_DIR=src/voices/v1_0 12 | export WEB_PLAYER_PATH=$PROJECT_ROOT/web 13 | # Set the espeak-ng data path to your location 14 | export ESPEAK_DATA_PATH=/usr/lib/x86_64-linux-gnu/espeak-ng-data 15 | 16 | # Run FastAPI with CPU extras using uv run 17 | # Note: espeak may still require manual installation, 18 | uv pip install -e ".[cpu]" 19 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 20 | 21 | # Apply the misaki patch to fix possible EspeakWrapper issue in older versions 22 | # echo "Applying misaki patch..." 23 | # python scripts/fix_misaki.py 24 | 25 | # Start the server 26 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 27 | -------------------------------------------------------------------------------- /start-gpu.ps1: -------------------------------------------------------------------------------- 1 | $env:PHONEMIZER_ESPEAK_LIBRARY="C:\Program Files\eSpeak NG\libespeak-ng.dll" 2 | $env:PYTHONUTF8=1 3 | $Env:PROJECT_ROOT="$pwd" 4 | $Env:USE_GPU="true" 5 | $Env:USE_ONNX="false" 6 | $Env:PYTHONPATH="$Env:PROJECT_ROOT;$Env:PROJECT_ROOT/api" 7 | $Env:MODEL_DIR="src/models" 8 | $Env:VOICES_DIR="src/voices/v1_0" 9 | $Env:WEB_PLAYER_PATH="$Env:PROJECT_ROOT/web" 10 | 11 | uv pip install -e ".[gpu]" 12 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 13 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 -------------------------------------------------------------------------------- /start-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get 
project root directory 4 | PROJECT_ROOT=$(pwd) 5 | 6 | # Set environment variables 7 | export USE_GPU=true 8 | export USE_ONNX=false 9 | export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api 10 | export MODEL_DIR=src/models 11 | export VOICES_DIR=src/voices/v1_0 12 | export WEB_PLAYER_PATH=$PROJECT_ROOT/web 13 | 14 | # Run FastAPI with GPU extras using uv run 15 | # Note: espeak may still require manual installation, 16 | uv pip install -e ".[gpu]" 17 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 18 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 19 | -------------------------------------------------------------------------------- /start-gpu_mac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get project root directory 4 | PROJECT_ROOT=$(pwd) 5 | 6 | # Set other environment variables 7 | export USE_GPU=true 8 | export USE_ONNX=false 9 | export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api 10 | export MODEL_DIR=src/models 11 | export VOICES_DIR=src/voices/v1_0 12 | export WEB_PLAYER_PATH=$PROJECT_ROOT/web 13 | 14 | export DEVICE_TYPE=mps 15 | # Enable MPS fallback for unsupported operations 16 | export PYTORCH_ENABLE_MPS_FALLBACK=1 17 | 18 | # Run FastAPI with GPU extras using uv run 19 | uv pip install -e . 20 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 21 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 22 | -------------------------------------------------------------------------------- /ui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | WORKDIR /app/ui 4 | 5 | # Install dependencies 6 | RUN pip install gradio==5.9.1 requests==2.32.3 7 | 8 | # Create necessary directories 9 | RUN mkdir -p data/inputs data/outputs 10 | 11 | # Copy the application files 12 | COPY . . 
13 | 14 | ENV API_HOST=kokoro-tts 15 | ENV API_PORT=8880 16 | 17 | # Run the Gradio app 18 | CMD ["python", "app.py"] 19 | -------------------------------------------------------------------------------- /ui/GUIBanner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/ui/GUIBanner.png -------------------------------------------------------------------------------- /ui/GradioScreenShot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/ui/GradioScreenShot.png -------------------------------------------------------------------------------- /ui/app.py: -------------------------------------------------------------------------------- 1 | from lib.interface import create_interface 2 | 3 | if __name__ == "__main__": 4 | demo = create_interface() 5 | demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True) 6 | -------------------------------------------------------------------------------- /ui/depr_tests/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import AsyncMock, Mock 2 | 3 | import pytest 4 | 5 | from api.src.services.tts_service import TTSService 6 | 7 | 8 | @pytest.fixture 9 | async def mock_model_manager(): 10 | """Mock model manager for UI tests""" 11 | manager = AsyncMock() 12 | manager.get_backend = Mock(return_value=Mock(device="cpu")) 13 | return manager 14 | 15 | 16 | @pytest.fixture 17 | async def mock_voice_manager(): 18 | """Mock voice manager for UI tests""" 19 | manager = AsyncMock() 20 | manager.list_voices = AsyncMock(return_value=["af_heart", "bm_lewis", "af_sarah"]) 21 | return manager 22 | 23 | 24 | @pytest.fixture 25 | async def mock_tts_service(mock_model_manager, mock_voice_manager): 26 | """Mock 
TTSService for UI tests""" 27 | service = AsyncMock() 28 | service.model_manager = mock_model_manager 29 | service._voice_manager = mock_voice_manager 30 | return service 31 | 32 | 33 | @pytest.fixture(autouse=True) 34 | async def setup_mocks( 35 | monkeypatch, mock_model_manager, mock_voice_manager, mock_tts_service 36 | ): 37 | """Setup global mocks for UI tests""" 38 | 39 | async def mock_get_model(): 40 | return mock_model_manager 41 | 42 | async def mock_get_voice(): 43 | return mock_voice_manager 44 | 45 | async def mock_create_service(): 46 | return mock_tts_service 47 | 48 | monkeypatch.setattr("api.src.inference.model_manager.get_manager", mock_get_model) 49 | monkeypatch.setattr("api.src.inference.voice_manager.get_manager", mock_get_voice) 50 | monkeypatch.setattr( 51 | "api.src.services.tts_service.TTSService.create", mock_create_service 52 | ) 53 | -------------------------------------------------------------------------------- /ui/depr_tests/test_handlers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Drop all tests for now. The Gradio event system is too complex to test properly. 3 | We'll need to find a better way to test the UI functionality. 
4 | """ 5 | -------------------------------------------------------------------------------- /ui/depr_tests/test_input.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import pytest 3 | 4 | from ui.lib.components.input import create_input_column 5 | 6 | 7 | def test_create_input_column_structure(): 8 | """Test that create_input_column returns the expected structure""" 9 | column, components = create_input_column() 10 | 11 | # Test the return types 12 | assert isinstance(column, gr.Column) 13 | assert isinstance(components, dict) 14 | 15 | # Test that all expected components are present 16 | expected_components = { 17 | "tabs", 18 | "text_input", 19 | "file_select", 20 | "file_upload", 21 | "file_preview", 22 | "text_submit", 23 | "file_submit", 24 | "clear_files", 25 | } 26 | assert set(components.keys()) == expected_components 27 | 28 | # Test component types 29 | assert isinstance(components["tabs"], gr.Tabs) 30 | assert isinstance(components["text_input"], gr.Textbox) 31 | assert isinstance(components["file_select"], gr.Dropdown) 32 | assert isinstance(components["file_upload"], gr.File) 33 | assert isinstance(components["file_preview"], gr.Textbox) 34 | assert isinstance(components["text_submit"], gr.Button) 35 | assert isinstance(components["file_submit"], gr.Button) 36 | assert isinstance(components["clear_files"], gr.Button) 37 | 38 | 39 | def test_text_input_configuration(): 40 | """Test the text input component configuration""" 41 | _, components = create_input_column() 42 | text_input = components["text_input"] 43 | 44 | assert text_input.label == "Text to speak" 45 | assert text_input.placeholder == "Enter text here..." 
46 | assert text_input.lines == 4 47 | 48 | 49 | def test_file_upload_configuration(): 50 | """Test the file upload component configuration""" 51 | _, components = create_input_column() 52 | file_upload = components["file_upload"] 53 | 54 | assert file_upload.label == "Upload Text File (.txt)" 55 | assert file_upload.file_types == [".txt"] 56 | 57 | 58 | def test_button_configurations(): 59 | """Test the button configurations""" 60 | _, components = create_input_column() 61 | 62 | # Test text submit button 63 | assert components["text_submit"].value == "Generate Speech" 64 | assert components["text_submit"].variant == "primary" 65 | assert components["text_submit"].size == "lg" 66 | 67 | # Test file submit button 68 | assert components["file_submit"].value == "Generate Speech" 69 | assert components["file_submit"].variant == "primary" 70 | assert components["file_submit"].size == "lg" 71 | 72 | # Test clear files button 73 | assert components["clear_files"].value == "Clear Files" 74 | assert components["clear_files"].variant == "secondary" 75 | assert components["clear_files"].size == "lg" 76 | -------------------------------------------------------------------------------- /ui/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remsky/Kokoro-FastAPI/d7d90cdc9d232c36d117deb2b3ffb37b82413fda/ui/lib/__init__.py -------------------------------------------------------------------------------- /ui/lib/api.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from typing import List, Optional, Tuple 4 | 5 | import requests 6 | 7 | from .config import API_URL, OUTPUTS_DIR 8 | 9 | 10 | def check_api_status() -> Tuple[bool, List[str]]: 11 | """Check TTS service status and get available voices.""" 12 | try: 13 | # Use a longer timeout during startup 14 | response = requests.get( 15 | f"{API_URL}/v1/audio/voices", 16 | 
timeout=30, # Increased timeout for initial startup period 17 | ) 18 | response.raise_for_status() 19 | voices = response.json().get("voices", []) 20 | if voices: 21 | return True, voices 22 | print("No voices found in response") 23 | return False, [] 24 | except requests.exceptions.Timeout: 25 | print("API request timed out (waiting for service startup)") 26 | return False, [] 27 | except requests.exceptions.ConnectionError as e: 28 | print(f"Connection error (service may be starting up): {str(e)}") 29 | return False, [] 30 | except requests.exceptions.RequestException as e: 31 | print(f"API request failed: {str(e)}") 32 | return False, [] 33 | except Exception as e: 34 | print(f"Unexpected error checking API status: {str(e)}") 35 | return False, [] 36 | 37 | 38 | def text_to_speech( 39 | text: str, voice_id: str | list, format: str, speed: float 40 | ) -> Optional[str]: 41 | """Generate speech from text using TTS API.""" 42 | if not text.strip(): 43 | return None 44 | 45 | # Handle multiple voices 46 | voice_str = voice_id if isinstance(voice_id, str) else "+".join(voice_id) 47 | 48 | # Create output filename 49 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 50 | output_filename = f"output_{timestamp}_voice-{voice_str}_speed-{speed}.{format}" 51 | output_path = os.path.join(OUTPUTS_DIR, output_filename) 52 | 53 | try: 54 | response = requests.post( 55 | f"{API_URL}/v1/audio/speech", 56 | json={ 57 | "model": "kokoro", 58 | "input": text, 59 | "voice": voice_str, 60 | "response_format": format, 61 | "speed": float(speed), 62 | }, 63 | headers={"Content-Type": "application/json"}, 64 | timeout=300, # Longer timeout for speech generation 65 | ) 66 | response.raise_for_status() 67 | 68 | with open(output_path, "wb") as f: 69 | f.write(response.content) 70 | return output_path 71 | 72 | except requests.exceptions.Timeout: 73 | print("Speech generation request timed out") 74 | return None 75 | except requests.exceptions.RequestException as e: 76 | 
print(f"Speech generation request failed: {str(e)}") 77 | return None 78 | except Exception as e: 79 | print(f"Unexpected error generating speech: {str(e)}") 80 | return None 81 | 82 | 83 | def get_status_html(is_available: bool) -> str: 84 | """Generate HTML for status indicator.""" 85 | color = "green" if is_available else "red" 86 | status = "Available" if is_available else "Unavailable" 87 | return f""" 88 |
89 |
90 | TTS Service: {status} 91 |
92 | """ 93 | -------------------------------------------------------------------------------- /ui/lib/components/__init__.py: -------------------------------------------------------------------------------- 1 | from .input import create_input_column 2 | from .model import create_model_column 3 | from .output import create_output_column 4 | 5 | __all__ = ["create_input_column", "create_model_column", "create_output_column"] 6 | -------------------------------------------------------------------------------- /ui/lib/components/input.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import gradio as gr 4 | 5 | from .. import files 6 | 7 | 8 | def create_input_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]: 9 | """Create the input column with text input and file handling.""" 10 | with gr.Column(scale=1) as col: 11 | text_input = gr.Textbox( 12 | label="Text to speak", placeholder="Enter text here...", lines=4 13 | ) 14 | 15 | # Always show file upload but handle differently based on disable_local_saving 16 | file_upload = gr.File(label="Upload Text File (.txt)", file_types=[".txt"]) 17 | 18 | if not disable_local_saving: 19 | # Show full interface with tabs when saving is enabled 20 | with gr.Tabs() as tabs: 21 | # Set first tab as selected by default 22 | tabs.selected = 0 23 | # Direct Input Tab 24 | with gr.TabItem("Direct Input"): 25 | text_submit_direct = gr.Button( 26 | "Generate Speech", variant="primary", size="lg" 27 | ) 28 | 29 | # File Input Tab 30 | with gr.TabItem("From File"): 31 | # Existing files dropdown 32 | input_files_list = gr.Dropdown( 33 | label="Select Existing File", 34 | choices=files.list_input_files(), 35 | value=None, 36 | ) 37 | 38 | file_preview = gr.Textbox( 39 | label="File Content Preview", interactive=False, lines=4 40 | ) 41 | 42 | with gr.Row(): 43 | file_submit = gr.Button( 44 | "Generate Speech", variant="primary", size="lg" 45 | ) 46 | 
clear_files = gr.Button( 47 | "Clear Files", variant="secondary", size="lg" 48 | ) 49 | else: 50 | # Just show the generate button when saving is disabled 51 | text_submit_direct = gr.Button( 52 | "Generate Speech", variant="primary", size="lg" 53 | ) 54 | tabs = None 55 | input_files_list = None 56 | file_preview = None 57 | file_submit = None 58 | clear_files = None 59 | 60 | # Initialize components based on disable_local_saving 61 | if disable_local_saving: 62 | components = { 63 | "tabs": None, 64 | "text_input": text_input, 65 | "text_submit": text_submit_direct, 66 | "file_select": None, 67 | "file_upload": file_upload, # Keep file upload even when saving is disabled 68 | "file_preview": None, 69 | "file_submit": None, 70 | "clear_files": None, 71 | } 72 | else: 73 | components = { 74 | "tabs": tabs, 75 | "text_input": text_input, 76 | "text_submit": text_submit_direct, 77 | "file_select": input_files_list, 78 | "file_upload": file_upload, 79 | "file_preview": file_preview, 80 | "file_submit": file_submit, 81 | "clear_files": clear_files, 82 | } 83 | 84 | return col, components 85 | -------------------------------------------------------------------------------- /ui/lib/components/model.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import gradio as gr 4 | 5 | from .. 
def create_model_column(voice_ids: Optional[list] = None) -> Tuple[gr.Column, dict]:
    """Create the model settings column."""
    if voice_ids is None:
        voice_ids = []

    with gr.Column(scale=1) as col:
        gr.Markdown("### Model Settings")

        # Status button starts in waiting state
        service_status = gr.Button(
            "⌛ TTS Service: Waiting for Service...", variant="secondary"
        )

        voice_dropdown = gr.Dropdown(
            choices=voice_ids,
            label="Voice(s)",
            value=voice_ids[0] if voice_ids else None,
            interactive=True,
            multiselect=True,
        )
        format_dropdown = gr.Dropdown(
            choices=config.AUDIO_FORMATS, label="Audio Format", value="mp3"
        )
        speed_slider = gr.Slider(
            minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"
        )

    return col, {
        "status_btn": service_status,
        "voice": voice_dropdown,
        "format": format_dropdown,
        "speed": speed_slider,
    }
def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]:
    """Create the output column with audio player and file list."""
    show_files = not disable_local_saving

    with gr.Column(scale=1) as col:
        gr.Markdown("### Latest Output")
        latest_audio = gr.Audio(
            label="Generated Speech",
            type="filepath",
            waveform_options={"waveform_color": "#4C87AB"},
        )

        # File-related components are created but hidden when saving is disabled
        gr.Markdown("### Generated Files", visible=show_files)
        previous_outputs = gr.Dropdown(
            label="Previous Outputs",
            choices=files.list_output_files() if show_files else [],
            value=None,
            allow_custom_value=True,
            visible=show_files,
        )

        play_button = gr.Button(
            "▶️ Play Selected",
            size="sm",
            visible=show_files,
        )

        chosen_audio = gr.Audio(
            label="Selected Output",
            type="filepath",
            visible=False,  # Always initially hidden
        )

        delete_button = gr.Button(
            "⚠️ Delete All Previously Generated Output Audio 🗑️",
            size="sm",
            variant="secondary",
            visible=show_files,
        )

    return col, {
        "audio_output": latest_audio,
        "output_files": previous_outputs,
        "play_btn": play_button,
        "selected_audio": chosen_audio,
        "clear_outputs": delete_button,
    }
exist 13 | 14 | os.makedirs(INPUTS_DIR, exist_ok=True) 15 | os.makedirs(OUTPUTS_DIR, exist_ok=True) 16 | 17 | # Audio formats 18 | AUDIO_FORMATS = ["mp3", "wav", "opus", "flac"] 19 | 20 | # UI Theme 21 | THEME = "monochrome" 22 | CSS = """ 23 | .gradio-container { 24 | max-width: 1000px; 25 | margin: auto; 26 | } 27 | 28 | .banner-container { 29 | background: transparent !important; 30 | border: none !important; 31 | box-shadow: none !important; 32 | margin-bottom: 2rem; 33 | } 34 | 35 | .banner-container img { 36 | width: 100%; 37 | max-width: 600px; 38 | border-radius: 10px; 39 | margin: 20px auto; 40 | display: block; 41 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); 42 | } 43 | """ 44 | -------------------------------------------------------------------------------- /web/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 11 | 12 | 13 | 17 | 21 | 22 | 23 | 28 | 34 | 35 | 40 | 46 | 47 | -------------------------------------------------------------------------------- /web/siriwave.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | function SiriWave(opt) { 3 | opt = opt || {}; 4 | 5 | this.phase = 0; 6 | this.run = false; 7 | 8 | // UI vars 9 | this.ratio = opt.ratio || window.devicePixelRatio || 1; 10 | this.width = this.ratio * (opt.width || 320); 11 | this.width_2 = this.width / 2; 12 | this.width_4 = this.width / 4; 13 | this.height = this.ratio * (opt.height || 50); 14 | this.height_2 = this.height / 2; 15 | this.MAX = (this.height_2) - 4; 16 | 17 | // Constructor opt 18 | this.amplitude = opt.amplitude || 1; 19 | this.speed = opt.speed || 0.2; 20 | this.frequency = opt.frequency || 6; 21 | this.color = (function hex2rgb(hex){ 22 | var shorthandRegex = /^#?([a-f\d])([a-f\d])([a-f\d])$/i; 23 | hex = hex.replace(shorthandRegex, function(m,r,g,b) { return r + r + g + g + b + b; }); 24 | var result = /^#?([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})$/i.exec(hex); 
// Wave-math and canvas helpers shared by all SiriWave instances.

// Memoized global attenuation function: 4^4 / (4 + x^4)^4.
SiriWave.prototype._GATF_cache = {};
SiriWave.prototype._globAttFunc = function(x) {
  var cache = SiriWave.prototype._GATF_cache;
  if (cache[x] == null) {
    cache[x] = Math.pow(4 / (4 + Math.pow(x, 4)), 4);
  }
  return cache[x];
};

// Map the abstract coordinate i in [-2, 2] to a canvas x position.
SiriWave.prototype._xpos = function(i) {
  return this.width_2 + i * this.width_4;
};

// Vertical position: attenuated sine around the canvas midline.
SiriWave.prototype._ypos = function(i, attenuation) {
  var scaled = (this.MAX * this.amplitude) / attenuation;
  return this.height_2 + this._globAttFunc(i) * scaled * Math.sin(this.frequency * i - this.phase);
};

// Stroke one wave curve at the given attenuation/color/width.
SiriWave.prototype._drawLine = function(attenuation, color, width) {
  var ctx = this.ctx;
  ctx.moveTo(0, 0);
  ctx.beginPath();
  ctx.strokeStyle = color;
  ctx.lineWidth = width || 1;

  // Sample the curve from -2 to 2 in 0.01 steps; pin the tails flat.
  // (Accumulating form kept so the float sequence matches exactly.)
  var t = -2;
  while ((t += 0.01) <= 2) {
    var y = this._ypos(t, attenuation);
    if (Math.abs(t) >= 1.90) y = this.height_2;
    ctx.lineTo(this._xpos(t), y);
  }

  ctx.stroke();
};

// Erase the previous frame without disturbing the backdrop.
SiriWave.prototype._clear = function() {
  var ctx = this.ctx;
  ctx.globalCompositeOperation = 'destination-out';
  ctx.fillRect(0, 0, this.width, this.height);
  ctx.globalCompositeOperation = 'source-over';
};
88 | SiriWave.prototype._draw = function() { 89 | if (this.run === false) return; 90 | 91 | this.phase = (this.phase + Math.PI*this.speed) % (2*Math.PI); 92 | 93 | this._clear(); 94 | this._drawLine(-2, 'rgba(' + this.color + ',0.1)'); 95 | this._drawLine(-6, 'rgba(' + this.color + ',0.2)'); 96 | this._drawLine(4, 'rgba(' + this.color + ',0.4)'); 97 | this._drawLine(2, 'rgba(' + this.color + ',0.6)'); 98 | this._drawLine(1, 'rgba(' + this.color + ',1)', 1.5); 99 | 100 | if (window.requestAnimationFrame) { 101 | requestAnimationFrame(this._draw.bind(this)); 102 | return; 103 | }; 104 | setTimeout(this._draw.bind(this), 20); 105 | }; 106 | 107 | SiriWave.prototype.start = function() { 108 | this.phase = 0; 109 | this.run = true; 110 | this._draw(); 111 | }; 112 | 113 | SiriWave.prototype.stop = function() { 114 | this.phase = 0; 115 | this.run = false; 116 | }; 117 | 118 | SiriWave.prototype.setSpeed = function(v) { 119 | this.speed = v; 120 | }; 121 | 122 | SiriWave.prototype.setNoise = SiriWave.prototype.setAmplitude = function(v) { 123 | this.amplitude = Math.max(Math.min(v, 1), 0); 124 | }; 125 | 126 | if (typeof define === 'function' && define.amd) { 127 | define(function(){ return SiriWave; }); 128 | return; 129 | }; 130 | window.SiriWave = SiriWave; 131 | })(); -------------------------------------------------------------------------------- /web/src/components/WaveVisualizer.js: -------------------------------------------------------------------------------- 1 | export class WaveVisualizer { 2 | constructor(playerState) { 3 | this.playerState = playerState; 4 | this.wave = null; 5 | this.progressBar = null; 6 | this.container = document.getElementById('wave-container'); 7 | 8 | this.setupWave(); 9 | this.setupProgressBar(); 10 | this.setupStateSubscription(); 11 | } 12 | 13 | setupWave() { 14 | this.wave = new SiriWave({ 15 | container: this.container, 16 | style: 'ios9', 17 | width: this.container.clientWidth, 18 | height: 100, // Increased height 19 | 
autostart: false, 20 | amplitude: 1, 21 | speed: 0.1 22 | }); 23 | 24 | // Handle window resize 25 | window.addEventListener('resize', () => { 26 | if (this.wave) { 27 | this.wave.width = this.container.clientWidth; 28 | } 29 | }); 30 | } 31 | 32 | setupProgressBar() { 33 | this.progressBar = document.createElement('progress'); 34 | this.progressBar.max = 100; 35 | this.progressBar.value = 0; 36 | this.progressBar.className = 'generation-progress'; 37 | // Insert inside wave-container at the bottom 38 | this.container.appendChild(this.progressBar); 39 | this.progressBar.style.display = 'none'; 40 | } 41 | 42 | setupStateSubscription() { 43 | this.playerState.subscribe(state => { 44 | // Handle generation progress 45 | if (state.isGenerating) { 46 | this.progressBar.style.display = 'block'; 47 | this.progressBar.value = state.progress; 48 | } else if (state.progress >= 100) { 49 | // Hide progress bar after completion 50 | setTimeout(() => { 51 | this.progressBar.style.display = 'none'; 52 | this.progressBar.value = 0; 53 | }, 500); 54 | } 55 | 56 | // Only animate when playing, stop otherwise 57 | if (state.isPlaying) { 58 | this.wave.start(); 59 | } else { 60 | this.wave.stop(); 61 | } 62 | }); 63 | } 64 | 65 | updateProgress(receivedChunks, totalChunks) { 66 | if (!totalChunks) return; 67 | 68 | // Calculate progress percentage based on chunks 69 | const progress = Math.min((receivedChunks / totalChunks) * 100, 99); 70 | 71 | // Always update on 0 progress or when progress increases 72 | if (receivedChunks === 0 || progress > this.progressBar.value) { 73 | this.progressBar.style.display = 'block'; 74 | this.progressBar.value = progress; 75 | this.playerState.setProgress(receivedChunks, totalChunks); 76 | } 77 | } 78 | 79 | cleanup() { 80 | if (this.wave) { 81 | this.wave.stop(); 82 | this.wave.dispose(); 83 | this.wave = null; 84 | } 85 | 86 | if (this.progressBar) { 87 | this.progressBar.style.display = 'none'; 88 | this.progressBar.value = 0; 89 | if 
// Tracks the catalogue of available voices and the user's weighted selection.
export class VoiceService {
    constructor() {
        this.availableVoices = [];
        this.selectedVoices = new Map(); // voice name -> numeric weight
    }

    // Fetch the voice list from the API; auto-selects a first voice if
    // nothing is selected yet. Throws on any failure.
    async loadVoices() {
        try {
            const response = await fetch('/v1/audio/voices');
            if (!response.ok) {
                const error = await response.json();
                throw new Error(error.detail?.message || 'Failed to load voices');
            }

            const data = await response.json();
            if (!data.voices?.length) {
                throw new Error('No voices available');
            }

            this.availableVoices = data.voices;

            // Select the first non-blank voice when none is selected
            if (this.selectedVoices.size === 0) {
                const firstVoice = this.availableVoices.find(v => v && v.trim());
                if (firstVoice) {
                    this.addVoice(firstVoice);
                }
            }

            return this.availableVoices;
        } catch (error) {
            console.error('Failed to load voices:', error);
            throw error;
        }
    }

    getAvailableVoices() {
        return this.availableVoices;
    }

    getSelectedVoices() {
        return [...this.selectedVoices.keys()];
    }

    getSelectedVoiceWeights() {
        return [...this.selectedVoices.entries()].map(([voice, weight]) => ({
            voice,
            weight
        }));
    }

    // Serialize the selection: a lone voice at weight 1 is the bare name,
    // otherwise "voice(weight)" entries joined with "+".
    getSelectedVoiceString() {
        const entries = [...this.selectedVoices.entries()];

        if (entries.length === 1 && entries[0][1] === 1) {
            return entries[0][0];
        }

        return entries
            .map(([voice, weight]) => `${voice}(${weight})`)
            .join('+');
    }

    // Returns false for unknown voices; non-numeric weights fall back to 1.
    addVoice(voice, weight = 1) {
        if (!this.availableVoices.includes(voice)) {
            return false;
        }
        this.selectedVoices.set(voice, parseFloat(weight) || 1);
        return true;
    }

    updateWeight(voice, weight) {
        if (!this.selectedVoices.has(voice)) {
            return false;
        }
        this.selectedVoices.set(voice, parseFloat(weight) || 1);
        return true;
    }

    removeVoice(voice) {
        return this.selectedVoices.delete(voice);
    }

    clearSelectedVoices() {
        this.selectedVoices.clear();
    }

    // Case-insensitive substring filter over the available voices.
    filterVoices(searchTerm) {
        if (!searchTerm) {
            return this.availableVoices;
        }
        const needle = searchTerm.toLowerCase();
        return this.availableVoices.filter(voice =>
            voice.toLowerCase().includes(needle)
        );
    }

    hasSelectedVoices() {
        return this.selectedVoices.size > 0;
    }
}

export default VoiceService;
setState(updates) { 26 | this.state = { 27 | ...this.state, 28 | ...updates 29 | }; 30 | this.notify(); 31 | } 32 | 33 | setPlaying(isPlaying) { 34 | this.setState({ isPlaying }); 35 | } 36 | 37 | setGenerating(isGenerating) { 38 | this.setState({ isGenerating }); 39 | } 40 | 41 | setProgress(loaded, total) { 42 | const progress = total > 0 ? (loaded / total) * 100 : 0; 43 | this.setState({ progress }); 44 | } 45 | 46 | setTime(currentTime, duration) { 47 | this.setState({ currentTime, duration }); 48 | } 49 | 50 | setVolume(volume) { 51 | this.setState({ volume }); 52 | } 53 | 54 | setSpeed(speed) { 55 | this.setState({ speed }); 56 | } 57 | 58 | setError(error) { 59 | this.setState({ error }); 60 | } 61 | 62 | clearError() { 63 | this.setState({ error: null }); 64 | } 65 | 66 | reset() { 67 | // Keep current speed setting but reset everything else 68 | const currentSpeed = this.state.speed; 69 | const currentVolume = this.state.volume; 70 | 71 | this.setState({ 72 | isPlaying: false, 73 | isGenerating: false, 74 | currentTime: 0, 75 | duration: 0, 76 | progress: 0, 77 | error: null, 78 | speed: currentSpeed, 79 | volume: currentVolume 80 | }); 81 | } 82 | 83 | getState() { 84 | return { ...this.state }; 85 | } 86 | } 87 | 88 | export default PlayerState; -------------------------------------------------------------------------------- /web/styles/badges.css: -------------------------------------------------------------------------------- 1 | .badges-container { 2 | position: fixed; 3 | top: 0; 4 | left: 0; 5 | right: 0; 6 | padding: clamp(0.75rem, 1.5vh, 1rem) clamp(1rem, 2vw, 2rem); 7 | display: flex; 8 | justify-content: space-between; 9 | align-items: center; 10 | z-index: 100; 11 | background: rgba(15, 23, 42, 0.95); 12 | backdrop-filter: blur(12px); 13 | border-bottom: 1px solid rgba(99, 102, 241, 0.2); 14 | min-height: clamp(3.5rem, 6vh, 4.5rem); 15 | box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 16 | 0 2px 4px -1px rgba(0, 0, 0, 0.06); 17 | } 18 | 19 | 
.badge { 20 | height: clamp(24px, 3vh, 28px); 21 | display: flex; 22 | align-items: center; 23 | transition: opacity 0.2s ease; 24 | flex-shrink: 0; 25 | } 26 | 27 | .logo-container { 28 | display: flex; 29 | align-items: center; 30 | gap: clamp(0.5rem, 1vw, 1rem); 31 | margin: 0 auto; 32 | transform: translateX(-50%); 33 | left: 50%; 34 | position: absolute; 35 | } 36 | 37 | @media (max-width: 768px) { 38 | .badges-container { 39 | padding: 0.75rem; 40 | flex-wrap: wrap; 41 | justify-content: center; 42 | gap: 0.75rem; 43 | min-height: clamp(4rem, 8vh, 5rem); 44 | } 45 | 46 | .badge { 47 | height: 24px; 48 | } 49 | 50 | .badge iframe { 51 | height: 24px !important; 52 | max-width: 100%; 53 | } 54 | 55 | .logo-container { 56 | position: static; 57 | transform: none; 58 | margin: 0; 59 | order: -1; 60 | width: 100%; 61 | justify-content: center; 62 | margin-bottom: 0.5rem; 63 | } 64 | } 65 | 66 | .badge iframe { 67 | height: 28px !important; 68 | } 69 | 70 | .badge:hover { 71 | opacity: 0.9; 72 | } 73 | 74 | .badge img { 75 | height: 100%; 76 | border-radius: 4px; 77 | } 78 | -------------------------------------------------------------------------------- /web/styles/base.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --bg-color: #0f172a; 3 | --fg-color: #6366f1; 4 | --surface: rgba(30, 41, 59, 1); 5 | --text: #f8fafc; 6 | --text-light: #cbd5e1; 7 | --border: rgba(148, 163, 184, 0.2); 8 | --error: #ef4444; 9 | --success: #22c55e; 10 | --font-family: 'Inter', system-ui, sans-serif; 11 | } 12 | 13 | html { 14 | width: 100%; 15 | height: 100%; 16 | overflow-x: hidden; 17 | } 18 | 19 | * { 20 | margin: 0; 21 | padding: 0; 22 | box-sizing: border-box; 23 | } 24 | 25 | body { 26 | font-family: var(--font-family); 27 | line-height: 1.6; 28 | color: var(--text); 29 | background: var(--bg-color); 30 | min-height: 100vh; 31 | position: relative; 32 | padding: 0; 33 | width: 100%; 34 | max-width: 100vw; 35 | overflow-x: 
hidden; 36 | } 37 | 38 | .overlay { 39 | position: fixed; 40 | inset: 0; 41 | background: 42 | radial-gradient(circle at top right, 43 | var(--fg-color) 0%, 44 | var(--bg-color) 100%); 45 | pointer-events: none; 46 | z-index: 0; 47 | } 48 | 49 | .grid-overlay { 50 | position: fixed; 51 | inset: 0; 52 | background-image: 53 | repeating-linear-gradient(0deg, 54 | rgba(255,255,255,0.03) 0px, 55 | rgba(255,255,255,0.03) 1px, 56 | transparent 1px, 57 | transparent 20px), 58 | repeating-linear-gradient(90deg, 59 | rgba(255,255,255,0.03) 0px, 60 | rgba(255,255,255,0.03) 1px, 61 | transparent 1px, 62 | transparent 20px); 63 | pointer-events: none; 64 | z-index: 0; 65 | } 66 | 67 | .container { 68 | width: 100%; 69 | max-width: min(1400px, 98vw); 70 | margin: 0 auto; 71 | display: flex; 72 | flex-direction: column; 73 | box-sizing: border-box; 74 | padding: clamp(5rem, 8vh, 7rem) clamp(0.75rem, 2vw, 2rem) 2rem; 75 | min-height: 100vh; 76 | } 77 | 78 | @media (max-width: 768px) { 79 | .container { 80 | padding-top: clamp(6rem, 10vh, 8rem); 81 | padding-left: 0.75rem; 82 | padding-right: 0.75rem; 83 | } 84 | } 85 | 86 | main { 87 | display: flex; 88 | flex-direction: column; 89 | gap: clamp(1rem, 2vh, 2rem); 90 | min-width: 0; 91 | width: 100%; 92 | position: relative; 93 | flex: 1; 94 | } 95 | 96 | .status { 97 | padding: 0.75rem 1rem; 98 | border-radius: 0.5rem; 99 | margin-bottom: 1rem; 100 | transition: all 0.3s ease; 101 | opacity: 0; 102 | font-weight: 500; 103 | text-align: center; 104 | } 105 | 106 | .status.info { 107 | background: rgba(99, 102, 241, 0.1); 108 | border: 1px solid rgba(99, 102, 241, 0.2); 109 | opacity: 1; 110 | } 111 | 112 | .status.error { 113 | background: rgba(239, 68, 68, 0.1); 114 | border: 1px solid rgba(239, 68, 68, 0.2); 115 | opacity: 1; 116 | } 117 | 118 | .status.success { 119 | background: rgba(34, 197, 94, 0.1); 120 | border: 1px solid rgba(34, 197, 94, 0.2); 121 | opacity: 1; 122 | } 123 | 
/* Three complete web-player stylesheets follow in this flattened dump:
   - header.css: .logo-container flex row; gradient-grid h1 title with layered
     text-shadow outline; decorative animated coffee .cup (with .handle and
     flex-spaced .steam pseudo-elements) driven by the `steam` and `float`
     keyframes, each running 3 iterations then holding (fill-mode: forwards).
   - layout.css: main as a two-column grid (editor + fixed 320px controls rail),
     scrollable .controls panel with thin custom scrollbars (both the
     standard scrollbar-width/color properties and the -webkit-scrollbar
     fallback), shared card styling for the control sections and
     .player-container, and a <=768px fallback collapsing to one column.
   - responsive.css: breakpoints at 1200px (tighter container/padding),
     1023px (clamp()-scaled logo cup/handle/steam), and 768px (stacked
     controls, full-width selects, compact player/download controls).
   NOTE(review): main is styled in both the base sheet above and layout.css —
   presumably layout.css loads later and wins; confirm against the page's
   <link> order. */
-------------------------------------------------------------------------------- /web/styles/header.css: -------------------------------------------------------------------------------- 1 | .logo-container { 2 | display: flex; 3 | align-items: center; 4 | gap: 0.75rem; 5 | } 6 | 7 | h1 { 8 | font-size: 1.75rem; 9 | font-weight: 700; 10 | margin: 0; 11 | line-height: 1; 12 | background: linear-gradient(rgba(255,255,255,0.1) 1px, transparent 1px), 13 | linear-gradient(90deg, rgba(255,255,255,0.1) 1px, transparent 1px); 14 | background-size: 5px 5px; 15 | -webkit-background-clip: text; 16 | background-clip: text; 17 | color: var(--text); 18 | text-shadow: 19 | -1px -1px 0 rgba(0,0,0,0.5), 20 | 1px -1px 0 rgba(0,0,0,0.5), 21 | -1px 1px 0 rgba(0,0,0,0.5), 22 | 1px 1px 0 rgba(0,0,0,0.5), 23 | 2px 2px var(--fg-color); 24 | } 25 | 26 | @media (max-width: 768px) { 27 | .logo-container { 28 | gap: 0.5rem; 29 | } 30 | 31 | h1 { 32 | font-size: 1.5rem; 33 | } 34 | } 35 | 36 | .cup { 37 | width: 16px; 38 | height: 20px; 39 | border: 2px solid var(--text); 40 | border-radius: 0 0 8px 8px; 41 | position: relative; 42 | animation: float 3s ease-in-out; 43 | animation-iteration-count: 3; 44 | animation-fill-mode: forwards; 45 | } 46 | 47 | .handle { 48 | width: 6px; 49 | height: 10px; 50 | border: 2px solid var(--text); 51 | border-radius: 0 4px 4px 0; 52 | position: absolute; 53 | right: -6px; 54 | top: 4px; 55 | } 56 | 57 | .steam { 58 | position: absolute; 59 | top: -6px; 60 | left: 2px; 61 | right: 2px; 62 | height: 6px; 63 | display: flex; 64 | justify-content: space-between; 65 | } 66 | 67 | .steam::before, 68 | .steam::after { 69 | content: ""; 70 | width: 3px; 71 | height: 100%; 72 | background: rgba(255,255,255,0.7); 73 | border-radius: 3px; 74 | animation: steam 2s; 75 | animation-iteration-count: 3; 76 | animation-fill-mode: forwards; 77 | } 78 | 79 | @keyframes steam { 80 | to { 81 | transform: translateY(-6px) scale(1.3); 82 | opacity: 0; 83 | } 84 | } 85 | 86 | 
@keyframes float { 87 | 50% { 88 | transform: translateY(-2px); 89 | } 90 | 100% { 91 | transform: translateY(0); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /web/styles/layout.css: -------------------------------------------------------------------------------- 1 | /* Main Layout */ 2 | main { 3 | display: grid; 4 | grid-template-columns: 1fr 320px; 5 | gap: 1rem; 6 | width: 80%; 7 | margin: 0 auto; 8 | min-width: 0; 9 | height: calc(100vh - 8rem); 10 | } 11 | 12 | /* Main Column */ 13 | .main-column { 14 | display: flex; 15 | flex-direction: column; 16 | gap: 1rem; 17 | min-height: min-content; 18 | height: auto; 19 | overflow-y: auto; 20 | } 21 | 22 | /* Text Editor Container */ 23 | .text-editor { 24 | min-height: 400px; 25 | height: auto; 26 | overflow: auto; 27 | background: rgba(15, 23, 42, 0.3); 28 | border: 1px solid var(--border); 29 | border-radius: 0.5rem; 30 | padding: 0.75rem; 31 | } 32 | 33 | /* Controls Panel */ 34 | .controls { 35 | display: flex; 36 | flex-direction: column; 37 | gap: 1rem; 38 | width: 100%; 39 | height: 100%; 40 | overflow-y: auto; 41 | scrollbar-width: thin; 42 | scrollbar-color: rgba(99, 102, 241, 0.2) transparent; 43 | } 44 | 45 | .controls::-webkit-scrollbar { 46 | width: 6px; 47 | } 48 | 49 | .controls::-webkit-scrollbar-track { 50 | background: transparent; 51 | } 52 | 53 | .controls::-webkit-scrollbar-thumb { 54 | background-color: rgba(99, 102, 241, 0.2); 55 | border-radius: 3px; 56 | } 57 | 58 | /* Controls Sections */ 59 | .voice-select-container, 60 | .speed-control, 61 | .button-group { 62 | width: 100%; 63 | background: rgba(15, 23, 42, 0.3); 64 | border: 1px solid var(--border); 65 | border-radius: 0.5rem; 66 | padding: 0.75rem; 67 | } 68 | 69 | /* Player Container */ 70 | .player-container { 71 | background: rgba(15, 23, 42, 0.3); 72 | border: 1px solid var(--border); 73 | border-radius: 0.5rem; 74 | padding: 0.75rem; 75 | } 76 | 77 | /* Responsive Layout */ 
78 | @media (max-width: 768px) { 79 | main { 80 | grid-template-columns: 1fr; 81 | gap: 0.5rem; 82 | width: 95%; 83 | height: auto; 84 | } 85 | 86 | .text-editor { 87 | min-height: 300px; 88 | } 89 | 90 | .controls { 91 | max-height: none; 92 | overflow: visible; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /web/styles/responsive.css: -------------------------------------------------------------------------------- 1 | @media (max-width: 1200px) { 2 | .container { 3 | max-width: 100%; 4 | } 5 | 6 | main { 7 | gap: 1rem; 8 | } 9 | 10 | .text-editor, 11 | .controls { 12 | padding: 1rem; 13 | } 14 | } 15 | 16 | @media (max-width: 1023px) { 17 | h1 { 18 | font-size: clamp(1.5rem, 4vw, 2rem); 19 | } 20 | 21 | .cup { 22 | width: clamp(20px, 3vw, 30px); 23 | height: clamp(25px, 4vw, 40px); 24 | } 25 | 26 | .handle { 27 | width: clamp(8px, 1.5vw, 12px); 28 | height: clamp(15px, 2.5vw, 20px); 29 | right: clamp(-8px, -1.5vw, -12px); 30 | top: clamp(6px, 1vw, 8px); 31 | } 32 | 33 | .steam { 34 | top: clamp(-8px, -1.5vw, -12px); 35 | } 36 | 37 | .steam::before, 38 | .steam::after { 39 | width: clamp(4px, 0.75vw, 6px); 40 | } 41 | } 42 | 43 | @media (max-width: 768px) { 44 | .container { 45 | padding-left: 0.5rem; 46 | padding-right: 0.5rem; 47 | } 48 | 49 | .text-editor, 50 | .controls { 51 | padding: 0.75rem; 52 | } 53 | 54 | .voice-select-container { 55 | flex-direction: column; 56 | align-items: stretch; 57 | } 58 | 59 | .options { 60 | flex-direction: column; 61 | gap: 0.75rem; 62 | } 63 | 64 | .button-group { 65 | flex-direction: column; 66 | } 67 | 68 | .generation-options { 69 | flex-direction: column; 70 | align-items: stretch; 71 | gap: 0.5rem; 72 | } 73 | 74 | .format-select { 75 | width: 100%; 76 | } 77 | 78 | .player-container { 79 | padding: 0.75rem; 80 | } 81 | 82 | .player-controls { 83 | padding: 0.5rem; 84 | gap: 0.5rem; 85 | } 86 | 87 | .volume-control { 88 | gap: 0.25rem; 89 | } 90 | 91 | .volume-slider { 
92 | width: 60px; 93 | } 94 | 95 | .wave-container { 96 | height: 32px; 97 | } 98 | 99 | .download-button { 100 | top: 0.5rem; 101 | right: 0.5rem; 102 | width: 26px; 103 | height: 26px; 104 | } 105 | 106 | .download-icon { 107 | width: 26px; 108 | height: 26px; 109 | } 110 | } 111 | --------------------------------------------------------------------------------