├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.yaml │ └── feature_request.yaml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── check.yml │ ├── fetch_voices.yml │ ├── publish-to-pypi.yml │ └── test.yml ├── .gitignore ├── .python-version ├── .vscode ├── extensions.json └── settings.json ├── BUILDING.md ├── CONTRIBUTE.md ├── LICENSE ├── README.md ├── examples ├── app.py ├── chinese.py ├── english.py ├── french.py ├── hindi.py ├── italian.py ├── japanse.py ├── play.py ├── podcast.py ├── portuguese.py ├── save.py ├── spanish.py ├── with_blending.py ├── with_espeak_data.py ├── with_espeak_lib.py ├── with_gpu.py ├── with_log.py ├── with_phonemes.py ├── with_provider.py ├── with_quant.py ├── with_session.py ├── with_stream.py ├── with_stream_save.py └── with_voice.py ├── pyproject.toml ├── scripts ├── export.py └── fetch_voices.py ├── src └── kokoro_onnx │ ├── __init__.py │ ├── config.json │ ├── config.py │ ├── log.py │ ├── py.typed │ ├── tokenizer.py │ └── trim.py └── uv.lock /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [thewh1teagle] 4 | ko_fi: thewh1teagle 5 | patreon: # Replace with a single Patreon username 6 | open_collective: # Replace with a single Open Collective username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | buy_me_a_coffee: # Replace with a single Buy Me a Coffee username 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | title: "[Title here. keep it short]" 4 | labels: ["bug"] 5 | assignees: 6 | - octocat 7 | body: 8 | - type: markdown 9 | attributes: 10 | value: | 11 | Thanks for taking the time to fill out this bug report! 12 | One second before you create, search if it's already reported [issues](https://github.com/thewh1teagle/kokoro-onnx/issues?q=is:issue+label:bug+) 13 | 14 | - type: textarea 15 | id: what-happened 16 | attributes: 17 | label: What happened? 18 | description: Also tell us, what did you expect to happen? 19 | placeholder: Tell us what you see! 20 | value: "A bug happened!" 21 | validations: 22 | required: true 23 | - type: textarea 24 | id: steps-to-reproduce 25 | attributes: 26 | label: Steps to reproduce 27 | description: Also tell us, what did you expect to happen? 28 | placeholder: Tell us what you see! adding code example won't hurt. 29 | value: | 30 | 1. step one... 31 | 2. step two... 32 | validations: 33 | required: true 34 | - type: dropdown 35 | id: platforms 36 | attributes: 37 | label: What OS are you seeing the problem on? 38 | multiple: true 39 | options: 40 | - Window 41 | - Linux 42 | - MacOS 43 | - type: input 44 | id: version 45 | attributes: 46 | label: Package version 47 | description: | 48 | Run `uv pip show kokoro-onnx` (omit uv if needed) 49 | placeholder: eg. 
0.4.0 50 | - type: textarea 51 | id: logs 52 | attributes: 53 | label: Relevant log output 54 | description: | 55 | Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 56 | 57 | You can enable logs by setting the LOG_LEVEL=DEBUG environment variable. 58 | 59 | Example (Linux/macOS): 60 | LOG_LEVEL=DEBUG python main.py 61 | 62 | Example (PowerShell): 63 | $env:LOG_LEVEL="DEBUG"; python main.py 64 | 65 | render: shell 66 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | title: '[Title here. keep it short]' 4 | labels: ['feature'] 5 | assignees: 6 | - thewh1teagle 7 | body: 8 | - type: markdown 9 | attributes: 10 | value: | 11 | 💚💜 Thank you for your interest. ❤️💛 12 | *Please check the existing [feature requests](https://github.com/thewh1teagle/kokoro-onnx/issues?q=is:issue+label:feature+) first*. 13 | I will repay with higher-quality code. 14 | 15 | - type: textarea 16 | id: describe-the-feature 17 | attributes: 18 | label: Describe the feature 19 | description: Also tell us why you think it would be useful 20 | placeholder: Description... 21 | validations: 22 | required: true 23 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Checklist 2 | 3 | - [ ] Discussed the idea, or confident it's critical (see CONTRIBUTE.md). 4 | - [ ] One feature/bug per PR (unless minor/related). 5 | - [ ] PR is from a **feature branch**, not `main`. 6 | - [ ] I ran at least one example to ensure the code works. 7 | - [ ] Checked linting/formatting (`uv run ruff format && uv run ruff check`). 8 | 9 | --- 10 | 11 | ## Description 12 | 13 | 14 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | # This Action uses minimal steps to run in ~5 seconds to rapidly: 2 | # look for typos in the codebase using codespell, and 3 | # lint Python code using ruff and provide intuitive GitHub Annotations to contributors. 
4 | name: ci 5 | on: 6 | push: 7 | branches: [main] 8 | pull_request: 9 | branches: [main] 10 | workflow_dispatch: 11 | jobs: 12 | ruff: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: astral-sh/ruff-action@v3 17 | - run: ruff format --diff 18 | -------------------------------------------------------------------------------- /.github/workflows/fetch_voices.yml: -------------------------------------------------------------------------------- 1 | name: fetch voices 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | fetch-voices: 8 | permissions: 9 | contents: write 10 | strategy: 11 | fail-fast: false 12 | 13 | runs-on: macos-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Install the latest version of uv 17 | uses: astral-sh/setup-uv@v5 18 | 19 | - name: fetch voices 20 | run: | 21 | uv run scripts/fetch_voices.py 22 | latestTag=$(gh release list --json isPrerelease,tagName --jq 'map(select(.isPrerelease)) | first | .tagName') 23 | gh release upload $latestTag voices-v1.0.bin --clobber 24 | env: 25 | GH_TOKEN: ${{ github.token }} 26 | 27 | shell: bash 28 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | on: 3 | workflow_dispatch: 4 | 5 | jobs: 6 | publish: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: astral-sh/setup-uv@v5 11 | - name: Test 12 | env: 13 | UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }} 14 | run: | 15 | uv sync 16 | uv build 17 | uv publish 18 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | test: 8 | permissions: 9 | contents: write 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | include: 14 | - platform: "macos-14" # for Arm based macs (M1 and above). 15 | - platform: "macos-13" # for Intel based macs. 
16 | - platform: "ubuntu-22.04" # Ubuntu 22.04 x86_64 17 | - platform: "ubuntu-22.04-arm" # Linux ARM 18 | - platform: "windows-2022" # Windows x86_64 19 | 20 | runs-on: ${{ matrix.platform }} 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Install the latest version of uv 24 | uses: astral-sh/setup-uv@v5 25 | 26 | # https://github.com/crate-ci/typos/issues/1191 27 | - name: Install wget for Windows 28 | if: matrix.platform == 'windows-2022' 29 | run: choco install wget --no-progress 30 | 31 | - name: test 32 | run: | 33 | wget --progress=bar:force:noscroll https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.int8.onnx -O kokoro-v1.0.onnx 34 | wget --progress=bar:force:noscroll https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin -O voices-v1.0.bin 35 | uv run examples/save.py 36 | shell: bash 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # General 10 | *.pt 11 | *.onnx 12 | .DS_Store 13 | *.wav 14 | *.json 15 | !.vscode/*.json 16 | !src/kokoro_onnx/config.json 17 | espeak-ng-data/ 18 | *.tar.gz 19 | *.dylib 20 | *.so 21 | *.dll 22 | *.m4a 23 | *.npz 24 | *.bin 25 | 26 | # Virtual environments 27 | .venv 28 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "charliermarsh.ruff" 4 | ] 5 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "[python]": { 4 | "editor.defaultFormatter": "charliermarsh.ruff", 5 | "editor.codeActionsOnSave": { 6 | "source.organizeImports.ruff": "always" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /BUILDING.md: -------------------------------------------------------------------------------- 1 | # Building 2 | 3 | ## Publish new version 4 | 5 | ```console 6 | rm -rf dist 7 | uv build 8 | UV_PUBLISH_TOKEN="pypi token here" uv publish 9 | ``` 10 | 11 | ## Format and lint 12 | 13 | ```console 14 | uv run ruff format 15 | uv run ruff check 16 | ``` 17 | 18 | ## Log 19 | 20 | Enable log with 21 | 22 | ```console 23 | LOG_LEVEL=DEBUG python main.py 24 | ``` 25 | -------------------------------------------------------------------------------- /CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | # Contributing to kokoro-onnx 2 | 3 | Thanks for thinking about contributing! 🎉 4 | 5 | ## What We Focus On 6 | 7 | This repo is for the kokoro-onnx package and examples. Our focus is on improving the package, adding examples, fixing bugs, and keeping things minimal and simple. We aim to prevent unnecessary complexity and ensure the project stays straightforward. 8 | 9 | Before contributing, **please open a [new issue](https://github.com/thewh1teagle/kokoro-onnx/issues)** to discuss your idea. 
This helps make sure it's a good fit and relevant. We're here to help! 10 | 11 | ## Development Recommendations 12 | 13 | We strongly recommend using [uv](https://docs.astral.sh/uv/getting-started/installation) for development, along with the Visual Studio Code extension suggested in the repository's recommendations. 14 | 15 | Before submitting a pull request, please ensure your code meets the project's formatting and linting standards by running: 16 | 17 | ```console 18 | uv run ruff format 19 | uv run ruff check 20 | ``` 21 | 22 | If you want to use ruff for quick [safety fixes](https://docs.astral.sh/ruff/linter/#fix-safety), 23 | you can run the following command: 24 | 25 | ```console 26 | uv run ruff check --fix 27 | ``` 28 | 29 | ## Pull Request Guidelines 30 | 31 | Do not create a pull request from your main branch. This ensures we can collaborate and edit the PR if needed. 32 | Thank you for contributing and helping improve kokoro-onnx! 🚀 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 github.com/thewh1teagle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kokoro-onnx 2 | 3 | ![Python Version](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) 4 | [![PyPI Release](https://img.shields.io/pypi/v/kokoro-onnx.svg)](https://pypi.org/project/kokoro-onnx/) 5 | [![Github Model Releases](https://img.shields.io/github/v/release/thewh1teagle/kokoro-onnx)](https://github.com/thewh1teagle/kokoro-onnx/releases) 6 | [![License](https://img.shields.io/github/license/thewh1teagle/kokoro-onnx)](https://github.com/thewh1teagle/kokoro-onnx/blob/main/LICENSE) 7 | [![GitHub stars](https://img.shields.io/github/stars/thewh1teagle/kokoro-onnx?style=social)](https://github.com/thewh1teagle/kokoro-onnx/stargazers) 8 | [![PyPI Downloads](https://img.shields.io/pypi/dm/kokoro-onnx?style=plastic)](https://pypi.org/project/kokoro-onnx/) 9 | 10 | [![ONNX Runtime](https://img.shields.io/badge/ONNX%20Runtime-%E2%89%A51.20.1-blue)](https://github.com/microsoft/onnxruntime) 11 | ![CPU](https://img.shields.io/badge/CPU-supported-brightgreen) 12 | ![GPU](https://img.shields.io/badge/GPU-supported-brightgreen) 13 | 14 | TTS with ONNX Runtime, based on [Kokoro-TTS](https://huggingface.co/spaces/hexgrad/Kokoro-TTS) 15 | 16 | 🚀 Version 1.0 models are out now! 🎉 17 | 18 | 19 | 20 | ## Features 21 | 22 | - Supports multiple languages 23 | - Fast, near real-time performance on macOS M1 24 | - Offers multiple voices 25 | - Lightweight: ~300MB (quantized: ~80MB) 26 | 27 | ## Setup 28 | 29 | ```console 30 | pip install -U kokoro-onnx 31 | ``` 32 | 33 | <details> 
34 | 35 | Instructions 36 | 37 | 1. Install [uv](https://docs.astral.sh/uv/getting-started/installation) for isolated Python (Recommended). 38 | 39 | Open the terminal (PowerShell / Bash) and run the install command listed on their website. 40 | 41 | _Note: you don't have to use `uv`, but it makes things much simpler. You can use regular Python as well._ 42 | 43 | 2. Create a new project folder (name it whatever you like) 44 | 3. Run in the project folder 45 | 46 | ```console 47 | uv init -p 3.12 48 | uv add kokoro-onnx soundfile 49 | ``` 50 | 51 | 4. Paste the contents of [`examples/save.py`](https://github.com/thewh1teagle/kokoro-onnx/blob/main/examples/save.py) into `hello.py` (a minimal copy is shown below for reference) 52 | 5. Download [`kokoro-v1.0.onnx`](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx) and [`voices-v1.0.bin`](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin) and place them in the same directory. 53 | 6. Run 54 | 55 | ```console 56 | uv run hello.py 57 | ``` 58 | 59 | You can edit the text in `hello.py`. 60 | 61 | That's it! `audio.wav` should be created. 62 | 63 | </details> 
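For reference, the finished `hello.py` is essentially a copy of [`examples/save.py`](https://github.com/thewh1teagle/kokoro-onnx/blob/main/examples/save.py) — a minimal sketch of it (using the `af_sarah` voice and English, as in that example) looks like this:

```python
import soundfile as sf

from kokoro_onnx import Kokoro

# Load the model and voices files downloaded in step 5 (same folder as hello.py)
kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")

# Synthesize the text with the chosen voice, speed and language
samples, sample_rate = kokoro.create(
    "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us"
)

# Write the generated samples to a wav file
sf.write("audio.wav", samples, sample_rate)
print("Created audio.wav")
```

Swap in a different voice name from the Voices section below to change the speaker.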
64 | 65 | ## Examples 66 | 67 | See [examples](examples) 68 | 69 | ## Voices 70 | 71 | See the latest voices and languages in [Kokoro-82M/VOICES.md](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) 72 | 73 | Note: It's recommend to use misaki g2p package from v1.0, see [examples](examples) 74 | 75 | ## Contribute 76 | 77 | See [CONTRIBUTE.md](CONTRIBUTE.md) 78 | 79 | ## License 80 | 81 | - kokoro-onnx: MIT 82 | - kokoro model: Apache 2.0 83 | -------------------------------------------------------------------------------- /examples/app.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.12" 3 | # dependencies = [ 4 | # "gradio>=5.13.1", 5 | # "kokoro-onnx>=0.3.8", 6 | # ] 7 | # 8 | # [tool.uv.sources] 9 | # kokoro-onnx = { path = "../" } 10 | # /// 11 | 12 | """ 13 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 14 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 15 | uv run examples/app.py 16 | """ 17 | 18 | import gradio as gr 19 | import numpy as np 20 | 21 | from kokoro_onnx import Kokoro 22 | from kokoro_onnx.tokenizer import Tokenizer 23 | 24 | tokenizer = Tokenizer() 25 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 26 | 27 | 28 | SUPPORTED_LANGUAGES = ["en-us"] 29 | 30 | 31 | def create(text: str, voice: str, language: str, blend_voice_name: str = None): 32 | phonemes = tokenizer.phonemize(text, lang=language) 33 | 34 | # Blending 35 | if blend_voice_name: 36 | first_voice = kokoro.get_voice_style(voice) 37 | second_voice = kokoro.get_voice_style(blend_voice_name) 38 | voice = np.add(first_voice * (50 / 100), second_voice * (50 / 100)) 39 | samples, sample_rate = kokoro.create( 40 | phonemes, voice=voice, speed=1.0, is_phonemes=True 41 | ) 42 | return [(sample_rate, samples), phonemes] 43 | 44 | 45 | def create_app(): 46 | with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])) as ui: 47 | text_input = gr.TextArea( 48 | label="Input Text", 49 | rtl=False, 50 | value="Kokoro TTS. Turning words into emotion, one voice at a time!", 51 | ) 52 | language_input = gr.Dropdown( 53 | label="Language", 54 | value="en-us", 55 | choices=SUPPORTED_LANGUAGES, 56 | ) 57 | voice_input = gr.Dropdown( 58 | label="Voice", value="af_sky", choices=sorted(kokoro.get_voices()) 59 | ) 60 | blend_voice_input = gr.Dropdown( 61 | label="Blend Voice (Optional)", 62 | value=None, 63 | choices=sorted(kokoro.get_voices()) + [None], 64 | ) 65 | submit_button = gr.Button("Create") 66 | phonemes_output = gr.Textbox(label="Phonemes") 67 | audio_output = gr.Audio() 68 | submit_button.click( 69 | fn=create, 70 | inputs=[text_input, voice_input, language_input, blend_voice_input], 71 | outputs=[audio_output, phonemes_output], 72 | ) 73 | return ui 74 | 75 | 76 | ui = create_app() 77 | ui.launch(debug=True) 78 | -------------------------------------------------------------------------------- /examples/chinese.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Run 9 | uv venv -p 3.12 10 | uv pip install -U kokoro-onnx soundfile 'misaki[zh]' 11 | 3. 
12 | Download these files 13 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/kokoro-v1.1-zh.onnx 14 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/voices-v1.1-zh.bin 15 | https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/raw/main/config.json 16 | 4. Run 17 | uv run main.py 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import zh 22 | 23 | from kokoro_onnx import Kokoro 24 | 25 | # Misaki G2P with espeak-ng fallback 26 | g2p = zh.ZHG2P(version="1.1") 27 | 28 | text = "千里之行,始于足下。" 29 | voice = "zf_001" 30 | kokoro = Kokoro("kokoro-v1.1-zh.onnx", "voices-v1.1-zh.bin", vocab_config="config.json") 31 | phonemes, _ = g2p(text) 32 | samples, sample_rate = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True) 33 | sf.write("audio.wav", samples, sample_rate) 34 | print("Created audio.wav") 35 | -------------------------------------------------------------------------------- /examples/english.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import en, espeak 22 | 23 | from kokoro_onnx import Kokoro 24 | 25 | # Misaki G2P with espeak-ng fallback 26 | fallback = espeak.EspeakFallback(british=False) 27 | g2p = en.G2P(trf=False, british=False, fallback=fallback) 28 | 29 | # Kokoro 30 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 31 | 32 | # Phonemize 33 | text = "[Misaki](/misˈɑki/) is a G2P engine designed for [Kokoro](/kˈOkəɹO/) models." 34 | phonemes, _ = g2p(text) 35 | 36 | # Create 37 | samples, sample_rate = kokoro.create(phonemes, "af_heart", is_phonemes=True) 38 | 39 | # Save 40 | sf.write("audio.wav", samples, sample_rate) 41 | print("Created audio.wav") 42 | -------------------------------------------------------------------------------- /examples/french.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. 
Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="fr-fr") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "On ne voit bien qu’avec le cœur. L’essentiel est invisible pour les yeux." 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "ff_siwis", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/hindi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="hi") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "सपने वो नहीं जो हम सोते समय देखते हैं, सपने वो हैं जो हमें सोने नहीं देते।" 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "hf_alpha", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/italian.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. 
Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="it") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "Non sognare la tua vita, vivi il tuo sogno." 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "im_nicola", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/japanse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Run 9 | uv venv -p 3.12 10 | uv pip install -U kokoro-onnx soundfile 'misaki[ja]' 11 | 3. 12 | Download these files 13 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/kokoro-v1.1-zh.onnx 14 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/voices-v1.1-zh.bin 15 | https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/raw/main/config.json 16 | 4. Run 17 | uv run main.py 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import ja 22 | 23 | from kokoro_onnx import Kokoro 24 | 25 | # Misaki G2P with espeak-ng fallback 26 | g2p = ja.JAG2P() 27 | 28 | text = "「人生を夢見るな。夢を生きろ。」" 29 | voice = "jf_alpha" 30 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin", vocab_config="config.json") 31 | phonemes, _ = g2p(text) 32 | samples, sample_rate = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True) 33 | sf.write("audio.wav", samples, sample_rate) 34 | print("Created audio.wav") 35 | -------------------------------------------------------------------------------- /examples/play.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | python examples/play.py 9 | """ 10 | 11 | import sounddevice as sd 12 | 13 | from kokoro_onnx import Kokoro 14 | 15 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 16 | samples, sample_rate = kokoro.create( 17 | "Hello. 
This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 18 | ) 19 | print("Playing audio...") 20 | sd.play(samples, sample_rate) 21 | sd.wait() 22 | -------------------------------------------------------------------------------- /examples/podcast.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/podcast.py 7 | """ 8 | 9 | import random 10 | 11 | import numpy as np 12 | import soundfile as sf 13 | 14 | from kokoro_onnx import Kokoro 15 | 16 | # fmt: off 17 | sentences = [ 18 | { "voice": "af_sarah", "text": "Hello and welcome to the podcast! We’ve got some exciting things lined up today." }, # Sarah 19 | { "voice": "am_michael", "text": "It’s going to be an exciting episode. Stick with us!" }, # Michael 20 | { "voice": "af_sarah", "text": "But first, we’ve got a special guest with us. Please welcome Nicole!" }, # Sarah 21 | { "voice": "af_sarah", "text": "Now, we’ve been told Nicole has a very unique way of speaking today... a bit of a mysterious vibe, if you will." }, # Sarah 22 | { "voice": "af_nicole", "text": "Hey there... I’m so excited to be a guest today... But I thought I’d keep it quiet... for now..." }, # Nicole whispers 23 | { "voice": "am_michael", "text": "Well, it certainly adds some intrigue! Let’s dive in and see what that’s all about." }, # Sarah 24 | { "voice": "af_sarah", "text": "Today, we’re covering something that’s close to our hearts" }, # Sarah 25 | { "voice": "am_michael", "text": "It’s going to be a good one!" } # Michael 26 | ] 27 | 28 | def random_pause(min_duration=0.5, max_duration=2.0): 29 | silence_duration = random.uniform(min_duration, max_duration) 30 | silence = np.zeros(int(silence_duration * sample_rate)) 31 | return silence 32 | 33 | 34 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 35 | 36 | audio = [] 37 | 38 | for sentence in sentences: 39 | voice = sentence["voice"] 40 | text = sentence["text"] 41 | print(f"Creating audio with {voice}: {text}") 42 | 43 | samples, sample_rate = kokoro.create( 44 | text, 45 | voice=voice, 46 | speed=1.0, 47 | lang="en-us", 48 | ) 49 | audio.append(samples) 50 | # Add random silence after each sentence 51 | audio.append(random_pause()) 52 | 53 | # Concatenate all audio parts 54 | audio = np.concatenate(audio) 55 | 56 | # Save the generated audio to file 57 | sf.write("podcast.wav", audio, sample_rate) 58 | print("Created podcast.wav") 59 | -------------------------------------------------------------------------------- /examples/portuguese.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. 
Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="pt-br") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "Não sonhe sua vida, viva seu sonho." 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "pf_dora", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/save.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/save.py 7 | """ 8 | 9 | import soundfile as sf 10 | 11 | from kokoro_onnx import Kokoro 12 | 13 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 14 | samples, sample_rate = kokoro.create( 15 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 16 | ) 17 | sf.write("audio.wav", samples, sample_rate) 18 | print("Created audio.wav") 19 | -------------------------------------------------------------------------------- /examples/spanish.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="es") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "No cuentes los días, haz que los días cuenten." 
35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "im_nicola", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/with_blending.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_blending.py 7 | """ 8 | 9 | import numpy as np 10 | import soundfile as sf 11 | 12 | from kokoro_onnx import Kokoro 13 | 14 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 15 | nicole: np.ndarray = kokoro.get_voice_style("af_nicole") 16 | michael: np.ndarray = kokoro.get_voice_style("am_michael") 17 | blend = np.add(nicole * (50 / 100), michael * (50 / 100)) 18 | samples, sample_rate = kokoro.create( 19 | "Hello. This audio is generated by Kokoro!", 20 | voice=blend, 21 | speed=1.0, 22 | lang="en-us", 23 | ) 24 | sf.write("audio.wav", samples, sample_rate) 25 | print("Created audio.wav") 26 | -------------------------------------------------------------------------------- /examples/with_espeak_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_espeak_data.py 7 | """ 8 | 9 | import soundfile as sf 10 | 11 | from kokoro_onnx import EspeakConfig, Kokoro 12 | 13 | kokoro = Kokoro( 14 | "kokoro-v1.0.onnx", 15 | "voices-v1.0.bin", 16 | espeak_config=EspeakConfig(data_path="./espeak-ng-data"), 17 | ) 18 | samples, sample_rate = kokoro.create( 19 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 20 | ) 21 | sf.write("audio.wav", samples, sample_rate) 22 | print("Created audio.wav") 23 | -------------------------------------------------------------------------------- /examples/with_espeak_lib.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | 9 | PHONEMIZER_ESPEAK_LIBRARY="/usr/local/Cellar/espeak-ng/1.52.0/lib/libespeak-ng.1.dylib" python examples/with_espeak_lib.py 10 | """ 11 | 12 | import os 13 | 14 | import sounddevice as sd 15 | 16 | from kokoro_onnx import EspeakConfig, Kokoro 17 | 18 | kokoro = Kokoro( 19 | "kokoro-v1.0.onnx", 20 | "voices-v1.0.bin", 21 | espeak_config=EspeakConfig(lib_path=os.getenv("PHONEMIZER_ESPEAK_LIBRARY")), 22 | ) 23 | samples, sample_rate = kokoro.create( 24 | "Hello. 
This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 25 | ) 26 | print("Playing audio...") 27 | sd.play(samples, sample_rate) 28 | sd.wait() 29 | -------------------------------------------------------------------------------- /examples/with_gpu.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: 3 | On Linux you need to run this as well: apt-get install portaudio19-dev 4 | gpu version is sufficient only for Linux and Windows. macOS works with GPU by default. 5 | You can see the used execution provider by enable debug log. see with_log.py 6 | 7 | Setup: 8 | pip install -U kokoro-onnx[gpu] sounddevice 9 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 12 | Run: 13 | python examples/play.py 14 | """ 15 | 16 | import sounddevice as sd 17 | 18 | from kokoro_onnx import Kokoro 19 | 20 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 21 | samples, sample_rate = kokoro.create( 22 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 23 | ) 24 | print("Playing audio...") 25 | sd.play(samples, sample_rate) 26 | sd.wait() 27 | -------------------------------------------------------------------------------- /examples/with_log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | python examples/with_log.py 9 | """ 10 | 11 | import logging 12 | 13 | import sounddevice as sd 14 | 15 | import kokoro_onnx 16 | from kokoro_onnx import Kokoro 17 | 18 | # You can set the environment variable LOG_LEVEL 19 | # Linux: export LOG_LEVEL=DEBUG 20 | # Windows: $env:LOG_LEVEL="DEBUG" 21 | 22 | # Or programmatically 23 | logging.getLogger(kokoro_onnx.__name__).setLevel("DEBUG") 24 | 25 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 26 | samples, sample_rate = kokoro.create( 27 | "Hello. 
This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 28 | ) 29 | print("Playing audio...") 30 | sd.play(samples, sample_rate) 31 | sd.wait() 32 | -------------------------------------------------------------------------------- /examples/with_phonemes.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx sounddevice 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_phonemes.py 7 | """ 8 | 9 | import sounddevice as sd 10 | 11 | from kokoro_onnx import Kokoro 12 | from kokoro_onnx.tokenizer import Tokenizer 13 | 14 | tokenizer = Tokenizer() 15 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 16 | 17 | phonemes = tokenizer.phonemize("Hello world!") 18 | samples, sample_rate = kokoro.create(phonemes, voice="af_heart", is_phonemes=True) 19 | print("Playing audio...") 20 | sd.play(samples, sample_rate) 21 | sd.wait() 22 | -------------------------------------------------------------------------------- /examples/with_provider.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use a specific ONNX execution provider with Kokoro. 3 | 4 | For available providers, see: 5 | https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377 6 | 7 | Setup: 8 | 1. Install dependencies: pip install -U kokoro-onnx soundfile 9 | 2. Download model and voices: 10 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 11 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 12 | 13 | Run: 14 | macOS/Linux: ONNX_PROVIDER="CPUExecutionProvider" python examples/with_provider.py 15 | Windows PowerShell: $env:ONNX_PROVIDER="CPUExecutionProvider" ; python examples/with_provider.py 16 | """ 17 | 18 | import soundfile as sf 19 | 20 | from kokoro_onnx import Kokoro 21 | 22 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 23 | samples, sample_rate = kokoro.create( 24 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 25 | ) 26 | sf.write("audio.wav", samples, sample_rate) 27 | print("Created audio.wav") 28 | -------------------------------------------------------------------------------- /examples/with_quant.py: -------------------------------------------------------------------------------- 1 | """ 2 | **Smaller models should have lower quality but show no significant quality loss in checks. 3 | 4 | Usage: 5 | 6 | 1. Install dependencies: 7 | sudo apt-get install portaudio19-dev 8 | pip install -U kokoro-onnx sounddevice 9 | 2. Download a model (choose one): 10 | - INT8 (88MB): 11 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.int8.onnx 12 | - FP16 (169MB): 13 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.fp16.onnx 14 | 3. Download voices-v1.0.bin: 15 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 16 | 4. 
Run example: 17 | python examples/with_quant.py 18 | """ 19 | 20 | import sys 21 | 22 | import sounddevice as sd 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | kokoro = Kokoro(sys.argv[1], "voices-v1.0.bin") 27 | samples, sample_rate = kokoro.create( 28 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 29 | ) 30 | print("Playing audio...") 31 | sd.play(samples, sample_rate) 32 | sd.wait() 33 | -------------------------------------------------------------------------------- /examples/with_session.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_session.py 7 | """ 8 | 9 | import os 10 | 11 | import onnxruntime 12 | import soundfile as sf 13 | from onnxruntime import InferenceSession 14 | 15 | from kokoro_onnx import Kokoro 16 | 17 | 18 | def create_session(): 19 | # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377 20 | providers = onnxruntime.get_available_providers() 21 | print(f"Available onnx runtime providers: {providers}") 22 | 23 | # See session options https://onnxruntime.ai/docs/performance/tune-performance/threading.html#thread-management 24 | sess_options = onnxruntime.SessionOptions() 25 | # Set threads to num of CPU cores 26 | cpu_count = os.cpu_count() 27 | print(f"Setting threads to CPU cores count: {cpu_count}") 28 | sess_options.intra_op_num_threads = cpu_count 29 | session = InferenceSession( 30 | "kokoro-v1.0.onnx", providers=providers, sess_options=sess_options 31 | ) 32 | return session 33 | 34 | 35 | session = create_session() 36 | kokoro = Kokoro.from_session(session, "voices-v1.0.bin") 37 | samples, sample_rate = kokoro.create( 38 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 39 | ) 40 | sf.write("audio.wav", samples, sample_rate) 41 | print("Created audio.wav") 42 | -------------------------------------------------------------------------------- /examples/with_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | python examples/with_stream.py 9 | """ 10 | 11 | import asyncio 12 | 13 | import sounddevice as sd 14 | 15 | from kokoro_onnx import Kokoro 16 | 17 | text = """ 18 | We've just been hearing from Matthew Cappucci, a senior meteorologist at the weather app MyRadar, who says Kansas City is seeing its heaviest snow in 32 years - with more than a foot (30 to 40cm) having come down so far. 19 | 20 | Despite it looking as though the storm is slowly moving eastwards, Cappucci says the situation in Kansas and Missouri remains serious. 21 | 22 | He says some areas near the Ohio River are like "skating rinks", telling our colleagues on Newsday that in Missouri in particular there is concern about how many people have lost power, and will lose power, creating enough ice to pull power lines down. 
23 | 24 | Temperatures are set to drop in the next several days, in may cases dipping maybe below minus 10 to minus 15 degrees Celsius for an extended period of time. 25 | 26 | There is a special alert for Kansas, urging people not to leave their homes: "The ploughs are getting stuck, the police are getting stuck, everybody’s getting stuck - stay home." 27 | """ 28 | 29 | 30 | async def main(): 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | stream = kokoro.create_stream( 34 | text, 35 | voice="af_nicole", 36 | speed=1.0, 37 | lang="en-us", 38 | ) 39 | 40 | count = 0 41 | async for samples, sample_rate in stream: 42 | count += 1 43 | print(f"Playing audio stream ({count})...") 44 | sd.play(samples, sample_rate) 45 | sd.wait() 46 | 47 | 48 | asyncio.run(main()) 49 | -------------------------------------------------------------------------------- /examples/with_stream_save.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_stream_save.py 7 | """ 8 | 9 | import asyncio 10 | 11 | import soundfile as sf 12 | 13 | from kokoro_onnx import SAMPLE_RATE, Kokoro 14 | 15 | text = """ 16 | We've just been hearing from Matthew Cappucci, a senior meteorologist at the weather app MyRadar, who says Kansas City is seeing its heaviest snow in 32 years - with more than a foot (30 to 40cm) having come down so far. 17 | 18 | Despite it looking as though the storm is slowly moving eastwards, Cappucci says the situation in Kansas and Missouri remains serious. 19 | 20 | He says some areas near the Ohio River are like "skating rinks", telling our colleagues on Newsday that in Missouri in particular there is concern about how many people have lost power, and will lose power, creating enough ice to pull power lines down. 21 | 22 | Temperatures are set to drop in the next several days, in may cases dipping maybe below minus 10 to minus 15 degrees Celsius for an extended period of time. 23 | 24 | There is a special alert for Kansas, urging people not to leave their homes: "The ploughs are getting stuck, the police are getting stuck, everybody’s getting stuck - stay home." 
25 | """ 26 | 27 | 28 | async def main(): 29 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 30 | 31 | stream = kokoro.create_stream( 32 | text, 33 | voice="af_nicole", 34 | speed=1.0, 35 | lang="en-us", 36 | ) 37 | 38 | with sf.SoundFile("audio.wav", mode="w", samplerate=SAMPLE_RATE, channels=1) as f: 39 | count = 0 40 | async for samples, sample_rate in stream: 41 | count += 1 42 | print(f"Writing chunk {count} of audio stream...") 43 | f.write(samples) 44 | 45 | 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /examples/with_voice.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_voice.py 7 | """ 8 | 9 | import numpy as np 10 | import soundfile as sf 11 | 12 | from kokoro_onnx import Kokoro 13 | from kokoro_onnx.config import SAMPLE_RATE 14 | 15 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 16 | created = [] 17 | 18 | for voice in kokoro.get_voices(): 19 | samples, sample_rate = kokoro.create( 20 | f"Hello! This audio generated by {voice}!", voice=voice, speed=1.0 21 | ) 22 | created.append(samples) 23 | print(f"Generated audio for {voice}") 24 | 25 | audio = np.concatenate(created) 26 | 27 | sf.write("voices.wav", audio, SAMPLE_RATE) 28 | print("Created voices.wav") 29 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "kokoro-onnx" 3 | version = "0.4.9" 4 | description = "TTS with kokoro and onnx runtime" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "thewh1teagle", email = "61390950+thewh1teagle@users.noreply.github.com" }, 8 | ] 9 | requires-python = ">=3.10,<3.14" 10 | dependencies = [ 11 | "onnxruntime>=1.20.1", 12 | "colorlog>=6.9.0", 13 | "espeakng-loader>=0.2.4", 14 | "phonemizer-fork>=3.3.2", 15 | "numpy>=2.0.2", 16 | ] 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/thewh1teagle/kokoro-onnx" 20 | Repository = "https://github.com/thewh1teagle/kokoro-onnx" 21 | Issues = "https://github.com/thewh1teagle/kokoro-onnx/issues" 22 | 23 | 24 | [project.optional-dependencies] 25 | # Windows/Linux GPU feature 26 | # Install with kokoro-onnx[gpu] 27 | gpu = [ 28 | # onnxruntime-gpu is not available on Linux ARM or macOS 29 | "onnxruntime-gpu>=1.20.1; platform_machine == 'x86_64' and sys_platform != 'darwin'", 30 | ] 31 | 32 | [build-system] 33 | requires = ["hatchling"] 34 | build-backend = "hatchling.build" 35 | 36 | [dependency-groups] 37 | dev = ["ruff>=0.11.0", "sounddevice>=0.5.1", "soundfile>=0.13.0"] 38 | 39 | [tool.ruff] 40 | required-version = ">=0.9.0" 41 | output-format = "concise" 42 | show-fixes = true 43 | 44 | [tool.ruff.lint] 45 | extend-select = ["I", "UP"] 46 | 47 | [tool.ruff.lint.isort] 48 | split-on-trailing-comma = false 49 | -------------------------------------------------------------------------------- /scripts/export.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.12" 3 | # dependencies = [ 4 | # "kokoro==0.8.4", 5 | # "onnx==1.17.0", 6 | # "onnxruntime==1.20.1", 7 | # "sounddevice==0.5.1", 8 | # ] 9 | # 10 | # 
/// 11 | 12 | """ 13 | From https://github.com/hexgrad/kokoro/blob/3f9dd88d6f739b98a86aea608e238621f5b40add/examples/export.py 14 | 15 | mkdir checkpoints 16 | wget https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json -O checkpoints/config.json 17 | wget https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/kokoro-v1_1-zh.pth -O checkpoints/kokoro-v1_1-zh.pth 18 | uv run examples/export.py 19 | uv run examples/export.py --config_file checkpoints/config.json --checkpoint_path checkpoints/kokoro-v1_1-zh.pth 20 | """ 21 | 22 | import argparse 23 | import os 24 | 25 | import onnx 26 | import onnxruntime as ort 27 | import sounddevice as sd 28 | import torch 29 | from kokoro import KModel, KPipeline 30 | from kokoro.model import KModelForONNX 31 | 32 | 33 | def export_onnx(model, output): 34 | onnx_file = output + "/" + "kokoro.onnx" 35 | 36 | input_ids = torch.randint(1, 100, (48,)).numpy() 37 | input_ids = torch.LongTensor([[0, *input_ids, 0]]) 38 | style = torch.randn(1, 256) 39 | speed = torch.randint(1, 10, (1,)).int() 40 | 41 | torch.onnx.export( 42 | model, 43 | args=(input_ids, style, speed), 44 | f=onnx_file, 45 | export_params=True, 46 | verbose=True, 47 | input_names=["input_ids", "style", "speed"], 48 | output_names=["waveform", "duration"], 49 | opset_version=17, 50 | dynamic_axes={ 51 | "input_ids": {1: "input_ids_len"}, 52 | "waveform": {0: "num_samples"}, 53 | }, 54 | do_constant_folding=True, 55 | ) 56 | 57 | print("export kokoro.onnx ok!") 58 | 59 | onnx_model = onnx.load(onnx_file) 60 | onnx.checker.check_model(onnx_model) 61 | print("onnx check ok!") 62 | 63 | 64 | def load_input_ids(pipeline, text): 65 | if pipeline.lang_code in "ab": 66 | _, tokens = pipeline.g2p(text) 67 | for gs, ps, tks in pipeline.en_tokenize(tokens): 68 | if not ps: 69 | continue 70 | else: 71 | ps, _ = pipeline.g2p(text) 72 | 73 | if len(ps) > 510: 74 | ps = ps[:510] 75 | 76 | input_ids = list( 77 | filter(lambda i: i is not None, map(lambda p: pipeline.model.vocab.get(p), ps)) 78 | ) 79 | print(f"text: {text} -> phonemes: {ps} -> input_ids: {input_ids}") 80 | input_ids = torch.LongTensor([[0, *input_ids, 0]]).to(pipeline.model.device) 81 | return ps, input_ids 82 | 83 | 84 | def load_voice(pipeline, voice, phonemes): 85 | pack = pipeline.load_voice(voice).to("cpu") 86 | return pack[len(phonemes) - 1] 87 | 88 | 89 | def load_sample(model): 90 | pipeline = KPipeline(lang_code="a", model=model.kmodel, device="cpu") 91 | text = """ 92 | In today's fast-paced tech world, building software applications has never been easier — thanks to AI-powered coding assistants.' 93 | """ 94 | text = """ 95 | The sky above the port was the color of television, tuned to a dead channel. 
96 | """ 97 | voice = "checkpoints/voices/af_heart.pt" 98 | 99 | pipeline = KPipeline(lang_code="z", model=model.kmodel, device="cpu") 100 | text = """ 101 | 2月15日晚,猫眼专业版数据显示,截至发稿,《哪吒之魔童闹海》(或称《哪吒2》)今日票房已达7.8亿元,累计票房(含预售)超过114亿元。 102 | """ 103 | voice = "checkpoints/voices/zf_xiaoxiao.pt" 104 | 105 | phonemes, input_ids = load_input_ids(pipeline, text) 106 | style = load_voice(pipeline, voice, phonemes) 107 | speed = torch.IntTensor([1]) 108 | 109 | return input_ids, style, speed 110 | 111 | 112 | def inference_onnx(model, output): 113 | onnx_file = output + "/" + "kokoro.onnx" 114 | session = ort.InferenceSession(onnx_file) 115 | 116 | input_ids, style, speed = load_sample(model) 117 | 118 | outputs = session.run( 119 | None, 120 | { 121 | "input_ids": input_ids.numpy(), 122 | "style": style.numpy(), 123 | "speed": speed.numpy(), 124 | }, 125 | ) 126 | 127 | output = torch.from_numpy(outputs[0]) 128 | print(f"output: {output.shape}") 129 | print(output) 130 | 131 | audio = output.numpy() 132 | sd.play(audio, 24000) 133 | sd.wait() 134 | 135 | 136 | def check_model(model): 137 | input_ids, style, speed = load_sample(model) 138 | output, duration = model(input_ids, style, speed) 139 | 140 | print(f"output: {output.shape}") 141 | print(f"duration: {duration.shape}") 142 | print(output) 143 | 144 | audio = output.numpy() 145 | sd.play(audio, 24000) 146 | sd.wait() 147 | 148 | 149 | if __name__ == "__main__": 150 | parser = argparse.ArgumentParser("Export Kokoro model to ONNX", add_help=True) 151 | parser.add_argument( 152 | "--inference", "-t", help="test kokoro.onnx model", action="store_true" 153 | ) 154 | parser.add_argument("--check", "-m", help="check kokoro model", action="store_true") 155 | parser.add_argument( 156 | "--config_file", 157 | "-c", 158 | type=str, 159 | default="checkpoints/config.json", 160 | help="path to config file", 161 | ) 162 | parser.add_argument( 163 | "--checkpoint_path", 164 | "-p", 165 | type=str, 166 | default="checkpoints/kokoro-v1_0.pth", 167 | help="path to checkpoint file", 168 | ) 169 | parser.add_argument( 170 | "--output_dir", "-o", type=str, default="onnx", help="output directory" 171 | ) 172 | 173 | args = parser.parse_args() 174 | 175 | # cfg 176 | config_file = args.config_file # path to the model config file 177 | checkpoint_path = args.checkpoint_path # path to the model checkpoint 178 | output_dir = args.output_dir 179 | 180 | # make dir 181 | os.makedirs(output_dir, exist_ok=True) 182 | 183 | kmodel = KModel(config=config_file, model=checkpoint_path, disable_complex=True) 184 | model = KModelForONNX(kmodel).eval() 185 | 186 | if args.inference: 187 | inference_onnx(model, output_dir) 188 | elif args.check: 189 | check_model(model) 190 | else: 191 | export_onnx(model, output_dir) 192 | -------------------------------------------------------------------------------- /scripts/fetch_voices.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.12" 3 | # dependencies = [ 4 | # "numpy==2.0.2", 5 | # "requests", 6 | # "torch==2.5.1", 7 | # "tqdm==4.67.1", 8 | # ] 9 | # /// 10 | """ 11 | Run this file via: 12 | uv run scripts/fetch_voices.py 13 | 14 | See the available voices at 15 | https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 16 | """ 17 | 18 | import io 19 | import os 20 | from pathlib import Path 21 | 22 | import numpy as np 23 | import requests 24 | import torch 25 | from tqdm import tqdm 26 | 27 | config = { 28 | "Kokoro-82M-v1.1-zh": { 29 | "voice_url": "https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/voices/{name}.pt", 30 | "api_url": "https://huggingface.co/api/models/hexgrad/Kokoro-82M-v1.1-zh/tree/main/voices", 31 | "npz_path": "voices-v1.1-zh.bin", 32 | }, 33 | # "Kokoro-82M": { 34 | # "voice_url": "https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/voices/{name}.pt", 35 | # "api_url": "https://huggingface.co/api/models/hexgrad/Kokoro-82M/tree/main/voices", 36 | # "npz_path": "voices-v1.0.bin", 37 | # }, 38 | } 39 | # Extract voice names 40 | 41 | 42 | def get_voice_names(api_url): 43 | resp = requests.get(api_url) 44 | resp.raise_for_status() 45 | data = resp.json() 46 | names = [voice["path"].removeprefix("voices/").removesuffix(".pt") for voice in data] 47 | return names 48 | 49 | 50 | def download_config(): 51 | resp = requests.get( 52 | "https://huggingface.co/hexgrad/Kokoro-82M/raw/main/config.json" 53 | ) 54 | resp.raise_for_status() 55 | content = resp.content 56 | with open( 57 | Path(__file__).parent / "../src/kokoro_onnx/config.json", "wb" 58 | ) as fp: 59 | fp.write(content) 60 | 61 | 62 | def download_voices(voice_url: str, names: list[str], npz_path: str): 63 | count = len(names) 64 | 65 | # Extract voice files 66 | print(f"Found {count} voices") 67 | voices = {} 68 | for name in tqdm(names): 69 | url = voice_url.format(name=name) 70 | print(f"Downloading {name}") 71 | r = requests.get(url) 72 | r.raise_for_status() # Ensure the request was successful 73 | content = io.BytesIO(r.content) 74 | data: np.ndarray = torch.load(content, weights_only=True).numpy() 75 | voices[name] = data 76 | 77 | # Save all voices to a single .npz file 78 | with open(npz_path, "wb") as f: 79 | np.savez(f, **voices) 80 | 81 | mb_size = os.path.getsize(npz_path) // 1000 // 1000 82 | print(f"Created {npz_path} ({mb_size}MB)") 83 | 84 | 85 | def main(): 86 | for model_name, model_config in config.items(): 87 | print(f"Downloading {model_name}") 88 | voice_url, api_url, npz_path = ( 89 | model_config["voice_url"], 90 | model_config["api_url"], 91 | model_config["npz_path"], 92 | ) 93 | voice_names = get_voice_names(api_url) 94 | download_voices(voice_url, voice_names, npz_path) 95 | download_config() 96 | 97 | 98 | main() 99 | -------------------------------------------------------------------------------- /src/kokoro_onnx/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import importlib 3 | import importlib.metadata 4 | import importlib.util 5 | import json 6 | import os 7 | import platform 8 | import re 9 | import time 10 | from collections.abc import AsyncGenerator 11 | 12 | import numpy as np 13 | import onnxruntime as rt 14 | from numpy.typing import NDArray 15 | 16 | from .config import MAX_PHONEME_LENGTH, SAMPLE_RATE, EspeakConfig, KoKoroConfig 17 | from .log import log 18 | from .tokenizer import Tokenizer 19 | from .trim import trim as trim_audio 20 | 21 | 22 | class Kokoro: 23 | def __init__( 24 | self, 25 | model_path: str, 26 | voices_path: str, 27 | espeak_config: EspeakConfig | None = None, 28 | vocab_config: dict | str | None = None, 29 | ): 30 | # Show useful information for bug reports 31 | log.debug( 32 | f"kokoro-onnx version {importlib.metadata.version('kokoro-onnx')} on {platform.platform()} {platform.version()}" 33 | ) 34 | self.config = KoKoroConfig(model_path, voices_path, espeak_config) 35 | self.config.validate() 36 | 37 | # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377 38 | providers =
["CPUExecutionProvider"] 39 | 40 | # Check if kokoro-onnx installed with kokoro-onnx[gpu] feature (Windows/Linux) 41 | gpu_enabled = any(dist.metadata["Name"] == "onnxruntime-gpu" for dist in importlib.metadata.distributions()) 42 | if gpu_enabled: 43 | providers: list[str] = rt.get_available_providers() 44 | 45 | # Check if ONNX_PROVIDER environment variable was set 46 | env_provider = os.getenv("ONNX_PROVIDER") 47 | if env_provider: 48 | providers = [env_provider] 49 | 50 | log.debug(f"Providers: {providers}") 51 | self.sess = rt.InferenceSession(model_path, providers=providers) 52 | self.voices: np.ndarray = np.load(voices_path) 53 | 54 | vocab = self._load_vocab(vocab_config) 55 | self.tokenizer = Tokenizer(espeak_config, vocab=vocab) 56 | 57 | @classmethod 58 | def from_session( 59 | cls, 60 | session: rt.InferenceSession, 61 | voices_path: str, 62 | espeak_config: EspeakConfig | None = None, 63 | vocab_config: dict | str | None = None, 64 | ): 65 | instance = cls.__new__(cls) 66 | instance.sess = session 67 | instance.config = KoKoroConfig(session._model_path, voices_path, espeak_config) 68 | instance.config.validate() 69 | instance.voices = np.load(voices_path) 70 | 71 | vocab = instance._load_vocab(vocab_config) 72 | instance.tokenizer = Tokenizer(espeak_config, vocab=vocab) 73 | return instance 74 | 75 | def _load_vocab(self, vocab_config: dict | str | None) -> dict: 76 | """Load vocabulary from config file or dictionary. 77 | 78 | Args: 79 | vocab_config: Path to vocab config file or dictionary containing vocab. 80 | 81 | Returns: 82 | Loaded vocabulary dictionary or empty dictionary if no config provided. 83 | """ 84 | 85 | if isinstance(vocab_config, str): 86 | with open(vocab_config, encoding="utf-8") as fp: 87 | config = json.load(fp) 88 | return config["vocab"] 89 | if isinstance(vocab_config, dict): 90 | return vocab_config["vocab"] 91 | return {} 92 | 93 | def _create_audio( 94 | self, phonemes: str, voice: NDArray[np.float32], speed: float 95 | ) -> tuple[NDArray[np.float32], int]: 96 | log.debug(f"Phonemes: {phonemes}") 97 | if len(phonemes) > MAX_PHONEME_LENGTH: 98 | log.warning( 99 | f"Phonemes are too long, truncating to {MAX_PHONEME_LENGTH} phonemes" 100 | ) 101 | phonemes = phonemes[:MAX_PHONEME_LENGTH] 102 | start_t = time.time() 103 | tokens = np.array(self.tokenizer.tokenize(phonemes), dtype=np.int64) 104 | assert len(tokens) <= MAX_PHONEME_LENGTH, ( 105 | f"Context length is {MAX_PHONEME_LENGTH}, but leave room for the pad token 0 at the start & end" 106 | ) 107 | 108 | voice = voice[len(tokens)] 109 | tokens = [[0, *tokens, 0]] 110 | if "input_ids" in [i.name for i in self.sess.get_inputs()]: 111 | # Newer export versions 112 | inputs = { 113 | "input_ids": tokens, 114 | "style": np.array(voice, dtype=np.float32), 115 | "speed": np.array([speed], dtype=np.int32), 116 | } 117 | else: 118 | inputs = { 119 | "tokens": tokens, 120 | "style": voice, 121 | "speed": np.ones(1, dtype=np.float32) * speed, 122 | } 123 | 124 | audio = self.sess.run(None, inputs)[0] 125 | audio_duration = len(audio) / SAMPLE_RATE 126 | create_duration = time.time() - start_t 127 | rtf = create_duration / audio_duration 128 | log.debug( 129 | f"Created audio of length {audio_duration:.2f}s for {len(phonemes)} phonemes in {create_duration:.2f}s (RTF: {rtf:.2f})" 130 | ) 131 | return audio, SAMPLE_RATE 132 | 133 | def get_voice_style(self, name: str) -> NDArray[np.float32]: 134 | return self.voices[name] 135 | 136 | def _split_phonemes(self, phonemes: str) -> list[str]: 137 | """ 138 | Split phonemes into batches of MAX_PHONEME_LENGTH 139 |
Prefer splitting at punctuation marks. 140 | """ 141 | # Regular expression to split by punctuation and keep them 142 | words = re.split(r"([.,!?;])", phonemes) 143 | batched_phonemes: list[str] = [] 144 | current_batch = "" 145 | 146 | for part in words: 147 | # Remove leading/trailing whitespace 148 | part = part.strip() 149 | 150 | if part: 151 | # If adding the part exceeds the max length, split into a new batch 152 | # TODO: make it more accurate 153 | if len(current_batch) + len(part) + 1 >= MAX_PHONEME_LENGTH: 154 | batched_phonemes.append(current_batch.strip()) 155 | current_batch = part 156 | else: 157 | if part in ".,!?;": 158 | current_batch += part 159 | else: 160 | if current_batch: 161 | current_batch += " " 162 | current_batch += part 163 | 164 | # Append the last batch if it contains any phonemes 165 | if current_batch: 166 | batched_phonemes.append(current_batch.strip()) 167 | 168 | return batched_phonemes 169 | 170 | def create( 171 | self, 172 | text: str, 173 | voice: str | NDArray[np.float32], 174 | speed: float = 1.0, 175 | lang: str = "en-us", 176 | is_phonemes: bool = False, 177 | trim: bool = True, 178 | ) -> tuple[NDArray[np.float32], int]: 179 | """ 180 | Create audio from text using the specified voice and speed. 181 | """ 182 | assert speed >= 0.5 and speed <= 2.0, "Speed should be between 0.5 and 2.0" 183 | 184 | if isinstance(voice, str): 185 | assert voice in self.voices, f"Voice {voice} not found in available voices" 186 | voice = self.get_voice_style(voice) 187 | 188 | start_t = time.time() 189 | if is_phonemes: 190 | phonemes = text 191 | else: 192 | phonemes = self.tokenizer.phonemize(text, lang) 193 | # Split the phonemes into batches of at most MAX_PHONEME_LENGTH, preferring punctuation boundaries 194 | batched_phonemes = self._split_phonemes(phonemes) 195 | 196 | audio = [] 197 | log.debug( 198 | f"Creating audio for {len(batched_phonemes)} batches ({len(phonemes)} phonemes total)" 199 | ) 200 | for phonemes in batched_phonemes: 201 | audio_part, _ = self._create_audio(phonemes, voice, speed) 202 | if trim: 203 | # Trim leading and trailing silence for a more natural sound concatenation 204 | # (initial ~2s, subsequent ~0.02s) 205 | audio_part, _ = trim_audio(audio_part) 206 | audio.append(audio_part) 207 | audio = np.concatenate(audio) 208 | log.debug(f"Created audio in {time.time() - start_t:.2f}s") 209 | return audio, SAMPLE_RATE 210 | 211 | async def create_stream( 212 | self, 213 | text: str, 214 | voice: str | NDArray[np.float32], 215 | speed: float = 1.0, 216 | lang: str = "en-us", 217 | is_phonemes: bool = False, 218 | trim: bool = True, 219 | ) -> AsyncGenerator[tuple[NDArray[np.float32], int], None]: 220 | """ 221 | Stream audio creation asynchronously in the background, yielding chunks as they are processed.
222 | """ 223 | assert speed >= 0.5 and speed <= 2.0, "Speed should be between 0.5 and 2.0" 224 | 225 | if isinstance(voice, str): 226 | assert voice in self.voices, f"Voice {voice} not found in available voices" 227 | voice = self.get_voice_style(voice) 228 | 229 | if is_phonemes: 230 | phonemes = text 231 | else: 232 | phonemes = self.tokenizer.phonemize(text, lang) 233 | 234 | batched_phonemes = self._split_phonemes(phonemes) 235 | queue: asyncio.Queue[tuple[NDArray[np.float32], int] | None] = asyncio.Queue() 236 | 237 | async def process_batches(): 238 | """Process phoneme batches in the background.""" 239 | for i, phonemes in enumerate(batched_phonemes): 240 | loop = asyncio.get_event_loop() 241 | # Execute in separate thread since it's blocking operation 242 | audio_part, sample_rate = await loop.run_in_executor( 243 | None, self._create_audio, phonemes, voice, speed 244 | ) 245 | if trim: 246 | # Trim leading and trailing silence for a more natural sound concatenation 247 | # (initial ~2s, subsequent ~0.02s) 248 | audio_part, _ = trim_audio(audio_part) 249 | log.debug(f"Processed chunk {i} of stream") 250 | await queue.put((audio_part, sample_rate)) 251 | await queue.put(None) # Signal the end of the stream 252 | 253 | # Start processing in the background 254 | asyncio.create_task(process_batches()) 255 | 256 | while True: 257 | chunk = await queue.get() 258 | if chunk is None: 259 | break 260 | yield chunk 261 | 262 | def get_voices(self) -> list[str]: 263 | return list(sorted(self.voices.keys())) 264 | -------------------------------------------------------------------------------- /src/kokoro_onnx/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "istftnet": { 3 | "upsample_kernel_sizes": [20, 12], 4 | "upsample_rates": [10, 6], 5 | "gen_istft_hop_size": 5, 6 | "gen_istft_n_fft": 20, 7 | "resblock_dilation_sizes": [ 8 | [1, 3, 5], 9 | [1, 3, 5], 10 | [1, 3, 5] 11 | ], 12 | "resblock_kernel_sizes": [3, 7, 11], 13 | "upsample_initial_channel": 512 14 | }, 15 | "dim_in": 64, 16 | "dropout": 0.2, 17 | "hidden_dim": 512, 18 | "max_conv_dim": 512, 19 | "max_dur": 50, 20 | "multispeaker": true, 21 | "n_layer": 3, 22 | "n_mels": 80, 23 | "n_token": 178, 24 | "style_dim": 128, 25 | "text_encoder_kernel_size": 5, 26 | "plbert": { 27 | "hidden_size": 768, 28 | "num_attention_heads": 12, 29 | "intermediate_size": 2048, 30 | "max_position_embeddings": 512, 31 | "num_hidden_layers": 12, 32 | "dropout": 0.1 33 | }, 34 | "vocab": { 35 | ";": 1, 36 | ":": 2, 37 | ",": 3, 38 | ".": 4, 39 | "!": 5, 40 | "?": 6, 41 | "—": 9, 42 | "…": 10, 43 | "\"": 11, 44 | "(": 12, 45 | ")": 13, 46 | "“": 14, 47 | "”": 15, 48 | " ": 16, 49 | "\u0303": 17, 50 | "ʣ": 18, 51 | "ʥ": 19, 52 | "ʦ": 20, 53 | "ʨ": 21, 54 | "ᵝ": 22, 55 | "\uAB67": 23, 56 | "A": 24, 57 | "I": 25, 58 | "O": 31, 59 | "Q": 33, 60 | "S": 35, 61 | "T": 36, 62 | "W": 39, 63 | "Y": 41, 64 | "ᵊ": 42, 65 | "a": 43, 66 | "b": 44, 67 | "c": 45, 68 | "d": 46, 69 | "e": 47, 70 | "f": 48, 71 | "h": 50, 72 | "i": 51, 73 | "j": 52, 74 | "k": 53, 75 | "l": 54, 76 | "m": 55, 77 | "n": 56, 78 | "o": 57, 79 | "p": 58, 80 | "q": 59, 81 | "r": 60, 82 | "s": 61, 83 | "t": 62, 84 | "u": 63, 85 | "v": 64, 86 | "w": 65, 87 | "x": 66, 88 | "y": 67, 89 | "z": 68, 90 | "ɑ": 69, 91 | "ɐ": 70, 92 | "ɒ": 71, 93 | "æ": 72, 94 | "β": 75, 95 | "ɔ": 76, 96 | "ɕ": 77, 97 | "ç": 78, 98 | "ɖ": 80, 99 | "ð": 81, 100 | "ʤ": 82, 101 | "ə": 83, 102 | "ɚ": 85, 103 | "ɛ": 86, 104 | "ɜ": 87, 105 | "ɟ": 90, 106 | "ɡ": 92, 107 | "ɥ": 
99, 108 | "ɨ": 101, 109 | "ɪ": 102, 110 | "ʝ": 103, 111 | "ɯ": 110, 112 | "ɰ": 111, 113 | "ŋ": 112, 114 | "ɳ": 113, 115 | "ɲ": 114, 116 | "ɴ": 115, 117 | "ø": 116, 118 | "ɸ": 118, 119 | "θ": 119, 120 | "œ": 120, 121 | "ɹ": 123, 122 | "ɾ": 125, 123 | "ɻ": 126, 124 | "ʁ": 128, 125 | "ɽ": 129, 126 | "ʂ": 130, 127 | "ʃ": 131, 128 | "ʈ": 132, 129 | "ʧ": 133, 130 | "ʊ": 135, 131 | "ʋ": 136, 132 | "ʌ": 138, 133 | "ɣ": 139, 134 | "ɤ": 140, 135 | "χ": 142, 136 | "ʎ": 143, 137 | "ʒ": 147, 138 | "ʔ": 148, 139 | "ˈ": 156, 140 | "ˌ": 157, 141 | "ː": 158, 142 | "ʰ": 162, 143 | "ʲ": 164, 144 | "↓": 169, 145 | "→": 171, 146 | "↗": 172, 147 | "↘": 173, 148 | "ᵻ": 177 149 | } 150 | } -------------------------------------------------------------------------------- /src/kokoro_onnx/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | 5 | MAX_PHONEME_LENGTH = 510 6 | SAMPLE_RATE = 24000 7 | 8 | 9 | @dataclass 10 | class EspeakConfig: 11 | lib_path: str | None = None 12 | data_path: str | None = None 13 | 14 | 15 | class KoKoroConfig: 16 | def __init__( 17 | self, 18 | model_path: str, 19 | voices_path: str, 20 | espeak_config: EspeakConfig | None = None, 21 | ): 22 | self.model_path = model_path 23 | self.voices_path = voices_path 24 | self.espeak_config = espeak_config 25 | 26 | def validate(self): 27 | if not Path(self.voices_path).exists(): 28 | error_msg = f"Voices file not found at {self.voices_path}" 29 | error_msg += ( 30 | "\nYou can download the voices file using the following command:" 31 | ) 32 | error_msg += "\nwget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin" 33 | raise FileNotFoundError(error_msg) 34 | 35 | if not Path(self.model_path).exists(): 36 | error_msg = f"Model file not found at {self.model_path}" 37 | error_msg += "\nYou can download the model file from https://github.com/thewh1teagle/kokoro-onnx/releases" 38 | raise FileNotFoundError(error_msg) 39 | 40 | 41 | def get_vocab(): 42 | with open(Path(__file__).parent / "config.json", encoding="utf-8") as fp: 43 | config = json.load(fp) 44 | return config["vocab"] 45 | 46 | 47 | DEFAULT_VOCAB = get_vocab() 48 | -------------------------------------------------------------------------------- /src/kokoro_onnx/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provide a way to enable logging by setting LOG_LEVEL environment variable 3 | """ 4 | 5 | import logging 6 | import os 7 | 8 | import colorlog 9 | 10 | 11 | def _create_logger(): 12 | """ 13 | Create a logger with colorized output 14 | Usage: LOG_LEVEL=DEBUG python 15 | """ 16 | 17 | handler = colorlog.StreamHandler() 18 | fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s" 19 | handler.setFormatter( 20 | colorlog.ColoredFormatter( 21 | fmt=fmt, 22 | log_colors={ 23 | "DEBUG": "blue", 24 | "INFO": "green", 25 | "WARNING": "yellow", 26 | "ERROR": "red", 27 | "CRITICAL": "red", 28 | }, 29 | ) 30 | ) 31 | # Get log level from LOG_LEVEL environment variable 32 | log_level = os.getenv("LOG_LEVEL", "WARNING").upper() 33 | logger = colorlog.getLogger(__package__) 34 | logger.setLevel(level=getattr(logging, log_level, logging.WARNING)) 35 | # Setup logging to stdout 36 | logger.addHandler(handler) 37 | return logger 38 | 39 | 40 | log = _create_logger() 41 | -------------------------------------------------------------------------------- 
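The config and logging helpers above are everything needed to drive the Kokoro class from __init__.py. The following is a minimal usage sketch, not a file from the repository: it assumes the model and voices files were downloaded as hinted by KoKoroConfig.validate() (kokoro-v1.0.onnx and voices-v1.0.bin are assumed file names), that "af_heart" is one of the bundled voice names, and that soundfile is installed for writing the result to disk.

import soundfile as sf  # assumed extra dependency, used only to write the WAV file

from kokoro_onnx import Kokoro

# File names below are assumptions based on the download hints in KoKoroConfig.validate()
kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")

# create() phonemizes the text, splits it into batches of at most MAX_PHONEME_LENGTH,
# runs the ONNX session on each batch and returns (samples, SAMPLE_RATE)
samples, sample_rate = kokoro.create(
    "Hello from kokoro-onnx!", voice="af_heart", speed=1.0, lang="en-us"
)
sf.write("audio.wav", samples, sample_rate)
# Run with LOG_LEVEL=DEBUG to see the provider and RTF debug logs configured in log.py

The same call works with a raw style vector (get_voice_style or a blend of two styles) instead of a voice name, since create() accepts either a string or an NDArray.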
/src/kokoro_onnx/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thewh1teagle/kokoro-onnx/8edb984a28c3d9b806d208b51fdbca499c81af2b/src/kokoro_onnx/py.typed -------------------------------------------------------------------------------- /src/kokoro_onnx/tokenizer.py: -------------------------------------------------------------------------------- 1 | import ctypes.util 2 | import os 3 | import platform 4 | import sys 5 | 6 | import espeakng_loader 7 | import phonemizer 8 | from phonemizer.backend.espeak.wrapper import EspeakWrapper 9 | 10 | from .config import DEFAULT_VOCAB, MAX_PHONEME_LENGTH, EspeakConfig 11 | from .log import log 12 | 13 | 14 | class Tokenizer: 15 | def __init__(self, espeak_config: EspeakConfig | None = None, vocab: dict | None = None): 16 | self.vocab = vocab or DEFAULT_VOCAB 17 | 18 | if not espeak_config: 19 | espeak_config = EspeakConfig() 20 | if not espeak_config.data_path: 21 | espeak_config.data_path = espeakng_loader.get_data_path() 22 | if not espeak_config.lib_path: 23 | espeak_config.lib_path = espeakng_loader.get_library_path() 24 | 25 | # Check if PHONEMIZER_ESPEAK_LIBRARY was set 26 | if os.getenv("PHONEMIZER_ESPEAK_LIBRARY"): 27 | espeak_config.lib_path = os.getenv("PHONEMIZER_ESPEAK_LIBRARY") 28 | 29 | # Check that the espeak-ng library can be loaded 30 | try: 31 | ctypes.cdll.LoadLibrary(espeak_config.lib_path) 32 | except Exception as e: 33 | log.error(f"Failed to load espeak shared library: {e}") 34 | log.warning("Falling back to system wide espeak-ng library") 35 | 36 | # Fallback system wide load 37 | error_info = ( 38 | "Failed to load espeak-ng from fallback. Please install espeak-ng system wide.\n" 39 | "\tSee https://github.com/espeak-ng/espeak-ng/blob/master/docs/guide.md\n" 40 | "\tNote: you can specify shared library path using PHONEMIZER_ESPEAK_LIBRARY environment variable.\n" 41 | f"Environment:\n\t{platform.platform()} ({platform.release()}) | {sys.version}" 42 | ) 43 | espeak_config.lib_path = ctypes.util.find_library( 44 | "espeak-ng" 45 | ) or ctypes.util.find_library("espeak") 46 | if not espeak_config.lib_path: 47 | raise RuntimeError(error_info) 48 | try: 49 | ctypes.cdll.LoadLibrary(espeak_config.lib_path) 50 | except Exception as e: 51 | raise RuntimeError(f"{e}: {error_info}") 52 | 53 | EspeakWrapper.set_data_path(espeak_config.data_path) 54 | EspeakWrapper.set_library(espeak_config.lib_path) 55 | 56 | @staticmethod 57 | def normalize_text(text) -> str: 58 | return text.strip() 59 | 60 | def tokenize(self, phonemes): 61 | if len(phonemes) > MAX_PHONEME_LENGTH: 62 | raise ValueError( 63 | f"text is too long, must be at most {MAX_PHONEME_LENGTH} phonemes" 64 | ) 65 | return [i for i in map(self.vocab.get, phonemes) if i is not None] 66 | 67 | def phonemize(self, text, lang="en-us", norm=True) -> str: 68 | """ 69 | lang can be 'en-us' or 'en-gb' 70 | """ 71 | if norm: 72 | text = Tokenizer.normalize_text(text) 73 | 74 | phonemes = phonemizer.phonemize( 75 | text, lang, preserve_punctuation=True, with_stress=True 76 | ) 77 | phonemes = "".join(filter(lambda p: p in self.vocab, phonemes)) 78 | return phonemes.strip() 79 | -------------------------------------------------------------------------------- /src/kokoro_onnx/trim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2013--2023, librosa development team.
3 | 4 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 5 | 6 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 7 | 8 | 9 | ***This file extracted from librosa package since we use only the trim() function and librosa requires many dependencies*** 10 | 11 | Reference: 12 | - https://gist.github.com/evq/82e95a363eeeb75d15dd62abc1eb1bde 13 | - https://github.com/librosa/librosa/blob/894942673d55aa2206df1296b6c4c50827c7f1d6/librosa/effects.py#L612 14 | """ 15 | 16 | import warnings 17 | from collections.abc import Callable 18 | from typing import Any 19 | 20 | import numpy as np 21 | from numpy.lib.stride_tricks import as_strided 22 | 23 | 24 | class LibrosaError(Exception): 25 | """The root librosa exception class""" 26 | 27 | pass 28 | 29 | 30 | class ParameterError(LibrosaError): 31 | """Exception class for mal-formed inputs""" 32 | 33 | pass 34 | 35 | 36 | # @numba.vectorize( 37 | # ["float32(complex64)", "float64(complex128)"], nopython=True, cache=True, identity=0 38 | # ) # type: ignore 39 | def _cabs2(x): # pragma: no cover 40 | """Efficiently compute abs2 on complex inputs""" 41 | return x.real**2 + x.imag**2 42 | 43 | 44 | def abs2(x, dtype): 45 | """Compute the squared magnitude of a real or complex array. 46 | 47 | This function is equivalent to calling `np.abs(x)**2` but it 48 | is slightly more efficient. 49 | 50 | Parameters 51 | ---------- 52 | x : np.ndarray or scalar, real or complex typed 53 | The input data, either real (float32, float64) or complex (complex64, complex128) typed 54 | dtype : np.dtype, optional 55 | The data type of the output array. 56 | If not provided, it will be inferred from `x` 57 | 58 | Returns 59 | ------- 60 | p : np.ndarray or scale, real 61 | squared magnitude of `x` 62 | 63 | Examples 64 | -------- 65 | >>> librosa.util.abs2(3 + 4j) 66 | 25.0 67 | 68 | >>> librosa.util.abs2((0.5j)**np.arange(8)) 69 | array([1.000e+00, 2.500e-01, 6.250e-02, 1.562e-02, 3.906e-03, 9.766e-04, 70 | 2.441e-04, 6.104e-05]) 71 | """ 72 | if np.iscomplexobj(x): 73 | # suppress type check, mypy doesn't like vectorization 74 | y = _cabs2(x) 75 | if dtype is None: 76 | return y # type: ignore 77 | else: 78 | return y.astype(dtype) # type: ignore 79 | else: 80 | # suppress type check, mypy doesn't know this is real 81 | return np.square(x, dtype=dtype) # type: ignore 82 | 83 | 84 | def amplitude_to_db( 85 | S, 86 | *, 87 | ref: float | Callable = 1.0, 88 | amin: float = 1e-5, 89 | top_db: float | None = 80.0, 90 | ) -> np.floating[Any] | np.ndarray: 91 | """Convert an amplitude spectrogram to dB-scaled spectrogram. 92 | 93 | This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``, 94 | but is provided for convenience. 95 | 96 | Parameters 97 | ---------- 98 | S : np.ndarray 99 | input amplitude 100 | 101 | ref : scalar or callable 102 | If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``: 103 | ``20 * log10(S / ref)``. 
104 | Zeros in the output correspond to positions where ``S == ref``. 105 | 106 | If callable, the reference value is computed as ``ref(S)``. 107 | 108 | amin : float > 0 [scalar] 109 | minimum threshold for ``S`` and ``ref`` 110 | 111 | top_db : float >= 0 [scalar] 112 | threshold the output at ``top_db`` below the peak: 113 | ``max(20 * log10(S/ref)) - top_db`` 114 | 115 | Returns 116 | ------- 117 | S_db : np.ndarray 118 | ``S`` measured in dB 119 | 120 | See Also 121 | -------- 122 | power_to_db, db_to_amplitude 123 | 124 | Notes 125 | ----- 126 | This function caches at level 30. 127 | """ 128 | S = np.asarray(S) 129 | 130 | if np.issubdtype(S.dtype, np.complexfloating): 131 | warnings.warn( 132 | "amplitude_to_db was called on complex input so phase " 133 | "information will be discarded. To suppress this warning, " 134 | "call amplitude_to_db(np.abs(S)) instead.", 135 | stacklevel=2, 136 | ) 137 | 138 | magnitude = np.abs(S) 139 | 140 | if callable(ref): 141 | # User supplied a function to calculate reference power 142 | ref_value = ref(magnitude) 143 | else: 144 | ref_value = np.abs(ref) 145 | 146 | out_array = magnitude if isinstance(magnitude, np.ndarray) else None 147 | power = np.square(magnitude, out=out_array) 148 | 149 | db: np.ndarray = power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db) 150 | return db 151 | 152 | 153 | def _signal_to_frame_nonsilent( 154 | y: np.ndarray, 155 | frame_length: int = 2048, 156 | hop_length: int = 512, 157 | top_db: float = 60, 158 | ref: Callable | float = np.max, 159 | aggregate: Callable = np.max, 160 | ) -> np.ndarray: 161 | """Frame-wise non-silent indicator for audio input. 162 | 163 | This is a helper function for `trim` and `split`. 164 | 165 | Parameters 166 | ---------- 167 | y : np.ndarray 168 | Audio signal, mono or stereo 169 | 170 | frame_length : int > 0 171 | The number of samples per frame 172 | 173 | hop_length : int > 0 174 | The number of samples between frames 175 | 176 | top_db : number 177 | The threshold (in decibels) below reference to consider as 178 | silence. 179 | You can also use a negative value for `top_db` to treat any value 180 | below `ref + |top_db|` as silent. This will only make sense if 181 | `ref` is not `np.max`. 182 | 183 | ref : callable or float 184 | The reference amplitude 185 | 186 | aggregate : callable [default: np.max] 187 | Function to aggregate dB measurements across channels (if y.ndim > 1) 188 | 189 | Note: for multiple leading axes, this is performed using ``np.apply_over_axes``. 
190 | 191 | Returns 192 | ------- 193 | non_silent : np.ndarray, shape=(m,), dtype=bool 194 | Indicator of non-silent frames 195 | """ 196 | # Compute the MSE for the signal 197 | mse = rms(y=y, frame_length=frame_length, hop_length=hop_length) 198 | 199 | # Convert to decibels and slice out the mse channel 200 | db: np.ndarray = amplitude_to_db(mse[..., 0, :], ref=ref, top_db=None) 201 | 202 | # Aggregate everything but the time dimension 203 | if db.ndim > 1: 204 | db = np.apply_over_axes(aggregate, db, range(db.ndim - 1)) 205 | # Squeeze out leading singleton dimensions here 206 | # We always want to keep the trailing dimension though 207 | db = np.squeeze(db, axis=tuple(range(db.ndim - 1))) 208 | 209 | return db > -top_db 210 | 211 | 212 | def trim( 213 | y: np.ndarray, 214 | *, 215 | top_db: float = 60, 216 | ref: float | Callable = np.max, 217 | frame_length: int = 2048, 218 | hop_length: int = 512, 219 | aggregate: Callable = np.max, 220 | ) -> tuple[np.ndarray, np.ndarray]: 221 | """Trim leading and trailing silence from an audio signal. 222 | 223 | Silence is defined as segments of the audio signal that are `top_db` 224 | decibels (or more) quieter than a reference level, `ref`. 225 | By default, `ref` is set to the signal's maximum RMS value. 226 | It's important to note that if the entire signal maintains a uniform 227 | RMS value, there will be no segments considered quieter than the maximum, 228 | leading to no trimming. 229 | This implies that a completely silent signal will remain untrimmed with the default `ref` setting. 230 | In these situations, an explicit value for `ref` (in decibels) should be used instead. 231 | 232 | Parameters 233 | ---------- 234 | y : np.ndarray, shape=(..., n) 235 | Audio signal. Multi-channel is supported. 236 | top_db : number 237 | The threshold (in decibels) below reference to consider as 238 | silence. 239 | You can also use a negative value for `top_db` to treat any value 240 | below `ref + |top_db|` as silent. This will only make sense if 241 | `ref` is not `np.max`. 242 | ref : number or callable 243 | The reference amplitude. By default, it uses `np.max` and compares 244 | to the peak amplitude in the signal. 245 | frame_length : int > 0 246 | The number of samples per analysis frame 247 | hop_length : int > 0 248 | The number of samples between analysis frames 249 | aggregate : callable [default: np.max] 250 | Function to aggregate across channels (if y.ndim > 1) 251 | 252 | Returns 253 | ------- 254 | y_trimmed : np.ndarray, shape=(..., m) 255 | The trimmed signal 256 | index : np.ndarray, shape=(2,) 257 | the interval of ``y`` corresponding to the non-silent region: 258 | ``y_trimmed = y[index[0]:index[1]]`` (for mono) or 259 | ``y_trimmed = y[:, index[0]:index[1]]`` (for stereo). 
260 | 261 | Examples 262 | -------- 263 | >>> # Load some audio 264 | >>> y, sr = librosa.load(librosa.ex('choice')) 265 | >>> # Trim the beginning and ending silence 266 | >>> yt, index = librosa.effects.trim(y) 267 | >>> # Print the durations 268 | >>> print(librosa.get_duration(y, sr=sr), librosa.get_duration(yt, sr=sr)) 269 | 25.025986394557822 25.007891156462584 270 | """ 271 | non_silent = _signal_to_frame_nonsilent( 272 | y, 273 | frame_length=frame_length, 274 | hop_length=hop_length, 275 | ref=ref, 276 | top_db=top_db, 277 | aggregate=aggregate, 278 | ) 279 | 280 | nonzero = np.flatnonzero(non_silent) 281 | 282 | if nonzero.size > 0: 283 | # Compute the start and end positions 284 | # End position goes one frame past the last non-zero 285 | start = int(frames_to_samples(nonzero[0], hop_length=hop_length)) 286 | end = min( 287 | y.shape[-1], 288 | int(frames_to_samples(nonzero[-1] + 1, hop_length=hop_length)), 289 | ) 290 | else: 291 | # The entire signal is trimmed here: nothing is above the threshold 292 | start, end = 0, 0 293 | 294 | # Slice the buffer and return the corresponding interval 295 | return y[..., start:end], np.asarray([start, end]) 296 | 297 | 298 | def rms( 299 | *, 300 | y: np.ndarray | None = None, 301 | S: np.ndarray | None = None, 302 | frame_length: int = 2048, 303 | hop_length: int = 512, 304 | center: bool = True, 305 | pad_mode="constant", 306 | dtype=np.float32, 307 | ) -> np.ndarray: 308 | """Compute root-mean-square (RMS) value for each frame, either from the 309 | audio samples ``y`` or from a spectrogram ``S``. 310 | 311 | Computing the RMS value from audio samples is faster as it doesn't require 312 | a STFT calculation. However, using a spectrogram will give a more accurate 313 | representation of energy over time because its frames can be windowed, 314 | thus prefer using ``S`` if it's already available. 315 | 316 | Parameters 317 | ---------- 318 | y : np.ndarray [shape=(..., n)] or None 319 | (optional) audio time series. Required if ``S`` is not input. 320 | Multi-channel is supported. 321 | S : np.ndarray [shape=(..., d, t)] or None 322 | (optional) spectrogram magnitude. Required if ``y`` is not input. 323 | frame_length : int > 0 [scalar] 324 | length of analysis frame (in samples) for energy calculation 325 | hop_length : int > 0 [scalar] 326 | hop length for STFT. See `librosa.stft` for details. 327 | center : bool 328 | If `True` and operating on time-domain input (``y``), pad the signal 329 | by ``frame_length//2`` on either side. 330 | If operating on spectrogram input, this has no effect. 331 | pad_mode : str 332 | Padding mode for centered analysis. See `numpy.pad` for valid 333 | values. 334 | dtype : np.dtype, optional 335 | Data type of the output array. Defaults to float32. 
336 | 337 | Returns 338 | ------- 339 | rms : np.ndarray [shape=(..., 1, t)] 340 | RMS value for each frame 341 | 342 | Examples 343 | -------- 344 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 345 | >>> librosa.feature.rms(y=y) 346 | array([[1.248e-01, 1.259e-01, ..., 1.845e-05, 1.796e-05]], 347 | dtype=float32) 348 | 349 | Or from spectrogram input 350 | 351 | >>> S, phase = librosa.magphase(librosa.stft(y)) 352 | >>> rms = librosa.feature.rms(S=S) 353 | 354 | >>> import matplotlib.pyplot as plt 355 | >>> fig, ax = plt.subplots(nrows=2, sharex=True) 356 | >>> times = librosa.times_like(rms) 357 | >>> ax[0].semilogy(times, rms[0], label='RMS Energy') 358 | >>> ax[0].set(xticks=[]) 359 | >>> ax[0].legend() 360 | >>> ax[0].label_outer() 361 | >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), 362 | ... y_axis='log', x_axis='time', ax=ax[1]) 363 | >>> ax[1].set(title='log Power spectrogram') 364 | 365 | Use a STFT window of constant ones and no frame centering to get consistent 366 | results with the RMS computed from the audio samples ``y`` 367 | 368 | >>> S = librosa.magphase(librosa.stft(y, window=np.ones, center=False))[0] 369 | >>> librosa.feature.rms(S=S) 370 | >>> plt.show() 371 | 372 | """ 373 | if y is not None: 374 | if center: 375 | padding = [(0, 0) for _ in range(y.ndim)] 376 | padding[-1] = (int(frame_length // 2), int(frame_length // 2)) 377 | y = np.pad(y, padding, mode=pad_mode) 378 | 379 | x = frame(y, frame_length=frame_length, hop_length=hop_length) 380 | 381 | # Calculate power 382 | power = np.mean(abs2(x, dtype=dtype), axis=-2, keepdims=True) 383 | elif S is not None: 384 | # Check the frame length 385 | if S.shape[-2] != frame_length // 2 + 1: 386 | raise ParameterError( 387 | f"Since S.shape[-2] is {S.shape[-2]}, " 388 | f"frame_length is expected to be {S.shape[-2] * 2 - 2} or {S.shape[-2] * 2 - 1}; " 389 | f"found {frame_length}" 390 | ) 391 | 392 | # power spectrogram 393 | x = abs2(S, dtype=dtype) 394 | 395 | # Adjust the DC and sr/2 component 396 | x[..., 0, :] *= 0.5 397 | if frame_length % 2 == 0: 398 | x[..., -1, :] *= 0.5 399 | 400 | # Calculate power 401 | power = 2 * np.sum(x, axis=-2, keepdims=True) / frame_length**2 402 | else: 403 | raise ParameterError("Either `y` or `S` must be input.") 404 | 405 | rms_result: np.ndarray = np.sqrt(power) 406 | return rms_result 407 | 408 | 409 | def frame( 410 | x: np.ndarray, 411 | *, 412 | frame_length: int, 413 | hop_length: int, 414 | axis: int = -1, 415 | writeable: bool = False, 416 | subok: bool = False, 417 | ) -> np.ndarray: 418 | """Slice a data array into (overlapping) frames. 419 | 420 | This implementation uses low-level stride manipulation to avoid 421 | making a copy of the data. The resulting frame representation 422 | is a new view of the same input data. 423 | 424 | For example, a one-dimensional input ``x = [0, 1, 2, 3, 4, 5, 6]`` 425 | can be framed with frame length 3 and hop length 2 in two ways. 426 | The first (``axis=-1``), results in the array ``x_frames``:: 427 | 428 | [[0, 2, 4], 429 | [1, 3, 5], 430 | [2, 4, 6]] 431 | 432 | where each column ``x_frames[:, i]`` contains a contiguous slice of 433 | the input ``x[i * hop_length : i * hop_length + frame_length]``. 434 | 435 | The second way (``axis=0``) results in the array ``x_frames``:: 436 | 437 | [[0, 1, 2], 438 | [2, 3, 4], 439 | [4, 5, 6]] 440 | 441 | where each row ``x_frames[i]`` contains a contiguous slice of the input. 442 | 443 | This generalizes to higher dimensional inputs, as shown in the examples below. 
444 | In general, the framing operation increments by 1 the number of dimensions, 445 | adding a new "frame axis" either before the framing axis (if ``axis < 0``) 446 | or after the framing axis (if ``axis >= 0``). 447 | 448 | Parameters 449 | ---------- 450 | x : np.ndarray 451 | Array to frame 452 | frame_length : int > 0 [scalar] 453 | Length of the frame 454 | hop_length : int > 0 [scalar] 455 | Number of steps to advance between frames 456 | axis : int 457 | The axis along which to frame. 458 | writeable : bool 459 | If ``False``, then the framed view of ``x`` is read-only. 460 | If ``True``, then the framed view is read-write. Note that writing to the framed view 461 | will also write to the input array ``x`` in this case. 462 | subok : bool 463 | If True, sub-classes will be passed-through, otherwise the returned array will be 464 | forced to be a base-class array (default). 465 | 466 | Returns 467 | ------- 468 | x_frames : np.ndarray [shape=(..., frame_length, N_FRAMES, ...)] 469 | A framed view of ``x``, for example with ``axis=-1`` (framing on the last dimension):: 470 | 471 | x_frames[..., j] == x[..., j * hop_length : j * hop_length + frame_length] 472 | 473 | If ``axis=0`` (framing on the first dimension), then:: 474 | 475 | x_frames[j] = x[j * hop_length : j * hop_length + frame_length] 476 | 477 | Raises 478 | ------ 479 | ParameterError 480 | If ``x.shape[axis] < frame_length``, there is not enough data to fill one frame. 481 | 482 | If ``hop_length < 1``, frames cannot advance. 483 | 484 | See Also 485 | -------- 486 | numpy.lib.stride_tricks.as_strided 487 | 488 | Examples 489 | -------- 490 | Extract 2048-sample frames from monophonic signal with a hop of 64 samples per frame 491 | 492 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 493 | >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64) 494 | >>> frames 495 | array([[-1.407e-03, -2.604e-02, ..., -1.795e-05, -8.108e-06], 496 | [-4.461e-04, -3.721e-02, ..., -1.573e-05, -1.652e-05], 497 | ..., 498 | [ 7.960e-02, -2.335e-01, ..., -6.815e-06, 1.266e-05], 499 | [ 9.568e-02, -1.252e-01, ..., 7.397e-06, -1.921e-05]], 500 | dtype=float32) 501 | >>> y.shape 502 | (117601,) 503 | 504 | >>> frames.shape 505 | (2048, 1806) 506 | 507 | Or frame along the first axis instead of the last: 508 | 509 | >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64, axis=0) 510 | >>> frames.shape 511 | (1806, 2048) 512 | 513 | Frame a stereo signal: 514 | 515 | >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), mono=False) 516 | >>> y.shape 517 | (2, 117601) 518 | >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64) 519 | (2, 2048, 1806) 520 | 521 | Carve an STFT into fixed-length patches of 32 frames with 50% overlap 522 | 523 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 524 | >>> S = np.abs(librosa.stft(y)) 525 | >>> S.shape 526 | (1025, 230) 527 | >>> S_patch = librosa.util.frame(S, frame_length=32, hop_length=16) 528 | >>> S_patch.shape 529 | (1025, 32, 13) 530 | >>> # The first patch contains the first 32 frames of S 531 | >>> np.allclose(S_patch[:, :, 0], S[:, :32]) 532 | True 533 | >>> # The second patch contains frames 16 to 16+32=48, and so on 534 | >>> np.allclose(S_patch[:, :, 1], S[:, 16:48]) 535 | True 536 | """ 537 | # This implementation is derived from numpy.lib.stride_tricks.sliding_window_view (1.20.0) 538 | # https://numpy.org/doc/stable/reference/generated/numpy.lib.stride_tricks.sliding_window_view.html 539 | 540 | x = np.array(x, copy=False, subok=subok) 541 | 542 
| if x.shape[axis] < frame_length: 543 | raise ParameterError( 544 | f"Input is too short (n={x.shape[axis]:d}) for frame_length={frame_length:d}" 545 | ) 546 | 547 | if hop_length < 1: 548 | raise ParameterError(f"Invalid hop_length: {hop_length:d}") 549 | 550 | # put our new within-frame axis at the end for now 551 | out_strides = x.strides + tuple([x.strides[axis]]) 552 | 553 | # Reduce the shape on the framing axis 554 | x_shape_trimmed = list(x.shape) 555 | x_shape_trimmed[axis] -= frame_length - 1 556 | 557 | out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) 558 | xw = as_strided( 559 | x, strides=out_strides, shape=out_shape, subok=subok, writeable=writeable 560 | ) 561 | 562 | if axis < 0: 563 | target_axis = axis - 1 564 | else: 565 | target_axis = axis + 1 566 | 567 | xw = np.moveaxis(xw, -1, target_axis) 568 | 569 | # Downsample along the target axis 570 | slices = [slice(None)] * xw.ndim 571 | slices[axis] = slice(0, None, hop_length) 572 | return xw[tuple(slices)] 573 | 574 | 575 | def power_to_db( 576 | S, 577 | *, 578 | ref: float | Callable = 1.0, 579 | amin: float = 1e-10, 580 | top_db: float | None = 80.0, 581 | ) -> np.floating[Any] | np.ndarray: 582 | """Convert a power spectrogram (amplitude squared) to decibel (dB) units 583 | 584 | This computes the scaling ``10 * log10(S / ref)`` in a numerically 585 | stable way. 586 | 587 | Parameters 588 | ---------- 589 | S : np.ndarray 590 | input power 591 | 592 | ref : scalar or callable 593 | If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:: 594 | 595 | 10 * log10(S / ref) 596 | 597 | Zeros in the output correspond to positions where ``S == ref``. 598 | 599 | If callable, the reference value is computed as ``ref(S)``. 600 | 601 | amin : float > 0 [scalar] 602 | minimum threshold for ``abs(S)`` and ``ref`` 603 | 604 | top_db : float >= 0 [scalar] 605 | threshold the output at ``top_db`` below the peak: 606 | ``max(10 * log10(S/ref)) - top_db`` 607 | 608 | Returns 609 | ------- 610 | S_db : np.ndarray 611 | ``S_db ~= 10 * log10(S) - 10 * log10(ref)`` 612 | 613 | See Also 614 | -------- 615 | perceptual_weighting 616 | db_to_power 617 | amplitude_to_db 618 | db_to_amplitude 619 | 620 | Notes 621 | ----- 622 | This function caches at level 30. 
623 | 624 | Examples 625 | -------- 626 | Get a power spectrogram from a waveform ``y`` 627 | 628 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 629 | >>> S = np.abs(librosa.stft(y)) 630 | >>> librosa.power_to_db(S**2) 631 | array([[-41.809, -41.809, ..., -41.809, -41.809], 632 | [-41.809, -41.809, ..., -41.809, -41.809], 633 | ..., 634 | [-41.809, -41.809, ..., -41.809, -41.809], 635 | [-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32) 636 | 637 | Compute dB relative to peak power 638 | 639 | >>> librosa.power_to_db(S**2, ref=np.max) 640 | array([[-80., -80., ..., -80., -80.], 641 | [-80., -80., ..., -80., -80.], 642 | ..., 643 | [-80., -80., ..., -80., -80.], 644 | [-80., -80., ..., -80., -80.]], dtype=float32) 645 | 646 | Or compare to median power 647 | 648 | >>> librosa.power_to_db(S**2, ref=np.median) 649 | array([[16.578, 16.578, ..., 16.578, 16.578], 650 | [16.578, 16.578, ..., 16.578, 16.578], 651 | ..., 652 | [16.578, 16.578, ..., 16.578, 16.578], 653 | [16.578, 16.578, ..., 16.578, 16.578]], dtype=float32) 654 | 655 | And plot the results 656 | 657 | >>> import matplotlib.pyplot as plt 658 | >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True) 659 | >>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time', 660 | ... ax=ax[0]) 661 | >>> ax[0].set(title='Power spectrogram') 662 | >>> ax[0].label_outer() 663 | >>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max), 664 | ... sr=sr, y_axis='log', x_axis='time', ax=ax[1]) 665 | >>> ax[1].set(title='Log-Power spectrogram') 666 | >>> fig.colorbar(imgpow, ax=ax[0]) 667 | >>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB") 668 | """ 669 | S = np.asarray(S) 670 | 671 | if amin <= 0: 672 | raise ParameterError("amin must be strictly positive") 673 | 674 | if np.issubdtype(S.dtype, np.complexfloating): 675 | warnings.warn( 676 | "power_to_db was called on complex input so phase " 677 | "information will be discarded. To suppress this warning, " 678 | "call power_to_db(np.abs(D)**2) instead.", 679 | stacklevel=2, 680 | ) 681 | magnitude = np.abs(S) 682 | else: 683 | magnitude = S 684 | 685 | if callable(ref): 686 | # User supplied a function to calculate reference power 687 | ref_value = ref(magnitude) 688 | else: 689 | ref_value = np.abs(ref) 690 | 691 | log_spec: np.ndarray = 10.0 * np.log10(np.maximum(amin, magnitude)) 692 | log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) 693 | 694 | if top_db is not None: 695 | if top_db < 0: 696 | raise ParameterError("top_db must be non-negative") 697 | log_spec = np.maximum(log_spec, log_spec.max() - top_db) 698 | 699 | return log_spec 700 | 701 | 702 | def frames_to_samples( 703 | frames, 704 | *, 705 | hop_length: int = 512, 706 | n_fft: int | None = None, 707 | ) -> np.integer[Any] | np.ndarray: 708 | """Convert frame indices to audio sample indices. 709 | 710 | Parameters 711 | ---------- 712 | frames : number or np.ndarray [shape=(n,)] 713 | frame index or vector of frame indices 714 | hop_length : int > 0 [scalar] 715 | number of samples between successive frames 716 | n_fft : None or int > 0 [scalar] 717 | Optional: length of the FFT window. 718 | If given, time conversion will include an offset of ``n_fft // 2`` 719 | to counteract windowing effects when using a non-centered STFT. 
720 | 721 | Returns 722 | ------- 723 | times : number or np.ndarray 724 | time (in samples) of each given frame number:: 725 | 726 | times[i] = frames[i] * hop_length 727 | 728 | See Also 729 | -------- 730 | frames_to_time : convert frame indices to time values 731 | samples_to_frames : convert sample indices to frame indices 732 | 733 | Examples 734 | -------- 735 | >>> y, sr = librosa.load(librosa.ex('choice')) 736 | >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr) 737 | >>> beat_samples = librosa.frames_to_samples(beats) 738 | """ 739 | offset = 0 740 | if n_fft is not None: 741 | offset = int(n_fft // 2) 742 | 743 | return (np.asanyarray(frames) * hop_length + offset).astype(int) 744 | --------------------------------------------------------------------------------
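As a quick sanity check of the vendored trim() above, here is a small sketch (not part of the repository): it pads a short 440 Hz tone with silence on both sides and confirms that the returned interval covers only the non-silent region.

import numpy as np

from kokoro_onnx.trim import trim

sr = 24000  # same rate as SAMPLE_RATE used elsewhere in the package
tone = (0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)).astype(np.float32)
silence = np.zeros(sr // 2, dtype=np.float32)
y = np.concatenate([silence, tone, silence])

# trim() returns the non-silent slice and the [start, end) sample interval it kept
y_trimmed, (start, end) = trim(y, top_db=60, frame_length=2048, hop_length=512)
print(len(y), len(y_trimmed), start, end)
# y_trimmed should be roughly one second long, aligned to hop_length frame boundaries

With the default ref=np.max, the half-second silent pads sit far more than 60 dB below the tone's peak RMS, so they are dropped, which mirrors how _create_audio's output is trimmed before concatenation in create() and create_stream().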