├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.yaml │ └── feature_request.yaml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── check.yml │ ├── fetch_voices.yml │ ├── publish-to-pypi.yml │ └── test.yml ├── .gitignore ├── .python-version ├── .vscode ├── extensions.json └── settings.json ├── BUILDING.md ├── CONTRIBUTE.md ├── LICENSE ├── README.md ├── examples ├── app.py ├── chinese.py ├── english.py ├── french.py ├── hindi.py ├── italian.py ├── japanse.py ├── play.py ├── podcast.py ├── portuguese.py ├── save.py ├── spanish.py ├── with_blending.py ├── with_espeak_data.py ├── with_espeak_lib.py ├── with_gpu.py ├── with_log.py ├── with_phonemes.py ├── with_provider.py ├── with_quant.py ├── with_session.py ├── with_stream.py ├── with_stream_save.py └── with_voice.py ├── pyproject.toml ├── scripts ├── export.py └── fetch_voices.py ├── src └── kokoro_onnx │ ├── __init__.py │ ├── config.json │ ├── config.py │ ├── log.py │ ├── py.typed │ ├── tokenizer.py │ └── trim.py └── uv.lock /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [thewh1teagle] 4 | ko_fi: thewh1teagle 5 | patreon: # Replace with a single Patreon username 6 | open_collective: # Replace with a single Open Collective username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | buy_me_a_coffee: # Replace with a single Buy Me a Coffee username 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | title: "[Title here. keep it short]" 4 | labels: ["bug"] 5 | assignees: 6 | - octocat 7 | body: 8 | - type: markdown 9 | attributes: 10 | value: | 11 | Thanks for taking the time to fill out this bug report! 12 | One second before you create, search if it's already reported [issues](https://github.com/thewh1teagle/kokoro-onnx/issues?q=is:issue+label:bug+) 13 | 14 | - type: textarea 15 | id: what-happened 16 | attributes: 17 | label: What happened? 18 | description: Also tell us, what did you expect to happen? 19 | placeholder: Tell us what you see! 20 | value: "A bug happened!" 21 | validations: 22 | required: true 23 | - type: textarea 24 | id: steps-to-reproduce 25 | attributes: 26 | label: Steps to reproduce 27 | description: Also tell us, what did you expect to happen? 28 | placeholder: Tell us what you see! adding code example won't hurt. 29 | value: | 30 | 1. step one... 31 | 2. step two... 32 | validations: 33 | required: true 34 | - type: dropdown 35 | id: platforms 36 | attributes: 37 | label: What OS are you seeing the problem on? 38 | multiple: true 39 | options: 40 | - Window 41 | - Linux 42 | - MacOS 43 | - type: input 44 | id: version 45 | attributes: 46 | label: Package version 47 | description: | 48 | Run `uv pip show kokoro-onnx` (omit uv if needed) 49 | placeholder: eg. 
0.4.0 50 | - type: textarea 51 | id: logs 52 | attributes: 53 | label: Relevant log output 54 | description: | 55 | Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 56 | 57 | You can enable logs by setting the LOG_LEVEL=DEBUG environment variable. 58 | 59 | Example (Linux/macOS): 60 | LOG_LEVEL=DEBUG python main.py 61 | 62 | Example (PowerShell): 63 | $env:LOG_LEVEL="DEBUG"; python main.py 64 | 65 | render: shell 66 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | title: '[Title here. keep it short]' 4 | labels: ['feature'] 5 | assignees: 6 | - thewh1teagle 7 | body: 8 | - type: markdown 9 | attributes: 10 | value: | 11 | 💚💜 Thank you for your interest. ❤️💛 12 | *Please check the existing [feature requests](https://github.com/thewh1teagle/kokoro-onnx/issues?q=is:issue+label:feature+) first*. 13 | I will repay with higher-quality code. 14 | 15 | - type: textarea 16 | id: describe-the-feature 17 | attributes: 18 | label: Describe the feature 19 | description: Also tell us why you think it would be useful 20 | placeholder: Description... 21 | validations: 22 | required: true 23 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Checklist 2 | 3 | - [ ] Discussed the idea, or confident it's critical (see CONTRIBUTE.md). 4 | - [ ] One feature/bug per PR (unless minor/related). 5 | - [ ] PR is from a **feature branch**, not `main`. 6 | - [ ] I ran at least one example to ensure the code works. 7 | - [ ] Checked linting/formatting (`uv run ruff format && uv run ruff check`). 8 | 9 | --- 10 | 11 | ## Description 12 | 13 | 14 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | # This Action uses minimal steps to run in ~5 seconds to rapidly: 2 | # look for typos in the codebase using codespell, and 3 | # lint Python code using ruff and provide intuitive GitHub Annotations to contributors. 
4 | name: ci 5 | on: 6 | push: 7 | branches: [main] 8 | pull_request: 9 | branches: [main] 10 | workflow_dispatch: 11 | jobs: 12 | ruff: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: astral-sh/ruff-action@v3 17 | - run: ruff format --diff 18 | -------------------------------------------------------------------------------- /.github/workflows/fetch_voices.yml: -------------------------------------------------------------------------------- 1 | name: fetch voices 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | fetch-voices: 8 | permissions: 9 | contents: write 10 | strategy: 11 | fail-fast: false 12 | 13 | runs-on: macos-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Install the latest version of uv 17 | uses: astral-sh/setup-uv@v5 18 | 19 | - name: fetch voices 20 | run: | 21 | uv run scripts/fetch_voices.py 22 | latestTag=$(gh release list --json isPrerelease,tagName --jq 'map(select(.isPrerelease)) | first | .tagName') 23 | gh release upload $latestTag voices-v1.0.bin --clobber 24 | env: 25 | GH_TOKEN: ${{ github.token }} 26 | 27 | shell: bash 28 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | on: 3 | workflow_dispatch: 4 | 5 | jobs: 6 | publish: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: astral-sh/setup-uv@v5 11 | - name: Test 12 | env: 13 | UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }} 14 | run: | 15 | uv sync 16 | uv build 17 | uv publish 18 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | test: 8 | permissions: 9 | contents: write 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | include: 14 | - platform: "macos-14" # for Arm based macs (M1 and above). 15 | - platform: "macos-13" # for Intel based macs. 
16 | - platform: "ubuntu-22.04" # Ubuntu 22.04 x86_64 17 | - platform: "ubuntu-22.04-arm" # Linux ARM 18 | - platform: "windows-2022" # Windows x86_64 19 | 20 | runs-on: ${{ matrix.platform }} 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Install the latest version of uv 24 | uses: astral-sh/setup-uv@v5 25 | 26 | # https://github.com/crate-ci/typos/issues/1191 27 | - name: Install wget for Windows 28 | if: matrix.platform == 'windows-2022' 29 | run: choco install wget --no-progress 30 | 31 | - name: test 32 | run: | 33 | wget --progress=bar:force:noscroll https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.int8.onnx -O kokoro-v1.0.onnx 34 | wget --progress=bar:force:noscroll https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin -O voices-v1.0.bin 35 | uv run examples/save.py 36 | shell: bash 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # General 10 | *.pt 11 | *.onnx 12 | .DS_Store 13 | *.wav 14 | *.json 15 | !.vscode/*.json 16 | !src/kokoro_onnx/config.json 17 | espeak-ng-data/ 18 | *.tar.gz 19 | *.dylib 20 | *.so 21 | *.dll 22 | *.m4a 23 | *.npz 24 | *.bin 25 | 26 | # Virtual environments 27 | .venv 28 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "charliermarsh.ruff" 4 | ] 5 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "[python]": { 4 | "editor.defaultFormatter": "charliermarsh.ruff", 5 | "editor.codeActionsOnSave": { 6 | "source.organizeImports.ruff": "always" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /BUILDING.md: -------------------------------------------------------------------------------- 1 | # Building 2 | 3 | ## Publish new version 4 | 5 | ```console 6 | rm -rf dist 7 | uv build 8 | UV_PUBLISH_TOKEN="pypi token here" uv publish 9 | ``` 10 | 11 | ## Format and lint 12 | 13 | ```console 14 | uv run ruff format 15 | uv run ruff check 16 | ``` 17 | 18 | ## Log 19 | 20 | Enable log with 21 | 22 | ```console 23 | LOG_LEVEL=DEBUG python main.py 24 | ``` 25 | -------------------------------------------------------------------------------- /CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | # Contributing to kokoro-onnx 2 | 3 | Thanks for thinking about contributing! 🎉 4 | 5 | ## What We Focus On 6 | 7 | This repo is for the kokoro-onnx package and examples. Our focus is on improving the package, adding examples, fixing bugs, and keeping things minimal and simple. We aim to prevent unnecessary complexity and ensure the project stays straightforward. 8 | 9 | Before contributing, **please open a [new issue](https://github.com/thewh1teagle/kokoro-onnx/issues)** to discuss your idea. 
This helps make sure it's a good fit and relevant. We're here to help! 10 | 11 | ## Development Recommendations 12 | 13 | We strongly recommend using [uv](https://docs.astral.sh/uv/getting-started/installation) for development, along with the Visual Studio Code extension suggested in the repository's recommendations. 14 | 15 | Before submitting a pull request, please ensure your code meets the project's formatting and linting standards by running: 16 | 17 | ```console 18 | uv run ruff format 19 | uv run ruff check 20 | ``` 21 | 22 | If you want to use ruff for quick [safety fixes](https://docs.astral.sh/ruff/linter/#fix-safety), 23 | you can run the following command: 24 | 25 | ```console 26 | uv run ruff check --fix 27 | ``` 28 | 29 | ## Pull Request Guidelines 30 | 31 | Do not create a pull request from your main branch. This ensures we can collaborate and edit the PR if needed. 32 | Thank you for contributing and helping improve kokoro-onnx! 🚀 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 github.com/thewh1teagle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kokoro-onnx 2 | 3 | ![Python Version](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) 4 | [![PyPI Release](https://img.shields.io/pypi/v/kokoro-onnx.svg)](https://pypi.org/project/kokoro-onnx/) 5 | [![Github Model Releases](https://img.shields.io/github/v/release/thewh1teagle/kokoro-onnx)](https://github.com/thewh1teagle/kokoro-onnx/releases) 6 | [![License](https://img.shields.io/github/license/thewh1teagle/kokoro-onnx)](https://github.com/thewh1teagle/kokoro-onnx/blob/main/LICENSE) 7 | [![GitHub stars](https://img.shields.io/github/stars/thewh1teagle/kokoro-onnx?style=social)](https://github.com/thewh1teagle/kokoro-onnx/stargazers) 8 | [![PyPI Downloads](https://img.shields.io/pypi/dm/kokoro-onnx?style=plastic)](https://pypi.org/project/kokoro-onnx/) 9 | 10 | [![ONNX Runtime](https://img.shields.io/badge/ONNX%20Runtime-%E2%89%A51.20.1-blue)](https://github.com/microsoft/onnxruntime) 11 | ![CPU](https://img.shields.io/badge/CPU-supported-brightgreen) 12 | ![GPU](https://img.shields.io/badge/GPU-supported-brightgreen) 13 | 14 | TTS with ONNX Runtime, based on [Kokoro-TTS](https://huggingface.co/spaces/hexgrad/Kokoro-TTS) 15 | 16 | 🚀 Version 1.0 models are out now! 🎉 17 | 18 | 19 | 20 | ## Features 21 | 22 | - Supports multiple languages 23 | - Fast, near real-time performance on macOS M1 24 | - Offers multiple voices 25 | - Lightweight: ~300MB (quantized: ~80MB) 26 | 27 | ## Setup 28 | 29 | ```console 30 | pip install -U kokoro-onnx 31 | ``` 32 | 33 | <details> 
34 | 35 | Instructions 36 | 37 | 1. Install [uv](https://docs.astral.sh/uv/getting-started/installation) for isolated Python (Recommended). 38 | 39 | Open the terminal (PowerShell / Bash) and run the install command listed on their website. 40 | 41 | _Note: you don't have to use `uv`, but it makes things much simpler. You can use regular Python as well._ 42 | 43 | 2. Create a new project folder (name it whatever you like) 44 | 3. Run in the project folder 45 | 46 | ```console 47 | uv init -p 3.12 48 | uv add kokoro-onnx soundfile 49 | ``` 50 | 51 | 4. Paste the contents of [`examples/save.py`](https://github.com/thewh1teagle/kokoro-onnx/blob/main/examples/save.py) into `hello.py` (a minimal copy is shown below for reference) 52 | 5. Download [`kokoro-v1.0.onnx`](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx) and [`voices-v1.0.bin`](https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin) and place them in the same directory. 53 | 6. Run 54 | 55 | ```console 56 | uv run hello.py 57 | ``` 58 | 59 | You can edit the text in `hello.py`. 60 | 61 | That's it! `audio.wav` should be created. 62 | 63 | </details> 
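For reference, the finished `hello.py` is essentially a copy of [`examples/save.py`](https://github.com/thewh1teagle/kokoro-onnx/blob/main/examples/save.py) — a minimal sketch of it (using the `af_sarah` voice and English, as in that example) looks like this:

```python
import soundfile as sf

from kokoro_onnx import Kokoro

# Load the model and voices files downloaded in step 5 (same folder as hello.py)
kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")

# Synthesize the text with the chosen voice, speed and language
samples, sample_rate = kokoro.create(
    "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us"
)

# Write the generated samples to a wav file
sf.write("audio.wav", samples, sample_rate)
print("Created audio.wav")
```

Swap in a different voice name from the Voices section below to change the speaker.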
64 | 65 | ## Examples 66 | 67 | See [examples](examples) 68 | 69 | ## Voices 70 | 71 | See the latest voices and languages in [Kokoro-82M/VOICES.md](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) 72 | 73 | Note: It's recommend to use misaki g2p package from v1.0, see [examples](examples) 74 | 75 | ## Contribute 76 | 77 | See [CONTRIBUTE.md](CONTRIBUTE.md) 78 | 79 | ## License 80 | 81 | - kokoro-onnx: MIT 82 | - kokoro model: Apache 2.0 83 | -------------------------------------------------------------------------------- /examples/app.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.12" 3 | # dependencies = [ 4 | # "gradio>=5.13.1", 5 | # "kokoro-onnx>=0.3.8", 6 | # ] 7 | # 8 | # [tool.uv.sources] 9 | # kokoro-onnx = { path = "../" } 10 | # /// 11 | 12 | """ 13 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 14 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 15 | uv run examples/app.py 16 | """ 17 | 18 | import gradio as gr 19 | import numpy as np 20 | 21 | from kokoro_onnx import Kokoro 22 | from kokoro_onnx.tokenizer import Tokenizer 23 | 24 | tokenizer = Tokenizer() 25 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 26 | 27 | 28 | SUPPORTED_LANGUAGES = ["en-us"] 29 | 30 | 31 | def create(text: str, voice: str, language: str, blend_voice_name: str = None): 32 | phonemes = tokenizer.phonemize(text, lang=language) 33 | 34 | # Blending 35 | if blend_voice_name: 36 | first_voice = kokoro.get_voice_style(voice) 37 | second_voice = kokoro.get_voice_style(blend_voice_name) 38 | voice = np.add(first_voice * (50 / 100), second_voice * (50 / 100)) 39 | samples, sample_rate = kokoro.create( 40 | phonemes, voice=voice, speed=1.0, is_phonemes=True 41 | ) 42 | return [(sample_rate, samples), phonemes] 43 | 44 | 45 | def create_app(): 46 | with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])) as ui: 47 | text_input = gr.TextArea( 48 | label="Input Text", 49 | rtl=False, 50 | value="Kokoro TTS. Turning words into emotion, one voice at a time!", 51 | ) 52 | language_input = gr.Dropdown( 53 | label="Language", 54 | value="en-us", 55 | choices=SUPPORTED_LANGUAGES, 56 | ) 57 | voice_input = gr.Dropdown( 58 | label="Voice", value="af_sky", choices=sorted(kokoro.get_voices()) 59 | ) 60 | blend_voice_input = gr.Dropdown( 61 | label="Blend Voice (Optional)", 62 | value=None, 63 | choices=sorted(kokoro.get_voices()) + [None], 64 | ) 65 | submit_button = gr.Button("Create") 66 | phonemes_output = gr.Textbox(label="Phonemes") 67 | audio_output = gr.Audio() 68 | submit_button.click( 69 | fn=create, 70 | inputs=[text_input, voice_input, language_input, blend_voice_input], 71 | outputs=[audio_output, phonemes_output], 72 | ) 73 | return ui 74 | 75 | 76 | ui = create_app() 77 | ui.launch(debug=True) 78 | -------------------------------------------------------------------------------- /examples/chinese.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Run 9 | uv venv -p 3.12 10 | uv pip install -U kokoro-onnx soundfile 'misaki[zh]' 11 | 3. 
12 | Download these files 13 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/kokoro-v1.1-zh.onnx 14 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/voices-v1.1-zh.bin 15 | https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/raw/main/config.json 16 | 4. Run 17 | uv run main.py 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import zh 22 | 23 | from kokoro_onnx import Kokoro 24 | 25 | # Misaki G2P with espeak-ng fallback 26 | g2p = zh.ZHG2P(version="1.1") 27 | 28 | text = "千里之行,始于足下。" 29 | voice = "zf_001" 30 | kokoro = Kokoro("kokoro-v1.1-zh.onnx", "voices-v1.1-zh.bin", vocab_config="config.json") 31 | phonemes, _ = g2p(text) 32 | samples, sample_rate = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True) 33 | sf.write("audio.wav", samples, sample_rate) 34 | print("Created audio.wav") 35 | -------------------------------------------------------------------------------- /examples/english.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import en, espeak 22 | 23 | from kokoro_onnx import Kokoro 24 | 25 | # Misaki G2P with espeak-ng fallback 26 | fallback = espeak.EspeakFallback(british=False) 27 | g2p = en.G2P(trf=False, british=False, fallback=fallback) 28 | 29 | # Kokoro 30 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 31 | 32 | # Phonemize 33 | text = "[Misaki](/misˈɑki/) is a G2P engine designed for [Kokoro](/kˈOkəɹO/) models." 34 | phonemes, _ = g2p(text) 35 | 36 | # Create 37 | samples, sample_rate = kokoro.create(phonemes, "af_heart", is_phonemes=True) 38 | 39 | # Save 40 | sf.write("audio.wav", samples, sample_rate) 41 | print("Created audio.wav") 42 | -------------------------------------------------------------------------------- /examples/french.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. 
Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="fr-fr") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "On ne voit bien qu’avec le cœur. L’essentiel est invisible pour les yeux." 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "ff_siwis", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/hindi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="hi") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "सपने वो नहीं जो हम सोते समय देखते हैं, सपने वो हैं जो हमें सोने नहीं देते।" 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "hf_alpha", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/italian.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. 
Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="it") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "Non sognare la tua vita, vivi il tuo sogno." 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "im_nicola", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/japanse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Run 9 | uv venv -p 3.12 10 | uv pip install -U kokoro-onnx soundfile 'misaki[ja]' 11 | 3. 12 | Download these files 13 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/kokoro-v1.1-zh.onnx 14 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/voices-v1.1-zh.bin 15 | https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/raw/main/config.json 16 | 4. Run 17 | uv run main.py 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import ja 22 | 23 | from kokoro_onnx import Kokoro 24 | 25 | # Misaki G2P with espeak-ng fallback 26 | g2p = ja.JAG2P() 27 | 28 | text = "「人生を夢見るな。夢を生きろ。」" 29 | voice = "jf_alpha" 30 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin", vocab_config="config.json") 31 | phonemes, _ = g2p(text) 32 | samples, sample_rate = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True) 33 | sf.write("audio.wav", samples, sample_rate) 34 | print("Created audio.wav") 35 | -------------------------------------------------------------------------------- /examples/play.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | python examples/play.py 9 | """ 10 | 11 | import sounddevice as sd 12 | 13 | from kokoro_onnx import Kokoro 14 | 15 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 16 | samples, sample_rate = kokoro.create( 17 | "Hello. 
This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 18 | ) 19 | print("Playing audio...") 20 | sd.play(samples, sample_rate) 21 | sd.wait() 22 | -------------------------------------------------------------------------------- /examples/podcast.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/podcast.py 7 | """ 8 | 9 | import random 10 | 11 | import numpy as np 12 | import soundfile as sf 13 | 14 | from kokoro_onnx import Kokoro 15 | 16 | # fmt: off 17 | sentences = [ 18 | { "voice": "af_sarah", "text": "Hello and welcome to the podcast! We’ve got some exciting things lined up today." }, # Sarah 19 | { "voice": "am_michael", "text": "It’s going to be an exciting episode. Stick with us!" }, # Michael 20 | { "voice": "af_sarah", "text": "But first, we’ve got a special guest with us. Please welcome Nicole!" }, # Sarah 21 | { "voice": "af_sarah", "text": "Now, we’ve been told Nicole has a very unique way of speaking today... a bit of a mysterious vibe, if you will." }, # Sarah 22 | { "voice": "af_nicole", "text": "Hey there... I’m so excited to be a guest today... But I thought I’d keep it quiet... for now..." }, # Nicole whispers 23 | { "voice": "am_michael", "text": "Well, it certainly adds some intrigue! Let’s dive in and see what that’s all about." }, # Sarah 24 | { "voice": "af_sarah", "text": "Today, we’re covering something that’s close to our hearts" }, # Sarah 25 | { "voice": "am_michael", "text": "It’s going to be a good one!" } # Michael 26 | ] 27 | 28 | def random_pause(min_duration=0.5, max_duration=2.0): 29 | silence_duration = random.uniform(min_duration, max_duration) 30 | silence = np.zeros(int(silence_duration * sample_rate)) 31 | return silence 32 | 33 | 34 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 35 | 36 | audio = [] 37 | 38 | for sentence in sentences: 39 | voice = sentence["voice"] 40 | text = sentence["text"] 41 | print(f"Creating audio with {voice}: {text}") 42 | 43 | samples, sample_rate = kokoro.create( 44 | text, 45 | voice=voice, 46 | speed=1.0, 47 | lang="en-us", 48 | ) 49 | audio.append(samples) 50 | # Add random silence after each sentence 51 | audio.append(random_pause()) 52 | 53 | # Concatenate all audio parts 54 | audio = np.concatenate(audio) 55 | 56 | # Save the generated audio to file 57 | sf.write("podcast.wav", audio, sample_rate) 58 | print("Created podcast.wav") 59 | -------------------------------------------------------------------------------- /examples/portuguese.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. 
Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="pt-br") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "Não sonhe sua vida, viva seu sonho." 35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "pf_dora", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/save.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/save.py 7 | """ 8 | 9 | import soundfile as sf 10 | 11 | from kokoro_onnx import Kokoro 12 | 13 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 14 | samples, sample_rate = kokoro.create( 15 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 16 | ) 17 | sf.write("audio.wav", samples, sample_rate) 18 | print("Created audio.wav") 19 | -------------------------------------------------------------------------------- /examples/spanish.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | 1. 4 | Install uv from https://docs.astral.sh/uv/getting-started/installation 5 | 2. 6 | Copy this file to new folder 7 | 3. 8 | Download these files 9 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 4. Run 12 | uv venv --seed -p 3.12 13 | source .venv/bin/activate 14 | uv pip install -U kokoro-onnx soundfile 'misaki[en]' 15 | uv run main.py 16 | 17 | For other languages read https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 18 | """ 19 | 20 | import soundfile as sf 21 | from misaki import espeak 22 | from misaki.espeak import EspeakG2P 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | # Misaki G2P with espeak-ng fallback 27 | fallback = espeak.EspeakFallback(british=False) 28 | g2p = EspeakG2P(language="es") 29 | 30 | # Kokoro 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | # Phonemize 34 | text = "No cuentes los días, haz que los días cuenten." 
35 | phonemes, _ = g2p(text) 36 | 37 | # Create 38 | samples, sample_rate = kokoro.create(phonemes, "im_nicola", is_phonemes=True) 39 | 40 | # Save 41 | sf.write("audio.wav", samples, sample_rate) 42 | print("Created audio.wav") 43 | -------------------------------------------------------------------------------- /examples/with_blending.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_blending.py 7 | """ 8 | 9 | import numpy as np 10 | import soundfile as sf 11 | 12 | from kokoro_onnx import Kokoro 13 | 14 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 15 | nicole: np.ndarray = kokoro.get_voice_style("af_nicole") 16 | michael: np.ndarray = kokoro.get_voice_style("am_michael") 17 | blend = np.add(nicole * (50 / 100), michael * (50 / 100)) 18 | samples, sample_rate = kokoro.create( 19 | "Hello. This audio is generated by Kokoro!", 20 | voice=blend, 21 | speed=1.0, 22 | lang="en-us", 23 | ) 24 | sf.write("audio.wav", samples, sample_rate) 25 | print("Created audio.wav") 26 | -------------------------------------------------------------------------------- /examples/with_espeak_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_espeak_data.py 7 | """ 8 | 9 | import soundfile as sf 10 | 11 | from kokoro_onnx import EspeakConfig, Kokoro 12 | 13 | kokoro = Kokoro( 14 | "kokoro-v1.0.onnx", 15 | "voices-v1.0.bin", 16 | espeak_config=EspeakConfig(data_path="./espeak-ng-data"), 17 | ) 18 | samples, sample_rate = kokoro.create( 19 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 20 | ) 21 | sf.write("audio.wav", samples, sample_rate) 22 | print("Created audio.wav") 23 | -------------------------------------------------------------------------------- /examples/with_espeak_lib.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | 9 | PHONEMIZER_ESPEAK_LIBRARY="/usr/local/Cellar/espeak-ng/1.52.0/lib/libespeak-ng.1.dylib" python examples/with_espeak_lib.py 10 | """ 11 | 12 | import os 13 | 14 | import sounddevice as sd 15 | 16 | from kokoro_onnx import EspeakConfig, Kokoro 17 | 18 | kokoro = Kokoro( 19 | "kokoro-v1.0.onnx", 20 | "voices-v1.0.bin", 21 | espeak_config=EspeakConfig(lib_path=os.getenv("PHONEMIZER_ESPEAK_LIBRARY")), 22 | ) 23 | samples, sample_rate = kokoro.create( 24 | "Hello. 
This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 25 | ) 26 | print("Playing audio...") 27 | sd.play(samples, sample_rate) 28 | sd.wait() 29 | -------------------------------------------------------------------------------- /examples/with_gpu.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: 3 | On Linux you need to run this as well: apt-get install portaudio19-dev 4 | gpu version is sufficient only for Linux and Windows. macOS works with GPU by default. 5 | You can see the used execution provider by enable debug log. see with_log.py 6 | 7 | Setup: 8 | pip install -U kokoro-onnx[gpu] sounddevice 9 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 10 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 11 | 12 | Run: 13 | python examples/play.py 14 | """ 15 | 16 | import sounddevice as sd 17 | 18 | from kokoro_onnx import Kokoro 19 | 20 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 21 | samples, sample_rate = kokoro.create( 22 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 23 | ) 24 | print("Playing audio...") 25 | sd.play(samples, sample_rate) 26 | sd.wait() 27 | -------------------------------------------------------------------------------- /examples/with_log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | python examples/with_log.py 9 | """ 10 | 11 | import logging 12 | 13 | import sounddevice as sd 14 | 15 | import kokoro_onnx 16 | from kokoro_onnx import Kokoro 17 | 18 | # You can set the environment variable LOG_LEVEL 19 | # Linux: export LOG_LEVEL=DEBUG 20 | # Windows: $env:LOG_LEVEL="DEBUG" 21 | 22 | # Or programmatically 23 | logging.getLogger(kokoro_onnx.__name__).setLevel("DEBUG") 24 | 25 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 26 | samples, sample_rate = kokoro.create( 27 | "Hello. 
This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 28 | ) 29 | print("Playing audio...") 30 | sd.play(samples, sample_rate) 31 | sd.wait() 32 | -------------------------------------------------------------------------------- /examples/with_phonemes.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx sounddevice 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_phonemes.py 7 | """ 8 | 9 | import sounddevice as sd 10 | 11 | from kokoro_onnx import Kokoro 12 | from kokoro_onnx.tokenizer import Tokenizer 13 | 14 | tokenizer = Tokenizer() 15 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 16 | 17 | phonemes = tokenizer.phonemize("Hello world!") 18 | samples, sample_rate = kokoro.create(phonemes, voice="af_heart", is_phonemes=True) 19 | print("Playing audio...") 20 | sd.play(samples, sample_rate) 21 | sd.wait() 22 | -------------------------------------------------------------------------------- /examples/with_provider.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use a specific ONNX execution provider with Kokoro. 3 | 4 | For available providers, see: 5 | https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377 6 | 7 | Setup: 8 | 1. Install dependencies: pip install -U kokoro-onnx soundfile 9 | 2. Download model and voices: 10 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 11 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 12 | 13 | Run: 14 | macOS/Linux: ONNX_PROVIDER="CPUExecutionProvider" python examples/with_provider.py 15 | Windows PowerShell: $env:ONNX_PROVIDER="CPUExecutionProvider" ; python examples/with_provider.py 16 | """ 17 | 18 | import soundfile as sf 19 | 20 | from kokoro_onnx import Kokoro 21 | 22 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 23 | samples, sample_rate = kokoro.create( 24 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 25 | ) 26 | sf.write("audio.wav", samples, sample_rate) 27 | print("Created audio.wav") 28 | -------------------------------------------------------------------------------- /examples/with_quant.py: -------------------------------------------------------------------------------- 1 | """ 2 | **Smaller models should have lower quality but show no significant quality loss in checks. 3 | 4 | Usage: 5 | 6 | 1. Install dependencies: 7 | sudo apt-get install portaudio19-dev 8 | pip install -U kokoro-onnx sounddevice 9 | 2. Download a model (choose one): 10 | - INT8 (88MB): 11 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.int8.onnx 12 | - FP16 (169MB): 13 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.fp16.onnx 14 | 3. Download voices-v1.0.bin: 15 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 16 | 4. 
Run example: 17 | python examples/with_quant.py 18 | """ 19 | 20 | import sys 21 | 22 | import sounddevice as sd 23 | 24 | from kokoro_onnx import Kokoro 25 | 26 | kokoro = Kokoro(sys.argv[1], "voices-v1.0.bin") 27 | samples, sample_rate = kokoro.create( 28 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 29 | ) 30 | print("Playing audio...") 31 | sd.play(samples, sample_rate) 32 | sd.wait() 33 | -------------------------------------------------------------------------------- /examples/with_session.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_session.py 7 | """ 8 | 9 | import os 10 | 11 | import onnxruntime 12 | import soundfile as sf 13 | from onnxruntime import InferenceSession 14 | 15 | from kokoro_onnx import Kokoro 16 | 17 | 18 | def create_session(): 19 | # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377 20 | providers = onnxruntime.get_available_providers() 21 | print(f"Available onnx runtime providers: {providers}") 22 | 23 | # See session options https://onnxruntime.ai/docs/performance/tune-performance/threading.html#thread-management 24 | sess_options = onnxruntime.SessionOptions() 25 | # Set threads to num of CPU cores 26 | cpu_count = os.cpu_count() 27 | print(f"Setting threads to CPU cores count: {cpu_count}") 28 | sess_options.intra_op_num_threads = cpu_count 29 | session = InferenceSession( 30 | "kokoro-v1.0.onnx", providers=providers, sess_options=sess_options 31 | ) 32 | return session 33 | 34 | 35 | session = create_session() 36 | kokoro = Kokoro.from_session(session, "voices-v1.0.bin") 37 | samples, sample_rate = kokoro.create( 38 | "Hello. This audio generated by kokoro!", voice="af_sarah", speed=1.0, lang="en-us" 39 | ) 40 | sf.write("audio.wav", samples, sample_rate) 41 | print("Created audio.wav") 42 | -------------------------------------------------------------------------------- /examples/with_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: on Linux you need to run this as well: apt-get install portaudio19-dev 3 | 4 | pip install -U kokoro-onnx sounddevice 5 | 6 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 7 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 8 | python examples/with_stream.py 9 | """ 10 | 11 | import asyncio 12 | 13 | import sounddevice as sd 14 | 15 | from kokoro_onnx import Kokoro 16 | 17 | text = """ 18 | We've just been hearing from Matthew Cappucci, a senior meteorologist at the weather app MyRadar, who says Kansas City is seeing its heaviest snow in 32 years - with more than a foot (30 to 40cm) having come down so far. 19 | 20 | Despite it looking as though the storm is slowly moving eastwards, Cappucci says the situation in Kansas and Missouri remains serious. 21 | 22 | He says some areas near the Ohio River are like "skating rinks", telling our colleagues on Newsday that in Missouri in particular there is concern about how many people have lost power, and will lose power, creating enough ice to pull power lines down. 
23 | 24 | Temperatures are set to drop in the next several days, in may cases dipping maybe below minus 10 to minus 15 degrees Celsius for an extended period of time. 25 | 26 | There is a special alert for Kansas, urging people not to leave their homes: "The ploughs are getting stuck, the police are getting stuck, everybody’s getting stuck - stay home." 27 | """ 28 | 29 | 30 | async def main(): 31 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 32 | 33 | stream = kokoro.create_stream( 34 | text, 35 | voice="af_nicole", 36 | speed=1.0, 37 | lang="en-us", 38 | ) 39 | 40 | count = 0 41 | async for samples, sample_rate in stream: 42 | count += 1 43 | print(f"Playing audio stream ({count})...") 44 | sd.play(samples, sample_rate) 45 | sd.wait() 46 | 47 | 48 | asyncio.run(main()) 49 | -------------------------------------------------------------------------------- /examples/with_stream_save.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_stream_save.py 7 | """ 8 | 9 | import asyncio 10 | 11 | import soundfile as sf 12 | 13 | from kokoro_onnx import SAMPLE_RATE, Kokoro 14 | 15 | text = """ 16 | We've just been hearing from Matthew Cappucci, a senior meteorologist at the weather app MyRadar, who says Kansas City is seeing its heaviest snow in 32 years - with more than a foot (30 to 40cm) having come down so far. 17 | 18 | Despite it looking as though the storm is slowly moving eastwards, Cappucci says the situation in Kansas and Missouri remains serious. 19 | 20 | He says some areas near the Ohio River are like "skating rinks", telling our colleagues on Newsday that in Missouri in particular there is concern about how many people have lost power, and will lose power, creating enough ice to pull power lines down. 21 | 22 | Temperatures are set to drop in the next several days, in may cases dipping maybe below minus 10 to minus 15 degrees Celsius for an extended period of time. 23 | 24 | There is a special alert for Kansas, urging people not to leave their homes: "The ploughs are getting stuck, the police are getting stuck, everybody’s getting stuck - stay home." 
25 | """ 26 | 27 | 28 | async def main(): 29 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 30 | 31 | stream = kokoro.create_stream( 32 | text, 33 | voice="af_nicole", 34 | speed=1.0, 35 | lang="en-us", 36 | ) 37 | 38 | with sf.SoundFile("audio.wav", mode="w", samplerate=SAMPLE_RATE, channels=1) as f: 39 | count = 0 40 | async for samples, sample_rate in stream: 41 | count += 1 42 | print(f"Writing chunk {count} of audio stream...") 43 | f.write(samples) 44 | 45 | 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /examples/with_voice.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -U kokoro-onnx soundfile 3 | 4 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx 5 | wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin 6 | python examples/with_voice.py 7 | """ 8 | 9 | import numpy as np 10 | import soundfile as sf 11 | 12 | from kokoro_onnx import Kokoro 13 | from kokoro_onnx.config import SAMPLE_RATE 14 | 15 | kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin") 16 | created = [] 17 | 18 | for voice in kokoro.get_voices(): 19 | samples, sample_rate = kokoro.create( 20 | f"Hello! This audio generated by {voice}!", voice=voice, speed=1.0 21 | ) 22 | created.append(samples) 23 | print(f"Generated audio for {voice}") 24 | 25 | audio = np.concatenate(created) 26 | 27 | sf.write("voices.wav", audio, SAMPLE_RATE) 28 | print("Created voices.wav") 29 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "kokoro-onnx" 3 | version = "0.4.9" 4 | description = "TTS with kokoro and onnx runtime" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "thewh1teagle", email = "61390950+thewh1teagle@users.noreply.github.com" }, 8 | ] 9 | requires-python = ">=3.10,<3.14" 10 | dependencies = [ 11 | "onnxruntime>=1.20.1", 12 | "colorlog>=6.9.0", 13 | "espeakng-loader>=0.2.4", 14 | "phonemizer-fork>=3.3.2", 15 | "numpy>=2.0.2", 16 | ] 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/thewh1teagle/kokoro-onnx" 20 | Repository = "https://github.com/thewh1teagle/kokoro-onnx" 21 | Issues = "https://github.com/thewh1teagle/kokoro-onnx/issues" 22 | 23 | 24 | [project.optional-dependencies] 25 | # Windows/Linux GPU feature 26 | # Install with kokoro-onnx[gpu] 27 | gpu = [ 28 | # onnxruntime-gpu is not available on Linux ARM or macOS 29 | "onnxruntime-gpu>=1.20.1; platform_machine == 'x86_64' and sys_platform != 'darwin'", 30 | ] 31 | 32 | [build-system] 33 | requires = ["hatchling"] 34 | build-backend = "hatchling.build" 35 | 36 | [dependency-groups] 37 | dev = ["ruff>=0.11.0", "sounddevice>=0.5.1", "soundfile>=0.13.0"] 38 | 39 | [tool.ruff] 40 | required-version = ">=0.9.0" 41 | output-format = "concise" 42 | show-fixes = true 43 | 44 | [tool.ruff.lint] 45 | extend-select = ["I", "UP"] 46 | 47 | [tool.ruff.lint.isort] 48 | split-on-trailing-comma = false 49 | -------------------------------------------------------------------------------- /scripts/export.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.12" 3 | # dependencies = [ 4 | # "kokoro==0.8.4", 5 | # "onnx==1.17.0", 6 | # "onnxruntime==1.20.1", 7 | # "sounddevice==0.5.1", 8 | # ] 9 | # 10 | # 
/// 11 | 12 | """ 13 | From https://github.com/hexgrad/kokoro/blob/3f9dd88d6f739b98a86aea608e238621f5b40add/examples/export.py 14 | 15 | mkdir checkpoints 16 | wget https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json -O checkpoints/config.json 17 | wget https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/kokoro-v1_1-zh.pth -O checkpoints/kokoro-v1_1-zh.pth 18 | uv run examples/export.py 19 | uv run examples/export.py --config_file checkpoints/config.json --checkpoint_path checkpoints/kokoro-v1_1-zh.pth 20 | """ 21 | 22 | import argparse 23 | import os 24 | 25 | import onnx 26 | import onnxruntime as ort 27 | import sounddevice as sd 28 | import torch 29 | from kokoro import KModel, KPipeline 30 | from kokoro.model import KModelForONNX 31 | 32 | 33 | def export_onnx(model, output): 34 | onnx_file = output + "/" + "kokoro.onnx" 35 | 36 | input_ids = torch.randint(1, 100, (48,)).numpy() 37 | input_ids = torch.LongTensor([[0, *input_ids, 0]]) 38 | style = torch.randn(1, 256) 39 | speed = torch.randint(1, 10, (1,)).int() 40 | 41 | torch.onnx.export( 42 | model, 43 | args=(input_ids, style, speed), 44 | f=onnx_file, 45 | export_params=True, 46 | verbose=True, 47 | input_names=["input_ids", "style", "speed"], 48 | output_names=["waveform", "duration"], 49 | opset_version=17, 50 | dynamic_axes={ 51 | "input_ids": {1: "input_ids_len"}, 52 | "waveform": {0: "num_samples"}, 53 | }, 54 | do_constant_folding=True, 55 | ) 56 | 57 | print("export kokoro.onnx ok!") 58 | 59 | onnx_model = onnx.load(onnx_file) 60 | onnx.checker.check_model(onnx_model) 61 | print("onnx check ok!") 62 | 63 | 64 | def load_input_ids(pipeline, text): 65 | if pipeline.lang_code in "ab": 66 | _, tokens = pipeline.g2p(text) 67 | for gs, ps, tks in pipeline.en_tokenize(tokens): 68 | if not ps: 69 | continue 70 | else: 71 | ps, _ = pipeline.g2p(text) 72 | 73 | if len(ps) > 510: 74 | ps = ps[:510] 75 | 76 | input_ids = list( 77 | filter(lambda i: i is not None, map(lambda p: pipeline.model.vocab.get(p), ps)) 78 | ) 79 | print(f"text: {text} -> phonemes: {ps} -> input_ids: {input_ids}") 80 | input_ids = torch.LongTensor([[0, *input_ids, 0]]).to(pipeline.model.device) 81 | return ps, input_ids 82 | 83 | 84 | def load_voice(pipeline, voice, phonemes): 85 | pack = pipeline.load_voice(voice).to("cpu") 86 | return pack[len(phonemes) - 1] 87 | 88 | 89 | def load_sample(model): 90 | pipeline = KPipeline(lang_code="a", model=model.kmodel, device="cpu") 91 | text = """ 92 | In today's fast-paced tech world, building software applications has never been easier — thanks to AI-powered coding assistants.' 93 | """ 94 | text = """ 95 | The sky above the port was the color of television, tuned to a dead channel. 
96 | """ 97 | voice = "checkpoints/voices/af_heart.pt" 98 | 99 | pipeline = KPipeline(lang_code="z", model=model.kmodel, device="cpu") 100 | text = """ 101 | 2月15日晚,猫眼专业版数据显示,截至发稿,《哪吒之魔童闹海》(或称《哪吒2》)今日票房已达7.8亿元,累计票房(含预售)超过114亿元。 102 | """ 103 | voice = "checkpoints/voices/zf_xiaoxiao.pt" 104 | 105 | phonemes, input_ids = load_input_ids(pipeline, text) 106 | style = load_voice(pipeline, voice, phonemes) 107 | speed = torch.IntTensor([1]) 108 | 109 | return input_ids, style, speed 110 | 111 | 112 | def inference_onnx(model, output): 113 | onnx_file = output + "/" + "kokoro.onnx" 114 | session = ort.InferenceSession(onnx_file) 115 | 116 | input_ids, style, speed = load_sample(model) 117 | 118 | outputs = session.run( 119 | None, 120 | { 121 | "input_ids": input_ids.numpy(), 122 | "style": style.numpy(), 123 | "speed": speed.numpy(), 124 | }, 125 | ) 126 | 127 | output = torch.from_numpy(outputs[0]) 128 | print(f"output: {output.shape}") 129 | print(output) 130 | 131 | audio = output.numpy() 132 | sd.play(audio, 24000) 133 | sd.wait() 134 | 135 | 136 | def check_model(model): 137 | input_ids, style, speed = load_sample(model) 138 | output, duration = model(input_ids, style, speed) 139 | 140 | print(f"output: {output.shape}") 141 | print(f"duration: {duration.shape}") 142 | print(output) 143 | 144 | audio = output.numpy() 145 | sd.play(audio, 24000) 146 | sd.wait() 147 | 148 | 149 | if __name__ == "__main__": 150 | parser = argparse.ArgumentParser("Export Kokoro model to ONNX", add_help=True) 151 | parser.add_argument( 152 | "--inference", "-t", help="test kokoro.onnx model", action="store_true" 153 | ) 154 | parser.add_argument("--check", "-m", help="check kokoro model", action="store_true") 155 | parser.add_argument( 156 | "--config_file", 157 | "-c", 158 | type=str, 159 | default="checkpoints/config.json", 160 | help="path to config file", 161 | ) 162 | parser.add_argument( 163 | "--checkpoint_path", 164 | "-p", 165 | type=str, 166 | default="checkpoints/kokoro-v1_0.pth", 167 | help="path to checkpoint file", 168 | ) 169 | parser.add_argument( 170 | "--output_dir", "-o", type=str, default="onnx", help="output directory" 171 | ) 172 | 173 | args = parser.parse_args() 174 | 175 | # cfg 176 | config_file = args.config_file # path to the model config file 177 | checkpoint_path = args.checkpoint_path # path to the model checkpoint 178 | output_dir = args.output_dir 179 | 180 | # make dir 181 | os.makedirs(output_dir, exist_ok=True) 182 | 183 | kmodel = KModel(config=config_file, model=checkpoint_path, disable_complex=True) 184 | model = KModelForONNX(kmodel).eval() 185 | 186 | if args.inference: 187 | inference_onnx(model, output_dir) 188 | elif args.check: 189 | check_model(model) 190 | else: 191 | export_onnx(model, output_dir) 192 | -------------------------------------------------------------------------------- /scripts/fetch_voices.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.12" 3 | # dependencies = [ 4 | # "numpy==2.0.2", 5 | # "requests", 6 | # "torch==2.5.1", 7 | # "tqdm==4.67.1", 8 | # ] 9 | # /// 10 | """ 11 | Run this file via: 12 | uv run scripts/fetch_voices.py 13 | 14 | See the available voices at 15 | https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md 16 | """ 17 | 18 | import io 19 | import os 20 | from pathlib import Path 21 | 22 | import numpy as np 23 | import requests 24 | import torch 25 | from tqdm import tqdm 26 | 27 | config = { 28 | "Kokoro-82M-v1.1-zh": { 29 | "voice_url": "https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/voices/{name}.pt", 30 | "api_url": "https://huggingface.co/api/models/hexgrad/Kokoro-82M-v1.1-zh/tree/main/voices", 31 | "npz_path": "voices-v1.1-zh.bin", 32 | }, 33 | # "Kokoro-82M": { 34 | # "voice_url": "https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/voices/{name}.pt", 35 | # "api_url": "https://huggingface.co/api/models/hexgrad/Kokoro-82M/tree/main/voices", 36 | # "npz_path": "voices-v1.0.bin", 37 | # }, 38 | } 39 | # Extract voice names 40 | 41 | 42 | def get_voice_names(api_url): 43 | resp = requests.get(api_url) 44 | resp.raise_for_status() 45 | data = resp.json() 46 | names = [voice["path"].removeprefix("voices/").removesuffix(".pt") for voice in data] 47 | return names 48 | 49 | 50 | def download_config(): 51 | resp = requests.get( 52 | "https://huggingface.co/hexgrad/Kokoro-82M/raw/main/config.json" 53 | ) 54 | resp.raise_for_status() 55 | content = resp.content 56 | with open( 57 | Path(__file__).parent / "../src/kokoro_onnx/config.json", "wb" 58 | ) as fp: 59 | fp.write(content) 60 | 61 | 62 | def download_voices(voice_url: str, names: list[str], npz_path: str): 63 | count = len(names) 64 | 65 | # Extract voice files 66 | print(f"Found {count} voices") 67 | voices = {} 68 | for name in tqdm(names): 69 | url = voice_url.format(name=name) 70 | print(f"Downloading {name}") 71 | r = requests.get(url) 72 | r.raise_for_status() # Ensure the request was successful 73 | content = io.BytesIO(r.content) 74 | data: np.ndarray = torch.load(content, weights_only=True).numpy() 75 | voices[name] = data 76 | 77 | # Save all voices to a single .npz file 78 | with open(npz_path, "wb") as f: 79 | np.savez(f, **voices) 80 | 81 | mb_size = os.path.getsize(npz_path) // 1000 // 1000 82 | print(f"Created {npz_path} ({mb_size}MB)") 83 | 84 | 85 | def main(): 86 | for model_name, model_config in config.items(): 87 | print(f"Downloading {model_name}") 88 | voice_url, api_url, npz_path = ( 89 | model_config["voice_url"], 90 | model_config["api_url"], 91 | model_config["npz_path"], 92 | ) 93 | voice_names = get_voice_names(api_url) 94 | download_voices(voice_url, voice_names, npz_path) 95 | download_config() 96 | 97 | 98 | main() 99 | -------------------------------------------------------------------------------- /src/kokoro_onnx/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import importlib 3 | import importlib.metadata 4 | import importlib.util 5 | import json 6 | import os 7 | import platform 8 | import re 9 | import time 10 | from collections.abc import AsyncGenerator 11 | 12 | import numpy as np 13 | import onnxruntime as rt 14 | from numpy.typing import NDArray 15 | 16 | from .config import MAX_PHONEME_LENGTH, SAMPLE_RATE, EspeakConfig, KoKoroConfig 17 | from .log import log 18 | from .tokenizer import Tokenizer 19 | from .trim import trim as trim_audio 20 | 21 | 22 | class Kokoro: 23 | def __init__( 24 | self, 25 | model_path: str, 26 | voices_path: str, 27 | espeak_config: EspeakConfig | None = None, 28 | vocab_config: dict | str | None = None, 29 | ): 30 | # Show useful information for bug reports 31 | log.debug( 32 | f"kokoro-onnx version {importlib.metadata.version('kokoro-onnx')} on {platform.platform()} {platform.version()}" 33 | ) 34 | self.config = KoKoroConfig(model_path, voices_path, espeak_config) 35 | self.config.validate() 36 | 37 | # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377 38 | providers =
["CPUExecutionProvider"] 39 | 40 | # Check if kokoro-onnx installed with kokoro-onnx[gpu] feature (Windows/Linux) 41 | gpu_enabled = any(dist.metadata["Name"] == "onnxruntime-gpu" for dist in importlib.metadata.distributions()) 42 | if gpu_enabled: 43 | providers: list[str] = rt.get_available_providers() 44 | 45 | # Check if ONNX_PROVIDER environment variable was set 46 | env_provider = os.getenv("ONNX_PROVIDER") 47 | if env_provider: 48 | providers = [env_provider] 49 | 50 | log.debug(f"Providers: {providers}") 51 | self.sess = rt.InferenceSession(model_path, providers=providers) 52 | self.voices: np.ndarray = np.load(voices_path) 53 | 54 | vocab = self._load_vocab(vocab_config) 55 | self.tokenizer = Tokenizer(espeak_config, vocab=vocab) 56 | 57 | @classmethod 58 | def from_session( 59 | cls, 60 | session: rt.InferenceSession, 61 | voices_path: str, 62 | espeak_config: EspeakConfig | None = None, 63 | vocab_config: dict | str | None = None, 64 | ): 65 | instance = cls.__new__(cls) 66 | instance.sess = session 67 | instance.config = KoKoroConfig(session._model_path, voices_path, espeak_config) 68 | instance.config.validate() 69 | instance.voices = np.load(voices_path) 70 | 71 | vocab = instance._load_vocab(vocab_config) 72 | instance.tokenizer = Tokenizer(espeak_config, vocab=vocab) 73 | return instance 74 | 75 | def _load_vocab(self, vocab_config: dict | str | None) -> dict: 76 | """Load vocabulary from config file or dictionary. 77 | 78 | Args: 79 | vocab_config: Path to vocab config file or dictionary containing vocab. 80 | 81 | Returns: 82 | Loaded vocabulary dictionary or empty dictionary if no config provided. 83 | """ 84 | 85 | if isinstance(vocab_config, str): 86 | with open(vocab_config, encoding="utf-8") as fp: 87 | config = json.load(fp) 88 | return config["vocab"] 89 | if isinstance(vocab_config, dict): 90 | return vocab_config["vocab"] 91 | return {} 92 | 93 | def _create_audio( 94 | self, phonemes: str, voice: NDArray[np.float32], speed: float 95 | ) -> tuple[NDArray[np.float32], int]: 96 | log.debug(f"Phonemes: {phonemes}") 97 | if len(phonemes) > MAX_PHONEME_LENGTH: 98 | log.warning( 99 | f"Phonemes are too long, truncating to {MAX_PHONEME_LENGTH} phonemes" 100 | ) 101 | phonemes = phonemes[:MAX_PHONEME_LENGTH] 102 | start_t = time.time() 103 | tokens = np.array(self.tokenizer.tokenize(phonemes), dtype=np.int64) 104 | assert len(tokens) <= MAX_PHONEME_LENGTH, ( 105 | f"Context length is {MAX_PHONEME_LENGTH}, but leave room for the pad token 0 at the start & end" 106 | ) 107 | 108 | voice = voice[len(tokens)] 109 | tokens = [[0, *tokens, 0]] 110 | if "input_ids" in [i.name for i in self.sess.get_inputs()]: 111 | # Newer export versions 112 | inputs = { 113 | "input_ids": tokens, 114 | "style": np.array(voice, dtype=np.float32), 115 | "speed": np.array([speed], dtype=np.int32), 116 | } 117 | else: 118 | inputs = { 119 | "tokens": tokens, 120 | "style": voice, 121 | "speed": np.ones(1, dtype=np.float32) * speed, 122 | } 123 | 124 | audio = self.sess.run(None, inputs)[0] 125 | audio_duration = len(audio) / SAMPLE_RATE 126 | create_duration = time.time() - start_t 127 | rtf = create_duration / audio_duration 128 | log.debug( 129 | f"Created audio of length {audio_duration:.2f}s for {len(phonemes)} phonemes in {create_duration:.2f}s (RTF: {rtf:.2f})" 130 | ) 131 | return audio, SAMPLE_RATE 132 | 133 | def get_voice_style(self, name: str) -> NDArray[np.float32]: 134 | return self.voices[name] 135 | 136 | def _split_phonemes(self, phonemes: str) -> list[str]: 137 | """ 138 | Split phonemes into batches of MAX_PHONEME_LENGTH 139 |
Prefer splitting at punctuation marks. 140 | """ 141 | # Regular expression to split by punctuation and keep them 142 | words = re.split(r"([.,!?;])", phonemes) 143 | batched_phonemes: list[str] = [] 144 | current_batch = "" 145 | 146 | for part in words: 147 | # Remove leading/trailing whitespace 148 | part = part.strip() 149 | 150 | if part: 151 | # If adding the part exceeds the max length, split into a new batch 152 | # TODO: make it more accurate 153 | if len(current_batch) + len(part) + 1 >= MAX_PHONEME_LENGTH: 154 | batched_phonemes.append(current_batch.strip()) 155 | current_batch = part 156 | else: 157 | if part in ".,!?;": 158 | current_batch += part 159 | else: 160 | if current_batch: 161 | current_batch += " " 162 | current_batch += part 163 | 164 | # Append the last batch if it contains any phonemes 165 | if current_batch: 166 | batched_phonemes.append(current_batch.strip()) 167 | 168 | return batched_phonemes 169 | 170 | def create( 171 | self, 172 | text: str, 173 | voice: str | NDArray[np.float32], 174 | speed: float = 1.0, 175 | lang: str = "en-us", 176 | is_phonemes: bool = False, 177 | trim: bool = True, 178 | ) -> tuple[NDArray[np.float32], int]: 179 | """ 180 | Create audio from text using the specified voice and speed. 181 | """ 182 | assert speed >= 0.5 and speed <= 2.0, "Speed should be between 0.5 and 2.0" 183 | 184 | if isinstance(voice, str): 185 | assert voice in self.voices, f"Voice {voice} not found in available voices" 186 | voice = self.get_voice_style(voice) 187 | 188 | start_t = time.time() 189 | if is_phonemes: 190 | phonemes = text 191 | else: 192 | phonemes = self.tokenizer.phonemize(text, lang) 193 | # Split the phonemes into batches of at most MAX_PHONEME_LENGTH, preferring punctuation boundaries 194 | batched_phonemes = self._split_phonemes(phonemes) 195 | 196 | audio = [] 197 | log.debug( 198 | f"Creating audio for {len(batched_phonemes)} batches ({len(phonemes)} phonemes total)" 199 | ) 200 | for phonemes in batched_phonemes: 201 | audio_part, _ = self._create_audio(phonemes, voice, speed) 202 | if trim: 203 | # Trim leading and trailing silence for a more natural sound concatenation 204 | # (initial ~2s, subsequent ~0.02s) 205 | audio_part, _ = trim_audio(audio_part) 206 | audio.append(audio_part) 207 | audio = np.concatenate(audio) 208 | log.debug(f"Created audio in {time.time() - start_t:.2f}s") 209 | return audio, SAMPLE_RATE 210 | 211 | async def create_stream( 212 | self, 213 | text: str, 214 | voice: str | NDArray[np.float32], 215 | speed: float = 1.0, 216 | lang: str = "en-us", 217 | is_phonemes: bool = False, 218 | trim: bool = True, 219 | ) -> AsyncGenerator[tuple[NDArray[np.float32], int], None]: 220 | """ 221 | Stream audio creation asynchronously in the background, yielding chunks as they are processed.
222 | """ 223 | assert speed >= 0.5 and speed <= 2.0, "Speed should be between 0.5 and 2.0" 224 | 225 | if isinstance(voice, str): 226 | assert voice in self.voices, f"Voice {voice} not found in available voices" 227 | voice = self.get_voice_style(voice) 228 | 229 | if is_phonemes: 230 | phonemes = text 231 | else: 232 | phonemes = self.tokenizer.phonemize(text, lang) 233 | 234 | batched_phonemes = self._split_phonemes(phonemes) 235 | queue: asyncio.Queue[tuple[NDArray[np.float32], int] | None] = asyncio.Queue() 236 | 237 | async def process_batches(): 238 | """Process phoneme batches in the background.""" 239 | for i, phonemes in enumerate(batched_phonemes): 240 | loop = asyncio.get_event_loop() 241 | # Execute in separate thread since it's blocking operation 242 | audio_part, sample_rate = await loop.run_in_executor( 243 | None, self._create_audio, phonemes, voice, speed 244 | ) 245 | if trim: 246 | # Trim leading and trailing silence for a more natural sound concatenation 247 | # (initial ~2s, subsequent ~0.02s) 248 | audio_part, _ = trim_audio(audio_part) 249 | log.debug(f"Processed chunk {i} of stream") 250 | await queue.put((audio_part, sample_rate)) 251 | await queue.put(None) # Signal the end of the stream 252 | 253 | # Start processing in the background 254 | asyncio.create_task(process_batches()) 255 | 256 | while True: 257 | chunk = await queue.get() 258 | if chunk is None: 259 | break 260 | yield chunk 261 | 262 | def get_voices(self) -> list[str]: 263 | return list(sorted(self.voices.keys())) 264 | -------------------------------------------------------------------------------- /src/kokoro_onnx/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "istftnet": { 3 | "upsample_kernel_sizes": [20, 12], 4 | "upsample_rates": [10, 6], 5 | "gen_istft_hop_size": 5, 6 | "gen_istft_n_fft": 20, 7 | "resblock_dilation_sizes": [ 8 | [1, 3, 5], 9 | [1, 3, 5], 10 | [1, 3, 5] 11 | ], 12 | "resblock_kernel_sizes": [3, 7, 11], 13 | "upsample_initial_channel": 512 14 | }, 15 | "dim_in": 64, 16 | "dropout": 0.2, 17 | "hidden_dim": 512, 18 | "max_conv_dim": 512, 19 | "max_dur": 50, 20 | "multispeaker": true, 21 | "n_layer": 3, 22 | "n_mels": 80, 23 | "n_token": 178, 24 | "style_dim": 128, 25 | "text_encoder_kernel_size": 5, 26 | "plbert": { 27 | "hidden_size": 768, 28 | "num_attention_heads": 12, 29 | "intermediate_size": 2048, 30 | "max_position_embeddings": 512, 31 | "num_hidden_layers": 12, 32 | "dropout": 0.1 33 | }, 34 | "vocab": { 35 | ";": 1, 36 | ":": 2, 37 | ",": 3, 38 | ".": 4, 39 | "!": 5, 40 | "?": 6, 41 | "—": 9, 42 | "…": 10, 43 | "\"": 11, 44 | "(": 12, 45 | ")": 13, 46 | "“": 14, 47 | "”": 15, 48 | " ": 16, 49 | "\u0303": 17, 50 | "ʣ": 18, 51 | "ʥ": 19, 52 | "ʦ": 20, 53 | "ʨ": 21, 54 | "ᵝ": 22, 55 | "\uAB67": 23, 56 | "A": 24, 57 | "I": 25, 58 | "O": 31, 59 | "Q": 33, 60 | "S": 35, 61 | "T": 36, 62 | "W": 39, 63 | "Y": 41, 64 | "ᵊ": 42, 65 | "a": 43, 66 | "b": 44, 67 | "c": 45, 68 | "d": 46, 69 | "e": 47, 70 | "f": 48, 71 | "h": 50, 72 | "i": 51, 73 | "j": 52, 74 | "k": 53, 75 | "l": 54, 76 | "m": 55, 77 | "n": 56, 78 | "o": 57, 79 | "p": 58, 80 | "q": 59, 81 | "r": 60, 82 | "s": 61, 83 | "t": 62, 84 | "u": 63, 85 | "v": 64, 86 | "w": 65, 87 | "x": 66, 88 | "y": 67, 89 | "z": 68, 90 | "ɑ": 69, 91 | "ɐ": 70, 92 | "ɒ": 71, 93 | "æ": 72, 94 | "β": 75, 95 | "ɔ": 76, 96 | "ɕ": 77, 97 | "ç": 78, 98 | "ɖ": 80, 99 | "ð": 81, 100 | "ʤ": 82, 101 | "ə": 83, 102 | "ɚ": 85, 103 | "ɛ": 86, 104 | "ɜ": 87, 105 | "ɟ": 90, 106 | "ɡ": 92, 107 | "ɥ": 
99, 108 | "ɨ": 101, 109 | "ɪ": 102, 110 | "ʝ": 103, 111 | "ɯ": 110, 112 | "ɰ": 111, 113 | "ŋ": 112, 114 | "ɳ": 113, 115 | "ɲ": 114, 116 | "ɴ": 115, 117 | "ø": 116, 118 | "ɸ": 118, 119 | "θ": 119, 120 | "œ": 120, 121 | "ɹ": 123, 122 | "ɾ": 125, 123 | "ɻ": 126, 124 | "ʁ": 128, 125 | "ɽ": 129, 126 | "ʂ": 130, 127 | "ʃ": 131, 128 | "ʈ": 132, 129 | "ʧ": 133, 130 | "ʊ": 135, 131 | "ʋ": 136, 132 | "ʌ": 138, 133 | "ɣ": 139, 134 | "ɤ": 140, 135 | "χ": 142, 136 | "ʎ": 143, 137 | "ʒ": 147, 138 | "ʔ": 148, 139 | "ˈ": 156, 140 | "ˌ": 157, 141 | "ː": 158, 142 | "ʰ": 162, 143 | "ʲ": 164, 144 | "↓": 169, 145 | "→": 171, 146 | "↗": 172, 147 | "↘": 173, 148 | "ᵻ": 177 149 | } 150 | } -------------------------------------------------------------------------------- /src/kokoro_onnx/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | 5 | MAX_PHONEME_LENGTH = 510 6 | SAMPLE_RATE = 24000 7 | 8 | 9 | @dataclass 10 | class EspeakConfig: 11 | lib_path: str | None = None 12 | data_path: str | None = None 13 | 14 | 15 | class KoKoroConfig: 16 | def __init__( 17 | self, 18 | model_path: str, 19 | voices_path: str, 20 | espeak_config: EspeakConfig | None = None, 21 | ): 22 | self.model_path = model_path 23 | self.voices_path = voices_path 24 | self.espeak_config = espeak_config 25 | 26 | def validate(self): 27 | if not Path(self.voices_path).exists(): 28 | error_msg = f"Voices file not found at {self.voices_path}" 29 | error_msg += ( 30 | "\nYou can download the voices file using the following command:" 31 | ) 32 | error_msg += "\nwget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin" 33 | raise FileNotFoundError(error_msg) 34 | 35 | if not Path(self.model_path).exists(): 36 | error_msg = f"Model file not found at {self.model_path}" 37 | error_msg += "\nYou can download the model file from https://github.com/thewh1teagle/kokoro-onnx/releases" 38 | raise FileNotFoundError(error_msg) 39 | 40 | 41 | def get_vocab(): 42 | with open(Path(__file__).parent / "config.json", encoding="utf-8") as fp: 43 | config = json.load(fp) 44 | return config["vocab"] 45 | 46 | 47 | DEFAULT_VOCAB = get_vocab() 48 | -------------------------------------------------------------------------------- /src/kokoro_onnx/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provide a way to enable logging by setting LOG_LEVEL environment variable 3 | """ 4 | 5 | import logging 6 | import os 7 | 8 | import colorlog 9 | 10 | 11 | def _create_logger(): 12 | """ 13 | Create a logger with colorized output 14 | Usage: LOG_LEVEL=DEBUG python 15 | """ 16 | 17 | handler = colorlog.StreamHandler() 18 | fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s" 19 | handler.setFormatter( 20 | colorlog.ColoredFormatter( 21 | fmt=fmt, 22 | log_colors={ 23 | "DEBUG": "blue", 24 | "INFO": "green", 25 | "WARNING": "yellow", 26 | "ERROR": "red", 27 | "CRITICAL": "red", 28 | }, 29 | ) 30 | ) 31 | # Get log level from LOG_LEVEL environment variable 32 | log_level = os.getenv("LOG_LEVEL", "WARNING").upper() 33 | logger = colorlog.getLogger(__package__) 34 | logger.setLevel(level=getattr(logging, log_level, logging.WARNING)) 35 | # Setup logging to stdout 36 | logger.addHandler(handler) 37 | return logger 38 | 39 | 40 | log = _create_logger() 41 | -------------------------------------------------------------------------------- 
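The config and logging helpers above are everything needed to drive the Kokoro class from __init__.py. The following is a minimal usage sketch, not a file from the repository: it assumes the model and voices files were downloaded as hinted by KoKoroConfig.validate() (kokoro-v1.0.onnx and voices-v1.0.bin are assumed file names), that "af_heart" is one of the bundled voice names, and that soundfile is installed for writing the result to disk.

import soundfile as sf  # assumed extra dependency, used only to write the WAV file

from kokoro_onnx import Kokoro

# File names below are assumptions based on the download hints in KoKoroConfig.validate()
kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")

# create() phonemizes the text, splits it into batches of at most MAX_PHONEME_LENGTH,
# runs the ONNX session on each batch and returns (samples, SAMPLE_RATE)
samples, sample_rate = kokoro.create(
    "Hello from kokoro-onnx!", voice="af_heart", speed=1.0, lang="en-us"
)
sf.write("audio.wav", samples, sample_rate)
# Run with LOG_LEVEL=DEBUG to see the provider and RTF debug logs configured in log.py

The same call works with a raw style vector (get_voice_style or a blend of two styles) instead of a voice name, since create() accepts either a string or an NDArray.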
/src/kokoro_onnx/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thewh1teagle/kokoro-onnx/8edb984a28c3d9b806d208b51fdbca499c81af2b/src/kokoro_onnx/py.typed -------------------------------------------------------------------------------- /src/kokoro_onnx/tokenizer.py: -------------------------------------------------------------------------------- 1 | import ctypes.util 2 | import os 3 | import platform 4 | import sys 5 | 6 | import espeakng_loader 7 | import phonemizer 8 | from phonemizer.backend.espeak.wrapper import EspeakWrapper 9 | 10 | from .config import DEFAULT_VOCAB, MAX_PHONEME_LENGTH, EspeakConfig 11 | from .log import log 12 | 13 | 14 | class Tokenizer: 15 | def __init__(self, espeak_config: EspeakConfig | None = None, vocab: dict | None = None): 16 | self.vocab = vocab or DEFAULT_VOCAB 17 | 18 | if not espeak_config: 19 | espeak_config = EspeakConfig() 20 | if not espeak_config.data_path: 21 | espeak_config.data_path = espeakng_loader.get_data_path() 22 | if not espeak_config.lib_path: 23 | espeak_config.lib_path = espeakng_loader.get_library_path() 24 | 25 | # Check if PHONEMIZER_ESPEAK_LIBRARY was set 26 | if os.getenv("PHONEMIZER_ESPEAK_LIBRARY"): 27 | espeak_config.lib_path = os.getenv("PHONEMIZER_ESPEAK_LIBRARY") 28 | 29 | # Check that the espeak-ng library can be loaded 30 | try: 31 | ctypes.cdll.LoadLibrary(espeak_config.lib_path) 32 | except Exception as e: 33 | log.error(f"Failed to load espeak shared library: {e}") 34 | log.warning("Falling back to system wide espeak-ng library") 35 | 36 | # Fallback system wide load 37 | error_info = ( 38 | "Failed to load espeak-ng from fallback. Please install espeak-ng system wide.\n" 39 | "\tSee https://github.com/espeak-ng/espeak-ng/blob/master/docs/guide.md\n" 40 | "\tNote: you can specify shared library path using PHONEMIZER_ESPEAK_LIBRARY environment variable.\n" 41 | f"Environment:\n\t{platform.platform()} ({platform.release()}) | {sys.version}" 42 | ) 43 | espeak_config.lib_path = ctypes.util.find_library( 44 | "espeak-ng" 45 | ) or ctypes.util.find_library("espeak") 46 | if not espeak_config.lib_path: 47 | raise RuntimeError(error_info) 48 | try: 49 | ctypes.cdll.LoadLibrary(espeak_config.lib_path) 50 | except Exception as e: 51 | raise RuntimeError(f"{e}: {error_info}") 52 | 53 | EspeakWrapper.set_data_path(espeak_config.data_path) 54 | EspeakWrapper.set_library(espeak_config.lib_path) 55 | 56 | @staticmethod 57 | def normalize_text(text) -> str: 58 | return text.strip() 59 | 60 | def tokenize(self, phonemes): 61 | if len(phonemes) > MAX_PHONEME_LENGTH: 62 | raise ValueError( 63 | f"text is too long, must be at most {MAX_PHONEME_LENGTH} phonemes" 64 | ) 65 | return [i for i in map(self.vocab.get, phonemes) if i is not None] 66 | 67 | def phonemize(self, text, lang="en-us", norm=True) -> str: 68 | """ 69 | lang can be 'en-us' or 'en-gb' 70 | """ 71 | if norm: 72 | text = Tokenizer.normalize_text(text) 73 | 74 | phonemes = phonemizer.phonemize( 75 | text, lang, preserve_punctuation=True, with_stress=True 76 | ) 77 | phonemes = "".join(filter(lambda p: p in self.vocab, phonemes)) 78 | return phonemes.strip() 79 | -------------------------------------------------------------------------------- /src/kokoro_onnx/trim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2013--2023, librosa development team.
3 | 4 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 5 | 6 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 7 | 8 | 9 | ***This file extracted from librosa package since we use only the trim() function and librosa requires many dependencies*** 10 | 11 | Reference: 12 | - https://gist.github.com/evq/82e95a363eeeb75d15dd62abc1eb1bde 13 | - https://github.com/librosa/librosa/blob/894942673d55aa2206df1296b6c4c50827c7f1d6/librosa/effects.py#L612 14 | """ 15 | 16 | import warnings 17 | from collections.abc import Callable 18 | from typing import Any 19 | 20 | import numpy as np 21 | from numpy.lib.stride_tricks import as_strided 22 | 23 | 24 | class LibrosaError(Exception): 25 | """The root librosa exception class""" 26 | 27 | pass 28 | 29 | 30 | class ParameterError(LibrosaError): 31 | """Exception class for mal-formed inputs""" 32 | 33 | pass 34 | 35 | 36 | # @numba.vectorize( 37 | # ["float32(complex64)", "float64(complex128)"], nopython=True, cache=True, identity=0 38 | # ) # type: ignore 39 | def _cabs2(x): # pragma: no cover 40 | """Efficiently compute abs2 on complex inputs""" 41 | return x.real**2 + x.imag**2 42 | 43 | 44 | def abs2(x, dtype): 45 | """Compute the squared magnitude of a real or complex array. 46 | 47 | This function is equivalent to calling `np.abs(x)**2` but it 48 | is slightly more efficient. 49 | 50 | Parameters 51 | ---------- 52 | x : np.ndarray or scalar, real or complex typed 53 | The input data, either real (float32, float64) or complex (complex64, complex128) typed 54 | dtype : np.dtype, optional 55 | The data type of the output array. 56 | If not provided, it will be inferred from `x` 57 | 58 | Returns 59 | ------- 60 | p : np.ndarray or scale, real 61 | squared magnitude of `x` 62 | 63 | Examples 64 | -------- 65 | >>> librosa.util.abs2(3 + 4j) 66 | 25.0 67 | 68 | >>> librosa.util.abs2((0.5j)**np.arange(8)) 69 | array([1.000e+00, 2.500e-01, 6.250e-02, 1.562e-02, 3.906e-03, 9.766e-04, 70 | 2.441e-04, 6.104e-05]) 71 | """ 72 | if np.iscomplexobj(x): 73 | # suppress type check, mypy doesn't like vectorization 74 | y = _cabs2(x) 75 | if dtype is None: 76 | return y # type: ignore 77 | else: 78 | return y.astype(dtype) # type: ignore 79 | else: 80 | # suppress type check, mypy doesn't know this is real 81 | return np.square(x, dtype=dtype) # type: ignore 82 | 83 | 84 | def amplitude_to_db( 85 | S, 86 | *, 87 | ref: float | Callable = 1.0, 88 | amin: float = 1e-5, 89 | top_db: float | None = 80.0, 90 | ) -> np.floating[Any] | np.ndarray: 91 | """Convert an amplitude spectrogram to dB-scaled spectrogram. 92 | 93 | This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``, 94 | but is provided for convenience. 95 | 96 | Parameters 97 | ---------- 98 | S : np.ndarray 99 | input amplitude 100 | 101 | ref : scalar or callable 102 | If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``: 103 | ``20 * log10(S / ref)``. 
104 | Zeros in the output correspond to positions where ``S == ref``. 105 | 106 | If callable, the reference value is computed as ``ref(S)``. 107 | 108 | amin : float > 0 [scalar] 109 | minimum threshold for ``S`` and ``ref`` 110 | 111 | top_db : float >= 0 [scalar] 112 | threshold the output at ``top_db`` below the peak: 113 | ``max(20 * log10(S/ref)) - top_db`` 114 | 115 | Returns 116 | ------- 117 | S_db : np.ndarray 118 | ``S`` measured in dB 119 | 120 | See Also 121 | -------- 122 | power_to_db, db_to_amplitude 123 | 124 | Notes 125 | ----- 126 | This function caches at level 30. 127 | """ 128 | S = np.asarray(S) 129 | 130 | if np.issubdtype(S.dtype, np.complexfloating): 131 | warnings.warn( 132 | "amplitude_to_db was called on complex input so phase " 133 | "information will be discarded. To suppress this warning, " 134 | "call amplitude_to_db(np.abs(S)) instead.", 135 | stacklevel=2, 136 | ) 137 | 138 | magnitude = np.abs(S) 139 | 140 | if callable(ref): 141 | # User supplied a function to calculate reference power 142 | ref_value = ref(magnitude) 143 | else: 144 | ref_value = np.abs(ref) 145 | 146 | out_array = magnitude if isinstance(magnitude, np.ndarray) else None 147 | power = np.square(magnitude, out=out_array) 148 | 149 | db: np.ndarray = power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db) 150 | return db 151 | 152 | 153 | def _signal_to_frame_nonsilent( 154 | y: np.ndarray, 155 | frame_length: int = 2048, 156 | hop_length: int = 512, 157 | top_db: float = 60, 158 | ref: Callable | float = np.max, 159 | aggregate: Callable = np.max, 160 | ) -> np.ndarray: 161 | """Frame-wise non-silent indicator for audio input. 162 | 163 | This is a helper function for `trim` and `split`. 164 | 165 | Parameters 166 | ---------- 167 | y : np.ndarray 168 | Audio signal, mono or stereo 169 | 170 | frame_length : int > 0 171 | The number of samples per frame 172 | 173 | hop_length : int > 0 174 | The number of samples between frames 175 | 176 | top_db : number 177 | The threshold (in decibels) below reference to consider as 178 | silence. 179 | You can also use a negative value for `top_db` to treat any value 180 | below `ref + |top_db|` as silent. This will only make sense if 181 | `ref` is not `np.max`. 182 | 183 | ref : callable or float 184 | The reference amplitude 185 | 186 | aggregate : callable [default: np.max] 187 | Function to aggregate dB measurements across channels (if y.ndim > 1) 188 | 189 | Note: for multiple leading axes, this is performed using ``np.apply_over_axes``. 
190 | 191 | Returns 192 | ------- 193 | non_silent : np.ndarray, shape=(m,), dtype=bool 194 | Indicator of non-silent frames 195 | """ 196 | # Compute the MSE for the signal 197 | mse = rms(y=y, frame_length=frame_length, hop_length=hop_length) 198 | 199 | # Convert to decibels and slice out the mse channel 200 | db: np.ndarray = amplitude_to_db(mse[..., 0, :], ref=ref, top_db=None) 201 | 202 | # Aggregate everything but the time dimension 203 | if db.ndim > 1: 204 | db = np.apply_over_axes(aggregate, db, range(db.ndim - 1)) 205 | # Squeeze out leading singleton dimensions here 206 | # We always want to keep the trailing dimension though 207 | db = np.squeeze(db, axis=tuple(range(db.ndim - 1))) 208 | 209 | return db > -top_db 210 | 211 | 212 | def trim( 213 | y: np.ndarray, 214 | *, 215 | top_db: float = 60, 216 | ref: float | Callable = np.max, 217 | frame_length: int = 2048, 218 | hop_length: int = 512, 219 | aggregate: Callable = np.max, 220 | ) -> tuple[np.ndarray, np.ndarray]: 221 | """Trim leading and trailing silence from an audio signal. 222 | 223 | Silence is defined as segments of the audio signal that are `top_db` 224 | decibels (or more) quieter than a reference level, `ref`. 225 | By default, `ref` is set to the signal's maximum RMS value. 226 | It's important to note that if the entire signal maintains a uniform 227 | RMS value, there will be no segments considered quieter than the maximum, 228 | leading to no trimming. 229 | This implies that a completely silent signal will remain untrimmed with the default `ref` setting. 230 | In these situations, an explicit value for `ref` (in decibels) should be used instead. 231 | 232 | Parameters 233 | ---------- 234 | y : np.ndarray, shape=(..., n) 235 | Audio signal. Multi-channel is supported. 236 | top_db : number 237 | The threshold (in decibels) below reference to consider as 238 | silence. 239 | You can also use a negative value for `top_db` to treat any value 240 | below `ref + |top_db|` as silent. This will only make sense if 241 | `ref` is not `np.max`. 242 | ref : number or callable 243 | The reference amplitude. By default, it uses `np.max` and compares 244 | to the peak amplitude in the signal. 245 | frame_length : int > 0 246 | The number of samples per analysis frame 247 | hop_length : int > 0 248 | The number of samples between analysis frames 249 | aggregate : callable [default: np.max] 250 | Function to aggregate across channels (if y.ndim > 1) 251 | 252 | Returns 253 | ------- 254 | y_trimmed : np.ndarray, shape=(..., m) 255 | The trimmed signal 256 | index : np.ndarray, shape=(2,) 257 | the interval of ``y`` corresponding to the non-silent region: 258 | ``y_trimmed = y[index[0]:index[1]]`` (for mono) or 259 | ``y_trimmed = y[:, index[0]:index[1]]`` (for stereo). 
260 | 261 | Examples 262 | -------- 263 | >>> # Load some audio 264 | >>> y, sr = librosa.load(librosa.ex('choice')) 265 | >>> # Trim the beginning and ending silence 266 | >>> yt, index = librosa.effects.trim(y) 267 | >>> # Print the durations 268 | >>> print(librosa.get_duration(y, sr=sr), librosa.get_duration(yt, sr=sr)) 269 | 25.025986394557822 25.007891156462584 270 | """ 271 | non_silent = _signal_to_frame_nonsilent( 272 | y, 273 | frame_length=frame_length, 274 | hop_length=hop_length, 275 | ref=ref, 276 | top_db=top_db, 277 | aggregate=aggregate, 278 | ) 279 | 280 | nonzero = np.flatnonzero(non_silent) 281 | 282 | if nonzero.size > 0: 283 | # Compute the start and end positions 284 | # End position goes one frame past the last non-zero 285 | start = int(frames_to_samples(nonzero[0], hop_length=hop_length)) 286 | end = min( 287 | y.shape[-1], 288 | int(frames_to_samples(nonzero[-1] + 1, hop_length=hop_length)), 289 | ) 290 | else: 291 | # The entire signal is trimmed here: nothing is above the threshold 292 | start, end = 0, 0 293 | 294 | # Slice the buffer and return the corresponding interval 295 | return y[..., start:end], np.asarray([start, end]) 296 | 297 | 298 | def rms( 299 | *, 300 | y: np.ndarray | None = None, 301 | S: np.ndarray | None = None, 302 | frame_length: int = 2048, 303 | hop_length: int = 512, 304 | center: bool = True, 305 | pad_mode="constant", 306 | dtype=np.float32, 307 | ) -> np.ndarray: 308 | """Compute root-mean-square (RMS) value for each frame, either from the 309 | audio samples ``y`` or from a spectrogram ``S``. 310 | 311 | Computing the RMS value from audio samples is faster as it doesn't require 312 | a STFT calculation. However, using a spectrogram will give a more accurate 313 | representation of energy over time because its frames can be windowed, 314 | thus prefer using ``S`` if it's already available. 315 | 316 | Parameters 317 | ---------- 318 | y : np.ndarray [shape=(..., n)] or None 319 | (optional) audio time series. Required if ``S`` is not input. 320 | Multi-channel is supported. 321 | S : np.ndarray [shape=(..., d, t)] or None 322 | (optional) spectrogram magnitude. Required if ``y`` is not input. 323 | frame_length : int > 0 [scalar] 324 | length of analysis frame (in samples) for energy calculation 325 | hop_length : int > 0 [scalar] 326 | hop length for STFT. See `librosa.stft` for details. 327 | center : bool 328 | If `True` and operating on time-domain input (``y``), pad the signal 329 | by ``frame_length//2`` on either side. 330 | If operating on spectrogram input, this has no effect. 331 | pad_mode : str 332 | Padding mode for centered analysis. See `numpy.pad` for valid 333 | values. 334 | dtype : np.dtype, optional 335 | Data type of the output array. Defaults to float32. 
336 | 337 | Returns 338 | ------- 339 | rms : np.ndarray [shape=(..., 1, t)] 340 | RMS value for each frame 341 | 342 | Examples 343 | -------- 344 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 345 | >>> librosa.feature.rms(y=y) 346 | array([[1.248e-01, 1.259e-01, ..., 1.845e-05, 1.796e-05]], 347 | dtype=float32) 348 | 349 | Or from spectrogram input 350 | 351 | >>> S, phase = librosa.magphase(librosa.stft(y)) 352 | >>> rms = librosa.feature.rms(S=S) 353 | 354 | >>> import matplotlib.pyplot as plt 355 | >>> fig, ax = plt.subplots(nrows=2, sharex=True) 356 | >>> times = librosa.times_like(rms) 357 | >>> ax[0].semilogy(times, rms[0], label='RMS Energy') 358 | >>> ax[0].set(xticks=[]) 359 | >>> ax[0].legend() 360 | >>> ax[0].label_outer() 361 | >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), 362 | ... y_axis='log', x_axis='time', ax=ax[1]) 363 | >>> ax[1].set(title='log Power spectrogram') 364 | 365 | Use a STFT window of constant ones and no frame centering to get consistent 366 | results with the RMS computed from the audio samples ``y`` 367 | 368 | >>> S = librosa.magphase(librosa.stft(y, window=np.ones, center=False))[0] 369 | >>> librosa.feature.rms(S=S) 370 | >>> plt.show() 371 | 372 | """ 373 | if y is not None: 374 | if center: 375 | padding = [(0, 0) for _ in range(y.ndim)] 376 | padding[-1] = (int(frame_length // 2), int(frame_length // 2)) 377 | y = np.pad(y, padding, mode=pad_mode) 378 | 379 | x = frame(y, frame_length=frame_length, hop_length=hop_length) 380 | 381 | # Calculate power 382 | power = np.mean(abs2(x, dtype=dtype), axis=-2, keepdims=True) 383 | elif S is not None: 384 | # Check the frame length 385 | if S.shape[-2] != frame_length // 2 + 1: 386 | raise ParameterError( 387 | f"Since S.shape[-2] is {S.shape[-2]}, " 388 | f"frame_length is expected to be {S.shape[-2] * 2 - 2} or {S.shape[-2] * 2 - 1}; " 389 | f"found {frame_length}" 390 | ) 391 | 392 | # power spectrogram 393 | x = abs2(S, dtype=dtype) 394 | 395 | # Adjust the DC and sr/2 component 396 | x[..., 0, :] *= 0.5 397 | if frame_length % 2 == 0: 398 | x[..., -1, :] *= 0.5 399 | 400 | # Calculate power 401 | power = 2 * np.sum(x, axis=-2, keepdims=True) / frame_length**2 402 | else: 403 | raise ParameterError("Either `y` or `S` must be input.") 404 | 405 | rms_result: np.ndarray = np.sqrt(power) 406 | return rms_result 407 | 408 | 409 | def frame( 410 | x: np.ndarray, 411 | *, 412 | frame_length: int, 413 | hop_length: int, 414 | axis: int = -1, 415 | writeable: bool = False, 416 | subok: bool = False, 417 | ) -> np.ndarray: 418 | """Slice a data array into (overlapping) frames. 419 | 420 | This implementation uses low-level stride manipulation to avoid 421 | making a copy of the data. The resulting frame representation 422 | is a new view of the same input data. 423 | 424 | For example, a one-dimensional input ``x = [0, 1, 2, 3, 4, 5, 6]`` 425 | can be framed with frame length 3 and hop length 2 in two ways. 426 | The first (``axis=-1``), results in the array ``x_frames``:: 427 | 428 | [[0, 2, 4], 429 | [1, 3, 5], 430 | [2, 4, 6]] 431 | 432 | where each column ``x_frames[:, i]`` contains a contiguous slice of 433 | the input ``x[i * hop_length : i * hop_length + frame_length]``. 434 | 435 | The second way (``axis=0``) results in the array ``x_frames``:: 436 | 437 | [[0, 1, 2], 438 | [2, 3, 4], 439 | [4, 5, 6]] 440 | 441 | where each row ``x_frames[i]`` contains a contiguous slice of the input. 442 | 443 | This generalizes to higher dimensional inputs, as shown in the examples below. 
444 | In general, the framing operation increments by 1 the number of dimensions, 445 | adding a new "frame axis" either before the framing axis (if ``axis < 0``) 446 | or after the framing axis (if ``axis >= 0``). 447 | 448 | Parameters 449 | ---------- 450 | x : np.ndarray 451 | Array to frame 452 | frame_length : int > 0 [scalar] 453 | Length of the frame 454 | hop_length : int > 0 [scalar] 455 | Number of steps to advance between frames 456 | axis : int 457 | The axis along which to frame. 458 | writeable : bool 459 | If ``False``, then the framed view of ``x`` is read-only. 460 | If ``True``, then the framed view is read-write. Note that writing to the framed view 461 | will also write to the input array ``x`` in this case. 462 | subok : bool 463 | If True, sub-classes will be passed-through, otherwise the returned array will be 464 | forced to be a base-class array (default). 465 | 466 | Returns 467 | ------- 468 | x_frames : np.ndarray [shape=(..., frame_length, N_FRAMES, ...)] 469 | A framed view of ``x``, for example with ``axis=-1`` (framing on the last dimension):: 470 | 471 | x_frames[..., j] == x[..., j * hop_length : j * hop_length + frame_length] 472 | 473 | If ``axis=0`` (framing on the first dimension), then:: 474 | 475 | x_frames[j] = x[j * hop_length : j * hop_length + frame_length] 476 | 477 | Raises 478 | ------ 479 | ParameterError 480 | If ``x.shape[axis] < frame_length``, there is not enough data to fill one frame. 481 | 482 | If ``hop_length < 1``, frames cannot advance. 483 | 484 | See Also 485 | -------- 486 | numpy.lib.stride_tricks.as_strided 487 | 488 | Examples 489 | -------- 490 | Extract 2048-sample frames from monophonic signal with a hop of 64 samples per frame 491 | 492 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 493 | >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64) 494 | >>> frames 495 | array([[-1.407e-03, -2.604e-02, ..., -1.795e-05, -8.108e-06], 496 | [-4.461e-04, -3.721e-02, ..., -1.573e-05, -1.652e-05], 497 | ..., 498 | [ 7.960e-02, -2.335e-01, ..., -6.815e-06, 1.266e-05], 499 | [ 9.568e-02, -1.252e-01, ..., 7.397e-06, -1.921e-05]], 500 | dtype=float32) 501 | >>> y.shape 502 | (117601,) 503 | 504 | >>> frames.shape 505 | (2048, 1806) 506 | 507 | Or frame along the first axis instead of the last: 508 | 509 | >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64, axis=0) 510 | >>> frames.shape 511 | (1806, 2048) 512 | 513 | Frame a stereo signal: 514 | 515 | >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), mono=False) 516 | >>> y.shape 517 | (2, 117601) 518 | >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64) 519 | (2, 2048, 1806) 520 | 521 | Carve an STFT into fixed-length patches of 32 frames with 50% overlap 522 | 523 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 524 | >>> S = np.abs(librosa.stft(y)) 525 | >>> S.shape 526 | (1025, 230) 527 | >>> S_patch = librosa.util.frame(S, frame_length=32, hop_length=16) 528 | >>> S_patch.shape 529 | (1025, 32, 13) 530 | >>> # The first patch contains the first 32 frames of S 531 | >>> np.allclose(S_patch[:, :, 0], S[:, :32]) 532 | True 533 | >>> # The second patch contains frames 16 to 16+32=48, and so on 534 | >>> np.allclose(S_patch[:, :, 1], S[:, 16:48]) 535 | True 536 | """ 537 | # This implementation is derived from numpy.lib.stride_tricks.sliding_window_view (1.20.0) 538 | # https://numpy.org/doc/stable/reference/generated/numpy.lib.stride_tricks.sliding_window_view.html 539 | 540 | x = np.array(x, copy=False, subok=subok) 541 | 542 
| if x.shape[axis] < frame_length: 543 | raise ParameterError( 544 | f"Input is too short (n={x.shape[axis]:d}) for frame_length={frame_length:d}" 545 | ) 546 | 547 | if hop_length < 1: 548 | raise ParameterError(f"Invalid hop_length: {hop_length:d}") 549 | 550 | # put our new within-frame axis at the end for now 551 | out_strides = x.strides + tuple([x.strides[axis]]) 552 | 553 | # Reduce the shape on the framing axis 554 | x_shape_trimmed = list(x.shape) 555 | x_shape_trimmed[axis] -= frame_length - 1 556 | 557 | out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) 558 | xw = as_strided( 559 | x, strides=out_strides, shape=out_shape, subok=subok, writeable=writeable 560 | ) 561 | 562 | if axis < 0: 563 | target_axis = axis - 1 564 | else: 565 | target_axis = axis + 1 566 | 567 | xw = np.moveaxis(xw, -1, target_axis) 568 | 569 | # Downsample along the target axis 570 | slices = [slice(None)] * xw.ndim 571 | slices[axis] = slice(0, None, hop_length) 572 | return xw[tuple(slices)] 573 | 574 | 575 | def power_to_db( 576 | S, 577 | *, 578 | ref: float | Callable = 1.0, 579 | amin: float = 1e-10, 580 | top_db: float | None = 80.0, 581 | ) -> np.floating[Any] | np.ndarray: 582 | """Convert a power spectrogram (amplitude squared) to decibel (dB) units 583 | 584 | This computes the scaling ``10 * log10(S / ref)`` in a numerically 585 | stable way. 586 | 587 | Parameters 588 | ---------- 589 | S : np.ndarray 590 | input power 591 | 592 | ref : scalar or callable 593 | If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:: 594 | 595 | 10 * log10(S / ref) 596 | 597 | Zeros in the output correspond to positions where ``S == ref``. 598 | 599 | If callable, the reference value is computed as ``ref(S)``. 600 | 601 | amin : float > 0 [scalar] 602 | minimum threshold for ``abs(S)`` and ``ref`` 603 | 604 | top_db : float >= 0 [scalar] 605 | threshold the output at ``top_db`` below the peak: 606 | ``max(10 * log10(S/ref)) - top_db`` 607 | 608 | Returns 609 | ------- 610 | S_db : np.ndarray 611 | ``S_db ~= 10 * log10(S) - 10 * log10(ref)`` 612 | 613 | See Also 614 | -------- 615 | perceptual_weighting 616 | db_to_power 617 | amplitude_to_db 618 | db_to_amplitude 619 | 620 | Notes 621 | ----- 622 | This function caches at level 30. 
623 | 624 | Examples 625 | -------- 626 | Get a power spectrogram from a waveform ``y`` 627 | 628 | >>> y, sr = librosa.load(librosa.ex('trumpet')) 629 | >>> S = np.abs(librosa.stft(y)) 630 | >>> librosa.power_to_db(S**2) 631 | array([[-41.809, -41.809, ..., -41.809, -41.809], 632 | [-41.809, -41.809, ..., -41.809, -41.809], 633 | ..., 634 | [-41.809, -41.809, ..., -41.809, -41.809], 635 | [-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32) 636 | 637 | Compute dB relative to peak power 638 | 639 | >>> librosa.power_to_db(S**2, ref=np.max) 640 | array([[-80., -80., ..., -80., -80.], 641 | [-80., -80., ..., -80., -80.], 642 | ..., 643 | [-80., -80., ..., -80., -80.], 644 | [-80., -80., ..., -80., -80.]], dtype=float32) 645 | 646 | Or compare to median power 647 | 648 | >>> librosa.power_to_db(S**2, ref=np.median) 649 | array([[16.578, 16.578, ..., 16.578, 16.578], 650 | [16.578, 16.578, ..., 16.578, 16.578], 651 | ..., 652 | [16.578, 16.578, ..., 16.578, 16.578], 653 | [16.578, 16.578, ..., 16.578, 16.578]], dtype=float32) 654 | 655 | And plot the results 656 | 657 | >>> import matplotlib.pyplot as plt 658 | >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True) 659 | >>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time', 660 | ... ax=ax[0]) 661 | >>> ax[0].set(title='Power spectrogram') 662 | >>> ax[0].label_outer() 663 | >>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max), 664 | ... sr=sr, y_axis='log', x_axis='time', ax=ax[1]) 665 | >>> ax[1].set(title='Log-Power spectrogram') 666 | >>> fig.colorbar(imgpow, ax=ax[0]) 667 | >>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB") 668 | """ 669 | S = np.asarray(S) 670 | 671 | if amin <= 0: 672 | raise ParameterError("amin must be strictly positive") 673 | 674 | if np.issubdtype(S.dtype, np.complexfloating): 675 | warnings.warn( 676 | "power_to_db was called on complex input so phase " 677 | "information will be discarded. To suppress this warning, " 678 | "call power_to_db(np.abs(D)**2) instead.", 679 | stacklevel=2, 680 | ) 681 | magnitude = np.abs(S) 682 | else: 683 | magnitude = S 684 | 685 | if callable(ref): 686 | # User supplied a function to calculate reference power 687 | ref_value = ref(magnitude) 688 | else: 689 | ref_value = np.abs(ref) 690 | 691 | log_spec: np.ndarray = 10.0 * np.log10(np.maximum(amin, magnitude)) 692 | log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) 693 | 694 | if top_db is not None: 695 | if top_db < 0: 696 | raise ParameterError("top_db must be non-negative") 697 | log_spec = np.maximum(log_spec, log_spec.max() - top_db) 698 | 699 | return log_spec 700 | 701 | 702 | def frames_to_samples( 703 | frames, 704 | *, 705 | hop_length: int = 512, 706 | n_fft: int | None = None, 707 | ) -> np.integer[Any] | np.ndarray: 708 | """Convert frame indices to audio sample indices. 709 | 710 | Parameters 711 | ---------- 712 | frames : number or np.ndarray [shape=(n,)] 713 | frame index or vector of frame indices 714 | hop_length : int > 0 [scalar] 715 | number of samples between successive frames 716 | n_fft : None or int > 0 [scalar] 717 | Optional: length of the FFT window. 718 | If given, time conversion will include an offset of ``n_fft // 2`` 719 | to counteract windowing effects when using a non-centered STFT. 
720 | 721 | Returns 722 | ------- 723 | times : number or np.ndarray 724 | time (in samples) of each given frame number:: 725 | 726 | times[i] = frames[i] * hop_length 727 | 728 | See Also 729 | -------- 730 | frames_to_time : convert frame indices to time values 731 | samples_to_frames : convert sample indices to frame indices 732 | 733 | Examples 734 | -------- 735 | >>> y, sr = librosa.load(librosa.ex('choice')) 736 | >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr) 737 | >>> beat_samples = librosa.frames_to_samples(beats) 738 | """ 739 | offset = 0 740 | if n_fft is not None: 741 | offset = int(n_fft // 2) 742 | 743 | return (np.asanyarray(frames) * hop_length + offset).astype(int) 744 | --------------------------------------------------------------------------------
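As a quick sanity check of the vendored trim() above, here is a small sketch (not part of the repository): it pads a short 440 Hz tone with silence on both sides and confirms that the returned interval covers only the non-silent region.

import numpy as np

from kokoro_onnx.trim import trim

sr = 24000  # same rate as SAMPLE_RATE used elsewhere in the package
tone = (0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)).astype(np.float32)
silence = np.zeros(sr // 2, dtype=np.float32)
y = np.concatenate([silence, tone, silence])

# trim() returns the non-silent slice and the [start, end) sample interval it kept
y_trimmed, (start, end) = trim(y, top_db=60, frame_length=2048, hop_length=512)
print(len(y), len(y_trimmed), start, end)
# y_trimmed should be roughly one second long, aligned to hop_length frame boundaries

With the default ref=np.max, the half-second silent pads sit far more than 60 dB below the tone's peak RMS, so they are dropped, which mirrors how _create_audio's output is trimmed before concatenation in create() and create_stream().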