├── .dockerignore ├── .env.sample ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── build.Dockerfile │ ├── build.yml │ └── lint.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── crates ├── sbv2_api │ ├── Cargo.toml │ ├── build.rs │ └── src │ │ ├── error.rs │ │ └── main.rs ├── sbv2_bindings │ ├── Cargo.toml │ ├── pyproject.toml │ └── src │ │ ├── lib.rs │ │ ├── sbv2.rs │ │ └── style.rs ├── sbv2_core │ ├── Cargo.toml │ ├── build.rs │ ├── mora_convert.py │ └── src │ │ ├── bert.rs │ │ ├── error.rs │ │ ├── jtalk.rs │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── model.rs │ │ ├── mora.rs │ │ ├── mora_list.json │ │ ├── nlp.rs │ │ ├── norm.rs │ │ ├── sbv2file.rs │ │ ├── style.rs │ │ ├── tokenizer.rs │ │ ├── tts.rs │ │ ├── tts_util.rs │ │ └── utils.rs ├── sbv2_editor │ ├── Cargo.toml │ ├── README.md │ ├── query2.json │ └── src │ │ ├── error.rs │ │ └── main.rs └── sbv2_wasm │ ├── Cargo.toml │ ├── README.md │ ├── biome.json │ ├── build.sh │ ├── example.js │ ├── package.json │ ├── pnpm-lock.yaml │ ├── src-js │ └── index.ts │ ├── src │ ├── array_helper.rs │ └── lib.rs │ └── tsconfig.json ├── models └── .gitkeep ├── renovate.json ├── scripts ├── .gitignore ├── convert │ ├── .python-version │ ├── README.md │ ├── convert_deberta.py │ ├── convert_model.py │ └── requirements.txt ├── docker │ ├── cpu.Dockerfile │ ├── cuda.Dockerfile │ ├── run_cpu.sh │ └── run_cuda.sh ├── make_dict.sh ├── sbv2-bindings-colab.ipynb ├── sbv2-test-api.py └── sbv2-test-bindings.py └── test.py /.dockerignore: -------------------------------------------------------------------------------- 1 | target/ 2 | models/ 3 | docker/ 4 | .env* 5 | renovate.json 6 | *.py -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | BERT_MODEL_PATH=models/deberta.onnx 2 | 
MODEL_PATH=models/tsukuyomi.sbv2 3 | MODELS_PATH=models 4 | TOKENIZER_PATH=models/tokenizer.json 5 | ADDR=localhost:3000 6 | RUST_LOG=warn 7 | HOLDER_MAX_LOADED_MODElS=20 8 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [tuna2134] 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: バグの報告 3 | about: バグを報告する場所です。 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **バグの説明** 11 | バグのエラーを張ってください 12 | 13 | **再現する方法** 14 | どのようにバグが発生したか時系列でまとめてください。 15 | 16 | **本来の挙動** 17 | 本来動作すべきことについて簡潔にまとめてください。 18 | 19 | **スクリーンショット** 20 | もしもあるならでいいです。 21 | 22 | **端末の情報** 23 | - OS: [e.g. Linux] 24 | 25 | **コード** 26 | ```rs 27 | ここにコード貼ってください 28 | ``` 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 機能追加 3 | about: 機能追加してほしい場合これで作ってください。 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **機能追加の説明** 11 | ここで追加される機能の説明してください。 12 | 13 | **メリット** 14 | ここにメリットを書いてください。 15 | 16 | **デメリット** 17 | ここにデメリットを書いてください。 18 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## 概要 2 | 5 | 6 | ## 関連issue 7 | 11 | 12 | ## 確認 13 | - [ ] 動作確認しましたか? 14 | -------------------------------------------------------------------------------- /.github/workflows/build.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:latest 2 | RUN apt update && apt install openssl libssl-dev curl pkg-config software-properties-common -y && add-apt-repository ppa:deadsnakes/ppa && apt update && apt install python3.7 python3.8 python3.9 python3.10 python3.11 python3.12 python3.13 python3-pip python3 -y 3 | ENV PIP_BREAK_SYSTEM_PACKAGES=1 4 | RUN mkdir -p /root/.cache/sbv2 && curl https://huggingface.co/neody/sbv2-api-assets/resolve/main/dic/all.bin -o /root/.cache/sbv2/all.bin -L -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - '*' 9 | workflow_dispatch: 10 | 11 | permissions: 12 | contents: write 13 | id-token: write 14 | packages: write 15 | 16 | jobs: 17 | python-linux: 18 | runs-on: ${{ matrix.platform.runner }} 19 | strategy: 20 | matrix: 21 | platform: 22 | - runner: ubuntu-latest 23 | target: x86_64 24 | - runner: ubuntu-24.04-arm 25 | target: aarch64 26 | steps: 27 | - uses: actions/checkout@v4 28 
| - uses: actions/setup-python@v5 29 | with: 30 | python-version: 3.x 31 | - run: docker build . -f .github/workflows/build.Dockerfile --tag ci 32 | - name: Build wheels 33 | uses: PyO3/maturin-action@v1 34 | with: 35 | target: ${{ matrix.platform.target }} 36 | args: --release --out dist --find-interpreter 37 | sccache: 'true' 38 | manylinux: auto 39 | container: ci 40 | working-directory: ./crates/sbv2_bindings 41 | - name: Upload wheels 42 | uses: actions/upload-artifact@v4 43 | with: 44 | name: wheels-linux-${{ matrix.platform.target }} 45 | path: ./crates/sbv2_bindings/dist 46 | 47 | python-windows: 48 | runs-on: ${{ matrix.platform.runner }} 49 | strategy: 50 | matrix: 51 | platform: 52 | - runner: windows-latest 53 | target: x64 54 | steps: 55 | - uses: actions/checkout@v4 56 | - uses: actions/setup-python@v5 57 | with: 58 | python-version: 3.x 59 | architecture: ${{ matrix.platform.target }} 60 | - name: Build wheels 61 | uses: PyO3/maturin-action@v1 62 | with: 63 | target: ${{ matrix.platform.target }} 64 | args: --release --out dist --find-interpreter 65 | sccache: 'true' 66 | working-directory: ./crates/sbv2_bindings 67 | - name: Upload wheels 68 | uses: actions/upload-artifact@v4 69 | with: 70 | name: wheels-windows-${{ matrix.platform.target }} 71 | path: ./crates/sbv2_bindings/dist 72 | 73 | python-macos: 74 | runs-on: ${{ matrix.platform.runner }} 75 | strategy: 76 | matrix: 77 | platform: 78 | - runner: macos-14 79 | target: aarch64 80 | steps: 81 | - uses: actions/checkout@v4 82 | - uses: actions/setup-python@v5 83 | with: 84 | python-version: 3.x 85 | - name: Build wheels 86 | uses: PyO3/maturin-action@v1 87 | with: 88 | target: ${{ matrix.platform.target }} 89 | args: --release --out dist --find-interpreter 90 | sccache: 'true' 91 | working-directory: ./crates/sbv2_bindings 92 | - name: Upload wheels 93 | uses: actions/upload-artifact@v4 94 | with: 95 | name: wheels-macos-${{ matrix.platform.target }} 96 | path: ./crates/sbv2_bindings/dist 97 | 
98 | python-sdist: 99 | runs-on: ubuntu-latest 100 | steps: 101 | - uses: actions/checkout@v4 102 | - name: Build sdist 103 | uses: PyO3/maturin-action@v1 104 | with: 105 | command: sdist 106 | args: --out dist 107 | working-directory: ./crates/sbv2_bindings 108 | - name: Upload sdist 109 | uses: actions/upload-artifact@v4 110 | with: 111 | name: wheels-sdist 112 | path: ./crates/sbv2_bindings/dist 113 | 114 | python-wheel: 115 | name: Wheel Upload 116 | runs-on: ubuntu-latest 117 | needs: [python-linux, python-windows, python-macos, python-sdist] 118 | env: 119 | GH_TOKEN: ${{ github.token }} 120 | steps: 121 | - uses: actions/checkout@v4 122 | - run: gh run download ${{ github.run_id }} -p wheels-* 123 | - name: release 124 | run: | 125 | gh release create commit-${GITHUB_SHA:0:8} --prerelease wheels-*/* 126 | 127 | python-release: 128 | name: Release 129 | runs-on: ubuntu-latest 130 | if: "startsWith(github.ref, 'refs/tags/')" 131 | needs: [python-linux, python-windows, python-macos, python-sdist] 132 | environment: release 133 | env: 134 | GH_TOKEN: ${{ github.token }} 135 | steps: 136 | - uses: actions/checkout@v4 137 | - run: gh run download ${{ github.run_id }} -p wheels-* 138 | - name: Publish to PyPI 139 | uses: PyO3/maturin-action@v1 140 | with: 141 | command: upload 142 | args: --non-interactive --skip-existing wheels-*/* 143 | 144 | docker: 145 | runs-on: ${{ matrix.machine.runner }} 146 | strategy: 147 | fail-fast: false 148 | matrix: 149 | machine: 150 | - platform: amd64 151 | runner: ubuntu-latest 152 | - platform: arm64 153 | runner: ubuntu-24.04-arm 154 | tag: [cpu, cuda] 155 | steps: 156 | - name: Prepare 157 | run: | 158 | platform=${{ matrix.machine.platform }} 159 | echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV 160 | 161 | - name: Docker meta 162 | id: meta 163 | uses: docker/metadata-action@v5 164 | with: 165 | images: | 166 | ghcr.io/${{ github.repository }} 167 | 168 | - name: Login to GHCR 169 | uses: docker/login-action@v3 170 | 
with: 171 | registry: ghcr.io 172 | username: ${{ github.repository_owner }} 173 | password: ${{ secrets.GITHUB_TOKEN }} 174 | 175 | - name: Set up QEMU 176 | uses: docker/setup-qemu-action@v3 177 | 178 | - name: Set up Docker Buildx 179 | uses: docker/setup-buildx-action@v3 180 | 181 | - name: Build and push by digest 182 | id: build 183 | uses: docker/build-push-action@v6 184 | with: 185 | labels: ${{ steps.meta.outputs.labels }} 186 | file: ./scripts/docker/${{ matrix.tag }}.Dockerfile 187 | push: true 188 | tags: | 189 | ghcr.io/${{ github.repository }}:latest-${{ matrix.tag }}-${{ matrix.machine.platform }} 190 | 191 | docker-merge: 192 | runs-on: ubuntu-latest 193 | needs: 194 | - docker 195 | steps: 196 | - name: Download digests 197 | uses: actions/download-artifact@v4 198 | with: 199 | path: ${{ runner.temp }}/digests 200 | pattern: digests-* 201 | merge-multiple: true 202 | 203 | - name: Login to GHCR 204 | uses: docker/login-action@v3 205 | with: 206 | registry: ghcr.io 207 | username: ${{ github.repository_owner }} 208 | password: ${{ secrets.GITHUB_TOKEN }} 209 | 210 | - name: Set up Docker Buildx 211 | uses: docker/setup-buildx-action@v3 212 | 213 | - name: Merge 214 | run: | 215 | docker buildx imagetools create -t ghcr.io/${{ github.repository }}:cuda \ 216 | ghcr.io/${{ github.repository }}:latest-cuda-amd64 \ 217 | ghcr.io/${{ github.repository }}:latest-cuda-arm64 218 | docker buildx imagetools create -t ghcr.io/${{ github.repository }}:cpu \ 219 | ghcr.io/${{ github.repository }}:latest-cpu-amd64 \ 220 | ghcr.io/${{ github.repository }}:latest-cpu-arm64 221 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | check: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | components: 13 | - rustfmt 14 | - clippy 15 | 
steps: 16 | - name: Setup 17 | uses: actions/checkout@v4 18 | - uses: actions-rust-lang/setup-rust-toolchain@v1 19 | with: 20 | components: ${{ matrix.components }} 21 | - name: Format 22 | if: ${{ matrix.components == 'rustfmt' }} 23 | run: cargo fmt --all -- --check 24 | - name: Lint 25 | if: ${{ matrix.components == 'clippy' }} 26 | run: cargo clippy --all-targets --all-features -- -D warnings -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | models/ 3 | !models/.gitkeep 4 | venv/ 5 | .env 6 | *.wav 7 | node_modules/ 8 | dist/ 9 | *.csv 10 | *.bin -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "3" 3 | members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm"] 4 | 5 | [workspace.package] 6 | version = "0.2.0-alpha6" 7 | edition = "2021" 8 | description = "Style-Bert-VITSの推論ライブラリ" 9 | license = "MIT" 10 | readme = "./README.md" 11 | repository = "https://github.com/neodyland/sbv2-api" 12 | documentation = "https://docs.rs/sbv2_core" 13 | 14 | [workspace.dependencies] 15 | anyhow = "1.0.96" 16 | dotenvy = "0.15.7" 17 | env_logger = "0.11.6" 18 | ndarray = "0.16.1" 19 | once_cell = "1.20.3" 20 | 21 | [profile.release] 22 | strip = true 23 | opt-level = "z" 24 | lto = true 25 | codegen-units = 1 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 tuna2134 4 | Copyright (c) 2025- neodyland 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the 
Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SBV2-API 2 | 3 | > [!CAUTION] 4 | > 本バージョンはアルファ版です。 5 | > 6 | > 安定版を利用したい場合は[こちら](https://github.com/neodyland/sbv2-api/tree/v0.1.x)をご覧ください。 7 | 8 | > [!CAUTION] 9 | > オプションの辞書はLGPLです。 10 | > 11 | > オプションの辞書を使用する場合、バイナリの内部の辞書部分について、LGPLが適用されます。 12 | 13 | > [!NOTE] 14 | > このレポジトリはメンテナンスの都合上、[tuna2134](https://github.com/tuna2134)氏の所属する[Neodyland](https://neody.land/)へとリポジトリ所在地を移動しました。 15 | > 16 | > 引き続きtuna2134氏がメインメンテナとして管理しています。 17 | 18 | ## プログラミングに詳しくない方向け 19 | 20 | [こちら](https://github.com/tuna2134/sbv2-gui)を参照してください。 21 | 22 | コマンドやpythonの知識なしで簡単に使えるバージョンです。(できることはほぼ同じ) 23 | 24 | ## このプロジェクトについて 25 | 26 | このプロジェクトは Style-Bert-ViTS2 を ONNX 化したものを Rust で実行するのを目的としたライブラリです。 27 | 28 | JP-Extra しか対応していません。(基本的に対応する予定もありません) 29 | 30 | ## 変換方法 31 | 32 | [こちら](https://github.com/neodyland/sbv2-api/tree/main/scripts/convert)を参照してください。 33 | 34 | ## Todo 35 | 36 | - [x] REST API 
の実装 37 | - [x] Rust ライブラリの実装 38 | - [x] `.sbv2`フォーマットの開発 39 | - [x] PyO3 を利用し、 Python から使えるようにする 40 | - [ ] 組み込み向けにCライブラリの作成 41 | - [x] GPU 対応(CUDA) 42 | - [x] GPU 対応(DirectML) 43 | - [x] GPU 対応(CoreML) 44 | - [x] WASM 変換 45 | - [x] arm64のdockerサポート 46 | - [x] aivis形式のサポート 47 | - [ ] MeCabを利用する 48 | 49 | ## 構造説明 50 | 51 | - `crates/sbv2_api` - 推論用 REST API 52 | - `crates/sbv2_core` - 推論コア部分 53 | - `scripts/docker` - docker ビルドスクリプト 54 | - `scripts/convert` - onnx, sbv2フォーマットへの変換スクリプト 55 | 56 | ## プログラミングある程度できる人向けREST API起動方法 57 | 58 | ### models をインストール 59 | 60 | https://huggingface.co/neody/sbv2-api-assets/tree/main/deberta 61 | から`tokenizer.json`,`debert.onnx` 62 | https://huggingface.co/neody/sbv2-api-assets/tree/main/model 63 | から`tsukuyomi.sbv2` 64 | を models フォルダに配置 65 | 66 | ### .env ファイルの作成 67 | 68 | ```sh 69 | cp .env.sample .env 70 | ``` 71 | 72 | ### 起動 73 | 74 | CPUの場合は 75 | ```sh 76 | docker run -it --rm -p 3000:3000 --name sbv2 \ 77 | -v ./models:/work/models --env-file .env \ 78 | ghcr.io/neodyland/sbv2-api:cpu 79 | ``` 80 | 81 |
82 | Apple Silicon搭載のMac(M1以降)の場合 83 | docker上で動作させる場合、.envのADDRをlocalhostから0.0.0.0に変更してください。 84 | 85 | ```yaml 86 | ADDR=0.0.0.0:3000 87 | ``` 88 | 89 | CPUの場合は 90 | ```bash 91 | docker run --platform linux/amd64 -it --rm -p 3000:3000 --name sbv2 \ 92 | -v ./models:/work/models --env-file .env \ 93 | ghcr.io/neodyland/sbv2-api:cpu 94 | ``` 95 |
96 | 97 | CUDAの場合は 98 | ```sh 99 | docker run -it --rm -p 3000:3000 --name sbv2 \ 100 | -v ./models:/work/models --env-file .env \ 101 | --gpus all \ 102 | ghcr.io/neodyland/sbv2-api:cuda 103 | ``` 104 | 105 | ### 起動確認 106 | 107 | ```sh 108 | curl -XPOST -H "Content-type: application/json" -d '{"text": "こんにちは","ident": "tsukuyomi"}' 'http://localhost:3000/synthesize' --output "output.wav" 109 | curl http://localhost:3000/models 110 | ``` 111 | 112 | ## 開発者向けガイド 113 | 114 | ### Feature flags 115 | 116 | `sbv2_api`、`sbv2_core`共に 117 | - `cuda` featureでcuda 118 | - `cuda_tf32` featureでcudaのtf32機能 119 | - `tensorrt` featureでbert部分のtensorrt利用 120 | - `dynamic` featureで手元のonnxruntime共有ライブラリを利用(`ORT_DYLIB_PATH=./libonnxruntime.dll`などで指定) 121 | - `directml` featureでdirectmlの利用ができます。 122 | - `coreml` featureでcoremlの利用ができます。 123 | 124 | ### 環境変数 125 | 126 | 以下の環境変数はライブラリ側では適用されません。 127 | 128 | ライブラリAPIについては`https://docs.rs/sbv2_core`を参照してください。 129 | 130 | - `ADDR` `localhost:3000`などのようにサーバー起動アドレスをコントロールできます。 131 | - `MODELS_PATH` sbv2モデルの存在するフォルダを指定できます。 132 | - `RUST_LOG` おなじみlog levelです。 133 | - `HOLDER_MAX_LOADED_MODElS` RAMにロードされるモデルの最大数を指定します。 134 | 135 | ## 謝辞 136 | 137 | - [litagin02/Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) - このコードを書くにあたり、ベースとなる部分を参考にさせていただきました。 138 | - [Googlefan](https://github.com/Googlefan256) - 彼にモデルを ONNX ヘ変換および効率化をする方法を教わりました。 139 | - [Aivis Project](https://github.com/Aivis-Project/AivisSpeech-Engine) - 辞書部分 140 | -------------------------------------------------------------------------------- /crates/sbv2_api/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sbv2_api" 3 | version.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | readme.workspace = true 7 | repository.workspace = true 8 | documentation.workspace = true 9 | license.workspace = true 10 | 11 | [dependencies] 12 | anyhow.workspace = true 13 | axum = "0.8.0" 
14 | dotenvy.workspace = true 15 | env_logger.workspace = true 16 | log = "0.4.22" 17 | sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core", features = ["aivmx"] } 18 | serde = { version = "1.0.210", features = ["derive"] } 19 | tokio = { version = "1.45.1", features = ["full"] } 20 | utoipa = { version = "5.0.0", features = ["axum_extras"] } 21 | utoipa-scalar = { version = "0.3.0", features = ["axum"] } 22 | 23 | [features] 24 | coreml = ["sbv2_core/coreml"] 25 | cuda = ["sbv2_core/cuda"] 26 | cuda_tf32 = ["sbv2_core/cuda_tf32"] 27 | dynamic = ["sbv2_core/dynamic"] 28 | directml = ["sbv2_core/directml"] 29 | tensorrt = ["sbv2_core/tensorrt"] 30 | -------------------------------------------------------------------------------- /crates/sbv2_api/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | if cfg!(feature = "coreml") { 3 | println!("cargo:rustc-link-arg=-fapple-link-rtlib"); 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /crates/sbv2_api/src/error.rs: -------------------------------------------------------------------------------- 1 | use axum::{ 2 | http::StatusCode, 3 | response::{IntoResponse, Response}, 4 | }; 5 | 6 | pub type AppResult = std::result::Result; 7 | 8 | pub struct AppError(anyhow::Error); 9 | 10 | impl IntoResponse for AppError { 11 | fn into_response(self) -> Response { 12 | ( 13 | StatusCode::INTERNAL_SERVER_ERROR, 14 | format!("Something went wrong: {}", self.0), 15 | ) 16 | .into_response() 17 | } 18 | } 19 | 20 | impl From for AppError 21 | where 22 | E: Into, 23 | { 24 | fn from(err: E) -> Self { 25 | Self(err.into()) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /crates/sbv2_api/src/main.rs: -------------------------------------------------------------------------------- 1 | use axum::{ 2 | extract::State, 3 | http::header::CONTENT_TYPE, 4 | response::IntoResponse, 
5 | routing::{get, post}, 6 | Json, Router, 7 | }; 8 | use sbv2_core::tts::{SynthesizeOptions, TTSModelHolder}; 9 | use serde::Deserialize; 10 | use std::env; 11 | use std::sync::Arc; 12 | use tokio::fs; 13 | use tokio::sync::Mutex; 14 | use utoipa::{OpenApi, ToSchema}; 15 | use utoipa_scalar::{Scalar, Servable}; 16 | 17 | mod error; 18 | use crate::error::AppResult; 19 | 20 | #[derive(OpenApi)] 21 | #[openapi(paths(models, synthesize), components(schemas(SynthesizeRequest)))] 22 | struct ApiDoc; 23 | 24 | #[utoipa::path( 25 | get, 26 | path = "/models", 27 | responses( 28 | (status = 200, description = "Return model list", body = Vec), 29 | ) 30 | )] 31 | async fn models(State(state): State) -> AppResult { 32 | Ok(Json(state.tts_model.lock().await.models())) 33 | } 34 | 35 | fn sdp_default() -> f32 { 36 | 0.0 37 | } 38 | 39 | fn length_default() -> f32 { 40 | 1.0 41 | } 42 | 43 | fn style_id_default() -> i32 { 44 | 0 45 | } 46 | 47 | fn speaker_id_default() -> i64 { 48 | 0 49 | } 50 | 51 | #[derive(Deserialize, ToSchema)] 52 | struct SynthesizeRequest { 53 | text: String, 54 | ident: String, 55 | #[serde(default = "sdp_default")] 56 | #[schema(example = 0.0_f32)] 57 | sdp_ratio: f32, 58 | #[serde(default = "length_default")] 59 | #[schema(example = 1.0_f32)] 60 | length_scale: f32, 61 | #[serde(default = "style_id_default")] 62 | #[schema(example = 0_i32)] 63 | style_id: i32, 64 | #[serde(default = "speaker_id_default")] 65 | #[schema(example = 0_i64)] 66 | speaker_id: i64, 67 | } 68 | 69 | #[utoipa::path( 70 | post, 71 | path = "/synthesize", 72 | request_body = SynthesizeRequest, 73 | responses( 74 | (status = 200, description = "Return audio/wav", body = Vec, content_type = "audio/wav") 75 | ) 76 | )] 77 | async fn synthesize( 78 | State(state): State, 79 | Json(SynthesizeRequest { 80 | text, 81 | ident, 82 | sdp_ratio, 83 | length_scale, 84 | style_id, 85 | speaker_id, 86 | }): Json, 87 | ) -> AppResult { 88 | log::debug!("processing request: text={text}, 
ident={ident}, sdp_ratio={sdp_ratio}, length_scale={length_scale}"); 89 | let buffer = { 90 | let mut tts_model = state.tts_model.lock().await; 91 | tts_model.easy_synthesize( 92 | &ident, 93 | &text, 94 | style_id, 95 | speaker_id, 96 | SynthesizeOptions { 97 | sdp_ratio, 98 | length_scale, 99 | ..Default::default() 100 | }, 101 | )? 102 | }; 103 | Ok(([(CONTENT_TYPE, "audio/wav")], buffer)) 104 | } 105 | 106 | #[derive(Clone)] 107 | struct AppState { 108 | tts_model: Arc>, 109 | } 110 | 111 | impl AppState { 112 | pub async fn new() -> anyhow::Result { 113 | let mut tts_model = TTSModelHolder::new( 114 | &fs::read(env::var("BERT_MODEL_PATH")?).await?, 115 | &fs::read(env::var("TOKENIZER_PATH")?).await?, 116 | env::var("HOLDER_MAX_LOADED_MODElS") 117 | .ok() 118 | .and_then(|x| x.parse().ok()), 119 | )?; 120 | let models = env::var("MODELS_PATH").unwrap_or("models".to_string()); 121 | let mut f = fs::read_dir(&models).await?; 122 | let mut entries = vec![]; 123 | while let Ok(Some(e)) = f.next_entry().await { 124 | let name = e.file_name().to_string_lossy().to_string(); 125 | if name.ends_with(".onnx") && name.starts_with("model_") { 126 | let name_len = name.len(); 127 | let name = name.chars(); 128 | entries.push( 129 | name.collect::>()[6..name_len - 5] 130 | .iter() 131 | .collect::(), 132 | ); 133 | } else if name.ends_with(".sbv2") { 134 | let entry = &name[..name.len() - 5]; 135 | log::info!("Try loading: {entry}"); 136 | let sbv2_bytes = match fs::read(format!("{models}/{entry}.sbv2")).await { 137 | Ok(b) => b, 138 | Err(e) => { 139 | log::warn!("Error loading sbv2_bytes from file {entry}: {e}"); 140 | continue; 141 | } 142 | }; 143 | if let Err(e) = tts_model.load_sbv2file(entry, sbv2_bytes) { 144 | log::warn!("Error loading {entry}: {e}"); 145 | }; 146 | log::info!("Loaded: {entry}"); 147 | } else if name.ends_with(".aivmx") { 148 | let entry = &name[..name.len() - 6]; 149 | log::info!("Try loading: {entry}"); 150 | let aivmx_bytes = match 
fs::read(format!("{models}/{entry}.aivmx")).await { 151 | Ok(b) => b, 152 | Err(e) => { 153 | log::warn!("Error loading aivmx bytes from file {entry}: {e}"); 154 | continue; 155 | } 156 | }; 157 | if let Err(e) = tts_model.load_aivmx(entry, aivmx_bytes) { 158 | log::error!("Error loading {entry}: {e}"); 159 | } 160 | log::info!("Loaded: {entry}"); 161 | } 162 | } 163 | for entry in entries { 164 | log::info!("Try loading: {entry}"); 165 | let style_vectors_bytes = 166 | match fs::read(format!("{models}/style_vectors_{entry}.json")).await { 167 | Ok(b) => b, 168 | Err(e) => { 169 | log::warn!("Error loading style_vectors_bytes from file {entry}: {e}"); 170 | continue; 171 | } 172 | }; 173 | let vits2_bytes = match fs::read(format!("{models}/model_{entry}.onnx")).await { 174 | Ok(b) => b, 175 | Err(e) => { 176 | log::warn!("Error loading vits2_bytes from file {entry}: {e}"); 177 | continue; 178 | } 179 | }; 180 | if let Err(e) = tts_model.load(&entry, style_vectors_bytes, vits2_bytes) { 181 | log::warn!("Error loading {entry}: {e}"); 182 | }; 183 | log::info!("Loaded: {entry}"); 184 | } 185 | Ok(Self { 186 | tts_model: Arc::new(Mutex::new(tts_model)), 187 | }) 188 | } 189 | } 190 | 191 | #[tokio::main] 192 | async fn main() -> anyhow::Result<()> { 193 | dotenvy::dotenv_override().ok(); 194 | env_logger::init(); 195 | let app = Router::new() 196 | .route("/", get(|| async { "Hello, World!" })) 197 | .route("/synthesize", post(synthesize)) 198 | .route("/models", get(models)) 199 | .with_state(AppState::new().await?) 
200 | .merge(Scalar::with_url("/docs", ApiDoc::openapi())); 201 | let addr = env::var("ADDR").unwrap_or("0.0.0.0:3000".to_string()); 202 | let listener = tokio::net::TcpListener::bind(&addr).await?; 203 | log::info!("Listening on {addr}"); 204 | axum::serve(listener, app).await?; 205 | 206 | Ok(()) 207 | } 208 | -------------------------------------------------------------------------------- /crates/sbv2_bindings/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sbv2_bindings" 3 | version.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | readme.workspace = true 7 | repository.workspace = true 8 | documentation.workspace = true 9 | license.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | [lib] 13 | name = "sbv2_bindings" 14 | crate-type = ["cdylib"] 15 | 16 | [dependencies] 17 | anyhow.workspace = true 18 | ndarray.workspace = true 19 | pyo3 = { version = "0.25.0", features = ["anyhow"] } 20 | sbv2_core = { path = "../sbv2_core", features = ["std"], default-features = false } 21 | 22 | [features] 23 | agpl_dict = ["sbv2_core/agpl_dict"] 24 | default = ["agpl_dict"] -------------------------------------------------------------------------------- /crates/sbv2_bindings/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.7,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "sbv2_bindings" 7 | requires-python = ">=3.8" 8 | classifiers = [ 9 | "Programming Language :: Rust", 10 | "Programming Language :: Python :: Implementation :: CPython", 11 | "Programming Language :: Python :: Implementation :: PyPy", 12 | ] 13 | dynamic = ["version"] 14 | 15 | [tool.maturin] 16 | features = ["pyo3/extension-module"] 17 | strip = true 
-------------------------------------------------------------------------------- /crates/sbv2_bindings/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | mod sbv2; 3 | pub mod style; 4 | 5 | /// sbv2 bindings module 6 | #[pymodule] 7 | fn sbv2_bindings(m: &Bound<'_, PyModule>) -> PyResult<()> { 8 | m.add_class::()?; 9 | m.add_class::()?; 10 | Ok(()) 11 | } 12 | -------------------------------------------------------------------------------- /crates/sbv2_bindings/src/sbv2.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyBytes; 3 | use sbv2_core::tts::{SynthesizeOptions, TTSModelHolder}; 4 | 5 | use crate::style::StyleVector; 6 | 7 | use std::fs; 8 | 9 | /// TTSModel class 10 | /// 11 | /// 音声合成するために使うクラス 12 | /// 13 | /// Parameters 14 | /// ---------- 15 | /// bert_model_bytes : bytes 16 | /// BERTモデルのバイナリデータ 17 | /// tokenizer_bytes : bytes 18 | /// トークナイザーのバイナリデータ 19 | #[pyclass] 20 | pub struct TTSModel { 21 | pub model: TTSModelHolder, 22 | } 23 | 24 | #[pymethods] 25 | impl TTSModel { 26 | #[pyo3(signature = (bert_model_bytes, tokenizer_bytes, max_loaded_models=None))] 27 | #[new] 28 | fn new( 29 | bert_model_bytes: Vec, 30 | tokenizer_bytes: Vec, 31 | max_loaded_models: Option, 32 | ) -> anyhow::Result { 33 | Ok(Self { 34 | model: TTSModelHolder::new(bert_model_bytes, tokenizer_bytes, max_loaded_models)?, 35 | }) 36 | } 37 | 38 | /// パスからTTSModelインスタンスを生成する 39 | /// 40 | /// Parameters 41 | /// ---------- 42 | /// bert_model_path : str 43 | /// BERTモデルのパス 44 | /// tokenizer_path : str 45 | /// トークナイザーのパス 46 | /// max_loaded_models: int | None 47 | /// 同時にVRAMに存在するモデルの数 48 | #[pyo3(signature = (bert_model_path, tokenizer_path, max_loaded_models=None))] 49 | #[staticmethod] 50 | fn from_path( 51 | bert_model_path: String, 52 | tokenizer_path: String, 53 | max_loaded_models: Option, 54 | ) -> anyhow::Result 
{ 55 | Ok(Self { 56 | model: TTSModelHolder::new( 57 | fs::read(bert_model_path)?, 58 | fs::read(tokenizer_path)?, 59 | max_loaded_models, 60 | )?, 61 | }) 62 | } 63 | 64 | /// SBV2ファイルを読み込む 65 | /// 66 | /// Parameters 67 | /// ---------- 68 | /// ident : str 69 | /// 識別子 70 | /// sbv2file_bytes : bytes 71 | /// SBV2ファイルのバイナリデータ 72 | fn load_sbv2file(&mut self, ident: String, sbv2file_bytes: Vec) -> anyhow::Result<()> { 73 | self.model.load_sbv2file(ident, sbv2file_bytes)?; 74 | Ok(()) 75 | } 76 | 77 | /// パスからSBV2ファイルを読み込む 78 | /// 79 | /// Parameters 80 | /// ---------- 81 | /// ident : str 82 | /// 識別子 83 | /// sbv2file_path : str 84 | /// SBV2ファイルのパス 85 | fn load_sbv2file_from_path( 86 | &mut self, 87 | ident: String, 88 | sbv2file_path: String, 89 | ) -> anyhow::Result<()> { 90 | self.model.load_sbv2file(ident, fs::read(sbv2file_path)?)?; 91 | Ok(()) 92 | } 93 | 94 | /// スタイルベクトルを取得する 95 | /// 96 | /// Parameters 97 | /// ---------- 98 | /// ident : str 99 | /// 識別子 100 | /// style_id : int 101 | /// スタイルID 102 | /// weight : float 103 | /// 重み 104 | /// 105 | /// Returns 106 | /// ------- 107 | /// style_vector : StyleVector 108 | /// スタイルベクトル 109 | fn get_style_vector( 110 | &mut self, 111 | ident: String, 112 | style_id: i32, 113 | weight: f32, 114 | ) -> anyhow::Result { 115 | Ok(StyleVector::new( 116 | self.model.get_style_vector(ident, style_id, weight)?, 117 | )) 118 | } 119 | 120 | /// テキストから音声を合成する 121 | /// 122 | /// Parameters 123 | /// ---------- 124 | /// text : str 125 | /// テキスト 126 | /// ident : str 127 | /// 識別子 128 | /// style_id : int 129 | /// スタイルID 130 | /// sdp_ratio : float 131 | /// SDP比率 132 | /// length_scale : float 133 | /// 音声の長さのスケール 134 | /// 135 | /// Returns 136 | /// ------- 137 | /// voice_data : bytes 138 | /// 音声データ 139 | #[allow(clippy::too_many_arguments)] 140 | fn synthesize<'p>( 141 | &'p mut self, 142 | py: Python<'p>, 143 | text: String, 144 | ident: String, 145 | style_id: i32, 146 | speaker_id: i64, 147 | 
sdp_ratio: f32, 148 | length_scale: f32, 149 | ) -> anyhow::Result> { 150 | let data = self.model.easy_synthesize( 151 | ident.as_str(), 152 | &text, 153 | style_id, 154 | speaker_id, 155 | SynthesizeOptions { 156 | sdp_ratio, 157 | length_scale, 158 | ..Default::default() 159 | }, 160 | )?; 161 | Ok(PyBytes::new(py, &data)) 162 | } 163 | 164 | fn unload(&mut self, ident: String) -> bool { 165 | self.model.unload(ident) 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /crates/sbv2_bindings/src/style.rs: -------------------------------------------------------------------------------- 1 | use ndarray::Array1; 2 | use pyo3::prelude::*; 3 | 4 | /// StyleVector class 5 | /// 6 | /// スタイルベクトルを表すクラス 7 | #[pyclass] 8 | #[derive(Clone)] 9 | pub struct StyleVector(Array1); 10 | 11 | impl StyleVector { 12 | pub fn new(data: Array1) -> Self { 13 | StyleVector(data) 14 | } 15 | 16 | pub fn get(&self) -> Array1 { 17 | self.0.clone() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /crates/sbv2_core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sbv2_core" 3 | version.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | readme.workspace = true 7 | repository.workspace = true 8 | documentation.workspace = true 9 | license.workspace = true 10 | 11 | [dependencies] 12 | anyhow.workspace = true 13 | base64 = { version = "0.22.1", optional = true } 14 | dotenvy.workspace = true 15 | env_logger.workspace = true 16 | hound = "3.5.1" 17 | jpreprocess = { version = "0.12.0", features = ["naist-jdic"] } 18 | ndarray.workspace = true 19 | npyz = { version = "0.8.4", optional = true } 20 | num_cpus = "1.16.0" 21 | once_cell.workspace = true 22 | ort = { git = "https://github.com/pykeio/ort.git", version = "2.0.0-rc.9", optional = true } 23 | regex = "1.10.6" 24 | serde = { version = 
"1.0.210", features = ["derive"] } 25 | serde_json = "1.0.128" 26 | tar = "0.4.41" 27 | thiserror = "2.0.11" 28 | tokenizers = { version = "0.21.0", default-features = false } 29 | zstd = "0.13.2" 30 | 31 | [features] 32 | cuda = ["ort/cuda", "std"] 33 | cuda_tf32 = ["std", "cuda"] 34 | agpl_dict = [] 35 | std = ["dep:ort", "tokenizers/progressbar", "tokenizers/onig", "tokenizers/esaxx_fast"] 36 | dynamic = ["ort/load-dynamic", "std"] 37 | directml = ["ort/directml", "std"] 38 | tensorrt = ["ort/tensorrt", "std"] 39 | coreml = ["ort/coreml", "std"] 40 | default = ["std", "agpl_dict"] 41 | no_std = ["tokenizers/unstable_wasm"] 42 | aivmx = ["npyz", "base64"] 43 | base64 = ["dep:base64"] 44 | 45 | [build-dependencies] 46 | dirs = "6.0.0" 47 | ureq = "3.0.6" 48 | -------------------------------------------------------------------------------- /crates/sbv2_core/build.rs: -------------------------------------------------------------------------------- 1 | use dirs::home_dir; 2 | use std::env; 3 | use std::fs; 4 | use std::io::copy; 5 | use std::path::PathBuf; 6 | 7 | fn main() -> Result<(), Box> { 8 | let static_dir = home_dir().unwrap().join(".cache/sbv2"); 9 | let static_path = static_dir.join("all.bin"); 10 | let out_path = PathBuf::from(&env::var("OUT_DIR").unwrap()).join("all.bin"); 11 | println!("cargo:rerun-if-changed=build.rs"); 12 | if static_path.exists() { 13 | println!("cargo:info=Dictionary file already exists, skipping download."); 14 | } else { 15 | println!("cargo:warning=Downloading dictionary file..."); 16 | let mut response = 17 | ureq::get("https://huggingface.co/neody/sbv2-api-assets/resolve/main/dic/all.bin") 18 | .call()?; 19 | let mut response = response.body_mut().as_reader(); 20 | if !static_dir.exists() { 21 | fs::create_dir_all(static_dir)?; 22 | } 23 | let mut file = fs::File::create(&static_path)?; 24 | copy(&mut response, &mut file)?; 25 | } 26 | if !out_path.exists() && fs::hard_link(&static_path, &out_path).is_err() { 27 | 
println!("cargo:warning=Failed to create hard link, copying instead."); 28 | fs::copy(static_path, out_path)?; 29 | } 30 | Ok(()) 31 | } 32 | -------------------------------------------------------------------------------- /crates/sbv2_core/mora_convert.py: -------------------------------------------------------------------------------- 1 | # moraに変換します 2 | import json 3 | 4 | 5 | __MORA_LIST_MINIMUM: list[tuple[str, str | None, str]] = [ 6 | ("ヴォ", "v", "o"), 7 | ("ヴェ", "v", "e"), 8 | ("ヴィ", "v", "i"), 9 | ("ヴァ", "v", "a"), 10 | ("ヴ", "v", "u"), 11 | ("ン", None, "N"), 12 | ("ワ", "w", "a"), 13 | ("ロ", "r", "o"), 14 | ("レ", "r", "e"), 15 | ("ル", "r", "u"), 16 | ("リョ", "ry", "o"), 17 | ("リュ", "ry", "u"), 18 | ("リャ", "ry", "a"), 19 | ("リェ", "ry", "e"), 20 | ("リ", "r", "i"), 21 | ("ラ", "r", "a"), 22 | ("ヨ", "y", "o"), 23 | ("ユ", "y", "u"), 24 | ("ヤ", "y", "a"), 25 | ("モ", "m", "o"), 26 | ("メ", "m", "e"), 27 | ("ム", "m", "u"), 28 | ("ミョ", "my", "o"), 29 | ("ミュ", "my", "u"), 30 | ("ミャ", "my", "a"), 31 | ("ミェ", "my", "e"), 32 | ("ミ", "m", "i"), 33 | ("マ", "m", "a"), 34 | ("ポ", "p", "o"), 35 | ("ボ", "b", "o"), 36 | ("ホ", "h", "o"), 37 | ("ペ", "p", "e"), 38 | ("ベ", "b", "e"), 39 | ("ヘ", "h", "e"), 40 | ("プ", "p", "u"), 41 | ("ブ", "b", "u"), 42 | ("フォ", "f", "o"), 43 | ("フェ", "f", "e"), 44 | ("フィ", "f", "i"), 45 | ("ファ", "f", "a"), 46 | ("フ", "f", "u"), 47 | ("ピョ", "py", "o"), 48 | ("ピュ", "py", "u"), 49 | ("ピャ", "py", "a"), 50 | ("ピェ", "py", "e"), 51 | ("ピ", "p", "i"), 52 | ("ビョ", "by", "o"), 53 | ("ビュ", "by", "u"), 54 | ("ビャ", "by", "a"), 55 | ("ビェ", "by", "e"), 56 | ("ビ", "b", "i"), 57 | ("ヒョ", "hy", "o"), 58 | ("ヒュ", "hy", "u"), 59 | ("ヒャ", "hy", "a"), 60 | ("ヒェ", "hy", "e"), 61 | ("ヒ", "h", "i"), 62 | ("パ", "p", "a"), 63 | ("バ", "b", "a"), 64 | ("ハ", "h", "a"), 65 | ("ノ", "n", "o"), 66 | ("ネ", "n", "e"), 67 | ("ヌ", "n", "u"), 68 | ("ニョ", "ny", "o"), 69 | ("ニュ", "ny", "u"), 70 | ("ニャ", "ny", "a"), 71 | ("ニェ", "ny", "e"), 72 | ("ニ", "n", "i"), 73 | ("ナ", "n", "a"), 74 | 
("ドゥ", "d", "u"), 75 | ("ド", "d", "o"), 76 | ("トゥ", "t", "u"), 77 | ("ト", "t", "o"), 78 | ("デョ", "dy", "o"), 79 | ("デュ", "dy", "u"), 80 | ("デャ", "dy", "a"), 81 | # ("デェ", "dy", "e"), 82 | ("ディ", "d", "i"), 83 | ("デ", "d", "e"), 84 | ("テョ", "ty", "o"), 85 | ("テュ", "ty", "u"), 86 | ("テャ", "ty", "a"), 87 | ("ティ", "t", "i"), 88 | ("テ", "t", "e"), 89 | ("ツォ", "ts", "o"), 90 | ("ツェ", "ts", "e"), 91 | ("ツィ", "ts", "i"), 92 | ("ツァ", "ts", "a"), 93 | ("ツ", "ts", "u"), 94 | ("ッ", None, "q"), # 「cl」から「q」に変更 95 | ("チョ", "ch", "o"), 96 | ("チュ", "ch", "u"), 97 | ("チャ", "ch", "a"), 98 | ("チェ", "ch", "e"), 99 | ("チ", "ch", "i"), 100 | ("ダ", "d", "a"), 101 | ("タ", "t", "a"), 102 | ("ゾ", "z", "o"), 103 | ("ソ", "s", "o"), 104 | ("ゼ", "z", "e"), 105 | ("セ", "s", "e"), 106 | ("ズィ", "z", "i"), 107 | ("ズ", "z", "u"), 108 | ("スィ", "s", "i"), 109 | ("ス", "s", "u"), 110 | ("ジョ", "j", "o"), 111 | ("ジュ", "j", "u"), 112 | ("ジャ", "j", "a"), 113 | ("ジェ", "j", "e"), 114 | ("ジ", "j", "i"), 115 | ("ショ", "sh", "o"), 116 | ("シュ", "sh", "u"), 117 | ("シャ", "sh", "a"), 118 | ("シェ", "sh", "e"), 119 | ("シ", "sh", "i"), 120 | ("ザ", "z", "a"), 121 | ("サ", "s", "a"), 122 | ("ゴ", "g", "o"), 123 | ("コ", "k", "o"), 124 | ("ゲ", "g", "e"), 125 | ("ケ", "k", "e"), 126 | ("グヮ", "gw", "a"), 127 | ("グ", "g", "u"), 128 | ("クヮ", "kw", "a"), 129 | ("ク", "k", "u"), 130 | ("ギョ", "gy", "o"), 131 | ("ギュ", "gy", "u"), 132 | ("ギャ", "gy", "a"), 133 | ("ギェ", "gy", "e"), 134 | ("ギ", "g", "i"), 135 | ("キョ", "ky", "o"), 136 | ("キュ", "ky", "u"), 137 | ("キャ", "ky", "a"), 138 | ("キェ", "ky", "e"), 139 | ("キ", "k", "i"), 140 | ("ガ", "g", "a"), 141 | ("カ", "k", "a"), 142 | ("オ", None, "o"), 143 | ("エ", None, "e"), 144 | ("ウォ", "w", "o"), 145 | ("ウェ", "w", "e"), 146 | ("ウィ", "w", "i"), 147 | ("ウ", None, "u"), 148 | ("イェ", "y", "e"), 149 | ("イ", None, "i"), 150 | ("ア", None, "a"), 151 | ] 152 | __MORA_LIST_ADDITIONAL: list[tuple[str, str | None, str]] = [ 153 | ("ヴョ", "by", "o"), 154 | ("ヴュ", "by", "u"), 155 | ("ヴャ", "by", "a"), 156 | 
("ヲ", None, "o"), 157 | ("ヱ", None, "e"), 158 | ("ヰ", None, "i"), 159 | ("ヮ", "w", "a"), 160 | ("ョ", "y", "o"), 161 | ("ュ", "y", "u"), 162 | ("ヅ", "z", "u"), 163 | ("ヂ", "j", "i"), 164 | ("ヶ", "k", "e"), 165 | ("ャ", "y", "a"), 166 | ("ォ", None, "o"), 167 | ("ェ", None, "e"), 168 | ("ゥ", None, "u"), 169 | ("ィ", None, "i"), 170 | ("ァ", None, "a"), 171 | ] 172 | 173 | data = {"minimum": [], "additional": []} 174 | 175 | 176 | for mora, consonant, vowel in __MORA_LIST_MINIMUM: 177 | data["minimum"].append( 178 | { 179 | "mora": mora, 180 | "consonant": consonant, 181 | "vowel": vowel, 182 | } 183 | ) 184 | 185 | for mora, consonant, vowel in __MORA_LIST_ADDITIONAL: 186 | data["additional"].append( 187 | { 188 | "mora": mora, 189 | "consonant": consonant, 190 | "vowel": vowel, 191 | } 192 | ) 193 | 194 | 195 | with open("src/mora_list.json", "w") as f: 196 | json.dump(data, f, ensure_ascii=False, indent=4) 197 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/bert.rs: -------------------------------------------------------------------------------- 1 | use crate::error::Result; 2 | use ndarray::{Array2, Ix2}; 3 | use ort::session::Session; 4 | use ort::value::TensorRef; 5 | 6 | pub fn predict( 7 | session: &mut Session, 8 | token_ids: Vec, 9 | attention_masks: Vec, 10 | ) -> Result> { 11 | let outputs = session.run( 12 | ort::inputs! { 13 | "input_ids" => TensorRef::from_array_view((vec![1, token_ids.len() as i64], token_ids.as_slice()))?, 14 | "attention_mask" => TensorRef::from_array_view((vec![1, attention_masks.len() as i64], attention_masks.as_slice()))?, 15 | } 16 | )?; 17 | let output = outputs["output"] 18 | .try_extract_array::()? 19 | .into_dimensionality::()? 
20 | .to_owned();
21 | Ok(output)
22 | }
23 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/error.rs: --------------------------------------------------------------------------------
1 | use thiserror::Error;
2 |
3 | #[derive(Error, Debug)]
4 | pub enum Error {
5 | #[error("Tokenizer error: {0}")]
6 | TokenizerError(#[from] tokenizers::Error),
7 | #[error("JPreprocess error: {0}")]
8 | JPreprocessError(#[from] jpreprocess::error::JPreprocessError),
9 | #[error("Lindera error: {0}")]
10 | LinderaError(String),
11 | #[cfg(feature = "std")]
12 | #[error("ONNX error: {0}")]
13 | OrtError(#[from] ort::Error),
14 | #[error("NDArray error: {0}")]
15 | NdArrayError(#[from] ndarray::ShapeError),
16 | #[error("Value error: {0}")]
17 | ValueError(String),
18 | #[error("Serde_json error: {0}")]
19 | SerdeJsonError(#[from] serde_json::Error),
20 | #[error("IO error: {0}")]
21 | IoError(#[from] std::io::Error),
22 | #[error("hound error: {0}")]
23 | HoundError(#[from] hound::Error),
24 | #[error("model not found error: {0}")] /* fix: interpolate the model identifier so it is not silently dropped from the message */
25 | ModelNotFoundError(String),
26 | #[cfg(feature = "base64")]
27 | #[error("base64 error: {0}")] /* fix: surface the wrapped DecodeError detail, consistent with sibling variants */
28 | Base64Error(#[from] base64::DecodeError),
29 | #[error("other: {0}")] /* fix: include the carried description instead of the bare word "other" */
30 | OtherError(String),
31 | #[error("Style error: {0}")]
32 | StyleError(String),
33 | }
34 |
35 | pub type Result = std::result::Result;
36 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/jtalk.rs: --------------------------------------------------------------------------------
1 | use crate::error::{Error, Result};
2 | use crate::mora::{CONSONANTS, MORA_KATA_TO_MORA_PHONEMES, MORA_PHONEMES_TO_MORA_KATA, VOWELS};
3 | use crate::norm::{replace_punctuation, PUNCTUATIONS};
4 | use jpreprocess::{kind, DefaultTokenizer, JPreprocess, SystemDictionaryConfig, UserDictionary};
5 | use once_cell::sync::Lazy;
6 | use regex::Regex;
7 | use std::cmp::Reverse;
8 | use std::collections::HashSet;
9 | 
use std::sync::Arc; 10 | 11 | type JPreprocessType = JPreprocess; 12 | 13 | #[cfg(feature = "agpl_dict")] 14 | fn agpl_dict() -> Result> { 15 | Ok(Some( 16 | UserDictionary::load(include_bytes!(concat!(env!("OUT_DIR"), "/all.bin"))) 17 | .map_err(|e| Error::LinderaError(e.to_string()))?, 18 | )) 19 | } 20 | 21 | #[cfg(not(feature = "agpl_dict"))] 22 | fn agpl_dict() -> Result> { 23 | Ok(None) 24 | } 25 | 26 | fn initialize_jtalk() -> Result { 27 | let sdic = 28 | SystemDictionaryConfig::Bundled(kind::JPreprocessDictionaryKind::NaistJdic).load()?; 29 | let jpreprocess = JPreprocess::with_dictionaries(sdic, agpl_dict()?); 30 | Ok(jpreprocess) 31 | } 32 | 33 | macro_rules! hash_set { 34 | ($($elem:expr),* $(,)?) => {{ 35 | let mut set = HashSet::new(); 36 | $( 37 | set.insert($elem); 38 | )* 39 | set 40 | }}; 41 | } 42 | 43 | pub struct JTalk { 44 | pub jpreprocess: Arc, 45 | } 46 | 47 | impl JTalk { 48 | pub fn new() -> Result { 49 | let jpreprocess = Arc::new(initialize_jtalk()?); 50 | Ok(Self { jpreprocess }) 51 | } 52 | 53 | pub fn num2word(&self, text: &str) -> Result { 54 | let mut parsed = self.jpreprocess.text_to_njd(text)?; 55 | parsed.preprocess(); 56 | let texts: Vec = parsed 57 | .nodes 58 | .iter() 59 | .map(|x| x.get_string().to_string()) 60 | .collect(); 61 | Ok(texts.join("")) 62 | } 63 | 64 | pub fn process_text(&self, text: &str) -> Result { 65 | let parsed = self.jpreprocess.run_frontend(text)?; 66 | let jtalk_process = JTalkProcess::new(Arc::clone(&self.jpreprocess), parsed); 67 | Ok(jtalk_process) 68 | } 69 | } 70 | 71 | static KATAKANA_PATTERN: Lazy = Lazy::new(|| Regex::new(r"[\u30A0-\u30FF]+").unwrap()); 72 | static MORA_PATTERN: Lazy> = Lazy::new(|| { 73 | let mut sorted_keys: Vec = MORA_KATA_TO_MORA_PHONEMES.keys().cloned().collect(); 74 | sorted_keys.sort_by_key(|b| Reverse(b.len())); 75 | sorted_keys 76 | }); 77 | static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap()); 78 | 79 | fn phone_tone_to_kana(phones: Vec, tones: 
Vec) -> Vec<(String, i32)> { 80 | let phones = &phones[1..]; 81 | let tones = &tones[1..]; 82 | let mut results = Vec::new(); 83 | let mut current_mora = String::new(); 84 | for ((phone, _next_phone), (&tone, &next_tone)) in phones 85 | .iter() 86 | .zip(phones.iter().skip(1)) 87 | .zip(tones.iter().zip(tones.iter().skip(1))) 88 | { 89 | if PUNCTUATIONS.contains(&phone.clone().as_str()) { 90 | results.push((phone.to_string(), tone)); 91 | continue; 92 | } 93 | if CONSONANTS.contains(&phone.clone()) { 94 | assert_eq!(current_mora, ""); 95 | assert_eq!(tone, next_tone); 96 | current_mora = phone.to_string() 97 | } else { 98 | current_mora += phone; 99 | let kana = MORA_PHONEMES_TO_MORA_KATA.get(¤t_mora).unwrap(); 100 | results.push((kana.to_string(), tone)); 101 | current_mora = String::new(); 102 | } 103 | } 104 | results 105 | } 106 | 107 | pub struct JTalkProcess { 108 | jpreprocess: Arc, 109 | parsed: Vec, 110 | } 111 | 112 | impl JTalkProcess { 113 | fn new(jpreprocess: Arc, parsed: Vec) -> Self { 114 | Self { 115 | jpreprocess, 116 | parsed, 117 | } 118 | } 119 | 120 | fn fix_phone_tone(&self, phone_tone_list: Vec<(String, i32)>) -> Result> { 121 | let tone_values: HashSet = phone_tone_list 122 | .iter() 123 | .map(|(_letter, tone)| *tone) 124 | .collect(); 125 | if tone_values.len() == 1 { 126 | assert!(tone_values == hash_set![0], "{:?}", tone_values); 127 | Ok(phone_tone_list) 128 | } else if tone_values.len() == 2 { 129 | if tone_values == hash_set![0, 1] { 130 | return Ok(phone_tone_list); 131 | } else if tone_values == hash_set![-1, 0] { 132 | return Ok(phone_tone_list 133 | .iter() 134 | .map(|x| { 135 | let new_tone = if x.1 == -1 { 0 } else { 1 }; 136 | (x.0.clone(), new_tone) 137 | }) 138 | .collect()); 139 | } else { 140 | return Err(Error::ValueError("Invalid tone values 0".to_string())); 141 | } 142 | } else { 143 | return Err(Error::ValueError("Invalid tone values 1".to_string())); 144 | } 145 | } 146 | 147 | pub fn g2p(&self) -> Result<(Vec, Vec, 
Vec)> { 148 | let phone_tone_list_wo_punct = self.g2phone_tone_wo_punct()?; 149 | let (seq_text, seq_kata) = self.text_to_seq_kata()?; 150 | let sep_phonemes = JTalkProcess::handle_long( 151 | seq_kata 152 | .iter() 153 | .map(|x| JTalkProcess::kata_to_phoneme_list(x.clone()).unwrap()) 154 | .collect(), 155 | ); 156 | let phone_w_punct: Vec = sep_phonemes 157 | .iter() 158 | .flat_map(|x| x.iter()) 159 | .cloned() 160 | .collect(); 161 | 162 | let mut phone_tone_list = 163 | JTalkProcess::align_tones(phone_w_punct, phone_tone_list_wo_punct)?; 164 | 165 | let mut sep_tokenized: Vec> = Vec::new(); 166 | for seq_text_item in &seq_text { 167 | let text = seq_text_item.clone(); 168 | if !PUNCTUATIONS.contains(&text.as_str()) { 169 | sep_tokenized.push(text.chars().map(|x| x.to_string()).collect()); 170 | } else { 171 | sep_tokenized.push(vec![text]); 172 | } 173 | } 174 | 175 | let mut word2ph = Vec::new(); 176 | for (token, phoneme) in sep_tokenized.iter().zip(sep_phonemes.iter()) { 177 | let phone_len = phoneme.len() as i32; 178 | let word_len = token.len() as i32; 179 | word2ph.append(&mut JTalkProcess::distribute_phone(phone_len, word_len)); 180 | } 181 | 182 | let mut new_phone_tone_list = vec![("_".to_string(), 0)]; 183 | new_phone_tone_list.append(&mut phone_tone_list); 184 | new_phone_tone_list.push(("_".to_string(), 0)); 185 | 186 | let mut new_word2ph = vec![1]; 187 | new_word2ph.extend(word2ph.clone()); 188 | new_word2ph.push(1); 189 | 190 | let phones: Vec = new_phone_tone_list.iter().map(|(x, _)| x.clone()).collect(); 191 | let tones: Vec = new_phone_tone_list.iter().map(|(_, x)| *x).collect(); 192 | 193 | Ok((phones, tones, new_word2ph)) 194 | } 195 | 196 | pub fn g2kana_tone(&self) -> Result> { 197 | let (phones, tones, _) = self.g2p()?; 198 | Ok(phone_tone_to_kana(phones, tones)) 199 | } 200 | 201 | fn distribute_phone(n_phone: i32, n_word: i32) -> Vec { 202 | let mut phones_per_word = vec![0; n_word as usize]; 203 | for _ in 0..n_phone { 204 | let 
min_task = phones_per_word.iter().min().unwrap(); 205 | let min_index = phones_per_word 206 | .iter() 207 | .position(|&x| x == *min_task) 208 | .unwrap(); 209 | phones_per_word[min_index] += 1; 210 | } 211 | phones_per_word 212 | } 213 | 214 | fn align_tones( 215 | phone_with_punct: Vec, 216 | phone_tone_list: Vec<(String, i32)>, 217 | ) -> Result> { 218 | let mut result: Vec<(String, i32)> = Vec::new(); 219 | let mut tone_index = 0; 220 | for phone in phone_with_punct.clone() { 221 | if tone_index >= phone_tone_list.len() { 222 | result.push((phone, 0)); 223 | } else if phone == phone_tone_list[tone_index].0 { 224 | result.push((phone, phone_tone_list[tone_index].1)); 225 | tone_index += 1; 226 | } else if PUNCTUATIONS.contains(&phone.as_str()) { 227 | result.push((phone, 0)); 228 | } else { 229 | println!("phones {:?}", phone_with_punct); 230 | println!("phone_tone_list: {:?}", phone_tone_list); 231 | println!("result: {:?}", result); 232 | println!("tone_index: {:?}", tone_index); 233 | println!("phone: {:?}", phone); 234 | return Err(Error::ValueError(format!("Mismatched phoneme: {}", phone))); 235 | } 236 | } 237 | 238 | Ok(result) 239 | } 240 | 241 | fn handle_long(mut sep_phonemes: Vec>) -> Vec> { 242 | for i in 0..sep_phonemes.len() { 243 | if sep_phonemes[i].is_empty() { 244 | continue; 245 | } 246 | if sep_phonemes[i][0] == "ー" { 247 | if i != 0 { 248 | let prev_phoneme = sep_phonemes[i - 1].last().unwrap(); 249 | if VOWELS.contains(&prev_phoneme.as_str()) { 250 | sep_phonemes[i][0] = prev_phoneme.clone(); 251 | } else { 252 | sep_phonemes[i][0] = "ー".to_string(); 253 | } 254 | } else { 255 | sep_phonemes[i][0] = "ー".to_string(); 256 | } 257 | } 258 | if sep_phonemes[i].contains(&"ー".to_string()) { 259 | for e in 0..sep_phonemes[i].len() { 260 | if sep_phonemes[i][e] == "ー" { 261 | sep_phonemes[i][e] = 262 | sep_phonemes[i][e - 1].chars().last().unwrap().to_string(); 263 | } 264 | } 265 | } 266 | } 267 | sep_phonemes 268 | } 269 | 270 | fn 
kata_to_phoneme_list(mut text: String) -> Result> { 271 | let chars: HashSet = text.chars().map(|x| x.to_string()).collect(); 272 | if chars.is_subset(&HashSet::from_iter( 273 | PUNCTUATIONS.iter().map(|x| x.to_string()), 274 | )) { 275 | return Ok(text.chars().map(|x| x.to_string()).collect()); 276 | } 277 | if !KATAKANA_PATTERN.is_match(&text) { 278 | return Err(Error::ValueError(format!( 279 | "Input must be katakana only: {}", 280 | text 281 | ))); 282 | } 283 | 284 | for mora in MORA_PATTERN.iter() { 285 | let mora = mora.to_string(); 286 | let (consonant, vowel) = MORA_KATA_TO_MORA_PHONEMES.get(&mora).unwrap(); 287 | if consonant.is_none() { 288 | text = text.replace(&mora, &format!(" {}", vowel)); 289 | } else { 290 | text = text.replace( 291 | &mora, 292 | &format!(" {} {}", consonant.as_ref().unwrap(), vowel), 293 | ); 294 | } 295 | } 296 | 297 | let long_replacement = |m: ®ex::Captures| { 298 | let result = m.get(1).unwrap().as_str().to_string(); 299 | let mut second = String::new(); 300 | for _ in 0..m.get(2).unwrap().as_str().char_indices().count() { 301 | second += &format!(" {}", m.get(1).unwrap().as_str()); 302 | } 303 | result + &second 304 | }; 305 | text = LONG_PATTERN 306 | .replace_all(&text, long_replacement) 307 | .to_string(); 308 | 309 | let data = text.trim().split(' ').map(|x| x.to_string()).collect(); 310 | 311 | Ok(data) 312 | } 313 | 314 | pub fn text_to_seq_kata(&self) -> Result<(Vec, Vec)> { 315 | let mut seq_kata = vec![]; 316 | let mut seq_text = vec![]; 317 | 318 | for parts in &self.parsed { 319 | let (string, pron) = self.parse_to_string_and_pron(parts.clone()); 320 | let mut yomi = pron.replace('’', ""); 321 | let word = replace_punctuation(string); 322 | assert!(!yomi.is_empty(), "Empty yomi: {}", word); 323 | if yomi == "、" { 324 | if !word 325 | .chars() 326 | .all(|x| PUNCTUATIONS.contains(&x.to_string().as_str())) 327 | { 328 | yomi = "'".repeat(word.len()); 329 | } else { 330 | yomi = word.clone(); 331 | } 332 | } else if 
yomi == "?" { 333 | assert!(word == "?", "yomi `?` comes from: {}", word); 334 | yomi = "?".to_string(); 335 | } 336 | seq_text.push(word); 337 | seq_kata.push(yomi); 338 | } 339 | Ok((seq_text, seq_kata)) 340 | } 341 | 342 | fn parse_to_string_and_pron(&self, parts: String) -> (String, String) { 343 | let part_lists: Vec = parts.split(',').map(|x| x.to_string()).collect(); 344 | (part_lists[0].clone(), part_lists[9].clone()) 345 | } 346 | 347 | fn g2phone_tone_wo_punct(&self) -> Result> { 348 | let prosodies = self.g2p_prosody()?; 349 | 350 | let mut results: Vec<(String, i32)> = Vec::new(); 351 | let mut current_phrase: Vec<(String, i32)> = Vec::new(); 352 | let mut current_tone = 0; 353 | 354 | for (i, letter) in prosodies.iter().enumerate() { 355 | if letter == "^" { 356 | assert!(i == 0); 357 | } else if ["$", "?", "_", "#"].contains(&letter.as_str()) { 358 | results.extend(self.fix_phone_tone(current_phrase.clone())?); 359 | if ["$", "?"].contains(&letter.as_str()) { 360 | assert!(i == prosodies.len() - 1); 361 | } 362 | current_phrase = Vec::new(); 363 | current_tone = 0; 364 | } else if letter == "[" { 365 | current_tone += 1; 366 | } else if letter == "]" { 367 | current_tone -= 1; 368 | } else { 369 | let new_letter = if letter == "cl" { 370 | "q".to_string() 371 | } else { 372 | letter.clone() 373 | }; 374 | current_phrase.push((new_letter, current_tone)); 375 | } 376 | } 377 | 378 | Ok(results) 379 | } 380 | 381 | fn g2p_prosody(&self) -> Result> { 382 | let labels = self.jpreprocess.make_label(self.parsed.clone()); 383 | 384 | let mut phones: Vec = Vec::new(); 385 | for (i, label) in labels.iter().enumerate() { 386 | let mut p3 = label.phoneme.c.clone().unwrap(); 387 | if "AIUEO".contains(&p3) { 388 | // 文字をlowerする 389 | p3 = p3.to_lowercase(); 390 | } 391 | if p3 == "sil" { 392 | assert!(i == 0 || i == labels.len() - 1); 393 | if i == 0 { 394 | phones.push("^".to_string()); 395 | } else if i == labels.len() - 1 { 396 | let e3 = 
label.accent_phrase_prev.clone().unwrap().is_interrogative;
397 | if e3 { /* fix: reference g2p_prosody emits "?" for an interrogative ending and "$" for declarative; these two pushes were swapped */
398 | phones.push("?".to_string());
399 | } else {
400 | phones.push("$".to_string());
401 | }
402 | }
403 | continue;
404 | } else if p3 == "pau" {
405 | phones.push("_".to_string());
406 | continue;
407 | } else {
408 | phones.push(p3.clone());
409 | }
410 |
411 | let a1 = if let Some(mora) = &label.mora {
412 | mora.relative_accent_position as i32
413 | } else {
414 | -50
415 | };
416 | let a2 = if let Some(mora) = &label.mora {
417 | mora.position_forward as i32
418 | } else {
419 | -50
420 | };
421 | let a3 = if let Some(mora) = &label.mora {
422 | mora.position_backward as i32
423 | } else {
424 | -50
425 | };
426 |
427 | let f1 = if let Some(accent_phrase) = &label.accent_phrase_curr {
428 | accent_phrase.mora_count as i32
429 | } else {
430 | -50
431 | };
432 |
433 | let a2_next = if let Some(mora) = &labels[i + 1].mora {
434 | mora.position_forward as i32
435 | } else {
436 | -50
437 | };
438 |
439 | if a3 == 1 && a2_next == 1 && "aeiouAEIOUNcl".contains(&p3) {
440 | phones.push("#".to_string());
441 | } else if a1 == 0 && a2_next == a2 + 1 && a2 != f1 {
442 | phones.push("]".to_string());
443 | } else if a2 == 1 && a2_next == 2 {
444 | phones.push("[".to_string());
445 | }
446 | }
447 |
448 | Ok(phones)
449 | }
450 | }
451 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/lib.rs: --------------------------------------------------------------------------------
1 | #[cfg(feature = "std")]
2 | pub mod bert;
3 | pub mod error;
4 | pub mod jtalk;
5 | #[cfg(feature = "std")]
6 | pub mod model;
7 | pub mod mora;
8 | pub mod nlp;
9 | pub mod norm;
10 | pub mod sbv2file;
11 | pub mod style;
12 | pub mod tokenizer;
13 | #[cfg(feature = "std")]
14 | pub mod tts;
15 | pub mod tts_util;
16 | pub mod utils;
17 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/main.rs: 
-------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::fs; 3 | 4 | #[cfg(feature = "std")] 5 | fn main_inner() -> anyhow::Result<()> { 6 | use sbv2_core::tts; 7 | dotenvy::dotenv_override().ok(); 8 | env_logger::init(); 9 | let text = "今日の天気は快晴です。"; 10 | let ident = "aaa"; 11 | let mut tts_holder = tts::TTSModelHolder::new( 12 | &fs::read(env::var("BERT_MODEL_PATH")?)?, 13 | &fs::read(env::var("TOKENIZER_PATH")?)?, 14 | env::var("HOLDER_MAX_LOADED_MODElS") 15 | .ok() 16 | .and_then(|x| x.parse().ok()), 17 | )?; 18 | let mp = env::var("MODEL_PATH")?; 19 | let b = fs::read(&mp)?; 20 | #[cfg(not(feature = "aivmx"))] 21 | { 22 | tts_holder.load_sbv2file(ident, b)?; 23 | } 24 | #[cfg(feature = "aivmx")] 25 | { 26 | if mp.ends_with(".sbv2") { 27 | tts_holder.load_sbv2file(ident, b)?; 28 | } else { 29 | tts_holder.load_aivmx(ident, b)?; 30 | } 31 | } 32 | 33 | let audio = tts_holder.easy_synthesize(ident, text, 0, 0, tts::SynthesizeOptions::default())?; 34 | fs::write("output.wav", audio)?; 35 | 36 | Ok(()) 37 | } 38 | 39 | #[cfg(not(feature = "std"))] 40 | fn main_inner() -> anyhow::Result<()> { 41 | Ok(()) 42 | } 43 | 44 | fn main() { 45 | if let Err(e) = main_inner() { 46 | println!("Error: {e}"); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/model.rs: -------------------------------------------------------------------------------- 1 | use crate::error::Result; 2 | use ndarray::{array, Array1, Array2, Array3, Axis, Ix3}; 3 | use ort::session::{builder::GraphOptimizationLevel, Session}; 4 | 5 | #[allow(clippy::vec_init_then_push, unused_variables)] 6 | pub fn load_model>(model_file: P, bert: bool) -> Result { 7 | let mut exp = Vec::new(); 8 | #[cfg(feature = "tensorrt")] 9 | { 10 | if bert { 11 | exp.push( 12 | ort::execution_providers::TensorRTExecutionProvider::default() 13 | .with_fp16(true) 14 | 
.with_profile_min_shapes("input_ids:1x1,attention_mask:1x1") 15 | .with_profile_max_shapes("input_ids:1x100,attention_mask:1x100") 16 | .with_profile_opt_shapes("input_ids:1x25,attention_mask:1x25") 17 | .build(), 18 | ); 19 | } 20 | } 21 | #[cfg(feature = "cuda")] 22 | { 23 | #[allow(unused_mut)] 24 | let mut cuda = ort::execution_providers::CUDAExecutionProvider::default(); 25 | #[cfg(feature = "cuda_tf32")] 26 | { 27 | cuda = cuda.with_tf32(true); 28 | } 29 | exp.push(cuda.build()); 30 | } 31 | #[cfg(feature = "directml")] 32 | { 33 | exp.push(ort::execution_providers::DirectMLExecutionProvider::default().build()); 34 | } 35 | #[cfg(feature = "coreml")] 36 | { 37 | exp.push(ort::execution_providers::CoreMLExecutionProvider::default().build()); 38 | } 39 | exp.push(ort::execution_providers::CPUExecutionProvider::default().build()); 40 | Ok(Session::builder()? 41 | .with_execution_providers(exp)? 42 | .with_optimization_level(GraphOptimizationLevel::Level3)? 43 | .with_intra_threads(num_cpus::get_physical())? 44 | .with_parallel_execution(true)? 45 | .with_inter_threads(num_cpus::get_physical())? 46 | .commit_from_memory(model_file.as_ref())?) 
47 | } 48 | 49 | #[allow(clippy::too_many_arguments)] 50 | pub fn synthesize( 51 | session: &mut Session, 52 | bert_ori: Array2, 53 | x_tst: Array1, 54 | mut spk_ids: Array1, 55 | tones: Array1, 56 | lang_ids: Array1, 57 | style_vector: Array1, 58 | sdp_ratio: f32, 59 | length_scale: f32, 60 | noise_scale: f32, 61 | noise_scale_w: f32, 62 | ) -> Result> { 63 | let bert_ori = bert_ori.insert_axis(Axis(0)); 64 | let bert_ori = bert_ori.as_standard_layout(); 65 | let bert = ort::value::TensorRef::from_array_view(&bert_ori)?; 66 | let mut x_tst_lengths = array![x_tst.shape()[0] as i64]; 67 | let x_tst_lengths = ort::value::TensorRef::from_array_view(&mut x_tst_lengths)?; 68 | let mut x_tst = x_tst.insert_axis(Axis(0)); 69 | let x_tst = ort::value::TensorRef::from_array_view(&mut x_tst)?; 70 | let mut lang_ids = lang_ids.insert_axis(Axis(0)); 71 | let lang_ids = ort::value::TensorRef::from_array_view(&mut lang_ids)?; 72 | let mut tones = tones.insert_axis(Axis(0)); 73 | let tones = ort::value::TensorRef::from_array_view(&mut tones)?; 74 | let mut style_vector = style_vector.insert_axis(Axis(0)); 75 | let style_vector = ort::value::TensorRef::from_array_view(&mut style_vector)?; 76 | let sid = ort::value::TensorRef::from_array_view(&mut spk_ids)?; 77 | let sdp_ratio = vec![sdp_ratio]; 78 | let sdp_ratio = ort::value::TensorRef::from_array_view((vec![1_i64], sdp_ratio.as_slice()))?; 79 | let length_scale = vec![length_scale]; 80 | let length_scale = 81 | ort::value::TensorRef::from_array_view((vec![1_i64], length_scale.as_slice()))?; 82 | let noise_scale = vec![noise_scale]; 83 | let noise_scale = 84 | ort::value::TensorRef::from_array_view((vec![1_i64], noise_scale.as_slice()))?; 85 | let noise_scale_w = vec![noise_scale_w]; 86 | let noise_scale_w = 87 | ort::value::TensorRef::from_array_view((vec![1_i64], noise_scale_w.as_slice()))?; 88 | let outputs = session.run(ort::inputs! 
{ 89 | "x_tst" => x_tst, 90 | "x_tst_lengths" => x_tst_lengths, 91 | "sid" => sid, 92 | "tones" => tones, 93 | "language" => lang_ids, 94 | "bert" => bert, 95 | "style_vec" => style_vector, 96 | "sdp_ratio" => sdp_ratio, 97 | "length_scale" => length_scale, 98 | "noise_scale" => noise_scale, 99 | "noise_scale_w" => noise_scale_w, 100 | })?; 101 | let audio_array = outputs["output"] 102 | .try_extract_array::()? 103 | .into_dimensionality::()? 104 | .to_owned(); 105 | Ok(audio_array) 106 | } 107 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/mora.rs: -------------------------------------------------------------------------------- 1 | use once_cell::sync::Lazy; 2 | use serde::{Deserialize, Serialize}; 3 | use std::collections::HashMap; 4 | 5 | #[derive(Debug, Serialize, Deserialize)] 6 | pub struct Mora { 7 | pub mora: String, 8 | pub consonant: Option, 9 | pub vowel: String, 10 | } 11 | 12 | #[derive(Debug, Serialize, Deserialize)] 13 | pub struct MoraFile { 14 | pub minimum: Vec, 15 | pub additional: Vec, 16 | } 17 | 18 | static MORA_LIST_MINIMUM: Lazy> = Lazy::new(|| { 19 | let data: MoraFile = serde_json::from_str(include_str!("./mora_list.json")).unwrap(); 20 | data.minimum 21 | }); 22 | 23 | static MORA_LIST_ADDITIONAL: Lazy> = Lazy::new(|| { 24 | let data: MoraFile = serde_json::from_str(include_str!("./mora_list.json")).unwrap(); 25 | data.additional 26 | }); 27 | 28 | pub static MORA_PHONEMES_TO_MORA_KATA: Lazy> = Lazy::new(|| { 29 | let mut map = HashMap::new(); 30 | for mora in MORA_LIST_MINIMUM.iter() { 31 | map.insert( 32 | format!( 33 | "{}{}", 34 | mora.consonant.clone().unwrap_or("".to_string()), 35 | mora.vowel 36 | ), 37 | mora.mora.clone(), 38 | ); 39 | } 40 | map 41 | }); 42 | 43 | pub static MORA_KATA_TO_MORA_PHONEMES: Lazy, String)>> = 44 | Lazy::new(|| { 45 | let mut map = HashMap::new(); 46 | for mora in MORA_LIST_MINIMUM.iter().chain(MORA_LIST_ADDITIONAL.iter()) { 47 | map.insert( 48 | 
mora.mora.clone(), 49 | (mora.consonant.clone(), mora.vowel.clone()), 50 | ); 51 | } 52 | map 53 | }); 54 | 55 | pub static CONSONANTS: Lazy> = Lazy::new(|| { 56 | let consonants = MORA_KATA_TO_MORA_PHONEMES 57 | .values() 58 | .filter_map(|(consonant, _)| consonant.clone()) 59 | .collect::>(); 60 | consonants 61 | }); 62 | 63 | pub const VOWELS: [&str; 6] = ["a", "i", "u", "e", "o", "N"]; 64 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/mora_list.json: -------------------------------------------------------------------------------- 1 | { 2 | "minimum": [ 3 | { 4 | "mora": "ヴォ", 5 | "consonant": "v", 6 | "vowel": "o" 7 | }, 8 | { 9 | "mora": "ヴェ", 10 | "consonant": "v", 11 | "vowel": "e" 12 | }, 13 | { 14 | "mora": "ヴィ", 15 | "consonant": "v", 16 | "vowel": "i" 17 | }, 18 | { 19 | "mora": "ヴァ", 20 | "consonant": "v", 21 | "vowel": "a" 22 | }, 23 | { 24 | "mora": "ヴ", 25 | "consonant": "v", 26 | "vowel": "u" 27 | }, 28 | { 29 | "mora": "ン", 30 | "consonant": null, 31 | "vowel": "N" 32 | }, 33 | { 34 | "mora": "ワ", 35 | "consonant": "w", 36 | "vowel": "a" 37 | }, 38 | { 39 | "mora": "ロ", 40 | "consonant": "r", 41 | "vowel": "o" 42 | }, 43 | { 44 | "mora": "レ", 45 | "consonant": "r", 46 | "vowel": "e" 47 | }, 48 | { 49 | "mora": "ル", 50 | "consonant": "r", 51 | "vowel": "u" 52 | }, 53 | { 54 | "mora": "リョ", 55 | "consonant": "ry", 56 | "vowel": "o" 57 | }, 58 | { 59 | "mora": "リュ", 60 | "consonant": "ry", 61 | "vowel": "u" 62 | }, 63 | { 64 | "mora": "リャ", 65 | "consonant": "ry", 66 | "vowel": "a" 67 | }, 68 | { 69 | "mora": "リェ", 70 | "consonant": "ry", 71 | "vowel": "e" 72 | }, 73 | { 74 | "mora": "リ", 75 | "consonant": "r", 76 | "vowel": "i" 77 | }, 78 | { 79 | "mora": "ラ", 80 | "consonant": "r", 81 | "vowel": "a" 82 | }, 83 | { 84 | "mora": "ヨ", 85 | "consonant": "y", 86 | "vowel": "o" 87 | }, 88 | { 89 | "mora": "ユ", 90 | "consonant": "y", 91 | "vowel": "u" 92 | }, 93 | { 94 | "mora": "ヤ", 95 | 
"consonant": "y", 96 | "vowel": "a" 97 | }, 98 | { 99 | "mora": "モ", 100 | "consonant": "m", 101 | "vowel": "o" 102 | }, 103 | { 104 | "mora": "メ", 105 | "consonant": "m", 106 | "vowel": "e" 107 | }, 108 | { 109 | "mora": "ム", 110 | "consonant": "m", 111 | "vowel": "u" 112 | }, 113 | { 114 | "mora": "ミョ", 115 | "consonant": "my", 116 | "vowel": "o" 117 | }, 118 | { 119 | "mora": "ミュ", 120 | "consonant": "my", 121 | "vowel": "u" 122 | }, 123 | { 124 | "mora": "ミャ", 125 | "consonant": "my", 126 | "vowel": "a" 127 | }, 128 | { 129 | "mora": "ミェ", 130 | "consonant": "my", 131 | "vowel": "e" 132 | }, 133 | { 134 | "mora": "ミ", 135 | "consonant": "m", 136 | "vowel": "i" 137 | }, 138 | { 139 | "mora": "マ", 140 | "consonant": "m", 141 | "vowel": "a" 142 | }, 143 | { 144 | "mora": "ポ", 145 | "consonant": "p", 146 | "vowel": "o" 147 | }, 148 | { 149 | "mora": "ボ", 150 | "consonant": "b", 151 | "vowel": "o" 152 | }, 153 | { 154 | "mora": "ホ", 155 | "consonant": "h", 156 | "vowel": "o" 157 | }, 158 | { 159 | "mora": "ペ", 160 | "consonant": "p", 161 | "vowel": "e" 162 | }, 163 | { 164 | "mora": "ベ", 165 | "consonant": "b", 166 | "vowel": "e" 167 | }, 168 | { 169 | "mora": "ヘ", 170 | "consonant": "h", 171 | "vowel": "e" 172 | }, 173 | { 174 | "mora": "プ", 175 | "consonant": "p", 176 | "vowel": "u" 177 | }, 178 | { 179 | "mora": "ブ", 180 | "consonant": "b", 181 | "vowel": "u" 182 | }, 183 | { 184 | "mora": "フォ", 185 | "consonant": "f", 186 | "vowel": "o" 187 | }, 188 | { 189 | "mora": "フェ", 190 | "consonant": "f", 191 | "vowel": "e" 192 | }, 193 | { 194 | "mora": "フィ", 195 | "consonant": "f", 196 | "vowel": "i" 197 | }, 198 | { 199 | "mora": "ファ", 200 | "consonant": "f", 201 | "vowel": "a" 202 | }, 203 | { 204 | "mora": "フ", 205 | "consonant": "f", 206 | "vowel": "u" 207 | }, 208 | { 209 | "mora": "ピョ", 210 | "consonant": "py", 211 | "vowel": "o" 212 | }, 213 | { 214 | "mora": "ピュ", 215 | "consonant": "py", 216 | "vowel": "u" 217 | }, 218 | { 219 | "mora": "ピャ", 220 | 
"consonant": "py", 221 | "vowel": "a" 222 | }, 223 | { 224 | "mora": "ピェ", 225 | "consonant": "py", 226 | "vowel": "e" 227 | }, 228 | { 229 | "mora": "ピ", 230 | "consonant": "p", 231 | "vowel": "i" 232 | }, 233 | { 234 | "mora": "ビョ", 235 | "consonant": "by", 236 | "vowel": "o" 237 | }, 238 | { 239 | "mora": "ビュ", 240 | "consonant": "by", 241 | "vowel": "u" 242 | }, 243 | { 244 | "mora": "ビャ", 245 | "consonant": "by", 246 | "vowel": "a" 247 | }, 248 | { 249 | "mora": "ビェ", 250 | "consonant": "by", 251 | "vowel": "e" 252 | }, 253 | { 254 | "mora": "ビ", 255 | "consonant": "b", 256 | "vowel": "i" 257 | }, 258 | { 259 | "mora": "ヒョ", 260 | "consonant": "hy", 261 | "vowel": "o" 262 | }, 263 | { 264 | "mora": "ヒュ", 265 | "consonant": "hy", 266 | "vowel": "u" 267 | }, 268 | { 269 | "mora": "ヒャ", 270 | "consonant": "hy", 271 | "vowel": "a" 272 | }, 273 | { 274 | "mora": "ヒェ", 275 | "consonant": "hy", 276 | "vowel": "e" 277 | }, 278 | { 279 | "mora": "ヒ", 280 | "consonant": "h", 281 | "vowel": "i" 282 | }, 283 | { 284 | "mora": "パ", 285 | "consonant": "p", 286 | "vowel": "a" 287 | }, 288 | { 289 | "mora": "バ", 290 | "consonant": "b", 291 | "vowel": "a" 292 | }, 293 | { 294 | "mora": "ハ", 295 | "consonant": "h", 296 | "vowel": "a" 297 | }, 298 | { 299 | "mora": "ノ", 300 | "consonant": "n", 301 | "vowel": "o" 302 | }, 303 | { 304 | "mora": "ネ", 305 | "consonant": "n", 306 | "vowel": "e" 307 | }, 308 | { 309 | "mora": "ヌ", 310 | "consonant": "n", 311 | "vowel": "u" 312 | }, 313 | { 314 | "mora": "ニョ", 315 | "consonant": "ny", 316 | "vowel": "o" 317 | }, 318 | { 319 | "mora": "ニュ", 320 | "consonant": "ny", 321 | "vowel": "u" 322 | }, 323 | { 324 | "mora": "ニャ", 325 | "consonant": "ny", 326 | "vowel": "a" 327 | }, 328 | { 329 | "mora": "ニェ", 330 | "consonant": "ny", 331 | "vowel": "e" 332 | }, 333 | { 334 | "mora": "ニ", 335 | "consonant": "n", 336 | "vowel": "i" 337 | }, 338 | { 339 | "mora": "ナ", 340 | "consonant": "n", 341 | "vowel": "a" 342 | }, 343 | { 344 | "mora": "ドゥ", 
345 | "consonant": "d", 346 | "vowel": "u" 347 | }, 348 | { 349 | "mora": "ド", 350 | "consonant": "d", 351 | "vowel": "o" 352 | }, 353 | { 354 | "mora": "トゥ", 355 | "consonant": "t", 356 | "vowel": "u" 357 | }, 358 | { 359 | "mora": "ト", 360 | "consonant": "t", 361 | "vowel": "o" 362 | }, 363 | { 364 | "mora": "デョ", 365 | "consonant": "dy", 366 | "vowel": "o" 367 | }, 368 | { 369 | "mora": "デュ", 370 | "consonant": "dy", 371 | "vowel": "u" 372 | }, 373 | { 374 | "mora": "デャ", 375 | "consonant": "dy", 376 | "vowel": "a" 377 | }, 378 | { 379 | "mora": "ディ", 380 | "consonant": "d", 381 | "vowel": "i" 382 | }, 383 | { 384 | "mora": "デ", 385 | "consonant": "d", 386 | "vowel": "e" 387 | }, 388 | { 389 | "mora": "テョ", 390 | "consonant": "ty", 391 | "vowel": "o" 392 | }, 393 | { 394 | "mora": "テュ", 395 | "consonant": "ty", 396 | "vowel": "u" 397 | }, 398 | { 399 | "mora": "テャ", 400 | "consonant": "ty", 401 | "vowel": "a" 402 | }, 403 | { 404 | "mora": "ティ", 405 | "consonant": "t", 406 | "vowel": "i" 407 | }, 408 | { 409 | "mora": "テ", 410 | "consonant": "t", 411 | "vowel": "e" 412 | }, 413 | { 414 | "mora": "ツォ", 415 | "consonant": "ts", 416 | "vowel": "o" 417 | }, 418 | { 419 | "mora": "ツェ", 420 | "consonant": "ts", 421 | "vowel": "e" 422 | }, 423 | { 424 | "mora": "ツィ", 425 | "consonant": "ts", 426 | "vowel": "i" 427 | }, 428 | { 429 | "mora": "ツァ", 430 | "consonant": "ts", 431 | "vowel": "a" 432 | }, 433 | { 434 | "mora": "ツ", 435 | "consonant": "ts", 436 | "vowel": "u" 437 | }, 438 | { 439 | "mora": "ッ", 440 | "consonant": null, 441 | "vowel": "q" 442 | }, 443 | { 444 | "mora": "チョ", 445 | "consonant": "ch", 446 | "vowel": "o" 447 | }, 448 | { 449 | "mora": "チュ", 450 | "consonant": "ch", 451 | "vowel": "u" 452 | }, 453 | { 454 | "mora": "チャ", 455 | "consonant": "ch", 456 | "vowel": "a" 457 | }, 458 | { 459 | "mora": "チェ", 460 | "consonant": "ch", 461 | "vowel": "e" 462 | }, 463 | { 464 | "mora": "チ", 465 | "consonant": "ch", 466 | "vowel": "i" 467 | }, 468 | { 469 | 
"mora": "ダ", 470 | "consonant": "d", 471 | "vowel": "a" 472 | }, 473 | { 474 | "mora": "タ", 475 | "consonant": "t", 476 | "vowel": "a" 477 | }, 478 | { 479 | "mora": "ゾ", 480 | "consonant": "z", 481 | "vowel": "o" 482 | }, 483 | { 484 | "mora": "ソ", 485 | "consonant": "s", 486 | "vowel": "o" 487 | }, 488 | { 489 | "mora": "ゼ", 490 | "consonant": "z", 491 | "vowel": "e" 492 | }, 493 | { 494 | "mora": "セ", 495 | "consonant": "s", 496 | "vowel": "e" 497 | }, 498 | { 499 | "mora": "ズィ", 500 | "consonant": "z", 501 | "vowel": "i" 502 | }, 503 | { 504 | "mora": "ズ", 505 | "consonant": "z", 506 | "vowel": "u" 507 | }, 508 | { 509 | "mora": "スィ", 510 | "consonant": "s", 511 | "vowel": "i" 512 | }, 513 | { 514 | "mora": "ス", 515 | "consonant": "s", 516 | "vowel": "u" 517 | }, 518 | { 519 | "mora": "ジョ", 520 | "consonant": "j", 521 | "vowel": "o" 522 | }, 523 | { 524 | "mora": "ジュ", 525 | "consonant": "j", 526 | "vowel": "u" 527 | }, 528 | { 529 | "mora": "ジャ", 530 | "consonant": "j", 531 | "vowel": "a" 532 | }, 533 | { 534 | "mora": "ジェ", 535 | "consonant": "j", 536 | "vowel": "e" 537 | }, 538 | { 539 | "mora": "ジ", 540 | "consonant": "j", 541 | "vowel": "i" 542 | }, 543 | { 544 | "mora": "ショ", 545 | "consonant": "sh", 546 | "vowel": "o" 547 | }, 548 | { 549 | "mora": "シュ", 550 | "consonant": "sh", 551 | "vowel": "u" 552 | }, 553 | { 554 | "mora": "シャ", 555 | "consonant": "sh", 556 | "vowel": "a" 557 | }, 558 | { 559 | "mora": "シェ", 560 | "consonant": "sh", 561 | "vowel": "e" 562 | }, 563 | { 564 | "mora": "シ", 565 | "consonant": "sh", 566 | "vowel": "i" 567 | }, 568 | { 569 | "mora": "ザ", 570 | "consonant": "z", 571 | "vowel": "a" 572 | }, 573 | { 574 | "mora": "サ", 575 | "consonant": "s", 576 | "vowel": "a" 577 | }, 578 | { 579 | "mora": "ゴ", 580 | "consonant": "g", 581 | "vowel": "o" 582 | }, 583 | { 584 | "mora": "コ", 585 | "consonant": "k", 586 | "vowel": "o" 587 | }, 588 | { 589 | "mora": "ゲ", 590 | "consonant": "g", 591 | "vowel": "e" 592 | }, 593 | { 594 | "mora": 
"ケ", 595 | "consonant": "k", 596 | "vowel": "e" 597 | }, 598 | { 599 | "mora": "グヮ", 600 | "consonant": "gw", 601 | "vowel": "a" 602 | }, 603 | { 604 | "mora": "グ", 605 | "consonant": "g", 606 | "vowel": "u" 607 | }, 608 | { 609 | "mora": "クヮ", 610 | "consonant": "kw", 611 | "vowel": "a" 612 | }, 613 | { 614 | "mora": "ク", 615 | "consonant": "k", 616 | "vowel": "u" 617 | }, 618 | { 619 | "mora": "ギョ", 620 | "consonant": "gy", 621 | "vowel": "o" 622 | }, 623 | { 624 | "mora": "ギュ", 625 | "consonant": "gy", 626 | "vowel": "u" 627 | }, 628 | { 629 | "mora": "ギャ", 630 | "consonant": "gy", 631 | "vowel": "a" 632 | }, 633 | { 634 | "mora": "ギェ", 635 | "consonant": "gy", 636 | "vowel": "e" 637 | }, 638 | { 639 | "mora": "ギ", 640 | "consonant": "g", 641 | "vowel": "i" 642 | }, 643 | { 644 | "mora": "キョ", 645 | "consonant": "ky", 646 | "vowel": "o" 647 | }, 648 | { 649 | "mora": "キュ", 650 | "consonant": "ky", 651 | "vowel": "u" 652 | }, 653 | { 654 | "mora": "キャ", 655 | "consonant": "ky", 656 | "vowel": "a" 657 | }, 658 | { 659 | "mora": "キェ", 660 | "consonant": "ky", 661 | "vowel": "e" 662 | }, 663 | { 664 | "mora": "キ", 665 | "consonant": "k", 666 | "vowel": "i" 667 | }, 668 | { 669 | "mora": "ガ", 670 | "consonant": "g", 671 | "vowel": "a" 672 | }, 673 | { 674 | "mora": "カ", 675 | "consonant": "k", 676 | "vowel": "a" 677 | }, 678 | { 679 | "mora": "オ", 680 | "consonant": null, 681 | "vowel": "o" 682 | }, 683 | { 684 | "mora": "エ", 685 | "consonant": null, 686 | "vowel": "e" 687 | }, 688 | { 689 | "mora": "ウォ", 690 | "consonant": "w", 691 | "vowel": "o" 692 | }, 693 | { 694 | "mora": "ウェ", 695 | "consonant": "w", 696 | "vowel": "e" 697 | }, 698 | { 699 | "mora": "ウィ", 700 | "consonant": "w", 701 | "vowel": "i" 702 | }, 703 | { 704 | "mora": "ウ", 705 | "consonant": null, 706 | "vowel": "u" 707 | }, 708 | { 709 | "mora": "イェ", 710 | "consonant": "y", 711 | "vowel": "e" 712 | }, 713 | { 714 | "mora": "イ", 715 | "consonant": null, 716 | "vowel": "i" 717 | }, 718 | { 719 | 
"mora": "ア", 720 | "consonant": null, 721 | "vowel": "a" 722 | } 723 | ], 724 | "additional": [ 725 | { 726 | "mora": "ヴョ", 727 | "consonant": "by", 728 | "vowel": "o" 729 | }, 730 | { 731 | "mora": "ヴュ", 732 | "consonant": "by", 733 | "vowel": "u" 734 | }, 735 | { 736 | "mora": "ヴャ", 737 | "consonant": "by", 738 | "vowel": "a" 739 | }, 740 | { 741 | "mora": "ヲ", 742 | "consonant": null, 743 | "vowel": "o" 744 | }, 745 | { 746 | "mora": "ヱ", 747 | "consonant": null, 748 | "vowel": "e" 749 | }, 750 | { 751 | "mora": "ヰ", 752 | "consonant": null, 753 | "vowel": "i" 754 | }, 755 | { 756 | "mora": "ヮ", 757 | "consonant": "w", 758 | "vowel": "a" 759 | }, 760 | { 761 | "mora": "ョ", 762 | "consonant": "y", 763 | "vowel": "o" 764 | }, 765 | { 766 | "mora": "ュ", 767 | "consonant": "y", 768 | "vowel": "u" 769 | }, 770 | { 771 | "mora": "ヅ", 772 | "consonant": "z", 773 | "vowel": "u" 774 | }, 775 | { 776 | "mora": "ヂ", 777 | "consonant": "j", 778 | "vowel": "i" 779 | }, 780 | { 781 | "mora": "ヶ", 782 | "consonant": "k", 783 | "vowel": "e" 784 | }, 785 | { 786 | "mora": "ャ", 787 | "consonant": "y", 788 | "vowel": "a" 789 | }, 790 | { 791 | "mora": "ォ", 792 | "consonant": null, 793 | "vowel": "o" 794 | }, 795 | { 796 | "mora": "ェ", 797 | "consonant": null, 798 | "vowel": "e" 799 | }, 800 | { 801 | "mora": "ゥ", 802 | "consonant": null, 803 | "vowel": "u" 804 | }, 805 | { 806 | "mora": "ィ", 807 | "consonant": null, 808 | "vowel": "i" 809 | }, 810 | { 811 | "mora": "ァ", 812 | "consonant": null, 813 | "vowel": "a" 814 | } 815 | ] 816 | } -------------------------------------------------------------------------------- /crates/sbv2_core/src/nlp.rs: -------------------------------------------------------------------------------- 1 | use crate::norm::SYMBOLS; 2 | use once_cell::sync::Lazy; 3 | use std::collections::HashMap; 4 | 5 | static SYMBOL_TO_ID: Lazy> = Lazy::new(|| { 6 | let mut map = HashMap::new(); 7 | for (i, symbols) in SYMBOLS.iter().enumerate() { 8 | 
map.insert(symbols.to_string(), i as i32); 9 | } 10 | map 11 | }); 12 | 13 | pub fn cleaned_text_to_sequence( 14 | cleaned_phones: Vec, 15 | tones: Vec, 16 | ) -> (Vec, Vec, Vec) { 17 | let phones: Vec = cleaned_phones 18 | .iter() 19 | .map(|phone| *SYMBOL_TO_ID.get(phone).unwrap() as i64) 20 | .collect(); 21 | let tones: Vec = tones.iter().map(|tone| (*tone + 6) as i64).collect(); 22 | let lang_ids: Vec = vec![1; phones.len()]; 23 | (phones, tones, lang_ids) 24 | } 25 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/norm.rs: -------------------------------------------------------------------------------- 1 | use once_cell::sync::Lazy; 2 | use std::collections::{HashMap, HashSet}; 3 | 4 | static REPLACE_MAP: Lazy> = Lazy::new(|| { 5 | let mut map = HashMap::new(); 6 | map.insert(":", ","); 7 | map.insert(";", ","); 8 | map.insert(",", ","); 9 | map.insert("。", "."); 10 | map.insert("!", "!"); 11 | map.insert("?", "?"); 12 | map.insert("\n", "."); 13 | map.insert(".", "."); 14 | map.insert("…", "..."); 15 | map.insert("···", "..."); 16 | map.insert("・・・", "..."); 17 | map.insert("·", ","); 18 | map.insert("・", ","); 19 | map.insert("、", ","); 20 | map.insert("$", "."); 21 | map.insert("“", "'"); 22 | map.insert("”", "'"); 23 | map.insert("\"", "'"); 24 | map.insert("‘", "'"); 25 | map.insert("’", "'"); 26 | map.insert("(", "'"); 27 | map.insert(")", "'"); 28 | map.insert("(", "'"); 29 | map.insert(")", "'"); 30 | map.insert("《", "'"); 31 | map.insert("》", "'"); 32 | map.insert("【", "'"); 33 | map.insert("】", "'"); 34 | map.insert("[", "'"); 35 | map.insert("]", "'"); 36 | // NFKC 正規化後のハイフン・ダッシュの変種を全て通常半角ハイフン - \u002d に変換 37 | map.insert("\u{02d7}", "\u{002d}"); // ˗, Modifier Letter Minus Sign 38 | map.insert("\u{2010}", "\u{002d}"); // ‐, Hyphen, 39 | map.insert("\u{2012}", "\u{002d}"); // ‒, Figure Dash 40 | map.insert("\u{2013}", "\u{002d}"); // –, En Dash 41 | map.insert("\u{2014}", "\u{002d}"); // —, Em 
Dash 42 | map.insert("\u{2015}", "\u{002d}"); // ―, Horizontal Bar 43 | map.insert("\u{2043}", "\u{002d}"); // ⁃, Hyphen Bullet 44 | map.insert("\u{2212}", "\u{002d}"); // −, Minus Sign 45 | map.insert("\u{23af}", "\u{002d}"); // ⎯, Horizontal Line Extension 46 | map.insert("\u{23e4}", "\u{002d}"); // ⏤, Straightness 47 | map.insert("\u{2500}", "\u{002d}"); // ─, Box Drawings Light Horizontal 48 | map.insert("\u{2501}", "\u{002d}"); // ━, Box Drawings Heavy Horizontal 49 | map.insert("\u{2e3a}", "\u{002d}"); // ⸺, Two-Em Dash 50 | map.insert("\u{2e3b}", "\u{002d}"); // ⸻, Three-Em Dash 51 | map.insert("「", "'"); 52 | map.insert("」", "'"); 53 | 54 | map 55 | }); 56 | 57 | const ZH_SYMBOLS: [&str; 65] = [ 58 | "E", "En", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", 59 | "f", "g", "h", "i", "i0", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "ir", "iu", 60 | "j", "k", "l", "m", "n", "o", "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", 61 | "uan", "uang", "ui", "un", "uo", "v", "van", "ve", "vn", "w", "x", "y", "z", "zh", "AA", "EE", 62 | "OO", 63 | ]; 64 | pub const JP_SYMBOLS: [&str; 42] = [ 65 | "N", "a", "a:", "b", "by", "ch", "d", "dy", "e", "e:", "f", "g", "gy", "h", "hy", "i", "i:", 66 | "j", "k", "ky", "m", "my", "n", "ny", "o", "o:", "p", "py", "q", "r", "ry", "s", "sh", "t", 67 | "ts", "ty", "u", "u:", "w", "y", "z", "zy", 68 | ]; 69 | pub const EN_SYMBOLS: [&str; 39] = [ 70 | "aa", "ae", "ah", "ao", "aw", "ay", "b", "ch", "d", "dh", "eh", "er", "ey", "f", "g", "hh", 71 | "ih", "iy", "jh", "k", "l", "m", "n", "ng", "ow", "oy", "p", "r", "s", "sh", "t", "th", "uh", 72 | "uw", "V", "w", "y", "z", "zh", 73 | ]; 74 | 75 | pub static PUNCTUATIONS: [&str; 7] = ["!", "?", "…", ",", ".", "'", "-"]; 76 | pub static PUNCTUATION_SYMBOLS: Lazy> = Lazy::new(|| { 77 | let mut symbols = PUNCTUATIONS.to_vec(); 78 | symbols.append(&mut vec!["SP", "UNK"]); 79 | symbols 80 | }); 81 | const PAD: &str = "_"; 82 | pub 
static NORMAL_SYMBOLS: Lazy> = Lazy::new(|| { 83 | let mut symbols: Vec<&str> = ZH_SYMBOLS.to_vec(); 84 | symbols.append(&mut JP_SYMBOLS.to_vec()); 85 | symbols.append(&mut EN_SYMBOLS.to_vec()); 86 | let symbols: HashSet<&str> = symbols.drain(..).collect(); 87 | let mut symbols: Vec<&str> = symbols.into_iter().collect(); 88 | symbols.sort(); 89 | symbols 90 | }); 91 | pub static SYMBOLS: Lazy> = Lazy::new(|| { 92 | let mut symbols = vec![PAD]; 93 | symbols.append(&mut NORMAL_SYMBOLS.clone()); 94 | symbols.append(&mut PUNCTUATION_SYMBOLS.to_vec()); 95 | symbols 96 | }); 97 | 98 | static PUNCTUATION_CLEANUP_PATTERN: Lazy = Lazy::new(|| { 99 | let pattern = r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}" 100 | .to_owned() 101 | + r"\u{0041}-\u{005A}\u{0061}-\u{007A}" 102 | + r"\u{FF21}-\u{FF3A}\u{FF41}-\u{FF5A}" 103 | + r"\u{0370}-\u{03FF}\u{1F00}-\u{1FFF}" 104 | + &PUNCTUATIONS.join("") 105 | + r"]+"; 106 | regex::Regex::new(&pattern).unwrap() 107 | }); 108 | 109 | pub fn normalize_text(text: &str) -> String { 110 | // 日本語のテキストを正規化する 111 | let text = text.replace('~', "ー"); 112 | let text = text.replace('~', "ー"); 113 | 114 | let text = text.replace('〜', "ー"); 115 | 116 | replace_punctuation(text) 117 | } 118 | 119 | pub fn replace_punctuation(mut text: String) -> String { 120 | for (k, v) in REPLACE_MAP.iter() { 121 | text = text.replace(k, v); 122 | } 123 | let content = PUNCTUATION_CLEANUP_PATTERN 124 | .replace_all(&text, "") 125 | .to_string(); 126 | content 127 | } 128 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/sbv2file.rs: -------------------------------------------------------------------------------- 1 | use std::io::{Cursor, Read}; 2 | 3 | use tar::Archive; 4 | use zstd::decode_all; 5 | 6 | use crate::error::{Error, Result}; 7 | 8 | /// Parse a .sbv2 file binary 9 | /// 10 | /// # Examples 11 | /// 12 | /// ```rs 13 | /// parse_sbv2file("tsukuyomi", 
std::fs::read("tsukuyomi.sbv2")?)?; 14 | /// ``` 15 | pub fn parse_sbv2file>(sbv2_bytes: P) -> Result<(Vec, Vec)> { 16 | let mut arc = Archive::new(Cursor::new(decode_all(Cursor::new(sbv2_bytes.as_ref()))?)); 17 | let mut vits2 = None; 18 | let mut style_vectors = None; 19 | let mut et = arc.entries()?; 20 | while let Some(Ok(mut e)) = et.next() { 21 | let pth = String::from_utf8_lossy(&e.path_bytes()).to_string(); 22 | let mut b = Vec::with_capacity(e.size() as usize); 23 | e.read_to_end(&mut b)?; 24 | match pth.as_str() { 25 | "model.onnx" => vits2 = Some(b), 26 | "style_vectors.json" => style_vectors = Some(b), 27 | _ => continue, 28 | } 29 | } 30 | if style_vectors.is_none() { 31 | return Err(Error::ModelNotFoundError("style_vectors".to_string())); 32 | } 33 | if vits2.is_none() { 34 | return Err(Error::ModelNotFoundError("vits2".to_string())); 35 | } 36 | Ok((style_vectors.unwrap(), vits2.unwrap())) 37 | } 38 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/style.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{Error, Result}; 2 | use ndarray::{s, Array1, Array2}; 3 | use serde::Deserialize; 4 | 5 | #[derive(Deserialize)] 6 | pub struct Data { 7 | pub shape: [usize; 2], 8 | pub data: Vec>, 9 | } 10 | 11 | pub fn load_style>(path: P) -> Result> { 12 | let data: Data = serde_json::from_slice(path.as_ref())?; 13 | Ok(Array2::from_shape_vec( 14 | data.shape, 15 | data.data.iter().flatten().copied().collect(), 16 | )?) 17 | } 18 | 19 | pub fn get_style_vector( 20 | style_vectors: &Array2, 21 | style_id: i32, 22 | weight: f32, 23 | ) -> Result> { 24 | if style_vectors.shape().len() != 2 { 25 | return Err(Error::StyleError( 26 | "Invalid shape for style vectors".to_string(), 27 | )); 28 | } 29 | if style_id < 0 || style_id >= style_vectors.shape()[0] as i32 { 30 | return Err(Error::StyleError(format!( 31 | "Invalid style ID: {}. 
Max ID: {}", 32 | style_id, 33 | style_vectors.shape()[0] - 1 34 | ))); 35 | } 36 | let mean = style_vectors.slice(s![0, ..]).to_owned(); 37 | let style_vector = style_vectors.slice(s![style_id as usize, ..]).to_owned(); 38 | let diff = (style_vector - &mean) * weight; 39 | Ok(mean + &diff) 40 | } 41 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/tokenizer.rs: -------------------------------------------------------------------------------- 1 | use crate::error::Result; 2 | pub use tokenizers::Tokenizer; 3 | 4 | pub fn get_tokenizer>(p: P) -> Result { 5 | let tokenizer = Tokenizer::from_bytes(p)?; 6 | Ok(tokenizer) 7 | } 8 | 9 | pub fn tokenize(text: &str, tokenizer: &Tokenizer) -> Result<(Vec, Vec)> { 10 | let mut token_ids = vec![1]; 11 | let mut attention_masks = vec![1]; 12 | for content in text.chars() { 13 | let token = tokenizer.encode(content.to_string(), false)?; 14 | let ids = token.get_ids(); 15 | token_ids.extend(ids.iter().map(|&x| x as i64)); 16 | attention_masks.extend(token.get_attention_mask().iter().map(|&x| x as i64)); 17 | } 18 | token_ids.push(2); 19 | attention_masks.push(1); 20 | Ok((token_ids, attention_masks)) 21 | } 22 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/tts.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{Error, Result}; 2 | use crate::{jtalk, model, style, tokenizer, tts_util}; 3 | #[cfg(feature = "aivmx")] 4 | use base64::prelude::{Engine as _, BASE64_STANDARD}; 5 | #[cfg(feature = "aivmx")] 6 | use ndarray::ShapeBuilder; 7 | use ndarray::{concatenate, Array1, Array2, Array3, Axis}; 8 | use ort::session::Session; 9 | #[cfg(feature = "aivmx")] 10 | use std::io::Cursor; 11 | use tokenizers::Tokenizer; 12 | 13 | #[derive(PartialEq, Eq, Clone)] 14 | pub struct TTSIdent(String); 15 | 16 | impl std::fmt::Display for TTSIdent { 17 | fn fmt(&self, f: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { 18 | f.write_str(&self.0)?; 19 | Ok(()) 20 | } 21 | } 22 | 23 | impl From for TTSIdent 24 | where 25 | S: AsRef, 26 | { 27 | fn from(value: S) -> Self { 28 | TTSIdent(value.as_ref().to_string()) 29 | } 30 | } 31 | 32 | pub struct TTSModel { 33 | vits2: Option, 34 | style_vectors: Array2, 35 | ident: TTSIdent, 36 | bytes: Option>, 37 | } 38 | 39 | /// High-level Style-Bert-VITS2's API 40 | pub struct TTSModelHolder { 41 | tokenizer: Tokenizer, 42 | bert: Session, 43 | models: Vec, 44 | pub jtalk: jtalk::JTalk, 45 | max_loaded_models: Option, 46 | } 47 | 48 | impl TTSModelHolder { 49 | /// Initialize a new TTSModelHolder 50 | /// 51 | /// # Examples 52 | /// 53 | /// ```rs 54 | /// let mut tts_holder = TTSModelHolder::new(std::fs::read("deberta.onnx")?, std::fs::read("tokenizer.json")?, None)?; 55 | /// ``` 56 | pub fn new>( 57 | bert_model_bytes: P, 58 | tokenizer_bytes: P, 59 | max_loaded_models: Option, 60 | ) -> Result { 61 | let bert = model::load_model(bert_model_bytes, true)?; 62 | let jtalk = jtalk::JTalk::new()?; 63 | let tokenizer = tokenizer::get_tokenizer(tokenizer_bytes)?; 64 | Ok(TTSModelHolder { 65 | bert, 66 | models: vec![], 67 | jtalk, 68 | tokenizer, 69 | max_loaded_models, 70 | }) 71 | } 72 | 73 | /// Return a list of model names 74 | pub fn models(&self) -> Vec { 75 | self.models.iter().map(|m| m.ident.to_string()).collect() 76 | } 77 | 78 | #[cfg(feature = "aivmx")] 79 | pub fn load_aivmx, P: AsRef<[u8]>>( 80 | &mut self, 81 | ident: I, 82 | aivmx_bytes: P, 83 | ) -> Result<()> { 84 | let ident = ident.into(); 85 | if self.find_model(ident.clone()).is_err() { 86 | let mut load = true; 87 | if let Some(max) = self.max_loaded_models { 88 | if self.models.iter().filter(|x| x.vits2.is_some()).count() >= max { 89 | load = false; 90 | } 91 | } 92 | let model = model::load_model(&aivmx_bytes, false)?; 93 | let metadata = model.metadata()?; 94 | if let Some(aivm_style_vectors) = 
metadata.custom("aivm_style_vectors")? { 95 | let aivm_style_vectors = BASE64_STANDARD.decode(aivm_style_vectors)?; 96 | let style_vectors = Cursor::new(&aivm_style_vectors); 97 | let reader = npyz::NpyFile::new(style_vectors)?; 98 | let style_vectors = { 99 | let shape = reader.shape().to_vec(); 100 | let order = reader.order(); 101 | let data = reader.into_vec::()?; 102 | let shape = match shape[..] { 103 | [i1, i2] => [i1 as usize, i2 as usize], 104 | _ => panic!("expected 2D array"), 105 | }; 106 | let true_shape = shape.set_f(order == npyz::Order::Fortran); 107 | ndarray::Array2::from_shape_vec(true_shape, data)? 108 | }; 109 | drop(metadata); 110 | self.models.push(TTSModel { 111 | vits2: if load { Some(model) } else { None }, 112 | bytes: if self.max_loaded_models.is_some() { 113 | Some(aivmx_bytes.as_ref().to_vec()) 114 | } else { 115 | None 116 | }, 117 | ident, 118 | style_vectors, 119 | }) 120 | } 121 | } 122 | Ok(()) 123 | } 124 | 125 | /// Load a .sbv2 file binary 126 | /// 127 | /// # Examples 128 | /// 129 | /// ```rs 130 | /// tts_holder.load_sbv2file("tsukuyomi", std::fs::read("tsukuyomi.sbv2")?)?; 131 | /// ``` 132 | pub fn load_sbv2file, P: AsRef<[u8]>>( 133 | &mut self, 134 | ident: I, 135 | sbv2_bytes: P, 136 | ) -> Result<()> { 137 | let (style_vectors, vits2) = crate::sbv2file::parse_sbv2file(sbv2_bytes)?; 138 | self.load(ident, style_vectors, vits2)?; 139 | Ok(()) 140 | } 141 | 142 | /// Load a style vector and onnx model binary 143 | /// 144 | /// # Examples 145 | /// 146 | /// ```rs 147 | /// tts_holder.load("tsukuyomi", std::fs::read("style_vectors.json")?, std::fs::read("model.onnx")?)?; 148 | /// ``` 149 | pub fn load, P: AsRef<[u8]>>( 150 | &mut self, 151 | ident: I, 152 | style_vectors_bytes: P, 153 | vits2_bytes: P, 154 | ) -> Result<()> { 155 | let ident = ident.into(); 156 | if self.find_model(ident.clone()).is_err() { 157 | let mut load = true; 158 | if let Some(max) = self.max_loaded_models { 159 | if 
self.models.iter().filter(|x| x.vits2.is_some()).count() >= max { 160 | load = false; 161 | } 162 | } 163 | self.models.push(TTSModel { 164 | vits2: if load { 165 | Some(model::load_model(&vits2_bytes, false)?) 166 | } else { 167 | None 168 | }, 169 | style_vectors: style::load_style(style_vectors_bytes)?, 170 | ident, 171 | bytes: if self.max_loaded_models.is_some() { 172 | Some(vits2_bytes.as_ref().to_vec()) 173 | } else { 174 | None 175 | }, 176 | }) 177 | } 178 | Ok(()) 179 | } 180 | 181 | /// Unload a model 182 | pub fn unload>(&mut self, ident: I) -> bool { 183 | let ident = ident.into(); 184 | if let Some((i, _)) = self 185 | .models 186 | .iter() 187 | .enumerate() 188 | .find(|(_, m)| m.ident == ident) 189 | { 190 | self.models.remove(i); 191 | true 192 | } else { 193 | false 194 | } 195 | } 196 | 197 | /// Parse text and return the input for synthesize 198 | /// 199 | /// # Note 200 | /// This function is for low-level usage, use `easy_synthesize` for high-level usage. 201 | #[allow(clippy::type_complexity)] 202 | pub fn parse_text( 203 | &mut self, 204 | text: &str, 205 | ) -> Result<(Array2, Array1, Array1, Array1)> { 206 | crate::tts_util::parse_text_blocking( 207 | text, 208 | None, 209 | &self.jtalk, 210 | &self.tokenizer, 211 | |token_ids, attention_masks| { 212 | crate::bert::predict(&mut self.bert, token_ids, attention_masks) 213 | }, 214 | ) 215 | } 216 | 217 | #[allow(clippy::type_complexity)] 218 | pub fn parse_text_neo( 219 | &mut self, 220 | text: String, 221 | given_tones: Option>, 222 | ) -> Result<(Array2, Array1, Array1, Array1)> { 223 | crate::tts_util::parse_text_blocking( 224 | &text, 225 | given_tones, 226 | &self.jtalk, 227 | &self.tokenizer, 228 | |token_ids, attention_masks| { 229 | crate::bert::predict(&mut self.bert, token_ids, attention_masks) 230 | }, 231 | ) 232 | } 233 | 234 | fn find_model>(&mut self, ident: I) -> Result<&mut TTSModel> { 235 | let ident = ident.into(); 236 | self.models 237 | .iter_mut() 238 | .find(|m| 
m.ident == ident) 239 | .ok_or(Error::ModelNotFoundError(ident.to_string())) 240 | } 241 | fn find_and_load_model>(&mut self, ident: I) -> Result { 242 | let ident = ident.into(); 243 | let (bytes, style_vectors) = { 244 | let model = self 245 | .models 246 | .iter() 247 | .find(|m| m.ident == ident) 248 | .ok_or(Error::ModelNotFoundError(ident.to_string()))?; 249 | if model.vits2.is_some() { 250 | return Ok(true); 251 | } 252 | (model.bytes.clone().unwrap(), model.style_vectors.clone()) 253 | }; 254 | self.unload(ident.clone()); 255 | let s = model::load_model(&bytes, false)?; 256 | if let Some(max) = self.max_loaded_models { 257 | if self.models.iter().filter(|x| x.vits2.is_some()).count() >= max { 258 | self.unload(self.models.first().unwrap().ident.clone()); 259 | } 260 | } 261 | self.models.push(TTSModel { 262 | bytes: Some(bytes.to_vec()), 263 | vits2: Some(s), 264 | style_vectors, 265 | ident: ident.clone(), 266 | }); 267 | let model = self 268 | .models 269 | .iter() 270 | .find(|m| m.ident == ident) 271 | .ok_or(Error::ModelNotFoundError(ident.to_string()))?; 272 | if model.vits2.is_some() { 273 | return Ok(true); 274 | } 275 | Err(Error::ModelNotFoundError(ident.to_string())) 276 | } 277 | 278 | /// Get style vector by style id and weight 279 | /// 280 | /// # Note 281 | /// This function is for low-level usage, use `easy_synthesize` for high-level usage. 
282 | pub fn get_style_vector>( 283 | &mut self, 284 | ident: I, 285 | style_id: i32, 286 | weight: f32, 287 | ) -> Result> { 288 | style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight) 289 | } 290 | 291 | /// Synthesize text to audio 292 | /// 293 | /// # Examples 294 | /// 295 | /// ```rs 296 | /// let audio = tts_holder.easy_synthesize("tsukuyomi", "こんにちは", 0, SynthesizeOptions::default())?; 297 | /// ``` 298 | pub fn easy_synthesize + Copy>( 299 | &mut self, 300 | ident: I, 301 | text: &str, 302 | style_id: i32, 303 | speaker_id: i64, 304 | options: SynthesizeOptions, 305 | ) -> Result> { 306 | self.find_and_load_model(ident)?; 307 | let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?; 308 | let audio_array = if options.split_sentences { 309 | let texts: Vec<&str> = text.split('\n').collect(); 310 | let mut audios = vec![]; 311 | for (i, t) in texts.iter().enumerate() { 312 | if t.is_empty() { 313 | continue; 314 | } 315 | let (bert_ori, phones, tones, lang_ids) = self.parse_text(t)?; 316 | 317 | let vits2 = self 318 | .find_model(ident)? 319 | .vits2 320 | .as_mut() 321 | .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?; 322 | let audio = model::synthesize( 323 | vits2, 324 | bert_ori.to_owned(), 325 | phones, 326 | Array1::from_vec(vec![speaker_id]), 327 | tones, 328 | lang_ids, 329 | style_vector.clone(), 330 | options.sdp_ratio, 331 | options.length_scale, 332 | 0.677, 333 | 0.8, 334 | )?; 335 | audios.push(audio.clone()); 336 | if i != texts.len() - 1 { 337 | audios.push(Array3::zeros((1, 1, 22050))); 338 | } 339 | } 340 | concatenate( 341 | Axis(2), 342 | &audios.iter().map(|x| x.view()).collect::>(), 343 | )? 344 | } else { 345 | let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?; 346 | 347 | let vits2 = self 348 | .find_model(ident)? 
349 | .vits2 350 | .as_mut() 351 | .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?; 352 | model::synthesize( 353 | vits2, 354 | bert_ori.to_owned(), 355 | phones, 356 | Array1::from_vec(vec![speaker_id]), 357 | tones, 358 | lang_ids, 359 | style_vector, 360 | options.sdp_ratio, 361 | options.length_scale, 362 | 0.677, 363 | 0.8, 364 | )? 365 | }; 366 | tts_util::array_to_vec(audio_array) 367 | } 368 | 369 | pub fn easy_synthesize_neo + Copy>( 370 | &mut self, 371 | ident: I, 372 | text: &str, 373 | given_tones: Option>, 374 | style_id: i32, 375 | speaker_id: i64, 376 | options: SynthesizeOptions, 377 | ) -> Result> { 378 | self.find_and_load_model(ident)?; 379 | let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?; 380 | let audio_array = if options.split_sentences { 381 | let texts: Vec<&str> = text.split('\n').collect(); 382 | let mut audios = vec![]; 383 | for (i, t) in texts.iter().enumerate() { 384 | if t.is_empty() { 385 | continue; 386 | } 387 | let (bert_ori, phones, tones, lang_ids) = 388 | self.parse_text_neo(t.to_string(), given_tones.clone())?; 389 | 390 | let vits2 = self 391 | .find_model(ident)? 392 | .vits2 393 | .as_mut() 394 | .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?; 395 | let audio = model::synthesize( 396 | vits2, 397 | bert_ori.to_owned(), 398 | phones, 399 | Array1::from_vec(vec![speaker_id]), 400 | tones, 401 | lang_ids, 402 | style_vector.clone(), 403 | options.sdp_ratio, 404 | options.length_scale, 405 | 0.677, 406 | 0.8, 407 | )?; 408 | audios.push(audio.clone()); 409 | if i != texts.len() - 1 { 410 | audios.push(Array3::zeros((1, 1, 22050))); 411 | } 412 | } 413 | concatenate( 414 | Axis(2), 415 | &audios.iter().map(|x| x.view()).collect::>(), 416 | )? 417 | } else { 418 | let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?; 419 | 420 | let vits2 = self 421 | .find_model(ident)? 
422 | .vits2 423 | .as_mut() 424 | .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?; 425 | model::synthesize( 426 | vits2, 427 | bert_ori.to_owned(), 428 | phones, 429 | Array1::from_vec(vec![speaker_id]), 430 | tones, 431 | lang_ids, 432 | style_vector, 433 | options.sdp_ratio, 434 | options.length_scale, 435 | 0.677, 436 | 0.8, 437 | )? 438 | }; 439 | tts_util::array_to_vec(audio_array) 440 | } 441 | } 442 | 443 | /// Synthesize options 444 | /// 445 | /// # Fields 446 | /// - `sdp_ratio`: SDP ratio 447 | /// - `length_scale`: Length scale 448 | /// - `style_weight`: Style weight 449 | /// - `split_sentences`: Split sentences 450 | pub struct SynthesizeOptions { 451 | pub sdp_ratio: f32, 452 | pub length_scale: f32, 453 | pub style_weight: f32, 454 | pub split_sentences: bool, 455 | } 456 | 457 | impl Default for SynthesizeOptions { 458 | fn default() -> Self { 459 | SynthesizeOptions { 460 | sdp_ratio: 0.0, 461 | length_scale: 1.0, 462 | style_weight: 1.0, 463 | split_sentences: true, 464 | } 465 | } 466 | } 467 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/tts_util.rs: -------------------------------------------------------------------------------- 1 | use std::io::Cursor; 2 | 3 | use crate::error::Result; 4 | use crate::jtalk::JTalkProcess; 5 | use crate::mora::MORA_KATA_TO_MORA_PHONEMES; 6 | use crate::norm::PUNCTUATIONS; 7 | use crate::{jtalk, nlp, norm, tokenizer, utils}; 8 | use hound::{SampleFormat, WavSpec, WavWriter}; 9 | use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; 10 | use tokenizers::Tokenizer; 11 | 12 | pub fn preprocess_parse_text(text: &str, jtalk: &jtalk::JTalk) -> Result<(String, JTalkProcess)> { 13 | let text = jtalk.num2word(text)?; 14 | let normalized_text = norm::normalize_text(&text); 15 | 16 | let process = jtalk.process_text(&normalized_text)?; 17 | Ok((normalized_text, process)) 18 | } 19 | 20 | /// Parse text and return the input for synthesize 
21 | /// 22 | /// # Note 23 | /// This function is for low-level usage, use `easy_synthesize` for high-level usage. 24 | #[allow(clippy::type_complexity)] 25 | pub async fn parse_text( 26 | text: &str, 27 | jtalk: &jtalk::JTalk, 28 | tokenizer: &Tokenizer, 29 | bert_predict: impl FnOnce( 30 | Vec, 31 | Vec, 32 | ) -> std::pin::Pin< 33 | Box>>>, 34 | >, 35 | ) -> Result<(Array2, Array1, Array1, Array1)> { 36 | let (normalized_text, process) = preprocess_parse_text(text, jtalk)?; 37 | let (phones, tones, mut word2ph) = process.g2p()?; 38 | let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); 39 | let phones = utils::intersperse(&phones, 0); 40 | let tones = utils::intersperse(&tones, 0); 41 | let lang_ids = utils::intersperse(&lang_ids, 0); 42 | for item in &mut word2ph { 43 | *item *= 2; 44 | } 45 | word2ph[0] += 1; 46 | 47 | let text = { 48 | let (seq_text, _) = process.text_to_seq_kata()?; 49 | seq_text.join("") 50 | }; 51 | let (token_ids, attention_masks) = tokenizer::tokenize(&text, tokenizer)?; 52 | 53 | let bert_content = bert_predict(token_ids, attention_masks).await?; 54 | 55 | assert!( 56 | word2ph.len() == text.chars().count() + 2, 57 | "{} {}", 58 | word2ph.len(), 59 | normalized_text.chars().count() 60 | ); 61 | 62 | let mut phone_level_feature = vec![]; 63 | for (i, reps) in word2ph.iter().enumerate() { 64 | let repeat_feature = { 65 | let (reps_rows, reps_cols) = (*reps, 1); 66 | let arr_len = bert_content.slice(s![i, ..]).len(); 67 | 68 | let mut results: Array2 = Array::zeros((reps_rows as usize, arr_len * reps_cols)); 69 | 70 | for j in 0..reps_rows { 71 | for k in 0..reps_cols { 72 | let mut view = results.slice_mut(s![j, k * arr_len..(k + 1) * arr_len]); 73 | view.assign(&bert_content.slice(s![i, ..])); 74 | } 75 | } 76 | results 77 | }; 78 | phone_level_feature.push(repeat_feature); 79 | } 80 | let phone_level_feature = concatenate( 81 | Axis(0), 82 | &phone_level_feature 83 | .iter() 84 | .map(|x| x.view()) 85 | 
.collect::>(), 86 | )?; 87 | let bert_ori = phone_level_feature.t(); 88 | Ok(( 89 | bert_ori.to_owned(), 90 | phones.into(), 91 | tones.into(), 92 | lang_ids.into(), 93 | )) 94 | } 95 | 96 | /// Parse text and return the input for synthesize 97 | /// 98 | /// # Note 99 | /// This function is for low-level usage, use `easy_synthesize` for high-level usage. 100 | #[allow(clippy::type_complexity)] 101 | pub fn parse_text_blocking( 102 | text: &str, 103 | given_tones: Option>, 104 | jtalk: &jtalk::JTalk, 105 | tokenizer: &Tokenizer, 106 | bert_predict: impl FnOnce(Vec, Vec) -> Result>, 107 | ) -> Result<(Array2, Array1, Array1, Array1)> { 108 | let text = jtalk.num2word(text)?; 109 | let normalized_text = norm::normalize_text(&text); 110 | 111 | let process = jtalk.process_text(&normalized_text)?; 112 | let (phones, mut tones, mut word2ph) = process.g2p()?; 113 | if let Some(given_tones) = given_tones { 114 | tones = given_tones; 115 | } 116 | let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); 117 | 118 | let phones = utils::intersperse(&phones, 0); 119 | let tones = utils::intersperse(&tones, 0); 120 | let lang_ids = utils::intersperse(&lang_ids, 0); 121 | for item in &mut word2ph { 122 | *item *= 2; 123 | } 124 | word2ph[0] += 1; 125 | 126 | let text = { 127 | let (seq_text, _) = process.text_to_seq_kata()?; 128 | seq_text.join("") 129 | }; 130 | let (token_ids, attention_masks) = tokenizer::tokenize(&text, tokenizer)?; 131 | 132 | let bert_content = bert_predict(token_ids, attention_masks)?; 133 | 134 | assert!( 135 | word2ph.len() == text.chars().count() + 2, 136 | "{} {}", 137 | word2ph.len(), 138 | normalized_text.chars().count() 139 | ); 140 | 141 | let mut phone_level_feature = vec![]; 142 | for (i, reps) in word2ph.iter().enumerate() { 143 | let repeat_feature = { 144 | let (reps_rows, reps_cols) = (*reps, 1); 145 | let arr_len = bert_content.slice(s![i, ..]).len(); 146 | 147 | let mut results: Array2 = Array::zeros((reps_rows as 
usize, arr_len * reps_cols)); 148 | 149 | for j in 0..reps_rows { 150 | for k in 0..reps_cols { 151 | let mut view = results.slice_mut(s![j, k * arr_len..(k + 1) * arr_len]); 152 | view.assign(&bert_content.slice(s![i, ..])); 153 | } 154 | } 155 | results 156 | }; 157 | phone_level_feature.push(repeat_feature); 158 | } 159 | let phone_level_feature = concatenate( 160 | Axis(0), 161 | &phone_level_feature 162 | .iter() 163 | .map(|x| x.view()) 164 | .collect::>(), 165 | )?; 166 | let bert_ori = phone_level_feature.t(); 167 | Ok(( 168 | bert_ori.to_owned(), 169 | phones.into(), 170 | tones.into(), 171 | lang_ids.into(), 172 | )) 173 | } 174 | 175 | pub fn array_to_vec(audio_array: Array3) -> Result> { 176 | let spec = WavSpec { 177 | channels: 1, 178 | sample_rate: 44100, 179 | bits_per_sample: 32, 180 | sample_format: SampleFormat::Float, 181 | }; 182 | let mut cursor = Cursor::new(Vec::new()); 183 | let mut writer = WavWriter::new(&mut cursor, spec)?; 184 | for i in 0..audio_array.shape()[0] { 185 | let output = audio_array.slice(s![i, 0, ..]).to_vec(); 186 | for sample in output { 187 | writer.write_sample(sample)?; 188 | } 189 | } 190 | writer.finalize()?; 191 | Ok(cursor.into_inner()) 192 | } 193 | 194 | pub fn kata_tone2phone_tone(kata_tone: Vec<(String, i32)>) -> Vec<(String, i32)> { 195 | let mut results = vec![("_".to_string(), 0)]; 196 | for (mora, tone) in kata_tone { 197 | if PUNCTUATIONS.contains(&mora.as_str()) { 198 | results.push((mora, 0)); 199 | continue; 200 | } else { 201 | let (consonant, vowel) = MORA_KATA_TO_MORA_PHONEMES.get(&mora).unwrap(); 202 | if let Some(consonant) = consonant { 203 | results.push((consonant.to_string(), tone)); 204 | results.push((vowel.to_string(), tone)); 205 | } else { 206 | results.push((vowel.to_string(), tone)); 207 | } 208 | } 209 | } 210 | results.push(("_".to_string(), 0)); 211 | results 212 | } 213 | -------------------------------------------------------------------------------- /crates/sbv2_core/src/utils.rs: 
-------------------------------------------------------------------------------- 1 | pub fn intersperse(slice: &[T], sep: T) -> Vec 2 | where 3 | T: Clone, 4 | { 5 | let mut result = vec![sep.clone(); slice.len() * 2 + 1]; 6 | result 7 | .iter_mut() 8 | .step_by(2) 9 | .zip(slice.iter()) 10 | .for_each(|(r, s)| *r = s.clone()); 11 | result 12 | } 13 | -------------------------------------------------------------------------------- /crates/sbv2_editor/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sbv2_editor" 3 | version.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | license.workspace = true 7 | readme.workspace = true 8 | repository.workspace = true 9 | documentation.workspace = true 10 | 11 | [dependencies] 12 | anyhow.workspace = true 13 | axum = "0.8.1" 14 | dotenvy.workspace = true 15 | env_logger.workspace = true 16 | log = "0.4.27" 17 | sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core", features = ["aivmx"] } 18 | serde = { version = "1.0.219", features = ["derive"] } 19 | tokio = { version = "1.44.1", features = ["full"] } 20 | -------------------------------------------------------------------------------- /crates/sbv2_editor/README.md: -------------------------------------------------------------------------------- 1 | # sbv2-voicevox 2 | sbv2-apiをvoicevox化します。 -------------------------------------------------------------------------------- /crates/sbv2_editor/query2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accent_phrases": [ 3 | { 4 | "moras": [ 5 | { 6 | "text": "コ", 7 | "consonant": "k", 8 | "consonant_length": 0.10002632439136505, 9 | "vowel": "o", 10 | "vowel_length": 0.15740256011486053, 11 | "pitch": 5.749961853027344 12 | }, 13 | { 14 | "text": "ン", 15 | "consonant": null, 16 | "consonant_length": null, 17 | "vowel": "N", 18 | "vowel_length": 0.08265873789787292, 19 | 
"pitch": 5.89122200012207 20 | }, 21 | { 22 | "text": "ニ", 23 | "consonant": "n", 24 | "consonant_length": 0.03657080978155136, 25 | "vowel": "i", 26 | "vowel_length": 0.1175866425037384, 27 | "pitch": 5.969866752624512 28 | }, 29 | { 30 | "text": "チ", 31 | "consonant": "ch", 32 | "consonant_length": 0.09005842357873917, 33 | "vowel": "i", 34 | "vowel_length": 0.08666137605905533, 35 | "pitch": 5.958892822265625 36 | }, 37 | { 38 | "text": "ワ", 39 | "consonant": "w", 40 | "consonant_length": 0.07833231985569, 41 | "vowel": "a", 42 | "vowel_length": 0.21250136196613312, 43 | "pitch": 5.949411392211914 44 | } 45 | ], 46 | "accent": 5, 47 | "pause_mora": { 48 | "text": "、", 49 | "consonant": null, 50 | "consonant_length": null, 51 | "vowel": "pau", 52 | "vowel_length": 0.4723339378833771, 53 | "pitch": 0.0 54 | }, 55 | "is_interrogative": false 56 | }, 57 | { 58 | "moras": [ 59 | { 60 | "text": "オ", 61 | "consonant": null, 62 | "consonant_length": null, 63 | "vowel": "o", 64 | "vowel_length": 0.22004225850105286, 65 | "pitch": 5.6870927810668945 66 | }, 67 | { 68 | "text": "ン", 69 | "consonant": null, 70 | "consonant_length": null, 71 | "vowel": "N", 72 | "vowel_length": 0.09161105751991272, 73 | "pitch": 5.93472957611084 74 | }, 75 | { 76 | "text": "セ", 77 | "consonant": "s", 78 | "consonant_length": 0.08924821764230728, 79 | "vowel": "e", 80 | "vowel_length": 0.14142127335071564, 81 | "pitch": 6.121850490570068 82 | }, 83 | { 84 | "text": "エ", 85 | "consonant": null, 86 | "consonant_length": null, 87 | "vowel": "e", 88 | "vowel_length": 0.10636933892965317, 89 | "pitch": 6.157896041870117 90 | }, 91 | { 92 | "text": "ゴ", 93 | "consonant": "g", 94 | "consonant_length": 0.07600915431976318, 95 | "vowel": "o", 96 | "vowel_length": 0.09598273783922195, 97 | "pitch": 6.188933849334717 98 | }, 99 | { 100 | "text": "オ", 101 | "consonant": null, 102 | "consonant_length": null, 103 | "vowel": "o", 104 | "vowel_length": 0.1079121008515358, 105 | "pitch": 6.235202789306641 106 
| }, 107 | { 108 | "text": "セ", 109 | "consonant": "s", 110 | "consonant_length": 0.09591838717460632, 111 | "vowel": "e", 112 | "vowel_length": 0.10286372154951096, 113 | "pitch": 6.153214454650879 114 | }, 115 | { 116 | "text": "エ", 117 | "consonant": null, 118 | "consonant_length": null, 119 | "vowel": "e", 120 | "vowel_length": 0.08992656320333481, 121 | "pitch": 6.02571439743042 122 | }, 123 | { 124 | "text": "ノ", 125 | "consonant": "n", 126 | "consonant_length": 0.05660202354192734, 127 | "vowel": "o", 128 | "vowel_length": 0.09676017612218857, 129 | "pitch": 5.711844444274902 130 | } 131 | ], 132 | "accent": 5, 133 | "pause_mora": null, 134 | "is_interrogative": false 135 | }, 136 | { 137 | "moras": [ 138 | { 139 | "text": "セ", 140 | "consonant": "s", 141 | "consonant_length": 0.07805486768484116, 142 | "vowel": "e", 143 | "vowel_length": 0.09617523103952408, 144 | "pitch": 5.774399280548096 145 | }, 146 | { 147 | "text": "カ", 148 | "consonant": "k", 149 | "consonant_length": 0.06712044775485992, 150 | "vowel": "a", 151 | "vowel_length": 0.148829385638237, 152 | "pitch": 6.063965797424316 153 | }, 154 | { 155 | "text": "イ", 156 | "consonant": null, 157 | "consonant_length": null, 158 | "vowel": "i", 159 | "vowel_length": 0.11061104387044907, 160 | "pitch": 6.040698051452637 161 | }, 162 | { 163 | "text": "エ", 164 | "consonant": null, 165 | "consonant_length": null, 166 | "vowel": "e", 167 | "vowel_length": 0.13046696782112122, 168 | "pitch": 5.806027889251709 169 | } 170 | ], 171 | "accent": 1, 172 | "pause_mora": null, 173 | "is_interrogative": false 174 | }, 175 | { 176 | "moras": [ 177 | { 178 | "text": "ヨ", 179 | "consonant": "y", 180 | "consonant_length": 0.07194744795560837, 181 | "vowel": "o", 182 | "vowel_length": 0.08622600883245468, 183 | "pitch": 5.694094657897949 184 | }, 185 | { 186 | "text": "オ", 187 | "consonant": null, 188 | "consonant_length": null, 189 | "vowel": "o", 190 | "vowel_length": 0.10635452717542648, 191 | "pitch": 
5.787222385406494 192 | }, 193 | { 194 | "text": "コ", 195 | "consonant": "k", 196 | "consonant_length": 0.07077334076166153, 197 | "vowel": "o", 198 | "vowel_length": 0.09248624742031097, 199 | "pitch": 5.793357849121094 200 | }, 201 | { 202 | "text": "ソ", 203 | "consonant": "s", 204 | "consonant_length": 0.08705667406320572, 205 | "vowel": "o", 206 | "vowel_length": 0.2238258570432663, 207 | "pitch": 5.643765449523926 208 | } 209 | ], 210 | "accent": 1, 211 | "pause_mora": null, 212 | "is_interrogative": false 213 | } 214 | ], 215 | "speedScale": 1.0, 216 | "pitchScale": 0.0, 217 | "intonationScale": 1.0, 218 | "volumeScale": 1.0, 219 | "prePhonemeLength": 0.1, 220 | "postPhonemeLength": 0.1, 221 | "pauseLength": null, 222 | "pauseLengthScale": 1.0, 223 | "outputSamplingRate": 24000, 224 | "outputStereo": false, 225 | "kana": "コンニチワ'、オンセエゴ'オセエノ/セ'カイエ/ヨ'オコソ" 226 | } -------------------------------------------------------------------------------- /crates/sbv2_editor/src/error.rs: -------------------------------------------------------------------------------- 1 | use axum::{ 2 | http::StatusCode, 3 | response::{IntoResponse, Response}, 4 | }; 5 | 6 | pub type AppResult = std::result::Result; 7 | 8 | pub struct AppError(anyhow::Error); 9 | 10 | impl IntoResponse for AppError { 11 | fn into_response(self) -> Response { 12 | ( 13 | StatusCode::INTERNAL_SERVER_ERROR, 14 | format!("Something went wrong: {}", self.0), 15 | ) 16 | .into_response() 17 | } 18 | } 19 | 20 | impl From for AppError 21 | where 22 | E: Into, 23 | { 24 | fn from(err: E) -> Self { 25 | Self(err.into()) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /crates/sbv2_editor/src/main.rs: -------------------------------------------------------------------------------- 1 | use axum::extract::State; 2 | use axum::{ 3 | extract::Query, 4 | http::header::CONTENT_TYPE, 5 | response::IntoResponse, 6 | routing::{get, post}, 7 | Json, Router, 8 | }; 9 | use 
sbv2_core::tts_util::kata_tone2phone_tone; 10 | use sbv2_core::{ 11 | tts::{SynthesizeOptions, TTSModelHolder}, 12 | tts_util::preprocess_parse_text, 13 | }; 14 | use serde::{Deserialize, Serialize}; 15 | use tokio::{fs, net::TcpListener, sync::Mutex}; 16 | 17 | use std::env; 18 | use std::sync::Arc; 19 | 20 | use error::AppResult; 21 | 22 | mod error; 23 | 24 | #[derive(Deserialize)] 25 | struct RequestCreateAudioQuery { 26 | text: String, 27 | } 28 | 29 | #[derive(Serialize, Deserialize)] 30 | struct AudioQuery { 31 | kana: String, 32 | tone: i32, 33 | } 34 | 35 | #[derive(Serialize)] 36 | struct ResponseCreateAudioQuery { 37 | audio_query: Vec, 38 | text: String, 39 | } 40 | 41 | async fn create_audio_query( 42 | State(state): State, 43 | Query(request): Query, 44 | ) -> AppResult { 45 | let (text, process) = { 46 | let tts_model = state.tts_model.lock().await; 47 | preprocess_parse_text(&request.text, &tts_model.jtalk)? 48 | }; 49 | let kana_tone_list = process.g2kana_tone()?; 50 | let audio_query = kana_tone_list 51 | .iter() 52 | .map(|(kana, tone)| AudioQuery { 53 | kana: kana.clone(), 54 | tone: *tone, 55 | }) 56 | .collect::>(); 57 | Ok(Json(ResponseCreateAudioQuery { audio_query, text })) 58 | } 59 | 60 | #[derive(Deserialize)] 61 | pub struct RequestSynthesis { 62 | text: String, 63 | speaker_id: i64, 64 | sdp_ratio: f32, 65 | length_scale: f32, 66 | style_id: i32, 67 | audio_query: Vec, 68 | ident: String, 69 | } 70 | 71 | async fn synthesis( 72 | State(state): State, 73 | Json(request): Json, 74 | ) -> AppResult { 75 | let phone_tone = request 76 | .audio_query 77 | .iter() 78 | .map(|query| (query.kana.clone(), query.tone)) 79 | .collect::>(); 80 | let phone_tone = kata_tone2phone_tone(phone_tone); 81 | let tones = phone_tone.iter().map(|(_, tone)| *tone).collect::>(); 82 | let buffer = { 83 | let mut tts_model = state.tts_model.lock().await; 84 | tts_model.easy_synthesize_neo( 85 | &request.ident, 86 | &request.text, 87 | Some(tones), 88 | 
request.style_id, 89 | request.speaker_id, 90 | SynthesizeOptions { 91 | sdp_ratio: request.sdp_ratio, 92 | length_scale: request.length_scale, 93 | ..Default::default() 94 | }, 95 | )? 96 | }; 97 | Ok(([(CONTENT_TYPE, "audio/wav")], buffer)) 98 | } 99 | 100 | #[derive(Clone)] 101 | struct AppState { 102 | tts_model: Arc>, 103 | } 104 | 105 | impl AppState { 106 | pub async fn new() -> anyhow::Result { 107 | let mut tts_model = TTSModelHolder::new( 108 | &fs::read(env::var("BERT_MODEL_PATH")?).await?, 109 | &fs::read(env::var("TOKENIZER_PATH")?).await?, 110 | env::var("HOLDER_MAX_LOADED_MODElS") 111 | .ok() 112 | .and_then(|x| x.parse().ok()), 113 | )?; 114 | let models = env::var("MODELS_PATH").unwrap_or("models".to_string()); 115 | let mut f = fs::read_dir(&models).await?; 116 | let mut entries = vec![]; 117 | while let Ok(Some(e)) = f.next_entry().await { 118 | let name = e.file_name().to_string_lossy().to_string(); 119 | if name.ends_with(".onnx") && name.starts_with("model_") { 120 | let name_len = name.len(); 121 | let name = name.chars(); 122 | entries.push( 123 | name.collect::>()[6..name_len - 5] 124 | .iter() 125 | .collect::(), 126 | ); 127 | } else if name.ends_with(".sbv2") { 128 | let entry = &name[..name.len() - 5]; 129 | log::info!("Try loading: {entry}"); 130 | let sbv2_bytes = match fs::read(format!("{models}/{entry}.sbv2")).await { 131 | Ok(b) => b, 132 | Err(e) => { 133 | log::warn!("Error loading sbv2_bytes from file {entry}: {e}"); 134 | continue; 135 | } 136 | }; 137 | if let Err(e) = tts_model.load_sbv2file(entry, sbv2_bytes) { 138 | log::warn!("Error loading {entry}: {e}"); 139 | }; 140 | log::info!("Loaded: {entry}"); 141 | } else if name.ends_with(".aivmx") { 142 | let entry = &name[..name.len() - 6]; 143 | log::info!("Try loading: {entry}"); 144 | let aivmx_bytes = match fs::read(format!("{models}/{entry}.aivmx")).await { 145 | Ok(b) => b, 146 | Err(e) => { 147 | log::warn!("Error loading aivmx bytes from file {entry}: {e}"); 148 | 
continue; 149 | } 150 | }; 151 | if let Err(e) = tts_model.load_aivmx(entry, aivmx_bytes) { 152 | log::error!("Error loading {entry}: {e}"); 153 | } 154 | log::info!("Loaded: {entry}"); 155 | } 156 | } 157 | for entry in entries { 158 | log::info!("Try loading: {entry}"); 159 | let style_vectors_bytes = 160 | match fs::read(format!("{models}/style_vectors_{entry}.json")).await { 161 | Ok(b) => b, 162 | Err(e) => { 163 | log::warn!("Error loading style_vectors_bytes from file {entry}: {e}"); 164 | continue; 165 | } 166 | }; 167 | let vits2_bytes = match fs::read(format!("{models}/model_{entry}.onnx")).await { 168 | Ok(b) => b, 169 | Err(e) => { 170 | log::warn!("Error loading vits2_bytes from file {entry}: {e}"); 171 | continue; 172 | } 173 | }; 174 | if let Err(e) = tts_model.load(&entry, style_vectors_bytes, vits2_bytes) { 175 | log::warn!("Error loading {entry}: {e}"); 176 | }; 177 | log::info!("Loaded: {entry}"); 178 | } 179 | Ok(Self { 180 | tts_model: Arc::new(Mutex::new(tts_model)), 181 | }) 182 | } 183 | } 184 | 185 | #[tokio::main] 186 | async fn main() -> anyhow::Result<()> { 187 | dotenvy::dotenv_override().ok(); 188 | env_logger::init(); 189 | let app = Router::new() 190 | .route("/", get(|| async { "Hello, world!" 
})) 191 | .route("/audio_query", get(create_audio_query)) 192 | .route("/synthesis", post(synthesis)) 193 | .with_state(AppState::new().await?); 194 | let listener = TcpListener::bind("0.0.0.0:8080").await?; 195 | axum::serve(listener, app).await?; 196 | Ok(()) 197 | } 198 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sbv2_wasm" 3 | version.workspace = true 4 | edition.workspace = true 5 | description.workspace = true 6 | readme.workspace = true 7 | repository.workspace = true 8 | documentation.workspace = true 9 | license.workspace = true 10 | 11 | [lib] 12 | crate-type = ["cdylib", "rlib"] 13 | 14 | [dependencies] 15 | wasm-bindgen = "0.2.93" 16 | sbv2_core = { path = "../sbv2_core", default-features = false, features = ["no_std"] } 17 | once_cell.workspace = true 18 | js-sys = "0.3.70" 19 | ndarray.workspace = true 20 | wasm-bindgen-futures = "0.4.43" 21 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/README.md: -------------------------------------------------------------------------------- 1 | # StyleBertVITS2 wasm 2 | refer to https://github.com/neodyland/sbv2-api 3 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.9.2/schema.json", 3 | "vcs": { 4 | "enabled": false, 5 | "clientKind": "git", 6 | "useIgnoreFile": false 7 | }, 8 | "files": { 9 | "ignoreUnknown": false, 10 | "ignore": [] 11 | }, 12 | "formatter": { 13 | "enabled": true, 14 | "indentStyle": "tab", 15 | "ignore": ["dist/", "pkg/"] 16 | }, 17 | "organizeImports": { 18 | "enabled": true 19 | }, 20 | "linter": { 21 | "enabled": true, 22 | "rules": { 23 | "recommended": true 24 | } 25 | 
}, 26 | "javascript": { 27 | "formatter": { 28 | "quoteStyle": "double" 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | wasm-pack build --target web ./crates/sbv2_wasm --release 3 | wasm-opt -O3 -o ./crates/sbv2_wasm/pkg/sbv2_wasm_bg.wasm ./crates/sbv2_wasm/pkg/sbv2_wasm_bg.wasm 4 | wasm-strip ./crates/sbv2_wasm/pkg/sbv2_wasm_bg.wasm 5 | mkdir -p ./crates/sbv2_wasm/dist 6 | cp ./crates/sbv2_wasm/pkg/sbv2_wasm_bg.wasm ./crates/sbv2_wasm/dist/sbv2_wasm_bg.wasm 7 | cd ./crates/sbv2_wasm 8 | pnpm build -------------------------------------------------------------------------------- /crates/sbv2_wasm/example.js: -------------------------------------------------------------------------------- 1 | import { ModelHolder } from "./dist/index.js"; 2 | import fs from "node:fs/promises"; 3 | 4 | ModelHolder.globalInit(await fs.readFile("./dist/sbv2_wasm_bg.wasm")); 5 | const holder = await ModelHolder.create( 6 | (await fs.readFile("../../models/tokenizer.json")).toString("utf-8"), 7 | await fs.readFile("../../models/deberta.onnx"), 8 | ); 9 | await holder.load( 10 | "tsukuyomi", 11 | await fs.readFile("../../models/tsukuyomi.sbv2"), 12 | ); 13 | await fs.writeFile("out.wav", await holder.synthesize("tsukuyomi", "おはよう")); 14 | holder.unload("tsukuyomi"); 15 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sbv2", 3 | "version": "0.2.0-alpha6", 4 | "description": "Style Bert VITS2 wasm", 5 | "main": "dist/index.js", 6 | "types": "dist/index.d.ts", 7 | "type": "module", 8 | "scripts": { 9 | "build": "tsc && esbuild src-js/index.ts --outfile=dist/index.js --minify --format=esm --bundle --external:onnxruntime-web", 10 | "format": "biome 
format --write ." 11 | }, 12 | "keywords": [], 13 | "author": "tuna2134", 14 | "contributes": ["neodyland"], 15 | "license": "MIT", 16 | "devDependencies": { 17 | "@biomejs/biome": "^1.9.4", 18 | "@types/node": "^22.13.5", 19 | "esbuild": "^0.25.0", 20 | "typescript": "^5.7.3" 21 | }, 22 | "dependencies": { 23 | "onnxruntime-web": "^1.20.1" 24 | }, 25 | "files": ["dist/*", "package.json", "README.md", "pkg/*.ts", "pkg/*.js"] 26 | } 27 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/pnpm-lock.yaml: -------------------------------------------------------------------------------- 1 | lockfileVersion: '9.0' 2 | 3 | settings: 4 | autoInstallPeers: true 5 | excludeLinksFromLockfile: false 6 | 7 | importers: 8 | 9 | .: 10 | dependencies: 11 | onnxruntime-web: 12 | specifier: ^1.20.1 13 | version: 1.20.1 14 | devDependencies: 15 | '@biomejs/biome': 16 | specifier: ^1.9.4 17 | version: 1.9.4 18 | '@types/node': 19 | specifier: ^22.13.5 20 | version: 22.13.5 21 | esbuild: 22 | specifier: ^0.25.0 23 | version: 0.25.0 24 | typescript: 25 | specifier: ^5.7.3 26 | version: 5.8.3 27 | 28 | packages: 29 | 30 | '@biomejs/biome@1.9.4': 31 | resolution: {integrity: sha512-1rkd7G70+o9KkTn5KLmDYXihGoTaIGO9PIIN2ZB7UJxFrWw04CZHPYiMRjYsaDvVV7hP1dYNRLxSANLaBFGpog==} 32 | engines: {node: '>=14.21.3'} 33 | hasBin: true 34 | 35 | '@biomejs/cli-darwin-arm64@1.9.4': 36 | resolution: {integrity: sha512-bFBsPWrNvkdKrNCYeAp+xo2HecOGPAy9WyNyB/jKnnedgzl4W4Hb9ZMzYNbf8dMCGmUdSavlYHiR01QaYR58cw==} 37 | engines: {node: '>=14.21.3'} 38 | cpu: [arm64] 39 | os: [darwin] 40 | 41 | '@biomejs/cli-darwin-x64@1.9.4': 42 | resolution: {integrity: sha512-ngYBh/+bEedqkSevPVhLP4QfVPCpb+4BBe2p7Xs32dBgs7rh9nY2AIYUL6BgLw1JVXV8GlpKmb/hNiuIxfPfZg==} 43 | engines: {node: '>=14.21.3'} 44 | cpu: [x64] 45 | os: [darwin] 46 | 47 | '@biomejs/cli-linux-arm64-musl@1.9.4': 48 | resolution: {integrity: 
sha512-v665Ct9WCRjGa8+kTr0CzApU0+XXtRgwmzIf1SeKSGAv+2scAlW6JR5PMFo6FzqqZ64Po79cKODKf3/AAmECqA==} 49 | engines: {node: '>=14.21.3'} 50 | cpu: [arm64] 51 | os: [linux] 52 | 53 | '@biomejs/cli-linux-arm64@1.9.4': 54 | resolution: {integrity: sha512-fJIW0+LYujdjUgJJuwesP4EjIBl/N/TcOX3IvIHJQNsAqvV2CHIogsmA94BPG6jZATS4Hi+xv4SkBBQSt1N4/g==} 55 | engines: {node: '>=14.21.3'} 56 | cpu: [arm64] 57 | os: [linux] 58 | 59 | '@biomejs/cli-linux-x64-musl@1.9.4': 60 | resolution: {integrity: sha512-gEhi/jSBhZ2m6wjV530Yy8+fNqG8PAinM3oV7CyO+6c3CEh16Eizm21uHVsyVBEB6RIM8JHIl6AGYCv6Q6Q9Tg==} 61 | engines: {node: '>=14.21.3'} 62 | cpu: [x64] 63 | os: [linux] 64 | 65 | '@biomejs/cli-linux-x64@1.9.4': 66 | resolution: {integrity: sha512-lRCJv/Vi3Vlwmbd6K+oQ0KhLHMAysN8lXoCI7XeHlxaajk06u7G+UsFSO01NAs5iYuWKmVZjmiOzJ0OJmGsMwg==} 67 | engines: {node: '>=14.21.3'} 68 | cpu: [x64] 69 | os: [linux] 70 | 71 | '@biomejs/cli-win32-arm64@1.9.4': 72 | resolution: {integrity: sha512-tlbhLk+WXZmgwoIKwHIHEBZUwxml7bRJgk0X2sPyNR3S93cdRq6XulAZRQJ17FYGGzWne0fgrXBKpl7l4M87Hg==} 73 | engines: {node: '>=14.21.3'} 74 | cpu: [arm64] 75 | os: [win32] 76 | 77 | '@biomejs/cli-win32-x64@1.9.4': 78 | resolution: {integrity: sha512-8Y5wMhVIPaWe6jw2H+KlEm4wP/f7EW3810ZLmDlrEEy5KvBsb9ECEfu/kMWD484ijfQ8+nIi0giMgu9g1UAuuA==} 79 | engines: {node: '>=14.21.3'} 80 | cpu: [x64] 81 | os: [win32] 82 | 83 | '@esbuild/aix-ppc64@0.25.0': 84 | resolution: {integrity: sha512-O7vun9Sf8DFjH2UtqK8Ku3LkquL9SZL8OLY1T5NZkA34+wG3OQF7cl4Ql8vdNzM6fzBbYfLaiRLIOZ+2FOCgBQ==} 85 | engines: {node: '>=18'} 86 | cpu: [ppc64] 87 | os: [aix] 88 | 89 | '@esbuild/android-arm64@0.25.0': 90 | resolution: {integrity: sha512-grvv8WncGjDSyUBjN9yHXNt+cq0snxXbDxy5pJtzMKGmmpPxeAmAhWxXI+01lU5rwZomDgD3kJwulEnhTRUd6g==} 91 | engines: {node: '>=18'} 92 | cpu: [arm64] 93 | os: [android] 94 | 95 | '@esbuild/android-arm@0.25.0': 96 | resolution: {integrity: sha512-PTyWCYYiU0+1eJKmw21lWtC+d08JDZPQ5g+kFyxP0V+es6VPPSUhM6zk8iImp2jbV6GwjX4pap0JFbUQN65X1g==} 97 | engines: 
{node: '>=18'} 98 | cpu: [arm] 99 | os: [android] 100 | 101 | '@esbuild/android-x64@0.25.0': 102 | resolution: {integrity: sha512-m/ix7SfKG5buCnxasr52+LI78SQ+wgdENi9CqyCXwjVR2X4Jkz+BpC3le3AoBPYTC9NHklwngVXvbJ9/Akhrfg==} 103 | engines: {node: '>=18'} 104 | cpu: [x64] 105 | os: [android] 106 | 107 | '@esbuild/darwin-arm64@0.25.0': 108 | resolution: {integrity: sha512-mVwdUb5SRkPayVadIOI78K7aAnPamoeFR2bT5nszFUZ9P8UpK4ratOdYbZZXYSqPKMHfS1wdHCJk1P1EZpRdvw==} 109 | engines: {node: '>=18'} 110 | cpu: [arm64] 111 | os: [darwin] 112 | 113 | '@esbuild/darwin-x64@0.25.0': 114 | resolution: {integrity: sha512-DgDaYsPWFTS4S3nWpFcMn/33ZZwAAeAFKNHNa1QN0rI4pUjgqf0f7ONmXf6d22tqTY+H9FNdgeaAa+YIFUn2Rg==} 115 | engines: {node: '>=18'} 116 | cpu: [x64] 117 | os: [darwin] 118 | 119 | '@esbuild/freebsd-arm64@0.25.0': 120 | resolution: {integrity: sha512-VN4ocxy6dxefN1MepBx/iD1dH5K8qNtNe227I0mnTRjry8tj5MRk4zprLEdG8WPyAPb93/e4pSgi1SoHdgOa4w==} 121 | engines: {node: '>=18'} 122 | cpu: [arm64] 123 | os: [freebsd] 124 | 125 | '@esbuild/freebsd-x64@0.25.0': 126 | resolution: {integrity: sha512-mrSgt7lCh07FY+hDD1TxiTyIHyttn6vnjesnPoVDNmDfOmggTLXRv8Id5fNZey1gl/V2dyVK1VXXqVsQIiAk+A==} 127 | engines: {node: '>=18'} 128 | cpu: [x64] 129 | os: [freebsd] 130 | 131 | '@esbuild/linux-arm64@0.25.0': 132 | resolution: {integrity: sha512-9QAQjTWNDM/Vk2bgBl17yWuZxZNQIF0OUUuPZRKoDtqF2k4EtYbpyiG5/Dk7nqeK6kIJWPYldkOcBqjXjrUlmg==} 133 | engines: {node: '>=18'} 134 | cpu: [arm64] 135 | os: [linux] 136 | 137 | '@esbuild/linux-arm@0.25.0': 138 | resolution: {integrity: sha512-vkB3IYj2IDo3g9xX7HqhPYxVkNQe8qTK55fraQyTzTX/fxaDtXiEnavv9geOsonh2Fd2RMB+i5cbhu2zMNWJwg==} 139 | engines: {node: '>=18'} 140 | cpu: [arm] 141 | os: [linux] 142 | 143 | '@esbuild/linux-ia32@0.25.0': 144 | resolution: {integrity: sha512-43ET5bHbphBegyeqLb7I1eYn2P/JYGNmzzdidq/w0T8E2SsYL1U6un2NFROFRg1JZLTzdCoRomg8Rvf9M6W6Gg==} 145 | engines: {node: '>=18'} 146 | cpu: [ia32] 147 | os: [linux] 148 | 149 | '@esbuild/linux-loong64@0.25.0': 150 | 
resolution: {integrity: sha512-fC95c/xyNFueMhClxJmeRIj2yrSMdDfmqJnyOY4ZqsALkDrrKJfIg5NTMSzVBr5YW1jf+l7/cndBfP3MSDpoHw==} 151 | engines: {node: '>=18'} 152 | cpu: [loong64] 153 | os: [linux] 154 | 155 | '@esbuild/linux-mips64el@0.25.0': 156 | resolution: {integrity: sha512-nkAMFju7KDW73T1DdH7glcyIptm95a7Le8irTQNO/qtkoyypZAnjchQgooFUDQhNAy4iu08N79W4T4pMBwhPwQ==} 157 | engines: {node: '>=18'} 158 | cpu: [mips64el] 159 | os: [linux] 160 | 161 | '@esbuild/linux-ppc64@0.25.0': 162 | resolution: {integrity: sha512-NhyOejdhRGS8Iwv+KKR2zTq2PpysF9XqY+Zk77vQHqNbo/PwZCzB5/h7VGuREZm1fixhs4Q/qWRSi5zmAiO4Fw==} 163 | engines: {node: '>=18'} 164 | cpu: [ppc64] 165 | os: [linux] 166 | 167 | '@esbuild/linux-riscv64@0.25.0': 168 | resolution: {integrity: sha512-5S/rbP5OY+GHLC5qXp1y/Mx//e92L1YDqkiBbO9TQOvuFXM+iDqUNG5XopAnXoRH3FjIUDkeGcY1cgNvnXp/kA==} 169 | engines: {node: '>=18'} 170 | cpu: [riscv64] 171 | os: [linux] 172 | 173 | '@esbuild/linux-s390x@0.25.0': 174 | resolution: {integrity: sha512-XM2BFsEBz0Fw37V0zU4CXfcfuACMrppsMFKdYY2WuTS3yi8O1nFOhil/xhKTmE1nPmVyvQJjJivgDT+xh8pXJA==} 175 | engines: {node: '>=18'} 176 | cpu: [s390x] 177 | os: [linux] 178 | 179 | '@esbuild/linux-x64@0.25.0': 180 | resolution: {integrity: sha512-9yl91rHw/cpwMCNytUDxwj2XjFpxML0y9HAOH9pNVQDpQrBxHy01Dx+vaMu0N1CKa/RzBD2hB4u//nfc+Sd3Cw==} 181 | engines: {node: '>=18'} 182 | cpu: [x64] 183 | os: [linux] 184 | 185 | '@esbuild/netbsd-arm64@0.25.0': 186 | resolution: {integrity: sha512-RuG4PSMPFfrkH6UwCAqBzauBWTygTvb1nxWasEJooGSJ/NwRw7b2HOwyRTQIU97Hq37l3npXoZGYMy3b3xYvPw==} 187 | engines: {node: '>=18'} 188 | cpu: [arm64] 189 | os: [netbsd] 190 | 191 | '@esbuild/netbsd-x64@0.25.0': 192 | resolution: {integrity: sha512-jl+qisSB5jk01N5f7sPCsBENCOlPiS/xptD5yxOx2oqQfyourJwIKLRA2yqWdifj3owQZCL2sn6o08dBzZGQzA==} 193 | engines: {node: '>=18'} 194 | cpu: [x64] 195 | os: [netbsd] 196 | 197 | '@esbuild/openbsd-arm64@0.25.0': 198 | resolution: {integrity: 
sha512-21sUNbq2r84YE+SJDfaQRvdgznTD8Xc0oc3p3iW/a1EVWeNj/SdUCbm5U0itZPQYRuRTW20fPMWMpcrciH2EJw==} 199 | engines: {node: '>=18'} 200 | cpu: [arm64] 201 | os: [openbsd] 202 | 203 | '@esbuild/openbsd-x64@0.25.0': 204 | resolution: {integrity: sha512-2gwwriSMPcCFRlPlKx3zLQhfN/2WjJ2NSlg5TKLQOJdV0mSxIcYNTMhk3H3ulL/cak+Xj0lY1Ym9ysDV1igceg==} 205 | engines: {node: '>=18'} 206 | cpu: [x64] 207 | os: [openbsd] 208 | 209 | '@esbuild/sunos-x64@0.25.0': 210 | resolution: {integrity: sha512-bxI7ThgLzPrPz484/S9jLlvUAHYMzy6I0XiU1ZMeAEOBcS0VePBFxh1JjTQt3Xiat5b6Oh4x7UC7IwKQKIJRIg==} 211 | engines: {node: '>=18'} 212 | cpu: [x64] 213 | os: [sunos] 214 | 215 | '@esbuild/win32-arm64@0.25.0': 216 | resolution: {integrity: sha512-ZUAc2YK6JW89xTbXvftxdnYy3m4iHIkDtK3CLce8wg8M2L+YZhIvO1DKpxrd0Yr59AeNNkTiic9YLf6FTtXWMw==} 217 | engines: {node: '>=18'} 218 | cpu: [arm64] 219 | os: [win32] 220 | 221 | '@esbuild/win32-ia32@0.25.0': 222 | resolution: {integrity: sha512-eSNxISBu8XweVEWG31/JzjkIGbGIJN/TrRoiSVZwZ6pkC6VX4Im/WV2cz559/TXLcYbcrDN8JtKgd9DJVIo8GA==} 223 | engines: {node: '>=18'} 224 | cpu: [ia32] 225 | os: [win32] 226 | 227 | '@esbuild/win32-x64@0.25.0': 228 | resolution: {integrity: sha512-ZENoHJBxA20C2zFzh6AI4fT6RraMzjYw4xKWemRTRmRVtN9c5DcH9r/f2ihEkMjOW5eGgrwCslG/+Y/3bL+DHQ==} 229 | engines: {node: '>=18'} 230 | cpu: [x64] 231 | os: [win32] 232 | 233 | '@protobufjs/aspromise@1.1.2': 234 | resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} 235 | 236 | '@protobufjs/base64@1.1.2': 237 | resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} 238 | 239 | '@protobufjs/codegen@2.0.4': 240 | resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==} 241 | 242 | '@protobufjs/eventemitter@1.1.0': 243 | resolution: {integrity: 
sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==} 244 | 245 | '@protobufjs/fetch@1.1.0': 246 | resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==} 247 | 248 | '@protobufjs/float@1.0.2': 249 | resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} 250 | 251 | '@protobufjs/inquire@1.1.0': 252 | resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==} 253 | 254 | '@protobufjs/path@1.1.2': 255 | resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} 256 | 257 | '@protobufjs/pool@1.1.0': 258 | resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} 259 | 260 | '@protobufjs/utf8@1.1.0': 261 | resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==} 262 | 263 | '@types/node@22.13.5': 264 | resolution: {integrity: sha512-+lTU0PxZXn0Dr1NBtC7Y8cR21AJr87dLLU953CWA6pMxxv/UDc7jYAY90upcrie1nRcD6XNG5HOYEDtgW5TxAg==} 265 | 266 | esbuild@0.25.0: 267 | resolution: {integrity: sha512-BXq5mqc8ltbaN34cDqWuYKyNhX8D/Z0J1xdtdQ8UcIIIyJyz+ZMKUt58tF3SrZ85jcfN/PZYhjR5uDQAYNVbuw==} 268 | engines: {node: '>=18'} 269 | hasBin: true 270 | 271 | flatbuffers@1.12.0: 272 | resolution: {integrity: sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==} 273 | 274 | guid-typescript@1.0.9: 275 | resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} 276 | 277 | long@5.3.1: 278 | resolution: {integrity: sha512-ka87Jz3gcx/I7Hal94xaN2tZEOPoUOEVftkQqZx2EeQRN7LGdfLlI3FvZ+7WDplm+vK2Urx9ULrvSowtdCieng==} 279 | 280 | onnxruntime-common@1.20.1: 281 | resolution: {integrity: 
sha512-YiU0s0IzYYC+gWvqD1HzLc46Du1sXpSiwzKb63PACIJr6LfL27VsXSXQvt68EzD3V0D5Bc0vyJTjmMxp0ylQiw==} 282 | 283 | onnxruntime-web@1.20.1: 284 | resolution: {integrity: sha512-TePF6XVpLL1rWVMIl5Y9ACBQcyCNFThZON/jgElNd9Txb73CIEGlklhYR3UEr1cp5r0rbGI6nDwwrs79g7WjoA==} 285 | 286 | platform@1.3.6: 287 | resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} 288 | 289 | protobufjs@7.4.0: 290 | resolution: {integrity: sha512-mRUWCc3KUU4w1jU8sGxICXH/gNS94DvI1gxqDvBzhj1JpcsimQkYiOJfwsPUykUI5ZaspFbSgmBLER8IrQ3tqw==} 291 | engines: {node: '>=12.0.0'} 292 | 293 | typescript@5.8.3: 294 | resolution: {integrity: sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==} 295 | engines: {node: '>=14.17'} 296 | hasBin: true 297 | 298 | undici-types@6.20.0: 299 | resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==} 300 | 301 | snapshots: 302 | 303 | '@biomejs/biome@1.9.4': 304 | optionalDependencies: 305 | '@biomejs/cli-darwin-arm64': 1.9.4 306 | '@biomejs/cli-darwin-x64': 1.9.4 307 | '@biomejs/cli-linux-arm64': 1.9.4 308 | '@biomejs/cli-linux-arm64-musl': 1.9.4 309 | '@biomejs/cli-linux-x64': 1.9.4 310 | '@biomejs/cli-linux-x64-musl': 1.9.4 311 | '@biomejs/cli-win32-arm64': 1.9.4 312 | '@biomejs/cli-win32-x64': 1.9.4 313 | 314 | '@biomejs/cli-darwin-arm64@1.9.4': 315 | optional: true 316 | 317 | '@biomejs/cli-darwin-x64@1.9.4': 318 | optional: true 319 | 320 | '@biomejs/cli-linux-arm64-musl@1.9.4': 321 | optional: true 322 | 323 | '@biomejs/cli-linux-arm64@1.9.4': 324 | optional: true 325 | 326 | '@biomejs/cli-linux-x64-musl@1.9.4': 327 | optional: true 328 | 329 | '@biomejs/cli-linux-x64@1.9.4': 330 | optional: true 331 | 332 | '@biomejs/cli-win32-arm64@1.9.4': 333 | optional: true 334 | 335 | '@biomejs/cli-win32-x64@1.9.4': 336 | optional: true 337 | 338 | '@esbuild/aix-ppc64@0.25.0': 339 | optional: true 340 | 341 | 
'@esbuild/android-arm64@0.25.0': 342 | optional: true 343 | 344 | '@esbuild/android-arm@0.25.0': 345 | optional: true 346 | 347 | '@esbuild/android-x64@0.25.0': 348 | optional: true 349 | 350 | '@esbuild/darwin-arm64@0.25.0': 351 | optional: true 352 | 353 | '@esbuild/darwin-x64@0.25.0': 354 | optional: true 355 | 356 | '@esbuild/freebsd-arm64@0.25.0': 357 | optional: true 358 | 359 | '@esbuild/freebsd-x64@0.25.0': 360 | optional: true 361 | 362 | '@esbuild/linux-arm64@0.25.0': 363 | optional: true 364 | 365 | '@esbuild/linux-arm@0.25.0': 366 | optional: true 367 | 368 | '@esbuild/linux-ia32@0.25.0': 369 | optional: true 370 | 371 | '@esbuild/linux-loong64@0.25.0': 372 | optional: true 373 | 374 | '@esbuild/linux-mips64el@0.25.0': 375 | optional: true 376 | 377 | '@esbuild/linux-ppc64@0.25.0': 378 | optional: true 379 | 380 | '@esbuild/linux-riscv64@0.25.0': 381 | optional: true 382 | 383 | '@esbuild/linux-s390x@0.25.0': 384 | optional: true 385 | 386 | '@esbuild/linux-x64@0.25.0': 387 | optional: true 388 | 389 | '@esbuild/netbsd-arm64@0.25.0': 390 | optional: true 391 | 392 | '@esbuild/netbsd-x64@0.25.0': 393 | optional: true 394 | 395 | '@esbuild/openbsd-arm64@0.25.0': 396 | optional: true 397 | 398 | '@esbuild/openbsd-x64@0.25.0': 399 | optional: true 400 | 401 | '@esbuild/sunos-x64@0.25.0': 402 | optional: true 403 | 404 | '@esbuild/win32-arm64@0.25.0': 405 | optional: true 406 | 407 | '@esbuild/win32-ia32@0.25.0': 408 | optional: true 409 | 410 | '@esbuild/win32-x64@0.25.0': 411 | optional: true 412 | 413 | '@protobufjs/aspromise@1.1.2': {} 414 | 415 | '@protobufjs/base64@1.1.2': {} 416 | 417 | '@protobufjs/codegen@2.0.4': {} 418 | 419 | '@protobufjs/eventemitter@1.1.0': {} 420 | 421 | '@protobufjs/fetch@1.1.0': 422 | dependencies: 423 | '@protobufjs/aspromise': 1.1.2 424 | '@protobufjs/inquire': 1.1.0 425 | 426 | '@protobufjs/float@1.0.2': {} 427 | 428 | '@protobufjs/inquire@1.1.0': {} 429 | 430 | '@protobufjs/path@1.1.2': {} 431 | 432 | 
'@protobufjs/pool@1.1.0': {} 433 | 434 | '@protobufjs/utf8@1.1.0': {} 435 | 436 | '@types/node@22.13.5': 437 | dependencies: 438 | undici-types: 6.20.0 439 | 440 | esbuild@0.25.0: 441 | optionalDependencies: 442 | '@esbuild/aix-ppc64': 0.25.0 443 | '@esbuild/android-arm': 0.25.0 444 | '@esbuild/android-arm64': 0.25.0 445 | '@esbuild/android-x64': 0.25.0 446 | '@esbuild/darwin-arm64': 0.25.0 447 | '@esbuild/darwin-x64': 0.25.0 448 | '@esbuild/freebsd-arm64': 0.25.0 449 | '@esbuild/freebsd-x64': 0.25.0 450 | '@esbuild/linux-arm': 0.25.0 451 | '@esbuild/linux-arm64': 0.25.0 452 | '@esbuild/linux-ia32': 0.25.0 453 | '@esbuild/linux-loong64': 0.25.0 454 | '@esbuild/linux-mips64el': 0.25.0 455 | '@esbuild/linux-ppc64': 0.25.0 456 | '@esbuild/linux-riscv64': 0.25.0 457 | '@esbuild/linux-s390x': 0.25.0 458 | '@esbuild/linux-x64': 0.25.0 459 | '@esbuild/netbsd-arm64': 0.25.0 460 | '@esbuild/netbsd-x64': 0.25.0 461 | '@esbuild/openbsd-arm64': 0.25.0 462 | '@esbuild/openbsd-x64': 0.25.0 463 | '@esbuild/sunos-x64': 0.25.0 464 | '@esbuild/win32-arm64': 0.25.0 465 | '@esbuild/win32-ia32': 0.25.0 466 | '@esbuild/win32-x64': 0.25.0 467 | 468 | flatbuffers@1.12.0: {} 469 | 470 | guid-typescript@1.0.9: {} 471 | 472 | long@5.3.1: {} 473 | 474 | onnxruntime-common@1.20.1: {} 475 | 476 | onnxruntime-web@1.20.1: 477 | dependencies: 478 | flatbuffers: 1.12.0 479 | guid-typescript: 1.0.9 480 | long: 5.3.1 481 | onnxruntime-common: 1.20.1 482 | platform: 1.3.6 483 | protobufjs: 7.4.0 484 | 485 | platform@1.3.6: {} 486 | 487 | protobufjs@7.4.0: 488 | dependencies: 489 | '@protobufjs/aspromise': 1.1.2 490 | '@protobufjs/base64': 1.1.2 491 | '@protobufjs/codegen': 2.0.4 492 | '@protobufjs/eventemitter': 1.1.0 493 | '@protobufjs/fetch': 1.1.0 494 | '@protobufjs/float': 1.0.2 495 | '@protobufjs/inquire': 1.1.0 496 | '@protobufjs/path': 1.1.2 497 | '@protobufjs/pool': 1.1.0 498 | '@protobufjs/utf8': 1.1.0 499 | '@types/node': 22.13.5 500 | long: 5.3.1 501 | 502 | typescript@5.8.3: {} 503 | 504 | 
undici-types@6.20.0: {} 505 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/src-js/index.ts: -------------------------------------------------------------------------------- 1 | import * as wasm from "../pkg/sbv2_wasm.js"; 2 | import { InferenceSession, Tensor } from "onnxruntime-web"; 3 | 4 | export class ModelHolder { 5 | private models: Map = 6 | new Map(); 7 | constructor( 8 | private tok: wasm.TokenizerWrap, 9 | private deberta: InferenceSession, 10 | ) {} 11 | public static async globalInit(buf: ArrayBufferLike) { 12 | await wasm.default(buf); 13 | } 14 | public static async create(tok: string, deberta: ArrayBufferLike) { 15 | return new ModelHolder( 16 | wasm.load_tokenizer(tok), 17 | await InferenceSession.create(deberta, { 18 | executionProviders: ["webnn", "webgpu", "wasm", "cpu"], 19 | graphOptimizationLevel: "all", 20 | }), 21 | ); 22 | } 23 | public async synthesize( 24 | name: string, 25 | text: string, 26 | style_id: number = 0, 27 | style_weight: number = 1.0, 28 | sdp_ratio: number = 0.4, 29 | speed: number = 1.0, 30 | ) { 31 | const mod = this.models.get(name); 32 | if (!mod) throw new Error(`No model named ${name}`); 33 | const [vits2, style] = mod; 34 | return wasm.synthesize( 35 | text, 36 | this.tok, 37 | async (a: BigInt64Array, b: BigInt64Array) => { 38 | try { 39 | const res = ( 40 | await this.deberta.run({ 41 | input_ids: new Tensor("int64", a, [1, a.length]), 42 | attention_mask: new Tensor("int64", b, [1, b.length]), 43 | }) 44 | )["output"]; 45 | return [new Uint32Array(res.dims), await res.getData(true)]; 46 | } catch (e) { 47 | console.warn(e); 48 | throw e; 49 | } 50 | }, 51 | async ( 52 | [a_shape, a_array]: any, 53 | b_d: any, 54 | c_d: any, 55 | d_d: any, 56 | e_d: any, 57 | f: number, 58 | g: number, 59 | ) => { 60 | try { 61 | const a = new Tensor("float32", a_array, [1, ...a_shape]); 62 | const b = new Tensor("int64", b_d, [1, b_d.length]); 63 | const c = new Tensor("int64", 
c_d, [1, c_d.length]); 64 | const d = new Tensor("int64", d_d, [1, d_d.length]); 65 | const e = new Tensor("float32", e_d, [1, e_d.length]); 66 | const res = ( 67 | await vits2.run({ 68 | x_tst: b, 69 | x_tst_lengths: new Tensor("int64", [b_d.length]), 70 | sid: new Tensor("int64", [0]), 71 | tones: c, 72 | language: d, 73 | bert: a, 74 | style_vec: e, 75 | sdp_ratio: new Tensor("float32", [f]), 76 | length_scale: new Tensor("float32", [g]), 77 | noise_scale: new Tensor("float32", [0.677]), 78 | noise_scale_w: new Tensor("float32", [0.8]), 79 | }) 80 | ).output; 81 | return [new Uint32Array(res.dims), await res.getData(true)]; 82 | } catch (e) { 83 | console.warn(e); 84 | throw e; 85 | } 86 | }, 87 | sdp_ratio, 88 | 1.0 / speed, 89 | style_id, 90 | style_weight, 91 | style, 92 | ); 93 | } 94 | public async load(name: string, b: Uint8Array) { 95 | const [style, vits2_b] = wasm.load_sbv2file(b); 96 | const vits2 = await InferenceSession.create(vits2_b as Uint8Array, { 97 | executionProviders: ["webnn", "webgpu", "wasm", "cpu"], 98 | graphOptimizationLevel: "all", 99 | }); 100 | this.models.set(name, [vits2, style]); 101 | } 102 | public async unload(name: string) { 103 | return this.models.delete(name); 104 | } 105 | public modelList() { 106 | return this.models.keys(); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/src/array_helper.rs: -------------------------------------------------------------------------------- 1 | pub fn vec8_to_array8(v: Vec) -> js_sys::Uint8Array { 2 | let arr = js_sys::Uint8Array::new_with_length(v.len() as u32); 3 | arr.copy_from(&v); 4 | arr 5 | } 6 | 7 | pub fn vec_f32_to_array_f32(v: Vec) -> js_sys::Float32Array { 8 | let arr = js_sys::Float32Array::new_with_length(v.len() as u32); 9 | arr.copy_from(&v); 10 | arr 11 | } 12 | 13 | pub fn array8_to_vec8(buf: js_sys::Uint8Array) -> Vec { 14 | let mut body = vec![0; buf.length() as usize]; 15 | buf.copy_to(&mut 
body[..]); 16 | body 17 | } 18 | 19 | pub fn vec64_to_array64(v: Vec) -> js_sys::BigInt64Array { 20 | let arr = js_sys::BigInt64Array::new_with_length(v.len() as u32); 21 | arr.copy_from(&v); 22 | arr 23 | } 24 | 25 | pub fn vec_to_array(v: Vec) -> js_sys::Array { 26 | let arr = js_sys::Array::new_with_length(v.len() as u32); 27 | for (i, v) in v.into_iter().enumerate() { 28 | arr.set(i as u32, v); 29 | } 30 | arr 31 | } 32 | 33 | struct A { 34 | shape: Vec, 35 | data: Vec, 36 | } 37 | 38 | impl TryFrom for A { 39 | type Error = sbv2_core::error::Error; 40 | 41 | fn try_from(value: wasm_bindgen::JsValue) -> Result { 42 | let value: js_sys::Array = value.into(); 43 | let mut shape = vec![]; 44 | let mut data = vec![]; 45 | for (i, v) in value.iter().enumerate() { 46 | match i { 47 | 0 => { 48 | let v: js_sys::Uint32Array = v.into(); 49 | shape = vec![0; v.length() as usize]; 50 | v.copy_to(&mut shape); 51 | } 52 | 1 => { 53 | let v: js_sys::Float32Array = v.into(); 54 | data = vec![0.0; v.length() as usize]; 55 | v.copy_to(&mut data); 56 | } 57 | _ => {} 58 | }; 59 | } 60 | Ok(A { shape, data }) 61 | } 62 | } 63 | 64 | pub fn array_to_array2_f32( 65 | a: wasm_bindgen::JsValue, 66 | ) -> sbv2_core::error::Result> { 67 | let a = A::try_from(a)?; 68 | if a.shape.len() != 2 { 69 | return Err(sbv2_core::error::Error::OtherError( 70 | "Length mismatch".to_string(), 71 | )); 72 | } 73 | let shape = [a.shape[0] as usize, a.shape[1] as usize]; 74 | let arr = ndarray::Array2::from_shape_vec(shape, a.data.to_vec()) 75 | .map_err(|e| sbv2_core::error::Error::OtherError(e.to_string()))?; 76 | Ok(arr) 77 | } 78 | pub fn array_to_array3_f32( 79 | a: wasm_bindgen::JsValue, 80 | ) -> sbv2_core::error::Result> { 81 | let a = A::try_from(a)?; 82 | if a.shape.len() != 3 { 83 | return Err(sbv2_core::error::Error::OtherError( 84 | "Length mismatch".to_string(), 85 | )); 86 | } 87 | let shape = [ 88 | a.shape[0] as usize, 89 | a.shape[1] as usize, 90 | a.shape[2] as usize, 91 | ]; 92 | 
let arr = ndarray::Array3::from_shape_vec(shape, a.data.to_vec()) 93 | .map_err(|e| sbv2_core::error::Error::OtherError(e.to_string()))?; 94 | Ok(arr) 95 | } 96 | 97 | pub fn array2_f32_to_array(a: ndarray::Array2) -> js_sys::Array { 98 | let shape: Vec = a.shape().iter().map(|f| (*f as u32).into()).collect(); 99 | let typed_array = js_sys::Float32Array::new_with_length(a.len() as u32); 100 | typed_array.copy_from(&a.into_flat().to_vec()); 101 | vec_to_array(vec![vec_to_array(shape).into(), typed_array.into()]) 102 | } 103 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/src/lib.rs: -------------------------------------------------------------------------------- 1 | use once_cell::sync::Lazy; 2 | use sbv2_core::*; 3 | use wasm_bindgen::prelude::*; 4 | use wasm_bindgen_futures::JsFuture; 5 | mod array_helper; 6 | 7 | static JTALK: Lazy = Lazy::new(|| jtalk::JTalk::new().unwrap()); 8 | 9 | #[wasm_bindgen] 10 | pub struct TokenizerWrap { 11 | tokenizer: tokenizer::Tokenizer, 12 | } 13 | 14 | #[wasm_bindgen] 15 | pub fn load_tokenizer(s: js_sys::JsString) -> Result { 16 | if let Some(s) = s.as_string() { 17 | Ok(TokenizerWrap { 18 | tokenizer: tokenizer::Tokenizer::from_bytes(s.as_bytes()) 19 | .map_err(|e| JsError::new(&e.to_string()))?, 20 | }) 21 | } else { 22 | Err(JsError::new("invalid utf8")) 23 | } 24 | } 25 | 26 | #[wasm_bindgen] 27 | pub struct StyleVectorWrap { 28 | style_vector: ndarray::Array2, 29 | } 30 | 31 | #[wasm_bindgen] 32 | pub fn load_sbv2file(buf: js_sys::Uint8Array) -> Result { 33 | let (style_vectors, vits2) = sbv2file::parse_sbv2file(array_helper::array8_to_vec8(buf))?; 34 | let buf = array_helper::vec8_to_array8(vits2); 35 | Ok(array_helper::vec_to_array(vec![ 36 | StyleVectorWrap { 37 | style_vector: style::load_style(style_vectors)?, 38 | } 39 | .into(), 40 | buf.into(), 41 | ])) 42 | } 43 | 44 | #[allow(clippy::too_many_arguments)] 45 | #[wasm_bindgen] 46 | pub async fn synthesize( 47 | 
text: &str, 48 | tokenizer: &TokenizerWrap, 49 | bert_predict_fn: js_sys::Function, 50 | synthesize_fn: js_sys::Function, 51 | sdp_ratio: f32, 52 | length_scale: f32, 53 | style_id: i32, 54 | style_weight: f32, 55 | style_vectors: &StyleVectorWrap, 56 | ) -> Result { 57 | let synthesize_wrap = |bert_ori: ndarray::Array2, 58 | x_tst: ndarray::Array1, 59 | tones: ndarray::Array1, 60 | lang_ids: ndarray::Array1, 61 | style_vector: ndarray::Array1, 62 | sdp_ratio: f32, 63 | length_scale: f32| async move { 64 | let arr = array_helper::vec_to_array(vec![ 65 | array_helper::array2_f32_to_array(bert_ori).into(), 66 | array_helper::vec64_to_array64(x_tst.to_vec()).into(), 67 | array_helper::vec64_to_array64(tones.to_vec()).into(), 68 | array_helper::vec64_to_array64(lang_ids.to_vec()).into(), 69 | array_helper::vec_f32_to_array_f32(style_vector.to_vec()).into(), 70 | sdp_ratio.into(), 71 | length_scale.into(), 72 | ]); 73 | let res = synthesize_fn 74 | .apply(&js_sys::Object::new().into(), &arr) 75 | .map_err(|e| { 76 | error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string())) 77 | })?; 78 | let res = JsFuture::from(Into::::into(res)) 79 | .await 80 | .map_err(|e| { 81 | sbv2_core::error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string())) 82 | })?; 83 | array_helper::array_to_array3_f32(res) 84 | }; 85 | let (bert_ori, phones, tones, lang_ids) = tts_util::parse_text( 86 | text, 87 | &JTALK, 88 | &tokenizer.tokenizer, 89 | |token_ids: Vec, attention_masks: Vec| { 90 | Box::pin(async move { 91 | let arr = array_helper::vec_to_array(vec![ 92 | array_helper::vec64_to_array64(token_ids).into(), 93 | array_helper::vec64_to_array64(attention_masks).into(), 94 | ]); 95 | let res = bert_predict_fn 96 | .apply(&js_sys::Object::new().into(), &arr) 97 | .map_err(|e| { 98 | error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string())) 99 | })?; 100 | let res = JsFuture::from(Into::::into(res)) 101 | .await 102 | .map_err(|e| { 103 | 
sbv2_core::error::Error::OtherError( 104 | e.as_string().unwrap_or("unknown".to_string()), 105 | ) 106 | })?; 107 | array_helper::array_to_array2_f32(res) 108 | }) 109 | }, 110 | ) 111 | .await?; 112 | let audio = synthesize_wrap( 113 | bert_ori.to_owned(), 114 | phones, 115 | tones, 116 | lang_ids, 117 | style::get_style_vector(&style_vectors.style_vector, style_id, style_weight)?, 118 | sdp_ratio, 119 | length_scale, 120 | ) 121 | .await?; 122 | Ok(array_helper::vec8_to_array8(tts_util::array_to_vec(audio)?)) 123 | } 124 | -------------------------------------------------------------------------------- /crates/sbv2_wasm/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ESNext", 4 | "module": "ESNext", 5 | "rootDir": "./src-js", 6 | "outDir": "./dist", 7 | "moduleResolution": "node", 8 | "esModuleInterop": true, 9 | "forceConsistentCasingInFileNames": true, 10 | "strict": true, 11 | "skipLibCheck": true, 12 | "declaration": true, 13 | "emitDeclarationOnly": true 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neodyland/sbv2-api/96b53d42cd855b733c5bdb6477fcd7948d7b044a/models/.gitkeep -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /scripts/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | venv/ 3 | tmp/ 4 | *.safetensors 5 | *.npy 
-------------------------------------------------------------------------------- /scripts/convert/.python-version: -------------------------------------------------------------------------------- 1 | 3.11 -------------------------------------------------------------------------------- /scripts/convert/README.md: -------------------------------------------------------------------------------- 1 | # 変換方法 2 | 3 | ## 初心者向け準備 4 | 5 | わかる人は飛ばしてください。 6 | 7 | 1. pythonを入れます。3.11.8で動作確認をしていますが、最近のバージョンなら大体動くはずです。 8 | 9 | 2. `cd convert` 10 | 11 | 3. `python -m venv venv` 12 | 13 | 4. `source venv/bin/activate` 14 | 15 | 5. `pip install -r requirements.txt` 16 | 17 | ## モデル変換 18 | 19 | 1. 変換したいモデルの`.safetensors`で終わるファイルの位置を特定してください。 20 | 21 | 2. 同様に`config.json`、`style_vectors.npy`というファイルを探してください。 22 | 23 | 3. 以下のコマンドを実行します。 24 | ```sh 25 | python convert_model.py --style_file "ここにstyle_vectors.npyの場所" --config_file "同様にconfig.jsonの場所" --model_file "同様に.safetensorsで終わるファイルの場所" 26 | ``` 27 | 28 | 4. 
`models/名前.sbv2`というファイルが出力されます。GUI版のモデルファイルに入れてあげたら使えます。 29 | 30 | ## Deberta変換 31 | 32 | 意味が分からないならおそらく変換しなくてもいいってことです。 33 | 34 | venvを用意し、requirementsを入れて、`python convert_deberta.py`を実行するだけです。 35 | 36 | `models/deberta.onnx`と`models/tokenizer.json`が出力されたら成功です。 -------------------------------------------------------------------------------- /scripts/convert/convert_deberta.py: -------------------------------------------------------------------------------- 1 | from transformers.convert_slow_tokenizer import BertConverter 2 | from style_bert_vits2.nlp import bert_models 3 | from style_bert_vits2.constants import Languages 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | import torch 6 | from torch import nn 7 | from argparse import ArgumentParser 8 | import os 9 | 10 | parser = ArgumentParser() 11 | parser.add_argument("--model", default="ku-nlp/deberta-v2-large-japanese-char-wwm") 12 | args = parser.parse_args() 13 | model_name = args.model 14 | 15 | bert_models.load_tokenizer(Languages.JP, model_name) 16 | tokenizer = bert_models.load_tokenizer(Languages.JP) 17 | converter = BertConverter(tokenizer) 18 | tokenizer = converter.converted() 19 | tokenizer.save("../../models/tokenizer.json") 20 | 21 | 22 | class ORTDeberta(nn.Module): 23 | def __init__(self, model_name): 24 | super(ORTDeberta, self).__init__() 25 | self.model = AutoModelForMaskedLM.from_pretrained(model_name) 26 | 27 | def forward(self, input_ids, token_type_ids, attention_mask): 28 | inputs = { 29 | "input_ids": input_ids, 30 | "token_type_ids": token_type_ids, 31 | "attention_mask": attention_mask, 32 | } 33 | res = self.model(**inputs, output_hidden_states=True) 34 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 35 | return res 36 | 37 | 38 | model = ORTDeberta(model_name) 39 | inputs = AutoTokenizer.from_pretrained(model_name)( 40 | "今日はいい天気ですね", return_tensors="pt" 41 | ) 42 | 43 | torch.onnx.export( 44 | model, 45 | (inputs["input_ids"], inputs["token_type_ids"], 
inputs["attention_mask"]), 46 | "../../models/deberta.onnx", 47 | input_names=["input_ids", "token_type_ids", "attention_mask"], 48 | output_names=["output"], 49 | verbose=True, 50 | dynamic_axes={"input_ids": {1: "batch_size"}, "attention_mask": {1: "batch_size"}}, 51 | ) 52 | os.system("onnxsim ../../models/deberta.onnx ../../models/deberta.onnx") -------------------------------------------------------------------------------- /scripts/convert/convert_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | from io import BytesIO 4 | from style_bert_vits2.nlp import bert_models 5 | from style_bert_vits2.constants import Languages 6 | from style_bert_vits2.models.infer import get_net_g, get_text 7 | from style_bert_vits2.models.hyper_parameters import HyperParameters 8 | import torch 9 | from style_bert_vits2.constants import ( 10 | DEFAULT_ASSIST_TEXT_WEIGHT, 11 | DEFAULT_STYLE, 12 | DEFAULT_STYLE_WEIGHT, 13 | Languages, 14 | ) 15 | import os 16 | from tarfile import open as taropen, TarInfo 17 | from zstandard import ZstdCompressor 18 | from style_bert_vits2.tts_model import TTSModel 19 | import numpy as np 20 | from argparse import ArgumentParser 21 | 22 | parser = ArgumentParser() 23 | parser.add_argument("--style_file", required=True) 24 | parser.add_argument("--config_file", required=True) 25 | parser.add_argument("--model_file", required=True) 26 | args = parser.parse_args() 27 | style_file = args.style_file 28 | config_file = args.config_file 29 | model_file = args.model_file 30 | 31 | bert_models.load_model(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm") 32 | bert_models.load_tokenizer(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm") 33 | 34 | array = np.load(style_file) 35 | data = array.tolist() 36 | hyper_parameters = HyperParameters.load_from_json(config_file) 37 | out_name = hyper_parameters.model_name 38 | 39 | with 
open(f"../../models/style_vectors_{out_name}.json", "w") as f: 40 | json.dump( 41 | { 42 | "data": data, 43 | "shape": array.shape, 44 | }, 45 | f, 46 | ) 47 | text = "今日はいい天気ですね。" 48 | 49 | bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( 50 | text, 51 | Languages.JP, 52 | hyper_parameters, 53 | "cpu", 54 | assist_text=None, 55 | assist_text_weight=DEFAULT_ASSIST_TEXT_WEIGHT, 56 | given_phone=None, 57 | given_tone=None, 58 | ) 59 | 60 | tts_model = TTSModel( 61 | model_path=model_file, 62 | config_path=config_file, 63 | style_vec_path=style_file, 64 | device="cpu", 65 | ) 66 | device = "cpu" 67 | style_id = tts_model.style2id[DEFAULT_STYLE] 68 | 69 | 70 | def get_style_vector(style_id, weight): 71 | style_vectors = np.load(style_file) 72 | mean = style_vectors[0] 73 | style_vec = style_vectors[style_id] 74 | style_vec = mean + (style_vec - mean) * weight 75 | return style_vec 76 | 77 | 78 | style_vector = get_style_vector(style_id, DEFAULT_STYLE_WEIGHT) 79 | 80 | x_tst = phones.to(device).unsqueeze(0) 81 | tones = tones.to(device).unsqueeze(0) 82 | lang_ids = lang_ids.to(device).unsqueeze(0) 83 | bert = bert.to(device).unsqueeze(0) 84 | ja_bert = ja_bert.to(device).unsqueeze(0) 85 | en_bert = en_bert.to(device).unsqueeze(0) 86 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 87 | style_vec_tensor = torch.from_numpy(style_vector).to(device).unsqueeze(0) 88 | 89 | model = get_net_g( 90 | model_file, 91 | hyper_parameters.version, 92 | device, 93 | hyper_parameters, 94 | ) 95 | 96 | 97 | def forward(x, x_len, sid, tone, lang, bert, style, length_scale, sdp_ratio, noise_scale, noise_scale_w): 98 | return model.infer( 99 | x, 100 | x_len, 101 | sid, 102 | tone, 103 | lang, 104 | bert, 105 | style, 106 | sdp_ratio=sdp_ratio, 107 | length_scale=length_scale, 108 | noise_scale=noise_scale, 109 | noise_scale_w=noise_scale_w, 110 | ) 111 | 112 | 113 | model.forward = forward 114 | 115 | torch.onnx.export( 116 | model, 117 | ( 118 | x_tst, 119 | 
x_tst_lengths, 120 | torch.LongTensor([0]).to(device), 121 | tones, 122 | lang_ids, 123 | bert, 124 | style_vec_tensor, 125 | torch.tensor(1.0), 126 | torch.tensor(0.0), 127 | torch.tensor(0.6777), 128 | torch.tensor(0.8), 129 | ), 130 | f"../../models/model_{out_name}.onnx", 131 | verbose=True, 132 | dynamic_axes={ 133 | "x_tst": {0: "batch_size", 1: "x_tst_max_length"}, 134 | "x_tst_lengths": {0: "batch_size"}, 135 | "sid": {0: "batch_size"}, 136 | "tones": {0: "batch_size", 1: "x_tst_max_length"}, 137 | "language": {0: "batch_size", 1: "x_tst_max_length"}, 138 | "bert": {0: "batch_size", 2: "x_tst_max_length"}, 139 | "style_vec": {0: "batch_size"}, 140 | }, 141 | input_names=[ 142 | "x_tst", 143 | "x_tst_lengths", 144 | "sid", 145 | "tones", 146 | "language", 147 | "bert", 148 | "style_vec", 149 | "length_scale", 150 | "sdp_ratio", 151 | "noise_scale", 152 | "noise_scale_w" 153 | ], 154 | output_names=["output"], 155 | ) 156 | os.system(f"onnxsim ../../models/model_{out_name}.onnx ../../models/model_{out_name}.onnx") 157 | onnxfile = open(f"../../models/model_{out_name}.onnx", "rb").read() 158 | stylefile = open(f"../../models/style_vectors_{out_name}.json", "rb").read() 159 | version = bytes("1", "utf8") 160 | with taropen(f"../../models/tmp_{out_name}.sbv2tar", "w") as w: 161 | 162 | def add_tar(f, b): 163 | t = TarInfo(f) 164 | t.size = len(b) 165 | w.addfile(t, BytesIO(b)) 166 | 167 | add_tar("version.txt", version) 168 | add_tar("model.onnx", onnxfile) 169 | add_tar("style_vectors.json", stylefile) 170 | open(f"../../models/{out_name}.sbv2", "wb").write( 171 | ZstdCompressor(threads=-1, level=22).compress( 172 | open(f"../../models/tmp_{out_name}.sbv2tar", "rb").read() 173 | ) 174 | ) 175 | os.unlink(f"../../models/tmp_{out_name}.sbv2tar") 176 | -------------------------------------------------------------------------------- /scripts/convert/requirements.txt: -------------------------------------------------------------------------------- 1 | 
git+https://github.com/neodyland/style-bert-vits2-ref 2 | onnxsim 3 | numpy<2 4 | zstandard 5 | onnxruntime 6 | cmake<4 -------------------------------------------------------------------------------- /scripts/docker/cpu.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust AS builder 2 | WORKDIR /work 3 | COPY . . 4 | RUN cargo build -r --bin sbv2_api 5 | FROM ubuntu AS upx 6 | WORKDIR /work 7 | RUN apt update && apt-get install -y upx binutils 8 | COPY --from=builder /work/target/release/sbv2_api /work/main 9 | COPY --from=builder /work/target/release/*.so /work 10 | RUN upx --best --lzma /work/main 11 | RUN find /work -maxdepth 1 -name "*.so" -exec strip --strip-unneeded {} + 12 | RUN find /work -maxdepth 1 -name "*.so" -exec upx --best --lzma {} + 13 | FROM gcr.io/distroless/cc-debian12 14 | WORKDIR /work 15 | COPY --from=upx /work/main /work/main 16 | COPY --from=upx /work/*.so /work 17 | CMD ["/work/main"] 18 | -------------------------------------------------------------------------------- /scripts/docker/cuda.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust AS builder 2 | WORKDIR /work 3 | COPY . . 
4 | RUN cargo build -r --bin sbv2_api -F cuda,cuda_tf32 5 | FROM ubuntu AS upx 6 | WORKDIR /work 7 | RUN apt update && apt-get install -y upx binutils 8 | COPY --from=builder /work/target/release/sbv2_api /work/main 9 | COPY --from=builder /work/target/release/*.so /work 10 | RUN upx --best --lzma /work/main 11 | RUN find /work -maxdepth 1 -name "*.so" -exec strip --strip-unneeded {} + 12 | FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04 13 | WORKDIR /work 14 | COPY --from=upx /work/main /work/main 15 | COPY --from=upx /work/*.so /work 16 | ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/work 17 | CMD ["/work/main"] 18 | -------------------------------------------------------------------------------- /scripts/docker/run_cpu.sh: -------------------------------------------------------------------------------- 1 | docker run -it --rm -p 3000:3000 --name sbv2 \ 2 | -v ./models:/work/models --env-file .env \ 3 | ghcr.io/neodyland/sbv2-api:cpu 4 | -------------------------------------------------------------------------------- /scripts/docker/run_cuda.sh: -------------------------------------------------------------------------------- 1 | docker run -it --rm -p 3000:3000 --name sbv2 \ 2 | -v ./models:/work/models --env-file .env \ 3 | --gpus all \ 4 | ghcr.io/neodyland/sbv2-api:cuda 5 | -------------------------------------------------------------------------------- /scripts/make_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | git clone https://github.com/Aivis-Project/AivisSpeech-Engine ./scripts/tmp --filter=blob:none -n 4 | cd ./scripts/tmp 5 | git checkout 168b2a1144afe300b0490d9a6dd773ec6e927667 -- resources/dictionaries/*.csv 6 | cd ../.. 
7 | rm -rf ./crates/sbv2_core/src/dic 8 | cp -r ./scripts/tmp/resources/dictionaries ./crates/sbv2_core/src/dic 9 | rm -rf ./scripts/tmp 10 | for file in ./crates/sbv2_core/src/dic/0*.csv; do 11 | /usr/bin/cat "$file" 12 | echo 13 | done > ./crates/sbv2_core/src/all.csv 14 | lindera build ./crates/sbv2_core/src/all.csv ./crates/sbv2_core/src/dic/all.dic -u -k ipadic -------------------------------------------------------------------------------- /scripts/sbv2-bindings-colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 音声合成プログラム\n", 8 | "\n", 9 | "このノートブックでは、`sbv2_bindings` パッケージを使用して音声合成を行います。必要なモデルをダウンロードし、ユーザーが入力したテキストから音声を生成します。音声合成が終わったら、再度テキストの入力を求め、ユーザーが終了するまで繰り返します。" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# 必要なパッケージのインストール\n", 19 | "%pip install sbv2_bindings\n", 20 | "\n", 21 | "# 必要なモジュールのインポート\n", 22 | "import os\n", 23 | "import urllib.request\n", 24 | "import time\n", 25 | "from sbv2_bindings import TTSModel" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## モデルのダウンロード\n", 33 | "\n", 34 | "モデルファイルとトークナイザーをダウンロードします。ユーザーが独自のモデルを使用したい場合は、該当するURLまたはローカルパスを指定してください。" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# モデルの URL またはローカルパスの指定\n", 44 | "user_sbv2_model_url = \"\" # カスタムモデルのURLがあればここに指定\n", 45 | "user_sbv2_model_path = \"\" # カスタムモデルのローカルパスがあればここに指定\n", 46 | "\n", 47 | "# モデル用のディレクトリを作成\n", 48 | "model_dir = 'models'\n", 49 | "os.makedirs(model_dir, exist_ok=True)\n", 50 | "\n", 51 | "# ダウンロードするファイルの URL\n", 52 | "file_urls = [\n", 53 | " \"https://huggingface.co/googlefan/sbv2_onnx_models/resolve/main/tokenizer.json\",\n", 54 | " 
\"https://huggingface.co/googlefan/sbv2_onnx_models/resolve/main/deberta.onnx\",\n", 55 | "]\n", 56 | "\n", 57 | "# モデルのパス決定\n", 58 | "if user_sbv2_model_path:\n", 59 | " sbv2_model_path = user_sbv2_model_path # ローカルモデルのパスを使用\n", 60 | "elif user_sbv2_model_url:\n", 61 | " sbv2_model_filename = os.path.basename(user_sbv2_model_url)\n", 62 | " sbv2_model_path = os.path.join(model_dir, sbv2_model_filename)\n", 63 | " file_urls.append(user_sbv2_model_url)\n", 64 | "else:\n", 65 | " # デフォルトのモデルを使用\n", 66 | " sbv2_model_filename = \"tsukuyomi.sbv2\"\n", 67 | " sbv2_model_path = os.path.join(model_dir, sbv2_model_filename)\n", 68 | " file_urls.append(\"https://huggingface.co/googlefan/sbv2_onnx_models/resolve/main/tsukuyomi.sbv2\")\n", 69 | "\n", 70 | "# ファイルをダウンロード\n", 71 | "for url in file_urls:\n", 72 | " file_name = os.path.join(model_dir, os.path.basename(url))\n", 73 | " if not os.path.exists(file_name):\n", 74 | " print(f\"{file_name} をダウンロードしています...\")\n", 75 | " urllib.request.urlretrieve(url, file_name)\n", 76 | " else:\n", 77 | " print(f\"{file_name} は既に存在します。\")\n", 78 | "\n", 79 | "# ダウンロードまたは使用するファイルを確認\n", 80 | "print(\"\\n使用するファイル:\")\n", 81 | "for file in os.listdir(model_dir):\n", 82 | " print(file)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## モデルの読み込みと音声合成\n", 90 | "\n", 91 | "モデルを読み込み、ユーザーが入力したテキストから音声を生成します。話者名は使用する `.sbv2` ファイル名から自動的に取得します。音声合成が終わったら、再度テキストの入力を求め、ユーザーが終了するまで繰り返します。" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# 音声合成の実行\n", 101 | "def main():\n", 102 | " try:\n", 103 | " print(\"\\nモデルを読み込んでいます...\")\n", 104 | " model = TTSModel.from_path(\n", 105 | " os.path.join(model_dir, \"deberta.onnx\"),\n", 106 | " os.path.join(model_dir, \"tokenizer.json\")\n", 107 | " )\n", 108 | " print(\"モデルの読み込みが完了しました!\")\n", 109 | " except Exception as e:\n", 110 | " print(f\"モデルの読み込みに失敗しました: {e}\")\n", 
111 | " return\n", 112 | "\n", 113 | " # 話者名を取得(.sbv2 ファイル名の拡張子を除いた部分)\n", 114 | " speaker_name = os.path.splitext(os.path.basename(sbv2_model_path))[0]\n", 115 | " \n", 116 | " # 指定されたモデルのパスを使用\n", 117 | " try:\n", 118 | " model.load_sbv2file_from_path(speaker_name, sbv2_model_path)\n", 119 | " print(f\"話者 '{speaker_name}' のセットアップが完了しました!\")\n", 120 | " except Exception as e:\n", 121 | " print(f\"SBV2ファイルの読み込みに失敗しました: {e}\")\n", 122 | " return\n", 123 | "\n", 124 | " # 音声合成を繰り返し実行\n", 125 | " while True:\n", 126 | " # 合成したいテキストをユーザーから入力\n", 127 | " user_input = input(\"\\n音声合成したいテキストを入力してください(終了するには 'exit' と入力): \")\n", 128 | " \n", 129 | " if user_input.strip().lower() == 'exit':\n", 130 | " print(\"音声合成を終了します。\")\n", 131 | " break\n", 132 | "\n", 133 | " # 出力ファイル名\n", 134 | " output_file = \"output.wav\"\n", 135 | "\n", 136 | " # 音声合成を実行\n", 137 | " try:\n", 138 | " print(\"\\n音声合成を開始します...\")\n", 139 | " start_time = time.time()\n", 140 | "\n", 141 | " audio_data = model.synthesize(user_input, speaker_name, 0, 0.0, 1)\n", 142 | "\n", 143 | " with open(output_file, \"wb\") as f:\n", 144 | " f.write(audio_data)\n", 145 | "\n", 146 | " end_time = time.time()\n", 147 | " elapsed_time = end_time - start_time\n", 148 | "\n", 149 | " print(f\"\\n音声が '{output_file}' に保存されました。\")\n", 150 | " print(f\"音声合成にかかった時間: {elapsed_time:.2f} 秒\")\n", 151 | " except Exception as e:\n", 152 | " print(f\"音声合成に失敗しました: {e}\")\n", 153 | "\n", 154 | "if __name__ == \"__main__\":\n", 155 | " main()" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 3", 162 | "language": "python", 163 | "name": "python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.x" 176 | } 177 | }, 178 | "nbformat": 4, 179 | 
"nbformat_minor": 4
180 | }
181 | 
--------------------------------------------------------------------------------
/scripts/sbv2-test-api.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | res = requests.post(
4 |     "http://localhost:3000/synthesize",
5 |     json={"text": "おはようございます", "ident": "tsukuyomi"},
6 | )
7 | with open("output.wav", "wb") as f:
8 |     f.write(res.content)
9 | 
--------------------------------------------------------------------------------
/scripts/sbv2-test-bindings.py:
--------------------------------------------------------------------------------
1 | from sbv2_bindings import TTSModel
2 | 
3 | 
4 | def main():
5 |     print("Loading models...")
6 |     model = TTSModel.from_path("./models/deberta.onnx", "./models/tokenizer.json")
7 |     print("Models loaded!")
8 | 
9 |     model.load_sbv2file_from_path("amitaro", "./models/amitaro.sbv2")
10 |     print("All setup is done!")
11 | 
12 |     style_vector = model.get_style_vector("amitaro", 0, 1.0)
13 |     with open("output.wav", "wb") as f:
14 |         f.write(
15 |             model.synthesize("おはようございます。", "amitaro", 0, 0, 0.0, 0.5)
16 |         )
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     main()
21 | 
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | 
4 | data = (requests.get("http://localhost:8080/audio_query", params={
5 |     "text": "こんにちは、今日はいい天気ですね。",
6 | })).json()
7 | print(data)
8 | 
9 | data = (requests.post("http://localhost:8080/synthesis", json={
10 |     "text": data["text"],
11 |     "ident": "tsukuyomi",
12 |     "speaker_id": 0,
13 |     "style_id": 0,
14 |     "sdp_ratio": 0.5,
15 |     "length_scale": 0.5,
16 |     "audio_query": data["audio_query"],
17 | })).content
18 | with open("test.wav", "wb") as f:
19 |     f.write(data)
--------------------------------------------------------------------------------