├── .clang-format ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ └── lint.yml ├── .gitignore ├── CPPLINT.cfg ├── LICENSE ├── README.md ├── docs ├── Makefile ├── conf.py └── index.rst ├── examples ├── aishell-3 │ ├── configs │ │ ├── v1.json │ │ ├── v2.json │ │ └── v3.json │ ├── local │ │ ├── download_data.sh │ │ └── prepare_data.py │ ├── run.sh │ ├── tools │ └── vits ├── baker │ ├── configs │ │ ├── v1.json │ │ ├── v2.json │ │ ├── v3.json │ │ ├── vits2_v1.json │ │ ├── vits2_vocos_v1.json │ │ └── vocos.json │ ├── local │ │ └── prepare_data.py │ ├── run.sh │ ├── tools │ └── vits ├── chinese_prosody_polyphone │ ├── README.md │ ├── frontend │ ├── lexicon │ │ ├── pinyin_dict.txt │ │ ├── polyphone.txt │ │ └── prosody.txt │ ├── run.sh │ └── tools ├── ljspeech │ ├── configs │ │ ├── v1.json │ │ ├── v2.json │ │ └── v3.json │ ├── local │ │ ├── download_data.sh │ │ └── prepare_data.py │ ├── path.sh │ ├── run.sh │ ├── tools │ └── vits └── multilingual │ ├── configs │ ├── v1.json │ ├── v2.json │ └── v3.json │ ├── run.sh │ ├── tools │ └── vits ├── requirements.txt ├── runtime ├── android │ ├── .gitignore │ ├── README.md │ ├── app │ │ ├── .gitignore │ │ ├── build.gradle │ │ ├── proguard-rules.pro │ │ └── src │ │ │ ├── androidTest │ │ │ └── java │ │ │ │ └── cn │ │ │ │ └── org │ │ │ │ └── wenet │ │ │ │ └── wetts │ │ │ │ └── ExampleInstrumentedTest.java │ │ │ ├── main │ │ │ ├── AndroidManifest.xml │ │ │ ├── assets │ │ │ │ └── .gitkeep │ │ │ ├── cpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── cmake │ │ │ │ ├── frontend │ │ │ │ ├── model │ │ │ │ ├── utils │ │ │ │ └── wetts.cc │ │ │ ├── java │ │ │ │ └── cn │ │ │ │ │ └── org │ │ │ │ │ └── wenet │ │ │ │ │ └── wetts │ │ │ │ │ ├── MainActivity.java │ │ │ │ │ └── Synthesis.java │ │ │ └── res │ │ │ │ ├── drawable-v24 │ │ │ │ └── ic_launcher_foreground.xml │ │ │ │ ├── drawable │ │ │ │ └── ic_launcher_background.xml │ │ │ │ ├── layout │ │ │ │ └── activity_main.xml │ │ │ │ ├── mipmap-anydpi-v26 │ │ │ │ ├── ic_launcher.xml │ │ │ │ └── ic_launcher_round.xml │ │ │ │ ├── mipmap-hdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── mipmap-mdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── mipmap-xhdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── mipmap-xxhdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── mipmap-xxxhdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── values-night │ │ │ │ └── themes.xml │ │ │ │ ├── values │ │ │ │ ├── attrs.xml │ │ │ │ ├── colors.xml │ │ │ │ ├── strings.xml │ │ │ │ └── themes.xml │ │ │ │ └── xml │ │ │ │ ├── backup_rules.xml │ │ │ │ └── data_extraction_rules.xml │ │ │ └── test │ │ │ └── java │ │ │ └── cn │ │ │ └── org │ │ │ └── wenet │ │ │ └── wetts │ │ │ └── ExampleUnitTest.java │ ├── build.gradle │ ├── gradle.properties │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── settings.gradle ├── core │ ├── bin │ │ ├── CMakeLists.txt │ │ ├── http_server_main.cc │ │ └── tts_main.cc │ ├── cmake │ │ ├── boost.cmake │ │ ├── gflags.cmake │ │ ├── glog.cmake │ │ ├── gtest.cmake │ │ ├── jsoncpp.cmake │ │ ├── onnxruntime.cmake │ │ └── wetextprocessing.cmake │ ├── frontend │ │ ├── CMakeLists.txt │ │ ├── g2p_en.cc │ │ ├── g2p_en.h │ │ ├── g2p_prosody.cc │ │ ├── g2p_prosody.h │ │ ├── lexicon.cc │ │ ├── lexicon.h │ │ └── wav.h │ ├── http │ │ ├── CMakeLists.txt │ │ ├── http_server.cc │ │ └── 
http_server.h │ ├── model │ │ ├── CMakeLists.txt │ │ ├── onnx_model.cc │ │ ├── onnx_model.h │ │ ├── tts_model.cc │ │ └── tts_model.h │ ├── test │ │ └── CMakeLists.txt │ └── utils │ │ ├── CMakeLists.txt │ │ ├── fst.cc │ │ ├── fst.h │ │ ├── string.cc │ │ ├── string.h │ │ ├── timer.h │ │ ├── utils.cc │ │ └── utils.h ├── cpu_triton_stream │ ├── .gitignore │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── client │ │ ├── client.py │ │ ├── stream_client.py │ │ ├── text.scp │ │ └── web_ui.py │ ├── model_repo │ │ ├── decoder │ │ │ ├── 1 │ │ │ │ └── .gitkeep │ │ │ └── config.pbtxt │ │ ├── encoder │ │ │ ├── 1 │ │ │ │ └── .gitkeep │ │ │ └── config.pbtxt │ │ ├── stream_tts │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ └── tts │ │ │ ├── 1 │ │ │ └── model.py │ │ │ └── config.pbtxt │ ├── requirements-client.txt │ └── requirements-web.txt ├── gpu_triton │ ├── Dockerfile │ ├── README.md │ ├── client │ │ ├── client.py │ │ ├── generate_input.py │ │ └── text.scp │ └── model_repo │ │ ├── generator │ │ ├── 1 │ │ │ └── .gitkeep │ │ └── config.pbtxt │ │ └── tts │ │ ├── 1 │ │ └── model.py │ │ └── config.pbtxt ├── onnxruntime │ ├── CMakeLists.txt │ ├── bin │ ├── cmake │ ├── frontend │ ├── http │ ├── model │ └── utils └── web │ ├── README.md │ ├── app.py │ └── requirements.txt ├── setup.cfg ├── setup.py ├── tools ├── cleaners.py ├── compute_spec_length.py ├── gen_pinyin_lexicon.py └── parse_options.sh └── wetts ├── __init__.py ├── cli ├── __init__.py ├── frontend.py ├── hub.py ├── model.py └── tts.py ├── frontend ├── README.md ├── dataset.py ├── export_onnx.py ├── g2p_prosody.py ├── hanzi2pinyin.py ├── model.py ├── test_polyphone.py ├── test_prosody.py ├── train.py └── utils.py └── vits ├── data_utils.py ├── export_onnx.py ├── inference.py ├── inference_onnx.py ├── losses.py ├── model ├── attentions.py ├── decoders.py ├── discriminators.py ├── duration_predictors.py ├── encoders.py ├── flows.py ├── models.py ├── modules.py └── normalization.py ├── train.py └── utils ├── commons.py ├── mel_processing.py ├── monotonic_align.py ├── stft.py ├── task.py └── transforms.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlinesLeft: true 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: true 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | BeforeCatch: false 33 | BeforeElse: false 34 | IndentBraces: false 35 | BreakBeforeBinaryOperators: None 36 | BreakBeforeBraces: Attach 37 | BreakBeforeTernaryOperators: true 38 | BreakConstructorInitializersBeforeComma: false 39 | BreakAfterJavaFieldAnnotations: false 40 | BreakStringLiterals: true 41 | ColumnLimit: 80 42 | 
CommentPragmas: '^ IWYU pragma:' 43 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 44 | ConstructorInitializerIndentWidth: 4 45 | ContinuationIndentWidth: 4 46 | Cpp11BracedListStyle: true 47 | DisableFormat: false 48 | ExperimentalAutoDetectBinPacking: false 49 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 50 | IncludeCategories: 51 | - Regex: '^<.*\.h>' 52 | Priority: 1 53 | - Regex: '^<.*' 54 | Priority: 2 55 | - Regex: '.*' 56 | Priority: 3 57 | IncludeIsMainRegex: '([-_](test|unittest))?$' 58 | IndentCaseLabels: true 59 | IndentWidth: 2 60 | IndentWrappedFunctionNames: false 61 | JavaScriptQuotes: Leave 62 | JavaScriptWrapImports: true 63 | KeepEmptyLinesAtTheStartOfBlocks: false 64 | MacroBlockBegin: '' 65 | MacroBlockEnd: '' 66 | MaxEmptyLinesToKeep: 1 67 | NamespaceIndentation: None 68 | ObjCBlockIndentWidth: 2 69 | ObjCSpaceAfterProperty: false 70 | ObjCSpaceBeforeProtocolList: false 71 | PenaltyBreakBeforeFirstCallParameter: 1 72 | PenaltyBreakComment: 300 73 | PenaltyBreakFirstLessLess: 120 74 | PenaltyBreakString: 1000 75 | PenaltyExcessCharacter: 1000000 76 | PenaltyReturnTypeOnItsOwnLine: 200 77 | PointerAlignment: Left 78 | ReflowComments: true 79 | SortIncludes: true 80 | SpaceAfterCStyleCast: false 81 | SpaceBeforeAssignmentOperators: true 82 | SpaceBeforeParens: ControlStatements 83 | SpaceInEmptyParentheses: false 84 | SpacesBeforeTrailingComments: 2 85 | SpacesInAngles: false 86 | SpacesInContainerLiterals: true 87 | SpacesInCStyleCastParentheses: false 88 | SpacesInParentheses: false 89 | SpacesInSquareBrackets: false 90 | Standard: Auto 91 | TabWidth: 8 92 | UseTab: Never 93 | ... 94 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = B,C,E,F,P,T4,W,B9 3 | max-line-length = 80 4 | # C408 ignored because we like the dict keyword argument syntax 5 | # E501 is not flexible enough, we're using B950 instead 6 | ignore = 7 | E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303, 8 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying 9 | # to line this up with executable bit 10 | EXE001, 11 | # these ignores are from flake8-bugbear; please fix! 12 | B006,B007,B008,B905 13 | # these ignores are from flake8-comprehensions; please fix! 14 | C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415 15 | exclude = 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. 
iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | quick-checks: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Fetch WeTTS 14 | uses: actions/checkout@v1 15 | - name: Checkout PR tip 16 | run: | 17 | set -eux 18 | if [[ "${{ github.event_name }}" == "pull_request" ]]; then 19 | # We are on a PR, so actions/checkout leaves us on a merge commit. 20 | # Check out the actual tip of the branch. 21 | git checkout ${{ github.event.pull_request.head.sha }} 22 | fi 23 | echo ::set-output name=commit_sha::$(git rev-parse HEAD) 24 | id: get_pr_tip 25 | - name: Ensure no tabs 26 | run: | 27 | (! git grep -I -l $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above files have tabs; please convert them to spaces"; false)) 28 | - name: Ensure no trailing whitespace 29 | run: | 30 | (! git grep -I -n $' $' -- . ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above files have trailing whitespace; please remove them"; false)) 31 | 32 | flake8-py3: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - name: Setup Python 36 | uses: actions/setup-python@v1 37 | with: 38 | python-version: 3.9 39 | architecture: x64 40 | - name: Fetch WeTTS 41 | uses: actions/checkout@v1 42 | - name: Checkout PR tip 43 | run: | 44 | set -eux 45 | if [[ "${{ github.event_name }}" == "pull_request" ]]; then 46 | # We are on a PR, so actions/checkout leaves us on a merge commit. 47 | # Check out the actual tip of the branch. 48 | git checkout ${{ github.event.pull_request.head.sha }} 49 | fi 50 | echo ::set-output name=commit_sha::$(git rev-parse HEAD) 51 | id: get_pr_tip 52 | - name: Run flake8 53 | run: | 54 | set -eux 55 | pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0 56 | flake8 --version 57 | flake8 58 | if [ $? 
!= 0 ]; then exit 1; fi 59 | 60 | cpplint: 61 | runs-on: ubuntu-latest 62 | steps: 63 | - name: Setup Python 64 | uses: actions/setup-python@v1 65 | with: 66 | python-version: 3.x 67 | architecture: x64 68 | - name: Fetch WeTTS 69 | uses: actions/checkout@v1 70 | - name: Checkout PR tip 71 | run: | 72 | set -eux 73 | if [[ "${{ github.event_name }}" == "pull_request" ]]; then 74 | # We are on a PR, so actions/checkout leaves us on a merge commit. 75 | # Check out the actual tip of the branch. 76 | git checkout ${{ github.event.pull_request.head.sha }} 77 | fi 78 | echo ::set-output name=commit_sha::$(git rev-parse HEAD) 79 | id: get_pr_tip 80 | - name: Run cpplint 81 | run: | 82 | set -eux 83 | pip install cpplint 84 | cpplint --version 85 | cpplint --recursive . 86 | if [ $? != 0 ]; then exit 1; fi 87 | 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Visual Studio Code files 7 | .vscode 8 | .vs 9 | 10 | # PyCharm files 11 | .idea 12 | 13 | # Eclipse Project settings 14 | *.*project 15 | .settings 16 | 17 | # Sublime Text settings 18 | *.sublime-workspace 19 | *.sublime-project 20 | 21 | # Editor temporaries 22 | *.swn 23 | *.swo 24 | *.swp 25 | *.swm 26 | *~ 27 | 28 | # IPython notebook checkpoints 29 | .ipynb_checkpoints 30 | 31 | # macOS dir files 32 | .DS_Store 33 | 34 | exp 35 | data 36 | raw_wav 37 | tensorboard 38 | **/*build* 39 | /BZNSYP 40 | -------------------------------------------------------------------------------- /CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | root=runtime/core 2 | filter=-build/c++11 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WeTTS 2 | 3 | Production First and Production Ready End-to-End Text-to-Speech Toolkit 4 | 5 | ## Install 6 | 7 | ### Install Python package 8 | ``` sh 9 | pip install git+https://github.com/wenet-e2e/wetts.git 10 | ``` 11 | **Command-line usage** (use `-h` for parameters): 12 | 13 | ``` sh 14 | wetts --text "今天天气怎么样" --wav output.wav 15 | ``` 16 | 17 | **Python programming usage**: 18 | 19 | ``` python 20 | import wetts 21 | 22 | # TODO 23 | ``` 24 | 25 | ### Install for development & deployment 26 | 27 | We suggest installing WeTTS with Anaconda or Miniconda. 28 | 29 | Clone this repo: 30 | 31 | ```sh 32 | git clone https://github.com/wenet-e2e/wetts.git 33 | ``` 34 | 35 | Create the environment: 36 | 37 | ```bash 38 | conda create -n wetts python=3.8 -y 39 | conda activate wetts 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | ## Roadmap 44 | 45 | We mainly focus on end-to-end, production, and on-device TTS. We are going to use: 46 | 47 | * backend: end-to-end models, such as: 48 | * [VITS](https://arxiv.org/pdf/2106.06103.pdf) 49 | * frontend: 50 | * Text Normalization: [WeTextProcessing](https://github.com/wenet-e2e/WeTextProcessing) 51 | * Prosody & Polyphones: [Unified Mandarin TTS Front-end Based on Distilled BERT Model](https://arxiv.org/pdf/2012.15404.pdf) 52 | 53 | ## Dataset 54 | 55 | We plan to support a variety of open-source TTS datasets, including but not limited to: 56 | 57 | * [Baker](https://www.data-baker.com/data/index/TNtts), a Chinese Standard Mandarin speech corpus open-sourced by Data Baker. 
58 | * [AISHELL-3](https://openslr.org/93), a large-scale and high-fidelity multi-speaker Mandarin speech corpus. 59 | * [Opencpop](https://wenet.org.cn/opencpop), a Mandarin singing voice synthesis (SVS) corpus open-sourced by Netease Fuxi. 60 | 61 | ## Pretrained Models 62 | 63 | | Dataset | Language | Checkpoint Model | Runtime Model | 64 | | -------------- | -------- | ---------------- | ------------- | 65 | | Baker | CN | [BERT](https://wenet.org.cn/downloads?models=wetts&version=baker_bert_exp.tar.gz) | [BERT](https://wenet.org.cn/downloads?models=wetts&version=baker_bert_onnx.tar.gz) | 66 | | Multilingual | CN | [VITS](https://wenet.org.cn/downloads?models=wetts&version=multilingual_vits_v3_exp.tar.gz) | [VITS](https://wenet.org.cn/downloads?models=wetts&version=multilingual_vits_v3_onnx.tar.gz) | 67 | 68 | ## Runtime 69 | 70 | We plan to support a variety of hardware and platforms, including: 71 | 72 | * x86 73 | * Android 74 | * Raspberry Pi 75 | * Other on-device platforms 76 | 77 | ``` bash 78 | export GLOG_logtostderr=1 79 | export GLOG_v=2 80 | 81 | cd runtime/onnxruntime 82 | cmake -B build -DCMAKE_BUILD_TYPE=Release 83 | cmake --build build 84 | ./build/bin/tts_main \ 85 | --frontend_flags baker_bert_onnx/frontend.flags \ 86 | --vits_flags multilingual_vits_v3_onnx/vits.flags \ 87 | --sname baker \ 88 | --text "hello我是小明。" \ 89 | --wav_path audio.wav 90 | ``` 91 | 92 | ## Discussion & Communication 93 | 94 | For Chinese users, you can also scan the QR code on the left to follow our official account of WeNet. 95 | We created a WeChat group for better discussion and quicker response. 96 | Please scan the personal QR code on the right, and its owner will invite you to the chat group. 97 | 98 | | | | 99 | | ---- | ---- | 100 | 101 | Or you can directly discuss on [GitHub Issues](https://github.com/wenet-e2e/wetts/issues). 102 | 103 | ## Acknowledgement 104 | 105 | 1. We borrow a lot of code from [vits](https://github.com/jaywalnut310/vits) for the VITS implementation. 106 | 2. We refer to [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech) for `pinyin` lexicon generation. 107 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SPHINXPROJ = Wenet 9 | SOURCEDIR = . 10 | BUILDDIR = _build 11 | 12 | # Put it first so that "make" without argument is like "make help". 13 | help: 14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 15 | 16 | .PHONY: help Makefile 17 | 18 | # Catch-all target: route all unknown targets to Sphinx using the new 19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 20 | %: Makefile 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Wenet' 21 | copyright = '2020, wenet-team' 22 | author = 'wenet-team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "nbsphinx", 32 | "sphinx.ext.autodoc", 33 | 'sphinx.ext.napoleon', 34 | 'sphinx.ext.viewcode', 35 | "sphinx.ext.mathjax", 36 | "sphinx.ext.todo", 37 | # "sphinxarg.ext", 38 | "sphinx_markdown_tables", 39 | 'recommonmark', 40 | 'sphinx_rtd_theme', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | source_suffix = { 50 | '.rst': 'restructuredtext', 51 | '.txt': 'markdown', 52 | '.md': 'markdown', 53 | } 54 | 55 | # List of patterns, relative to source directory, that match files and 56 | # directories to ignore when looking for source files. 57 | # This pattern also affects html_static_path and html_extra_path. 58 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 59 | 60 | 61 | # -- Options for HTML output ------------------------------------------------- 62 | 63 | # The theme to use for HTML and HTML Help pages. See the documentation for 64 | # a list of builtin themes. 65 | # html_theme = 'alabaster' 66 | html_theme = "sphinx_rtd_theme" 67 | 68 | # Add any paths that contain custom static files (such as style sheets) here, 69 | # relative to this directory. They are copied after the builtin static files, 70 | # so a file named "default.css" will overwrite the builtin "default.css". 71 | html_static_path = ['_static'] 72 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Wenet documentation master file, created by 2 | sphinx-quickstart on Thu Dec 3 11:43:53 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to WeTTS's documentation! 7 | ================================= 8 | 9 | Production First and Production Ready End-to-End Text-to-Speech Toolkit 10 | 11 | .. 
toctree:: 12 | :maxdepth: 1 13 | :caption: Tutorial: 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /examples/aishell-3/configs/v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 44100, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 512, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/aishell-3/configs/v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 22050, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 128, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/aishell-3/configs/v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": true, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | 
"sampling_rate": 16000, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "2", 39 | "upsample_rates": [8,8,4], 40 | "upsample_kernel_sizes": [16,16,8], 41 | "upsample_initial_channel": 256, 42 | "resblock_kernel_sizes": [3,5,7], 43 | "resblock_dilation_sizes": [[1,2], [2,6], [3,12]], 44 | "n_layers_q": 3, 45 | "use_sdp": false, 46 | "use_spectral_norm": false, 47 | "gin_channels": 256 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/aishell-3/local/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2022 Binbin Zhang(binbzha@qq.com) 3 | 4 | if [ $# -ne 2 ]; then 5 | echo "Usage: $0 " 6 | exit 0; 7 | fi 8 | 9 | url=$1 10 | dir=$2 11 | 12 | [ ! -d $dir ] && mkdir -p $dir 13 | 14 | # Download data 15 | if [ ! -f $dir/data_aishell3.tgz ]; then 16 | if ! which wget >/dev/null; then 17 | echo "$0: wget is not installed." 18 | exit 1; 19 | fi 20 | echo "$0: downloading data from $url. This may take some time, please wait" 21 | 22 | cd $dir 23 | if ! wget --no-check-certificate $url; then 24 | echo "$0: error executing wget $url" 25 | exit 1; 26 | fi 27 | fi 28 | 29 | 30 | cd $dir 31 | if ! tar -xvzf data_aishell3.tgz; then 32 | echo "$0: error un-tarring archive $dir/data_aishell3.tgz" 33 | exit 1; 34 | fi 35 | -------------------------------------------------------------------------------- /examples/aishell-3/local/prepare_data.py: -------------------------------------------------------------------------------- 1 | #!/user/bin/env python3 2 | 3 | # Copyright (c) 2022 Binbin Zhang(binbzha@qq.com) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import os 18 | import sys 19 | 20 | if len(sys.argv) != 4: 21 | print("Usage: prepare_data.py lexicon in_data_dir out_data") 22 | sys.exit(-1) 23 | 24 | lexicon = {} 25 | with open(sys.argv[1], "r", encoding="utf8") as fin: 26 | for line in fin: 27 | arr = line.strip().split() 28 | lexicon[arr[0]] = arr[1:] 29 | 30 | train_set_label_file = os.path.join(sys.argv[2], "train", "label_train-set.txt") 31 | with open(train_set_label_file, encoding="utf8") as fin, open( 32 | sys.argv[3], "w", encoding="utf8" 33 | ) as fout: 34 | # skip the first five lines in label_train-set.txt 35 | lines = [x.strip() for x in fin.readlines()][5:] 36 | for line in lines: 37 | key, text, _ = line.split("|") 38 | speaker = key[:-4] 39 | wav_path = os.path.join( 40 | sys.argv[2], "train", "wav", speaker, "{}.wav".format(key) 41 | ) 42 | phones = [] 43 | for x in text.split(): 44 | if x == "%" or x == "$": 45 | phones.append(x) 46 | elif x in lexicon: 47 | phones.extend(lexicon[x]) 48 | else: 49 | print("{} OOV {}".format(key, x)) 50 | sys.exit(-1) 51 | fout.write("{}|{}|sil {}\n".format(wav_path, speaker, " ".join(phones))) 52 | -------------------------------------------------------------------------------- /examples/aishell-3/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2022 Jie Chen 4 | # Copyright 2022 Binbin Zhang(binbzha@qq.com) 5 | 6 | [ -f path.sh ] && . path.sh 7 | 8 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 9 | 10 | stage=0 # start from -1 if you need to download data 11 | stop_stage=3 12 | 13 | dataset_url=https://openslr.magicdatatech.com/resources/93/data_aishell3.tgz 14 | dataset_dir=. # path to dataset directory 15 | 16 | dir=exp/v1 # training dir 17 | config=configs/v1.json 18 | 19 | data=data 20 | test_audio=test_audio 21 | 22 | . 
tools/parse_options.sh || exit 1; 23 | 24 | 25 | if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then 26 | # Download data 27 | local/download_data.sh $dataset_url $dataset_dir 28 | fi 29 | 30 | 31 | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 32 | # Prepare data for training/validation 33 | mkdir -p $data 34 | python tools/gen_pinyin_lexicon.py \ 35 | --with-zero-initial --with-tone --with-r \ 36 | $data/lexicon.txt \ 37 | $data/phones.list 38 | python local/prepare_data.py \ 39 | $data/lexicon.txt \ 40 | $dataset_dir/data_aishell3 \ 41 | $data/all.txt 42 | 43 | # Compute spec length (optional, but recommended) 44 | python tools/compute_spec_length.py \ 45 | $data/all.txt \ 46 | $config \ 47 | $data/all_spec_length.txt 48 | mv $data/all_spec_length.txt $data/all.txt 49 | 50 | cat $data/all.txt | awk -F '|' '{print $2}' | \ 51 | sort | uniq | awk '{print $0, NR-1}' > $data/speaker.txt 52 | echo 'sil 0' > $data/phones.txt 53 | cat $data/all.txt | awk -F '|' '{print $3}' | \ 54 | awk '{for (i=1;i<=NF;i++) print $i}' | sort | uniq | \ 55 | grep -v 'sil' | awk '{print $0, NR}' >> $data/phones.txt 56 | 57 | # Split train/validation 58 | shuf --random-source=<(yes 777) $data/all.txt > $data/train.txt 59 | head -n 100 $data/train.txt > $data/val.txt 60 | sed -i '1,100d' $data/train.txt 61 | head -n 10 $data/train.txt > $data/test.txt 62 | sed -i '1,10d' $data/train.txt 63 | fi 64 | 65 | 66 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 67 | num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F ',' '{print NF}') 68 | torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \ 69 | vits/train.py -c $config -m $dir \ 70 | --train_data $data/train.txt \ 71 | --val_data $data/val.txt \ 72 | --speaker_table $data/speaker.txt \ 73 | --phone_table $data/phones.txt \ 74 | --num_workers 8 75 | fi 76 | 77 | 78 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 79 | mkdir -p $test_audio 80 | python vits/inference.py --cfg $config \ 81 | --speaker_table $data/speaker.txt \ 82 | --phone_table $data/phones.txt \ 83 | --checkpoint $dir/G_90000.pth \ 84 | --test_file $data/test.txt \ 85 | --outdir $test_audio 86 | fi 87 | 88 | 89 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 90 | mkdir -p $test_audio 91 | python vits/export_onnx.py --cfg $config \ 92 | --speaker_table $data/speaker.txt \ 93 | --phone_table $data/phones.txt \ 94 | --checkpoint $dir/G_90000.pth \ 95 | --onnx_model $dir/G_90000.onnx 96 | 97 | python vits/inference_onnx.py --cfg $config \ 98 | --speaker_table $data/speaker.txt \ 99 | --phone_table $data/phones.txt \ 100 | --onnx_model $dir/G_90000.onnx \ 101 | --test_file $data/test.txt \ 102 | --outdir $test_audio 103 | fi 104 | -------------------------------------------------------------------------------- /examples/aishell-3/tools: -------------------------------------------------------------------------------- 1 | ../../tools -------------------------------------------------------------------------------- /examples/aishell-3/vits: -------------------------------------------------------------------------------- 1 | ../../wetts/vits -------------------------------------------------------------------------------- /examples/baker/configs/v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | 
"lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 22050, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 512, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256, 47 | "use_wd": true, 48 | "slm_model": "exp/slm/wavlm-base-plus", 49 | "slm_sr": 16000, 50 | "slm_hidden": 768, 51 | "slm_nlayers": 13, 52 | "slm_initial_channel": 64 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /examples/baker/configs/v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 22050, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 128, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/baker/configs/v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": true, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 16000, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "2", 39 | "upsample_rates": 
[8,8,4], 40 | "upsample_kernel_sizes": [16,16,8], 41 | "upsample_initial_channel": 256, 42 | "resblock_kernel_sizes": [3,5,7], 43 | "resblock_dilation_sizes": [[1,2], [2,6], [3,12]], 44 | "n_layers_q": 3, 45 | "use_sdp": false, 46 | "use_spectral_norm": false, 47 | "gin_channels": 256 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/baker/configs/vits2_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "use_mel_posterior_encoder": true, 21 | "max_wav_value": 32768.0, 22 | "sampling_rate": 22050, 23 | "filter_length": 1024, 24 | "hop_length": 256, 25 | "win_length": 1024, 26 | "n_mel_channels": 80, 27 | "mel_fmin": 0.0, 28 | "mel_fmax": null 29 | }, 30 | "model": { 31 | "use_mel_posterior_encoder": true, 32 | "use_transformer_flows": true, 33 | "transformer_flow_type": "pre_conv", 34 | "use_spk_conditioned_encoder": false, 35 | "use_noise_scaled_mas": true, 36 | "use_duration_discriminator": true, 37 | "inter_channels": 192, 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "n_heads": 2, 41 | "n_layers": 6, 42 | "kernel_size": 3, 43 | "p_dropout": 0.1, 44 | "resblock": "1", 45 | "resblock_kernel_sizes": [3,7,11], 46 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 47 | "upsample_rates": [8,8,2,2], 48 | "upsample_initial_channel": 512, 49 | "upsample_kernel_sizes": [16,16,4,4], 50 | "n_layers_q": 3, 51 | "use_sdp": true, 52 | "use_spectral_norm": false, 53 | "gin_channels": 256, 54 | "use_wd": true, 55 | "slm_model": "exp/slm/wavlm-base-plus", 56 | "slm_sr": 16000, 57 | "slm_hidden": 768, 58 | "slm_nlayers": 13, 59 | "slm_initial_channel": 64 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /examples/baker/configs/vits2_vocos_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "use_mel_posterior_encoder": true, 21 | "max_wav_value": 32768.0, 22 | "sampling_rate": 24000, 23 | "filter_length": 1024, 24 | "hop_length": 256, 25 | "win_length": 1024, 26 | "n_mel_channels": 100, 27 | "mel_fmin": 0.0, 28 | "mel_fmax": null 29 | }, 30 | "model": { 31 | "vocoder_type": "vocos", 32 | "use_mrd_disc": true, 33 | "use_mel_posterior_encoder": true, 34 | "use_transformer_flows": true, 35 | "transformer_flow_type": "pre_conv", 36 | "use_spk_conditioned_encoder": false, 37 | "use_noise_scaled_mas": true, 38 | "use_duration_discriminator": true, 39 | "inter_channels": 192, 40 | "hidden_channels": 192, 41 | "filter_channels": 768, 42 | "n_heads": 2, 43 | "n_layers": 6, 44 | "kernel_size": 3, 45 | "p_dropout": 0.1, 46 | "vocos_channels": 512, 47 | "vocos_h_channels": 1536, 48 | "vocos_out_channels": 1026, 49 | 
"vocos_num_layers": 8, 50 | "vocos_istft_config": { 51 | "n_fft": 1024, 52 | "hop_length": 256, 53 | "win_length": 1024, 54 | "center": true 55 | }, 56 | "resblock": "1", 57 | "resblock_kernel_sizes": [3,7,11], 58 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 59 | "upsample_rates": [8,8,2,2], 60 | "upsample_initial_channel": 512, 61 | "upsample_kernel_sizes": [16,16,4,4], 62 | "n_layers_q": 3, 63 | "use_sdp": true, 64 | "use_spectral_norm": false, 65 | "gin_channels": 256 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /examples/baker/configs/vocos.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-9, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 8192, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0 21 | }, 22 | "data": { 23 | "max_wav_value": 32768.0, 24 | "sampling_rate": 16000, 25 | "filter_length": 1024, 26 | "hop_length": 256, 27 | "win_length": 1024, 28 | "n_mel_channels": 80, 29 | "mel_fmin": 0.0, 30 | "mel_fmax": null 31 | }, 32 | "model": { 33 | "use_mel_posterior_encoder": false, 34 | "vocoder_type": "vocos", 35 | "inter_channels": 192, 36 | "hidden_channels": 192, 37 | "filter_channels": 768, 38 | "n_heads": 2, 39 | "n_layers": 6, 40 | "kernel_size": 3, 41 | "p_dropout": 0.1, 42 | "vocos_channels": 512, 43 | "vocos_h_channels": 1536, 44 | "vocos_out_channels": 1026, 45 | "vocos_num_layers": 8, 46 | "vocos_istft_config": { 47 | "n_fft": 1024, 48 | "hop_length": 256, 49 | "win_length": 1024, 50 | "center": true 51 | }, 52 | "resblock": "1", 53 | "resblock_kernel_sizes": [3,7,11], 54 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 55 | "upsample_rates": [8,8,2,2], 56 | "upsample_initial_channel": 512, 57 | "upsample_kernel_sizes": [16,16,4,4], 58 | "n_layers_q": 3, 59 | "use_sdp": false, 60 | "use_spectral_norm": false, 61 | "gin_channels": 256 62 | } 63 | } -------------------------------------------------------------------------------- /examples/baker/local/prepare_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | lexicon = {} 5 | 6 | with open(sys.argv[1], "r", encoding="utf8") as fin: 7 | for line in fin: 8 | arr = line.strip().split() 9 | lexicon[arr[0]] = arr[1:] 10 | 11 | with open(sys.argv[2], "r", encoding="utf8") as fin: 12 | lines = fin.readlines() 13 | for i in range(0, len(lines), 2): 14 | key = lines[i][:6] 15 | content = lines[i][7:].strip() 16 | content = re.sub("[。,、“”?:……!( )—;]", "", content) 17 | if "P" in content: # ignore utt 002365 18 | continue 19 | chars = [] 20 | prosody = {} 21 | 22 | j = 0 23 | while j < len(content): 24 | if content[j] == "#": 25 | prosody[len(chars) - 1] = content[j : j + 2] 26 | j += 2 27 | else: 28 | chars.append(content[j]) 29 | j += 1 30 | if key == "005107": 31 | lines[i + 1] = lines[i + 1].replace(" ng1", " en1") 32 | syllable = lines[i + 1].strip().split() 33 | s_index = 0 34 | phones = [] 35 | for k, char in enumerate(chars): 36 | # 儿化音处理 37 | er_flag = False 38 | if char == "儿" and ( 39 | s_index == len(syllable) or syllable[s_index][0:2] != "er" 40 | ): 41 | er_flag = True 42 | else: 43 | phones.extend(lexicon[syllable[s_index]]) 44 | s_index += 1 45 | if k in 
prosody: 46 | if er_flag: 47 | phones[-1] = prosody[k] 48 | else: 49 | phones.append(prosody[k]) 50 | else: 51 | phones.append("#0") 52 | print("{}/{}.wav|baker|sil {}\n".format(sys.argv[3], key, " ".join(phones))) 53 | -------------------------------------------------------------------------------- /examples/baker/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2022 Binbin Zhang(binbzha@qq.com) 4 | 5 | [ -f path.sh ] && . path.sh 6 | 7 | export CUDA_VISIBLE_DEVICES="0,1,2,3" # specify your gpu id for training 8 | 9 | stage=0 # start from -1 if you need to download data 10 | stop_stage=3 11 | 12 | dir=exp/v3 # training dir 13 | config=configs/v3.json 14 | 15 | # Please download data from https://www.data-baker.com/data/index/TNtts, and 16 | # set `raw_data_dir` to your data. 17 | raw_data_dir=. # path to dataset directory 18 | data=data 19 | test_audio=test_audio 20 | ckpt_step=200000 21 | 22 | . tools/parse_options.sh || exit 1; 23 | 24 | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 25 | # Prepare data for training/validation 26 | mkdir -p $data 27 | python tools/gen_pinyin_lexicon.py \ 28 | --with-zero-initial --with-tone --with-r \ 29 | $data/lexicon.txt \ 30 | $data/phones.list 31 | python local/prepare_data.py \ 32 | $data/lexicon.txt \ 33 | $raw_data_dir/ProsodyLabeling/000001-010000.txt \ 34 | $raw_data_dir/Wave > $data/all.txt 35 | 36 | cat $data/all.txt | awk -F '|' '{print $2}' | \ 37 | sort | uniq | awk '{print $0, NR-1}' > $data/speaker.txt 38 | echo 'sil 0' > $data/phones.txt 39 | cat $data/all.txt | awk -F '|' '{print $3}' | \ 40 | awk '{for (i=1;i<=NF;i++) print $i}' | sort | uniq | \ 41 | grep -v 'sil' | awk '{print $0, NR}' >> $data/phones.txt 42 | 43 | # Split train/validation 44 | shuf --random-source=<(yes 777) $data/all.txt > $data/train.txt 45 | head -n 100 $data/train.txt > $data/val.txt 46 | sed -i '1,100d' $data/train.txt 47 | head -n 10 $data/train.txt > $data/test.txt 48 | sed -i '1,10d' $data/train.txt 49 | fi 50 | 51 | 52 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 53 | num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F ',' '{print NF}') 54 | torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \ 55 | vits/train.py -c $config -m $dir \ 56 | --train_data $data/train.txt \ 57 | --val_data $data/val.txt \ 58 | --speaker_table $data/speaker.txt \ 59 | --phone_table $data/phones.txt \ 60 | --num_workers 8 61 | fi 62 | 63 | 64 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 65 | mkdir -p $test_audio 66 | python vits/inference.py --cfg $config \ 67 | --speaker_table $data/speaker.txt \ 68 | --phone_table $data/phones.txt \ 69 | --checkpoint $dir/G_$ckpt_step.pth \ 70 | --test_file $data/test.txt \ 71 | --outdir $test_audio 72 | fi 73 | 74 | 75 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 76 | mkdir -p $test_audio 77 | python vits/export_onnx.py --cfg $config \ 78 | --speaker_table $data/speaker.txt \ 79 | --phone_table $data/phones.txt \ 80 | --checkpoint $dir/G_$ckpt_step.pth \ 81 | --onnx_model $dir/G_$ckpt_step.onnx 82 | 83 | python vits/inference_onnx.py --cfg $config \ 84 | --speaker_table $data/speaker.txt \ 85 | --phone_table $data/phones.txt \ 86 | --onnx_model $dir/G_$ckpt_step.onnx \ 87 | --test_file $data/test.txt \ 88 | --outdir $test_audio 89 | fi 90 | 91 | if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then 92 | mkdir -p $test_audio 93 | python vits/export_onnx.py --cfg $config \ 94 | --streaming \ 95 | --speaker_table 
$data/speaker.txt \ 96 | --phone_table $data/phones.txt \ 97 | --checkpoint $dir/G_$ckpt_step.pth \ 98 | --onnx_model $dir/G_$ckpt_step.onnx 99 | 100 | python vits/inference_onnx.py --cfg $config \ 101 | --streaming \ 102 | --speaker_table $data/speaker.txt \ 103 | --phone_table $data/phones.txt \ 104 | --onnx_model $dir/G_$ckpt_step.onnx \ 105 | --test_file $data/test.txt \ 106 | --outdir $test_audio 107 | fi 108 | -------------------------------------------------------------------------------- /examples/baker/tools: -------------------------------------------------------------------------------- 1 | ../../tools -------------------------------------------------------------------------------- /examples/baker/vits: -------------------------------------------------------------------------------- 1 | ../../wetts/vits -------------------------------------------------------------------------------- /examples/chinese_prosody_polyphone/README.md: -------------------------------------------------------------------------------- 1 | ## Model Method 2 | 3 | Please see [doc](../../wetts/frontend/README.md) for details. 4 | 5 | ## Data Description 6 | 7 | Here are the details of the prosody and polyphone data used in the recipe. 8 | The data are either collected from web or contributed by the community. 9 | 10 | 11 | ### Polyphone 12 | 13 | | corpus | number | source or contributors | 14 | |--------|--------|------------------------------------| 15 | | g2pM | 100000 | https://github.com/kakaobrain/g2pM | 16 | | | | | 17 | 18 | TODO(Binbin Zhang): Add more data 19 | 20 | 21 | ### Prosody 22 | 23 | | corpus | number | source or contributors | 24 | |---------|--------|---------------------------------------------| 25 | | biaobei | 10000 | https://www.data-baker.com/open_source.html | 26 | | | | | 27 | 28 | TODO(Binbin Zhang): Add more data 29 | 30 | ## Benchmark 31 | 32 | BERT-MLT is for polyphone and prosody joint training. 33 | 34 | ### Polyphone 35 | 36 | | system | ACC | 37 | |----------------|--------| 38 | | BERT-polyphone | 0.9778 | 39 | | BERT-MLT | 0.9797 | 40 | 41 | 42 | ### Prosody 43 | 44 | | system | PW-F1 | PPH-F1 | IPH-F1 | 45 | |---------------------------|--------|--------|--------| 46 | | BERT-prosody | 0.9308 | 0.8058 | 0.8596 | 47 | | BERT-MLT | 0.9334 | 0.8088 | 0.8559 | 48 | | BERT-prosody (exclude #4) | 0.9233 | 0.7074 | 0.6120 | 49 | | BERT-MLT (exclude #4) | 0.9261 | 0.7146 | 0.6140 | 50 | -------------------------------------------------------------------------------- /examples/chinese_prosody_polyphone/frontend: -------------------------------------------------------------------------------- 1 | ../../wetts/frontend -------------------------------------------------------------------------------- /examples/chinese_prosody_polyphone/lexicon/prosody.txt: -------------------------------------------------------------------------------- 1 | #0 2 | #1 3 | #2 4 | #3 5 | #4 6 | -------------------------------------------------------------------------------- /examples/chinese_prosody_polyphone/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2022 Binbin Zhang(binbzha@qq.com) 3 | 4 | stage=0 5 | stop_stage=4 6 | url=https://wetts-1256283475.cos.ap-shanghai.myqcloud.com/data 7 | 8 | dir=exp 9 | 10 | . 
tools/parse_options.sh 11 | 12 | 13 | if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then 14 | # Download prosody and polyphone 15 | mkdir -p data/download 16 | pushd data/download 17 | wget -c $url/polyphone.tar.gz && tar zxf polyphone.tar.gz 18 | wget -c $url/prosody.tar.gz && tar zxf prosody.tar.gz 19 | popd 20 | fi 21 | 22 | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 23 | # Combine prosody data 24 | mkdir -p data/prosody 25 | cat data/download/prosody/biaobei/train.txt > data/prosody/train.txt 26 | cat data/download/prosody/biaobei/cv.txt > data/prosody/cv.txt 27 | # Combine polyphone data 28 | mkdir -p data/polyphone 29 | cat data/download/polyphone/g2pM/train.txt > data/polyphone/train.txt 30 | cat data/download/polyphone/g2pM/dev.txt > data/polyphone/cv.txt 31 | cat data/download/polyphone/g2pM/test.txt > data/polyphone/test.txt 32 | fi 33 | 34 | 35 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 36 | mkdir -p $dir 37 | python frontend/train.py \ 38 | --gpu 2 \ 39 | --lr 0.001 \ 40 | --num_epochs 10 \ 41 | --batch_size 32 \ 42 | --log_interval 10 \ 43 | --polyphone_weight 0.1 \ 44 | --polyphone_dict lexicon/polyphone.txt \ 45 | --train_polyphone_data data/polyphone/train.txt \ 46 | --cv_polyphone_data data/polyphone/cv.txt \ 47 | --prosody_dict lexicon/prosody.txt \ 48 | --train_prosody_data data/prosody/train.txt \ 49 | --cv_prosody_data data/prosody/cv.txt \ 50 | --model_dir $dir 51 | fi 52 | 53 | 54 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 55 | # Test polyphone, metric: accuracy 56 | python frontend/test_polyphone.py \ 57 | --polyphone_dict lexicon/polyphone.txt \ 58 | --prosody_dict lexicon/prosody.txt \ 59 | --test_data data/polyphone/test.txt \ 60 | --batch_size 32 \ 61 | --checkpoint $dir/9.pt 62 | 63 | # Test prosody, metric: F1-score 64 | python frontend/test_prosody.py \ 65 | --polyphone_dict lexicon/polyphone.txt \ 66 | --prosody_dict lexicon/prosody.txt \ 67 | --test_data data/prosody/cv.txt \ 68 | --batch_size 32 \ 69 | --checkpoint $dir/9.pt 70 | fi 71 | 72 | 73 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 74 | # export onnx model 75 | python frontend/export_onnx.py \ 76 | --polyphone_dict lexicon/polyphone.txt \ 77 | --prosody_dict lexicon/prosody.txt \ 78 | --checkpoint $dir/9.pt \ 79 | --onnx_model $dir/9.onnx 80 | fi 81 | 82 | if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then 83 | # g2p 84 | # text: 八方财宝进 85 | # pinyin ['ba1', 'fang1', 'cai2', 'bao3', 'jin4'] 86 | # prosody [0 1 0 0 4] 87 | python frontend/g2p_prosody.py \ 88 | --text "八方财宝进" \ 89 | --hanzi2pinyin_file lexicon/pinyin_dict.txt \ 90 | --polyphone_file lexicon/polyphone.txt \ 91 | --polyphone_prosody_model $dir/9.onnx 92 | fi 93 | -------------------------------------------------------------------------------- /examples/chinese_prosody_polyphone/tools: -------------------------------------------------------------------------------- 1 | ../../tools -------------------------------------------------------------------------------- /examples/ljspeech/configs/v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | 
"max_wav_value": 32768.0, 21 | "sampling_rate": 22050, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 512, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/ljspeech/configs/v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 22050, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 128, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/ljspeech/configs/v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": true, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 16000, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "2", 39 | "upsample_rates": [8,8,4], 40 | "upsample_kernel_sizes": [16,16,8], 41 | "upsample_initial_channel": 256, 42 | "resblock_kernel_sizes": [3,5,7], 43 | "resblock_dilation_sizes": [[1,2], [2,6], [3,12]], 44 | "n_layers_q": 3, 45 | "use_sdp": false, 46 | "use_spectral_norm": false, 47 | "gin_channels": 256 48 | } 49 | } 50 | 
-------------------------------------------------------------------------------- /examples/ljspeech/local/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2022 Binbin Zhang(binbzha@qq.com) 3 | 4 | if [ $# -ne 2 ]; then 5 | echo "Usage: $0 <url> <dir>" 6 | exit 0; 7 | fi 8 | 9 | url=$1 10 | dir=$2 11 | 12 | [ ! -d $dir ] && mkdir -p $dir 13 | 14 | # Download data 15 | if [ ! -f $dir/LJSpeech-1.1.tar.bz2 ]; then 16 | if ! which wget >/dev/null; then 17 | echo "$0: wget is not installed." 18 | exit 1; 19 | fi 20 | echo "$0: downloading data from $url. This may take some time, please wait" 21 | 22 | cd $dir 23 | if ! wget --no-check-certificate $url; then 24 | echo "$0: error executing wget $url" 25 | exit 1; 26 | fi 27 | fi 28 | 29 | 30 | cd $dir 31 | if ! tar -xvf LJSpeech-1.1.tar.bz2; then 32 | echo "$0: error un-tarring archive $dir/LJSpeech-1.1.tar.bz2" 33 | exit 1; 34 | fi 35 | -------------------------------------------------------------------------------- /examples/ljspeech/local/prepare_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) 2022 Binbin Zhang(binbzha@qq.com) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import argparse 18 | import csv 19 | import os 20 | 21 | from tools.cleaners import english_cleaners 22 | 23 | 24 | def get_args(): 25 | parser = argparse.ArgumentParser(description="prepare data") 26 | parser.add_argument("--data_dir", required=True, help="input data dir") 27 | parser.add_argument("--output", required=True, help="output file") 28 | parser.add_argument("--use_prosody", default=True, help="whether to use prosody") 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(): 34 | args = get_args() 35 | 36 | metadata = os.path.join(args.data_dir, "metadata.csv") 37 | with open(metadata) as fin, open(args.output, "w", encoding="utf8") as fout: 38 | for row in csv.reader(fin, delimiter="|"): 39 | wav_path = os.path.join(args.data_dir, f"wavs/{row[0]}.wav") 40 | phones = english_cleaners(row[-1], args.use_prosody) 41 | fout.write("{}|ljspeech|sil {}\n".format(wav_path, " ".join(phones))) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /examples/ljspeech/path.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=.:$PYTHONPATH -------------------------------------------------------------------------------- /examples/ljspeech/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2022 Binbin Zhang(binbzha@qq.com) 4 | 5 | [ -f path.sh ] && . 
path.sh 6 | 7 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 8 | 9 | stage=0 # start from -1 if you need to download data 10 | stop_stage=3 11 | 12 | dataset_url=https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 13 | dataset_dir=. # path to dataset directory 14 | 15 | dir=exp/v3 # training dir 16 | config=configs/v3.json 17 | 18 | data=data 19 | test_audio=test_audio 20 | 21 | . tools/parse_options.sh || exit 1; 22 | 23 | 24 | if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then 25 | # Download data 26 | local/download_data.sh $dataset_url $dataset_dir 27 | fi 28 | 29 | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 30 | # Prepare data for training/validation 31 | mkdir -p $data 32 | python local/prepare_data.py \ 33 | --data_dir $(realpath $dataset_dir)/LJSpeech-1.1 \ 34 | --output $data/out.txt 35 | sed 's/#[0-9] //g' $data/out.txt > $data/all.txt 36 | 37 | cat $data/all.txt | awk -F '|' '{print $2}' | \ 38 | sort | uniq | awk '{print $0, NR-1}' > $data/speaker.txt 39 | echo 'sil 0' > $data/phones.txt 40 | cat $data/all.txt | awk -F '|' '{print $3}' | \ 41 | awk '{for (i=1;i<=NF;i++) print $i}' | sort | uniq | \ 42 | grep -v 'sil' | awk '{print $0, NR}' >> $data/phones.txt 43 | 44 | # Split train/validation 45 | shuf --random-source=<(yes 777) $data/all.txt > $data/train.txt 46 | head -n 100 $data/train.txt > $data/val.txt 47 | sed -i '1,100d' $data/train.txt 48 | head -n 10 $data/train.txt > $data/test.txt 49 | sed -i '1,10d' $data/train.txt 50 | fi 51 | 52 | 53 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 54 | num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F ',' '{print NF}') 55 | torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \ 56 | vits/train.py -c $config -m $dir \ 57 | --train_data $data/train.txt \ 58 | --val_data $data/val.txt \ 59 | --speaker_table $data/speaker.txt \ 60 | --phone_table $data/phones.txt \ 61 | --num_workers 8 62 | fi 63 | 64 | 65 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 66 | mkdir -p $test_audio 67 | python vits/inference.py --cfg $config \ 68 | --speaker_table $data/speaker.txt \ 69 | --phone_table $data/phones.txt \ 70 | --checkpoint $dir/G_90000.pth \ 71 | --test_file $data/test.txt \ 72 | --outdir $test_audio 73 | fi 74 | 75 | 76 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 77 | mkdir -p $test_audio 78 | python vits/export_onnx.py --cfg $config \ 79 | --speaker_table $data/speaker.txt \ 80 | --phone_table $data/phones.txt \ 81 | --checkpoint $dir/G_90000.pth \ 82 | --onnx_model $dir/G_90000.onnx 83 | 84 | python vits/inference_onnx.py --cfg $config \ 85 | --speaker_table $data/speaker.txt \ 86 | --phone_table $data/phones.txt \ 87 | --onnx_model $dir/G_90000.onnx \ 88 | --test_file $data/test.txt \ 89 | --outdir $test_audio 90 | fi 91 | -------------------------------------------------------------------------------- /examples/ljspeech/tools: -------------------------------------------------------------------------------- 1 | ../../tools -------------------------------------------------------------------------------- /examples/ljspeech/vits: -------------------------------------------------------------------------------- 1 | ../../wetts/vits -------------------------------------------------------------------------------- /examples/multilingual/configs/v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 
0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 22050, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 512, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/multilingual/configs/v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 22050, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "1", 39 | "upsample_rates": [8,8,2,2], 40 | "upsample_kernel_sizes": [16,16,4,4], 41 | "upsample_initial_channel": 128, 42 | "resblock_kernel_sizes": [3,7,11], 43 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 44 | "n_layers_q": 3, 45 | "use_spectral_norm": false, 46 | "gin_channels": 256 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/multilingual/configs/v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": true, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "max_wav_value": 32768.0, 21 | "sampling_rate": 16000, 22 | "filter_length": 1024, 23 | "hop_length": 256, 24 | "win_length": 1024, 25 | "n_mel_channels": 80, 26 | "mel_fmin": 0.0, 27 | "mel_fmax": null 28 | }, 29 | "model": { 30 | "use_mel_posterior_encoder": false, 31 | "inter_channels": 192, 32 | "hidden_channels": 192, 33 | "filter_channels": 768, 34 | "n_heads": 2, 35 | "n_layers": 6, 36 | "kernel_size": 3, 37 | "p_dropout": 0.1, 38 | "resblock": "2", 39 | "upsample_rates": [8,8,4], 40 | "upsample_kernel_sizes": [16,16,8], 41 | "upsample_initial_channel": 
256, 42 | "resblock_kernel_sizes": [3,5,7], 43 | "resblock_dilation_sizes": [[1,2], [2,6], [3,12]], 44 | "n_layers_q": 3, 45 | "use_sdp": false, 46 | "use_spectral_norm": false, 47 | "gin_channels": 256 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/multilingual/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2022 Binbin Zhang(binbzha@qq.com) 4 | 5 | [ -f path.sh ] && . path.sh 6 | 7 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 8 | 9 | stage=0 # start from -1 if you need to download data 10 | stop_stage=3 11 | 12 | dir=exp/v3 # training dir 13 | config=configs/v3.json 14 | 15 | data=data 16 | test_audio=test_audio 17 | 18 | . tools/parse_options.sh || exit 1; 19 | 20 | 21 | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 22 | mkdir -p $data 23 | cat ../baker/$data/all.txt \ 24 | ../ljspeech/$data/out.txt > $data/all.txt 25 | 26 | cat $data/all.txt | awk -F '|' '{print $2}' | \ 27 | sort | uniq | awk '{print $0, NR-1}' > $data/speaker.txt 28 | echo 'sil 0' > $data/phones.txt 29 | cat $data/all.txt | awk -F '|' '{print $3}' | \ 30 | awk '{for (i=1;i<=NF;i++) print $i}' | sort | uniq | \ 31 | grep -v 'sil' | awk '{print $0, NR}' >> $data/phones.txt 32 | 33 | # Split train/validation 34 | shuf --random-source=<(yes 777) $data/all.txt > $data/train.txt 35 | head -n 100 $data/train.txt > $data/val.txt 36 | sed -i '1,100d' $data/train.txt 37 | head -n 10 $data/train.txt > $data/test.txt 38 | sed -i '1,10d' $data/train.txt 39 | fi 40 | 41 | 42 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 43 | num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F ',' '{print NF}') 44 | torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \ 45 | vits/train.py -c $config -m $dir \ 46 | --train_data $data/train.txt \ 47 | --val_data $data/val.txt \ 48 | --speaker_table $data/speaker.txt \ 49 | --phone_table $data/phones.txt \ 50 | --num_workers 8 51 | fi 52 | 53 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 54 | mkdir -p $test_audio 55 | python vits/inference.py --cfg $config \ 56 | --speaker_table $data/speaker.txt \ 57 | --phone_table $data/phones.txt \ 58 | --checkpoint $dir/G_90000.pth \ 59 | --test_file $data/test.txt \ 60 | --outdir $test_audio 61 | fi 62 | 63 | 64 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 65 | mkdir -p $test_audio 66 | python vits/export_onnx.py --cfg $config \ 67 | --speaker_table $data/speaker.txt \ 68 | --phone_table $data/phones.txt \ 69 | --checkpoint $dir/G_90000.pth \ 70 | --onnx_model $dir/G_90000.onnx 71 | 72 | python vits/inference_onnx.py --cfg $config \ 73 | --speaker_table $data/speaker.txt \ 74 | --phone_table $data/phones.txt \ 75 | --onnx_model $dir/G_90000.onnx \ 76 | --test_file $data/test.txt \ 77 | --outdir $test_audio 78 | fi 79 | -------------------------------------------------------------------------------- /examples/multilingual/tools: -------------------------------------------------------------------------------- 1 | ../../tools -------------------------------------------------------------------------------- /examples/multilingual/vits: -------------------------------------------------------------------------------- 1 | ../../wetts/vits -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | g2p_en 2 | librosa 3 | nltk 4 | onnx 5 | onnxruntime 6 | 
scikit-learn 7 | scipy 8 | tensorboard 9 | torch 10 | torchvision 11 | tqdm 12 | transformers 13 | huggingface_hub 14 | soundfile 15 | -------------------------------------------------------------------------------- /runtime/android/.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .gradle 3 | /local.properties 4 | /.idea/caches 5 | /.idea/libraries 6 | /.idea/modules.xml 7 | /.idea/workspace.xml 8 | /.idea/navEditor.xml 9 | /.idea/assetWizardSettings.xml 10 | .DS_Store 11 | /build 12 | /captures 13 | .externalNativeBuild 14 | .cxx 15 | local.properties 16 | -------------------------------------------------------------------------------- /runtime/android/README.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | Most AI engineers are not familiar with Android development, so this is a simple ‘how to’. 4 | 5 | 1. Train your model with your data 6 | 7 | 2. Export the PyTorch model to an ONNX model 8 | 9 | 3. Convert the ONNX model for mobile deployment 10 | 11 | ```bash 12 | python -m onnxruntime.tools.convert_onnx_models_to_ort your-model.onnx 13 | ``` 14 | 15 | You will get `your-model.ort` and `your-model.with_runtime_opt.ort` (see the sketch after step 5). 16 | 17 | ```bash 18 | $ tree app/src/main/assets 19 | app/src/main/assets 20 | ├── frontend 21 | │   ├── final.ort 22 | │   ├── frontend.flags 23 | │   ├── g2p_en 24 | │   │   ├── README.md 25 | │   │   ├── cmudict.dict 26 | │   │   ├── model.fst 27 | │   │   └── phones.sym 28 | │   ├── lexicon 29 | │   │   ├── lexicon.txt 30 | │   │   ├── pinyin_dict.txt 31 | │   │   ├── polyphone.txt 32 | │   │   ├── polyphone_phone.txt 33 | │   │   └── prosody.txt 34 | │   ├── tn 35 | │   │   ├── zh_tn_tagger.fst 36 | │   │   └── zh_tn_verbalizer.fst 37 | │   └── vocab.txt 38 | └── vits 39 | ├── final.ort 40 | ├── phones.txt 41 | ├── speaker.txt 42 | └── vits.flags 43 | 44 | $ head app/src/main/assets/frontend/frontend.flags 45 | --tagger=frontend/tn/zh_tn_tagger.fst 46 | --verbalizer=frontend/tn/zh_tn_verbalizer.fst 47 | --cmudict=frontend/g2p_en/cmudict.dict 48 | --g2p_en_model=frontend/g2p_en/model.fst 49 | --g2p_en_sym=frontend/g2p_en/phones.sym 50 | --char2pinyin=frontend/lexicon/pinyin_dict.txt 51 | --pinyin2id=frontend/lexicon/polyphone.txt 52 | --pinyin2phones=frontend/lexicon/lexicon.txt 53 | --vocab=frontend/vocab.txt 54 | --g2p_prosody_model=frontend/final.ort 55 | 56 | $ cat app/src/main/assets/vits/vits.flags 57 | --sampling_rate=16000 58 | --speaker2id=vits/speaker.txt 59 | --phone2id=vits/phones.txt 60 | --vits_model=vits/final.ort 61 | ``` 62 | 63 | 4. Install Android Studio, open the wetts/runtime/android path, and build 64 | 65 | 5. Install `app/build/outputs/apk/debug/app-debug.apk` to your phone and try it.
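Concretely, with the models exported by the recipes in examples/ (for instance `9.onnx` from the prosody/polyphone frontend recipe and `G_90000.onnx` from a vits recipe), steps 2 and 3 map to something like the following; the exp/ paths are illustrative and the renaming simply matches the `final.ort` layout shown in the tree above:

```bash
# Convert the exported ONNX models to ORT format (step 3); paths are illustrative
python -m onnxruntime.tools.convert_onnx_models_to_ort exp/frontend/9.onnx
python -m onnxruntime.tools.convert_onnx_models_to_ort exp/v3/G_90000.onnx

# Copy them into the asset layout expected by the app
cp exp/frontend/9.ort app/src/main/assets/frontend/final.ort
cp exp/v3/G_90000.ort app/src/main/assets/vits/final.ort
```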
66 | -------------------------------------------------------------------------------- /runtime/android/app/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /runtime/android/app/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'com.android.application' 3 | } 4 | 5 | android { 6 | compileSdk 32 7 | 8 | configurations { 9 | extractForNativeBuild 10 | } 11 | 12 | defaultConfig { 13 | applicationId "cn.org.wenet.wetts" 14 | minSdk 21 15 | targetSdk 32 16 | versionCode 1 17 | versionName "1.0" 18 | 19 | testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" 20 | externalNativeBuild { 21 | cmake { 22 | targets "wetts" 23 | } 24 | } 25 | } 26 | 27 | buildTypes { 28 | release { 29 | minifyEnabled false 30 | proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' 31 | } 32 | } 33 | externalNativeBuild { 34 | cmake { 35 | version "3.18.1" 36 | path "src/main/cpp/CMakeLists.txt" 37 | } 38 | } 39 | compileOptions { 40 | sourceCompatibility JavaVersion.VERSION_1_8 41 | targetCompatibility JavaVersion.VERSION_1_8 42 | } 43 | } 44 | 45 | dependencies { 46 | implementation 'androidx.appcompat:appcompat:1.3.0' 47 | implementation 'com.google.android.material:material:1.4.0' 48 | implementation 'androidx.constraintlayout:constraintlayout:2.0.4' 49 | implementation 'com.microsoft.onnxruntime:onnxruntime-android:1.15.1' 50 | extractForNativeBuild 'com.microsoft.onnxruntime:onnxruntime-android:1.15.1' 51 | implementation 'com.github.pengzhendong:wenet-openfst-android:1.0.2' 52 | extractForNativeBuild 'com.github.pengzhendong:wenet-openfst-android:1.0.2' 53 | testImplementation 'junit:junit:4.13.2' 54 | androidTestImplementation 'androidx.test.ext:junit:1.1.3' 55 | androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0' 56 | } 57 | 58 | task extractAARForNativeBuild { 59 | doLast { 60 | configurations.extractForNativeBuild.files.each { 61 | def file = it.absoluteFile 62 | copy { 63 | from zipTree(file) 64 | into "$buildDir/$file.name" 65 | include "headers/**" 66 | include "jni/**" 67 | } 68 | } 69 | } 70 | } 71 | 72 | tasks.whenTaskAdded { task -> 73 | if (task.name.contains('externalNativeBuild')) { 74 | task.dependsOn(extractAARForNativeBuild) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /runtime/android/app/proguard-rules.pro: -------------------------------------------------------------------------------- 1 | # Add project specific ProGuard rules here. 2 | # You can control the set of applied configuration files using the 3 | # proguardFiles setting in build.gradle. 4 | # 5 | # For more details, see 6 | # http://developer.android.com/guide/developing/tools/proguard.html 7 | 8 | # If your project uses WebView with JS, uncomment the following 9 | # and specify the fully qualified class name to the JavaScript interface 10 | # class: 11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview { 12 | # public *; 13 | #} 14 | 15 | # Uncomment this to preserve the line number information for 16 | # debugging stack traces. 17 | #-keepattributes SourceFile,LineNumberTable 18 | 19 | # If you keep the line number information, uncomment this to 20 | # hide the original source file name. 
21 | #-renamesourcefileattribute SourceFile -------------------------------------------------------------------------------- /runtime/android/app/src/androidTest/java/cn/org/wenet/wetts/ExampleInstrumentedTest.java: -------------------------------------------------------------------------------- 1 | package cn.org.wenet.wetts; 2 | 3 | import android.content.Context; 4 | 5 | import androidx.test.platform.app.InstrumentationRegistry; 6 | import androidx.test.ext.junit.runners.AndroidJUnit4; 7 | 8 | import org.junit.Test; 9 | import org.junit.runner.RunWith; 10 | 11 | import static org.junit.Assert.*; 12 | 13 | /** 14 | * Instrumented test, which will execute on an Android device. 15 | * 16 | * @see Testing documentation 17 | */ 18 | @RunWith(AndroidJUnit4.class) 19 | public class ExampleInstrumentedTest { 20 | @Test 21 | public void useAppContext() { 22 | // Context of the app under test. 23 | Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); 24 | assertEquals("cn.org.wenet.wetts", appContext.getPackageName()); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 16 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/assets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/wetts/0abb5117171b305f9150feba5d9bb1c1796088b9/runtime/android/app/src/main/assets/.gitkeep -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4.1) 2 | project(wetts CXX) 3 | set(CMAKE_CXX_STANDARD 14) 4 | set(CMAKE_VERBOSE_MAKEFILE on) 5 | 6 | set(build_DIR ${CMAKE_SOURCE_DIR}/../../../build) 7 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) 8 | 9 | file(GLOB ONNXRUNTIME_INCLUDE_DIRS ${build_DIR}/onnxruntime*.aar/headers) 10 | file(GLOB ONNXRUNTIME_LINK_DIRS ${build_DIR}/onnxruntime*.aar/jni/${ANDROID_ABI}) 11 | link_directories(${ONNXRUNTIME_LINK_DIRS}) 12 | include_directories(${ONNXRUNTIME_INCLUDE_DIRS}) 13 | 14 | set(openfst_BINARY_DIR ${build_DIR}/wenet-openfst-android-1.0.2.aar/jni) 15 | link_directories(${openfst_BINARY_DIR}/${ANDROID_ABI}) 16 | link_libraries(log gflags_nothreads glog fst) 17 | include_directories(${openfst_BINARY_DIR}/include) 18 | 19 | include(wetextprocessing) 20 | include_directories(${CMAKE_SOURCE_DIR}) 21 | 22 | add_subdirectory(utils) 23 | add_subdirectory(frontend) 24 | add_subdirectory(model) 25 | add_dependencies(frontend wetextprocessing) 26 | 27 | add_library(wetts SHARED wetts.cc) 28 | target_link_libraries(wetts PUBLIC tts_model onnxruntime) 29 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/cmake: -------------------------------------------------------------------------------- 1 | ../../../../../core/cmake -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/frontend: -------------------------------------------------------------------------------- 1 | ../../../../../core/frontend 
-------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/model: -------------------------------------------------------------------------------- 1 | ../../../../../core/model -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/utils: -------------------------------------------------------------------------------- 1 | ../../../../../core/utils -------------------------------------------------------------------------------- /runtime/android/app/src/main/java/cn/org/wenet/wetts/Synthesis.java: -------------------------------------------------------------------------------- 1 | package cn.org.wenet.wetts; 2 | 3 | public class Synthesis { 4 | 5 | static { 6 | System.loadLibrary("wetts"); 7 | } 8 | 9 | public static native void init(String modelDir); 10 | public static native void run(String text, String speaker); 11 | } 12 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml: -------------------------------------------------------------------------------- 1 | 7 | 8 | 9 | 15 | 18 | 21 | 22 | 23 | 24 | 30 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/layout/activity_main.xml: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | 22 | 23 | 32 | 33 |