├── .github └── workflows │ └── pypi.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── install.md ├── quick_use.md └── training.md ├── logo.png ├── melo ├── __init__.py ├── api.py ├── app.py ├── attentions.py ├── commons.py ├── configs │ └── config.json ├── data │ └── example │ │ └── metadata.list ├── data_utils.py ├── download_utils.py ├── infer.py ├── init_downloads.py ├── losses.py ├── main.py ├── mel_processing.py ├── models.py ├── modules.py ├── monotonic_align │ ├── __init__.py │ └── core.py ├── preprocess_text.py ├── split_utils.py ├── text │ ├── __init__.py │ ├── chinese.py │ ├── chinese_bert.py │ ├── chinese_mix.py │ ├── cleaner.py │ ├── cleaner_multiling.py │ ├── cmudict.rep │ ├── cmudict_cache.pickle │ ├── english.py │ ├── english_bert.py │ ├── english_utils │ │ ├── __init__.py │ │ ├── abbreviations.py │ │ ├── number_norm.py │ │ └── time_norm.py │ ├── es_phonemizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cleaner.py │ │ ├── es_symbols.json │ │ ├── es_symbols.txt │ │ ├── es_symbols_v2.json │ │ ├── es_to_ipa.py │ │ ├── example_ipa.txt │ │ ├── gruut_wrapper.py │ │ ├── punctuation.py │ │ ├── spanish_symbols.txt │ │ └── test.ipynb │ ├── fr_phonemizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cleaner.py │ │ ├── en_symbols.json │ │ ├── example_ipa.txt │ │ ├── fr_symbols.json │ │ ├── fr_to_ipa.py │ │ ├── french_abbreviations.py │ │ ├── french_symbols.txt │ │ ├── gruut_wrapper.py │ │ └── punctuation.py │ ├── french.py │ ├── french_bert.py │ ├── japanese.py │ ├── japanese_bert.py │ ├── ko_dictionary.py │ ├── korean.py │ ├── opencpop-strict.txt │ ├── spanish.py │ ├── spanish_bert.py │ ├── symbols.py │ └── tone_sandhi.py ├── train.py ├── train.sh ├── transforms.py └── utils.py ├── requirements.txt ├── setup.py └── test ├── basetts_test_resources ├── en_egs_text.txt ├── es_egs_text.txt ├── fr_egs_text.txt ├── jp_egs_text.txt ├── kr_egs_text.txt └── zh_mix_en_egs_text.txt ├── test_base_model_tts_package.py └── test_base_model_tts_package_from_S3.py /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m ensurepip --upgrade 33 | pip install build 34 | - name: Build package 35 | run: python -m build 36 | - name: Publish package 37 | uses: pypa/gh-action-pypi-publish@release/v1.8 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | basetts_outputs_use_bert/ 4 | basetts_outputs/ 5 | multilingual_ckpts 6 | basetts_outputs_package/ 7 | build/ 8 | *.egg-info/ 9 | 10 | *.zip 11 | *.wav -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /app 3 | COPY . /app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | build-essential libsndfile1 \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install -e . 10 | RUN python -m unidic download 11 | RUN python melo/init_downloads.py 12 | 13 | CMD ["python", "./melo/app.py", "--host", "0.0.0.0", "--port", "8888"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 MyShell.ai 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | [myshell-ai/MeloTTS | Trendshift badge] 5 |
6 | 7 | ## Introduction 8 | MeloTTS is a **high-quality multi-lingual** text-to-speech library by [MIT](https://www.mit.edu/) and [MyShell.ai](https://myshell.ai). Supported languages include: 9 | 10 | | Language | Example | 11 | | --- | --- | 12 | | English (American) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-US/speed_1.0/sent_000.wav) | 13 | | English (British) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-BR/speed_1.0/sent_000.wav) | 14 | | English (Indian) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN_INDIA/speed_1.0/sent_000.wav) | 15 | | English (Australian) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-AU/speed_1.0/sent_000.wav) | 16 | | English (Default) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-Default/speed_1.0/sent_000.wav) | 17 | | Spanish | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/es/ES/speed_1.0/sent_000.wav) | 18 | | French | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/fr/FR/speed_1.0/sent_000.wav) | 19 | | Chinese (mix EN) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/zh/ZH/speed_1.0/sent_008.wav) | 20 | | Japanese | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/jp/JP/speed_1.0/sent_000.wav) | 21 | | Korean | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/kr/KR/speed_1.0/sent_000.wav) | 22 | 23 | Some other features include: 24 | - The Chinese speaker supports `mixed Chinese and English`. 25 | - Fast enough for `CPU real-time inference`. 26 | 27 | ## Usage 28 | - [Use without Installation](docs/quick_use.md) 29 | - [Install and Use Locally](docs/install.md) 30 | - [Training on Custom Dataset](docs/training.md) 31 | 32 | The Python API and model cards can be found in [this repo](https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#python-api) or on [HuggingFace](https://huggingface.co/myshell-ai). 33 | 34 | **Contributing** 35 | 36 | If you find this work useful, please consider contributing to this repo. 37 | 38 | - Many thanks to [@fakerybakery](https://github.com/fakerybakery) for adding the Web UI and CLI part. 39 | 40 | ## Authors 41 | 42 | - [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University 43 | - [Xumin Yu](https://yuxumin.github.io) at Tsinghua University 44 | - [Zengyi Qin](https://www.qinzy.tech) (project lead) at MIT and MyShell 45 | 46 | **Citation** 47 | ``` 48 | @software{zhao2024melo, 49 | author={Zhao, Wenliang and Yu, Xumin and Qin, Zengyi}, 50 | title = {MeloTTS: High-quality Multi-lingual Multi-accent Text-to-Speech}, 51 | url = {https://github.com/myshell-ai/MeloTTS}, 52 | year = {2023} 53 | } 54 | ``` 55 | 56 | ## License 57 | 58 | This library is under MIT License, which means it is free for both commercial and non-commercial use. 59 | 60 | ## Acknowledgements 61 | 62 | This implementation is based on [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), [VITS2](https://github.com/daniilrobnikov/vits2) and [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2). We appreciate their awesome work. 
63 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | ## Install and Use Locally 2 | 3 | ### Table of Content 4 | - [Linux and macOS Install](#linux-and-macos-install) 5 | - [Docker Install for Windows and macOS](#docker-install) 6 | - [Usage](#usage) 7 | - [Web UI](#webui) 8 | - [CLI](#cli) 9 | - [Python API](#python-api) 10 | 11 | ### Linux and macOS Install 12 | The repo is developed and tested on `Ubuntu 20.04` and `Python 3.9`. 13 | ```bash 14 | git clone https://github.com/myshell-ai/MeloTTS.git 15 | cd MeloTTS 16 | pip install -e . 17 | python -m unidic download 18 | ``` 19 | If you encountered issues in macOS install, try the [Docker Install](#docker-install) 20 | 21 | ### Docker Install 22 | To avoid compatibility issues, for Windows users and some macOS users, we suggest to run via Docker. Ensure that [you have Docker installed](https://docs.docker.com/engine/install/). 23 | 24 | **Build Docker** 25 | 26 | This could take a few minutes. 27 | ```bash 28 | git clone https://github.com/myshell-ai/MeloTTS.git 29 | cd MeloTTS 30 | docker build -t melotts . 31 | ``` 32 | 33 | **Run Docker** 34 | ```bash 35 | docker run -it -p 8888:8888 melotts 36 | ``` 37 | If your local machine has GPU, then you can choose to run: 38 | ```bash 39 | docker run --gpus all -it -p 8888:8888 melotts 40 | ``` 41 | Then open [http://localhost:8888](http://localhost:8888) in your browser to use the app. 42 | 43 | ## Usage 44 | 45 | ### WebUI 46 | 47 | The WebUI supports muliple languages and voices. First, follow the installation steps. Then, simply run: 48 | 49 | ```bash 50 | melo-ui 51 | # Or: python melo/app.py 52 | ``` 53 | 54 | ### CLI 55 | 56 | You may use the MeloTTS CLI to interact with MeloTTS. The CLI may be invoked using either `melotts` or `melo`. Here are some examples: 57 | 58 | **Read English text:** 59 | 60 | ```bash 61 | melo "Text to read" output.wav 62 | ``` 63 | 64 | **Specify a language:** 65 | 66 | ```bash 67 | melo "Text to read" output.wav --language EN 68 | ``` 69 | 70 | **Specify a speaker:** 71 | 72 | ```bash 73 | melo "Text to read" output.wav --language EN --speaker EN-US 74 | melo "Text to read" output.wav --language EN --speaker EN-AU 75 | ``` 76 | 77 | The available speakers are: `EN-Default`, `EN-US`, `EN-BR`, `EN_INDIA` `EN-AU`. 78 | 79 | **Specify a speed:** 80 | 81 | ```bash 82 | melo "Text to read" output.wav --language EN --speaker EN-US --speed 1.5 83 | melo "Text to read" output.wav --speed 1.5 84 | ``` 85 | 86 | **Use a different language:** 87 | 88 | ```bash 89 | melo "text-to-speech 领域近年来发展迅速" zh.wav -l ZH 90 | ``` 91 | 92 | **Load from a file:** 93 | 94 | ```bash 95 | melo file.txt out.wav --file 96 | ``` 97 | 98 | The full API documentation may be found using: 99 | 100 | ```bash 101 | melo --help 102 | ``` 103 | 104 | ### Python API 105 | 106 | #### English with Multiple Accents 107 | 108 | ```python 109 | from melo.api import TTS 110 | 111 | # Speed is adjustable 112 | speed = 1.0 113 | 114 | # CPU is sufficient for real-time inference. 115 | # You can set it manually to 'cpu' or 'cuda' or 'cuda:0' or 'mps' 116 | device = 'auto' # Will automatically use GPU if available 117 | 118 | # English 119 | text = "Did you ever hear a folk tale about a giant turtle?" 
120 | model = TTS(language='EN', device=device) 121 | speaker_ids = model.hps.data.spk2id 122 | 123 | # American accent 124 | output_path = 'en-us.wav' 125 | model.tts_to_file(text, speaker_ids['EN-US'], output_path, speed=speed) 126 | 127 | # British accent 128 | output_path = 'en-br.wav' 129 | model.tts_to_file(text, speaker_ids['EN-BR'], output_path, speed=speed) 130 | 131 | # Indian accent 132 | output_path = 'en-india.wav' 133 | model.tts_to_file(text, speaker_ids['EN_INDIA'], output_path, speed=speed) 134 | 135 | # Australian accent 136 | output_path = 'en-au.wav' 137 | model.tts_to_file(text, speaker_ids['EN-AU'], output_path, speed=speed) 138 | 139 | # Default accent 140 | output_path = 'en-default.wav' 141 | model.tts_to_file(text, speaker_ids['EN-Default'], output_path, speed=speed) 142 | 143 | ``` 144 | 145 | #### Spanish 146 | ```python 147 | from melo.api import TTS 148 | 149 | # Speed is adjustable 150 | speed = 1.0 151 | 152 | # CPU is sufficient for real-time inference. 153 | # You can also change to cuda:0 154 | device = 'cpu' 155 | 156 | text = "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante." 157 | model = TTS(language='ES', device=device) 158 | speaker_ids = model.hps.data.spk2id 159 | 160 | output_path = 'es.wav' 161 | model.tts_to_file(text, speaker_ids['ES'], output_path, speed=speed) 162 | ``` 163 | 164 | #### French 165 | 166 | ```python 167 | from melo.api import TTS 168 | 169 | # Speed is adjustable 170 | speed = 1.0 171 | device = 'cpu' # or cuda:0 172 | 173 | text = "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante." 174 | model = TTS(language='FR', device=device) 175 | speaker_ids = model.hps.data.spk2id 176 | 177 | output_path = 'fr.wav' 178 | model.tts_to_file(text, speaker_ids['FR'], output_path, speed=speed) 179 | ``` 180 | 181 | #### Chinese 182 | 183 | ```python 184 | from melo.api import TTS 185 | 186 | # Speed is adjustable 187 | speed = 1.0 188 | device = 'cpu' # or cuda:0 189 | 190 | text = "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。" 191 | model = TTS(language='ZH', device=device) 192 | speaker_ids = model.hps.data.spk2id 193 | 194 | output_path = 'zh.wav' 195 | model.tts_to_file(text, speaker_ids['ZH'], output_path, speed=speed) 196 | ``` 197 | 198 | #### Japanese 199 | 200 | ```python 201 | from melo.api import TTS 202 | 203 | # Speed is adjustable 204 | speed = 1.0 205 | device = 'cpu' # or cuda:0 206 | 207 | text = "彼は毎朝ジョギングをして体を健康に保っています。" 208 | model = TTS(language='JP', device=device) 209 | speaker_ids = model.hps.data.spk2id 210 | 211 | output_path = 'jp.wav' 212 | model.tts_to_file(text, speaker_ids['JP'], output_path, speed=speed) 213 | ``` 214 | 215 | #### Korean 216 | 217 | ```python 218 | from melo.api import TTS 219 | 220 | # Speed is adjustable 221 | speed = 1.0 222 | device = 'cpu' # or cuda:0 223 | 224 | text = "안녕하세요! 오늘은 날씨가 정말 좋네요." 225 | model = TTS(language='KR', device=device) 226 | speaker_ids = model.hps.data.spk2id 227 | 228 | output_path = 'kr.wav' 229 | model.tts_to_file(text, speaker_ids['KR'], output_path, speed=speed) 230 | ``` 231 | -------------------------------------------------------------------------------- /docs/quick_use.md: -------------------------------------------------------------------------------- 1 | ## Use MeloTTS without Installation 2 | 3 | **Quick Demo** 4 | 5 | - [Official live demo](https://app.myshell.ai/bot/UN77N3/1709094629) on Myshell. 
6 | - Hugging Face Space [live demo](https://huggingface.co/spaces/mrfakename/MeloTTS). 7 | 8 | **Use on MyShell** 9 | 10 | There are hundreds of TTS models on MyShell, much more than MeloTTS. For example: 11 | 12 | English 13 | - [gentle British male voice](https://app.myshell.ai/widget/nIfamm) 14 | - [cheerful young female voice](https://app.myshell.ai/widget/AjIjqy) 15 | - [sultry and robust male voice](https://app.myshell.ai/widget/zQJJN3) 16 | 17 | Spanish 18 | - [voz femenina adorable](https://app.myshell.ai/widget/buIZBf) 19 | - [voz masculina joven](https://app.myshell.ai/widget/rayuiy) 20 | - [voz de niña inmadura](https://app.myshell.ai/widget/mYFV3e) 21 | 22 | French 23 | - [voix adorable de fille](https://app.myshell.ai/widget/3IfEfy) 24 | - [voix douce masculine](https://app.myshell.ai/widget/IRR3M3) 25 | - [voix douce féminine](https://app.myshell.ai/widget/NRbaUj) 26 | 27 | German 28 | - [sanfte Männerstimme](https://app.myshell.ai/widget/JFnAn2) 29 | - [sanfte Frauenstimme](https://app.myshell.ai/widget/MrU7Nb) 30 | - [unreife Mädchenstimme](https://app.myshell.ai/widget/UFbYBj) 31 | 32 | Portuguese 33 | - [voz feminina nítida](https://app.myshell.ai/widget/VzMb6j) 34 | - [voz de menino imaturo](https://app.myshell.ai/widget/nAzeei) 35 | - [voz masculina sóbria](https://app.myshell.ai/widget/JZRNJz) 36 | 37 | Russian 38 | - [зрелый женский голос](https://app.myshell.ai/widget/6byMZ3) 39 | - [зрелый мужской голос](https://app.myshell.ai/widget/NB7jmm) 40 | 41 | Chinese 42 | - [甜美女声](https://app.myshell.ai/widget/ymeUjm) 43 | - [青年男声](https://app.myshell.ai/widget/NZnERb) 44 | 45 | More can be found at the widget center of [MyShell.ai](https://app.myshell.ai/robot-workshop). 46 | -------------------------------------------------------------------------------- /docs/training.md: -------------------------------------------------------------------------------- 1 | ## Training 2 | 3 | Before training, please install MeloTTS in dev mode and go to the `melo` folder. 4 | ``` 5 | pip install -e . 6 | cd melo 7 | ``` 8 | 9 | ### Data Preparation 10 | To train a TTS model, we need to prepare the audio files and a metadata file. We recommend using 44100Hz audio files and the metadata file should have the following format: 11 | 12 | ``` 13 | path/to/audio_001.wav ||| 14 | path/to/audio_002.wav ||| 15 | ``` 16 | The transcribed text can be obtained by ASR model, (e.g., [whisper](https://github.com/openai/whisper)). An example metadata can be found in `data/example/metadata.list` 17 | 18 | We can then run the preprocessing code: 19 | ``` 20 | python preprocess_text.py --metadata data/example/metadata.list 21 | ``` 22 | A config file `data/example/config.json` will be generated. Feel free to edit some hyper-parameters in that config file (for example, you may decrease the batch size if you have encountered the CUDA out-of-memory issue). 23 | 24 | ### Training 25 | The training can be launched by: 26 | ``` 27 | bash train.sh 28 | ``` 29 | 30 | We have found for some machine the training will sometimes crash due to an [issue](https://github.com/pytorch/pytorch/issues/2530) of gloo. Therefore, we add an auto-resume wrapper in the `train.sh`. 
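For reference, the wrapper is essentially a restart loop: whenever training exits abnormally, it is relaunched so that it can resume from the latest saved checkpoint instead of starting over. The sketch below is illustrative only and is not the verbatim contents of `train.sh` (the actual invocation and its flags may differ):
```
# illustrative auto-resume loop, not the exact train.sh
while true; do
    python train.py "$@"      # launch (or relaunch) training with the arguments passed to the wrapper
    [ $? -eq 0 ] && break     # a clean exit ends the loop
    echo "Training crashed, restarting in 10 seconds..."
    sleep 10                  # otherwise pause briefly and try again
done
```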
31 | 32 | ### Inference 33 | Simply run: 34 | ``` 35 | python infer.py --text "" -m /path/to/checkpoint/G_.pth -o 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/logo.png -------------------------------------------------------------------------------- /melo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/__init__.py -------------------------------------------------------------------------------- /melo/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import torch 5 | import librosa 6 | import soundfile 7 | import torchaudio 8 | import numpy as np 9 | import torch.nn as nn 10 | from tqdm import tqdm 11 | import torch 12 | 13 | from . import utils 14 | from . import commons 15 | from .models import SynthesizerTrn 16 | from .split_utils import split_sentence 17 | from .mel_processing import spectrogram_torch, spectrogram_torch_conv 18 | from .download_utils import load_or_download_config, load_or_download_model 19 | 20 | class TTS(nn.Module): 21 | def __init__(self, 22 | language, 23 | device='auto', 24 | use_hf=True, 25 | config_path=None, 26 | ckpt_path=None): 27 | super().__init__() 28 | if device == 'auto': 29 | device = 'cpu' 30 | if torch.cuda.is_available(): device = 'cuda' 31 | if torch.backends.mps.is_available(): device = 'mps' 32 | if 'cuda' in device: 33 | assert torch.cuda.is_available() 34 | 35 | # config_path = 36 | hps = load_or_download_config(language, use_hf=use_hf, config_path=config_path) 37 | 38 | num_languages = hps.num_languages 39 | num_tones = hps.num_tones 40 | symbols = hps.symbols 41 | 42 | model = SynthesizerTrn( 43 | len(symbols), 44 | hps.data.filter_length // 2 + 1, 45 | hps.train.segment_size // hps.data.hop_length, 46 | n_speakers=hps.data.n_speakers, 47 | num_tones=num_tones, 48 | num_languages=num_languages, 49 | **hps.model, 50 | ).to(device) 51 | 52 | model.eval() 53 | self.model = model 54 | self.symbol_to_id = {s: i for i, s in enumerate(symbols)} 55 | self.hps = hps 56 | self.device = device 57 | 58 | # load state_dict 59 | checkpoint_dict = load_or_download_model(language, device, use_hf=use_hf, ckpt_path=ckpt_path) 60 | self.model.load_state_dict(checkpoint_dict['model'], strict=True) 61 | 62 | language = language.split('_')[0] 63 | self.language = 'ZH_MIX_EN' if language == 'ZH' else language # we support a ZH_MIX_EN model 64 | 65 | @staticmethod 66 | def audio_numpy_concat(segment_data_list, sr, speed=1.): 67 | audio_segments = [] 68 | for segment_data in segment_data_list: 69 | audio_segments += segment_data.reshape(-1).tolist() 70 | audio_segments += [0] * int((sr * 0.05) / speed) 71 | audio_segments = np.array(audio_segments).astype(np.float32) 72 | return audio_segments 73 | 74 | @staticmethod 75 | def split_sentences_into_pieces(text, language, quiet=False): 76 | texts = split_sentence(text, language_str=language) 77 | if not quiet: 78 | print(" > Text split to sentences.") 79 | print('\n'.join(texts)) 80 | print(" > ===========================") 81 | return texts 82 | 83 | def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, 
speed=1.0, pbar=None, format=None, position=None, quiet=False,): 84 | language = self.language 85 | texts = self.split_sentences_into_pieces(text, language, quiet) 86 | audio_list = [] 87 | if pbar: 88 | tx = pbar(texts) 89 | else: 90 | if position: 91 | tx = tqdm(texts, position=position) 92 | elif quiet: 93 | tx = texts 94 | else: 95 | tx = tqdm(texts) 96 | for t in tx: 97 | if language in ['EN', 'ZH_MIX_EN']: 98 | t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t) 99 | device = self.device 100 | bert, ja_bert, phones, tones, lang_ids = utils.get_text_for_tts_infer(t, language, self.hps, device, self.symbol_to_id) 101 | with torch.no_grad(): 102 | x_tst = phones.to(device).unsqueeze(0) 103 | tones = tones.to(device).unsqueeze(0) 104 | lang_ids = lang_ids.to(device).unsqueeze(0) 105 | bert = bert.to(device).unsqueeze(0) 106 | ja_bert = ja_bert.to(device).unsqueeze(0) 107 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 108 | del phones 109 | speakers = torch.LongTensor([speaker_id]).to(device) 110 | audio = self.model.infer( 111 | x_tst, 112 | x_tst_lengths, 113 | speakers, 114 | tones, 115 | lang_ids, 116 | bert, 117 | ja_bert, 118 | sdp_ratio=sdp_ratio, 119 | noise_scale=noise_scale, 120 | noise_scale_w=noise_scale_w, 121 | length_scale=1. / speed, 122 | )[0][0, 0].data.cpu().float().numpy() 123 | del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers 124 | # 125 | audio_list.append(audio) 126 | torch.cuda.empty_cache() 127 | audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed) 128 | 129 | if output_path is None: 130 | return audio 131 | else: 132 | if format: 133 | soundfile.write(output_path, audio, self.hps.data.sampling_rate, format=format) 134 | else: 135 | soundfile.write(output_path, audio, self.hps.data.sampling_rate) 136 | -------------------------------------------------------------------------------- /melo/app.py: -------------------------------------------------------------------------------- 1 | # WebUI by mrfakename 2 | # Demo also available on HF Spaces: https://huggingface.co/spaces/mrfakename/MeloTTS 3 | import gradio as gr 4 | import os, torch, io 5 | # os.system('python -m unidic download') 6 | print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.") 7 | from melo.api import TTS 8 | speed = 1.0 9 | import tempfile 10 | import click 11 | device = 'auto' 12 | models = { 13 | 'EN': TTS(language='EN', device=device), 14 | 'ES': TTS(language='ES', device=device), 15 | 'FR': TTS(language='FR', device=device), 16 | 'ZH': TTS(language='ZH', device=device), 17 | 'JP': TTS(language='JP', device=device), 18 | 'KR': TTS(language='KR', device=device), 19 | } 20 | speaker_ids = models['EN'].hps.data.spk2id 21 | 22 | default_text_dict = { 23 | 'EN': 'The field of text-to-speech has seen rapid development recently.', 24 | 'ES': 'El campo de la conversión de texto a voz ha experimentado un rápido desarrollo recientemente.', 25 | 'FR': 'Le domaine de la synthèse vocale a connu un développement rapide récemment', 26 | 'ZH': 'text-to-speech 领域近年来发展迅速', 27 | 'JP': 'テキスト読み上げの分野は最近急速な発展を遂げています', 28 | 'KR': '최근 텍스트 음성 변환 분야가 급속도로 발전하고 있습니다.', 29 | } 30 | 31 | def synthesize(speaker, text, speed, language, progress=gr.Progress()): 32 | bio = io.BytesIO() 33 | models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, pbar=progress.tqdm, format='wav') 34 | return bio.getvalue() 35 | def load_speakers(language, text): 36 | if text in 
list(default_text_dict.values()): 37 | newtext = default_text_dict[language] 38 | else: 39 | newtext = text 40 | return gr.update(value=list(models[language].hps.data.spk2id.keys())[0], choices=list(models[language].hps.data.spk2id.keys())), newtext 41 | with gr.Blocks() as demo: 42 | gr.Markdown('# MeloTTS WebUI\n\nA WebUI for MeloTTS.') 43 | with gr.Group(): 44 | speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker') 45 | language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN') 46 | speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1) 47 | text = gr.Textbox(label="Text to speak", value=default_text_dict['EN']) 48 | language.input(load_speakers, inputs=[language, text], outputs=[speaker, text]) 49 | btn = gr.Button('Synthesize', variant='primary') 50 | aud = gr.Audio(interactive=False) 51 | btn.click(synthesize, inputs=[speaker, text, speed, language], outputs=[aud]) 52 | gr.Markdown('WebUI by [mrfakename](https://twitter.com/realmrfakename).') 53 | @click.command() 54 | @click.option('--share', '-s', is_flag=True, show_default=True, default=False, help="Expose a publicly-accessible shared Gradio link usable by anyone with the link. Only share the link with people you trust.") 55 | @click.option('--host', '-h', default=None) 56 | @click.option('--port', '-p', type=int, default=None) 57 | def main(share, host, port): 58 | demo.queue(api_open=False).launch(show_api=False, share=share, server_name=host, server_port=port) 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /melo/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | layer = pad_shape[::-1] 18 | pad_shape = [item for sublist in layer for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * 
ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 68 | position = torch.arange(length, dtype=torch.float) 69 | num_timescales = channels // 2 70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 71 | num_timescales - 1 72 | ) 73 | inv_timescales = min_timescale * torch.exp( 74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 75 | ) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | layer = pad_shape[::-1] 112 | pad_shape = [item for sublist in layer for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | 134 | b, _, t_y, t_x = mask.shape 135 | cum_duration = torch.cumsum(duration, -1) 136 | 137 | cum_duration_flat = cum_duration.view(b * t_x) 138 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 139 | path = path.view(b, t_x, t_y) 140 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 141 | path = path.unsqueeze(1).transpose(2, 3) * mask 142 | return path 143 | 144 | 145 | def clip_grad_value_(parameters, clip_value, norm_type=2): 146 | if isinstance(parameters, torch.Tensor): 147 | parameters = [parameters] 148 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 149 | norm_type = float(norm_type) 150 | if clip_value is not None: 151 | clip_value = float(clip_value) 152 | 153 | total_norm = 0 154 | for p in parameters: 155 | param_norm = p.grad.data.norm(norm_type) 156 | total_norm += param_norm.item() ** norm_type 157 | if clip_value is not None: 158 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 159 | total_norm = total_norm ** (1.0 / 
norm_type) 160 | return total_norm 161 | -------------------------------------------------------------------------------- /melo/configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 52, 6 | "epochs": 10000, 7 | "learning_rate": 0.0003, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 6, 14 | "fp16_run": false, 15 | "lr_decay": 0.999875, 16 | "segment_size": 16384, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "skip_optimizer": true 22 | }, 23 | "data": { 24 | "training_files": "", 25 | "validation_files": "", 26 | "max_wav_value": 32768.0, 27 | "sampling_rate": 44100, 28 | "filter_length": 2048, 29 | "hop_length": 512, 30 | "win_length": 2048, 31 | "n_mel_channels": 128, 32 | "mel_fmin": 0.0, 33 | "mel_fmax": null, 34 | "add_blank": true, 35 | "n_speakers": 256, 36 | "cleaned_text": true, 37 | "spk2id": {} 38 | }, 39 | "model": { 40 | "use_spk_conditioned_encoder": true, 41 | "use_noise_scaled_mas": true, 42 | "use_mel_posterior_encoder": false, 43 | "use_duration_discriminator": true, 44 | "inter_channels": 192, 45 | "hidden_channels": 192, 46 | "filter_channels": 768, 47 | "n_heads": 2, 48 | "n_layers": 6, 49 | "n_layers_trans_flow": 3, 50 | "kernel_size": 3, 51 | "p_dropout": 0.1, 52 | "resblock": "1", 53 | "resblock_kernel_sizes": [ 54 | 3, 55 | 7, 56 | 11 57 | ], 58 | "resblock_dilation_sizes": [ 59 | [ 60 | 1, 61 | 3, 62 | 5 63 | ], 64 | [ 65 | 1, 66 | 3, 67 | 5 68 | ], 69 | [ 70 | 1, 71 | 3, 72 | 5 73 | ] 74 | ], 75 | "upsample_rates": [ 76 | 8, 77 | 8, 78 | 2, 79 | 2, 80 | 2 81 | ], 82 | "upsample_initial_channel": 512, 83 | "upsample_kernel_sizes": [ 84 | 16, 85 | 16, 86 | 8, 87 | 2, 88 | 2 89 | ], 90 | "n_layers_q": 3, 91 | "use_spectral_norm": false, 92 | "gin_channels": 256 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /melo/data/example/metadata.list: -------------------------------------------------------------------------------- 1 | data/example/wavs/000.wav|EN-default|EN|Well, there are always new trends and styles emerging in the fashion world, but I think some of the biggest trends at the moment include sustainability and ethical fashion, streetwear and athleisure, and oversized and deconstructed silhouettes. 2 | data/example/wavs/001.wav|EN-default|EN|Many designers and brands are focusing on creating more environmentally-friendly and socially responsible clothing, while others are incorporating elements of sportswear and casual wear into their collections. 3 | data/example/wavs/002.wav|EN-default|EN|And there's a growing interest in looser, more relaxed shapes and unconventional materials and finishes. 4 | data/example/wavs/003.wav|EN-default|EN|That's really insightful. 5 | data/example/wavs/004.wav|EN-default|EN|What do you think are some of the benefits of following fashion trends? 6 | data/example/wavs/005.wav|EN-default|EN|Well, I think one of the main benefits of following fashion trends is that it can be a way to express your creativity, personality, and individuality. 7 | data/example/wavs/006.wav|EN-default|EN|Fashion can be a powerful tool for self-expression and can help you feel more confident and comfortable in your own skin. 
8 | data/example/wavs/007.wav|EN-default|EN|Additionally, staying up-to-date with fashion trends can help you develop your own sense of style and learn how to put together outfits that make you look and feel great. 9 | data/example/wavs/008.wav|EN-default|EN|That's a great point. 10 | data/example/wavs/009.wav|EN-default|EN|Do you think it's important to stay on top of the latest fashion trends, or is it more important to focus on timeless style? 11 | data/example/wavs/010.wav|EN-default|EN|I think it's really up to each individual to decide what approach to fashion works best for them. 12 | data/example/wavs/011.wav|EN-default|EN|Some people prefer to stick with classic, timeless styles that never go out of fashion, while others enjoy experimenting with new and innovative trends. 13 | data/example/wavs/012.wav|EN-default|EN|Ultimately, fashion is about personal expression and there's no right or wrong way to approach it. 14 | data/example/wavs/013.wav|EN-default|EN|The most important thing is to wear what makes you feel good and confident. 15 | data/example/wavs/014.wav|EN-default|EN|I completely agree. 16 | data/example/wavs/015.wav|EN-default|EN|Some popular ones that come to mind are oversized blazers, statement sleeves, printed maxi dresses, and chunky sneakers. 17 | data/example/wavs/016.wav|EN-default|EN|It's been really interesting chatting with you about fashion. 18 | data/example/wavs/017.wav|EN-default|EN|That's a good point. 19 | data/example/wavs/018.wav|EN-default|EN|What do you think are some current fashion trends that are popular right now? 20 | data/example/wavs/019.wav|EN-default|EN|There are so many trends happening right now, it's hard to keep track of them all! 21 | -------------------------------------------------------------------------------- /melo/download_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from . 
import utils 4 | from cached_path import cached_path 5 | from huggingface_hub import hf_hub_download 6 | 7 | DOWNLOAD_CKPT_URLS = { 8 | 'EN': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN/checkpoint.pth', 9 | 'EN_V2': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN_V2/checkpoint.pth', 10 | 'FR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/FR/checkpoint.pth', 11 | 'JP': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/JP/checkpoint.pth', 12 | 'ES': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ES/checkpoint.pth', 13 | 'ZH': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ZH/checkpoint.pth', 14 | 'KR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/KR/checkpoint.pth', 15 | } 16 | 17 | DOWNLOAD_CONFIG_URLS = { 18 | 'EN': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN/config.json', 19 | 'EN_V2': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN_V2/config.json', 20 | 'FR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/FR/config.json', 21 | 'JP': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/JP/config.json', 22 | 'ES': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ES/config.json', 23 | 'ZH': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ZH/config.json', 24 | 'KR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/KR/config.json', 25 | } 26 | 27 | PRETRAINED_MODELS = { 28 | 'G.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/G.pth', 29 | 'D.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/D.pth', 30 | 'DUR.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/DUR.pth', 31 | } 32 | 33 | LANG_TO_HF_REPO_ID = { 34 | 'EN': 'myshell-ai/MeloTTS-English', 35 | 'EN_V2': 'myshell-ai/MeloTTS-English-v2', 36 | 'EN_NEWEST': 'myshell-ai/MeloTTS-English-v3', 37 | 'FR': 'myshell-ai/MeloTTS-French', 38 | 'JP': 'myshell-ai/MeloTTS-Japanese', 39 | 'ES': 'myshell-ai/MeloTTS-Spanish', 40 | 'ZH': 'myshell-ai/MeloTTS-Chinese', 41 | 'KR': 'myshell-ai/MeloTTS-Korean', 42 | } 43 | 44 | def load_or_download_config(locale, use_hf=True, config_path=None): 45 | if config_path is None: 46 | language = locale.split('-')[0].upper() 47 | if use_hf: 48 | assert language in LANG_TO_HF_REPO_ID 49 | config_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="config.json") 50 | else: 51 | assert language in DOWNLOAD_CONFIG_URLS 52 | config_path = cached_path(DOWNLOAD_CONFIG_URLS[language]) 53 | return utils.get_hparams_from_file(config_path) 54 | 55 | def load_or_download_model(locale, device, use_hf=True, ckpt_path=None): 56 | if ckpt_path is None: 57 | language = locale.split('-')[0].upper() 58 | if use_hf: 59 | assert language in LANG_TO_HF_REPO_ID 60 | ckpt_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="checkpoint.pth") 61 | else: 62 | assert language in DOWNLOAD_CKPT_URLS 63 | ckpt_path = cached_path(DOWNLOAD_CKPT_URLS[language]) 64 | return torch.load(ckpt_path, map_location=device) 65 | 66 | def load_pretrain_model(): 67 | return [cached_path(url) for url in PRETRAINED_MODELS.values()] 68 | -------------------------------------------------------------------------------- /melo/infer.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | from melo.api import TTS 4 | 5 | 6 | 7 | @click.command() 8 | @click.option('--ckpt_path', '-m', type=str, default=None, help="Path to the checkpoint file") 9 | @click.option('--text', '-t', type=str, default=None, help="Text to speak") 10 | @click.option('--language', '-l', type=str, default="EN", help="Language of the model") 11 | @click.option('--output_dir', '-o', type=str, default="outputs", help="Path to the output") 12 | def main(ckpt_path, text, language, output_dir): 13 | if ckpt_path is None: 14 | raise ValueError("The model_path must be specified") 15 | 16 | config_path = os.path.join(os.path.dirname(ckpt_path), 'config.json') 17 | model = TTS(language=language, config_path=config_path, ckpt_path=ckpt_path) 18 | 19 | for spk_name, spk_id in model.hps.data.spk2id.items(): 20 | save_path = f'{output_dir}/{spk_name}/output.wav' 21 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 22 | model.tts_to_file(text, spk_id, save_path) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /melo/init_downloads.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | if __name__ == '__main__': 4 | 5 | from melo.api import TTS 6 | device = 'auto' 7 | models = { 8 | 'EN': TTS(language='EN', device=device), 9 | 'ES': TTS(language='ES', device=device), 10 | 'FR': TTS(language='FR', device=device), 11 | 'ZH': TTS(language='ZH', device=device), 12 | 'JP': TTS(language='JP', device=device), 13 | 'KR': TTS(language='KR', device=device), 14 | } -------------------------------------------------------------------------------- /melo/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /melo/main.py: -------------------------------------------------------------------------------- 1 | import click 2 | import warnings 3 | import os 4 | 5 | 6 | 
@click.command 7 | @click.argument('text') 8 | @click.argument('output_path') 9 | @click.option("--file", '-f', is_flag=True, show_default=True, default=False, help="Text is a file") 10 | @click.option('--language', '-l', default='EN', help='Language, defaults to English', type=click.Choice(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], case_sensitive=False)) 11 | @click.option('--speaker', '-spk', default='EN-Default', help='Speaker ID, only for English, leave empty for default, ignored if not English. If English, defaults to "EN-Default"', type=click.Choice(['EN-Default', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU'])) 12 | @click.option('--speed', '-s', default=1.0, help='Speed, defaults to 1.0', type=float) 13 | @click.option('--device', '-d', default='auto', help='Device, defaults to auto') 14 | def main(text, file, output_path, language, speaker, speed, device): 15 | if file: 16 | if not os.path.exists(text): 17 | raise FileNotFoundError(f'Trying to load text from file due to --file/-f flag, but file not found. Remove the --file/-f flag to pass a string.') 18 | else: 19 | with open(text) as f: 20 | text = f.read().strip() 21 | if text == '': 22 | raise ValueError('You entered empty text or the file you passed was empty.') 23 | language = language.upper() 24 | if language == '': language = 'EN' 25 | if speaker == '': speaker = None 26 | if (not language == 'EN') and speaker: 27 | warnings.warn('You specified a speaker but the language is English.') 28 | from melo.api import TTS 29 | model = TTS(language=language, device=device) 30 | speaker_ids = model.hps.data.spk2id 31 | if language == 'EN': 32 | if not speaker: speaker = 'EN-Default' 33 | spkr = speaker_ids[speaker] 34 | else: 35 | spkr = speaker_ids[list(speaker_ids.keys())[0]] 36 | model.tts_to_file(text, spkr, output_path, speed=speed) 37 | -------------------------------------------------------------------------------- /melo/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | import librosa 4 | from librosa.filters import mel as librosa_mel_fn 5 | 6 | MAX_WAV_VALUE = 32768.0 7 | 8 | 9 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 10 | """ 11 | PARAMS 12 | ------ 13 | C: compression factor 14 | """ 15 | return torch.log(torch.clamp(x, min=clip_val) * C) 16 | 17 | 18 | def dynamic_range_decompression_torch(x, C=1): 19 | """ 20 | PARAMS 21 | ------ 22 | C: compression factor used to compress 23 | """ 24 | return torch.exp(x) / C 25 | 26 | 27 | def spectral_normalize_torch(magnitudes): 28 | output = dynamic_range_compression_torch(magnitudes) 29 | return output 30 | 31 | 32 | def spectral_de_normalize_torch(magnitudes): 33 | output = dynamic_range_decompression_torch(magnitudes) 34 | return output 35 | 36 | 37 | mel_basis = {} 38 | hann_window = {} 39 | 40 | 41 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 42 | if torch.min(y) < -1.1: 43 | print("min value is ", torch.min(y)) 44 | if torch.max(y) > 1.1: 45 | print("max value is ", torch.max(y)) 46 | 47 | global hann_window 48 | dtype_device = str(y.dtype) + "_" + str(y.device) 49 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 50 | if wnsize_dtype_device not in hann_window: 51 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 52 | dtype=y.dtype, device=y.device 53 | ) 54 | 55 | y = torch.nn.functional.pad( 56 | y.unsqueeze(1), 57 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 58 | mode="reflect", 59 | ) 
60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft( 63 | y, 64 | n_fft, 65 | hop_length=hop_size, 66 | win_length=win_size, 67 | window=hann_window[wnsize_dtype_device], 68 | center=center, 69 | pad_mode="reflect", 70 | normalized=False, 71 | onesided=True, 72 | return_complex=False, 73 | ) 74 | 75 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 76 | return spec 77 | 78 | 79 | def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False): 80 | global hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 83 | if wnsize_dtype_device not in hann_window: 84 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 85 | 86 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 87 | 88 | # ******************** original ************************# 89 | # y = y.squeeze(1) 90 | # spec1 = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 91 | # center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 92 | 93 | # ******************** ConvSTFT ************************# 94 | freq_cutoff = n_fft // 2 + 1 95 | fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft))) 96 | forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1]) 97 | forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float() 98 | 99 | import torch.nn.functional as F 100 | 101 | # if center: 102 | # signal = F.pad(y[:, None, None, :], (n_fft // 2, n_fft // 2, 0, 0), mode = 'reflect').squeeze(1) 103 | assert center is False 104 | 105 | forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride = hop_size) 106 | spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim = -1) 107 | 108 | 109 | # ******************** Verification ************************# 110 | spec1 = torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 111 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 112 | assert torch.allclose(spec1, spec2, atol=1e-4) 113 | 114 | spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6) 115 | return spec 116 | 117 | 118 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 119 | global mel_basis 120 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 121 | fmax_dtype_device = str(fmax) + "_" + dtype_device 122 | if fmax_dtype_device not in mel_basis: 123 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 124 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 125 | dtype=spec.dtype, device=spec.device 126 | ) 127 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 128 | spec = spectral_normalize_torch(spec) 129 | return spec 130 | 131 | 132 | def mel_spectrogram_torch( 133 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 134 | ): 135 | global mel_basis, hann_window 136 | dtype_device = str(y.dtype) + "_" + str(y.device) 137 | fmax_dtype_device = str(fmax) + "_" + dtype_device 138 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 139 | if fmax_dtype_device not in mel_basis: 140 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, 
fmin=fmin, fmax=fmax) 141 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 142 | dtype=y.dtype, device=y.device 143 | ) 144 | if wnsize_dtype_device not in hann_window: 145 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 146 | dtype=y.dtype, device=y.device 147 | ) 148 | 149 | y = torch.nn.functional.pad( 150 | y.unsqueeze(1), 151 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 152 | mode="reflect", 153 | ) 154 | y = y.squeeze(1) 155 | 156 | spec = torch.stft( 157 | y, 158 | n_fft, 159 | hop_length=hop_size, 160 | win_length=win_size, 161 | window=hann_window[wnsize_dtype_device], 162 | center=center, 163 | pad_mode="reflect", 164 | normalized=False, 165 | onesided=True, 166 | return_complex=False, 167 | ) 168 | 169 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 170 | 171 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 172 | spec = spectral_normalize_torch(spec) 173 | 174 | return spec 175 | -------------------------------------------------------------------------------- /melo/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | 7 | def maximum_path(neg_cent, mask): 8 | device = neg_cent.device 9 | dtype = neg_cent.dtype 10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 11 | path = zeros(neg_cent.shape, dtype=int32) 12 | 13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 16 | return from_numpy(path).to(device=device, dtype=dtype) 17 | -------------------------------------------------------------------------------- /melo/monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit( 5 | numba.void( 6 | numba.int32[:, :, ::1], 7 | numba.float32[:, :, ::1], 8 | numba.int32[::1], 9 | numba.int32[::1], 10 | ), 11 | nopython=True, 12 | nogil=True, 13 | ) 14 | def maximum_path_jit(paths, values, t_ys, t_xs): 15 | b = paths.shape[0] 16 | max_neg_val = -1e9 17 | for i in range(int(b)): 18 | path = paths[i] 19 | value = values[i] 20 | t_y = t_ys[i] 21 | t_x = t_xs[i] 22 | 23 | v_prev = v_cur = 0.0 24 | index = t_x - 1 25 | 26 | for y in range(t_y): 27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 28 | if x == y: 29 | v_cur = max_neg_val 30 | else: 31 | v_cur = value[y - 1, x] 32 | if x == 0: 33 | if y == 0: 34 | v_prev = 0.0 35 | else: 36 | v_prev = max_neg_val 37 | else: 38 | v_prev = value[y - 1, x - 1] 39 | value[y, x] += max(v_prev, v_cur) 40 | 41 | for y in range(t_y - 1, -1, -1): 42 | path[y, index] = 1 43 | if index != 0 and ( 44 | index == y or value[y - 1, index] < value[y - 1, index - 1] 45 | ): 46 | index = index - 1 47 | -------------------------------------------------------------------------------- /melo/preprocess_text.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | from random import shuffle 4 | from typing import Optional 5 | 6 | from tqdm import tqdm 7 | import click 8 | from text.cleaner import clean_text_bert 9 | import os 10 | import torch 11 | from text.symbols import symbols, num_languages, num_tones 12 | 13 | @click.command() 14 | @click.option( 15 | "--metadata", 16 | default="data/example/metadata.list", 
17 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 18 | ) 19 | @click.option("--cleaned-path", default=None) 20 | @click.option("--train-path", default=None) 21 | @click.option("--val-path", default=None) 22 | @click.option( 23 | "--config_path", 24 | default="configs/config.json", 25 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 26 | ) 27 | @click.option("--val-per-spk", default=4) 28 | @click.option("--max-val-total", default=8) 29 | @click.option("--clean/--no-clean", default=True) 30 | def main( 31 | metadata: str, 32 | cleaned_path: Optional[str], 33 | train_path: str, 34 | val_path: str, 35 | config_path: str, 36 | val_per_spk: int, 37 | max_val_total: int, 38 | clean: bool, 39 | ): 40 | if train_path is None: 41 | train_path = os.path.join(os.path.dirname(metadata), 'train.list') 42 | if val_path is None: 43 | val_path = os.path.join(os.path.dirname(metadata), 'val.list') 44 | out_config_path = os.path.join(os.path.dirname(metadata), 'config.json') 45 | 46 | if cleaned_path is None: 47 | cleaned_path = metadata + ".cleaned" 48 | 49 | if clean: 50 | out_file = open(cleaned_path, "w", encoding="utf-8") 51 | new_symbols = [] 52 | for line in tqdm(open(metadata, encoding="utf-8").readlines()): 53 | try: 54 | utt, spk, language, text = line.strip().split("|") 55 | norm_text, phones, tones, word2ph, bert = clean_text_bert(text, language, device='cuda:0') 56 | for ph in phones: 57 | if ph not in symbols and ph not in new_symbols: 58 | new_symbols.append(ph) 59 | print('update!, now symbols:') 60 | print(new_symbols) 61 | with open(f'{language}_symbol.txt', 'w') as f: 62 | f.write(f'{new_symbols}') 63 | 64 | assert len(phones) == len(tones) 65 | assert len(phones) == sum(word2ph) 66 | out_file.write( 67 | "{}|{}|{}|{}|{}|{}|{}\n".format( 68 | utt, 69 | spk, 70 | language, 71 | norm_text, 72 | " ".join(phones), 73 | " ".join([str(i) for i in tones]), 74 | " ".join([str(i) for i in word2ph]), 75 | ) 76 | ) 77 | bert_path = utt.replace(".wav", ".bert.pt") 78 | os.makedirs(os.path.dirname(bert_path), exist_ok=True) 79 | torch.save(bert.cpu(), bert_path) 80 | except Exception as error: 81 | print("err!", line, error) 82 | 83 | out_file.close() 84 | 85 | metadata = cleaned_path 86 | 87 | spk_utt_map = defaultdict(list) 88 | spk_id_map = {} 89 | current_sid = 0 90 | 91 | with open(metadata, encoding="utf-8") as f: 92 | for line in f.readlines(): 93 | utt, spk, language, text, phones, tones, word2ph = line.strip().split("|") 94 | spk_utt_map[spk].append(line) 95 | 96 | if spk not in spk_id_map.keys(): 97 | spk_id_map[spk] = current_sid 98 | current_sid += 1 99 | 100 | train_list = [] 101 | val_list = [] 102 | 103 | for spk, utts in spk_utt_map.items(): 104 | shuffle(utts) 105 | val_list += utts[:val_per_spk] 106 | train_list += utts[val_per_spk:] 107 | 108 | if len(val_list) > max_val_total: 109 | train_list += val_list[max_val_total:] 110 | val_list = val_list[:max_val_total] 111 | 112 | with open(train_path, "w", encoding="utf-8") as f: 113 | for line in train_list: 114 | f.write(line) 115 | 116 | with open(val_path, "w", encoding="utf-8") as f: 117 | for line in val_list: 118 | f.write(line) 119 | 120 | config = json.load(open(config_path, encoding="utf-8")) 121 | config["data"]["spk2id"] = spk_id_map 122 | 123 | config["data"]["training_files"] = train_path 124 | config["data"]["validation_files"] = val_path 125 | config["data"]["n_speakers"] = len(spk_id_map) 126 | config["num_languages"] = num_languages 127 | config["num_tones"] = num_tones 128 | 
config["symbols"] = symbols 129 | 130 | with open(out_config_path, "w", encoding="utf-8") as f: 131 | json.dump(config, f, indent=2, ensure_ascii=False) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /melo/split_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import glob 4 | import numpy as np 5 | import soundfile as sf 6 | import torchaudio 7 | import re 8 | 9 | def split_sentence(text, min_len=10, language_str='EN'): 10 | if language_str in ['EN', 'FR', 'ES', 'SP']: 11 | sentences = split_sentences_latin(text, min_len=min_len) 12 | else: 13 | sentences = split_sentences_zh(text, min_len=min_len) 14 | return sentences 15 | 16 | 17 | def split_sentences_latin(text, min_len=10): 18 | text = re.sub('[。!?;]', '.', text) 19 | text = re.sub('[,]', ',', text) 20 | text = re.sub('[“”]', '"', text) 21 | text = re.sub('[‘’]', "'", text) 22 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text) 23 | return [item.strip() for item in txtsplit(text, 256, 512) if item.strip()] 24 | 25 | 26 | def split_sentences_zh(text, min_len=10): 27 | text = re.sub('[。!?;]', '.', text) 28 | text = re.sub('[,]', ',', text) 29 | # 将文本中的换行符、空格和制表符替换为空格 30 | text = re.sub('[\n\t ]+', ' ', text) 31 | # 在标点符号后添加一个空格 32 | text = re.sub('([,.!?;])', r'\1 $#!', text) 33 | # 分隔句子并去除前后空格 34 | # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)] 35 | sentences = [s.strip() for s in text.split('$#!')] 36 | if len(sentences[-1]) == 0: del sentences[-1] 37 | 38 | new_sentences = [] 39 | new_sent = [] 40 | count_len = 0 41 | for ind, sent in enumerate(sentences): 42 | new_sent.append(sent) 43 | count_len += len(sent) 44 | if count_len > min_len or ind == len(sentences) - 1: 45 | count_len = 0 46 | new_sentences.append(' '.join(new_sent)) 47 | new_sent = [] 48 | return merge_short_sentences_zh(new_sentences) 49 | 50 | 51 | def merge_short_sentences_en(sens): 52 | """Avoid short sentences by merging them with the following sentence. 53 | 54 | Args: 55 | List[str]: list of input sentences. 56 | 57 | Returns: 58 | List[str]: list of output sentences. 59 | """ 60 | sens_out = [] 61 | for s in sens: 62 | # If the previous sentense is too short, merge them with 63 | # the current sentence. 64 | if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2: 65 | sens_out[-1] = sens_out[-1] + " " + s 66 | else: 67 | sens_out.append(s) 68 | try: 69 | if len(sens_out[-1].split(" ")) <= 2: 70 | sens_out[-2] = sens_out[-2] + " " + sens_out[-1] 71 | sens_out.pop(-1) 72 | except: 73 | pass 74 | return sens_out 75 | 76 | 77 | def merge_short_sentences_zh(sens): 78 | # return sens 79 | """Avoid short sentences by merging them with the following sentence. 80 | 81 | Args: 82 | List[str]: list of input sentences. 83 | 84 | Returns: 85 | List[str]: list of output sentences. 86 | """ 87 | sens_out = [] 88 | for s in sens: 89 | # If the previous sentense is too short, merge them with 90 | # the current sentence. 
91 | if len(sens_out) > 0 and len(sens_out[-1]) <= 2: 92 | sens_out[-1] = sens_out[-1] + " " + s 93 | else: 94 | sens_out.append(s) 95 | try: 96 | if len(sens_out[-1]) <= 2: 97 | sens_out[-2] = sens_out[-2] + " " + sens_out[-1] 98 | sens_out.pop(-1) 99 | except: 100 | pass 101 | return sens_out 102 | 103 | 104 | 105 | def txtsplit(text, desired_length=100, max_length=200): 106 | """Split text it into chunks of a desired length trying to keep sentences intact.""" 107 | text = re.sub(r'\n\n+', '\n', text) 108 | text = re.sub(r'\s+', ' ', text) 109 | text = re.sub(r'[""]', '"', text) 110 | text = re.sub(r'([,.?!])', r'\1 ', text) 111 | text = re.sub(r'\s+', ' ', text) 112 | 113 | rv = [] 114 | in_quote = False 115 | current = "" 116 | split_pos = [] 117 | pos = -1 118 | end_pos = len(text) - 1 119 | def seek(delta): 120 | nonlocal pos, in_quote, current 121 | is_neg = delta < 0 122 | for _ in range(abs(delta)): 123 | if is_neg: 124 | pos -= 1 125 | current = current[:-1] 126 | else: 127 | pos += 1 128 | current += text[pos] 129 | if text[pos] == '"': 130 | in_quote = not in_quote 131 | return text[pos] 132 | def peek(delta): 133 | p = pos + delta 134 | return text[p] if p < end_pos and p >= 0 else "" 135 | def commit(): 136 | nonlocal rv, current, split_pos 137 | rv.append(current) 138 | current = "" 139 | split_pos = [] 140 | while pos < end_pos: 141 | c = seek(1) 142 | if len(current) >= max_length: 143 | if len(split_pos) > 0 and len(current) > (desired_length / 2): 144 | d = pos - split_pos[-1] 145 | seek(-d) 146 | else: 147 | while c not in '!?.\n ' and pos > 0 and len(current) > desired_length: 148 | c = seek(-1) 149 | commit() 150 | elif not in_quote and (c in '!?\n' or (c in '.,' and peek(1) in '\n ')): 151 | while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.': 152 | c = seek(1) 153 | split_pos.append(pos) 154 | if len(current) >= desired_length: 155 | commit() 156 | elif in_quote and peek(1) == '"' and peek(2) in '\n ': 157 | seek(2) 158 | split_pos.append(pos) 159 | rv.append(current) 160 | rv = [s.strip() for s in rv] 161 | rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)] 162 | return rv 163 | 164 | 165 | if __name__ == '__main__': 166 | zh_text = "好的,我来给你讲一个故事吧。从前有一个小姑娘,她叫做小红。小红非常喜欢在森林里玩耍,她经常会和她的小伙伴们一起去探险。有一天,小红和她的小伙伴们走到了森林深处,突然遇到了一只凶猛的野兽。小红的小伙伴们都吓得不敢动弹,但是小红并没有被吓倒,她勇敢地走向野兽,用她的智慧和勇气成功地制服了野兽,保护了她的小伙伴们。从那以后,小红变得更加勇敢和自信,成为了她小伙伴们心中的英雄。" 167 | en_text = "I didn’t know what to do. I said please kill her because it would be better than being kidnapped,” Ben, whose surname CNN is not using for security concerns, said on Wednesday. “It’s a nightmare. I said ‘please kill her, don’t take her there.’" 168 | sp_text = "¡Claro! ¿En qué tema te gustaría que te hable en español? Puedo proporcionarte información o conversar contigo sobre una amplia variedad de temas, desde cultura y comida hasta viajes y tecnología. ¿Tienes alguna preferencia en particular?" 169 | fr_text = "Bien sûr ! En quelle matière voudriez-vous que je vous parle en français ? Je peux vous fournir des informations ou discuter avec vous sur une grande variété de sujets, que ce soit la culture, la nourriture, les voyages ou la technologie. Avez-vous une préférence particulière ?" 
170 | 171 | print(split_sentence(zh_text, language_str='ZH')) 172 | print(split_sentence(en_text, language_str='EN')) 173 | print(split_sentence(sp_text, language_str='SP')) 174 | print(split_sentence(fr_text, language_str='FR')) 175 | -------------------------------------------------------------------------------- /melo/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language, symbol_to_id=None): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | symbol_to_id_map = symbol_to_id if symbol_to_id else _symbol_to_id 15 | phones = [symbol_to_id_map[symbol] for symbol in cleaned_text] 16 | tone_start = language_tone_start_map[language] 17 | tones = [i + tone_start for i in tones] 18 | lang_id = language_id_map[language] 19 | lang_ids = [lang_id for i in phones] 20 | return phones, tones, lang_ids 21 | 22 | 23 | def get_bert(norm_text, word2ph, language, device): 24 | from .chinese_bert import get_bert_feature as zh_bert 25 | from .english_bert import get_bert_feature as en_bert 26 | from .japanese_bert import get_bert_feature as jp_bert 27 | from .chinese_mix import get_bert_feature as zh_mix_en_bert 28 | from .spanish_bert import get_bert_feature as sp_bert 29 | from .french_bert import get_bert_feature as fr_bert 30 | from .korean import get_bert_feature as kr_bert 31 | 32 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert, 'ZH_MIX_EN': zh_mix_en_bert, 33 | 'FR': fr_bert, 'SP': sp_bert, 'ES': sp_bert, "KR": kr_bert} 34 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 35 | return bert 36 | -------------------------------------------------------------------------------- /melo/text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | from .symbols import punctuation 8 | from .tone_sandhi import ToneSandhi 9 | 10 | current_file_path = os.path.dirname(__file__) 11 | pinyin_to_symbol_map = { 12 | line.split("\t")[0]: line.strip().split("\t")[1] 13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 14 | } 15 | 16 | import jieba.posseg as psg 17 | 18 | 19 | rep_map = { 20 | ":": ",", 21 | ";": ",", 22 | ",": ",", 23 | "。": ".", 24 | "!": "!", 25 | "?": "?", 26 | "\n": ".", 27 | "·": ",", 28 | "、": ",", 29 | "...": "…", 30 | "$": ".", 31 | "“": "'", 32 | "”": "'", 33 | "‘": "'", 34 | "’": "'", 35 | "(": "'", 36 | ")": "'", 37 | "(": "'", 38 | ")": "'", 39 | "《": "'", 40 | "》": "'", 41 | "【": "'", 42 | "】": "'", 43 | "[": "'", 44 | "]": "'", 45 | "—": "-", 46 | "~": "-", 47 | "~": "-", 48 | "「": "'", 49 | "」": "'", 50 | } 51 | 52 | tone_modifier = ToneSandhi() 53 | 54 | 55 | def replace_punctuation(text): 56 | text = text.replace("嗯", "恩").replace("呣", "母") 57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 58 | 59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 60 | 61 | replaced_text = re.sub( 62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 63 | ) 64 | 65 | return replaced_text 66 | 67 | 68 | def g2p(text): 69 | pattern = 
r"(?<=[{0}])\s*".format("".join(punctuation)) 70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 71 | phones, tones, word2ph = _g2p(sentences) 72 | assert sum(word2ph) == len(phones) 73 | assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 74 | phones = ["_"] + phones + ["_"] 75 | tones = [0] + tones + [0] 76 | word2ph = [1] + word2ph + [1] 77 | return phones, tones, word2ph 78 | 79 | 80 | def _get_initials_finals(word): 81 | initials = [] 82 | finals = [] 83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 84 | orig_finals = lazy_pinyin( 85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 86 | ) 87 | for c, v in zip(orig_initials, orig_finals): 88 | initials.append(c) 89 | finals.append(v) 90 | return initials, finals 91 | 92 | 93 | def _g2p(segments): 94 | phones_list = [] 95 | tones_list = [] 96 | word2ph = [] 97 | for seg in segments: 98 | # Replace all English words in the sentence 99 | seg = re.sub("[a-zA-Z]+", "", seg) 100 | seg_cut = psg.lcut(seg) 101 | initials = [] 102 | finals = [] 103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 104 | for word, pos in seg_cut: 105 | if pos == "eng": 106 | import pdb; pdb.set_trace() 107 | continue 108 | sub_initials, sub_finals = _get_initials_finals(word) 109 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 110 | initials.append(sub_initials) 111 | finals.append(sub_finals) 112 | 113 | # assert len(sub_initials) == len(sub_finals) == len(word) 114 | initials = sum(initials, []) 115 | finals = sum(finals, []) 116 | # 117 | for c, v in zip(initials, finals): 118 | raw_pinyin = c + v 119 | # NOTE: post process for pypinyin outputs 120 | # we discriminate i, ii and iii 121 | if c == v: 122 | assert c in punctuation 123 | phone = [c] 124 | tone = "0" 125 | word2ph.append(1) 126 | else: 127 | v_without_tone = v[:-1] 128 | tone = v[-1] 129 | 130 | pinyin = c + v_without_tone 131 | assert tone in "12345" 132 | 133 | if c: 134 | # 多音节 135 | v_rep_map = { 136 | "uei": "ui", 137 | "iou": "iu", 138 | "uen": "un", 139 | } 140 | if v_without_tone in v_rep_map.keys(): 141 | pinyin = c + v_rep_map[v_without_tone] 142 | else: 143 | # 单音节 144 | pinyin_rep_map = { 145 | "ing": "ying", 146 | "i": "yi", 147 | "in": "yin", 148 | "u": "wu", 149 | } 150 | if pinyin in pinyin_rep_map.keys(): 151 | pinyin = pinyin_rep_map[pinyin] 152 | else: 153 | single_rep_map = { 154 | "v": "yu", 155 | "e": "e", 156 | "i": "y", 157 | "u": "w", 158 | } 159 | if pinyin[0] in single_rep_map.keys(): 160 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 161 | 162 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 163 | phone = pinyin_to_symbol_map[pinyin].split(" ") 164 | word2ph.append(len(phone)) 165 | 166 | phones_list += phone 167 | tones_list += [int(tone)] * len(phone) 168 | return phones_list, tones_list, word2ph 169 | 170 | 171 | def text_normalize(text): 172 | numbers = re.findall(r"\d+(?:\.?\d+)?", text) 173 | for number in numbers: 174 | text = text.replace(number, cn2an.an2cn(number), 1) 175 | text = replace_punctuation(text) 176 | return text 177 | 178 | 179 | def get_bert_feature(text, word2ph, device=None): 180 | from text import chinese_bert 181 | 182 | return chinese_bert.get_bert_feature(text, word2ph, device=device) 183 | 184 | 185 | if __name__ == "__main__": 186 | from text.chinese_bert import get_bert_feature 187 | 188 | text = "啊!chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏" 189 | text = text_normalize(text) 
190 | print(text) 191 | phones, tones, word2ph = g2p(text) 192 | bert = get_bert_feature(text, word2ph) 193 | 194 | print(phones, tones, word2ph, bert.shape) 195 | 196 | 197 | # # 示例用法 198 | # text = "这是一个示例文本:,你好!这是一个测试...." 199 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 200 | -------------------------------------------------------------------------------- /melo/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | 6 | # model_id = 'hfl/chinese-roberta-wwm-ext-large' 7 | local_path = "./bert/chinese-roberta-wwm-ext-large" 8 | 9 | 10 | tokenizers = {} 11 | models = {} 12 | 13 | def get_bert_feature(text, word2ph, device=None, model_id='hfl/chinese-roberta-wwm-ext-large'): 14 | if model_id not in models: 15 | models[model_id] = AutoModelForMaskedLM.from_pretrained( 16 | model_id 17 | ).to(device) 18 | tokenizers[model_id] = AutoTokenizer.from_pretrained(model_id) 19 | model = models[model_id] 20 | tokenizer = tokenizers[model_id] 21 | 22 | if ( 23 | sys.platform == "darwin" 24 | and torch.backends.mps.is_available() 25 | and device == "cpu" 26 | ): 27 | device = "mps" 28 | if not device: 29 | device = "cuda" 30 | 31 | with torch.no_grad(): 32 | inputs = tokenizer(text, return_tensors="pt") 33 | for i in inputs: 34 | inputs[i] = inputs[i].to(device) 35 | res = model(**inputs, output_hidden_states=True) 36 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 37 | # import pdb; pdb.set_trace() 38 | # assert len(word2ph) == len(text) + 2 39 | word2phone = word2ph 40 | phone_level_feature = [] 41 | for i in range(len(word2phone)): 42 | repeat_feature = res[i].repeat(word2phone[i], 1) 43 | phone_level_feature.append(repeat_feature) 44 | 45 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 46 | return phone_level_feature.T 47 | 48 | 49 | if __name__ == "__main__": 50 | import torch 51 | 52 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 53 | word2phone = [ 54 | 1, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 1, 63 | 2, 64 | 2, 65 | 1, 66 | 2, 67 | 2, 68 | 2, 69 | 2, 70 | 2, 71 | 1, 72 | 1, 73 | 2, 74 | 2, 75 | 1, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 2, 86 | 1, 87 | 2, 88 | 2, 89 | 2, 90 | 2, 91 | 1, 92 | ] 93 | 94 | # 计算总帧数 95 | total_frames = sum(word2phone) 96 | print(word_level_feature.shape) 97 | print(word2phone) 98 | phone_level_feature = [] 99 | for i in range(len(word2phone)): 100 | print(word_level_feature[i].shape) 101 | 102 | # 对每个词重复word2phone[i]次 103 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 104 | phone_level_feature.append(repeat_feature) 105 | 106 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 107 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 108 | -------------------------------------------------------------------------------- /melo/text/chinese_mix.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | # from text.symbols import punctuation 8 | from .symbols import language_tone_start_map 9 | from .tone_sandhi import ToneSandhi 10 | from .english import g2p as g2p_en 11 | from transformers import AutoTokenizer 12 | 13 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 14 | current_file_path = os.path.dirname(__file__) 15 | pinyin_to_symbol_map = { 16 | 
line.split("\t")[0]: line.strip().split("\t")[1] 17 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 18 | } 19 | 20 | import jieba.posseg as psg 21 | 22 | 23 | rep_map = { 24 | ":": ",", 25 | ";": ",", 26 | ",": ",", 27 | "。": ".", 28 | "!": "!", 29 | "?": "?", 30 | "\n": ".", 31 | "·": ",", 32 | "、": ",", 33 | "...": "…", 34 | "$": ".", 35 | "“": "'", 36 | "”": "'", 37 | "‘": "'", 38 | "’": "'", 39 | "(": "'", 40 | ")": "'", 41 | "(": "'", 42 | ")": "'", 43 | "《": "'", 44 | "》": "'", 45 | "【": "'", 46 | "】": "'", 47 | "[": "'", 48 | "]": "'", 49 | "—": "-", 50 | "~": "-", 51 | "~": "-", 52 | "「": "'", 53 | "」": "'", 54 | } 55 | 56 | tone_modifier = ToneSandhi() 57 | 58 | 59 | def replace_punctuation(text): 60 | text = text.replace("嗯", "恩").replace("呣", "母") 61 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 62 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 63 | replaced_text = re.sub(r"[^\u4e00-\u9fa5_a-zA-Z\s" + "".join(punctuation) + r"]+", "", replaced_text) 64 | replaced_text = re.sub(r"[\s]+", " ", replaced_text) 65 | 66 | return replaced_text 67 | 68 | 69 | def g2p(text, impl='v2'): 70 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 71 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 72 | if impl == 'v1': 73 | _func = _g2p 74 | elif impl == 'v2': 75 | _func = _g2p_v2 76 | else: 77 | raise NotImplementedError() 78 | phones, tones, word2ph = _func(sentences) 79 | assert sum(word2ph) == len(phones) 80 | # assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 81 | phones = ["_"] + phones + ["_"] 82 | tones = [0] + tones + [0] 83 | word2ph = [1] + word2ph + [1] 84 | return phones, tones, word2ph 85 | 86 | 87 | def _get_initials_finals(word): 88 | initials = [] 89 | finals = [] 90 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 91 | orig_finals = lazy_pinyin( 92 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 93 | ) 94 | for c, v in zip(orig_initials, orig_finals): 95 | initials.append(c) 96 | finals.append(v) 97 | return initials, finals 98 | 99 | model_id = 'bert-base-multilingual-uncased' 100 | tokenizer = AutoTokenizer.from_pretrained(model_id) 101 | def _g2p(segments): 102 | phones_list = [] 103 | tones_list = [] 104 | word2ph = [] 105 | for seg in segments: 106 | # Replace all English words in the sentence 107 | # seg = re.sub("[a-zA-Z]+", "", seg) 108 | seg_cut = psg.lcut(seg) 109 | initials = [] 110 | finals = [] 111 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 112 | for word, pos in seg_cut: 113 | if pos == "eng": 114 | initials.append(['EN_WORD']) 115 | finals.append([word]) 116 | else: 117 | sub_initials, sub_finals = _get_initials_finals(word) 118 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 119 | initials.append(sub_initials) 120 | finals.append(sub_finals) 121 | 122 | # assert len(sub_initials) == len(sub_finals) == len(word) 123 | initials = sum(initials, []) 124 | finals = sum(finals, []) 125 | # 126 | for c, v in zip(initials, finals): 127 | if c == 'EN_WORD': 128 | tokenized_en = tokenizer.tokenize(v) 129 | phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en) 130 | # apply offset to tones_en 131 | tones_en = [t + language_tone_start_map['EN'] for t in tones_en] 132 | phones_list += phones_en 133 | tones_list += tones_en 134 | word2ph += word2ph_en 135 | else: 136 | raw_pinyin = c + v 137 | # NOTE: post 
process for pypinyin outputs 138 | # we discriminate i, ii and iii 139 | if c == v: 140 | assert c in punctuation 141 | phone = [c] 142 | tone = "0" 143 | word2ph.append(1) 144 | else: 145 | v_without_tone = v[:-1] 146 | tone = v[-1] 147 | 148 | pinyin = c + v_without_tone 149 | assert tone in "12345" 150 | 151 | if c: 152 | # 多音节 153 | v_rep_map = { 154 | "uei": "ui", 155 | "iou": "iu", 156 | "uen": "un", 157 | } 158 | if v_without_tone in v_rep_map.keys(): 159 | pinyin = c + v_rep_map[v_without_tone] 160 | else: 161 | # 单音节 162 | pinyin_rep_map = { 163 | "ing": "ying", 164 | "i": "yi", 165 | "in": "yin", 166 | "u": "wu", 167 | } 168 | if pinyin in pinyin_rep_map.keys(): 169 | pinyin = pinyin_rep_map[pinyin] 170 | else: 171 | single_rep_map = { 172 | "v": "yu", 173 | "e": "e", 174 | "i": "y", 175 | "u": "w", 176 | } 177 | if pinyin[0] in single_rep_map.keys(): 178 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 179 | 180 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 181 | phone = pinyin_to_symbol_map[pinyin].split(" ") 182 | word2ph.append(len(phone)) 183 | 184 | phones_list += phone 185 | tones_list += [int(tone)] * len(phone) 186 | return phones_list, tones_list, word2ph 187 | 188 | 189 | def text_normalize(text): 190 | numbers = re.findall(r"\d+(?:\.?\d+)?", text) 191 | for number in numbers: 192 | text = text.replace(number, cn2an.an2cn(number), 1) 193 | text = replace_punctuation(text) 194 | return text 195 | 196 | 197 | def get_bert_feature(text, word2ph, device): 198 | from . import chinese_bert 199 | return chinese_bert.get_bert_feature(text, word2ph, model_id='bert-base-multilingual-uncased', device=device) 200 | 201 | from .chinese import _g2p as _chinese_g2p 202 | def _g2p_v2(segments): 203 | spliter = '#$&^!@' 204 | 205 | phones_list = [] 206 | tones_list = [] 207 | word2ph = [] 208 | 209 | for text in segments: 210 | assert spliter not in text 211 | # replace all english words 212 | text = re.sub('([a-zA-Z\s]+)', lambda x: f'{spliter}{x.group(1)}{spliter}', text) 213 | texts = text.split(spliter) 214 | texts = [t for t in texts if len(t) > 0] 215 | 216 | 217 | for text in texts: 218 | if re.match('[a-zA-Z\s]+', text): 219 | # english 220 | tokenized_en = tokenizer.tokenize(text) 221 | phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en) 222 | # apply offset to tones_en 223 | tones_en = [t + language_tone_start_map['EN'] for t in tones_en] 224 | phones_list += phones_en 225 | tones_list += tones_en 226 | word2ph += word2ph_en 227 | else: 228 | phones_zh, tones_zh, word2ph_zh = _chinese_g2p([text]) 229 | phones_list += phones_zh 230 | tones_list += tones_zh 231 | word2ph += word2ph_zh 232 | return phones_list, tones_list, word2ph 233 | 234 | 235 | 236 | if __name__ == "__main__": 237 | # from text.chinese_bert import get_bert_feature 238 | 239 | text = "NFT啊!chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏" 240 | text = '我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。' 241 | text = '今天下午,我们准备去shopping mall购物,然后晚上去看一场movie。' 242 | text = '我们现在 also 能够 help 很多公司 use some machine learning 的 algorithms 啊!' 243 | text = text_normalize(text) 244 | print(text) 245 | phones, tones, word2ph = g2p(text, impl='v2') 246 | bert = get_bert_feature(text, word2ph, device='cuda:0') 247 | print(phones) 248 | import pdb; pdb.set_trace() 249 | 250 | 251 | # # 示例用法 252 | # text = "这是一个示例文本:,你好!这是一个测试...." 
253 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 254 | -------------------------------------------------------------------------------- /melo/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, english, chinese_mix, korean, french, spanish 2 | from . import cleaned_text_to_sequence 3 | import copy 4 | 5 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english, 'ZH_MIX_EN': chinese_mix, 'KR': korean, 6 | 'FR': french, 'SP': spanish, 'ES': spanish} 7 | 8 | 9 | def clean_text(text, language): 10 | language_module = language_module_map[language] 11 | norm_text = language_module.text_normalize(text) 12 | phones, tones, word2ph = language_module.g2p(norm_text) 13 | return norm_text, phones, tones, word2ph 14 | 15 | 16 | def clean_text_bert(text, language, device=None): 17 | language_module = language_module_map[language] 18 | norm_text = language_module.text_normalize(text) 19 | phones, tones, word2ph = language_module.g2p(norm_text) 20 | 21 | word2ph_bak = copy.deepcopy(word2ph) 22 | for i in range(len(word2ph)): 23 | word2ph[i] = word2ph[i] * 2 24 | word2ph[0] += 1 25 | bert = language_module.get_bert_feature(norm_text, word2ph, device=device) 26 | 27 | return norm_text, phones, tones, word2ph_bak, bert 28 | 29 | 30 | def text_to_sequence(text, language): 31 | norm_text, phones, tones, word2ph = clean_text(text, language) 32 | return cleaned_text_to_sequence(phones, tones, language) 33 | 34 | 35 | if __name__ == "__main__": 36 | pass -------------------------------------------------------------------------------- /melo/text/cleaner_multiling.py: -------------------------------------------------------------------------------- 1 | """Set of default text cleaners""" 2 | # TODO: pick the cleaner for languages dynamically 3 | 4 | import re 5 | 6 | # Regular expression matching whitespace: 7 | _whitespace_re = re.compile(r"\s+") 8 | 9 | rep_map = { 10 | ":": ",", 11 | ";": ",", 12 | ",": ",", 13 | "。": ".", 14 | "!": "!", 15 | "?": "?", 16 | "\n": ".", 17 | "·": ",", 18 | "、": ",", 19 | "...": ".", 20 | "…": ".", 21 | "$": ".", 22 | "“": "'", 23 | "”": "'", 24 | "‘": "'", 25 | "’": "'", 26 | "(": "'", 27 | ")": "'", 28 | "(": "'", 29 | ")": "'", 30 | "《": "'", 31 | "》": "'", 32 | "【": "'", 33 | "】": "'", 34 | "[": "'", 35 | "]": "'", 36 | "—": "", 37 | "~": "-", 38 | "~": "-", 39 | "「": "'", 40 | "」": "'", 41 | } 42 | 43 | def replace_punctuation(text): 44 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 45 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 46 | return replaced_text 47 | 48 | def lowercase(text): 49 | return text.lower() 50 | 51 | 52 | def collapse_whitespace(text): 53 | return re.sub(_whitespace_re, " ", text).strip() 54 | 55 | def remove_punctuation_at_begin(text): 56 | return re.sub(r'^[,.!?]+', '', text) 57 | 58 | def remove_aux_symbols(text): 59 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text) 60 | return text 61 | 62 | 63 | def replace_symbols(text, lang="en"): 64 | """Replace symbols based on the lenguage tag. 65 | 66 | Args: 67 | text: 68 | Input text. 69 | lang: 70 | Lenguage identifier. ex: "en", "fr", "pt", "ca". 
71 | 72 | Returns: 73 | The modified text 74 | example: 75 | input args: 76 | text: "si l'avi cau, diguem-ho" 77 | lang: "ca" 78 | Output: 79 | text: "si lavi cau, diguemho" 80 | """ 81 | text = text.replace(";", ",") 82 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "") 83 | text = text.replace(":", ",") 84 | if lang == "en": 85 | text = text.replace("&", " and ") 86 | elif lang == "fr": 87 | text = text.replace("&", " et ") 88 | elif lang == "pt": 89 | text = text.replace("&", " e ") 90 | elif lang == "ca": 91 | text = text.replace("&", " i ") 92 | text = text.replace("'", "") 93 | elif lang== "es": 94 | text=text.replace("&","y") 95 | text = text.replace("'", "") 96 | return text 97 | 98 | def unicleaners(text, cased=False, lang='en'): 99 | """Basic pipeline for Portuguese text. There is no need to expand abbreviation and 100 | numbers, phonemizer already does that""" 101 | if not cased: 102 | text = lowercase(text) 103 | text = replace_punctuation(text) 104 | text = replace_symbols(text, lang=lang) 105 | text = remove_aux_symbols(text) 106 | text = remove_punctuation_at_begin(text) 107 | text = collapse_whitespace(text) 108 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text) 109 | return text 110 | 111 | -------------------------------------------------------------------------------- /melo/text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /melo/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from . 
import symbols 7 | 8 | from .english_utils.abbreviations import expand_abbreviations 9 | from .english_utils.time_norm import expand_time_english 10 | from .english_utils.number_norm import normalize_numbers 11 | from .japanese import distribute_phone 12 | 13 | from transformers import AutoTokenizer 14 | 15 | current_file_path = os.path.dirname(__file__) 16 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 17 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 18 | _g2p = G2p() 19 | 20 | arpa = { 21 | "AH0", 22 | "S", 23 | "AH1", 24 | "EY2", 25 | "AE2", 26 | "EH0", 27 | "OW2", 28 | "UH0", 29 | "NG", 30 | "B", 31 | "G", 32 | "AY0", 33 | "M", 34 | "AA0", 35 | "F", 36 | "AO0", 37 | "ER2", 38 | "UH1", 39 | "IY1", 40 | "AH2", 41 | "DH", 42 | "IY0", 43 | "EY1", 44 | "IH0", 45 | "K", 46 | "N", 47 | "W", 48 | "IY2", 49 | "T", 50 | "AA1", 51 | "ER1", 52 | "EH2", 53 | "OY0", 54 | "UH2", 55 | "UW1", 56 | "Z", 57 | "AW2", 58 | "AW1", 59 | "V", 60 | "UW2", 61 | "AA2", 62 | "ER", 63 | "AW0", 64 | "UW0", 65 | "R", 66 | "OW1", 67 | "EH1", 68 | "ZH", 69 | "AE0", 70 | "IH2", 71 | "IH", 72 | "Y", 73 | "JH", 74 | "P", 75 | "AY1", 76 | "EY0", 77 | "OY2", 78 | "TH", 79 | "HH", 80 | "D", 81 | "ER0", 82 | "CH", 83 | "AO1", 84 | "AE1", 85 | "AO2", 86 | "OY1", 87 | "AY2", 88 | "IH1", 89 | "OW0", 90 | "L", 91 | "SH", 92 | } 93 | 94 | 95 | def post_replace_ph(ph): 96 | rep_map = { 97 | ":": ",", 98 | ";": ",", 99 | ",": ",", 100 | "。": ".", 101 | "!": "!", 102 | "?": "?", 103 | "\n": ".", 104 | "·": ",", 105 | "、": ",", 106 | "...": "…", 107 | "v": "V", 108 | } 109 | if ph in rep_map.keys(): 110 | ph = rep_map[ph] 111 | if ph in symbols: 112 | return ph 113 | if ph not in symbols: 114 | ph = "UNK" 115 | return ph 116 | 117 | 118 | def read_dict(): 119 | g2p_dict = {} 120 | start_line = 49 121 | with open(CMU_DICT_PATH) as f: 122 | line = f.readline() 123 | line_index = 1 124 | while line: 125 | if line_index >= start_line: 126 | line = line.strip() 127 | word_split = line.split(" ") 128 | word = word_split[0] 129 | 130 | syllable_split = word_split[1].split(" - ") 131 | g2p_dict[word] = [] 132 | for syllable in syllable_split: 133 | phone_split = syllable.split(" ") 134 | g2p_dict[word].append(phone_split) 135 | 136 | line_index = line_index + 1 137 | line = f.readline() 138 | 139 | return g2p_dict 140 | 141 | 142 | def cache_dict(g2p_dict, file_path): 143 | with open(file_path, "wb") as pickle_file: 144 | pickle.dump(g2p_dict, pickle_file) 145 | 146 | 147 | def get_dict(): 148 | if os.path.exists(CACHE_PATH): 149 | with open(CACHE_PATH, "rb") as pickle_file: 150 | g2p_dict = pickle.load(pickle_file) 151 | else: 152 | g2p_dict = read_dict() 153 | cache_dict(g2p_dict, CACHE_PATH) 154 | 155 | return g2p_dict 156 | 157 | 158 | eng_dict = get_dict() 159 | 160 | 161 | def refine_ph(phn): 162 | tone = 0 163 | if re.search(r"\d$", phn): 164 | tone = int(phn[-1]) + 1 165 | phn = phn[:-1] 166 | return phn.lower(), tone 167 | 168 | 169 | def refine_syllables(syllables): 170 | tones = [] 171 | phonemes = [] 172 | for phn_list in syllables: 173 | for i in range(len(phn_list)): 174 | phn = phn_list[i] 175 | phn, tone = refine_ph(phn) 176 | phonemes.append(phn) 177 | tones.append(tone) 178 | return phonemes, tones 179 | 180 | 181 | def text_normalize(text): 182 | text = text.lower() 183 | text = expand_time_english(text) 184 | text = normalize_numbers(text) 185 | text = expand_abbreviations(text) 186 | return text 187 | 188 | model_id = 'bert-base-uncased' 189 | tokenizer = 
AutoTokenizer.from_pretrained(model_id) 190 | def g2p_old(text): 191 | tokenized = tokenizer.tokenize(text) 192 | # import pdb; pdb.set_trace() 193 | phones = [] 194 | tones = [] 195 | words = re.split(r"([,;.\-\?\!\s+])", text) 196 | for w in words: 197 | if w.upper() in eng_dict: 198 | phns, tns = refine_syllables(eng_dict[w.upper()]) 199 | phones += phns 200 | tones += tns 201 | else: 202 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 203 | for ph in phone_list: 204 | if ph in arpa: 205 | ph, tn = refine_ph(ph) 206 | phones.append(ph) 207 | tones.append(tn) 208 | else: 209 | phones.append(ph) 210 | tones.append(0) 211 | # todo: implement word2ph 212 | word2ph = [1 for i in phones] 213 | 214 | phones = [post_replace_ph(i) for i in phones] 215 | return phones, tones, word2ph 216 | 217 | def g2p(text, pad_start_end=True, tokenized=None): 218 | if tokenized is None: 219 | tokenized = tokenizer.tokenize(text) 220 | # import pdb; pdb.set_trace() 221 | phs = [] 222 | ph_groups = [] 223 | for t in tokenized: 224 | if not t.startswith("#"): 225 | ph_groups.append([t]) 226 | else: 227 | ph_groups[-1].append(t.replace("#", "")) 228 | 229 | phones = [] 230 | tones = [] 231 | word2ph = [] 232 | for group in ph_groups: 233 | w = "".join(group) 234 | phone_len = 0 235 | word_len = len(group) 236 | if w.upper() in eng_dict: 237 | phns, tns = refine_syllables(eng_dict[w.upper()]) 238 | phones += phns 239 | tones += tns 240 | phone_len += len(phns) 241 | else: 242 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 243 | for ph in phone_list: 244 | if ph in arpa: 245 | ph, tn = refine_ph(ph) 246 | phones.append(ph) 247 | tones.append(tn) 248 | else: 249 | phones.append(ph) 250 | tones.append(0) 251 | phone_len += 1 252 | aaa = distribute_phone(phone_len, word_len) 253 | word2ph += aaa 254 | phones = [post_replace_ph(i) for i in phones] 255 | 256 | if pad_start_end: 257 | phones = ["_"] + phones + ["_"] 258 | tones = [0] + tones + [0] 259 | word2ph = [1] + word2ph + [1] 260 | return phones, tones, word2ph 261 | 262 | def get_bert_feature(text, word2ph, device=None): 263 | from text import english_bert 264 | 265 | return english_bert.get_bert_feature(text, word2ph, device=device) 266 | 267 | if __name__ == "__main__": 268 | # print(get_dict()) 269 | # print(eng_word_to_phoneme("hello")) 270 | from text.english_bert import get_bert_feature 271 | text = "In this paper, we propose 1 DSPGAN, a N-F-T GAN-based universal vocoder." 
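    # Added note: text_normalize() lowercases the sentence and expands times, numbers
    # and abbreviations (so "1" becomes "one") before g2p() tokenizes it with the
    # bert-base-uncased tokenizer and looks each word piece up in the CMU dictionary,
    # falling back to g2p_en for out-of-vocabulary words.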
272 | text = text_normalize(text) 273 | phones, tones, word2ph = g2p(text) 274 | import pdb; pdb.set_trace() 275 | bert = get_bert_feature(text, word2ph) 276 | 277 | print(phones, tones, word2ph, bert.shape) 278 | 279 | # all_phones = set() 280 | # for k, syllables in eng_dict.items(): 281 | # for group in syllables: 282 | # for ph in group: 283 | # all_phones.add(ph) 284 | # print(all_phones) 285 | -------------------------------------------------------------------------------- /melo/text/english_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | model_id = 'bert-base-uncased' 6 | tokenizer = AutoTokenizer.from_pretrained(model_id) 7 | model = None 8 | 9 | def get_bert_feature(text, word2ph, device=None): 10 | global model 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if model is None: 20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 21 | device 22 | ) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = model(**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert inputs["input_ids"].shape[-1] == len(word2ph) 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | -------------------------------------------------------------------------------- /melo/text/english_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/english_utils/__init__.py -------------------------------------------------------------------------------- /melo/text/english_utils/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in english: 4 | abbreviations_en = [ 5 | (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("mrs", "misess"), 8 | ("mr", "mister"), 9 | ("dr", "doctor"), 10 | ("st", "saint"), 11 | ("co", "company"), 12 | ("jr", "junior"), 13 | ("maj", "major"), 14 | ("gen", "general"), 15 | ("drs", "doctors"), 16 | ("rev", "reverend"), 17 | ("lt", "lieutenant"), 18 | ("hon", "honorable"), 19 | ("sgt", "sergeant"), 20 | ("capt", "captain"), 21 | ("esq", "esquire"), 22 | ("ltd", "limited"), 23 | ("col", "colonel"), 24 | ("ft", "fort"), 25 | ] 26 | ] 27 | 28 | def expand_abbreviations(text, lang="en"): 29 | if lang == "en": 30 | _abbreviations = abbreviations_en 31 | else: 32 | raise NotImplementedError() 33 | for regex, replacement in _abbreviations: 34 | text = re.sub(regex, replacement, text) 35 | return text -------------------------------------------------------------------------------- /melo/text/english_utils/number_norm.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | from typing import Dict 5 | 6 | import inflect 7 | 8 | _inflect = inflect.engine() 9 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 10 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 11 | _currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"-?[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def __expand_currency(value: str, inflection: Dict[float, str]) -> str: 25 | parts = value.replace(",", "").split(".") 26 | if len(parts) > 2: 27 | return f"{value} {inflection[2]}" # Unexpected format 28 | text = [] 29 | integer = int(parts[0]) if parts[0] else 0 30 | if integer > 0: 31 | integer_unit = inflection.get(integer, inflection[2]) 32 | text.append(f"{integer} {integer_unit}") 33 | fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0 34 | if fraction > 0: 35 | fraction_unit = inflection.get(fraction / 100, inflection[0.02]) 36 | text.append(f"{fraction} {fraction_unit}") 37 | if len(text) == 0: 38 | return f"zero {inflection[2]}" 39 | return " ".join(text) 40 | 41 | 42 | def _expand_currency(m: "re.Match") -> str: 43 | currencies = { 44 | "$": { 45 | 0.01: "cent", 46 | 0.02: "cents", 47 | 1: "dollar", 48 | 2: "dollars", 49 | }, 50 | "€": { 51 | 0.01: "cent", 52 | 0.02: "cents", 53 | 1: "euro", 54 | 2: "euros", 55 | }, 56 | "£": { 57 | 0.01: "penny", 58 | 0.02: "pence", 59 | 1: "pound sterling", 60 | 2: "pounds sterling", 61 | }, 62 | "¥": { 63 | # TODO rin 64 | 0.02: "sen", 65 | 2: "yen", 66 | }, 67 | } 68 | unit = m.group(1) 69 | currency = currencies[unit] 70 | value = m.group(2) 71 | return __expand_currency(value, currency) 72 | 73 | 74 | def _expand_ordinal(m): 75 | return _inflect.number_to_words(m.group(0)) 76 | 77 | 78 | def _expand_number(m): 79 | num = int(m.group(0)) 80 | if 1000 < num < 3000: 81 | if num == 2000: 82 | return "two thousand" 83 | if 2000 < num < 2010: 84 | return "two thousand " + _inflect.number_to_words(num % 100) 85 | if num % 100 == 0: 86 | return _inflect.number_to_words(num // 100) + " hundred" 87 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 88 | return _inflect.number_to_words(num, andword="") 89 | 90 | 91 | def normalize_numbers(text): 92 | text = re.sub(_comma_number_re, _remove_commas, text) 93 | text = re.sub(_currency_re, 
_expand_currency, text) 94 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 95 | text = re.sub(_ordinal_re, _expand_ordinal, text) 96 | text = re.sub(_number_re, _expand_number, text) 97 | return text -------------------------------------------------------------------------------- /melo/text/english_utils/time_norm.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | 5 | _inflect = inflect.engine() 6 | 7 | _time_re = re.compile( 8 | r"""\b 9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours 10 | : 11 | ([0-5][0-9]) # minutes 12 | \s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm 13 | \b""", 14 | re.IGNORECASE | re.X, 15 | ) 16 | 17 | 18 | def _expand_num(n: int) -> str: 19 | return _inflect.number_to_words(n) 20 | 21 | 22 | def _expand_time_english(match: "re.Match") -> str: 23 | hour = int(match.group(1)) 24 | past_noon = hour >= 12 25 | time = [] 26 | if hour > 12: 27 | hour -= 12 28 | elif hour == 0: 29 | hour = 12 30 | past_noon = True 31 | time.append(_expand_num(hour)) 32 | 33 | minute = int(match.group(6)) 34 | if minute > 0: 35 | if minute < 10: 36 | time.append("oh") 37 | time.append(_expand_num(minute)) 38 | am_pm = match.group(7) 39 | if am_pm is None: 40 | time.append("p m" if past_noon else "a m") 41 | else: 42 | time.extend(list(am_pm.replace(".", ""))) 43 | return " ".join(time) 44 | 45 | 46 | def expand_time_english(text: str) -> str: 47 | return re.sub(_time_re, _expand_time_english, text) -------------------------------------------------------------------------------- /melo/text/es_phonemizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/es_phonemizer/__init__.py -------------------------------------------------------------------------------- /melo/text/es_phonemizer/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import List, Tuple 3 | 4 | from .punctuation import Punctuation 5 | 6 | 7 | class BasePhonemizer(abc.ABC): 8 | """Base phonemizer class 9 | 10 | Phonemization follows the following steps: 11 | 1. Preprocessing: 12 | - remove empty lines 13 | - remove punctuation 14 | - keep track of punctuation marks 15 | 16 | 2. Phonemization: 17 | - convert text to phonemes 18 | 19 | 3. Postprocessing: 20 | - join phonemes 21 | - restore punctuation marks 22 | 23 | Args: 24 | language (str): 25 | Language used by the phonemizer. 26 | 27 | punctuations (List[str]): 28 | List of punctuation marks to be preserved. 29 | 30 | keep_puncs (bool): 31 | Whether to preserve punctuation marks or not. 
32 | """ 33 | 34 | def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): 35 | # ensure the backend is installed on the system 36 | if not self.is_available(): 37 | raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover 38 | 39 | # ensure the backend support the requested language 40 | self._language = self._init_language(language) 41 | 42 | # setup punctuation processing 43 | self._keep_puncs = keep_puncs 44 | self._punctuator = Punctuation(punctuations) 45 | 46 | def _init_language(self, language): 47 | """Language initialization 48 | 49 | This method may be overloaded in child classes (see Segments backend) 50 | 51 | """ 52 | if not self.is_supported_language(language): 53 | raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") 54 | return language 55 | 56 | @property 57 | def language(self): 58 | """The language code configured to be used for phonemization""" 59 | return self._language 60 | 61 | @staticmethod 62 | @abc.abstractmethod 63 | def name(): 64 | """The name of the backend""" 65 | ... 66 | 67 | @classmethod 68 | @abc.abstractmethod 69 | def is_available(cls): 70 | """Returns True if the backend is installed, False otherwise""" 71 | ... 72 | 73 | @classmethod 74 | @abc.abstractmethod 75 | def version(cls): 76 | """Return the backend version as a tuple (major, minor, patch)""" 77 | ... 78 | 79 | @staticmethod 80 | @abc.abstractmethod 81 | def supported_languages(): 82 | """Return a dict of language codes -> name supported by the backend""" 83 | ... 84 | 85 | def is_supported_language(self, language): 86 | """Returns True if `language` is supported by the backend""" 87 | return language in self.supported_languages() 88 | 89 | @abc.abstractmethod 90 | def _phonemize(self, text, separator): 91 | """The main phonemization method""" 92 | 93 | def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: 94 | """Preprocess the text before phonemization 95 | 96 | 1. remove spaces 97 | 2. remove punctuation 98 | 99 | Override this if you need a different behaviour 100 | """ 101 | text = text.strip() 102 | if self._keep_puncs: 103 | # a tuple (text, punctuation marks) 104 | return self._punctuator.strip_to_restore(text) 105 | return [self._punctuator.strip(text)], [] 106 | 107 | def _phonemize_postprocess(self, phonemized, punctuations) -> str: 108 | """Postprocess the raw phonemized output 109 | 110 | Override this if you need a different behaviour 111 | """ 112 | if self._keep_puncs: 113 | return self._punctuator.restore(phonemized, punctuations)[0] 114 | return phonemized[0] 115 | 116 | def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument 117 | """Returns the `text` phonemized for the given language 118 | 119 | Args: 120 | text (str): 121 | Text to be phonemized. 122 | 123 | separator (str): 124 | string separator used between phonemes. Default to '_'. 
125 | 126 | Returns: 127 | (str): Phonemized text 128 | """ 129 | text, punctuations = self._phonemize_preprocess(text) 130 | phonemized = [] 131 | for t in text: 132 | p = self._phonemize(t, separator) 133 | phonemized.append(p) 134 | phonemized = self._phonemize_postprocess(phonemized, punctuations) 135 | return phonemized 136 | 137 | def print_logs(self, level: int = 0): 138 | indent = "\t" * level 139 | print(f"{indent}| > phoneme language: {self.language}") 140 | print(f"{indent}| > phoneme backend: {self.name()}") -------------------------------------------------------------------------------- /melo/text/es_phonemizer/cleaner.py: -------------------------------------------------------------------------------- 1 | """Set of default text cleaners""" 2 | # TODO: pick the cleaner for languages dynamically 3 | 4 | import re 5 | 6 | # Regular expression matching whitespace: 7 | _whitespace_re = re.compile(r"\s+") 8 | 9 | rep_map = { 10 | ":": ",", 11 | ";": ",", 12 | ",": ",", 13 | "。": ".", 14 | "!": "!", 15 | "?": "?", 16 | "\n": ".", 17 | "·": ",", 18 | "、": ",", 19 | "...": ".", 20 | "…": ".", 21 | "$": ".", 22 | "“": "'", 23 | "”": "'", 24 | "‘": "'", 25 | "’": "'", 26 | "(": "'", 27 | ")": "'", 28 | "(": "'", 29 | ")": "'", 30 | "《": "'", 31 | "》": "'", 32 | "【": "'", 33 | "】": "'", 34 | "[": "'", 35 | "]": "'", 36 | "—": "", 37 | "~": "-", 38 | "~": "-", 39 | "「": "'", 40 | "」": "'", 41 | } 42 | 43 | def replace_punctuation(text): 44 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 45 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 46 | return replaced_text 47 | 48 | def lowercase(text): 49 | return text.lower() 50 | 51 | 52 | def collapse_whitespace(text): 53 | return re.sub(_whitespace_re, " ", text).strip() 54 | 55 | def remove_punctuation_at_begin(text): 56 | return re.sub(r'^[,.!?]+', '', text) 57 | 58 | def remove_aux_symbols(text): 59 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text) 60 | return text 61 | 62 | 63 | def replace_symbols(text, lang="en"): 64 | """Replace symbols based on the lenguage tag. 65 | 66 | Args: 67 | text: 68 | Input text. 69 | lang: 70 | Lenguage identifier. ex: "en", "fr", "pt", "ca". 71 | 72 | Returns: 73 | The modified text 74 | example: 75 | input args: 76 | text: "si l'avi cau, diguem-ho" 77 | lang: "ca" 78 | Output: 79 | text: "si lavi cau, diguemho" 80 | """ 81 | text = text.replace(";", ",") 82 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "") 83 | text = text.replace(":", ",") 84 | if lang == "en": 85 | text = text.replace("&", " and ") 86 | elif lang == "fr": 87 | text = text.replace("&", " et ") 88 | elif lang == "pt": 89 | text = text.replace("&", " e ") 90 | elif lang == "ca": 91 | text = text.replace("&", " i ") 92 | text = text.replace("'", "") 93 | elif lang== "es": 94 | text=text.replace("&","y") 95 | text = text.replace("'", "") 96 | return text 97 | 98 | def spanish_cleaners(text): 99 | """Basic pipeline for Portuguese text. 
There is no need to expand abbreviation and 100 | numbers, phonemizer already does that""" 101 | text = lowercase(text) 102 | text = replace_symbols(text, lang="es") 103 | text = replace_punctuation(text) 104 | text = remove_aux_symbols(text) 105 | text = remove_punctuation_at_begin(text) 106 | text = collapse_whitespace(text) 107 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text) 108 | return text 109 | 110 | -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_symbols.json: -------------------------------------------------------------------------------- 1 | { 2 | "symbols": [ 3 | "_", 4 | ",", 5 | ".", 6 | "!", 7 | "?", 8 | "-", 9 | "~", 10 | "\u2026", 11 | "N", 12 | "Q", 13 | "a", 14 | "b", 15 | "d", 16 | "e", 17 | "f", 18 | "g", 19 | "h", 20 | "i", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "o", 27 | "p", 28 | "s", 29 | "t", 30 | "u", 31 | "v", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "\u0251", 37 | "\u00e6", 38 | "\u0283", 39 | "\u0291", 40 | "\u00e7", 41 | "\u026f", 42 | "\u026a", 43 | "\u0254", 44 | "\u025b", 45 | "\u0279", 46 | "\u00f0", 47 | "\u0259", 48 | "\u026b", 49 | "\u0265", 50 | "\u0278", 51 | "\u028a", 52 | "\u027e", 53 | "\u0292", 54 | "\u03b8", 55 | "\u03b2", 56 | "\u014b", 57 | "\u0266", 58 | "\u207c", 59 | "\u02b0", 60 | "`", 61 | "^", 62 | "#", 63 | "*", 64 | "=", 65 | "\u02c8", 66 | "\u02cc", 67 | "\u2192", 68 | "\u2193", 69 | "\u2191", 70 | " ", 71 | "\u0263", 72 | "\u0261", 73 | "r", 74 | "\u0272", 75 | "\u029d", 76 | "\u028e", 77 | "\u02d0" 78 | ] 79 | } -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_symbols.txt: -------------------------------------------------------------------------------- 1 | _,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɡrɲʝɣʎː—¿¡ -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_symbols_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "symbols": [ 3 | "_", 4 | ",", 5 | ".", 6 | "!", 7 | "?", 8 | "-", 9 | "~", 10 | "\u2026", 11 | "N", 12 | "Q", 13 | "a", 14 | "b", 15 | "d", 16 | "e", 17 | "f", 18 | "g", 19 | "h", 20 | "i", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "o", 27 | "p", 28 | "s", 29 | "t", 30 | "u", 31 | "v", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "\u0251", 37 | "\u00e6", 38 | "\u0283", 39 | "\u0291", 40 | "\u00e7", 41 | "\u026f", 42 | "\u026a", 43 | "\u0254", 44 | "\u025b", 45 | "\u0279", 46 | "\u00f0", 47 | "\u0259", 48 | "\u026b", 49 | "\u0265", 50 | "\u0278", 51 | "\u028a", 52 | "\u027e", 53 | "\u0292", 54 | "\u03b8", 55 | "\u03b2", 56 | "\u014b", 57 | "\u0266", 58 | "\u207c", 59 | "\u02b0", 60 | "`", 61 | "^", 62 | "#", 63 | "*", 64 | "=", 65 | "\u02c8", 66 | "\u02cc", 67 | "\u2192", 68 | "\u2193", 69 | "\u2191", 70 | " ", 71 | "\u0261", 72 | "r", 73 | "\u0272", 74 | "\u029d", 75 | "\u0263", 76 | "\u028e", 77 | "\u02d0", 78 | 79 | "\u2014", 80 | "\u00bf", 81 | "\u00a1" 82 | ] 83 | } -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_to_ipa.py: -------------------------------------------------------------------------------- 1 | from .cleaner import spanish_cleaners 2 | from .gruut_wrapper import Gruut 3 | 4 | def es2ipa(text): 5 | e = Gruut(language="es-es", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 6 | # text = spanish_cleaners(text) 7 | phonemes = e.phonemize(text, 
separator="") 8 | return phonemes 9 | 10 | 11 | if __name__ == '__main__': 12 | print(es2ipa('¿Y a quién echaría de menos, en el mundo si no fuese a vos?')) -------------------------------------------------------------------------------- /melo/text/es_phonemizer/gruut_wrapper.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from typing import List 3 | 4 | import gruut 5 | from gruut_ipa import IPA # pip install gruut_ipa 6 | 7 | from .base import BasePhonemizer 8 | from .punctuation import Punctuation 9 | 10 | # Table for str.translate to fix gruut/TTS phoneme mismatch 11 | GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") 12 | 13 | 14 | class Gruut(BasePhonemizer): 15 | """Gruut wrapper for G2P 16 | 17 | Args: 18 | language (str): 19 | Valid language code for the used backend. 20 | 21 | punctuations (str): 22 | Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`. 23 | 24 | keep_puncs (bool): 25 | If true, keep the punctuations after phonemization. Defaults to True. 26 | 27 | use_espeak_phonemes (bool): 28 | If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False. 29 | 30 | keep_stress (bool): 31 | If true, keep the stress characters after phonemization. Defaults to False. 32 | 33 | Example: 34 | 35 | >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut 36 | >>> phonemizer = Gruut('en-us') 37 | >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|") 38 | 'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?' 39 | """ 40 | 41 | def __init__( 42 | self, 43 | language: str, 44 | punctuations=Punctuation.default_puncs(), 45 | keep_puncs=True, 46 | use_espeak_phonemes=False, 47 | keep_stress=False, 48 | ): 49 | super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) 50 | self.use_espeak_phonemes = use_espeak_phonemes 51 | self.keep_stress = keep_stress 52 | 53 | @staticmethod 54 | def name(): 55 | return "gruut" 56 | 57 | def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument 58 | """Convert input text to phonemes. 59 | 60 | Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters 61 | that constitude a single sound. 62 | 63 | It doesn't affect 🐸TTS since it individually converts each character to token IDs. 64 | 65 | Examples:: 66 | "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ` 67 | 68 | Args: 69 | text (str): 70 | Text to be converted to phonemes. 71 | 72 | tie (bool, optional) : When True use a '͡' character between 73 | consecutive characters of a single phoneme. Else separate phoneme 74 | with '_'. This option requires espeak>=1.49. Default to False. 
75 | """ 76 | ph_list = [] 77 | for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes): 78 | for word in sentence: 79 | if word.is_break: 80 | # Use actual character for break phoneme (e.g., comma) 81 | if ph_list: 82 | # Join with previous word 83 | ph_list[-1].append(word.text) 84 | else: 85 | # First word is punctuation 86 | ph_list.append([word.text]) 87 | elif word.phonemes: 88 | # Add phonemes for word 89 | word_phonemes = [] 90 | 91 | for word_phoneme in word.phonemes: 92 | if not self.keep_stress: 93 | # Remove primary/secondary stress 94 | word_phoneme = IPA.without_stress(word_phoneme) 95 | 96 | word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE) 97 | 98 | if word_phoneme: 99 | # Flatten phonemes 100 | word_phonemes.extend(word_phoneme) 101 | 102 | if word_phonemes: 103 | ph_list.append(word_phonemes) 104 | 105 | ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list] 106 | ph = f"{separator} ".join(ph_words) 107 | return ph 108 | 109 | def _phonemize(self, text, separator): 110 | return self.phonemize_gruut(text, separator, tie=False) 111 | 112 | def is_supported_language(self, language): 113 | """Returns True if `language` is supported by the backend""" 114 | return gruut.is_language_supported(language) 115 | 116 | @staticmethod 117 | def supported_languages() -> List: 118 | """Get a dictionary of supported languages. 119 | 120 | Returns: 121 | List: List of language codes. 122 | """ 123 | return list(gruut.get_supported_languages()) 124 | 125 | def version(self): 126 | """Get the version of the used backend. 127 | 128 | Returns: 129 | str: Version of the used backend. 130 | """ 131 | return gruut.__version__ 132 | 133 | @classmethod 134 | def is_available(cls): 135 | """Return true if ESpeak is available else false""" 136 | return importlib.util.find_spec("gruut") is not None 137 | 138 | 139 | if __name__ == "__main__": 140 | from es_to_ipa import es2ipa 141 | import json 142 | 143 | e = Gruut(language="es-es", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 144 | symbols = [ 145 | "_", 146 | ",", 147 | ".", 148 | "!", 149 | "?", 150 | "-", 151 | "~", 152 | "\u2026", 153 | "N", 154 | "Q", 155 | "a", 156 | "b", 157 | "d", 158 | "e", 159 | "f", 160 | "g", 161 | "h", 162 | "i", 163 | "j", 164 | "k", 165 | "l", 166 | "m", 167 | "n", 168 | "o", 169 | "p", 170 | "s", 171 | "t", 172 | "u", 173 | "v", 174 | "w", 175 | "x", 176 | "y", 177 | "z", 178 | "\u0251", 179 | "\u00e6", 180 | "\u0283", 181 | "\u0291", 182 | "\u00e7", 183 | "\u026f", 184 | "\u026a", 185 | "\u0254", 186 | "\u025b", 187 | "\u0279", 188 | "\u00f0", 189 | "\u0259", 190 | "\u026b", 191 | "\u0265", 192 | "\u0278", 193 | "\u028a", 194 | "\u027e", 195 | "\u0292", 196 | "\u03b8", 197 | "\u03b2", 198 | "\u014b", 199 | "\u0266", 200 | "\u207c", 201 | "\u02b0", 202 | "`", 203 | "^", 204 | "#", 205 | "*", 206 | "=", 207 | "\u02c8", 208 | "\u02cc", 209 | "\u2192", 210 | "\u2193", 211 | "\u2191", 212 | " ", 213 | ] 214 | with open('./text/es_phonemizer/spanish_text.txt', 'r') as f: 215 | lines = f.readlines() 216 | 217 | 218 | used_sym = [] 219 | not_existed_sym = [] 220 | phonemes = [] 221 | 222 | for line in lines[:400]: 223 | text = line.split('|')[-1].strip() 224 | ipa = es2ipa(text) 225 | phonemes.append(ipa + '\n') 226 | for s in ipa: 227 | if s not in symbols: 228 | if s not in not_existed_sym: 229 | print(f'not_existed char: {s}') 230 | not_existed_sym.append(s) 231 | else: 232 | if s not in used_sym: 233 | # print(f'used char: {s}') 234 | 
used_sym.append(s) 235 | 236 | print(used_sym) 237 | print(not_existed_sym) 238 | 239 | 240 | with open('./text/es_phonemizer/es_symbols.txt', 'w') as g: 241 | g.writelines(symbols + not_existed_sym) 242 | 243 | with open('./text/es_phonemizer/example_ipa.txt', 'w') as g: 244 | g.writelines(phonemes) 245 | 246 | data = {'symbols': symbols + not_existed_sym} 247 | with open('./text/es_phonemizer/es_symbols_v2.json', 'w') as f: 248 | json.dump(data, f, indent=4) 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /melo/text/es_phonemizer/punctuation.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import re 3 | from enum import Enum 4 | 5 | import six 6 | 7 | _DEF_PUNCS = ';:,.!?¡¿—…"«»“”' 8 | 9 | _PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"]) 10 | 11 | 12 | class PuncPosition(Enum): 13 | """Enum for the punctuations positions""" 14 | 15 | BEGIN = 0 16 | END = 1 17 | MIDDLE = 2 18 | ALONE = 3 19 | 20 | 21 | class Punctuation: 22 | """Handle punctuations in text. 23 | 24 | Just strip punctuations from text or strip and restore them later. 25 | 26 | Args: 27 | puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`. 28 | 29 | Example: 30 | >>> punc = Punctuation() 31 | >>> punc.strip("This is. example !") 32 | 'This is example' 33 | 34 | >>> text_striped, punc_map = punc.strip_to_restore("This is. example !") 35 | >>> ' '.join(text_striped) 36 | 'This is example' 37 | 38 | >>> text_restored = punc.restore(text_striped, punc_map) 39 | >>> text_restored[0] 40 | 'This is. example !' 41 | """ 42 | 43 | def __init__(self, puncs: str = _DEF_PUNCS): 44 | self.puncs = puncs 45 | 46 | @staticmethod 47 | def default_puncs(): 48 | """Return default set of punctuations.""" 49 | return _DEF_PUNCS 50 | 51 | @property 52 | def puncs(self): 53 | return self._puncs 54 | 55 | @puncs.setter 56 | def puncs(self, value): 57 | if not isinstance(value, six.string_types): 58 | raise ValueError("[!] Punctuations must be of type str.") 59 | self._puncs = "".join(list(dict.fromkeys(list(value)))) # remove duplicates without changing the oreder 60 | self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+") 61 | 62 | def strip(self, text): 63 | """Remove all the punctuations by replacing with `space`. 64 | 65 | Args: 66 | text (str): The text to be processed. 67 | 68 | Example:: 69 | 70 | "This is. example !" -> "This is example " 71 | """ 72 | return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip() 73 | 74 | def strip_to_restore(self, text): 75 | """Remove punctuations from text to restore them later. 76 | 77 | Args: 78 | text (str): The text to be processed. 79 | 80 | Examples :: 81 | 82 | "This is. example !" 
-> [["This is", "example"], [".", "!"]] 83 | 84 | """ 85 | text, puncs = self._strip_to_restore(text) 86 | return text, puncs 87 | 88 | def _strip_to_restore(self, text): 89 | """Auxiliary method for Punctuation.preserve()""" 90 | matches = list(re.finditer(self.puncs_regular_exp, text)) 91 | if not matches: 92 | return [text], [] 93 | # the text is only punctuations 94 | if len(matches) == 1 and matches[0].group() == text: 95 | return [], [_PUNC_IDX(text, PuncPosition.ALONE)] 96 | # build a punctuation map to be used later to restore punctuations 97 | puncs = [] 98 | for match in matches: 99 | position = PuncPosition.MIDDLE 100 | if match == matches[0] and text.startswith(match.group()): 101 | position = PuncPosition.BEGIN 102 | elif match == matches[-1] and text.endswith(match.group()): 103 | position = PuncPosition.END 104 | puncs.append(_PUNC_IDX(match.group(), position)) 105 | # convert str text to a List[str], each item is separated by a punctuation 106 | splitted_text = [] 107 | for idx, punc in enumerate(puncs): 108 | split = text.split(punc.punc) 109 | prefix, suffix = split[0], punc.punc.join(split[1:]) 110 | splitted_text.append(prefix) 111 | # if the text does not end with a punctuation, add it to the last item 112 | if idx == len(puncs) - 1 and len(suffix) > 0: 113 | splitted_text.append(suffix) 114 | text = suffix 115 | while splitted_text[0] == '': 116 | splitted_text = splitted_text[1:] 117 | return splitted_text, puncs 118 | 119 | @classmethod 120 | def restore(cls, text, puncs): 121 | """Restore punctuation in a text. 122 | 123 | Args: 124 | text (str): The text to be processed. 125 | puncs (List[str]): The list of punctuations map to be used for restoring. 126 | 127 | Examples :: 128 | 129 | ['This is', 'example'], ['.', '!'] -> "This is. example!" 130 | 131 | """ 132 | return cls._restore(text, puncs, 0) 133 | 134 | @classmethod 135 | def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements 136 | """Auxiliary method for Punctuation.restore()""" 137 | if not puncs: 138 | return text 139 | 140 | # nothing have been phonemized, returns the puncs alone 141 | if not text: 142 | return ["".join(m.punc for m in puncs)] 143 | 144 | current = puncs[0] 145 | 146 | if current.position == PuncPosition.BEGIN: 147 | return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num) 148 | 149 | if current.position == PuncPosition.END: 150 | return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1) 151 | 152 | if current.position == PuncPosition.ALONE: 153 | return [current.mark] + cls._restore(text, puncs[1:], num + 1) 154 | 155 | # POSITION == MIDDLE 156 | if len(text) == 1: # pragma: nocover 157 | # a corner case where the final part of an intermediate 158 | # mark (I) has not been phonemized 159 | return cls._restore([text[0] + current.punc], puncs[1:], num) 160 | 161 | return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) 162 | 163 | 164 | # if __name__ == "__main__": 165 | # punc = Punctuation() 166 | # text = "This is. This is, example!" 
167 | 168 | # print(punc.strip(text)) 169 | 170 | # split_text, puncs = punc.strip_to_restore(text) 171 | # print(split_text, " ---- ", puncs) 172 | 173 | # restored_text = punc.restore(split_text, puncs) 174 | # print(restored_text) -------------------------------------------------------------------------------- /melo/text/es_phonemizer/spanish_symbols.txt: -------------------------------------------------------------------------------- 1 | dˌaβˈiðkopeɾfjl unθsbmtʃwɛxɪŋʊɣɡrɲʝʎː -------------------------------------------------------------------------------- /melo/text/es_phonemizer/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ImportError", 10 | "evalue": "attempted relative import with no known parent package", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 15 | "\u001b[1;32m/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb Cell 1\u001b[0m line \u001b[0;36m5\n\u001b[1;32m 3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\u001b[39m,\u001b[39m \u001b[39msys\u001b[39;00m\n\u001b[1;32m 4\u001b[0m sys\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mappend(\u001b[39m'\u001b[39m\u001b[39m/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mes_to_ipa\u001b[39;00m \u001b[39mimport\u001b[39;00m es2ipa\n\u001b[1;32m 9\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39msplit_sentences_en\u001b[39m(text, min_len\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m):\n\u001b[1;32m 10\u001b[0m \u001b[39m# 将文本中的换行符、空格和制表符替换为空格\u001b[39;00m\n\u001b[1;32m 11\u001b[0m text \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\u001b[39m'\u001b[39m\u001b[39m[\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m ]+\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m \u001b[39m\u001b[39m'\u001b[39m, text)\n", 16 | "File \u001b[0;32m/data/workspace/Bert-VITS2/text/es_phonemizer/es_to_ipa.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mcleaner\u001b[39;00m \u001b[39mimport\u001b[39;00m spanish_cleaners\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mgruut_wrapper\u001b[39;00m \u001b[39mimport\u001b[39;00m Gruut\n\u001b[1;32m 4\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mes2ipa\u001b[39m(text):\n", 17 | "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "import re\n", 23 | "import os\n", 24 | "import os, sys\n", 25 | "sys.path.append('/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/')\n", 26 | "from es_to_ipa import es2ipa\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "def split_sentences_en(text, min_len=10):\n", 31 | " # 将文本中的换行符、空格和制表符替换为空格\n", 32 | " text = re.sub('[\\n\\t ]+', ' ', text)\n", 33 | " # 在标点符号后添加一个空格\n", 34 | " text = re.sub('([¿—¡])', r'\\1 $#!', text)\n", 35 | " # 分隔句子并去除前后空格\n", 36 | " \n", 37 | " sentences = [s.strip() for s in text.split(' $#!')]\n", 38 | " if len(sentences[-1]) == 0: del sentences[-1]\n", 39 | "\n", 40 | " new_sentences = []\n", 41 | " new_sent = []\n", 42 | " for ind, sent in 
enumerate(sentences):\n", 43 | " if sent in ['¿', '—', '¡']:\n", 44 | " new_sent.append(sent)\n", 45 | " else:\n", 46 | " new_sent.append(es2ipa(sent))\n", 47 | " \n", 48 | " \n", 49 | " new_sentences = ''.join(new_sent)\n", 50 | "\n", 51 | " return new_sentences" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "'—¿aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'" 63 | ] 64 | }, 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "split_sentences_en('—¿Habéis estado casada alguna vez?')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "'aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "es2ipa('—¿Habéis estado casada alguna vez?')" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "base", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.8.18" 119 | }, 120 | "orig_nbformat": 4 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/fr_phonemizer/__init__.py -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import List, Tuple 3 | 4 | from .punctuation import Punctuation 5 | 6 | 7 | class BasePhonemizer(abc.ABC): 8 | """Base phonemizer class 9 | 10 | Phonemization follows the following steps: 11 | 1. Preprocessing: 12 | - remove empty lines 13 | - remove punctuation 14 | - keep track of punctuation marks 15 | 16 | 2. Phonemization: 17 | - convert text to phonemes 18 | 19 | 3. Postprocessing: 20 | - join phonemes 21 | - restore punctuation marks 22 | 23 | Args: 24 | language (str): 25 | Language used by the phonemizer. 26 | 27 | punctuations (List[str]): 28 | List of punctuation marks to be preserved. 29 | 30 | keep_puncs (bool): 31 | Whether to preserve punctuation marks or not. 
32 | """ 33 | 34 | def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): 35 | # ensure the backend is installed on the system 36 | if not self.is_available(): 37 | raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover 38 | 39 | # ensure the backend support the requested language 40 | self._language = self._init_language(language) 41 | 42 | # setup punctuation processing 43 | self._keep_puncs = keep_puncs 44 | self._punctuator = Punctuation(punctuations) 45 | 46 | def _init_language(self, language): 47 | """Language initialization 48 | 49 | This method may be overloaded in child classes (see Segments backend) 50 | 51 | """ 52 | if not self.is_supported_language(language): 53 | raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") 54 | return language 55 | 56 | @property 57 | def language(self): 58 | """The language code configured to be used for phonemization""" 59 | return self._language 60 | 61 | @staticmethod 62 | @abc.abstractmethod 63 | def name(): 64 | """The name of the backend""" 65 | ... 66 | 67 | @classmethod 68 | @abc.abstractmethod 69 | def is_available(cls): 70 | """Returns True if the backend is installed, False otherwise""" 71 | ... 72 | 73 | @classmethod 74 | @abc.abstractmethod 75 | def version(cls): 76 | """Return the backend version as a tuple (major, minor, patch)""" 77 | ... 78 | 79 | @staticmethod 80 | @abc.abstractmethod 81 | def supported_languages(): 82 | """Return a dict of language codes -> name supported by the backend""" 83 | ... 84 | 85 | def is_supported_language(self, language): 86 | """Returns True if `language` is supported by the backend""" 87 | return language in self.supported_languages() 88 | 89 | @abc.abstractmethod 90 | def _phonemize(self, text, separator): 91 | """The main phonemization method""" 92 | 93 | def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: 94 | """Preprocess the text before phonemization 95 | 96 | 1. remove spaces 97 | 2. remove punctuation 98 | 99 | Override this if you need a different behaviour 100 | """ 101 | text = text.strip() 102 | if self._keep_puncs: 103 | # a tuple (text, punctuation marks) 104 | return self._punctuator.strip_to_restore(text) 105 | return [self._punctuator.strip(text)], [] 106 | 107 | def _phonemize_postprocess(self, phonemized, punctuations) -> str: 108 | """Postprocess the raw phonemized output 109 | 110 | Override this if you need a different behaviour 111 | """ 112 | if self._keep_puncs: 113 | return self._punctuator.restore(phonemized, punctuations)[0] 114 | return phonemized[0] 115 | 116 | def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument 117 | """Returns the `text` phonemized for the given language 118 | 119 | Args: 120 | text (str): 121 | Text to be phonemized. 122 | 123 | separator (str): 124 | string separator used between phonemes. Default to '_'. 
125 | 126 | Returns: 127 | (str): Phonemized text 128 | """ 129 | text, punctuations = self._phonemize_preprocess(text) 130 | phonemized = [] 131 | for t in text: 132 | p = self._phonemize(t, separator) 133 | phonemized.append(p) 134 | phonemized = self._phonemize_postprocess(phonemized, punctuations) 135 | return phonemized 136 | 137 | def print_logs(self, level: int = 0): 138 | indent = "\t" * level 139 | print(f"{indent}| > phoneme language: {self.language}") 140 | print(f"{indent}| > phoneme backend: {self.name()}") -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/cleaner.py: -------------------------------------------------------------------------------- 1 | """Set of default text cleaners""" 2 | # TODO: pick the cleaner for languages dynamically 3 | 4 | import re 5 | from .french_abbreviations import abbreviations_fr 6 | 7 | # Regular expression matching whitespace: 8 | _whitespace_re = re.compile(r"\s+") 9 | 10 | 11 | rep_map = { 12 | ":": ",", 13 | ";": ",", 14 | ",": ",", 15 | "。": ".", 16 | "!": "!", 17 | "?": "?", 18 | "\n": ".", 19 | "·": ",", 20 | "、": ",", 21 | "...": ".", 22 | "…": ".", 23 | "$": ".", 24 | "“": "", 25 | "”": "", 26 | "‘": "", 27 | "’": "", 28 | "(": "", 29 | ")": "", 30 | "(": "", 31 | ")": "", 32 | "《": "", 33 | "》": "", 34 | "【": "", 35 | "】": "", 36 | "[": "", 37 | "]": "", 38 | "—": "", 39 | "~": "-", 40 | "~": "-", 41 | "「": "", 42 | "」": "", 43 | "¿" : "", 44 | "¡" : "" 45 | } 46 | 47 | 48 | def replace_punctuation(text): 49 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 50 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 51 | return replaced_text 52 | 53 | def expand_abbreviations(text, lang="fr"): 54 | if lang == "fr": 55 | _abbreviations = abbreviations_fr 56 | for regex, replacement in _abbreviations: 57 | text = re.sub(regex, replacement, text) 58 | return text 59 | 60 | 61 | def lowercase(text): 62 | return text.lower() 63 | 64 | 65 | def collapse_whitespace(text): 66 | return re.sub(_whitespace_re, " ", text).strip() 67 | 68 | def remove_punctuation_at_begin(text): 69 | return re.sub(r'^[,.!?]+', '', text) 70 | 71 | def remove_aux_symbols(text): 72 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text) 73 | return text 74 | 75 | 76 | def replace_symbols(text, lang="en"): 77 | """Replace symbols based on the lenguage tag. 78 | 79 | Args: 80 | text: 81 | Input text. 82 | lang: 83 | Lenguage identifier. ex: "en", "fr", "pt", "ca". 84 | 85 | Returns: 86 | The modified text 87 | example: 88 | input args: 89 | text: "si l'avi cau, diguem-ho" 90 | lang: "ca" 91 | Output: 92 | text: "si lavi cau, diguemho" 93 | """ 94 | text = text.replace(";", ",") 95 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "") 96 | text = text.replace(":", ",") 97 | if lang == "en": 98 | text = text.replace("&", " and ") 99 | elif lang == "fr": 100 | text = text.replace("&", " et ") 101 | elif lang == "pt": 102 | text = text.replace("&", " e ") 103 | elif lang == "ca": 104 | text = text.replace("&", " i ") 105 | text = text.replace("'", "") 106 | elif lang== "es": 107 | text=text.replace("&","y") 108 | text = text.replace("'", "") 109 | return text 110 | 111 | def french_cleaners(text): 112 | """Pipeline for French text. 
There is no need to expand numbers, phonemizer already does that""" 113 | text = expand_abbreviations(text, lang="fr") 114 | # text = lowercase(text) # as we use the cased bert 115 | text = replace_punctuation(text) 116 | text = replace_symbols(text, lang="fr") 117 | text = remove_aux_symbols(text) 118 | text = remove_punctuation_at_begin(text) 119 | text = collapse_whitespace(text) 120 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text) 121 | return text 122 | 123 | -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/en_symbols.json: -------------------------------------------------------------------------------- 1 | {"symbols": [ 2 | "_", 3 | ",", 4 | ".", 5 | "!", 6 | "?", 7 | "-", 8 | "~", 9 | "\u2026", 10 | "N", 11 | "Q", 12 | "a", 13 | "b", 14 | "d", 15 | "e", 16 | "f", 17 | "g", 18 | "h", 19 | "i", 20 | "j", 21 | "k", 22 | "l", 23 | "m", 24 | "n", 25 | "o", 26 | "p", 27 | "s", 28 | "t", 29 | "u", 30 | "v", 31 | "w", 32 | "x", 33 | "y", 34 | "z", 35 | "\u0251", 36 | "\u00e6", 37 | "\u0283", 38 | "\u0291", 39 | "\u00e7", 40 | "\u026f", 41 | "\u026a", 42 | "\u0254", 43 | "\u025b", 44 | "\u0279", 45 | "\u00f0", 46 | "\u0259", 47 | "\u026b", 48 | "\u0265", 49 | "\u0278", 50 | "\u028a", 51 | "\u027e", 52 | "\u0292", 53 | "\u03b8", 54 | "\u03b2", 55 | "\u014b", 56 | "\u0266", 57 | "\u207c", 58 | "\u02b0", 59 | "`", 60 | "^", 61 | "#", 62 | "*", 63 | "=", 64 | "\u02c8", 65 | "\u02cc", 66 | "\u2192", 67 | "\u2193", 68 | "\u2191", 69 | " ", 70 | "ɣ", 71 | "ɡ", 72 | "r", 73 | "ɲ", 74 | "ʝ", 75 | "ʎ", 76 | "ː" 77 | ] 78 | } -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/fr_symbols.json: -------------------------------------------------------------------------------- 1 | { 2 | "symbols": [ 3 | "_", 4 | ",", 5 | ".", 6 | "!", 7 | "?", 8 | "-", 9 | "~", 10 | "\u2026", 11 | "N", 12 | "Q", 13 | "a", 14 | "b", 15 | "d", 16 | "e", 17 | "f", 18 | "g", 19 | "h", 20 | "i", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "o", 27 | "p", 28 | "s", 29 | "t", 30 | "u", 31 | "v", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "\u0251", 37 | "\u00e6", 38 | "\u0283", 39 | "\u0291", 40 | "\u00e7", 41 | "\u026f", 42 | "\u026a", 43 | "\u0254", 44 | "\u025b", 45 | "\u0279", 46 | "\u00f0", 47 | "\u0259", 48 | "\u026b", 49 | "\u0265", 50 | "\u0278", 51 | "\u028a", 52 | "\u027e", 53 | "\u0292", 54 | "\u03b8", 55 | "\u03b2", 56 | "\u014b", 57 | "\u0266", 58 | "\u207c", 59 | "\u02b0", 60 | "`", 61 | "^", 62 | "#", 63 | "*", 64 | "=", 65 | "\u02c8", 66 | "\u02cc", 67 | "\u2192", 68 | "\u2193", 69 | "\u2191", 70 | " ", 71 | "\u0263", 72 | "\u0261", 73 | "r", 74 | "\u0272", 75 | "\u029d", 76 | "\u028e", 77 | "\u02d0", 78 | 79 | "\u0303", 80 | "\u0153", 81 | "\u00f8", 82 | "\u0281", 83 | "\u0252", 84 | "\u028c", 85 | "\u2014", 86 | "\u025c", 87 | "\u0250" 88 | ] 89 | } -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/fr_to_ipa.py: -------------------------------------------------------------------------------- 1 | from .cleaner import french_cleaners 2 | from .gruut_wrapper import Gruut 3 | 4 | 5 | def remove_consecutive_t(input_str): 6 | result = [] 7 | count = 0 8 | 9 | for char in input_str: 10 | if char == 't': 11 | count += 1 12 | else: 13 | if count < 3: 14 | result.extend(['t'] * count) 15 | count = 0 16 | result.append(char) 17 | 18 | if count < 3: 19 | result.extend(['t'] * count) 20 | 21 | return ''.join(result) 22 | 23 | def fr2ipa(text): 24 | 
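    # Descriptive note: fr2ipa builds the gruut wrapper with espeak lexicons,
    # stress and punctuation kept, phonemizes with an empty separator, and
    # then drops any run of three or more consecutive 't' characters via
    # remove_consecutive_t() before returning the IPA string.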
e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 25 | # text = french_cleaners(text) 26 | phonemes = e.phonemize(text, separator="") 27 | # print(phonemes) 28 | phonemes = remove_consecutive_t(phonemes) 29 | # print(phonemes) 30 | return phonemes -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/french_abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in french: 4 | abbreviations_fr = [ 5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("M", "monsieur"), 8 | ("Mlle", "mademoiselle"), 9 | ("Mlles", "mesdemoiselles"), 10 | ("Mme", "Madame"), 11 | ("Mmes", "Mesdames"), 12 | ("N.B", "nota bene"), 13 | ("M", "monsieur"), 14 | ("p.c.q", "parce que"), 15 | ("Pr", "professeur"), 16 | ("qqch", "quelque chose"), 17 | ("rdv", "rendez-vous"), 18 | ("max", "maximum"), 19 | ("min", "minimum"), 20 | ("no", "numéro"), 21 | ("adr", "adresse"), 22 | ("dr", "docteur"), 23 | ("st", "saint"), 24 | ("co", "companie"), 25 | ("jr", "junior"), 26 | ("sgt", "sergent"), 27 | ("capt", "capitain"), 28 | ("col", "colonel"), 29 | ("av", "avenue"), 30 | ("av. J.-C", "avant Jésus-Christ"), 31 | ("apr. J.-C", "après Jésus-Christ"), 32 | ("art", "article"), 33 | ("boul", "boulevard"), 34 | ("c.-à-d", "c’est-à-dire"), 35 | ("etc", "et cetera"), 36 | ("ex", "exemple"), 37 | ("excl", "exclusivement"), 38 | ("boul", "boulevard"), 39 | ] 40 | ] + [ 41 | (re.compile("\\b%s" % x[0]), x[1]) 42 | for x in [ 43 | ("Mlle", "mademoiselle"), 44 | ("Mlles", "mesdemoiselles"), 45 | ("Mme", "Madame"), 46 | ("Mmes", "Mesdames"), 47 | ] 48 | ] -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/french_symbols.txt: -------------------------------------------------------------------------------- 1 | _,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɣɡrɲʝʎː̃œøʁɒʌ—ɜɐ -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/gruut_wrapper.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from typing import List 3 | 4 | import gruut 5 | from gruut_ipa import IPA # pip install gruut_ipa 6 | 7 | from .base import BasePhonemizer 8 | from .punctuation import Punctuation 9 | 10 | # Table for str.translate to fix gruut/TTS phoneme mismatch 11 | GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") 12 | 13 | 14 | class Gruut(BasePhonemizer): 15 | """Gruut wrapper for G2P 16 | 17 | Args: 18 | language (str): 19 | Valid language code for the used backend. 20 | 21 | punctuations (str): 22 | Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`. 23 | 24 | keep_puncs (bool): 25 | If true, keep the punctuations after phonemization. Defaults to True. 26 | 27 | use_espeak_phonemes (bool): 28 | If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False. 29 | 30 | keep_stress (bool): 31 | If true, keep the stress characters after phonemization. Defaults to False. 32 | 33 | Example: 34 | 35 | >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut 36 | >>> phonemizer = Gruut('en-us') 37 | >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|") 38 | 'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?' 
39 | """ 40 | 41 | def __init__( 42 | self, 43 | language: str, 44 | punctuations=Punctuation.default_puncs(), 45 | keep_puncs=True, 46 | use_espeak_phonemes=False, 47 | keep_stress=False, 48 | ): 49 | super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) 50 | self.use_espeak_phonemes = use_espeak_phonemes 51 | self.keep_stress = keep_stress 52 | 53 | @staticmethod 54 | def name(): 55 | return "gruut" 56 | 57 | def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument 58 | """Convert input text to phonemes. 59 | 60 | Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters 61 | that constitude a single sound. 62 | 63 | It doesn't affect 🐸TTS since it individually converts each character to token IDs. 64 | 65 | Examples:: 66 | "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ` 67 | 68 | Args: 69 | text (str): 70 | Text to be converted to phonemes. 71 | 72 | tie (bool, optional) : When True use a '͡' character between 73 | consecutive characters of a single phoneme. Else separate phoneme 74 | with '_'. This option requires espeak>=1.49. Default to False. 75 | """ 76 | ph_list = [] 77 | for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes): 78 | for word in sentence: 79 | if word.is_break: 80 | # Use actual character for break phoneme (e.g., comma) 81 | if ph_list: 82 | # Join with previous word 83 | ph_list[-1].append(word.text) 84 | else: 85 | # First word is punctuation 86 | ph_list.append([word.text]) 87 | elif word.phonemes: 88 | # Add phonemes for word 89 | word_phonemes = [] 90 | 91 | for word_phoneme in word.phonemes: 92 | if not self.keep_stress: 93 | # Remove primary/secondary stress 94 | word_phoneme = IPA.without_stress(word_phoneme) 95 | 96 | word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE) 97 | 98 | if word_phoneme: 99 | # Flatten phonemes 100 | word_phonemes.extend(word_phoneme) 101 | 102 | if word_phonemes: 103 | ph_list.append(word_phonemes) 104 | 105 | ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list] 106 | ph = f"{separator} ".join(ph_words) 107 | return ph 108 | 109 | def _phonemize(self, text, separator): 110 | return self.phonemize_gruut(text, separator, tie=False) 111 | 112 | def is_supported_language(self, language): 113 | """Returns True if `language` is supported by the backend""" 114 | return gruut.is_language_supported(language) 115 | 116 | @staticmethod 117 | def supported_languages() -> List: 118 | """Get a dictionary of supported languages. 119 | 120 | Returns: 121 | List: List of language codes. 122 | """ 123 | return list(gruut.get_supported_languages()) 124 | 125 | def version(self): 126 | """Get the version of the used backend. 127 | 128 | Returns: 129 | str: Version of the used backend. 
130 | """ 131 | return gruut.__version__ 132 | 133 | @classmethod 134 | def is_available(cls): 135 | """Return true if ESpeak is available else false""" 136 | return importlib.util.find_spec("gruut") is not None 137 | 138 | 139 | if __name__ == "__main__": 140 | from cleaner import french_cleaners 141 | import json 142 | 143 | e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 144 | symbols = [ # en + sp 145 | "_", 146 | ",", 147 | ".", 148 | "!", 149 | "?", 150 | "-", 151 | "~", 152 | "\u2026", 153 | "N", 154 | "Q", 155 | "a", 156 | "b", 157 | "d", 158 | "e", 159 | "f", 160 | "g", 161 | "h", 162 | "i", 163 | "j", 164 | "k", 165 | "l", 166 | "m", 167 | "n", 168 | "o", 169 | "p", 170 | "s", 171 | "t", 172 | "u", 173 | "v", 174 | "w", 175 | "x", 176 | "y", 177 | "z", 178 | "\u0251", 179 | "\u00e6", 180 | "\u0283", 181 | "\u0291", 182 | "\u00e7", 183 | "\u026f", 184 | "\u026a", 185 | "\u0254", 186 | "\u025b", 187 | "\u0279", 188 | "\u00f0", 189 | "\u0259", 190 | "\u026b", 191 | "\u0265", 192 | "\u0278", 193 | "\u028a", 194 | "\u027e", 195 | "\u0292", 196 | "\u03b8", 197 | "\u03b2", 198 | "\u014b", 199 | "\u0266", 200 | "\u207c", 201 | "\u02b0", 202 | "`", 203 | "^", 204 | "#", 205 | "*", 206 | "=", 207 | "\u02c8", 208 | "\u02cc", 209 | "\u2192", 210 | "\u2193", 211 | "\u2191", 212 | " ", 213 | "ɣ", 214 | "ɡ", 215 | "r", 216 | "ɲ", 217 | "ʝ", 218 | "ʎ", 219 | "ː" 220 | ] 221 | with open('/home/xumin/workspace/VITS-Training-Multiling/230715_fr/metadata.txt', 'r') as f: 222 | lines = f.readlines() 223 | 224 | 225 | used_sym = [] 226 | not_existed_sym = [] 227 | phonemes = [] 228 | 229 | for line in lines: 230 | text = line.split('|')[-1].strip() 231 | text = french_cleaners(text) 232 | ipa = e.phonemize(text, separator="") 233 | phonemes.append(ipa) 234 | for s in ipa: 235 | if s not in symbols: 236 | if s not in not_existed_sym: 237 | print(f'not_existed char: {s}') 238 | not_existed_sym.append(s) 239 | else: 240 | if s not in used_sym: 241 | # print(f'used char: {s}') 242 | used_sym.append(s) 243 | 244 | print(used_sym) 245 | print(not_existed_sym) 246 | 247 | 248 | with open('./text/fr_phonemizer/french_symbols.txt', 'w') as g: 249 | g.writelines(symbols + not_existed_sym) 250 | 251 | with open('./text/fr_phonemizer/example_ipa.txt', 'w') as g: 252 | g.writelines(phonemes) 253 | 254 | data = {'symbols': symbols + not_existed_sym} 255 | 256 | with open('./text/fr_phonemizer/fr_symbols.json', 'w') as f: 257 | json.dump(data, f, indent=4) 258 | 259 | -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/punctuation.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import re 3 | from enum import Enum 4 | 5 | import six 6 | 7 | _DEF_PUNCS = ';:,.!?¡¿—…"«»“”' 8 | 9 | _PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"]) 10 | 11 | 12 | class PuncPosition(Enum): 13 | """Enum for the punctuations positions""" 14 | 15 | BEGIN = 0 16 | END = 1 17 | MIDDLE = 2 18 | ALONE = 3 19 | 20 | 21 | class Punctuation: 22 | """Handle punctuations in text. 23 | 24 | Just strip punctuations from text or strip and restore them later. 25 | 26 | Args: 27 | puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`. 28 | 29 | Example: 30 | >>> punc = Punctuation() 31 | >>> punc.strip("This is. example !") 32 | 'This is example' 33 | 34 | >>> text_striped, punc_map = punc.strip_to_restore("This is. 
example !") 35 | >>> ' '.join(text_striped) 36 | 'This is example' 37 | 38 | >>> text_restored = punc.restore(text_striped, punc_map) 39 | >>> text_restored[0] 40 | 'This is. example !' 41 | """ 42 | 43 | def __init__(self, puncs: str = _DEF_PUNCS): 44 | self.puncs = puncs 45 | 46 | @staticmethod 47 | def default_puncs(): 48 | """Return default set of punctuations.""" 49 | return _DEF_PUNCS 50 | 51 | @property 52 | def puncs(self): 53 | return self._puncs 54 | 55 | @puncs.setter 56 | def puncs(self, value): 57 | if not isinstance(value, six.string_types): 58 | raise ValueError("[!] Punctuations must be of type str.") 59 | self._puncs = "".join(list(dict.fromkeys(list(value)))) # remove duplicates without changing the oreder 60 | self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+") 61 | 62 | def strip(self, text): 63 | """Remove all the punctuations by replacing with `space`. 64 | 65 | Args: 66 | text (str): The text to be processed. 67 | 68 | Example:: 69 | 70 | "This is. example !" -> "This is example " 71 | """ 72 | return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip() 73 | 74 | def strip_to_restore(self, text): 75 | """Remove punctuations from text to restore them later. 76 | 77 | Args: 78 | text (str): The text to be processed. 79 | 80 | Examples :: 81 | 82 | "This is. example !" -> [["This is", "example"], [".", "!"]] 83 | 84 | """ 85 | text, puncs = self._strip_to_restore(text) 86 | return text, puncs 87 | 88 | def _strip_to_restore(self, text): 89 | """Auxiliary method for Punctuation.preserve()""" 90 | matches = list(re.finditer(self.puncs_regular_exp, text)) 91 | if not matches: 92 | return [text], [] 93 | # the text is only punctuations 94 | if len(matches) == 1 and matches[0].group() == text: 95 | return [], [_PUNC_IDX(text, PuncPosition.ALONE)] 96 | # build a punctuation map to be used later to restore punctuations 97 | puncs = [] 98 | for match in matches: 99 | position = PuncPosition.MIDDLE 100 | if match == matches[0] and text.startswith(match.group()): 101 | position = PuncPosition.BEGIN 102 | elif match == matches[-1] and text.endswith(match.group()): 103 | position = PuncPosition.END 104 | puncs.append(_PUNC_IDX(match.group(), position)) 105 | # convert str text to a List[str], each item is separated by a punctuation 106 | splitted_text = [] 107 | for idx, punc in enumerate(puncs): 108 | split = text.split(punc.punc) 109 | prefix, suffix = split[0], punc.punc.join(split[1:]) 110 | splitted_text.append(prefix) 111 | # if the text does not end with a punctuation, add it to the last item 112 | if idx == len(puncs) - 1 and len(suffix) > 0: 113 | splitted_text.append(suffix) 114 | text = suffix 115 | return splitted_text, puncs 116 | 117 | @classmethod 118 | def restore(cls, text, puncs): 119 | """Restore punctuation in a text. 120 | 121 | Args: 122 | text (str): The text to be processed. 123 | puncs (List[str]): The list of punctuations map to be used for restoring. 124 | 125 | Examples :: 126 | 127 | ['This is', 'example'], ['.', '!'] -> "This is. example!" 
128 | 129 | """ 130 | return cls._restore(text, puncs, 0) 131 | 132 | @classmethod 133 | def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements 134 | """Auxiliary method for Punctuation.restore()""" 135 | if not puncs: 136 | return text 137 | 138 | # nothing have been phonemized, returns the puncs alone 139 | if not text: 140 | return ["".join(m.punc for m in puncs)] 141 | 142 | current = puncs[0] 143 | 144 | if current.position == PuncPosition.BEGIN: 145 | return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num) 146 | 147 | if current.position == PuncPosition.END: 148 | return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1) 149 | 150 | if current.position == PuncPosition.ALONE: 151 | return [current.mark] + cls._restore(text, puncs[1:], num + 1) 152 | 153 | # POSITION == MIDDLE 154 | if len(text) == 1: # pragma: nocover 155 | # a corner case where the final part of an intermediate 156 | # mark (I) has not been phonemized 157 | return cls._restore([text[0] + current.punc], puncs[1:], num) 158 | 159 | return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) 160 | 161 | 162 | # if __name__ == "__main__": 163 | # punc = Punctuation() 164 | # text = "This is. This is, example!" 165 | 166 | # print(punc.strip(text)) 167 | 168 | # split_text, puncs = punc.strip_to_restore(text) 169 | # print(split_text, " ---- ", puncs) 170 | 171 | # restored_text = punc.restore(split_text, puncs) 172 | # print(restored_text) -------------------------------------------------------------------------------- /melo/text/french.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | 5 | from . import symbols 6 | from .fr_phonemizer import cleaner as fr_cleaner 7 | from .fr_phonemizer import fr_to_ipa 8 | from transformers import AutoTokenizer 9 | 10 | 11 | def distribute_phone(n_phone, n_word): 12 | phones_per_word = [0] * n_word 13 | for task in range(n_phone): 14 | min_tasks = min(phones_per_word) 15 | min_index = phones_per_word.index(min_tasks) 16 | phones_per_word[min_index] += 1 17 | return phones_per_word 18 | 19 | def text_normalize(text): 20 | text = fr_cleaner.french_cleaners(text) 21 | return text 22 | 23 | model_id = 'dbmdz/bert-base-french-europeana-cased' 24 | tokenizer = AutoTokenizer.from_pretrained(model_id) 25 | 26 | def g2p(text, pad_start_end=True, tokenized=None): 27 | if tokenized is None: 28 | tokenized = tokenizer.tokenize(text) 29 | # import pdb; pdb.set_trace() 30 | phs = [] 31 | ph_groups = [] 32 | for t in tokenized: 33 | if not t.startswith("#"): 34 | ph_groups.append([t]) 35 | else: 36 | ph_groups[-1].append(t.replace("#", "")) 37 | 38 | phones = [] 39 | tones = [] 40 | word2ph = [] 41 | # print(ph_groups) 42 | for group in ph_groups: 43 | w = "".join(group) 44 | phone_len = 0 45 | word_len = len(group) 46 | if w == '[UNK]': 47 | phone_list = ['UNK'] 48 | else: 49 | phone_list = list(filter(lambda p: p != " ", fr_to_ipa.fr2ipa(w))) 50 | 51 | for ph in phone_list: 52 | phones.append(ph) 53 | tones.append(0) 54 | phone_len += 1 55 | aaa = distribute_phone(phone_len, word_len) 56 | word2ph += aaa 57 | # print(phone_list, aaa) 58 | # print('=' * 10) 59 | 60 | if pad_start_end: 61 | phones = ["_"] + phones + ["_"] 62 | tones = [0] + tones + [0] 63 | word2ph = [1] + word2ph + [1] 64 | return phones, tones, word2ph 65 | 66 | def get_bert_feature(text, word2ph, device=None): 67 | from text import french_bert 68 | return 
french_bert.get_bert_feature(text, word2ph, device=device) 69 | 70 | if __name__ == "__main__": 71 | ori_text = 'Ce service gratuit est“”"" 【disponible》 en chinois 【simplifié] et autres 123' 72 | # ori_text = "Ils essayaient vainement de faire comprendre à ma mère qu'avec les cent mille francs que m'avait laissé mon père," 73 | # print(ori_text) 74 | text = text_normalize(ori_text) 75 | print(text) 76 | phoneme = fr_to_ipa.fr2ipa(text) 77 | print(phoneme) 78 | 79 | 80 | from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer 81 | from text.cleaner_multiling import unicleaners 82 | 83 | def text_normalize(text): 84 | text = unicleaners(text, cased=True, lang='fr') 85 | return text 86 | 87 | # print(ori_text) 88 | text = text_normalize(ori_text) 89 | print(text) 90 | phonemizer = MultiPhonemizer({"fr-fr": "espeak"}) 91 | # phonemizer.lang_to_phonemizer['fr'].keep_stress = True 92 | # phonemizer.lang_to_phonemizer['fr'].use_espeak_phonemes = True 93 | phoneme = phonemizer.phonemize(text, separator="", language='fr-fr') 94 | print(phoneme) -------------------------------------------------------------------------------- /melo/text/french_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | model_id = 'dbmdz/bert-base-french-europeana-cased' 6 | tokenizer = AutoTokenizer.from_pretrained(model_id) 7 | model = None 8 | 9 | def get_bert_feature(text, word2ph, device=None): 10 | global model 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if model is None: 20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 21 | device 22 | ) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = model(**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert inputs["input_ids"].shape[-1] == len(word2ph) 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | -------------------------------------------------------------------------------- /melo/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | 6 | models = {} 7 | tokenizers = {} 8 | def get_bert_feature(text, word2ph, device=None, model_id='tohoku-nlp/bert-base-japanese-v3'): 9 | global model 10 | global tokenizer 11 | 12 | if ( 13 | sys.platform == "darwin" 14 | and torch.backends.mps.is_available() 15 | and device == "cpu" 16 | ): 17 | device = "mps" 18 | if not device: 19 | device = "cuda" 20 | if model_id not in models: 21 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 22 | device 23 | ) 24 | models[model_id] = model 25 | tokenizer = AutoTokenizer.from_pretrained(model_id) 26 | tokenizers[model_id] = tokenizer 27 | else: 28 | model = models[model_id] 29 | tokenizer = tokenizers[model_id] 30 | 31 | 32 | with torch.no_grad(): 33 | inputs = tokenizer(text, return_tensors="pt") 34 | tokenized = 
tokenizer.tokenize(text) 35 | for i in inputs: 36 | inputs[i] = inputs[i].to(device) 37 | res = model(**inputs, output_hidden_states=True) 38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 39 | 40 | assert inputs["input_ids"].shape[-1] == len(word2ph), f"{inputs['input_ids'].shape[-1]}/{len(word2ph)}" 41 | word2phone = word2ph 42 | phone_level_feature = [] 43 | for i in range(len(word2phone)): 44 | repeat_feature = res[i].repeat(word2phone[i], 1) 45 | phone_level_feature.append(repeat_feature) 46 | 47 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 48 | 49 | return phone_level_feature.T 50 | -------------------------------------------------------------------------------- /melo/text/ko_dictionary.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Add the word you want to the dictionary. 3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} 4 | 5 | 6 | english_dictionary = { 7 | "KOREA": "코리아", 8 | "IDOL": "아이돌", 9 | "IT": "아이티", 10 | "IQ": "아이큐", 11 | "UP": "업", 12 | "DOWN": "다운", 13 | "PC": "피씨", 14 | "CCTV": "씨씨티비", 15 | "SNS": "에스엔에스", 16 | "AI": "에이아이", 17 | "CEO": "씨이오", 18 | "A": "에이", 19 | "B": "비", 20 | "C": "씨", 21 | "D": "디", 22 | "E": "이", 23 | "F": "에프", 24 | "G": "지", 25 | "H": "에이치", 26 | "I": "아이", 27 | "J": "제이", 28 | "K": "케이", 29 | "L": "엘", 30 | "M": "엠", 31 | "N": "엔", 32 | "O": "오", 33 | "P": "피", 34 | "Q": "큐", 35 | "R": "알", 36 | "S": "에스", 37 | "T": "티", 38 | "U": "유", 39 | "V": "브이", 40 | "W": "더블유", 41 | "X": "엑스", 42 | "Y": "와이", 43 | "Z": "제트", 44 | } 45 | -------------------------------------------------------------------------------- /melo/text/korean.py: -------------------------------------------------------------------------------- 1 | # Convert Japanese text to phonemes which is 2 | # compatible with Julius https://github.com/julius-speech/segmentation-kit 3 | import re 4 | import unicodedata 5 | 6 | from transformers import AutoTokenizer 7 | 8 | from . import punctuation, symbols 9 | 10 | 11 | from num2words import num2words 12 | from melo.text.ko_dictionary import english_dictionary, etc_dictionary 13 | from anyascii import anyascii 14 | from jamo import hangul_to_jamo 15 | 16 | def normalize(text): 17 | text = text.strip() 18 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text) 19 | text = normalize_with_dictionary(text, etc_dictionary) 20 | text = normalize_english(text) 21 | text = text.lower() 22 | return text 23 | 24 | 25 | def normalize_with_dictionary(text, dic): 26 | if any(key in text for key in dic.keys()): 27 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys())) 28 | return pattern.sub(lambda x: dic[x.group()], text) 29 | return text 30 | 31 | 32 | def normalize_english(text): 33 | def fn(m): 34 | word = m.group() 35 | if word in english_dictionary: 36 | return english_dictionary.get(word) 37 | return word 38 | 39 | text = re.sub("([A-Za-z]+)", fn, text) 40 | return text 41 | 42 | 43 | g2p_kr = None 44 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str: 45 | """ 46 | 47 | The input and output values look the same, but they are different in Unicode. 
48 | 49 | example : 50 | 51 | input = '하늘' (Unicode : \ud558\ub298), (하 + 늘) 52 | output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ) 53 | 54 | """ 55 | global g2p_kr # pylint: disable=global-statement 56 | if g2p_kr is None: 57 | from g2pkk import G2p 58 | 59 | g2p_kr = G2p() 60 | 61 | if character == "english": 62 | from anyascii import anyascii 63 | text = normalize(text) 64 | text = g2p_kr(text) 65 | text = anyascii(text) 66 | return text 67 | 68 | text = normalize(text) 69 | text = g2p_kr(text) 70 | text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ'] 71 | return "".join(text) 72 | 73 | def text_normalize(text): 74 | # res = unicodedata.normalize("NFKC", text) 75 | # res = japanese_convert_numbers_to_words(res) 76 | # # res = "".join([i for i in res if is_japanese_character(i)]) 77 | # res = replace_punctuation(res) 78 | text = normalize(text) 79 | return text 80 | 81 | 82 | def distribute_phone(n_phone, n_word): 83 | phones_per_word = [0] * n_word 84 | for task in range(n_phone): 85 | min_tasks = min(phones_per_word) 86 | min_index = phones_per_word.index(min_tasks) 87 | phones_per_word[min_index] += 1 88 | return phones_per_word 89 | 90 | 91 | 92 | # tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3') 93 | 94 | model_id = 'kykim/bert-kor-base' 95 | tokenizer = AutoTokenizer.from_pretrained(model_id) 96 | 97 | def g2p(norm_text): 98 | tokenized = tokenizer.tokenize(norm_text) 99 | phs = [] 100 | ph_groups = [] 101 | for t in tokenized: 102 | if not t.startswith("#"): 103 | ph_groups.append([t]) 104 | else: 105 | ph_groups[-1].append(t.replace("#", "")) 106 | word2ph = [] 107 | for group in ph_groups: 108 | text = "" 109 | for ch in group: 110 | text += ch 111 | if text == '[UNK]': 112 | phs += ['_'] 113 | word2ph += [1] 114 | continue 115 | elif text in punctuation: 116 | phs += [text] 117 | word2ph += [1] 118 | continue 119 | # import pdb; pdb.set_trace() 120 | # phonemes = japanese_text_to_phonemes(text) 121 | # text = g2p_kr(text) 122 | phonemes = korean_text_to_phonemes(text) 123 | # import pdb; pdb.set_trace() 124 | # # phonemes = [i for i in phonemes if i in symbols] 125 | # for i in phonemes: 126 | # assert i in symbols, (group, norm_text, tokenized, i) 127 | phone_len = len(phonemes) 128 | word_len = len(group) 129 | 130 | aaa = distribute_phone(phone_len, word_len) 131 | assert len(aaa) == word_len 132 | word2ph += aaa 133 | 134 | phs += phonemes 135 | phones = ["_"] + phs + ["_"] 136 | tones = [0 for i in phones] 137 | word2ph = [1] + word2ph + [1] 138 | assert len(word2ph) == len(tokenized) + 2 139 | return phones, tones, word2ph 140 | 141 | def get_bert_feature(text, word2ph, device='cuda'): 142 | from . import japanese_bert 143 | return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id) 144 | 145 | 146 | if __name__ == "__main__": 147 | # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 148 | from text.symbols import symbols 149 | text = "전 제 일의 가치와 폰타인 대중들이 한 일의 의미를 잘 압니다. 
앞으로도 전 제 일에 자부심을 갖고 살아갈 겁니다" 150 | import json 151 | 152 | # genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json')) 153 | genshin_data = json.load(open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json')) 154 | from tqdm import tqdm 155 | new_symbols = [] 156 | for key, item in tqdm(genshin_data.items()): 157 | texts = item.get('voiceContent', '') 158 | if isinstance(texts, list): 159 | texts = ','.join(texts) 160 | if texts is None: 161 | continue 162 | if len(texts) == 0: 163 | continue 164 | 165 | text = text_normalize(text) 166 | phones, tones, word2ph = g2p(text) 167 | bert = get_bert_feature(text, word2ph) 168 | import pdb; pdb.set_trace() 169 | for ph in phones: 170 | if ph not in symbols and ph not in new_symbols: 171 | new_symbols.append(ph) 172 | print('update!, now symbols:') 173 | print(new_symbols) 174 | with open('korean_symbol.txt', 'w') as f: 175 | f.write(f'{new_symbols}') 176 | 177 | 178 | 179 | # if __name__ == '__main__': 180 | # from pykakasi import kakasi 181 | # # Initialize kakasi object 182 | # kakasi = kakasi() 183 | 184 | # # Set options for converting Chinese characters to Katakana 185 | # kakasi.setMode("J", "H") # Chinese to Katakana 186 | # kakasi.setMode("K", "H") # Hiragana to Katakana 187 | 188 | # # Convert Chinese characters to Katakana 189 | # conv = kakasi.getConverter() 190 | # katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?') # Replace with your Chinese text 191 | 192 | # print(katakana_text) # Output: ニーハオセカイ -------------------------------------------------------------------------------- /melo/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 
| hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 
351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /melo/text/spanish.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | 5 | from . import symbols 6 | from .es_phonemizer import cleaner as es_cleaner 7 | from .es_phonemizer import es_to_ipa 8 | from transformers import AutoTokenizer 9 | 10 | 11 | def distribute_phone(n_phone, n_word): 12 | phones_per_word = [0] * n_word 13 | for task in range(n_phone): 14 | min_tasks = min(phones_per_word) 15 | min_index = phones_per_word.index(min_tasks) 16 | phones_per_word[min_index] += 1 17 | return phones_per_word 18 | 19 | def text_normalize(text): 20 | text = es_cleaner.spanish_cleaners(text) 21 | return text 22 | 23 | def post_replace_ph(ph): 24 | rep_map = { 25 | ":": ",", 26 | ";": ",", 27 | ",": ",", 28 | "。": ".", 29 | "!": "!", 30 | "?": "?", 31 | "\n": ".", 32 | "·": ",", 33 | "、": ",", 34 | "...": "…" 35 | } 36 | if ph in rep_map.keys(): 37 | ph = rep_map[ph] 38 | if ph in symbols: 39 | return ph 40 | if ph not in symbols: 41 | ph = "UNK" 42 | return ph 43 | 44 | def refine_ph(phn): 45 | tone = 0 46 | if re.search(r"\d$", phn): 47 | tone = int(phn[-1]) + 1 48 | phn = phn[:-1] 49 | return phn.lower(), tone 50 | 51 | 52 | def refine_syllables(syllables): 53 | tones = [] 54 | phonemes = [] 55 | for phn_list in syllables: 56 | for i in range(len(phn_list)): 57 | phn = phn_list[i] 58 | phn, tone = refine_ph(phn) 59 | phonemes.append(phn) 60 | tones.append(tone) 61 | return phonemes, tones 62 | 63 | 64 | # model_id = 'bert-base-uncased' 65 | model_id = 'dccuchile/bert-base-spanish-wwm-uncased' 66 | tokenizer = AutoTokenizer.from_pretrained(model_id) 67 | 68 | def g2p(text, pad_start_end=True, tokenized=None): 69 | if tokenized is None: 70 | tokenized = tokenizer.tokenize(text) 71 | # import pdb; pdb.set_trace() 72 | phs = [] 73 | ph_groups = [] 74 | for t in tokenized: 75 | if not t.startswith("#"): 76 | ph_groups.append([t]) 77 | else: 78 | ph_groups[-1].append(t.replace("#", "")) 79 | 80 | phones = [] 81 | tones = [] 82 | word2ph = [] 83 | # print(ph_groups) 84 | for group in ph_groups: 85 | w = "".join(group) 86 | phone_len = 0 87 | word_len = len(group) 88 | if w == '[UNK]': 89 | phone_list = ['UNK'] 90 | 
else: 91 | phone_list = list(filter(lambda p: p != " ", es_to_ipa.es2ipa(w))) 92 | 93 | for ph in phone_list: 94 | phones.append(ph) 95 | tones.append(0) 96 | phone_len += 1 97 | aaa = distribute_phone(phone_len, word_len) 98 | word2ph += aaa 99 | # print(phone_list, aaa) 100 | # print('=' * 10) 101 | 102 | if pad_start_end: 103 | phones = ["_"] + phones + ["_"] 104 | tones = [0] + tones + [0] 105 | word2ph = [1] + word2ph + [1] 106 | return phones, tones, word2ph 107 | 108 | def get_bert_feature(text, word2ph, device=None): 109 | from text import spanish_bert 110 | return spanish_bert.get_bert_feature(text, word2ph, device=device) 111 | 112 | if __name__ == "__main__": 113 | text = "en nuestros tiempos estos dos pueblos ilustres empiezan a curarse, gracias sólo a la sana y vigorosa higiene de 1789." 114 | # print(text) 115 | text = text_normalize(text) 116 | print(text) 117 | phones, tones, word2ph = g2p(text) 118 | bert = get_bert_feature(text, word2ph) 119 | print(phones) 120 | print(len(phones), tones, sum(word2ph), bert.shape) 121 | 122 | 123 | -------------------------------------------------------------------------------- /melo/text/spanish_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | model_id = 'dccuchile/bert-base-spanish-wwm-uncased' 6 | tokenizer = AutoTokenizer.from_pretrained(model_id) 7 | model = None 8 | 9 | def get_bert_feature(text, word2ph, device=None): 10 | global model 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if model is None: 20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 21 | device 22 | ) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = model(**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert inputs["input_ids"].shape[-1] == len(word2ph) 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | -------------------------------------------------------------------------------- /melo/text/symbols.py: -------------------------------------------------------------------------------- 1 | # punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | punctuation = ["!", "?", "…", ",", ".", "'", "-", "¿", "¡"] 3 | pu_symbols = punctuation + ["SP", "UNK"] 4 | pad = "_" 5 | 6 | # chinese 7 | zh_symbols = [ 8 | "E", 9 | "En", 10 | "a", 11 | "ai", 12 | "an", 13 | "ang", 14 | "ao", 15 | "b", 16 | "c", 17 | "ch", 18 | "d", 19 | "e", 20 | "ei", 21 | "en", 22 | "eng", 23 | "er", 24 | "f", 25 | "g", 26 | "h", 27 | "i", 28 | "i0", 29 | "ia", 30 | "ian", 31 | "iang", 32 | "iao", 33 | "ie", 34 | "in", 35 | "ing", 36 | "iong", 37 | "ir", 38 | "iu", 39 | "j", 40 | "k", 41 | "l", 42 | "m", 43 | "n", 44 | "o", 45 | "ong", 46 | "ou", 47 | "p", 48 | "q", 49 | "r", 50 | "s", 51 | "sh", 52 | "t", 53 | "u", 54 | "ua", 55 | "uai", 56 | "uan", 57 | "uang", 58 | "ui", 59 | "un", 60 | "uo", 61 | "v", 62 | "van", 63 | "ve", 64 | "vn", 65 | "w", 66 | "x", 67 | "y", 68 | "z", 69 | "zh", 70 | "AA", 
71 | "EE", 72 | "OO", 73 | ] 74 | num_zh_tones = 6 75 | 76 | # japanese 77 | ja_symbols = [ 78 | "N", 79 | "a", 80 | "a:", 81 | "b", 82 | "by", 83 | "ch", 84 | "d", 85 | "dy", 86 | "e", 87 | "e:", 88 | "f", 89 | "g", 90 | "gy", 91 | "h", 92 | "hy", 93 | "i", 94 | "i:", 95 | "j", 96 | "k", 97 | "ky", 98 | "m", 99 | "my", 100 | "n", 101 | "ny", 102 | "o", 103 | "o:", 104 | "p", 105 | "py", 106 | "q", 107 | "r", 108 | "ry", 109 | "s", 110 | "sh", 111 | "t", 112 | "ts", 113 | "ty", 114 | "u", 115 | "u:", 116 | "w", 117 | "y", 118 | "z", 119 | "zy", 120 | ] 121 | num_ja_tones = 1 122 | 123 | # English 124 | en_symbols = [ 125 | "aa", 126 | "ae", 127 | "ah", 128 | "ao", 129 | "aw", 130 | "ay", 131 | "b", 132 | "ch", 133 | "d", 134 | "dh", 135 | "eh", 136 | "er", 137 | "ey", 138 | "f", 139 | "g", 140 | "hh", 141 | "ih", 142 | "iy", 143 | "jh", 144 | "k", 145 | "l", 146 | "m", 147 | "n", 148 | "ng", 149 | "ow", 150 | "oy", 151 | "p", 152 | "r", 153 | "s", 154 | "sh", 155 | "t", 156 | "th", 157 | "uh", 158 | "uw", 159 | "V", 160 | "w", 161 | "y", 162 | "z", 163 | "zh", 164 | ] 165 | num_en_tones = 4 166 | 167 | # Korean 168 | kr_symbols = ['ᄌ', 'ᅥ', 'ᆫ', 'ᅦ', 'ᄋ', 'ᅵ', 'ᄅ', 'ᅴ', 'ᄀ', 'ᅡ', 'ᄎ', 'ᅪ', 'ᄑ', 'ᅩ', 'ᄐ', 'ᄃ', 'ᅢ', 'ᅮ', 'ᆼ', 'ᅳ', 'ᄒ', 'ᄆ', 'ᆯ', 'ᆷ', 'ᄂ', 'ᄇ', 'ᄉ', 'ᆮ', 'ᄁ', 'ᅬ', 'ᅣ', 'ᄄ', 'ᆨ', 'ᄍ', 'ᅧ', 'ᄏ', 'ᆸ', 'ᅭ', '(', 'ᄊ', ')', 'ᅲ', 'ᅨ', 'ᄈ', 'ᅱ', 'ᅯ', 'ᅫ', 'ᅰ', 'ᅤ', '~', '\\', '[', ']', '/', '^', ':', 'ㄸ', '*'] 169 | num_kr_tones = 1 170 | 171 | # Spanish 172 | es_symbols = [ 173 | "N", 174 | "Q", 175 | "a", 176 | "b", 177 | "d", 178 | "e", 179 | "f", 180 | "g", 181 | "h", 182 | "i", 183 | "j", 184 | "k", 185 | "l", 186 | "m", 187 | "n", 188 | "o", 189 | "p", 190 | "s", 191 | "t", 192 | "u", 193 | "v", 194 | "w", 195 | "x", 196 | "y", 197 | "z", 198 | "ɑ", 199 | "æ", 200 | "ʃ", 201 | "ʑ", 202 | "ç", 203 | "ɯ", 204 | "ɪ", 205 | "ɔ", 206 | "ɛ", 207 | "ɹ", 208 | "ð", 209 | "ə", 210 | "ɫ", 211 | "ɥ", 212 | "ɸ", 213 | "ʊ", 214 | "ɾ", 215 | "ʒ", 216 | "θ", 217 | "β", 218 | "ŋ", 219 | "ɦ", 220 | "ɡ", 221 | "r", 222 | "ɲ", 223 | "ʝ", 224 | "ɣ", 225 | "ʎ", 226 | "ˈ", 227 | "ˌ", 228 | "ː" 229 | ] 230 | num_es_tones = 1 231 | 232 | # French 233 | fr_symbols = [ 234 | "\u0303", 235 | "œ", 236 | "ø", 237 | "ʁ", 238 | "ɒ", 239 | "ʌ", 240 | "ɜ", 241 | "ɐ" 242 | ] 243 | num_fr_tones = 1 244 | 245 | # German 246 | de_symbols = [ 247 | "ʏ", 248 | "̩" 249 | ] 250 | num_de_tones = 1 251 | 252 | # Russian 253 | ru_symbols = [ 254 | "ɭ", 255 | "ʲ", 256 | "ɕ", 257 | "\"", 258 | "ɵ", 259 | "^", 260 | "ɬ" 261 | ] 262 | num_ru_tones = 1 263 | 264 | # combine all symbols 265 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols + kr_symbols + es_symbols + fr_symbols + de_symbols + ru_symbols)) 266 | symbols = [pad] + normal_symbols + pu_symbols 267 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 268 | 269 | # combine all tones 270 | num_tones = num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones + num_fr_tones + num_de_tones + num_ru_tones 271 | 272 | # language maps 273 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2, "ZH_MIX_EN": 3, 'KR': 4, 'ES': 5, 'SP': 5 ,'FR': 6} 274 | num_languages = len(language_id_map.keys()) 275 | 276 | language_tone_start_map = { 277 | "ZH": 0, 278 | "ZH_MIX_EN": 0, 279 | "JP": num_zh_tones, 280 | "EN": num_zh_tones + num_ja_tones, 281 | 'KR': num_zh_tones + num_ja_tones + num_en_tones, 282 | "ES": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones, 283 | "SP": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones, 284 | "FR": 
num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones, 285 | } 286 | 287 | if __name__ == "__main__": 288 | a = set(zh_symbols) 289 | b = set(en_symbols) 290 | print(sorted(a & b)) 291 | -------------------------------------------------------------------------------- /melo/train.sh: -------------------------------------------------------------------------------- 1 | CONFIG=$1 2 | GPUS=$2 3 | MODEL_NAME=$(basename "$(dirname $CONFIG)") 4 | 5 | PORT=10902 6 | 7 | while : # auto-resume: the code sometimes crash due to bug of gloo on some gpus 8 | do 9 | torchrun --nproc_per_node=$GPUS \ 10 | --master_port=$PORT \ 11 | train.py --c $CONFIG --model $MODEL_NAME 12 | 13 | for PID in $(ps -aux | grep $CONFIG | grep python | awk '{print $2}') 14 | do 15 | echo $PID 16 | kill -9 $PID 17 | done 18 | sleep 30 19 | done -------------------------------------------------------------------------------- /melo/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = 
root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | txtsplit 2 | torch 3 | torchaudio 4 | cached_path 5 | transformers==4.27.4 6 | num2words==0.5.12 7 | unidic_lite==1.0.8 8 | unidic==1.1.0 9 | mecab-python3==1.0.9 10 | pykakasi==2.2.1 11 | fugashi==1.3.0 12 | g2p_en==2.1.0 13 | anyascii==0.3.2 14 | jamo==0.4.1 15 | gruut[de,es,fr]==2.2.3 16 | g2pkk>=0.1.1 17 | librosa==0.9.1 18 | pydub==0.25.1 19 | eng_to_ipa==0.0.2 20 | inflect==7.0.0 21 | unidecode==1.3.7 22 | pypinyin==0.50.0 23 | cn2an==0.5.22 24 | jieba==0.42.1 25 | gradio 26 | langid==1.1.6 27 | tqdm 28 | tensorboard==2.16.2 29 | loguru==0.7.2 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | from setuptools.command.develop import develop 4 | from setuptools.command.install import install 5 | 6 | 7 | cwd = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | with open('requirements.txt') as f: 10 | reqs = f.read().splitlines() 11 | class PostInstallCommand(install): 12 | """Post-installation for installation mode.""" 13 | def run(self): 14 | install.run(self) 15 | os.system('python -m unidic download') 16 | 17 | 18 | class PostDevelopCommand(develop): 19 | """Post-installation for development mode.""" 20 | def run(self): 21 | develop.run(self) 22 | os.system('python -m unidic download') 23 | 24 | setup( 25 | name='melotts', 26 | version='0.1.2', 27 | packages=find_packages(), 28 | include_package_data=True, 29 | install_requires=reqs, 30 | package_data={ 31 | '': ['*.txt', 'cmudict_*'], 32 | }, 33 | entry_points={ 34 | "console_scripts": [ 35 | "melotts = melo.main:main", 36 | "melo = melo.main:main", 37 | "melo-ui = melo.app:main", 38 | ], 39 | }, 40 | ) 41 | -------------------------------------------------------------------------------- /test/basetts_test_resources/en_egs_text.txt: -------------------------------------------------------------------------------- 1 | Did you ever hear a folk tale about a giant turtle? 
2 | Can you name five cars that were popular in the 1970s? 3 | May I ask what's your favorite university and why? 4 | Well, have you ever experienced violence in your life? 5 | Have you ever imposed restrictions? 6 | Did you ever feel guilty for not providing enough care for your pet? 7 | Would you prefer barbecue-flavored chips or plain chips? 8 | Are contractions common in English? 9 | Well, have you ever seen a slam poetry competition? 10 | Am I correct in assuming that bilateral trade agreements favor developed countries? 11 | Are there any scientific theories on why love exists in humans? 12 | Well, do you think figure skating is harder than gymnastics? 13 | Can you tell me if the apartment has a balcony or not? 14 | Have you ever overcome a challenging obstacle positively? 15 | Could you elaborate on the meaning behind that quote? 16 | Shall seniors receive higher taxes? 17 | Do you think adding a liquid flavor to coffee ruins it? 18 | Well, in our conversation about the restaurant, how would you review it overall? 19 | Have you consistently followed through with goals? 20 | Can pilots hear passengers coughing? 21 | Well, have you tried rainbow sprinkles? 22 | Are there any golden retrievers at the local animal shelter? 23 | Have you seen Tyler? 24 | Had you ever deployed to Mars? 25 | Well, have you ever felt intimidated by your competition's tactics? 26 | Are there any specific rules about when you can continue? 27 | Can you describe Antarctica's temperatures? 28 | May I ask, have you ever tasted a bloody mary before? 29 | Did anyone mention the order yet? 30 | Are automatic transmissions more fuel efficient? 31 | Shall we discuss the impact of self-control on personal success? 32 | Have you traveled internationally this May? 33 | Well, have you ever tried shrimp ceviche? 34 | Have you ever seen an act of extraordinary courage in person? 35 | Have you ever wondered how proceed affects the outcome of a project? 36 | Have you calculated the mean weight of all the participants? 37 | Should we bring confetti to the parade? 38 | Do influencers control behavior? 39 | Shall we discuss the price of the new car lease? 40 | Had Nice ever been your home? 41 | Have you ever encountered a gifted child who struggled academically? 42 | Can everyone work together? 43 | Did you know how long an ostrich can survive without water? 44 | Do nurses in long-term care facilities receive adequate training for dementia care? 45 | Has separation ever felt liberating? 46 | Would you prefer a flexible or fixed schedule for work? 47 | Does pension plan have rollover? 48 | Has Vital's mission expanded beyond health supplements? 49 | Have you ever witnessed a bombing attack? 50 | May I predict the outcome of the election based on polls? 51 | Do you think strict parenting leads to more successful children later in life? 52 | Shall we explore nearby parks? 53 | Are there any ways to verify the credibility of online reviews? 54 | Have you ever witnessed a roundabout accident? 55 | Well, upon reflection, do we really want sushi? 56 | Well, have you ever experienced workplace harassment? 57 | Do you think it's sure that the rain will stop soon? 58 | Would you say distance affects relationships? 59 | Can we truly deny the existence of higher power? 60 | Do you think crop yields will be affected by the drought? 61 | Do you think the backup plan is good enough? 62 | Can you tell me, meanwhile, what happened while I was gone? 63 | Did the wise old owl speak? 
64 | Well, have you ever been to a retreat that truly transformed you? 65 | Have you ever had to calculate the exact measurements for a recipe? 66 | Can warning signs prevent accidents while driving on icy roads? 67 | Do you think the current job market offers equal opportunity? 68 | Have you ever analyzed your own dreams? 69 | May I ask if colonialism affected your ancestry? 70 | Well, what chest exercises target the upper pecs? 71 | Are there occasionally unexpected consequences of honesty? 72 | Do you think the new restaurant is overpriced? 73 | Do critics take into account audience preferences? 74 | Has translation technology reached a point where it can accurately translate idioms? 75 | Have you ever been to a music festival in another country? 76 | Do you think our taste in food is genetic? 77 | Are you a hopeless romantic at heart? 78 | Shall we explore abandoned urban places? 79 | Does agency promote individualism? 80 | Well, what implementing strategies? 81 | Have you ever noticed the smallest detail that changed your perspective? 82 | Have you ever seen a normal ghost? 83 | Have you ever considered the considerable effort? 84 | Are there holistic chronic cure? 85 | Did unemployment rates change recently? 86 | Does change come from within or without? 87 | Does the length of the patent term affect innovation rates? 88 | Can Junior play basketball? 89 | Shall we analyze the data? 90 | Have you ever tried the Szechuan cuisine before? 91 | Had you ever debated a controversial topic before? 92 | Have you ever analyzed case? 93 | Is it true that stripping originated in ancient Egypt or Greece? 94 | Have you ever dyed your hair a crazy color? 95 | Shall we compare the top-rated pizza places in our city? 96 | May people in different countries play soccer? 97 | Well, have you recycled? 98 | Shall we precisely measure ingredients? 99 | Can you embrace someone you don't love? -------------------------------------------------------------------------------- /test/basetts_test_resources/es_egs_text.txt: -------------------------------------------------------------------------------- 1 | El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante. 2 | Las estrellas bailan en la noche, creando un espectáculo celestial que despierta el alma. 3 | Las majestuosas montañas se alzan en silencio, guardianas inmutables del tiempo que pasa. 4 | El amor, como un suave perfume, envuelve nuestros corazones con un calor reconfortante. 5 | El susurro suave del viento atraviesa los campos de lavanda, llevándose consigo el aroma de la Provenza. 6 | El resplandor de la luna baña la ciudad dormida en una luz mística. 7 | Las calles empedradas revelan historias antiguas, cada piedra llevando el peso del pasado. 8 | La risa de los niños resuena como una melodía encantada en el suave aire de la primavera. 9 | Los jardines floridos estallan con colores vibrantes, creando un cuadro viviente de la naturaleza. 10 | Las olas acarician suavemente la playa, dejando tras de sí huellas efímeras en la arena. 11 | La Torre Eiffel se yergue con orgullo, testigo silencioso del amor eterno en París. 12 | Las mariposas danzan entre las flores, creando una coreografía grácil en el jardín. 13 | Los animados cafés resuenan con conversaciones apasionadas y el embriagador aroma del café recién molido. 14 | Los ríos serpenteantes atraviesan el campo, reflejando el cielo azul en sus aguas tranquilas. 15 | Los imponentes castillos cuentan historias de caballeros y princesas en un pasado lejano. 
16 | Los viñedos se extienden hasta donde alcanza la vista, sus filas ordenadas testimonio de la antigua tradición vinícola. 17 | Las risas resuenan en las estrechas callejuelas, despertando la vieja ciudad de su quietud. 18 | Los campos de girasoles saludan al sol con sus caras doradas, un mar de oro bajo un cielo azul. 19 | Las notas melódicas de un acordeón flotan en el aire, capturando la esencia musical de las calles parisinas. 20 | Las cumbres nevadas de los Alpes brillan bajo la luz de la luna, un paisaje invernal de ensueño. -------------------------------------------------------------------------------- /test/basetts_test_resources/fr_egs_text.txt: -------------------------------------------------------------------------------- 1 | La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante. 2 | Les étoiles dansent dans la nuit, créant un spectacle céleste qui éveille l'âme. 3 | Les montagnes majestueuses se dressent en silence, gardiennes immuables du temps qui passe. 4 | L'amour, tel un doux parfum, enveloppe nos cœurs d'une chaleur réconfortante. 5 | Le doux murmure du vent traverse les champs de lavande, emportant avec lui le parfum de la Provence. 6 | La lueur de la lune baigne la ville endormie dans une lumière mystique. 7 | Les ruelles pavées révèlent des histoires anciennes, chaque pierre portant le poids du passé. 8 | Le rire des enfants résonne comme une mélodie enchantée dans l'air doux du printemps. 9 | Les jardins fleuris éclatent de couleurs vives, créant un tableau vivant de la nature. 10 | Les vagues caressent doucement la plage, laissant derrière elles des traces éphémères dans le sable. 11 | La Tour Eiffel se dresse fièrement, témoin silencieux de l'amour éternel à Paris. 12 | Les papillons dansent parmi les fleurs, créant une chorégraphie gracieuse dans le jardin. 13 | Les cafés animés résonnent de conversations passionnées et du parfum enivrant du café fraîchement moulu. 14 | Les rivières sinueuses traversent la campagne, reflétant le ciel azur dans leurs eaux calmes. 15 | Les châteaux imposants racontent des contes de chevaliers et de princesses dans un passé lointain. 16 | Les vignobles s'étendent à perte de vue, leurs rangées ordonnées témoignant du savoir-faire viticole ancestral. 17 | Les éclats de rire résonnent dans les ruelles étroites, réveillant la vieille ville de sa quiétude. 18 | Les champs de tournesols saluent le soleil avec leurs visages dorés, une mer d'or sous un ciel d'azur. 19 | Les notes mélodieuses d'un accordéon flottent dans l'air, capturant l'essence musicale des rues parisiennes. 20 | Les sommets enneigés des Alpes brillent sous la lumière de la lune, un paysage hivernal féérique. -------------------------------------------------------------------------------- /test/basetts_test_resources/jp_egs_text.txt: -------------------------------------------------------------------------------- 1 | 彼は毎朝ジョギングをして体を健康に保っています。 2 | 私たちは来年、友人たちと一緒にヨーロッパ旅行を計画しています。 3 | 新しいレストランで美味しい料理を試すことが楽しみです。 4 | 彼女の絵は情熱と芸術性が溢れていて、見る人を魅了します。 5 | 最近、忙しさに追われていて、ゆっくり休む時間がありません。 6 | 日本の文化は多様で魅力的であり、世界中から注目されています。 7 | 彼の犬は忠実で賢く、家族にとって大切な存在です。 8 | 私の友達は常に私をサポートしてくれる信頼できる存在です。 9 | 家族と一緒に過ごす時間は、私にとって何よりも大切です。 10 | 彼の夢は大きく、努力と決意でそれを実現しようとしています。 -------------------------------------------------------------------------------- /test/basetts_test_resources/kr_egs_text.txt: -------------------------------------------------------------------------------- 1 | 안녕하세요! 오늘은 날씨가 정말 좋네요. 2 | 한국 음식을 먹어보고 싶어요. 불고기랑 김치찌개가 제가 좋아하는 음식이에요. 
3 | 요즘에는 한국 드라마를 자주 보고 있어요. 정말 재미있어요. 4 | 한글을 배우는 것이 재미있어요. 조금씩 읽고 쓸 수 있게 되고 있어요. 5 | 친구들과 함께 한국 여행을 계획 중이에요. 서울과 부산을 방문할 예정이에요., -------------------------------------------------------------------------------- /test/basetts_test_resources/zh_mix_en_egs_text.txt: -------------------------------------------------------------------------------- 1 | 人工智能是一种非常适合和促进自上而下集中控制的技术,而加密货币则是一种完全关注自下而上分散合作的技术。 2 | Web 3的一个目标是支持艺术家。 3 | 欢迎来到Web 3与A6Z,一个由团队打造的构建下一代互联网的节目。 4 | 我最喜欢的fruit是苹果。 5 | 今天我们要学习Python programming。 6 | 她在library看书。 7 | 你喜欢听pop music吗? 8 | 今天下午,我们准备去shopping mall购物,然后晚上去看一场movie。 9 | 我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。 10 | 在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。 11 | 今天天气真不错,我们去Paris吃蒸汽海鲜吧!, -------------------------------------------------------------------------------- /test/test_base_model_tts_package.py: -------------------------------------------------------------------------------- 1 | from melo.api import TTS 2 | import os 3 | import glob 4 | import sys 5 | 6 | 7 | language = sys.argv[1] 8 | model = TTS(language=language) 9 | 10 | speaker_ids = model.hps.data.spk2id 11 | speakers = list(speaker_ids.keys()) 12 | 13 | root_folder = language.lower() 14 | if 'zh' in root_folder: 15 | texts = open('basetts_test_resources/zh_mix_en_egs_text.txt', 'r').readlines() 16 | language = 'ZH_MIX_EN' 17 | elif 'es' in root_folder: 18 | texts = open('basetts_test_resources/es_egs_text.txt', 'r').readlines() 19 | language = 'SP' 20 | elif 'fr' in root_folder: 21 | texts = open('basetts_test_resources/fr_egs_text.txt', 'r').readlines() 22 | language = 'FR' 23 | elif 'en' in root_folder: 24 | texts = open('basetts_test_resources/en_egs_text.txt', 'r').readlines() 25 | # texts = ["Boss? You're not my boss, you're just a sad little person who likes to hide behind a computer screen and pretend you have power over others. 
"] 26 | language = 'EN' 27 | elif 'jp' in root_folder: 28 | texts = open('basetts_test_resources/jp_egs_text.txt', 'r').readlines() 29 | language = 'JP' 30 | elif 'kr' in root_folder: 31 | texts = open('basetts_test_resources/kr_egs_text.txt', 'r').readlines() 32 | language = 'KR' 33 | else: 34 | raise NotImplementedError() 35 | 36 | save_dir = os.path.join('basetts_outputs_package', root_folder.split('/')[-1]) 37 | 38 | for speed in [1.0]: 39 | for speaker in speakers: 40 | for sent_id, text in enumerate(texts): 41 | output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav' 42 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 43 | model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed) -------------------------------------------------------------------------------- /test/test_base_model_tts_package_from_S3.py: -------------------------------------------------------------------------------- 1 | from melo.api import TTS 2 | import os 3 | import glob 4 | import sys 5 | 6 | 7 | language = sys.argv[1] 8 | model = TTS(language=language, use_hf=False) 9 | 10 | speaker_ids = model.hps.data.spk2id 11 | speakers = list(speaker_ids.keys()) 12 | 13 | root_folder = language.lower() 14 | if 'zh' in root_folder: 15 | texts = open('basetts_test_resources/zh_mix_en_egs_text.txt', 'r').readlines() 16 | language = 'ZH_MIX_EN' 17 | elif 'es' in root_folder: 18 | texts = open('basetts_test_resources/es_egs_text.txt', 'r').readlines() 19 | language = 'SP' 20 | elif 'fr' in root_folder: 21 | texts = open('basetts_test_resources/fr_egs_text.txt', 'r').readlines() 22 | language = 'FR' 23 | elif 'en' in root_folder: 24 | texts = open('basetts_test_resources/en_egs_text.txt', 'r').readlines() 25 | # texts = ["Boss? You're not my boss, you're just a sad little person who likes to hide behind a computer screen and pretend you have power over others. "] 26 | language = 'EN' 27 | elif 'jp' in root_folder: 28 | texts = open('basetts_test_resources/jp_egs_text.txt', 'r').readlines() 29 | language = 'JP' 30 | elif 'kr' in root_folder: 31 | texts = open('basetts_test_resources/kr_egs_text.txt', 'r').readlines() 32 | language = 'KR' 33 | else: 34 | raise NotImplementedError() 35 | 36 | save_dir = os.path.join('basetts_outputs_package_from_S3', root_folder.split('/')[-1]) 37 | 38 | for speed in [1.0]: 39 | for speaker in speakers: 40 | for sent_id, text in enumerate(texts): 41 | output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav' 42 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 43 | model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed) --------------------------------------------------------------------------------