├── .github
│   └── workflows
│       └── pypi.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── docs
│   ├── install.md
│   ├── quick_use.md
│   └── training.md
├── logo.png
├── melo
│   ├── __init__.py
│   ├── api.py
│   ├── app.py
│   ├── attentions.py
│   ├── commons.py
│   ├── configs
│   │   └── config.json
│   ├── data
│   │   └── example
│   │       └── metadata.list
│   ├── data_utils.py
│   ├── download_utils.py
│   ├── infer.py
│   ├── init_downloads.py
│   ├── losses.py
│   ├── main.py
│   ├── mel_processing.py
│   ├── models.py
│   ├── modules.py
│   ├── monotonic_align
│   │   ├── __init__.py
│   │   └── core.py
│   ├── preprocess_text.py
│   ├── split_utils.py
│   ├── text
│   │   ├── __init__.py
│   │   ├── chinese.py
│   │   ├── chinese_bert.py
│   │   ├── chinese_mix.py
│   │   ├── cleaner.py
│   │   ├── cleaner_multiling.py
│   │   ├── cmudict.rep
│   │   ├── cmudict_cache.pickle
│   │   ├── english.py
│   │   ├── english_bert.py
│   │   ├── english_utils
│   │   │   ├── __init__.py
│   │   │   ├── abbreviations.py
│   │   │   ├── number_norm.py
│   │   │   └── time_norm.py
│   │   ├── es_phonemizer
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── cleaner.py
│   │   │   ├── es_symbols.json
│   │   │   ├── es_symbols.txt
│   │   │   ├── es_symbols_v2.json
│   │   │   ├── es_to_ipa.py
│   │   │   ├── example_ipa.txt
│   │   │   ├── gruut_wrapper.py
│   │   │   ├── punctuation.py
│   │   │   ├── spanish_symbols.txt
│   │   │   └── test.ipynb
│   │   ├── fr_phonemizer
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── cleaner.py
│   │   │   ├── en_symbols.json
│   │   │   ├── example_ipa.txt
│   │   │   ├── fr_symbols.json
│   │   │   ├── fr_to_ipa.py
│   │   │   ├── french_abbreviations.py
│   │   │   ├── french_symbols.txt
│   │   │   ├── gruut_wrapper.py
│   │   │   └── punctuation.py
│   │   ├── french.py
│   │   ├── french_bert.py
│   │   ├── japanese.py
│   │   ├── japanese_bert.py
│   │   ├── ko_dictionary.py
│   │   ├── korean.py
│   │   ├── opencpop-strict.txt
│   │   ├── spanish.py
│   │   ├── spanish_bert.py
│   │   ├── symbols.py
│   │   └── tone_sandhi.py
│   ├── train.py
│   ├── train.sh
│   ├── transforms.py
│   └── utils.py
├── requirements.txt
├── setup.py
└── test
    ├── basetts_test_resources
    │   ├── en_egs_text.txt
    │   ├── es_egs_text.txt
    │   ├── fr_egs_text.txt
    │   ├── jp_egs_text.txt
    │   ├── kr_egs_text.txt
    │   └── zh_mix_en_egs_text.txt
    ├── test_base_model_tts_package.py
    └── test_base_model_tts_package_from_S3.py
/.github/workflows/pypi.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.x'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | python -m ensurepip --upgrade
33 | pip install build
34 | - name: Build package
35 | run: python -m build
36 | - name: Publish package
37 | uses: pypa/gh-action-pypi-publish@release/v1.8
38 | with:
39 | user: __token__
40 | password: ${{ secrets.PYPI_API_TOKEN }}
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .ipynb_checkpoints/
3 | basetts_outputs_use_bert/
4 | basetts_outputs/
5 | multilingual_ckpts
6 | basetts_outputs_package/
7 | build/
8 | *.egg-info/
9 |
10 | *.zip
11 | *.wav
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim
2 | WORKDIR /app
3 | COPY . /app
4 |
5 | RUN apt-get update && apt-get install -y \
6 | build-essential libsndfile1 \
7 | && rm -rf /var/lib/apt/lists/*
8 |
9 | RUN pip install -e .
10 | RUN python -m unidic download
11 | RUN python melo/init_downloads.py
12 |
13 | CMD ["python", "./melo/app.py", "--host", "0.0.0.0", "--port", "8888"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024 MyShell.ai
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | ## Introduction
8 | MeloTTS is a **high-quality multi-lingual** text-to-speech library by [MIT](https://www.mit.edu/) and [MyShell.ai](https://myshell.ai). Supported languages include:
9 |
10 | | Language | Example |
11 | | --- | --- |
12 | | English (American) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-US/speed_1.0/sent_000.wav) |
13 | | English (British) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-BR/speed_1.0/sent_000.wav) |
14 | | English (Indian) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN_INDIA/speed_1.0/sent_000.wav) |
15 | | English (Australian) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-AU/speed_1.0/sent_000.wav) |
16 | | English (Default) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-Default/speed_1.0/sent_000.wav) |
17 | | Spanish | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/es/ES/speed_1.0/sent_000.wav) |
18 | | French | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/fr/FR/speed_1.0/sent_000.wav) |
19 | | Chinese (mix EN) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/zh/ZH/speed_1.0/sent_008.wav) |
20 | | Japanese | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/jp/JP/speed_1.0/sent_000.wav) |
21 | | Korean | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/kr/KR/speed_1.0/sent_000.wav) |
22 |
23 | Some other features include:
24 | - The Chinese speaker supports `mixed Chinese and English`.
25 | - Fast enough for `CPU real-time inference`.
26 |
27 | ## Usage
28 | - [Use without Installation](docs/quick_use.md)
29 | - [Install and Use Locally](docs/install.md)
30 | - [Training on Custom Dataset](docs/training.md)
31 |
32 | The Python API and model cards can be found in [this repo](https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#python-api) or on [HuggingFace](https://huggingface.co/myshell-ai).
33 |
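A minimal usage sketch of the Python API (mirroring the examples in [docs/install.md](docs/install.md)):

```python
from melo.api import TTS

# 'auto' selects CUDA or MPS when available, otherwise CPU.
model = TTS(language='EN', device='auto')
speaker_ids = model.hps.data.spk2id  # e.g. 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default'

# Synthesize to a wav file; speed is adjustable.
model.tts_to_file("Did you ever hear a folk tale about a giant turtle?",
                  speaker_ids['EN-US'], 'en-us.wav', speed=1.0)
```
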
34 | **Contributing**
35 |
36 | If you find this work useful, please consider contributing to this repo.
37 |
38 | - Many thanks to [@fakerybakery](https://github.com/fakerybakery) for adding the Web UI and CLI.
39 |
40 | ## Authors
41 |
42 | - [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University
43 | - [Xumin Yu](https://yuxumin.github.io) at Tsinghua University
44 | - [Zengyi Qin](https://www.qinzy.tech) (project lead) at MIT and MyShell
45 |
46 | **Citation**
47 | ```
48 | @software{zhao2024melo,
49 | author={Zhao, Wenliang and Yu, Xumin and Qin, Zengyi},
50 | title = {MeloTTS: High-quality Multi-lingual Multi-accent Text-to-Speech},
51 | url = {https://github.com/myshell-ai/MeloTTS},
52 | year = {2023}
53 | }
54 | ```
55 |
56 | ## License
57 |
58 | This library is under the MIT License, which means it is free for both commercial and non-commercial use.
59 |
60 | ## Acknowledgements
61 |
62 | This implementation is based on [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), [VITS2](https://github.com/daniilrobnikov/vits2) and [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2). We appreciate their awesome work.
63 |
--------------------------------------------------------------------------------
/docs/install.md:
--------------------------------------------------------------------------------
1 | ## Install and Use Locally
2 |
3 | ### Table of Contents
4 | - [Linux and macOS Install](#linux-and-macos-install)
5 | - [Docker Install for Windows and macOS](#docker-install)
6 | - [Usage](#usage)
7 | - [Web UI](#webui)
8 | - [CLI](#cli)
9 | - [Python API](#python-api)
10 |
11 | ### Linux and macOS Install
12 | The repo is developed and tested on `Ubuntu 20.04` and `Python 3.9`.
13 | ```bash
14 | git clone https://github.com/myshell-ai/MeloTTS.git
15 | cd MeloTTS
16 | pip install -e .
17 | python -m unidic download
18 | ```
19 | If you encounter issues with the macOS install, try the [Docker Install](#docker-install).
20 |
21 | ### Docker Install
22 | To avoid compatibility issues, we suggest that Windows users and some macOS users run MeloTTS via Docker. Ensure that [you have Docker installed](https://docs.docker.com/engine/install/).
23 |
24 | **Build Docker**
25 |
26 | This could take a few minutes.
27 | ```bash
28 | git clone https://github.com/myshell-ai/MeloTTS.git
29 | cd MeloTTS
30 | docker build -t melotts .
31 | ```
32 |
33 | **Run Docker**
34 | ```bash
35 | docker run -it -p 8888:8888 melotts
36 | ```
37 | If your local machine has a GPU, you can instead run:
38 | ```bash
39 | docker run --gpus all -it -p 8888:8888 melotts
40 | ```
41 | Then open [http://localhost:8888](http://localhost:8888) in your browser to use the app.
42 |
43 | ## Usage
44 |
45 | ### WebUI
46 |
47 | The WebUI supports multiple languages and voices. First, follow the installation steps. Then, simply run:
48 |
49 | ```bash
50 | melo-ui
51 | # Or: python melo/app.py
52 | ```
53 |
54 | ### CLI
55 |
56 | You can interact with MeloTTS through its CLI, which may be invoked using either `melotts` or `melo`. Here are some examples:
57 |
58 | **Read English text:**
59 |
60 | ```bash
61 | melo "Text to read" output.wav
62 | ```
63 |
64 | **Specify a language:**
65 |
66 | ```bash
67 | melo "Text to read" output.wav --language EN
68 | ```
69 |
70 | **Specify a speaker:**
71 |
72 | ```bash
73 | melo "Text to read" output.wav --language EN --speaker EN-US
74 | melo "Text to read" output.wav --language EN --speaker EN-AU
75 | ```
76 |
77 | The available speakers are `EN-Default`, `EN-US`, `EN-BR`, `EN_INDIA`, and `EN-AU`.
78 |
79 | **Specify a speed:**
80 |
81 | ```bash
82 | melo "Text to read" output.wav --language EN --speaker EN-US --speed 1.5
83 | melo "Text to read" output.wav --speed 1.5
84 | ```
85 |
86 | **Use a different language:**
87 |
88 | ```bash
89 | melo "text-to-speech 领域近年来发展迅速" zh.wav -l ZH
90 | ```
91 |
92 | **Load from a file:**
93 |
94 | ```bash
95 | melo file.txt out.wav --file
96 | ```
97 |
98 | The full CLI documentation may be found using:
99 |
100 | ```bash
101 | melo --help
102 | ```
103 |
104 | ### Python API
105 |
106 | #### English with Multiple Accents
107 |
108 | ```python
109 | from melo.api import TTS
110 |
111 | # Speed is adjustable
112 | speed = 1.0
113 |
114 | # CPU is sufficient for real-time inference.
115 | # You can set it manually to 'cpu' or 'cuda' or 'cuda:0' or 'mps'
116 | device = 'auto' # Will automatically use GPU if available
117 |
118 | # English
119 | text = "Did you ever hear a folk tale about a giant turtle?"
120 | model = TTS(language='EN', device=device)
121 | speaker_ids = model.hps.data.spk2id
122 |
123 | # American accent
124 | output_path = 'en-us.wav'
125 | model.tts_to_file(text, speaker_ids['EN-US'], output_path, speed=speed)
126 |
127 | # British accent
128 | output_path = 'en-br.wav'
129 | model.tts_to_file(text, speaker_ids['EN-BR'], output_path, speed=speed)
130 |
131 | # Indian accent
132 | output_path = 'en-india.wav'
133 | model.tts_to_file(text, speaker_ids['EN_INDIA'], output_path, speed=speed)
134 |
135 | # Australian accent
136 | output_path = 'en-au.wav'
137 | model.tts_to_file(text, speaker_ids['EN-AU'], output_path, speed=speed)
138 |
139 | # Default accent
140 | output_path = 'en-default.wav'
141 | model.tts_to_file(text, speaker_ids['EN-Default'], output_path, speed=speed)
142 |
143 | ```
144 |
145 | #### Spanish
146 | ```python
147 | from melo.api import TTS
148 |
149 | # Speed is adjustable
150 | speed = 1.0
151 |
152 | # CPU is sufficient for real-time inference.
153 | # You can also change to cuda:0
154 | device = 'cpu'
155 |
156 | text = "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante."
157 | model = TTS(language='ES', device=device)
158 | speaker_ids = model.hps.data.spk2id
159 |
160 | output_path = 'es.wav'
161 | model.tts_to_file(text, speaker_ids['ES'], output_path, speed=speed)
162 | ```
163 |
164 | #### French
165 |
166 | ```python
167 | from melo.api import TTS
168 |
169 | # Speed is adjustable
170 | speed = 1.0
171 | device = 'cpu' # or cuda:0
172 |
173 | text = "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante."
174 | model = TTS(language='FR', device=device)
175 | speaker_ids = model.hps.data.spk2id
176 |
177 | output_path = 'fr.wav'
178 | model.tts_to_file(text, speaker_ids['FR'], output_path, speed=speed)
179 | ```
180 |
181 | #### Chinese
182 |
183 | ```python
184 | from melo.api import TTS
185 |
186 | # Speed is adjustable
187 | speed = 1.0
188 | device = 'cpu' # or cuda:0
189 |
190 | text = "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。"
191 | model = TTS(language='ZH', device=device)
192 | speaker_ids = model.hps.data.spk2id
193 |
194 | output_path = 'zh.wav'
195 | model.tts_to_file(text, speaker_ids['ZH'], output_path, speed=speed)
196 | ```
197 |
198 | #### Japanese
199 |
200 | ```python
201 | from melo.api import TTS
202 |
203 | # Speed is adjustable
204 | speed = 1.0
205 | device = 'cpu' # or cuda:0
206 |
207 | text = "彼は毎朝ジョギングをして体を健康に保っています。"
208 | model = TTS(language='JP', device=device)
209 | speaker_ids = model.hps.data.spk2id
210 |
211 | output_path = 'jp.wav'
212 | model.tts_to_file(text, speaker_ids['JP'], output_path, speed=speed)
213 | ```
214 |
215 | #### Korean
216 |
217 | ```python
218 | from melo.api import TTS
219 |
220 | # Speed is adjustable
221 | speed = 1.0
222 | device = 'cpu' # or cuda:0
223 |
224 | text = "안녕하세요! 오늘은 날씨가 정말 좋네요."
225 | model = TTS(language='KR', device=device)
226 | speaker_ids = model.hps.data.spk2id
227 |
228 | output_path = 'kr.wav'
229 | model.tts_to_file(text, speaker_ids['KR'], output_path, speed=speed)
230 | ```
231 |
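#### Getting Raw Audio

As implemented in `melo/api.py`, `tts_to_file` returns the waveform as a NumPy array when `output_path` is `None`, which is handy if you want to post-process audio in memory instead of writing a file. A minimal sketch:

```python
from melo.api import TTS

model = TTS(language='EN', device='auto')
speaker_ids = model.hps.data.spk2id

# With output_path=None, tts_to_file returns a float32 NumPy array
# sampled at model.hps.data.sampling_rate (44100 Hz in the default config).
audio = model.tts_to_file('This stays in memory.', speaker_ids['EN-US'], output_path=None)
print(audio.shape, model.hps.data.sampling_rate)
```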
--------------------------------------------------------------------------------
/docs/quick_use.md:
--------------------------------------------------------------------------------
1 | ## Use MeloTTS without Installation
2 |
3 | **Quick Demo**
4 |
5 | - [Official live demo](https://app.myshell.ai/bot/UN77N3/1709094629) on MyShell.
6 | - Hugging Face Space [live demo](https://huggingface.co/spaces/mrfakename/MeloTTS).
7 |
8 | **Use on MyShell**
9 |
10 | There are hundreds of TTS voices on MyShell, many more than MeloTTS alone provides. For example:
11 |
12 | English
13 | - [gentle British male voice](https://app.myshell.ai/widget/nIfamm)
14 | - [cheerful young female voice](https://app.myshell.ai/widget/AjIjqy)
15 | - [sultry and robust male voice](https://app.myshell.ai/widget/zQJJN3)
16 |
17 | Spanish
18 | - [voz femenina adorable](https://app.myshell.ai/widget/buIZBf)
19 | - [voz masculina joven](https://app.myshell.ai/widget/rayuiy)
20 | - [voz de niña inmadura](https://app.myshell.ai/widget/mYFV3e)
21 |
22 | French
23 | - [voix adorable de fille](https://app.myshell.ai/widget/3IfEfy)
24 | - [voix douce masculine](https://app.myshell.ai/widget/IRR3M3)
25 | - [voix douce féminine](https://app.myshell.ai/widget/NRbaUj)
26 |
27 | German
28 | - [sanfte Männerstimme](https://app.myshell.ai/widget/JFnAn2)
29 | - [sanfte Frauenstimme](https://app.myshell.ai/widget/MrU7Nb)
30 | - [unreife Mädchenstimme](https://app.myshell.ai/widget/UFbYBj)
31 |
32 | Portuguese
33 | - [voz feminina nítida](https://app.myshell.ai/widget/VzMb6j)
34 | - [voz de menino imaturo](https://app.myshell.ai/widget/nAzeei)
35 | - [voz masculina sóbria](https://app.myshell.ai/widget/JZRNJz)
36 |
37 | Russian
38 | - [зрелый женский голос](https://app.myshell.ai/widget/6byMZ3)
39 | - [зрелый мужской голос](https://app.myshell.ai/widget/NB7jmm)
40 |
41 | Chinese
42 | - [甜美女声](https://app.myshell.ai/widget/ymeUjm)
43 | - [青年男声](https://app.myshell.ai/widget/NZnERb)
44 |
45 | More can be found at the widget center of [MyShell.ai](https://app.myshell.ai/robot-workshop).
46 |
--------------------------------------------------------------------------------
/docs/training.md:
--------------------------------------------------------------------------------
1 | ## Training
2 |
3 | Before training, please install MeloTTS in dev mode and go to the `melo` folder.
4 | ```
5 | pip install -e .
6 | cd melo
7 | ```
8 |
9 | ### Data Preparation
10 | To train a TTS model, we need to prepare the audio files and a metadata file. We recommend using 44100 Hz audio files. The metadata file should have the following format:
11 |
12 | ```
13 | path/to/audio_001.wav|<speaker_name>|<language_code>|<transcribed_text_001>
14 | path/to/audio_002.wav|<speaker_name>|<language_code>|<transcribed_text_002>
15 | ```
16 | The transcribed text can be obtained with an ASR model (e.g., [Whisper](https://github.com/openai/whisper)). An example metadata file can be found in `data/example/metadata.list`.
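For reference, each line of `data/example/metadata.list` pairs an audio path with the speaker name, language code, and transcription, separated by `|`:
```
data/example/wavs/003.wav|EN-default|EN|That's really insightful.
```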
17 |
18 | We can then run the preprocessing code:
19 | ```
20 | python preprocess_text.py --metadata data/example/metadata.list
21 | ```
22 | A config file `data/example/config.json` will be generated. Feel free to edit some hyper-parameters in that config file (for example, you may decrease the batch size if you encounter CUDA out-of-memory errors).
23 |
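The generated config inherits its defaults from `melo/configs/config.json` (see `preprocess_text.py`). As a rough guide, the fields most commonly tuned sit in the `train` block; an excerpt of the defaults, with other fields omitted:
```
{
  "train": {
    "log_interval": 200,
    "eval_interval": 1000,
    "epochs": 10000,
    "learning_rate": 0.0003,
    "batch_size": 6,
    "segment_size": 16384
  }
}
```
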
24 | ### Training
25 | The training can be launched by:
26 | ```
27 | bash train.sh
28 | ```
29 |
30 | We have found that on some machines the training will sometimes crash due to a gloo [issue](https://github.com/pytorch/pytorch/issues/2530). Therefore, we added an auto-resume wrapper in `train.sh`.
31 |
32 | ### Inference
33 | Simply run:
34 | ```
35 | python infer.py --text "<some text here>" -m /path/to/checkpoint/G_<iter>.pth -o <output_dir>
36 | ```
37 |
38 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/logo.png
--------------------------------------------------------------------------------
/melo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/__init__.py
--------------------------------------------------------------------------------
/melo/api.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 | import torch
5 | import librosa
6 | import soundfile
7 | import torchaudio
8 | import numpy as np
9 | import torch.nn as nn
10 | from tqdm import tqdm
11 | import torch
12 |
13 | from . import utils
14 | from . import commons
15 | from .models import SynthesizerTrn
16 | from .split_utils import split_sentence
17 | from .mel_processing import spectrogram_torch, spectrogram_torch_conv
18 | from .download_utils import load_or_download_config, load_or_download_model
19 |
20 | class TTS(nn.Module):
21 | def __init__(self,
22 | language,
23 | device='auto',
24 | use_hf=True,
25 | config_path=None,
26 | ckpt_path=None):
27 | super().__init__()
28 | if device == 'auto':
29 | device = 'cpu'
30 | if torch.cuda.is_available(): device = 'cuda'
31 | if torch.backends.mps.is_available(): device = 'mps'
32 | if 'cuda' in device:
33 | assert torch.cuda.is_available()
34 |
35 | # config_path =
36 | hps = load_or_download_config(language, use_hf=use_hf, config_path=config_path)
37 |
38 | num_languages = hps.num_languages
39 | num_tones = hps.num_tones
40 | symbols = hps.symbols
41 |
42 | model = SynthesizerTrn(
43 | len(symbols),
44 | hps.data.filter_length // 2 + 1,
45 | hps.train.segment_size // hps.data.hop_length,
46 | n_speakers=hps.data.n_speakers,
47 | num_tones=num_tones,
48 | num_languages=num_languages,
49 | **hps.model,
50 | ).to(device)
51 |
52 | model.eval()
53 | self.model = model
54 | self.symbol_to_id = {s: i for i, s in enumerate(symbols)}
55 | self.hps = hps
56 | self.device = device
57 |
58 | # load state_dict
59 | checkpoint_dict = load_or_download_model(language, device, use_hf=use_hf, ckpt_path=ckpt_path)
60 | self.model.load_state_dict(checkpoint_dict['model'], strict=True)
61 |
62 | language = language.split('_')[0]
63 | self.language = 'ZH_MIX_EN' if language == 'ZH' else language # we support a ZH_MIX_EN model
64 |
65 | @staticmethod
66 | def audio_numpy_concat(segment_data_list, sr, speed=1.):
67 | audio_segments = []
68 | for segment_data in segment_data_list:
69 | audio_segments += segment_data.reshape(-1).tolist()
70 | audio_segments += [0] * int((sr * 0.05) / speed)
71 | audio_segments = np.array(audio_segments).astype(np.float32)
72 | return audio_segments
73 |
74 | @staticmethod
75 | def split_sentences_into_pieces(text, language, quiet=False):
76 | texts = split_sentence(text, language_str=language)
77 | if not quiet:
78 | print(" > Text split to sentences.")
79 | print('\n'.join(texts))
80 | print(" > ===========================")
81 | return texts
82 |
83 | def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, speed=1.0, pbar=None, format=None, position=None, quiet=False,):
84 | language = self.language
85 | texts = self.split_sentences_into_pieces(text, language, quiet)
86 | audio_list = []
87 | if pbar:
88 | tx = pbar(texts)
89 | else:
90 | if position:
91 | tx = tqdm(texts, position=position)
92 | elif quiet:
93 | tx = texts
94 | else:
95 | tx = tqdm(texts)
96 | for t in tx:
97 | if language in ['EN', 'ZH_MIX_EN']:
98 | t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
99 | device = self.device
100 | bert, ja_bert, phones, tones, lang_ids = utils.get_text_for_tts_infer(t, language, self.hps, device, self.symbol_to_id)
101 | with torch.no_grad():
102 | x_tst = phones.to(device).unsqueeze(0)
103 | tones = tones.to(device).unsqueeze(0)
104 | lang_ids = lang_ids.to(device).unsqueeze(0)
105 | bert = bert.to(device).unsqueeze(0)
106 | ja_bert = ja_bert.to(device).unsqueeze(0)
107 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
108 | del phones
109 | speakers = torch.LongTensor([speaker_id]).to(device)
110 | audio = self.model.infer(
111 | x_tst,
112 | x_tst_lengths,
113 | speakers,
114 | tones,
115 | lang_ids,
116 | bert,
117 | ja_bert,
118 | sdp_ratio=sdp_ratio,
119 | noise_scale=noise_scale,
120 | noise_scale_w=noise_scale_w,
121 | length_scale=1. / speed,
122 | )[0][0, 0].data.cpu().float().numpy()
123 | del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers
124 | #
125 | audio_list.append(audio)
126 | torch.cuda.empty_cache()
127 | audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
128 |
129 | if output_path is None:
130 | return audio
131 | else:
132 | if format:
133 | soundfile.write(output_path, audio, self.hps.data.sampling_rate, format=format)
134 | else:
135 | soundfile.write(output_path, audio, self.hps.data.sampling_rate)
136 |
--------------------------------------------------------------------------------
/melo/app.py:
--------------------------------------------------------------------------------
1 | # WebUI by mrfakename
2 | # Demo also available on HF Spaces: https://huggingface.co/spaces/mrfakename/MeloTTS
3 | import gradio as gr
4 | import os, torch, io
5 | # os.system('python -m unidic download')
6 | print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.")
7 | from melo.api import TTS
8 | speed = 1.0
9 | import tempfile
10 | import click
11 | device = 'auto'
12 | models = {
13 | 'EN': TTS(language='EN', device=device),
14 | 'ES': TTS(language='ES', device=device),
15 | 'FR': TTS(language='FR', device=device),
16 | 'ZH': TTS(language='ZH', device=device),
17 | 'JP': TTS(language='JP', device=device),
18 | 'KR': TTS(language='KR', device=device),
19 | }
20 | speaker_ids = models['EN'].hps.data.spk2id
21 |
22 | default_text_dict = {
23 | 'EN': 'The field of text-to-speech has seen rapid development recently.',
24 | 'ES': 'El campo de la conversión de texto a voz ha experimentado un rápido desarrollo recientemente.',
25 | 'FR': 'Le domaine de la synthèse vocale a connu un développement rapide récemment',
26 | 'ZH': 'text-to-speech 领域近年来发展迅速',
27 | 'JP': 'テキスト読み上げの分野は最近急速な発展を遂げています',
28 | 'KR': '최근 텍스트 음성 변환 분야가 급속도로 발전하고 있습니다.',
29 | }
30 |
31 | def synthesize(speaker, text, speed, language, progress=gr.Progress()):
32 | bio = io.BytesIO()
33 | models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, pbar=progress.tqdm, format='wav')
34 | return bio.getvalue()
35 | def load_speakers(language, text):
36 | if text in list(default_text_dict.values()):
37 | newtext = default_text_dict[language]
38 | else:
39 | newtext = text
40 | return gr.update(value=list(models[language].hps.data.spk2id.keys())[0], choices=list(models[language].hps.data.spk2id.keys())), newtext
41 | with gr.Blocks() as demo:
42 | gr.Markdown('# MeloTTS WebUI\n\nA WebUI for MeloTTS.')
43 | with gr.Group():
44 | speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker')
45 | language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN')
46 | speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1)
47 | text = gr.Textbox(label="Text to speak", value=default_text_dict['EN'])
48 | language.input(load_speakers, inputs=[language, text], outputs=[speaker, text])
49 | btn = gr.Button('Synthesize', variant='primary')
50 | aud = gr.Audio(interactive=False)
51 | btn.click(synthesize, inputs=[speaker, text, speed, language], outputs=[aud])
52 | gr.Markdown('WebUI by [mrfakename](https://twitter.com/realmrfakename).')
53 | @click.command()
54 | @click.option('--share', '-s', is_flag=True, show_default=True, default=False, help="Expose a publicly-accessible shared Gradio link usable by anyone with the link. Only share the link with people you trust.")
55 | @click.option('--host', '-h', default=None)
56 | @click.option('--port', '-p', type=int, default=None)
57 | def main(share, host, port):
58 | demo.queue(api_open=False).launch(show_api=False, share=share, server_name=host, server_port=port)
59 |
60 | if __name__ == "__main__":
61 | main()
62 |
--------------------------------------------------------------------------------
/melo/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.nn import functional as F
4 |
5 |
6 | def init_weights(m, mean=0.0, std=0.01):
7 | classname = m.__class__.__name__
8 | if classname.find("Conv") != -1:
9 | m.weight.data.normal_(mean, std)
10 |
11 |
12 | def get_padding(kernel_size, dilation=1):
13 | return int((kernel_size * dilation - dilation) / 2)
14 |
15 |
16 | def convert_pad_shape(pad_shape):
17 | layer = pad_shape[::-1]
18 | pad_shape = [item for sublist in layer for item in sublist]
19 | return pad_shape
20 |
21 |
22 | def intersperse(lst, item):
23 | result = [item] * (len(lst) * 2 + 1)
24 | result[1::2] = lst
25 | return result
26 |
27 |
28 | def kl_divergence(m_p, logs_p, m_q, logs_q):
29 | """KL(P||Q)"""
30 | kl = (logs_q - logs_p) - 0.5
31 | kl += (
32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
33 | )
34 | return kl
35 |
36 |
37 | def rand_gumbel(shape):
38 | """Sample from the Gumbel distribution, protect from overflows."""
39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40 | return -torch.log(-torch.log(uniform_samples))
41 |
42 |
43 | def rand_gumbel_like(x):
44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45 | return g
46 |
47 |
48 | def slice_segments(x, ids_str, segment_size=4):
49 | ret = torch.zeros_like(x[:, :, :segment_size])
50 | for i in range(x.size(0)):
51 | idx_str = ids_str[i]
52 | idx_end = idx_str + segment_size
53 | ret[i] = x[i, :, idx_str:idx_end]
54 | return ret
55 |
56 |
57 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
58 | b, d, t = x.size()
59 | if x_lengths is None:
60 | x_lengths = t
61 | ids_str_max = x_lengths - segment_size + 1
62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63 | ret = slice_segments(x, ids_str, segment_size)
64 | return ret, ids_str
65 |
66 |
67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
68 | position = torch.arange(length, dtype=torch.float)
69 | num_timescales = channels // 2
70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
71 | num_timescales - 1
72 | )
73 | inv_timescales = min_timescale * torch.exp(
74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
75 | )
76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
78 | signal = F.pad(signal, [0, 0, 0, channels % 2])
79 | signal = signal.view(1, channels, length)
80 | return signal
81 |
82 |
83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
84 | b, channels, length = x.size()
85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
86 | return x + signal.to(dtype=x.dtype, device=x.device)
87 |
88 |
89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
90 | b, channels, length = x.size()
91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
93 |
94 |
95 | def subsequent_mask(length):
96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97 | return mask
98 |
99 |
100 | @torch.jit.script
101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102 | n_channels_int = n_channels[0]
103 | in_act = input_a + input_b
104 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106 | acts = t_act * s_act
107 | return acts
108 |
109 |
110 | def convert_pad_shape(pad_shape):
111 | layer = pad_shape[::-1]
112 | pad_shape = [item for sublist in layer for item in sublist]
113 | return pad_shape
114 |
115 |
116 | def shift_1d(x):
117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118 | return x
119 |
120 |
121 | def sequence_mask(length, max_length=None):
122 | if max_length is None:
123 | max_length = length.max()
124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125 | return x.unsqueeze(0) < length.unsqueeze(1)
126 |
127 |
128 | def generate_path(duration, mask):
129 | """
130 | duration: [b, 1, t_x]
131 | mask: [b, 1, t_y, t_x]
132 | """
133 |
134 | b, _, t_y, t_x = mask.shape
135 | cum_duration = torch.cumsum(duration, -1)
136 |
137 | cum_duration_flat = cum_duration.view(b * t_x)
138 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
139 | path = path.view(b, t_x, t_y)
140 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
141 | path = path.unsqueeze(1).transpose(2, 3) * mask
142 | return path
143 |
144 |
145 | def clip_grad_value_(parameters, clip_value, norm_type=2):
146 | if isinstance(parameters, torch.Tensor):
147 | parameters = [parameters]
148 | parameters = list(filter(lambda p: p.grad is not None, parameters))
149 | norm_type = float(norm_type)
150 | if clip_value is not None:
151 | clip_value = float(clip_value)
152 |
153 | total_norm = 0
154 | for p in parameters:
155 | param_norm = p.grad.data.norm(norm_type)
156 | total_norm += param_norm.item() ** norm_type
157 | if clip_value is not None:
158 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
159 | total_norm = total_norm ** (1.0 / norm_type)
160 | return total_norm
161 |
--------------------------------------------------------------------------------
/melo/configs/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 1000,
5 | "seed": 52,
6 | "epochs": 10000,
7 | "learning_rate": 0.0003,
8 | "betas": [
9 | 0.8,
10 | 0.99
11 | ],
12 | "eps": 1e-09,
13 | "batch_size": 6,
14 | "fp16_run": false,
15 | "lr_decay": 0.999875,
16 | "segment_size": 16384,
17 | "init_lr_ratio": 1,
18 | "warmup_epochs": 0,
19 | "c_mel": 45,
20 | "c_kl": 1.0,
21 | "skip_optimizer": true
22 | },
23 | "data": {
24 | "training_files": "",
25 | "validation_files": "",
26 | "max_wav_value": 32768.0,
27 | "sampling_rate": 44100,
28 | "filter_length": 2048,
29 | "hop_length": 512,
30 | "win_length": 2048,
31 | "n_mel_channels": 128,
32 | "mel_fmin": 0.0,
33 | "mel_fmax": null,
34 | "add_blank": true,
35 | "n_speakers": 256,
36 | "cleaned_text": true,
37 | "spk2id": {}
38 | },
39 | "model": {
40 | "use_spk_conditioned_encoder": true,
41 | "use_noise_scaled_mas": true,
42 | "use_mel_posterior_encoder": false,
43 | "use_duration_discriminator": true,
44 | "inter_channels": 192,
45 | "hidden_channels": 192,
46 | "filter_channels": 768,
47 | "n_heads": 2,
48 | "n_layers": 6,
49 | "n_layers_trans_flow": 3,
50 | "kernel_size": 3,
51 | "p_dropout": 0.1,
52 | "resblock": "1",
53 | "resblock_kernel_sizes": [
54 | 3,
55 | 7,
56 | 11
57 | ],
58 | "resblock_dilation_sizes": [
59 | [
60 | 1,
61 | 3,
62 | 5
63 | ],
64 | [
65 | 1,
66 | 3,
67 | 5
68 | ],
69 | [
70 | 1,
71 | 3,
72 | 5
73 | ]
74 | ],
75 | "upsample_rates": [
76 | 8,
77 | 8,
78 | 2,
79 | 2,
80 | 2
81 | ],
82 | "upsample_initial_channel": 512,
83 | "upsample_kernel_sizes": [
84 | 16,
85 | 16,
86 | 8,
87 | 2,
88 | 2
89 | ],
90 | "n_layers_q": 3,
91 | "use_spectral_norm": false,
92 | "gin_channels": 256
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/melo/data/example/metadata.list:
--------------------------------------------------------------------------------
1 | data/example/wavs/000.wav|EN-default|EN|Well, there are always new trends and styles emerging in the fashion world, but I think some of the biggest trends at the moment include sustainability and ethical fashion, streetwear and athleisure, and oversized and deconstructed silhouettes.
2 | data/example/wavs/001.wav|EN-default|EN|Many designers and brands are focusing on creating more environmentally-friendly and socially responsible clothing, while others are incorporating elements of sportswear and casual wear into their collections.
3 | data/example/wavs/002.wav|EN-default|EN|And there's a growing interest in looser, more relaxed shapes and unconventional materials and finishes.
4 | data/example/wavs/003.wav|EN-default|EN|That's really insightful.
5 | data/example/wavs/004.wav|EN-default|EN|What do you think are some of the benefits of following fashion trends?
6 | data/example/wavs/005.wav|EN-default|EN|Well, I think one of the main benefits of following fashion trends is that it can be a way to express your creativity, personality, and individuality.
7 | data/example/wavs/006.wav|EN-default|EN|Fashion can be a powerful tool for self-expression and can help you feel more confident and comfortable in your own skin.
8 | data/example/wavs/007.wav|EN-default|EN|Additionally, staying up-to-date with fashion trends can help you develop your own sense of style and learn how to put together outfits that make you look and feel great.
9 | data/example/wavs/008.wav|EN-default|EN|That's a great point.
10 | data/example/wavs/009.wav|EN-default|EN|Do you think it's important to stay on top of the latest fashion trends, or is it more important to focus on timeless style?
11 | data/example/wavs/010.wav|EN-default|EN|I think it's really up to each individual to decide what approach to fashion works best for them.
12 | data/example/wavs/011.wav|EN-default|EN|Some people prefer to stick with classic, timeless styles that never go out of fashion, while others enjoy experimenting with new and innovative trends.
13 | data/example/wavs/012.wav|EN-default|EN|Ultimately, fashion is about personal expression and there's no right or wrong way to approach it.
14 | data/example/wavs/013.wav|EN-default|EN|The most important thing is to wear what makes you feel good and confident.
15 | data/example/wavs/014.wav|EN-default|EN|I completely agree.
16 | data/example/wavs/015.wav|EN-default|EN|Some popular ones that come to mind are oversized blazers, statement sleeves, printed maxi dresses, and chunky sneakers.
17 | data/example/wavs/016.wav|EN-default|EN|It's been really interesting chatting with you about fashion.
18 | data/example/wavs/017.wav|EN-default|EN|That's a good point.
19 | data/example/wavs/018.wav|EN-default|EN|What do you think are some current fashion trends that are popular right now?
20 | data/example/wavs/019.wav|EN-default|EN|There are so many trends happening right now, it's hard to keep track of them all!
21 |
--------------------------------------------------------------------------------
/melo/download_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | from . import utils
4 | from cached_path import cached_path
5 | from huggingface_hub import hf_hub_download
6 |
7 | DOWNLOAD_CKPT_URLS = {
8 | 'EN': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN/checkpoint.pth',
9 | 'EN_V2': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN_V2/checkpoint.pth',
10 | 'FR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/FR/checkpoint.pth',
11 | 'JP': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/JP/checkpoint.pth',
12 | 'ES': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ES/checkpoint.pth',
13 | 'ZH': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ZH/checkpoint.pth',
14 | 'KR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/KR/checkpoint.pth',
15 | }
16 |
17 | DOWNLOAD_CONFIG_URLS = {
18 | 'EN': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN/config.json',
19 | 'EN_V2': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN_V2/config.json',
20 | 'FR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/FR/config.json',
21 | 'JP': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/JP/config.json',
22 | 'ES': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ES/config.json',
23 | 'ZH': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ZH/config.json',
24 | 'KR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/KR/config.json',
25 | }
26 |
27 | PRETRAINED_MODELS = {
28 | 'G.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/G.pth',
29 | 'D.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/D.pth',
30 | 'DUR.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/DUR.pth',
31 | }
32 |
33 | LANG_TO_HF_REPO_ID = {
34 | 'EN': 'myshell-ai/MeloTTS-English',
35 | 'EN_V2': 'myshell-ai/MeloTTS-English-v2',
36 | 'EN_NEWEST': 'myshell-ai/MeloTTS-English-v3',
37 | 'FR': 'myshell-ai/MeloTTS-French',
38 | 'JP': 'myshell-ai/MeloTTS-Japanese',
39 | 'ES': 'myshell-ai/MeloTTS-Spanish',
40 | 'ZH': 'myshell-ai/MeloTTS-Chinese',
41 | 'KR': 'myshell-ai/MeloTTS-Korean',
42 | }
43 |
44 | def load_or_download_config(locale, use_hf=True, config_path=None):
45 | if config_path is None:
46 | language = locale.split('-')[0].upper()
47 | if use_hf:
48 | assert language in LANG_TO_HF_REPO_ID
49 | config_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="config.json")
50 | else:
51 | assert language in DOWNLOAD_CONFIG_URLS
52 | config_path = cached_path(DOWNLOAD_CONFIG_URLS[language])
53 | return utils.get_hparams_from_file(config_path)
54 |
55 | def load_or_download_model(locale, device, use_hf=True, ckpt_path=None):
56 | if ckpt_path is None:
57 | language = locale.split('-')[0].upper()
58 | if use_hf:
59 | assert language in LANG_TO_HF_REPO_ID
60 | ckpt_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="checkpoint.pth")
61 | else:
62 | assert language in DOWNLOAD_CKPT_URLS
63 | ckpt_path = cached_path(DOWNLOAD_CKPT_URLS[language])
64 | return torch.load(ckpt_path, map_location=device)
65 |
66 | def load_pretrain_model():
67 | return [cached_path(url) for url in PRETRAINED_MODELS.values()]
68 |
--------------------------------------------------------------------------------
/melo/infer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import click
3 | from melo.api import TTS
4 |
5 |
6 |
7 | @click.command()
8 | @click.option('--ckpt_path', '-m', type=str, default=None, help="Path to the checkpoint file")
9 | @click.option('--text', '-t', type=str, default=None, help="Text to speak")
10 | @click.option('--language', '-l', type=str, default="EN", help="Language of the model")
11 | @click.option('--output_dir', '-o', type=str, default="outputs", help="Path to the output")
12 | def main(ckpt_path, text, language, output_dir):
13 | if ckpt_path is None:
14 | raise ValueError("The model_path must be specified")
15 |
16 | config_path = os.path.join(os.path.dirname(ckpt_path), 'config.json')
17 | model = TTS(language=language, config_path=config_path, ckpt_path=ckpt_path)
18 |
19 | for spk_name, spk_id in model.hps.data.spk2id.items():
20 | save_path = f'{output_dir}/{spk_name}/output.wav'
21 | os.makedirs(os.path.dirname(save_path), exist_ok=True)
22 | model.tts_to_file(text, spk_id, save_path)
23 |
24 | if __name__ == "__main__":
25 | main()
26 |
--------------------------------------------------------------------------------
/melo/init_downloads.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | if __name__ == '__main__':
4 |
5 | from melo.api import TTS
6 | device = 'auto'
7 | models = {
8 | 'EN': TTS(language='EN', device=device),
9 | 'ES': TTS(language='ES', device=device),
10 | 'FR': TTS(language='FR', device=device),
11 | 'ZH': TTS(language='ZH', device=device),
12 | 'JP': TTS(language='JP', device=device),
13 | 'KR': TTS(language='KR', device=device),
14 | }
--------------------------------------------------------------------------------
/melo/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def feature_loss(fmap_r, fmap_g):
5 | loss = 0
6 | for dr, dg in zip(fmap_r, fmap_g):
7 | for rl, gl in zip(dr, dg):
8 | rl = rl.float().detach()
9 | gl = gl.float()
10 | loss += torch.mean(torch.abs(rl - gl))
11 |
12 | return loss * 2
13 |
14 |
15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16 | loss = 0
17 | r_losses = []
18 | g_losses = []
19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20 | dr = dr.float()
21 | dg = dg.float()
22 | r_loss = torch.mean((1 - dr) ** 2)
23 | g_loss = torch.mean(dg**2)
24 | loss += r_loss + g_loss
25 | r_losses.append(r_loss.item())
26 | g_losses.append(g_loss.item())
27 |
28 | return loss, r_losses, g_losses
29 |
30 |
31 | def generator_loss(disc_outputs):
32 | loss = 0
33 | gen_losses = []
34 | for dg in disc_outputs:
35 | dg = dg.float()
36 | l = torch.mean((1 - dg) ** 2)
37 | gen_losses.append(l)
38 | loss += l
39 |
40 | return loss, gen_losses
41 |
42 |
43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44 | """
45 | z_p, logs_q: [b, h, t_t]
46 | m_p, logs_p: [b, h, t_t]
47 | """
48 | z_p = z_p.float()
49 | logs_q = logs_q.float()
50 | m_p = m_p.float()
51 | logs_p = logs_p.float()
52 | z_mask = z_mask.float()
53 |
54 | kl = logs_p - logs_q - 0.5
55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56 | kl = torch.sum(kl * z_mask)
57 | l = kl / torch.sum(z_mask)
58 | return l
59 |
--------------------------------------------------------------------------------
/melo/main.py:
--------------------------------------------------------------------------------
1 | import click
2 | import warnings
3 | import os
4 |
5 |
6 | @click.command
7 | @click.argument('text')
8 | @click.argument('output_path')
9 | @click.option("--file", '-f', is_flag=True, show_default=True, default=False, help="Text is a file")
10 | @click.option('--language', '-l', default='EN', help='Language, defaults to English', type=click.Choice(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], case_sensitive=False))
11 | @click.option('--speaker', '-spk', default='EN-Default', help='Speaker ID, only for English, leave empty for default, ignored if not English. If English, defaults to "EN-Default"', type=click.Choice(['EN-Default', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU']))
12 | @click.option('--speed', '-s', default=1.0, help='Speed, defaults to 1.0', type=float)
13 | @click.option('--device', '-d', default='auto', help='Device, defaults to auto')
14 | def main(text, file, output_path, language, speaker, speed, device):
15 | if file:
16 | if not os.path.exists(text):
17 | raise FileNotFoundError(f'Trying to load text from file due to --file/-f flag, but file not found. Remove the --file/-f flag to pass a string.')
18 | else:
19 | with open(text) as f:
20 | text = f.read().strip()
21 | if text == '':
22 | raise ValueError('You entered empty text or the file you passed was empty.')
23 | language = language.upper()
24 | if language == '': language = 'EN'
25 | if speaker == '': speaker = None
26 | if (not language == 'EN') and speaker:
27 | warnings.warn('You specified a speaker but the language is not English; the speaker option will be ignored.')
28 | from melo.api import TTS
29 | model = TTS(language=language, device=device)
30 | speaker_ids = model.hps.data.spk2id
31 | if language == 'EN':
32 | if not speaker: speaker = 'EN-Default'
33 | spkr = speaker_ids[speaker]
34 | else:
35 | spkr = speaker_ids[list(speaker_ids.keys())[0]]
36 | model.tts_to_file(text, spkr, output_path, speed=speed)
37 |
--------------------------------------------------------------------------------
/melo/mel_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data
3 | import librosa
4 | from librosa.filters import mel as librosa_mel_fn
5 |
6 | MAX_WAV_VALUE = 32768.0
7 |
8 |
9 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
10 | """
11 | PARAMS
12 | ------
13 | C: compression factor
14 | """
15 | return torch.log(torch.clamp(x, min=clip_val) * C)
16 |
17 |
18 | def dynamic_range_decompression_torch(x, C=1):
19 | """
20 | PARAMS
21 | ------
22 | C: compression factor used to compress
23 | """
24 | return torch.exp(x) / C
25 |
26 |
27 | def spectral_normalize_torch(magnitudes):
28 | output = dynamic_range_compression_torch(magnitudes)
29 | return output
30 |
31 |
32 | def spectral_de_normalize_torch(magnitudes):
33 | output = dynamic_range_decompression_torch(magnitudes)
34 | return output
35 |
36 |
37 | mel_basis = {}
38 | hann_window = {}
39 |
40 |
41 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
42 | if torch.min(y) < -1.1:
43 | print("min value is ", torch.min(y))
44 | if torch.max(y) > 1.1:
45 | print("max value is ", torch.max(y))
46 |
47 | global hann_window
48 | dtype_device = str(y.dtype) + "_" + str(y.device)
49 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
50 | if wnsize_dtype_device not in hann_window:
51 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
52 | dtype=y.dtype, device=y.device
53 | )
54 |
55 | y = torch.nn.functional.pad(
56 | y.unsqueeze(1),
57 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
58 | mode="reflect",
59 | )
60 | y = y.squeeze(1)
61 |
62 | spec = torch.stft(
63 | y,
64 | n_fft,
65 | hop_length=hop_size,
66 | win_length=win_size,
67 | window=hann_window[wnsize_dtype_device],
68 | center=center,
69 | pad_mode="reflect",
70 | normalized=False,
71 | onesided=True,
72 | return_complex=False,
73 | )
74 |
75 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
76 | return spec
77 |
78 |
79 | def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False):
80 | global hann_window
81 | dtype_device = str(y.dtype) + '_' + str(y.device)
82 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
83 | if wnsize_dtype_device not in hann_window:
84 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
85 |
86 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
87 |
88 | # ******************** original ************************#
89 | # y = y.squeeze(1)
90 | # spec1 = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
91 | # center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
92 |
93 | # ******************** ConvSTFT ************************#
94 | freq_cutoff = n_fft // 2 + 1
95 | fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft)))
96 | forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1])
97 | forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float()
98 |
99 | import torch.nn.functional as F
100 |
101 | # if center:
102 | # signal = F.pad(y[:, None, None, :], (n_fft // 2, n_fft // 2, 0, 0), mode = 'reflect').squeeze(1)
103 | assert center is False
104 |
105 | forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride = hop_size)
106 | spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim = -1)
107 |
108 |
109 | # ******************** Verification ************************#
110 | spec1 = torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
111 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
112 | assert torch.allclose(spec1, spec2, atol=1e-4)
113 |
114 | spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6)
115 | return spec
116 |
117 |
118 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
119 | global mel_basis
120 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
121 | fmax_dtype_device = str(fmax) + "_" + dtype_device
122 | if fmax_dtype_device not in mel_basis:
123 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
124 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
125 | dtype=spec.dtype, device=spec.device
126 | )
127 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
128 | spec = spectral_normalize_torch(spec)
129 | return spec
130 |
131 |
132 | def mel_spectrogram_torch(
133 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
134 | ):
135 | global mel_basis, hann_window
136 | dtype_device = str(y.dtype) + "_" + str(y.device)
137 | fmax_dtype_device = str(fmax) + "_" + dtype_device
138 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
139 | if fmax_dtype_device not in mel_basis:
140 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
141 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
142 | dtype=y.dtype, device=y.device
143 | )
144 | if wnsize_dtype_device not in hann_window:
145 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
146 | dtype=y.dtype, device=y.device
147 | )
148 |
149 | y = torch.nn.functional.pad(
150 | y.unsqueeze(1),
151 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
152 | mode="reflect",
153 | )
154 | y = y.squeeze(1)
155 |
156 | spec = torch.stft(
157 | y,
158 | n_fft,
159 | hop_length=hop_size,
160 | win_length=win_size,
161 | window=hann_window[wnsize_dtype_device],
162 | center=center,
163 | pad_mode="reflect",
164 | normalized=False,
165 | onesided=True,
166 | return_complex=False,
167 | )
168 |
169 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
170 |
171 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
172 | spec = spectral_normalize_torch(spec)
173 |
174 | return spec
175 |
--------------------------------------------------------------------------------
/melo/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 | from numpy import zeros, int32, float32
2 | from torch import from_numpy
3 |
4 | from .core import maximum_path_jit
5 |
6 |
7 | def maximum_path(neg_cent, mask):
8 | device = neg_cent.device
9 | dtype = neg_cent.dtype
10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32)
11 | path = zeros(neg_cent.shape, dtype=int32)
12 |
13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32)
14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32)
15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max)
16 | return from_numpy(path).to(device=device, dtype=dtype)
17 |
--------------------------------------------------------------------------------
/melo/monotonic_align/core.py:
--------------------------------------------------------------------------------
1 | import numba
2 |
3 |
4 | @numba.jit(
5 | numba.void(
6 | numba.int32[:, :, ::1],
7 | numba.float32[:, :, ::1],
8 | numba.int32[::1],
9 | numba.int32[::1],
10 | ),
11 | nopython=True,
12 | nogil=True,
13 | )
14 | def maximum_path_jit(paths, values, t_ys, t_xs):
15 | b = paths.shape[0]
16 | max_neg_val = -1e9
17 | for i in range(int(b)):
18 | path = paths[i]
19 | value = values[i]
20 | t_y = t_ys[i]
21 | t_x = t_xs[i]
22 |
23 | v_prev = v_cur = 0.0
24 | index = t_x - 1
25 |
26 | for y in range(t_y):
27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
28 | if x == y:
29 | v_cur = max_neg_val
30 | else:
31 | v_cur = value[y - 1, x]
32 | if x == 0:
33 | if y == 0:
34 | v_prev = 0.0
35 | else:
36 | v_prev = max_neg_val
37 | else:
38 | v_prev = value[y - 1, x - 1]
39 | value[y, x] += max(v_prev, v_cur)
40 |
41 | for y in range(t_y - 1, -1, -1):
42 | path[y, index] = 1
43 | if index != 0 and (
44 | index == y or value[y - 1, index] < value[y - 1, index - 1]
45 | ):
46 | index = index - 1
47 |
--------------------------------------------------------------------------------
/melo/preprocess_text.py:
--------------------------------------------------------------------------------
1 | import json
2 | from collections import defaultdict
3 | from random import shuffle
4 | from typing import Optional
5 |
6 | from tqdm import tqdm
7 | import click
8 | from text.cleaner import clean_text_bert
9 | import os
10 | import torch
11 | from text.symbols import symbols, num_languages, num_tones
12 |
13 | @click.command()
14 | @click.option(
15 | "--metadata",
16 | default="data/example/metadata.list",
17 | type=click.Path(exists=True, file_okay=True, dir_okay=False),
18 | )
19 | @click.option("--cleaned-path", default=None)
20 | @click.option("--train-path", default=None)
21 | @click.option("--val-path", default=None)
22 | @click.option(
23 | "--config_path",
24 | default="configs/config.json",
25 | type=click.Path(exists=True, file_okay=True, dir_okay=False),
26 | )
27 | @click.option("--val-per-spk", default=4)
28 | @click.option("--max-val-total", default=8)
29 | @click.option("--clean/--no-clean", default=True)
30 | def main(
31 | metadata: str,
32 | cleaned_path: Optional[str],
33 | train_path: str,
34 | val_path: str,
35 | config_path: str,
36 | val_per_spk: int,
37 | max_val_total: int,
38 | clean: bool,
39 | ):
40 | if train_path is None:
41 | train_path = os.path.join(os.path.dirname(metadata), 'train.list')
42 | if val_path is None:
43 | val_path = os.path.join(os.path.dirname(metadata), 'val.list')
44 | out_config_path = os.path.join(os.path.dirname(metadata), 'config.json')
45 |
46 | if cleaned_path is None:
47 | cleaned_path = metadata + ".cleaned"
48 |
49 | if clean:
50 | out_file = open(cleaned_path, "w", encoding="utf-8")
51 | new_symbols = []
52 | for line in tqdm(open(metadata, encoding="utf-8").readlines()):
53 | try:
54 | utt, spk, language, text = line.strip().split("|")
55 | norm_text, phones, tones, word2ph, bert = clean_text_bert(text, language, device='cuda:0')
56 | for ph in phones:
57 | if ph not in symbols and ph not in new_symbols:
58 | new_symbols.append(ph)
59 |                         print('Updated! New symbols so far:')
60 | print(new_symbols)
61 | with open(f'{language}_symbol.txt', 'w') as f:
62 | f.write(f'{new_symbols}')
63 |
64 | assert len(phones) == len(tones)
65 | assert len(phones) == sum(word2ph)
66 | out_file.write(
67 | "{}|{}|{}|{}|{}|{}|{}\n".format(
68 | utt,
69 | spk,
70 | language,
71 | norm_text,
72 | " ".join(phones),
73 | " ".join([str(i) for i in tones]),
74 | " ".join([str(i) for i in word2ph]),
75 | )
76 | )
77 | bert_path = utt.replace(".wav", ".bert.pt")
78 | os.makedirs(os.path.dirname(bert_path), exist_ok=True)
79 | torch.save(bert.cpu(), bert_path)
80 | except Exception as error:
81 | print("err!", line, error)
82 |
83 | out_file.close()
84 |
85 | metadata = cleaned_path
86 |
87 | spk_utt_map = defaultdict(list)
88 | spk_id_map = {}
89 | current_sid = 0
90 |
91 | with open(metadata, encoding="utf-8") as f:
92 | for line in f.readlines():
93 | utt, spk, language, text, phones, tones, word2ph = line.strip().split("|")
94 | spk_utt_map[spk].append(line)
95 |
96 | if spk not in spk_id_map.keys():
97 | spk_id_map[spk] = current_sid
98 | current_sid += 1
99 |
100 | train_list = []
101 | val_list = []
102 |
103 | for spk, utts in spk_utt_map.items():
104 | shuffle(utts)
105 | val_list += utts[:val_per_spk]
106 | train_list += utts[val_per_spk:]
107 |
108 | if len(val_list) > max_val_total:
109 | train_list += val_list[max_val_total:]
110 | val_list = val_list[:max_val_total]
111 |
112 | with open(train_path, "w", encoding="utf-8") as f:
113 | for line in train_list:
114 | f.write(line)
115 |
116 | with open(val_path, "w", encoding="utf-8") as f:
117 | for line in val_list:
118 | f.write(line)
119 |
120 | config = json.load(open(config_path, encoding="utf-8"))
121 | config["data"]["spk2id"] = spk_id_map
122 |
123 | config["data"]["training_files"] = train_path
124 | config["data"]["validation_files"] = val_path
125 | config["data"]["n_speakers"] = len(spk_id_map)
126 | config["num_languages"] = num_languages
127 | config["num_tones"] = num_tones
128 | config["symbols"] = symbols
129 |
130 | with open(out_config_path, "w", encoding="utf-8") as f:
131 | json.dump(config, f, indent=2, ensure_ascii=False)
132 |
133 |
134 | if __name__ == "__main__":
135 | main()
136 |
--------------------------------------------------------------------------------
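As a reference for the script above, a hypothetical metadata line and the columns the cleaning pass appends. The audio path and speaker name below are illustrative only, not taken from the repo's example data.

# One raw line: audio_path|speaker|language|text  (values below are made up)
line = "data/example/wavs/utt_0001.wav|speaker0|EN|Hello, this is a test sentence."
utt, spk, language, text = line.strip().split("|")

# After cleaning, each line written to metadata.list.cleaned carries seven fields:
#   utt|spk|language|norm_text|phones|tones|word2ph
# and a BERT feature tensor is saved alongside the audio as utt_0001.bert.pt.
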
/melo/split_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import glob
4 | import numpy as np
5 | import soundfile as sf
6 | import torchaudio
7 | import re
8 |
9 | def split_sentence(text, min_len=10, language_str='EN'):
10 | if language_str in ['EN', 'FR', 'ES', 'SP']:
11 | sentences = split_sentences_latin(text, min_len=min_len)
12 | else:
13 | sentences = split_sentences_zh(text, min_len=min_len)
14 | return sentences
15 |
16 |
17 | def split_sentences_latin(text, min_len=10):
18 | text = re.sub('[。!?;]', '.', text)
19 | text = re.sub('[,]', ',', text)
20 | text = re.sub('[“”]', '"', text)
21 | text = re.sub('[‘’]', "'", text)
22 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
23 | return [item.strip() for item in txtsplit(text, 256, 512) if item.strip()]
24 |
25 |
26 | def split_sentences_zh(text, min_len=10):
27 | text = re.sub('[。!?;]', '.', text)
28 | text = re.sub('[,]', ',', text)
29 |     # Replace newlines, tabs and runs of spaces with a single space
30 | text = re.sub('[\n\t ]+', ' ', text)
31 |     # Insert a split marker after punctuation marks
32 | text = re.sub('([,.!?;])', r'\1 $#!', text)
33 |     # Split into sentences and strip leading/trailing whitespace
34 | # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)]
35 | sentences = [s.strip() for s in text.split('$#!')]
36 | if len(sentences[-1]) == 0: del sentences[-1]
37 |
38 | new_sentences = []
39 | new_sent = []
40 | count_len = 0
41 | for ind, sent in enumerate(sentences):
42 | new_sent.append(sent)
43 | count_len += len(sent)
44 | if count_len > min_len or ind == len(sentences) - 1:
45 | count_len = 0
46 | new_sentences.append(' '.join(new_sent))
47 | new_sent = []
48 | return merge_short_sentences_zh(new_sentences)
49 |
50 |
51 | def merge_short_sentences_en(sens):
52 | """Avoid short sentences by merging them with the following sentence.
53 |
54 | Args:
55 | List[str]: list of input sentences.
56 |
57 | Returns:
58 | List[str]: list of output sentences.
59 | """
60 | sens_out = []
61 | for s in sens:
62 |         # If the previous sentence is too short, merge it with
63 | # the current sentence.
64 | if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
65 | sens_out[-1] = sens_out[-1] + " " + s
66 | else:
67 | sens_out.append(s)
68 | try:
69 | if len(sens_out[-1].split(" ")) <= 2:
70 | sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
71 | sens_out.pop(-1)
72 | except:
73 | pass
74 | return sens_out
75 |
76 |
77 | def merge_short_sentences_zh(sens):
78 | # return sens
79 | """Avoid short sentences by merging them with the following sentence.
80 |
81 | Args:
82 | List[str]: list of input sentences.
83 |
84 | Returns:
85 | List[str]: list of output sentences.
86 | """
87 | sens_out = []
88 | for s in sens:
89 |         # If the previous sentence is too short, merge it with
90 | # the current sentence.
91 | if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
92 | sens_out[-1] = sens_out[-1] + " " + s
93 | else:
94 | sens_out.append(s)
95 | try:
96 | if len(sens_out[-1]) <= 2:
97 | sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
98 | sens_out.pop(-1)
99 | except:
100 | pass
101 | return sens_out
102 |
103 |
104 |
105 | def txtsplit(text, desired_length=100, max_length=200):
106 |     """Split text into chunks of a desired length, trying to keep sentences intact."""
107 | text = re.sub(r'\n\n+', '\n', text)
108 | text = re.sub(r'\s+', ' ', text)
109 |     text = re.sub(r'[“”]', '"', text)
110 | text = re.sub(r'([,.?!])', r'\1 ', text)
111 | text = re.sub(r'\s+', ' ', text)
112 |
113 | rv = []
114 | in_quote = False
115 | current = ""
116 | split_pos = []
117 | pos = -1
118 | end_pos = len(text) - 1
119 | def seek(delta):
120 | nonlocal pos, in_quote, current
121 | is_neg = delta < 0
122 | for _ in range(abs(delta)):
123 | if is_neg:
124 | pos -= 1
125 | current = current[:-1]
126 | else:
127 | pos += 1
128 | current += text[pos]
129 | if text[pos] == '"':
130 | in_quote = not in_quote
131 | return text[pos]
132 | def peek(delta):
133 | p = pos + delta
134 | return text[p] if p < end_pos and p >= 0 else ""
135 | def commit():
136 | nonlocal rv, current, split_pos
137 | rv.append(current)
138 | current = ""
139 | split_pos = []
140 | while pos < end_pos:
141 | c = seek(1)
142 | if len(current) >= max_length:
143 | if len(split_pos) > 0 and len(current) > (desired_length / 2):
144 | d = pos - split_pos[-1]
145 | seek(-d)
146 | else:
147 | while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
148 | c = seek(-1)
149 | commit()
150 | elif not in_quote and (c in '!?\n' or (c in '.,' and peek(1) in '\n ')):
151 | while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
152 | c = seek(1)
153 | split_pos.append(pos)
154 | if len(current) >= desired_length:
155 | commit()
156 | elif in_quote and peek(1) == '"' and peek(2) in '\n ':
157 | seek(2)
158 | split_pos.append(pos)
159 | rv.append(current)
160 | rv = [s.strip() for s in rv]
161 | rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
162 | return rv
163 |
164 |
165 | if __name__ == '__main__':
166 | zh_text = "好的,我来给你讲一个故事吧。从前有一个小姑娘,她叫做小红。小红非常喜欢在森林里玩耍,她经常会和她的小伙伴们一起去探险。有一天,小红和她的小伙伴们走到了森林深处,突然遇到了一只凶猛的野兽。小红的小伙伴们都吓得不敢动弹,但是小红并没有被吓倒,她勇敢地走向野兽,用她的智慧和勇气成功地制服了野兽,保护了她的小伙伴们。从那以后,小红变得更加勇敢和自信,成为了她小伙伴们心中的英雄。"
167 | en_text = "I didn’t know what to do. I said please kill her because it would be better than being kidnapped,” Ben, whose surname CNN is not using for security concerns, said on Wednesday. “It’s a nightmare. I said ‘please kill her, don’t take her there.’"
168 | sp_text = "¡Claro! ¿En qué tema te gustaría que te hable en español? Puedo proporcionarte información o conversar contigo sobre una amplia variedad de temas, desde cultura y comida hasta viajes y tecnología. ¿Tienes alguna preferencia en particular?"
169 | fr_text = "Bien sûr ! En quelle matière voudriez-vous que je vous parle en français ? Je peux vous fournir des informations ou discuter avec vous sur une grande variété de sujets, que ce soit la culture, la nourriture, les voyages ou la technologie. Avez-vous une préférence particulière ?"
170 |
171 | print(split_sentence(zh_text, language_str='ZH'))
172 | print(split_sentence(en_text, language_str='EN'))
173 | print(split_sentence(sp_text, language_str='SP'))
174 | print(split_sentence(fr_text, language_str='FR'))
175 |
--------------------------------------------------------------------------------
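A small sketch of the merging rule documented in merge_short_sentences_en above. It assumes the melo package and its audio dependencies (soundfile, torchaudio) are installed, since split_utils imports them; the sentences are made up, and the expected output follows directly from the rule in the code.

from melo.split_utils import merge_short_sentences_en

sens = ["This is a reasonably long sentence.", "Ok then"]
print(merge_short_sentences_en(sens))
# ['This is a reasonably long sentence. Ok then']  <- the two-word tail is folded into the previous sentence
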
/melo/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 |
7 | def cleaned_text_to_sequence(cleaned_text, tones, language, symbol_to_id=None):
8 |     """Converts cleaned text (a list of phone symbols) into ID sequences.
9 |     Args:
10 |       cleaned_text: list of phone symbols; tones: per-phone tone indices; language: language code (e.g. "EN")
11 |     Returns:
12 |       Tuple of (phone IDs, tone IDs shifted by the language tone offset, language IDs)
13 | """
14 | symbol_to_id_map = symbol_to_id if symbol_to_id else _symbol_to_id
15 | phones = [symbol_to_id_map[symbol] for symbol in cleaned_text]
16 | tone_start = language_tone_start_map[language]
17 | tones = [i + tone_start for i in tones]
18 | lang_id = language_id_map[language]
19 | lang_ids = [lang_id for i in phones]
20 | return phones, tones, lang_ids
21 |
22 |
23 | def get_bert(norm_text, word2ph, language, device):
24 | from .chinese_bert import get_bert_feature as zh_bert
25 | from .english_bert import get_bert_feature as en_bert
26 | from .japanese_bert import get_bert_feature as jp_bert
27 | from .chinese_mix import get_bert_feature as zh_mix_en_bert
28 | from .spanish_bert import get_bert_feature as sp_bert
29 | from .french_bert import get_bert_feature as fr_bert
30 | from .korean import get_bert_feature as kr_bert
31 |
32 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert, 'ZH_MIX_EN': zh_mix_en_bert,
33 | 'FR': fr_bert, 'SP': sp_bert, 'ES': sp_bert, "KR": kr_bert}
34 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
35 | return bert
36 |
--------------------------------------------------------------------------------
/melo/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 |
7 | from .symbols import punctuation
8 | from .tone_sandhi import ToneSandhi
9 |
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 | line.split("\t")[0]: line.strip().split("\t")[1]
13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 |
16 | import jieba.posseg as psg
17 |
18 |
19 | rep_map = {
20 | ":": ",",
21 | ";": ",",
22 | ",": ",",
23 | "。": ".",
24 | "!": "!",
25 | "?": "?",
26 | "\n": ".",
27 | "·": ",",
28 | "、": ",",
29 | "...": "…",
30 | "$": ".",
31 | "“": "'",
32 | "”": "'",
33 | "‘": "'",
34 | "’": "'",
35 | "(": "'",
36 | ")": "'",
37 | "(": "'",
38 | ")": "'",
39 | "《": "'",
40 | "》": "'",
41 | "【": "'",
42 | "】": "'",
43 | "[": "'",
44 | "]": "'",
45 | "—": "-",
46 | "~": "-",
47 | "~": "-",
48 | "「": "'",
49 | "」": "'",
50 | }
51 |
52 | tone_modifier = ToneSandhi()
53 |
54 |
55 | def replace_punctuation(text):
56 | text = text.replace("嗯", "恩").replace("呣", "母")
57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 |
59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 |
61 | replaced_text = re.sub(
62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 | )
64 |
65 | return replaced_text
66 |
67 |
68 | def g2p(text):
69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 | phones, tones, word2ph = _g2p(sentences)
72 | assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # This assertion occasionally fails; wrap the call in try/except if needed.
74 | phones = ["_"] + phones + ["_"]
75 | tones = [0] + tones + [0]
76 | word2ph = [1] + word2ph + [1]
77 | return phones, tones, word2ph
78 |
79 |
80 | def _get_initials_finals(word):
81 | initials = []
82 | finals = []
83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 | orig_finals = lazy_pinyin(
85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 | )
87 | for c, v in zip(orig_initials, orig_finals):
88 | initials.append(c)
89 | finals.append(v)
90 | return initials, finals
91 |
92 |
93 | def _g2p(segments):
94 | phones_list = []
95 | tones_list = []
96 | word2ph = []
97 | for seg in segments:
98 |         # Remove all English words from the sentence
99 | seg = re.sub("[a-zA-Z]+", "", seg)
100 | seg_cut = psg.lcut(seg)
101 | initials = []
102 | finals = []
103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 | for word, pos in seg_cut:
105 | if pos == "eng":
106 | import pdb; pdb.set_trace()
107 | continue
108 | sub_initials, sub_finals = _get_initials_finals(word)
109 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
110 | initials.append(sub_initials)
111 | finals.append(sub_finals)
112 |
113 | # assert len(sub_initials) == len(sub_finals) == len(word)
114 | initials = sum(initials, [])
115 | finals = sum(finals, [])
116 | #
117 | for c, v in zip(initials, finals):
118 | raw_pinyin = c + v
119 | # NOTE: post process for pypinyin outputs
120 | # we discriminate i, ii and iii
121 | if c == v:
122 | assert c in punctuation
123 | phone = [c]
124 | tone = "0"
125 | word2ph.append(1)
126 | else:
127 | v_without_tone = v[:-1]
128 | tone = v[-1]
129 |
130 | pinyin = c + v_without_tone
131 | assert tone in "12345"
132 |
133 | if c:
134 |                     # syllable with an initial
135 | v_rep_map = {
136 | "uei": "ui",
137 | "iou": "iu",
138 | "uen": "un",
139 | }
140 | if v_without_tone in v_rep_map.keys():
141 | pinyin = c + v_rep_map[v_without_tone]
142 | else:
143 |                     # syllable without an initial
144 | pinyin_rep_map = {
145 | "ing": "ying",
146 | "i": "yi",
147 | "in": "yin",
148 | "u": "wu",
149 | }
150 | if pinyin in pinyin_rep_map.keys():
151 | pinyin = pinyin_rep_map[pinyin]
152 | else:
153 | single_rep_map = {
154 | "v": "yu",
155 | "e": "e",
156 | "i": "y",
157 | "u": "w",
158 | }
159 | if pinyin[0] in single_rep_map.keys():
160 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
161 |
162 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
163 | phone = pinyin_to_symbol_map[pinyin].split(" ")
164 | word2ph.append(len(phone))
165 |
166 | phones_list += phone
167 | tones_list += [int(tone)] * len(phone)
168 | return phones_list, tones_list, word2ph
169 |
170 |
171 | def text_normalize(text):
172 | numbers = re.findall(r"\d+(?:\.?\d+)?", text)
173 | for number in numbers:
174 | text = text.replace(number, cn2an.an2cn(number), 1)
175 | text = replace_punctuation(text)
176 | return text
177 |
178 |
179 | def get_bert_feature(text, word2ph, device=None):
180 | from text import chinese_bert
181 |
182 | return chinese_bert.get_bert_feature(text, word2ph, device=device)
183 |
184 |
185 | if __name__ == "__main__":
186 | from text.chinese_bert import get_bert_feature
187 |
188 | text = "啊!chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
189 | text = text_normalize(text)
190 | print(text)
191 | phones, tones, word2ph = g2p(text)
192 | bert = get_bert_feature(text, word2ph)
193 |
194 | print(phones, tones, word2ph, bert.shape)
195 |
196 |
197 | # # Example usage
198 | # text = "这是一个示例文本:,你好!这是一个测试...."
199 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
200 |
--------------------------------------------------------------------------------
/melo/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 |
6 | # model_id = 'hfl/chinese-roberta-wwm-ext-large'
7 | local_path = "./bert/chinese-roberta-wwm-ext-large"
8 |
9 |
10 | tokenizers = {}
11 | models = {}
12 |
13 | def get_bert_feature(text, word2ph, device=None, model_id='hfl/chinese-roberta-wwm-ext-large'):
14 | if model_id not in models:
15 | models[model_id] = AutoModelForMaskedLM.from_pretrained(
16 | model_id
17 | ).to(device)
18 | tokenizers[model_id] = AutoTokenizer.from_pretrained(model_id)
19 | model = models[model_id]
20 | tokenizer = tokenizers[model_id]
21 |
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 |
31 | with torch.no_grad():
32 | inputs = tokenizer(text, return_tensors="pt")
33 | for i in inputs:
34 | inputs[i] = inputs[i].to(device)
35 | res = model(**inputs, output_hidden_states=True)
36 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
37 | # import pdb; pdb.set_trace()
38 | # assert len(word2ph) == len(text) + 2
39 | word2phone = word2ph
40 | phone_level_feature = []
41 | for i in range(len(word2phone)):
42 | repeat_feature = res[i].repeat(word2phone[i], 1)
43 | phone_level_feature.append(repeat_feature)
44 |
45 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
46 | return phone_level_feature.T
47 |
48 |
49 | if __name__ == "__main__":
50 | import torch
51 |
52 |     word_level_feature = torch.rand(38, 1024)  # word-level features: one 1024-dim vector per word
53 | word2phone = [
54 | 1,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 1,
63 | 2,
64 | 2,
65 | 1,
66 | 2,
67 | 2,
68 | 2,
69 | 2,
70 | 2,
71 | 1,
72 | 1,
73 | 2,
74 | 2,
75 | 1,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 2,
86 | 1,
87 | 2,
88 | 2,
89 | 2,
90 | 2,
91 | 1,
92 | ]
93 |
94 |     # Compute the total number of frames
95 | total_frames = sum(word2phone)
96 | print(word_level_feature.shape)
97 | print(word2phone)
98 | phone_level_feature = []
99 | for i in range(len(word2phone)):
100 | print(word_level_feature[i].shape)
101 |
102 |         # Repeat each word's feature word2phone[i] times
103 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
104 | phone_level_feature.append(repeat_feature)
105 |
106 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
107 | print(phone_level_feature.shape) # torch.Size([36, 1024])
108 |
--------------------------------------------------------------------------------
/melo/text/chinese_mix.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 |
7 | # from text.symbols import punctuation
8 | from .symbols import language_tone_start_map
9 | from .tone_sandhi import ToneSandhi
10 | from .english import g2p as g2p_en
11 | from transformers import AutoTokenizer
12 |
13 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
14 | current_file_path = os.path.dirname(__file__)
15 | pinyin_to_symbol_map = {
16 | line.split("\t")[0]: line.strip().split("\t")[1]
17 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
18 | }
19 |
20 | import jieba.posseg as psg
21 |
22 |
23 | rep_map = {
24 | ":": ",",
25 | ";": ",",
26 | ",": ",",
27 | "。": ".",
28 | "!": "!",
29 | "?": "?",
30 | "\n": ".",
31 | "·": ",",
32 | "、": ",",
33 | "...": "…",
34 | "$": ".",
35 | "“": "'",
36 | "”": "'",
37 | "‘": "'",
38 | "’": "'",
39 | "(": "'",
40 | ")": "'",
41 | "(": "'",
42 | ")": "'",
43 | "《": "'",
44 | "》": "'",
45 | "【": "'",
46 | "】": "'",
47 | "[": "'",
48 | "]": "'",
49 | "—": "-",
50 | "~": "-",
51 | "~": "-",
52 | "「": "'",
53 | "」": "'",
54 | }
55 |
56 | tone_modifier = ToneSandhi()
57 |
58 |
59 | def replace_punctuation(text):
60 | text = text.replace("嗯", "恩").replace("呣", "母")
61 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
62 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
63 | replaced_text = re.sub(r"[^\u4e00-\u9fa5_a-zA-Z\s" + "".join(punctuation) + r"]+", "", replaced_text)
64 | replaced_text = re.sub(r"[\s]+", " ", replaced_text)
65 |
66 | return replaced_text
67 |
68 |
69 | def g2p(text, impl='v2'):
70 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
71 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
72 | if impl == 'v1':
73 | _func = _g2p
74 | elif impl == 'v2':
75 | _func = _g2p_v2
76 | else:
77 | raise NotImplementedError()
78 | phones, tones, word2ph = _func(sentences)
79 | assert sum(word2ph) == len(phones)
80 | # assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch.
81 | phones = ["_"] + phones + ["_"]
82 | tones = [0] + tones + [0]
83 | word2ph = [1] + word2ph + [1]
84 | return phones, tones, word2ph
85 |
86 |
87 | def _get_initials_finals(word):
88 | initials = []
89 | finals = []
90 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
91 | orig_finals = lazy_pinyin(
92 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
93 | )
94 | for c, v in zip(orig_initials, orig_finals):
95 | initials.append(c)
96 | finals.append(v)
97 | return initials, finals
98 |
99 | model_id = 'bert-base-multilingual-uncased'
100 | tokenizer = AutoTokenizer.from_pretrained(model_id)
101 | def _g2p(segments):
102 | phones_list = []
103 | tones_list = []
104 | word2ph = []
105 | for seg in segments:
106 |         # Keep English words here; they are phonemized separately below
107 | # seg = re.sub("[a-zA-Z]+", "", seg)
108 | seg_cut = psg.lcut(seg)
109 | initials = []
110 | finals = []
111 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
112 | for word, pos in seg_cut:
113 | if pos == "eng":
114 | initials.append(['EN_WORD'])
115 | finals.append([word])
116 | else:
117 | sub_initials, sub_finals = _get_initials_finals(word)
118 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
119 | initials.append(sub_initials)
120 | finals.append(sub_finals)
121 |
122 | # assert len(sub_initials) == len(sub_finals) == len(word)
123 | initials = sum(initials, [])
124 | finals = sum(finals, [])
125 | #
126 | for c, v in zip(initials, finals):
127 | if c == 'EN_WORD':
128 | tokenized_en = tokenizer.tokenize(v)
129 | phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en)
130 | # apply offset to tones_en
131 | tones_en = [t + language_tone_start_map['EN'] for t in tones_en]
132 | phones_list += phones_en
133 | tones_list += tones_en
134 | word2ph += word2ph_en
135 | else:
136 | raw_pinyin = c + v
137 | # NOTE: post process for pypinyin outputs
138 | # we discriminate i, ii and iii
139 | if c == v:
140 | assert c in punctuation
141 | phone = [c]
142 | tone = "0"
143 | word2ph.append(1)
144 | else:
145 | v_without_tone = v[:-1]
146 | tone = v[-1]
147 |
148 | pinyin = c + v_without_tone
149 | assert tone in "12345"
150 |
151 | if c:
152 |                         # syllable with an initial
153 | v_rep_map = {
154 | "uei": "ui",
155 | "iou": "iu",
156 | "uen": "un",
157 | }
158 | if v_without_tone in v_rep_map.keys():
159 | pinyin = c + v_rep_map[v_without_tone]
160 | else:
161 |                         # syllable without an initial
162 | pinyin_rep_map = {
163 | "ing": "ying",
164 | "i": "yi",
165 | "in": "yin",
166 | "u": "wu",
167 | }
168 | if pinyin in pinyin_rep_map.keys():
169 | pinyin = pinyin_rep_map[pinyin]
170 | else:
171 | single_rep_map = {
172 | "v": "yu",
173 | "e": "e",
174 | "i": "y",
175 | "u": "w",
176 | }
177 | if pinyin[0] in single_rep_map.keys():
178 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
179 |
180 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
181 | phone = pinyin_to_symbol_map[pinyin].split(" ")
182 | word2ph.append(len(phone))
183 |
184 | phones_list += phone
185 | tones_list += [int(tone)] * len(phone)
186 | return phones_list, tones_list, word2ph
187 |
188 |
189 | def text_normalize(text):
190 | numbers = re.findall(r"\d+(?:\.?\d+)?", text)
191 | for number in numbers:
192 | text = text.replace(number, cn2an.an2cn(number), 1)
193 | text = replace_punctuation(text)
194 | return text
195 |
196 |
197 | def get_bert_feature(text, word2ph, device):
198 | from . import chinese_bert
199 | return chinese_bert.get_bert_feature(text, word2ph, model_id='bert-base-multilingual-uncased', device=device)
200 |
201 | from .chinese import _g2p as _chinese_g2p
202 | def _g2p_v2(segments):
203 | spliter = '#$&^!@'
204 |
205 | phones_list = []
206 | tones_list = []
207 | word2ph = []
208 |
209 | for text in segments:
210 | assert spliter not in text
211 |         # wrap English spans with the splitter marker
212 | text = re.sub('([a-zA-Z\s]+)', lambda x: f'{spliter}{x.group(1)}{spliter}', text)
213 | texts = text.split(spliter)
214 | texts = [t for t in texts if len(t) > 0]
215 |
216 |
217 | for text in texts:
218 | if re.match('[a-zA-Z\s]+', text):
219 | # english
220 | tokenized_en = tokenizer.tokenize(text)
221 | phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en)
222 | # apply offset to tones_en
223 | tones_en = [t + language_tone_start_map['EN'] for t in tones_en]
224 | phones_list += phones_en
225 | tones_list += tones_en
226 | word2ph += word2ph_en
227 | else:
228 | phones_zh, tones_zh, word2ph_zh = _chinese_g2p([text])
229 | phones_list += phones_zh
230 | tones_list += tones_zh
231 | word2ph += word2ph_zh
232 | return phones_list, tones_list, word2ph
233 |
234 |
235 |
236 | if __name__ == "__main__":
237 | # from text.chinese_bert import get_bert_feature
238 |
239 | text = "NFT啊!chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
240 | text = '我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。'
241 | text = '今天下午,我们准备去shopping mall购物,然后晚上去看一场movie。'
242 | text = '我们现在 also 能够 help 很多公司 use some machine learning 的 algorithms 啊!'
243 | text = text_normalize(text)
244 | print(text)
245 | phones, tones, word2ph = g2p(text, impl='v2')
246 | bert = get_bert_feature(text, word2ph, device='cuda:0')
247 | print(phones)
248 | import pdb; pdb.set_trace()
249 |
250 |
251 | # # Example usage
252 | # text = "这是一个示例文本:,你好!这是一个测试...."
253 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
254 |
--------------------------------------------------------------------------------
/melo/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, english, chinese_mix, korean, french, spanish
2 | from . import cleaned_text_to_sequence
3 | import copy
4 |
5 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english, 'ZH_MIX_EN': chinese_mix, 'KR': korean,
6 | 'FR': french, 'SP': spanish, 'ES': spanish}
7 |
8 |
9 | def clean_text(text, language):
10 | language_module = language_module_map[language]
11 | norm_text = language_module.text_normalize(text)
12 | phones, tones, word2ph = language_module.g2p(norm_text)
13 | return norm_text, phones, tones, word2ph
14 |
15 |
16 | def clean_text_bert(text, language, device=None):
17 | language_module = language_module_map[language]
18 | norm_text = language_module.text_normalize(text)
19 | phones, tones, word2ph = language_module.g2p(norm_text)
20 |
21 | word2ph_bak = copy.deepcopy(word2ph)
22 | for i in range(len(word2ph)):
23 | word2ph[i] = word2ph[i] * 2
24 | word2ph[0] += 1
25 | bert = language_module.get_bert_feature(norm_text, word2ph, device=device)
26 |
27 | return norm_text, phones, tones, word2ph_bak, bert
28 |
29 |
30 | def text_to_sequence(text, language):
31 | norm_text, phones, tones, word2ph = clean_text(text, language)
32 | return cleaned_text_to_sequence(phones, tones, language)
33 |
34 |
35 | if __name__ == "__main__":
36 | pass
--------------------------------------------------------------------------------
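A minimal end-to-end sketch of the text front-end defined above. It assumes the melo package and the full set of language dependencies from requirements.txt are installed, since cleaner.py imports every language module at import time, and the English path fetches a BERT tokenizer on first use; the input sentence is made up.

from melo.text.cleaner import clean_text
from melo.text import cleaned_text_to_sequence

norm_text, phones, tones, word2ph = clean_text("Hello world, this is a test.", "EN")
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "EN")

# phones/tones/lang_ids stay aligned one-to-one; word2ph maps tokens to phone counts.
assert len(phone_ids) == len(tone_ids) == len(lang_ids) == len(phones)
assert sum(word2ph) == len(phones)
print(norm_text)
print(phones[:8], tone_ids[:8])
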
/melo/text/cleaner_multiling.py:
--------------------------------------------------------------------------------
1 | """Set of default text cleaners"""
2 | # TODO: pick the cleaner for languages dynamically
3 |
4 | import re
5 |
6 | # Regular expression matching whitespace:
7 | _whitespace_re = re.compile(r"\s+")
8 |
9 | rep_map = {
10 | ":": ",",
11 | ";": ",",
12 | ",": ",",
13 | "。": ".",
14 | "!": "!",
15 | "?": "?",
16 | "\n": ".",
17 | "·": ",",
18 | "、": ",",
19 | "...": ".",
20 | "…": ".",
21 | "$": ".",
22 | "“": "'",
23 | "”": "'",
24 | "‘": "'",
25 | "’": "'",
26 | "(": "'",
27 | ")": "'",
28 | "(": "'",
29 | ")": "'",
30 | "《": "'",
31 | "》": "'",
32 | "【": "'",
33 | "】": "'",
34 | "[": "'",
35 | "]": "'",
36 | "—": "",
37 | "~": "-",
38 | "~": "-",
39 | "「": "'",
40 | "」": "'",
41 | }
42 |
43 | def replace_punctuation(text):
44 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
45 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
46 | return replaced_text
47 |
48 | def lowercase(text):
49 | return text.lower()
50 |
51 |
52 | def collapse_whitespace(text):
53 | return re.sub(_whitespace_re, " ", text).strip()
54 |
55 | def remove_punctuation_at_begin(text):
56 | return re.sub(r'^[,.!?]+', '', text)
57 |
58 | def remove_aux_symbols(text):
59 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text)
60 | return text
61 |
62 |
63 | def replace_symbols(text, lang="en"):
64 |     """Replace symbols based on the language tag.
65 |
66 | Args:
67 | text:
68 | Input text.
69 | lang:
70 |             Language identifier, e.g. "en", "fr", "pt", "ca".
71 |
72 | Returns:
73 | The modified text
74 | example:
75 | input args:
76 | text: "si l'avi cau, diguem-ho"
77 | lang: "ca"
78 | Output:
79 | text: "si lavi cau, diguemho"
80 | """
81 | text = text.replace(";", ",")
82 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
83 | text = text.replace(":", ",")
84 | if lang == "en":
85 | text = text.replace("&", " and ")
86 | elif lang == "fr":
87 | text = text.replace("&", " et ")
88 | elif lang == "pt":
89 | text = text.replace("&", " e ")
90 | elif lang == "ca":
91 | text = text.replace("&", " i ")
92 | text = text.replace("'", "")
93 | elif lang== "es":
94 | text=text.replace("&","y")
95 | text = text.replace("'", "")
96 | return text
97 |
98 | def unicleaners(text, cased=False, lang='en'):
99 |     """Basic language-agnostic cleaning pipeline. There is no need to expand abbreviations and
100 |     numbers; the phonemizer already does that."""
101 | if not cased:
102 | text = lowercase(text)
103 | text = replace_punctuation(text)
104 | text = replace_symbols(text, lang=lang)
105 | text = remove_aux_symbols(text)
106 | text = remove_punctuation_at_begin(text)
107 | text = collapse_whitespace(text)
108 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
109 | return text
110 |
111 |
--------------------------------------------------------------------------------
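A quick illustration of unicleaners above; this module needs only the standard library, though importing it still goes through the melo.text package. The input string is made up, and the expected output in the comment was traced by hand from the rules above.

from melo.text.cleaner_multiling import unicleaners

print(unicleaners("Bonjour & bienvenue", lang="fr"))
# bonjour et bienvenue.   <- lowercased, "&" expanded for French, trailing period appended
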
/melo/text/cmudict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/cmudict_cache.pickle
--------------------------------------------------------------------------------
/melo/text/english.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import re
4 | from g2p_en import G2p
5 |
6 | from . import symbols
7 |
8 | from .english_utils.abbreviations import expand_abbreviations
9 | from .english_utils.time_norm import expand_time_english
10 | from .english_utils.number_norm import normalize_numbers
11 | from .japanese import distribute_phone
12 |
13 | from transformers import AutoTokenizer
14 |
15 | current_file_path = os.path.dirname(__file__)
16 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
17 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
18 | _g2p = G2p()
19 |
20 | arpa = {
21 | "AH0",
22 | "S",
23 | "AH1",
24 | "EY2",
25 | "AE2",
26 | "EH0",
27 | "OW2",
28 | "UH0",
29 | "NG",
30 | "B",
31 | "G",
32 | "AY0",
33 | "M",
34 | "AA0",
35 | "F",
36 | "AO0",
37 | "ER2",
38 | "UH1",
39 | "IY1",
40 | "AH2",
41 | "DH",
42 | "IY0",
43 | "EY1",
44 | "IH0",
45 | "K",
46 | "N",
47 | "W",
48 | "IY2",
49 | "T",
50 | "AA1",
51 | "ER1",
52 | "EH2",
53 | "OY0",
54 | "UH2",
55 | "UW1",
56 | "Z",
57 | "AW2",
58 | "AW1",
59 | "V",
60 | "UW2",
61 | "AA2",
62 | "ER",
63 | "AW0",
64 | "UW0",
65 | "R",
66 | "OW1",
67 | "EH1",
68 | "ZH",
69 | "AE0",
70 | "IH2",
71 | "IH",
72 | "Y",
73 | "JH",
74 | "P",
75 | "AY1",
76 | "EY0",
77 | "OY2",
78 | "TH",
79 | "HH",
80 | "D",
81 | "ER0",
82 | "CH",
83 | "AO1",
84 | "AE1",
85 | "AO2",
86 | "OY1",
87 | "AY2",
88 | "IH1",
89 | "OW0",
90 | "L",
91 | "SH",
92 | }
93 |
94 |
95 | def post_replace_ph(ph):
96 | rep_map = {
97 | ":": ",",
98 | ";": ",",
99 | ",": ",",
100 | "。": ".",
101 | "!": "!",
102 | "?": "?",
103 | "\n": ".",
104 | "·": ",",
105 | "、": ",",
106 | "...": "…",
107 | "v": "V",
108 | }
109 | if ph in rep_map.keys():
110 | ph = rep_map[ph]
111 | if ph in symbols:
112 | return ph
113 | if ph not in symbols:
114 | ph = "UNK"
115 | return ph
116 |
117 |
118 | def read_dict():
119 | g2p_dict = {}
120 | start_line = 49
121 | with open(CMU_DICT_PATH) as f:
122 | line = f.readline()
123 | line_index = 1
124 | while line:
125 | if line_index >= start_line:
126 | line = line.strip()
127 | word_split = line.split(" ")
128 | word = word_split[0]
129 |
130 | syllable_split = word_split[1].split(" - ")
131 | g2p_dict[word] = []
132 | for syllable in syllable_split:
133 | phone_split = syllable.split(" ")
134 | g2p_dict[word].append(phone_split)
135 |
136 | line_index = line_index + 1
137 | line = f.readline()
138 |
139 | return g2p_dict
140 |
141 |
142 | def cache_dict(g2p_dict, file_path):
143 | with open(file_path, "wb") as pickle_file:
144 | pickle.dump(g2p_dict, pickle_file)
145 |
146 |
147 | def get_dict():
148 | if os.path.exists(CACHE_PATH):
149 | with open(CACHE_PATH, "rb") as pickle_file:
150 | g2p_dict = pickle.load(pickle_file)
151 | else:
152 | g2p_dict = read_dict()
153 | cache_dict(g2p_dict, CACHE_PATH)
154 |
155 | return g2p_dict
156 |
157 |
158 | eng_dict = get_dict()
159 |
160 |
161 | def refine_ph(phn):
162 | tone = 0
163 | if re.search(r"\d$", phn):
164 | tone = int(phn[-1]) + 1
165 | phn = phn[:-1]
166 | return phn.lower(), tone
167 |
168 |
169 | def refine_syllables(syllables):
170 | tones = []
171 | phonemes = []
172 | for phn_list in syllables:
173 | for i in range(len(phn_list)):
174 | phn = phn_list[i]
175 | phn, tone = refine_ph(phn)
176 | phonemes.append(phn)
177 | tones.append(tone)
178 | return phonemes, tones
179 |
180 |
181 | def text_normalize(text):
182 | text = text.lower()
183 | text = expand_time_english(text)
184 | text = normalize_numbers(text)
185 | text = expand_abbreviations(text)
186 | return text
187 |
188 | model_id = 'bert-base-uncased'
189 | tokenizer = AutoTokenizer.from_pretrained(model_id)
190 | def g2p_old(text):
191 | tokenized = tokenizer.tokenize(text)
192 | # import pdb; pdb.set_trace()
193 | phones = []
194 | tones = []
195 | words = re.split(r"([,;.\-\?\!\s+])", text)
196 | for w in words:
197 | if w.upper() in eng_dict:
198 | phns, tns = refine_syllables(eng_dict[w.upper()])
199 | phones += phns
200 | tones += tns
201 | else:
202 | phone_list = list(filter(lambda p: p != " ", _g2p(w)))
203 | for ph in phone_list:
204 | if ph in arpa:
205 | ph, tn = refine_ph(ph)
206 | phones.append(ph)
207 | tones.append(tn)
208 | else:
209 | phones.append(ph)
210 | tones.append(0)
211 | # todo: implement word2ph
212 | word2ph = [1 for i in phones]
213 |
214 | phones = [post_replace_ph(i) for i in phones]
215 | return phones, tones, word2ph
216 |
217 | def g2p(text, pad_start_end=True, tokenized=None):
218 | if tokenized is None:
219 | tokenized = tokenizer.tokenize(text)
220 | # import pdb; pdb.set_trace()
221 | phs = []
222 | ph_groups = []
223 | for t in tokenized:
224 | if not t.startswith("#"):
225 | ph_groups.append([t])
226 | else:
227 | ph_groups[-1].append(t.replace("#", ""))
228 |
229 | phones = []
230 | tones = []
231 | word2ph = []
232 | for group in ph_groups:
233 | w = "".join(group)
234 | phone_len = 0
235 | word_len = len(group)
236 | if w.upper() in eng_dict:
237 | phns, tns = refine_syllables(eng_dict[w.upper()])
238 | phones += phns
239 | tones += tns
240 | phone_len += len(phns)
241 | else:
242 | phone_list = list(filter(lambda p: p != " ", _g2p(w)))
243 | for ph in phone_list:
244 | if ph in arpa:
245 | ph, tn = refine_ph(ph)
246 | phones.append(ph)
247 | tones.append(tn)
248 | else:
249 | phones.append(ph)
250 | tones.append(0)
251 | phone_len += 1
252 | aaa = distribute_phone(phone_len, word_len)
253 | word2ph += aaa
254 | phones = [post_replace_ph(i) for i in phones]
255 |
256 | if pad_start_end:
257 | phones = ["_"] + phones + ["_"]
258 | tones = [0] + tones + [0]
259 | word2ph = [1] + word2ph + [1]
260 | return phones, tones, word2ph
261 |
262 | def get_bert_feature(text, word2ph, device=None):
263 | from text import english_bert
264 |
265 | return english_bert.get_bert_feature(text, word2ph, device=device)
266 |
267 | if __name__ == "__main__":
268 | # print(get_dict())
269 | # print(eng_word_to_phoneme("hello"))
270 | from text.english_bert import get_bert_feature
271 | text = "In this paper, we propose 1 DSPGAN, a N-F-T GAN-based universal vocoder."
272 | text = text_normalize(text)
273 | phones, tones, word2ph = g2p(text)
274 | import pdb; pdb.set_trace()
275 | bert = get_bert_feature(text, word2ph)
276 |
277 | print(phones, tones, word2ph, bert.shape)
278 |
279 | # all_phones = set()
280 | # for k, syllables in eng_dict.items():
281 | # for group in syllables:
282 | # for ph in group:
283 | # all_phones.add(ph)
284 | # print(all_phones)
285 |
--------------------------------------------------------------------------------
/melo/text/english_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 | model_id = 'bert-base-uncased'
6 | tokenizer = AutoTokenizer.from_pretrained(model_id)
7 | model = None
8 |
9 | def get_bert_feature(text, word2ph, device=None):
10 | global model
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if model is None:
20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to(
21 | device
22 | )
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = model(**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |
30 | assert inputs["input_ids"].shape[-1] == len(word2ph)
31 | word2phone = word2ph
32 | phone_level_feature = []
33 | for i in range(len(word2phone)):
34 | repeat_feature = res[i].repeat(word2phone[i], 1)
35 | phone_level_feature.append(repeat_feature)
36 |
37 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
38 |
39 | return phone_level_feature.T
40 |
--------------------------------------------------------------------------------
/melo/text/english_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/english_utils/__init__.py
--------------------------------------------------------------------------------
/melo/text/english_utils/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in english:
4 | abbreviations_en = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("mrs", "misess"),
8 | ("mr", "mister"),
9 | ("dr", "doctor"),
10 | ("st", "saint"),
11 | ("co", "company"),
12 | ("jr", "junior"),
13 | ("maj", "major"),
14 | ("gen", "general"),
15 | ("drs", "doctors"),
16 | ("rev", "reverend"),
17 | ("lt", "lieutenant"),
18 | ("hon", "honorable"),
19 | ("sgt", "sergeant"),
20 | ("capt", "captain"),
21 | ("esq", "esquire"),
22 | ("ltd", "limited"),
23 | ("col", "colonel"),
24 | ("ft", "fort"),
25 | ]
26 | ]
27 |
28 | def expand_abbreviations(text, lang="en"):
29 | if lang == "en":
30 | _abbreviations = abbreviations_en
31 | else:
32 | raise NotImplementedError()
33 | for regex, replacement in _abbreviations:
34 | text = re.sub(regex, replacement, text)
35 | return text
--------------------------------------------------------------------------------
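A tiny sketch of the abbreviation expansion above (standard-library regular expressions only; the sentence is made up, and the expected output follows from the pairs listed in the file).

from melo.text.english_utils.abbreviations import expand_abbreviations

print(expand_abbreviations("Dr. Smith met Mr. Jones."))
# doctor Smith met mister Jones.
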
/melo/text/english_utils/number_norm.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import re
4 | from typing import Dict
5 |
6 | import inflect
7 |
8 | _inflect = inflect.engine()
9 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
10 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
11 | _currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)")
12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
13 | _number_re = re.compile(r"-?[0-9]+")
14 |
15 |
16 | def _remove_commas(m):
17 | return m.group(1).replace(",", "")
18 |
19 |
20 | def _expand_decimal_point(m):
21 | return m.group(1).replace(".", " point ")
22 |
23 |
24 | def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
25 | parts = value.replace(",", "").split(".")
26 | if len(parts) > 2:
27 | return f"{value} {inflection[2]}" # Unexpected format
28 | text = []
29 | integer = int(parts[0]) if parts[0] else 0
30 | if integer > 0:
31 | integer_unit = inflection.get(integer, inflection[2])
32 | text.append(f"{integer} {integer_unit}")
33 | fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
34 | if fraction > 0:
35 | fraction_unit = inflection.get(fraction / 100, inflection[0.02])
36 | text.append(f"{fraction} {fraction_unit}")
37 | if len(text) == 0:
38 | return f"zero {inflection[2]}"
39 | return " ".join(text)
40 |
41 |
42 | def _expand_currency(m: "re.Match") -> str:
43 | currencies = {
44 | "$": {
45 | 0.01: "cent",
46 | 0.02: "cents",
47 | 1: "dollar",
48 | 2: "dollars",
49 | },
50 | "€": {
51 | 0.01: "cent",
52 | 0.02: "cents",
53 | 1: "euro",
54 | 2: "euros",
55 | },
56 | "£": {
57 | 0.01: "penny",
58 | 0.02: "pence",
59 | 1: "pound sterling",
60 | 2: "pounds sterling",
61 | },
62 | "¥": {
63 | # TODO rin
64 | 0.02: "sen",
65 | 2: "yen",
66 | },
67 | }
68 | unit = m.group(1)
69 | currency = currencies[unit]
70 | value = m.group(2)
71 | return __expand_currency(value, currency)
72 |
73 |
74 | def _expand_ordinal(m):
75 | return _inflect.number_to_words(m.group(0))
76 |
77 |
78 | def _expand_number(m):
79 | num = int(m.group(0))
80 | if 1000 < num < 3000:
81 | if num == 2000:
82 | return "two thousand"
83 | if 2000 < num < 2010:
84 | return "two thousand " + _inflect.number_to_words(num % 100)
85 | if num % 100 == 0:
86 | return _inflect.number_to_words(num // 100) + " hundred"
87 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
88 | return _inflect.number_to_words(num, andword="")
89 |
90 |
91 | def normalize_numbers(text):
92 | text = re.sub(_comma_number_re, _remove_commas, text)
93 | text = re.sub(_currency_re, _expand_currency, text)
94 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
95 | text = re.sub(_ordinal_re, _expand_ordinal, text)
96 | text = re.sub(_number_re, _expand_number, text)
97 | return text
--------------------------------------------------------------------------------
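A small sketch of the number normalization above. It requires the inflect package; the sentence is made up, and the expected output in the comment was worked out from the regular expressions and currency table in the file.

from melo.text.english_utils.number_norm import normalize_numbers

print(normalize_numbers("The book costs $12.50 and ships in 3 days."))
# The book costs twelve dollars fifty cents and ships in three days.
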
/melo/text/english_utils/time_norm.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import inflect
4 |
5 | _inflect = inflect.engine()
6 |
7 | _time_re = re.compile(
8 | r"""\b
9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
10 | :
11 | ([0-5][0-9]) # minutes
12 |     \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
13 | \b""",
14 | re.IGNORECASE | re.X,
15 | )
16 |
17 |
18 | def _expand_num(n: int) -> str:
19 | return _inflect.number_to_words(n)
20 |
21 |
22 | def _expand_time_english(match: "re.Match") -> str:
23 | hour = int(match.group(1))
24 | past_noon = hour >= 12
25 | time = []
26 | if hour > 12:
27 | hour -= 12
28 | elif hour == 0:
29 | hour = 12
30 | past_noon = True
31 | time.append(_expand_num(hour))
32 |
33 | minute = int(match.group(6))
34 | if minute > 0:
35 | if minute < 10:
36 | time.append("oh")
37 | time.append(_expand_num(minute))
38 | am_pm = match.group(7)
39 | if am_pm is None:
40 | time.append("p m" if past_noon else "a m")
41 | else:
42 | time.extend(list(am_pm.replace(".", "")))
43 | return " ".join(time)
44 |
45 |
46 | def expand_time_english(text: str) -> str:
47 | return re.sub(_time_re, _expand_time_english, text)
--------------------------------------------------------------------------------
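A small sketch of the time expansion above (requires inflect; the sentence is made up, and the expected output follows from the hour/minute/am-pm handling in _expand_time_english).

from melo.text.english_utils.time_norm import expand_time_english

print(expand_time_english("The call starts at 3:30 pm sharp."))
# The call starts at three thirty p m sharp.
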
/melo/text/es_phonemizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/es_phonemizer/__init__.py
--------------------------------------------------------------------------------
/melo/text/es_phonemizer/base.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from typing import List, Tuple
3 |
4 | from .punctuation import Punctuation
5 |
6 |
7 | class BasePhonemizer(abc.ABC):
8 | """Base phonemizer class
9 |
10 | Phonemization follows the following steps:
11 | 1. Preprocessing:
12 | - remove empty lines
13 | - remove punctuation
14 | - keep track of punctuation marks
15 |
16 | 2. Phonemization:
17 | - convert text to phonemes
18 |
19 | 3. Postprocessing:
20 | - join phonemes
21 | - restore punctuation marks
22 |
23 | Args:
24 | language (str):
25 | Language used by the phonemizer.
26 |
27 | punctuations (List[str]):
28 | List of punctuation marks to be preserved.
29 |
30 | keep_puncs (bool):
31 | Whether to preserve punctuation marks or not.
32 | """
33 |
34 | def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
35 | # ensure the backend is installed on the system
36 | if not self.is_available():
37 | raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover
38 |
39 |         # ensure the backend supports the requested language
40 | self._language = self._init_language(language)
41 |
42 | # setup punctuation processing
43 | self._keep_puncs = keep_puncs
44 | self._punctuator = Punctuation(punctuations)
45 |
46 | def _init_language(self, language):
47 | """Language initialization
48 |
49 | This method may be overloaded in child classes (see Segments backend)
50 |
51 | """
52 | if not self.is_supported_language(language):
53 | raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")
54 | return language
55 |
56 | @property
57 | def language(self):
58 | """The language code configured to be used for phonemization"""
59 | return self._language
60 |
61 | @staticmethod
62 | @abc.abstractmethod
63 | def name():
64 | """The name of the backend"""
65 | ...
66 |
67 | @classmethod
68 | @abc.abstractmethod
69 | def is_available(cls):
70 | """Returns True if the backend is installed, False otherwise"""
71 | ...
72 |
73 | @classmethod
74 | @abc.abstractmethod
75 | def version(cls):
76 | """Return the backend version as a tuple (major, minor, patch)"""
77 | ...
78 |
79 | @staticmethod
80 | @abc.abstractmethod
81 | def supported_languages():
82 | """Return a dict of language codes -> name supported by the backend"""
83 | ...
84 |
85 | def is_supported_language(self, language):
86 | """Returns True if `language` is supported by the backend"""
87 | return language in self.supported_languages()
88 |
89 | @abc.abstractmethod
90 | def _phonemize(self, text, separator):
91 | """The main phonemization method"""
92 |
93 | def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
94 | """Preprocess the text before phonemization
95 |
96 | 1. remove spaces
97 | 2. remove punctuation
98 |
99 | Override this if you need a different behaviour
100 | """
101 | text = text.strip()
102 | if self._keep_puncs:
103 | # a tuple (text, punctuation marks)
104 | return self._punctuator.strip_to_restore(text)
105 | return [self._punctuator.strip(text)], []
106 |
107 | def _phonemize_postprocess(self, phonemized, punctuations) -> str:
108 | """Postprocess the raw phonemized output
109 |
110 | Override this if you need a different behaviour
111 | """
112 | if self._keep_puncs:
113 | return self._punctuator.restore(phonemized, punctuations)[0]
114 | return phonemized[0]
115 |
116 | def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument
117 | """Returns the `text` phonemized for the given language
118 |
119 | Args:
120 | text (str):
121 | Text to be phonemized.
122 |
123 | separator (str):
124 |                 string separator used between phonemes. Defaults to '|'.
125 |
126 | Returns:
127 | (str): Phonemized text
128 | """
129 | text, punctuations = self._phonemize_preprocess(text)
130 | phonemized = []
131 | for t in text:
132 | p = self._phonemize(t, separator)
133 | phonemized.append(p)
134 | phonemized = self._phonemize_postprocess(phonemized, punctuations)
135 | return phonemized
136 |
137 | def print_logs(self, level: int = 0):
138 | indent = "\t" * level
139 | print(f"{indent}| > phoneme language: {self.language}")
140 | print(f"{indent}| > phoneme backend: {self.name()}")
--------------------------------------------------------------------------------
/melo/text/es_phonemizer/cleaner.py:
--------------------------------------------------------------------------------
1 | """Set of default text cleaners"""
2 | # TODO: pick the cleaner for languages dynamically
3 |
4 | import re
5 |
6 | # Regular expression matching whitespace:
7 | _whitespace_re = re.compile(r"\s+")
8 |
9 | rep_map = {
10 | ":": ",",
11 | ";": ",",
12 | ",": ",",
13 | "。": ".",
14 | "!": "!",
15 | "?": "?",
16 | "\n": ".",
17 | "·": ",",
18 | "、": ",",
19 | "...": ".",
20 | "…": ".",
21 | "$": ".",
22 | "“": "'",
23 | "”": "'",
24 | "‘": "'",
25 | "’": "'",
26 | "(": "'",
27 | ")": "'",
28 | "(": "'",
29 | ")": "'",
30 | "《": "'",
31 | "》": "'",
32 | "【": "'",
33 | "】": "'",
34 | "[": "'",
35 | "]": "'",
36 | "—": "",
37 | "~": "-",
38 | "~": "-",
39 | "「": "'",
40 | "」": "'",
41 | }
42 |
43 | def replace_punctuation(text):
44 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
45 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
46 | return replaced_text
47 |
48 | def lowercase(text):
49 | return text.lower()
50 |
51 |
52 | def collapse_whitespace(text):
53 | return re.sub(_whitespace_re, " ", text).strip()
54 |
55 | def remove_punctuation_at_begin(text):
56 | return re.sub(r'^[,.!?]+', '', text)
57 |
58 | def remove_aux_symbols(text):
59 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text)
60 | return text
61 |
62 |
63 | def replace_symbols(text, lang="en"):
64 |     """Replace symbols based on the language tag.
65 |
66 | Args:
67 | text:
68 | Input text.
69 | lang:
70 |             Language identifier, e.g. "en", "fr", "pt", "ca".
71 |
72 | Returns:
73 | The modified text
74 | example:
75 | input args:
76 | text: "si l'avi cau, diguem-ho"
77 | lang: "ca"
78 | Output:
79 | text: "si lavi cau, diguemho"
80 | """
81 | text = text.replace(";", ",")
82 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
83 | text = text.replace(":", ",")
84 | if lang == "en":
85 | text = text.replace("&", " and ")
86 | elif lang == "fr":
87 | text = text.replace("&", " et ")
88 | elif lang == "pt":
89 | text = text.replace("&", " e ")
90 | elif lang == "ca":
91 | text = text.replace("&", " i ")
92 | text = text.replace("'", "")
93 | elif lang== "es":
94 | text=text.replace("&","y")
95 | text = text.replace("'", "")
96 | return text
97 |
98 | def spanish_cleaners(text):
99 |     """Basic pipeline for Spanish text. There is no need to expand abbreviations and
100 |     numbers; the phonemizer already does that."""
101 | text = lowercase(text)
102 | text = replace_symbols(text, lang="es")
103 | text = replace_punctuation(text)
104 | text = remove_aux_symbols(text)
105 | text = remove_punctuation_at_begin(text)
106 | text = collapse_whitespace(text)
107 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
108 | return text
109 |
110 |
--------------------------------------------------------------------------------
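A quick illustration of spanish_cleaners above (standard library only; the sentence is made up, and the expected output in the comment was traced by hand through the cleaning steps in the file).

from melo.text.es_phonemizer.cleaner import spanish_cleaners

print(spanish_cleaners("¿Qué tal? Bien & tranquilo"))
# ¿qué tal? bien y tranquilo.
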
/melo/text/es_phonemizer/es_symbols.json:
--------------------------------------------------------------------------------
1 | {
2 | "symbols": [
3 | "_",
4 | ",",
5 | ".",
6 | "!",
7 | "?",
8 | "-",
9 | "~",
10 | "\u2026",
11 | "N",
12 | "Q",
13 | "a",
14 | "b",
15 | "d",
16 | "e",
17 | "f",
18 | "g",
19 | "h",
20 | "i",
21 | "j",
22 | "k",
23 | "l",
24 | "m",
25 | "n",
26 | "o",
27 | "p",
28 | "s",
29 | "t",
30 | "u",
31 | "v",
32 | "w",
33 | "x",
34 | "y",
35 | "z",
36 | "\u0251",
37 | "\u00e6",
38 | "\u0283",
39 | "\u0291",
40 | "\u00e7",
41 | "\u026f",
42 | "\u026a",
43 | "\u0254",
44 | "\u025b",
45 | "\u0279",
46 | "\u00f0",
47 | "\u0259",
48 | "\u026b",
49 | "\u0265",
50 | "\u0278",
51 | "\u028a",
52 | "\u027e",
53 | "\u0292",
54 | "\u03b8",
55 | "\u03b2",
56 | "\u014b",
57 | "\u0266",
58 | "\u207c",
59 | "\u02b0",
60 | "`",
61 | "^",
62 | "#",
63 | "*",
64 | "=",
65 | "\u02c8",
66 | "\u02cc",
67 | "\u2192",
68 | "\u2193",
69 | "\u2191",
70 | " ",
71 | "\u0263",
72 | "\u0261",
73 | "r",
74 | "\u0272",
75 | "\u029d",
76 | "\u028e",
77 | "\u02d0"
78 | ]
79 | }
--------------------------------------------------------------------------------
/melo/text/es_phonemizer/es_symbols.txt:
--------------------------------------------------------------------------------
1 | _,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɡrɲʝɣʎː—¿¡
--------------------------------------------------------------------------------
/melo/text/es_phonemizer/es_symbols_v2.json:
--------------------------------------------------------------------------------
1 | {
2 | "symbols": [
3 | "_",
4 | ",",
5 | ".",
6 | "!",
7 | "?",
8 | "-",
9 | "~",
10 | "\u2026",
11 | "N",
12 | "Q",
13 | "a",
14 | "b",
15 | "d",
16 | "e",
17 | "f",
18 | "g",
19 | "h",
20 | "i",
21 | "j",
22 | "k",
23 | "l",
24 | "m",
25 | "n",
26 | "o",
27 | "p",
28 | "s",
29 | "t",
30 | "u",
31 | "v",
32 | "w",
33 | "x",
34 | "y",
35 | "z",
36 | "\u0251",
37 | "\u00e6",
38 | "\u0283",
39 | "\u0291",
40 | "\u00e7",
41 | "\u026f",
42 | "\u026a",
43 | "\u0254",
44 | "\u025b",
45 | "\u0279",
46 | "\u00f0",
47 | "\u0259",
48 | "\u026b",
49 | "\u0265",
50 | "\u0278",
51 | "\u028a",
52 | "\u027e",
53 | "\u0292",
54 | "\u03b8",
55 | "\u03b2",
56 | "\u014b",
57 | "\u0266",
58 | "\u207c",
59 | "\u02b0",
60 | "`",
61 | "^",
62 | "#",
63 | "*",
64 | "=",
65 | "\u02c8",
66 | "\u02cc",
67 | "\u2192",
68 | "\u2193",
69 | "\u2191",
70 | " ",
71 | "\u0261",
72 | "r",
73 | "\u0272",
74 | "\u029d",
75 | "\u0263",
76 | "\u028e",
77 | "\u02d0",
78 |
79 | "\u2014",
80 | "\u00bf",
81 | "\u00a1"
82 | ]
83 | }
--------------------------------------------------------------------------------
/melo/text/es_phonemizer/es_to_ipa.py:
--------------------------------------------------------------------------------
1 | from .cleaner import spanish_cleaners
2 | from .gruut_wrapper import Gruut
3 |
4 | def es2ipa(text):
5 | e = Gruut(language="es-es", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)
6 | # text = spanish_cleaners(text)
7 | phonemes = e.phonemize(text, separator="")
8 | return phonemes
9 |
10 |
11 | if __name__ == '__main__':
12 | print(es2ipa('¿Y a quién echaría de menos, en el mundo si no fuese a vos?'))
--------------------------------------------------------------------------------
/melo/text/es_phonemizer/gruut_wrapper.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from typing import List
3 |
4 | import gruut
5 | from gruut_ipa import IPA # pip install gruut_ipa
6 |
7 | from .base import BasePhonemizer
8 | from .punctuation import Punctuation
9 |
10 | # Table for str.translate to fix gruut/TTS phoneme mismatch
11 | GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
12 |
13 |
14 | class Gruut(BasePhonemizer):
15 | """Gruut wrapper for G2P
16 |
17 | Args:
18 | language (str):
19 | Valid language code for the used backend.
20 |
21 | punctuations (str):
22 | Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.
23 |
24 | keep_puncs (bool):
25 | If true, keep the punctuations after phonemization. Defaults to True.
26 |
27 | use_espeak_phonemes (bool):
28 | If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.
29 |
30 | keep_stress (bool):
31 | If true, keep the stress characters after phonemization. Defaults to False.
32 |
33 | Example:
34 |
35 | >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
36 | >>> phonemizer = Gruut('en-us')
37 | >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
38 | 'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
39 | """
40 |
41 | def __init__(
42 | self,
43 | language: str,
44 | punctuations=Punctuation.default_puncs(),
45 | keep_puncs=True,
46 | use_espeak_phonemes=False,
47 | keep_stress=False,
48 | ):
49 | super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
50 | self.use_espeak_phonemes = use_espeak_phonemes
51 | self.keep_stress = keep_stress
52 |
53 | @staticmethod
54 | def name():
55 | return "gruut"
56 |
57 | def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument
58 | """Convert input text to phonemes.
59 |
60 | Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters
61 | that constitute a single sound.
62 |
63 | It doesn't affect 🐸TTS since it individually converts each character to token IDs.
64 |
65 | Examples::
66 | "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`
67 |
68 | Args:
69 | text (str):
70 | Text to be converted to phonemes.
71 |
72 | tie (bool, optional) : When True use a '͡' character between
73 | consecutive characters of a single phoneme. Else separate phoneme
74 | with '_'. This option requires espeak>=1.49. Default to False.
75 | """
76 | ph_list = []
77 | for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
78 | for word in sentence:
79 | if word.is_break:
80 | # Use actual character for break phoneme (e.g., comma)
81 | if ph_list:
82 | # Join with previous word
83 | ph_list[-1].append(word.text)
84 | else:
85 | # First word is punctuation
86 | ph_list.append([word.text])
87 | elif word.phonemes:
88 | # Add phonemes for word
89 | word_phonemes = []
90 |
91 | for word_phoneme in word.phonemes:
92 | if not self.keep_stress:
93 | # Remove primary/secondary stress
94 | word_phoneme = IPA.without_stress(word_phoneme)
95 |
96 | word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)
97 |
98 | if word_phoneme:
99 | # Flatten phonemes
100 | word_phonemes.extend(word_phoneme)
101 |
102 | if word_phonemes:
103 | ph_list.append(word_phonemes)
104 |
105 | ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
106 | ph = f"{separator} ".join(ph_words)
107 | return ph
108 |
109 | def _phonemize(self, text, separator):
110 | return self.phonemize_gruut(text, separator, tie=False)
111 |
112 | def is_supported_language(self, language):
113 | """Returns True if `language` is supported by the backend"""
114 | return gruut.is_language_supported(language)
115 |
116 | @staticmethod
117 | def supported_languages() -> List:
118 | """Get a list of supported languages.
119 |
120 | Returns:
121 | List: List of language codes.
122 | """
123 | return list(gruut.get_supported_languages())
124 |
125 | def version(self):
126 | """Get the version of the used backend.
127 |
128 | Returns:
129 | str: Version of the used backend.
130 | """
131 | return gruut.__version__
132 |
133 | @classmethod
134 | def is_available(cls):
135 | """Return True if gruut is available, else False"""
136 | return importlib.util.find_spec("gruut") is not None
137 |
138 |
139 | if __name__ == "__main__":
140 | from es_to_ipa import es2ipa
141 | import json
142 |
143 | e = Gruut(language="es-es", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)
144 | symbols = [
145 | "_",
146 | ",",
147 | ".",
148 | "!",
149 | "?",
150 | "-",
151 | "~",
152 | "\u2026",
153 | "N",
154 | "Q",
155 | "a",
156 | "b",
157 | "d",
158 | "e",
159 | "f",
160 | "g",
161 | "h",
162 | "i",
163 | "j",
164 | "k",
165 | "l",
166 | "m",
167 | "n",
168 | "o",
169 | "p",
170 | "s",
171 | "t",
172 | "u",
173 | "v",
174 | "w",
175 | "x",
176 | "y",
177 | "z",
178 | "\u0251",
179 | "\u00e6",
180 | "\u0283",
181 | "\u0291",
182 | "\u00e7",
183 | "\u026f",
184 | "\u026a",
185 | "\u0254",
186 | "\u025b",
187 | "\u0279",
188 | "\u00f0",
189 | "\u0259",
190 | "\u026b",
191 | "\u0265",
192 | "\u0278",
193 | "\u028a",
194 | "\u027e",
195 | "\u0292",
196 | "\u03b8",
197 | "\u03b2",
198 | "\u014b",
199 | "\u0266",
200 | "\u207c",
201 | "\u02b0",
202 | "`",
203 | "^",
204 | "#",
205 | "*",
206 | "=",
207 | "\u02c8",
208 | "\u02cc",
209 | "\u2192",
210 | "\u2193",
211 | "\u2191",
212 | " ",
213 | ]
214 | with open('./text/es_phonemizer/spanish_text.txt', 'r') as f:
215 | lines = f.readlines()
216 |
217 |
218 | used_sym = []
219 | not_existed_sym = []
220 | phonemes = []
221 |
222 | for line in lines[:400]:
223 | text = line.split('|')[-1].strip()
224 | ipa = es2ipa(text)
225 | phonemes.append(ipa + '\n')
226 | for s in ipa:
227 | if s not in symbols:
228 | if s not in not_existed_sym:
229 | print(f'not_existed char: {s}')
230 | not_existed_sym.append(s)
231 | else:
232 | if s not in used_sym:
233 | # print(f'used char: {s}')
234 | used_sym.append(s)
235 |
236 | print(used_sym)
237 | print(not_existed_sym)
238 |
239 |
240 | with open('./text/es_phonemizer/es_symbols.txt', 'w') as g:
241 | g.writelines(symbols + not_existed_sym)
242 |
243 | with open('./text/es_phonemizer/example_ipa.txt', 'w') as g:
244 | g.writelines(phonemes)
245 |
246 | data = {'symbols': symbols + not_existed_sym}
247 | with open('./text/es_phonemizer/es_symbols_v2.json', 'w') as f:
248 | json.dump(data, f, indent=4)
249 |
250 |
251 |
252 |
253 |
254 |
--------------------------------------------------------------------------------
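Note on the `__main__` block of melo/text/es_phonemizer/gruut_wrapper.py above: it builds the Spanish symbol inventory by phonemizing a corpus and collecting any output characters missing from the base symbol list. A minimal sketch of the same coverage check as a reusable function, assuming the melo package and its gruut dependency are importable; the sentences and the symbol subset below are placeholders, not the corpus used in the script:

    # Sketch: report phonemizer output characters not covered by a symbol list.
    from melo.text.es_phonemizer.gruut_wrapper import Gruut

    def find_uncovered_symbols(sentences, symbols):
        phonemizer = Gruut(language="es-es", keep_puncs=True,
                           keep_stress=True, use_espeak_phonemes=True)
        missing = []
        for sentence in sentences:
            for ch in phonemizer.phonemize(sentence, separator=""):
                if ch not in symbols and ch not in missing:
                    missing.append(ch)
        return missing

    if __name__ == "__main__":
        base_symbols = ["_", ",", ".", " ", "a", "e", "i", "o", "u"]  # placeholder subset
        print(find_uncovered_symbols(["¿Y a quién echaría de menos?"], base_symbols))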
/melo/text/es_phonemizer/punctuation.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import re
3 | from enum import Enum
4 |
5 | import six
6 |
7 | _DEF_PUNCS = ';:,.!?¡¿—…"«»“”'
8 |
9 | _PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])
10 |
11 |
12 | class PuncPosition(Enum):
13 | """Enum for the punctuations positions"""
14 |
15 | BEGIN = 0
16 | END = 1
17 | MIDDLE = 2
18 | ALONE = 3
19 |
20 |
21 | class Punctuation:
22 | """Handle punctuations in text.
23 |
24 | Just strip punctuations from text or strip and restore them later.
25 |
26 | Args:
27 | puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.
28 |
29 | Example:
30 | >>> punc = Punctuation()
31 | >>> punc.strip("This is. example !")
32 | 'This is example'
33 |
34 | >>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
35 | >>> ' '.join(text_striped)
36 | 'This is example'
37 |
38 | >>> text_restored = punc.restore(text_striped, punc_map)
39 | >>> text_restored[0]
40 | 'This is. example !'
41 | """
42 |
43 | def __init__(self, puncs: str = _DEF_PUNCS):
44 | self.puncs = puncs
45 |
46 | @staticmethod
47 | def default_puncs():
48 | """Return default set of punctuations."""
49 | return _DEF_PUNCS
50 |
51 | @property
52 | def puncs(self):
53 | return self._puncs
54 |
55 | @puncs.setter
56 | def puncs(self, value):
57 | if not isinstance(value, six.string_types):
58 | raise ValueError("[!] Punctuations must be of type str.")
59 | self._puncs = "".join(list(dict.fromkeys(list(value)))) # remove duplicates without changing the order
60 | self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")
61 |
62 | def strip(self, text):
63 | """Remove all the punctuations by replacing with `space`.
64 |
65 | Args:
66 | text (str): The text to be processed.
67 |
68 | Example::
69 |
70 | "This is. example !" -> "This is example "
71 | """
72 | return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip()
73 |
74 | def strip_to_restore(self, text):
75 | """Remove punctuations from text to restore them later.
76 |
77 | Args:
78 | text (str): The text to be processed.
79 |
80 | Examples ::
81 |
82 | "This is. example !" -> [["This is", "example"], [".", "!"]]
83 |
84 | """
85 | text, puncs = self._strip_to_restore(text)
86 | return text, puncs
87 |
88 | def _strip_to_restore(self, text):
89 | """Auxiliary method for Punctuation.preserve()"""
90 | matches = list(re.finditer(self.puncs_regular_exp, text))
91 | if not matches:
92 | return [text], []
93 | # the text is only punctuations
94 | if len(matches) == 1 and matches[0].group() == text:
95 | return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
96 | # build a punctuation map to be used later to restore punctuations
97 | puncs = []
98 | for match in matches:
99 | position = PuncPosition.MIDDLE
100 | if match == matches[0] and text.startswith(match.group()):
101 | position = PuncPosition.BEGIN
102 | elif match == matches[-1] and text.endswith(match.group()):
103 | position = PuncPosition.END
104 | puncs.append(_PUNC_IDX(match.group(), position))
105 | # convert str text to a List[str], each item is separated by a punctuation
106 | splitted_text = []
107 | for idx, punc in enumerate(puncs):
108 | split = text.split(punc.punc)
109 | prefix, suffix = split[0], punc.punc.join(split[1:])
110 | splitted_text.append(prefix)
111 | # if the text does not end with a punctuation, add it to the last item
112 | if idx == len(puncs) - 1 and len(suffix) > 0:
113 | splitted_text.append(suffix)
114 | text = suffix
115 | while splitted_text[0] == '':
116 | splitted_text = splitted_text[1:]
117 | return splitted_text, puncs
118 |
119 | @classmethod
120 | def restore(cls, text, puncs):
121 | """Restore punctuation in a text.
122 |
123 | Args:
124 | text (str): The text to be processed.
125 | puncs (List[str]): The list of punctuations map to be used for restoring.
126 |
127 | Examples ::
128 |
129 | ['This is', 'example'], ['.', '!'] -> "This is. example!"
130 |
131 | """
132 | return cls._restore(text, puncs, 0)
133 |
134 | @classmethod
135 | def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements
136 | """Auxiliary method for Punctuation.restore()"""
137 | if not puncs:
138 | return text
139 |
140 | # nothing has been phonemized, return the puncs alone
141 | if not text:
142 | return ["".join(m.punc for m in puncs)]
143 |
144 | current = puncs[0]
145 |
146 | if current.position == PuncPosition.BEGIN:
147 | return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)
148 |
149 | if current.position == PuncPosition.END:
150 | return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)
151 |
152 | if current.position == PuncPosition.ALONE:
153 | return [current.punc] + cls._restore(text, puncs[1:], num + 1)
154 |
155 | # POSITION == MIDDLE
156 | if len(text) == 1: # pragma: nocover
157 | # a corner case where the final part of an intermediate
158 | # mark (I) has not been phonemized
159 | return cls._restore([text[0] + current.punc], puncs[1:], num)
160 |
161 | return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
162 |
163 |
164 | # if __name__ == "__main__":
165 | # punc = Punctuation()
166 | # text = "This is. This is, example!"
167 |
168 | # print(punc.strip(text))
169 |
170 | # split_text, puncs = punc.strip_to_restore(text)
171 | # print(split_text, " ---- ", puncs)
172 |
173 | # restored_text = punc.restore(split_text, puncs)
174 | # print(restored_text)
--------------------------------------------------------------------------------
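Note on melo/text/es_phonemizer/punctuation.py above: the commented-out `__main__` block sketches the intended strip/restore round trip. A runnable version of that cycle, using only the `Punctuation` class defined in this file:

    # Sketch: strip punctuation for phonemization, then restore it afterwards.
    from melo.text.es_phonemizer.punctuation import Punctuation

    punc = Punctuation()
    text = "This is. This is, example!"

    print(punc.strip(text))                    # "This is This is example"
    split_text, puncs = punc.strip_to_restore(text)
    print(split_text, puncs)                   # segments plus (punc, position) pairs
    print(punc.restore(split_text, puncs)[0])  # the original text, reassembled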
/melo/text/es_phonemizer/spanish_symbols.txt:
--------------------------------------------------------------------------------
1 | dˌaβˈiðkopeɾfjl unθsbmtʃwɛxɪŋʊɣɡrɲʝʎː
--------------------------------------------------------------------------------
/melo/text/es_phonemizer/test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "ename": "ImportError",
10 | "evalue": "attempted relative import with no known parent package",
11 | "output_type": "error",
12 | "traceback": [
13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
14 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
15 | "\u001b[1;32m/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb Cell 1\u001b[0m line \u001b[0;36m5\n\u001b[1;32m 3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\u001b[39m,\u001b[39m \u001b[39msys\u001b[39;00m\n\u001b[1;32m 4\u001b[0m sys\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mappend(\u001b[39m'\u001b[39m\u001b[39m/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mes_to_ipa\u001b[39;00m \u001b[39mimport\u001b[39;00m es2ipa\n\u001b[1;32m 9\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39msplit_sentences_en\u001b[39m(text, min_len\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m):\n\u001b[1;32m 10\u001b[0m \u001b[39m# 将文本中的换行符、空格和制表符替换为空格\u001b[39;00m\n\u001b[1;32m 11\u001b[0m text \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\u001b[39m'\u001b[39m\u001b[39m[\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m ]+\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m \u001b[39m\u001b[39m'\u001b[39m, text)\n",
16 | "File \u001b[0;32m/data/workspace/Bert-VITS2/text/es_phonemizer/es_to_ipa.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mcleaner\u001b[39;00m \u001b[39mimport\u001b[39;00m spanish_cleaners\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mgruut_wrapper\u001b[39;00m \u001b[39mimport\u001b[39;00m Gruut\n\u001b[1;32m 4\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mes2ipa\u001b[39m(text):\n",
17 | "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
18 | ]
19 | }
20 | ],
21 | "source": [
22 | "import re\n",
23 | "import os\n",
24 | "import os, sys\n",
25 | "sys.path.append('/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/')\n",
26 | "from es_to_ipa import es2ipa\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "def split_sentences_en(text, min_len=10):\n",
31 | "    # Replace newlines, tabs and spaces in the text with a single space\n",
32 | " text = re.sub('[\\n\\t ]+', ' ', text)\n",
33 | "    # Insert a split marker after punctuation\n",
34 | " text = re.sub('([¿—¡])', r'\\1 $#!', text)\n",
35 | "    # Split into sentences and strip surrounding whitespace\n",
36 | " \n",
37 | " sentences = [s.strip() for s in text.split(' $#!')]\n",
38 | " if len(sentences[-1]) == 0: del sentences[-1]\n",
39 | "\n",
40 | " new_sentences = []\n",
41 | " new_sent = []\n",
42 | " for ind, sent in enumerate(sentences):\n",
43 | " if sent in ['¿', '—', '¡']:\n",
44 | " new_sent.append(sent)\n",
45 | " else:\n",
46 | " new_sent.append(es2ipa(sent))\n",
47 | " \n",
48 | " \n",
49 | " new_sentences = ''.join(new_sent)\n",
50 | "\n",
51 | " return new_sentences"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "'—¿aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'"
63 | ]
64 | },
65 | "execution_count": 3,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "split_sentences_en('—¿Habéis estado casada alguna vez?')"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "'aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'"
83 | ]
84 | },
85 | "execution_count": 4,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "es2ipa('—¿Habéis estado casada alguna vez?')"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": []
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "base",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.8.18"
119 | },
120 | "orig_nbformat": 4
121 | },
122 | "nbformat": 4,
123 | "nbformat_minor": 2
124 | }
125 |
--------------------------------------------------------------------------------
/melo/text/fr_phonemizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/fr_phonemizer/__init__.py
--------------------------------------------------------------------------------
/melo/text/fr_phonemizer/base.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from typing import List, Tuple
3 |
4 | from .punctuation import Punctuation
5 |
6 |
7 | class BasePhonemizer(abc.ABC):
8 | """Base phonemizer class
9 |
10 | Phonemization follows the following steps:
11 | 1. Preprocessing:
12 | - remove empty lines
13 | - remove punctuation
14 | - keep track of punctuation marks
15 |
16 | 2. Phonemization:
17 | - convert text to phonemes
18 |
19 | 3. Postprocessing:
20 | - join phonemes
21 | - restore punctuation marks
22 |
23 | Args:
24 | language (str):
25 | Language used by the phonemizer.
26 |
27 | punctuations (List[str]):
28 | List of punctuation marks to be preserved.
29 |
30 | keep_puncs (bool):
31 | Whether to preserve punctuation marks or not.
32 | """
33 |
34 | def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
35 | # ensure the backend is installed on the system
36 | if not self.is_available():
37 | raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover
38 |
39 | # ensure the backend support the requested language
40 | self._language = self._init_language(language)
41 |
42 | # setup punctuation processing
43 | self._keep_puncs = keep_puncs
44 | self._punctuator = Punctuation(punctuations)
45 |
46 | def _init_language(self, language):
47 | """Language initialization
48 |
49 | This method may be overloaded in child classes (see Segments backend)
50 |
51 | """
52 | if not self.is_supported_language(language):
53 | raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")
54 | return language
55 |
56 | @property
57 | def language(self):
58 | """The language code configured to be used for phonemization"""
59 | return self._language
60 |
61 | @staticmethod
62 | @abc.abstractmethod
63 | def name():
64 | """The name of the backend"""
65 | ...
66 |
67 | @classmethod
68 | @abc.abstractmethod
69 | def is_available(cls):
70 | """Returns True if the backend is installed, False otherwise"""
71 | ...
72 |
73 | @classmethod
74 | @abc.abstractmethod
75 | def version(cls):
76 | """Return the backend version as a tuple (major, minor, patch)"""
77 | ...
78 |
79 | @staticmethod
80 | @abc.abstractmethod
81 | def supported_languages():
82 | """Return a dict of language codes -> name supported by the backend"""
83 | ...
84 |
85 | def is_supported_language(self, language):
86 | """Returns True if `language` is supported by the backend"""
87 | return language in self.supported_languages()
88 |
89 | @abc.abstractmethod
90 | def _phonemize(self, text, separator):
91 | """The main phonemization method"""
92 |
93 | def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
94 | """Preprocess the text before phonemization
95 |
96 | 1. remove spaces
97 | 2. remove punctuation
98 |
99 | Override this if you need a different behaviour
100 | """
101 | text = text.strip()
102 | if self._keep_puncs:
103 | # a tuple (text, punctuation marks)
104 | return self._punctuator.strip_to_restore(text)
105 | return [self._punctuator.strip(text)], []
106 |
107 | def _phonemize_postprocess(self, phonemized, punctuations) -> str:
108 | """Postprocess the raw phonemized output
109 |
110 | Override this if you need a different behaviour
111 | """
112 | if self._keep_puncs:
113 | return self._punctuator.restore(phonemized, punctuations)[0]
114 | return phonemized[0]
115 |
116 | def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument
117 | """Returns the `text` phonemized for the given language
118 |
119 | Args:
120 | text (str):
121 | Text to be phonemized.
122 |
123 | separator (str):
124 | string separator used between phonemes. Defaults to '|'.
125 |
126 | Returns:
127 | (str): Phonemized text
128 | """
129 | text, punctuations = self._phonemize_preprocess(text)
130 | phonemized = []
131 | for t in text:
132 | p = self._phonemize(t, separator)
133 | phonemized.append(p)
134 | phonemized = self._phonemize_postprocess(phonemized, punctuations)
135 | return phonemized
136 |
137 | def print_logs(self, level: int = 0):
138 | indent = "\t" * level
139 | print(f"{indent}| > phoneme language: {self.language}")
140 | print(f"{indent}| > phoneme backend: {self.name()}")
--------------------------------------------------------------------------------
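Note on melo/text/fr_phonemizer/base.py above: `BasePhonemizer` only handles the punctuation bookkeeping; a concrete backend supplies `_phonemize` plus a few class-level hooks. A minimal sketch of a subclass that satisfies the abstract interface, assuming the melo package is importable; the uppercasing "backend" is purely illustrative and only shows the preprocess -> _phonemize -> postprocess flow:

    # Sketch: the smallest BasePhonemizer subclass, with a dummy backend.
    from melo.text.fr_phonemizer.base import BasePhonemizer

    class UpperCaseBackend(BasePhonemizer):
        @staticmethod
        def name():
            return "uppercase"

        @classmethod
        def is_available(cls):
            return True

        @classmethod
        def version(cls):
            return (0, 0, 1)

        @staticmethod
        def supported_languages():
            return {"fr-fr": "French (dummy)"}

        def _phonemize(self, text, separator):
            # a real backend would emit phonemes here
            return separator.join(text.upper())

    if __name__ == "__main__":
        backend = UpperCaseBackend("fr-fr", keep_puncs=True)
        print(backend.phonemize("Bonjour, le monde!", separator=""))  # "BONJOUR, LE MONDE!"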
/melo/text/fr_phonemizer/cleaner.py:
--------------------------------------------------------------------------------
1 | """Set of default text cleaners"""
2 | # TODO: pick the cleaner for languages dynamically
3 |
4 | import re
5 | from .french_abbreviations import abbreviations_fr
6 |
7 | # Regular expression matching whitespace:
8 | _whitespace_re = re.compile(r"\s+")
9 |
10 |
11 | rep_map = {
12 | ":": ",",
13 | ";": ",",
14 | ",": ",",
15 | "。": ".",
16 | "!": "!",
17 | "?": "?",
18 | "\n": ".",
19 | "·": ",",
20 | "、": ",",
21 | "...": ".",
22 | "…": ".",
23 | "$": ".",
24 | "“": "",
25 | "”": "",
26 | "‘": "",
27 | "’": "",
28 | "(": "",
29 | ")": "",
30 | "(": "",
31 | ")": "",
32 | "《": "",
33 | "》": "",
34 | "【": "",
35 | "】": "",
36 | "[": "",
37 | "]": "",
38 | "—": "",
39 | "~": "-",
40 | "~": "-",
41 | "「": "",
42 | "」": "",
43 | "¿" : "",
44 | "¡" : ""
45 | }
46 |
47 |
48 | def replace_punctuation(text):
49 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
50 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
51 | return replaced_text
52 |
53 | def expand_abbreviations(text, lang="fr"):
54 | if lang == "fr":
55 | _abbreviations = abbreviations_fr
56 | for regex, replacement in _abbreviations:
57 | text = re.sub(regex, replacement, text)
58 | return text
59 |
60 |
61 | def lowercase(text):
62 | return text.lower()
63 |
64 |
65 | def collapse_whitespace(text):
66 | return re.sub(_whitespace_re, " ", text).strip()
67 |
68 | def remove_punctuation_at_begin(text):
69 | return re.sub(r'^[,.!?]+', '', text)
70 |
71 | def remove_aux_symbols(text):
72 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
73 | return text
74 |
75 |
76 | def replace_symbols(text, lang="en"):
77 | """Replace symbols based on the language tag.
78 |
79 | Args:
80 | text:
81 | Input text.
82 | lang:
83 | Language identifier. ex: "en", "fr", "pt", "ca".
84 |
85 | Returns:
86 | The modified text
87 | example:
88 | input args:
89 | text: "si l'avi cau, diguem-ho"
90 | lang: "ca"
91 | Output:
92 | text: "si lavi cau, diguemho"
93 | """
94 | text = text.replace(";", ",")
95 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
96 | text = text.replace(":", ",")
97 | if lang == "en":
98 | text = text.replace("&", " and ")
99 | elif lang == "fr":
100 | text = text.replace("&", " et ")
101 | elif lang == "pt":
102 | text = text.replace("&", " e ")
103 | elif lang == "ca":
104 | text = text.replace("&", " i ")
105 | text = text.replace("'", "")
106 | elif lang== "es":
107 | text=text.replace("&","y")
108 | text = text.replace("'", "")
109 | return text
110 |
111 | def french_cleaners(text):
112 | """Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
113 | text = expand_abbreviations(text, lang="fr")
114 | # text = lowercase(text) # as we use the cased bert
115 | text = replace_punctuation(text)
116 | text = replace_symbols(text, lang="fr")
117 | text = remove_aux_symbols(text)
118 | text = remove_punctuation_at_begin(text)
119 | text = collapse_whitespace(text)
120 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
121 | return text
122 |
123 |
--------------------------------------------------------------------------------
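Note on melo/text/fr_phonemizer/cleaner.py above: `french_cleaners` chains abbreviation expansion, punctuation normalization, symbol replacement ("&" -> " et "), auxiliary-symbol removal, whitespace collapsing, and finally appends a period when the text does not already end in punctuation. A short sketch of the pipeline on a sample sentence, assuming the melo package is importable; the comment describes the expected behaviour rather than captured output:

    # Sketch: run the French cleaning pipeline on one sentence.
    from melo.text.fr_phonemizer.cleaner import french_cleaners

    raw = "M. Dupont & Mme Martin arrivent à 10h (environ)"
    print(french_cleaners(raw))
    # Expected: "M." and "Mme" expanded to "monsieur" / "Madame", "&" replaced
    # by "et", the parentheses dropped, and a trailing "." appended.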
/melo/text/fr_phonemizer/en_symbols.json:
--------------------------------------------------------------------------------
1 | {"symbols": [
2 | "_",
3 | ",",
4 | ".",
5 | "!",
6 | "?",
7 | "-",
8 | "~",
9 | "\u2026",
10 | "N",
11 | "Q",
12 | "a",
13 | "b",
14 | "d",
15 | "e",
16 | "f",
17 | "g",
18 | "h",
19 | "i",
20 | "j",
21 | "k",
22 | "l",
23 | "m",
24 | "n",
25 | "o",
26 | "p",
27 | "s",
28 | "t",
29 | "u",
30 | "v",
31 | "w",
32 | "x",
33 | "y",
34 | "z",
35 | "\u0251",
36 | "\u00e6",
37 | "\u0283",
38 | "\u0291",
39 | "\u00e7",
40 | "\u026f",
41 | "\u026a",
42 | "\u0254",
43 | "\u025b",
44 | "\u0279",
45 | "\u00f0",
46 | "\u0259",
47 | "\u026b",
48 | "\u0265",
49 | "\u0278",
50 | "\u028a",
51 | "\u027e",
52 | "\u0292",
53 | "\u03b8",
54 | "\u03b2",
55 | "\u014b",
56 | "\u0266",
57 | "\u207c",
58 | "\u02b0",
59 | "`",
60 | "^",
61 | "#",
62 | "*",
63 | "=",
64 | "\u02c8",
65 | "\u02cc",
66 | "\u2192",
67 | "\u2193",
68 | "\u2191",
69 | " ",
70 | "ɣ",
71 | "ɡ",
72 | "r",
73 | "ɲ",
74 | "ʝ",
75 | "ʎ",
76 | "ː"
77 | ]
78 | }
--------------------------------------------------------------------------------
/melo/text/fr_phonemizer/fr_symbols.json:
--------------------------------------------------------------------------------
1 | {
2 | "symbols": [
3 | "_",
4 | ",",
5 | ".",
6 | "!",
7 | "?",
8 | "-",
9 | "~",
10 | "\u2026",
11 | "N",
12 | "Q",
13 | "a",
14 | "b",
15 | "d",
16 | "e",
17 | "f",
18 | "g",
19 | "h",
20 | "i",
21 | "j",
22 | "k",
23 | "l",
24 | "m",
25 | "n",
26 | "o",
27 | "p",
28 | "s",
29 | "t",
30 | "u",
31 | "v",
32 | "w",
33 | "x",
34 | "y",
35 | "z",
36 | "\u0251",
37 | "\u00e6",
38 | "\u0283",
39 | "\u0291",
40 | "\u00e7",
41 | "\u026f",
42 | "\u026a",
43 | "\u0254",
44 | "\u025b",
45 | "\u0279",
46 | "\u00f0",
47 | "\u0259",
48 | "\u026b",
49 | "\u0265",
50 | "\u0278",
51 | "\u028a",
52 | "\u027e",
53 | "\u0292",
54 | "\u03b8",
55 | "\u03b2",
56 | "\u014b",
57 | "\u0266",
58 | "\u207c",
59 | "\u02b0",
60 | "`",
61 | "^",
62 | "#",
63 | "*",
64 | "=",
65 | "\u02c8",
66 | "\u02cc",
67 | "\u2192",
68 | "\u2193",
69 | "\u2191",
70 | " ",
71 | "\u0263",
72 | "\u0261",
73 | "r",
74 | "\u0272",
75 | "\u029d",
76 | "\u028e",
77 | "\u02d0",
78 |
79 | "\u0303",
80 | "\u0153",
81 | "\u00f8",
82 | "\u0281",
83 | "\u0252",
84 | "\u028c",
85 | "\u2014",
86 | "\u025c",
87 | "\u0250"
88 | ]
89 | }
--------------------------------------------------------------------------------
/melo/text/fr_phonemizer/fr_to_ipa.py:
--------------------------------------------------------------------------------
1 | from .cleaner import french_cleaners
2 | from .gruut_wrapper import Gruut
3 |
4 |
5 | def remove_consecutive_t(input_str):
6 | result = []
7 | count = 0
8 |
9 | for char in input_str:
10 | if char == 't':
11 | count += 1
12 | else:
13 | if count < 3:
14 | result.extend(['t'] * count)
15 | count = 0
16 | result.append(char)
17 |
18 | if count < 3:
19 | result.extend(['t'] * count)
20 |
21 | return ''.join(result)
22 |
23 | def fr2ipa(text):
24 | e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)
25 | # text = french_cleaners(text)
26 | phonemes = e.phonemize(text, separator="")
27 | # print(phonemes)
28 | phonemes = remove_consecutive_t(phonemes)
29 | # print(phonemes)
30 | return phonemes
--------------------------------------------------------------------------------
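Note on melo/text/fr_phonemizer/fr_to_ipa.py above: `remove_consecutive_t` drops runs of three or more consecutive "t" characters from the phonemized string and keeps shorter runs, presumably to clean up occasional artefacts in the espeak/gruut output. A small sketch of just that helper's behaviour on hand-made strings (importing it pulls in the gruut dependency):

    # Sketch: remove_consecutive_t keeps short 't' runs and drops long ones.
    from melo.text.fr_phonemizer.fr_to_ipa import remove_consecutive_t

    print(remove_consecutive_t("atto"))    # "atto" (run of two 't' kept)
    print(remove_consecutive_t("atttto"))  # "ao"   (run of four 't' dropped)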
/melo/text/fr_phonemizer/french_abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in french:
4 | abbreviations_fr = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("M", "monsieur"),
8 | ("Mlle", "mademoiselle"),
9 | ("Mlles", "mesdemoiselles"),
10 | ("Mme", "Madame"),
11 | ("Mmes", "Mesdames"),
12 | ("N.B", "nota bene"),
13 | ("M", "monsieur"),
14 | ("p.c.q", "parce que"),
15 | ("Pr", "professeur"),
16 | ("qqch", "quelque chose"),
17 | ("rdv", "rendez-vous"),
18 | ("max", "maximum"),
19 | ("min", "minimum"),
20 | ("no", "numéro"),
21 | ("adr", "adresse"),
22 | ("dr", "docteur"),
23 | ("st", "saint"),
24 | ("co", "compagnie"),
25 | ("jr", "junior"),
26 | ("sgt", "sergent"),
27 | ("capt", "capitaine"),
28 | ("col", "colonel"),
29 | ("av", "avenue"),
30 | ("av. J.-C", "avant Jésus-Christ"),
31 | ("apr. J.-C", "après Jésus-Christ"),
32 | ("art", "article"),
33 | ("boul", "boulevard"),
34 | ("c.-à-d", "c’est-à-dire"),
35 | ("etc", "et cetera"),
36 | ("ex", "exemple"),
37 | ("excl", "exclusivement"),
38 | ("boul", "boulevard"),
39 | ]
40 | ] + [
41 | (re.compile("\\b%s" % x[0]), x[1])
42 | for x in [
43 | ("Mlle", "mademoiselle"),
44 | ("Mlles", "mesdemoiselles"),
45 | ("Mme", "Madame"),
46 | ("Mmes", "Mesdames"),
47 | ]
48 | ]
--------------------------------------------------------------------------------
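Note on melo/text/fr_phonemizer/french_abbreviations.py above: `abbreviations_fr` is a list of precompiled (regex, replacement) pairs; the first group only fires when the abbreviation is followed by a period, while the second also matches the bare form (Mlle/Mme and their plurals). A minimal sketch of applying the table directly, which is the same loop `expand_abbreviations` runs in cleaner.py:

    # Sketch: expand French abbreviations with the (regex, replacement) pairs.
    from melo.text.fr_phonemizer.french_abbreviations import abbreviations_fr

    def expand(text):
        for regex, replacement in abbreviations_fr:
            text = regex.sub(replacement, text)
        return text

    print(expand("M. Dupont a rdv. avec Mme Martin chez le Dr. Lefèvre."))
    # Expected: "monsieur Dupont a rendez-vous avec Madame Martin chez le docteur Lefèvre."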
/melo/text/fr_phonemizer/french_symbols.txt:
--------------------------------------------------------------------------------
1 | _,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɣɡrɲʝʎː̃œøʁɒʌ—ɜɐ
--------------------------------------------------------------------------------
/melo/text/fr_phonemizer/gruut_wrapper.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from typing import List
3 |
4 | import gruut
5 | from gruut_ipa import IPA # pip install gruut_ipa
6 |
7 | from .base import BasePhonemizer
8 | from .punctuation import Punctuation
9 |
10 | # Table for str.translate to fix gruut/TTS phoneme mismatch
11 | GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
12 |
13 |
14 | class Gruut(BasePhonemizer):
15 | """Gruut wrapper for G2P
16 |
17 | Args:
18 | language (str):
19 | Valid language code for the used backend.
20 |
21 | punctuations (str):
22 | Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.
23 |
24 | keep_puncs (bool):
25 | If true, keep the punctuations after phonemization. Defaults to True.
26 |
27 | use_espeak_phonemes (bool):
28 | If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.
29 |
30 | keep_stress (bool):
31 | If true, keep the stress characters after phonemization. Defaults to False.
32 |
33 | Example:
34 |
35 | >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
36 | >>> phonemizer = Gruut('en-us')
37 | >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
38 | 'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
39 | """
40 |
41 | def __init__(
42 | self,
43 | language: str,
44 | punctuations=Punctuation.default_puncs(),
45 | keep_puncs=True,
46 | use_espeak_phonemes=False,
47 | keep_stress=False,
48 | ):
49 | super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
50 | self.use_espeak_phonemes = use_espeak_phonemes
51 | self.keep_stress = keep_stress
52 |
53 | @staticmethod
54 | def name():
55 | return "gruut"
56 |
57 | def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument
58 | """Convert input text to phonemes.
59 |
60 | Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters
61 | that constitute a single sound.
62 |
63 | It doesn't affect 🐸TTS since it individually converts each character to token IDs.
64 |
65 | Examples::
66 | "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`
67 |
68 | Args:
69 | text (str):
70 | Text to be converted to phonemes.
71 |
72 | tie (bool, optional) : When True use a '͡' character between
73 | consecutive characters of a single phoneme. Else separate phoneme
74 | with '_'. This option requires espeak>=1.49. Default to False.
75 | """
76 | ph_list = []
77 | for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
78 | for word in sentence:
79 | if word.is_break:
80 | # Use actual character for break phoneme (e.g., comma)
81 | if ph_list:
82 | # Join with previous word
83 | ph_list[-1].append(word.text)
84 | else:
85 | # First word is punctuation
86 | ph_list.append([word.text])
87 | elif word.phonemes:
88 | # Add phonemes for word
89 | word_phonemes = []
90 |
91 | for word_phoneme in word.phonemes:
92 | if not self.keep_stress:
93 | # Remove primary/secondary stress
94 | word_phoneme = IPA.without_stress(word_phoneme)
95 |
96 | word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)
97 |
98 | if word_phoneme:
99 | # Flatten phonemes
100 | word_phonemes.extend(word_phoneme)
101 |
102 | if word_phonemes:
103 | ph_list.append(word_phonemes)
104 |
105 | ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
106 | ph = f"{separator} ".join(ph_words)
107 | return ph
108 |
109 | def _phonemize(self, text, separator):
110 | return self.phonemize_gruut(text, separator, tie=False)
111 |
112 | def is_supported_language(self, language):
113 | """Returns True if `language` is supported by the backend"""
114 | return gruut.is_language_supported(language)
115 |
116 | @staticmethod
117 | def supported_languages() -> List:
118 | """Get a list of supported languages.
119 |
120 | Returns:
121 | List: List of language codes.
122 | """
123 | return list(gruut.get_supported_languages())
124 |
125 | def version(self):
126 | """Get the version of the used backend.
127 |
128 | Returns:
129 | str: Version of the used backend.
130 | """
131 | return gruut.__version__
132 |
133 | @classmethod
134 | def is_available(cls):
135 | """Return True if gruut is available, else False"""
136 | return importlib.util.find_spec("gruut") is not None
137 |
138 |
139 | if __name__ == "__main__":
140 | from cleaner import french_cleaners
141 | import json
142 |
143 | e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)
144 | symbols = [ # en + sp
145 | "_",
146 | ",",
147 | ".",
148 | "!",
149 | "?",
150 | "-",
151 | "~",
152 | "\u2026",
153 | "N",
154 | "Q",
155 | "a",
156 | "b",
157 | "d",
158 | "e",
159 | "f",
160 | "g",
161 | "h",
162 | "i",
163 | "j",
164 | "k",
165 | "l",
166 | "m",
167 | "n",
168 | "o",
169 | "p",
170 | "s",
171 | "t",
172 | "u",
173 | "v",
174 | "w",
175 | "x",
176 | "y",
177 | "z",
178 | "\u0251",
179 | "\u00e6",
180 | "\u0283",
181 | "\u0291",
182 | "\u00e7",
183 | "\u026f",
184 | "\u026a",
185 | "\u0254",
186 | "\u025b",
187 | "\u0279",
188 | "\u00f0",
189 | "\u0259",
190 | "\u026b",
191 | "\u0265",
192 | "\u0278",
193 | "\u028a",
194 | "\u027e",
195 | "\u0292",
196 | "\u03b8",
197 | "\u03b2",
198 | "\u014b",
199 | "\u0266",
200 | "\u207c",
201 | "\u02b0",
202 | "`",
203 | "^",
204 | "#",
205 | "*",
206 | "=",
207 | "\u02c8",
208 | "\u02cc",
209 | "\u2192",
210 | "\u2193",
211 | "\u2191",
212 | " ",
213 | "ɣ",
214 | "ɡ",
215 | "r",
216 | "ɲ",
217 | "ʝ",
218 | "ʎ",
219 | "ː"
220 | ]
221 | with open('/home/xumin/workspace/VITS-Training-Multiling/230715_fr/metadata.txt', 'r') as f:
222 | lines = f.readlines()
223 |
224 |
225 | used_sym = []
226 | not_existed_sym = []
227 | phonemes = []
228 |
229 | for line in lines:
230 | text = line.split('|')[-1].strip()
231 | text = french_cleaners(text)
232 | ipa = e.phonemize(text, separator="")
233 | phonemes.append(ipa)
234 | for s in ipa:
235 | if s not in symbols:
236 | if s not in not_existed_sym:
237 | print(f'not_existed char: {s}')
238 | not_existed_sym.append(s)
239 | else:
240 | if s not in used_sym:
241 | # print(f'used char: {s}')
242 | used_sym.append(s)
243 |
244 | print(used_sym)
245 | print(not_existed_sym)
246 |
247 |
248 | with open('./text/fr_phonemizer/french_symbols.txt', 'w') as g:
249 | g.writelines(symbols + not_existed_sym)
250 |
251 | with open('./text/fr_phonemizer/example_ipa.txt', 'w') as g:
252 | g.writelines(phonemes)
253 |
254 | data = {'symbols': symbols + not_existed_sym}
255 |
256 | with open('./text/fr_phonemizer/fr_symbols.json', 'w') as f:
257 | json.dump(data, f, indent=4)
258 |
259 |
--------------------------------------------------------------------------------
/melo/text/fr_phonemizer/punctuation.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import re
3 | from enum import Enum
4 |
5 | import six
6 |
7 | _DEF_PUNCS = ';:,.!?¡¿—…"«»“”'
8 |
9 | _PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])
10 |
11 |
12 | class PuncPosition(Enum):
13 | """Enum for the punctuations positions"""
14 |
15 | BEGIN = 0
16 | END = 1
17 | MIDDLE = 2
18 | ALONE = 3
19 |
20 |
21 | class Punctuation:
22 | """Handle punctuations in text.
23 |
24 | Just strip punctuations from text or strip and restore them later.
25 |
26 | Args:
27 | puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.
28 |
29 | Example:
30 | >>> punc = Punctuation()
31 | >>> punc.strip("This is. example !")
32 | 'This is example'
33 |
34 | >>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
35 | >>> ' '.join(text_striped)
36 | 'This is example'
37 |
38 | >>> text_restored = punc.restore(text_striped, punc_map)
39 | >>> text_restored[0]
40 | 'This is. example !'
41 | """
42 |
43 | def __init__(self, puncs: str = _DEF_PUNCS):
44 | self.puncs = puncs
45 |
46 | @staticmethod
47 | def default_puncs():
48 | """Return default set of punctuations."""
49 | return _DEF_PUNCS
50 |
51 | @property
52 | def puncs(self):
53 | return self._puncs
54 |
55 | @puncs.setter
56 | def puncs(self, value):
57 | if not isinstance(value, six.string_types):
58 | raise ValueError("[!] Punctuations must be of type str.")
59 | self._puncs = "".join(list(dict.fromkeys(list(value)))) # remove duplicates without changing the order
60 | self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")
61 |
62 | def strip(self, text):
63 | """Remove all the punctuations by replacing with `space`.
64 |
65 | Args:
66 | text (str): The text to be processed.
67 |
68 | Example::
69 |
70 | "This is. example !" -> "This is example "
71 | """
72 | return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip()
73 |
74 | def strip_to_restore(self, text):
75 | """Remove punctuations from text to restore them later.
76 |
77 | Args:
78 | text (str): The text to be processed.
79 |
80 | Examples ::
81 |
82 | "This is. example !" -> [["This is", "example"], [".", "!"]]
83 |
84 | """
85 | text, puncs = self._strip_to_restore(text)
86 | return text, puncs
87 |
88 | def _strip_to_restore(self, text):
89 | """Auxiliary method for Punctuation.preserve()"""
90 | matches = list(re.finditer(self.puncs_regular_exp, text))
91 | if not matches:
92 | return [text], []
93 | # the text is only punctuations
94 | if len(matches) == 1 and matches[0].group() == text:
95 | return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
96 | # build a punctuation map to be used later to restore punctuations
97 | puncs = []
98 | for match in matches:
99 | position = PuncPosition.MIDDLE
100 | if match == matches[0] and text.startswith(match.group()):
101 | position = PuncPosition.BEGIN
102 | elif match == matches[-1] and text.endswith(match.group()):
103 | position = PuncPosition.END
104 | puncs.append(_PUNC_IDX(match.group(), position))
105 | # convert str text to a List[str], each item is separated by a punctuation
106 | splitted_text = []
107 | for idx, punc in enumerate(puncs):
108 | split = text.split(punc.punc)
109 | prefix, suffix = split[0], punc.punc.join(split[1:])
110 | splitted_text.append(prefix)
111 | # if the text does not end with a punctuation, add it to the last item
112 | if idx == len(puncs) - 1 and len(suffix) > 0:
113 | splitted_text.append(suffix)
114 | text = suffix
115 | return splitted_text, puncs
116 |
117 | @classmethod
118 | def restore(cls, text, puncs):
119 | """Restore punctuation in a text.
120 |
121 | Args:
122 | text (str): The text to be processed.
123 | puncs (List[str]): The list of punctuations map to be used for restoring.
124 |
125 | Examples ::
126 |
127 | ['This is', 'example'], ['.', '!'] -> "This is. example!"
128 |
129 | """
130 | return cls._restore(text, puncs, 0)
131 |
132 | @classmethod
133 | def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements
134 | """Auxiliary method for Punctuation.restore()"""
135 | if not puncs:
136 | return text
137 |
138 | # nothing has been phonemized, return the puncs alone
139 | if not text:
140 | return ["".join(m.punc for m in puncs)]
141 |
142 | current = puncs[0]
143 |
144 | if current.position == PuncPosition.BEGIN:
145 | return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)
146 |
147 | if current.position == PuncPosition.END:
148 | return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)
149 |
150 | if current.position == PuncPosition.ALONE:
151 | return [current.punc] + cls._restore(text, puncs[1:], num + 1)
152 |
153 | # POSITION == MIDDLE
154 | if len(text) == 1: # pragma: nocover
155 | # a corner case where the final part of an intermediate
156 | # mark (I) has not been phonemized
157 | return cls._restore([text[0] + current.punc], puncs[1:], num)
158 |
159 | return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
160 |
161 |
162 | # if __name__ == "__main__":
163 | # punc = Punctuation()
164 | # text = "This is. This is, example!"
165 |
166 | # print(punc.strip(text))
167 |
168 | # split_text, puncs = punc.strip_to_restore(text)
169 | # print(split_text, " ---- ", puncs)
170 |
171 | # restored_text = punc.restore(split_text, puncs)
172 | # print(restored_text)
--------------------------------------------------------------------------------
/melo/text/french.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import re
4 |
5 | from . import symbols
6 | from .fr_phonemizer import cleaner as fr_cleaner
7 | from .fr_phonemizer import fr_to_ipa
8 | from transformers import AutoTokenizer
9 |
10 |
11 | def distribute_phone(n_phone, n_word):
12 | phones_per_word = [0] * n_word
13 | for task in range(n_phone):
14 | min_tasks = min(phones_per_word)
15 | min_index = phones_per_word.index(min_tasks)
16 | phones_per_word[min_index] += 1
17 | return phones_per_word
18 |
19 | def text_normalize(text):
20 | text = fr_cleaner.french_cleaners(text)
21 | return text
22 |
23 | model_id = 'dbmdz/bert-base-french-europeana-cased'
24 | tokenizer = AutoTokenizer.from_pretrained(model_id)
25 |
26 | def g2p(text, pad_start_end=True, tokenized=None):
27 | if tokenized is None:
28 | tokenized = tokenizer.tokenize(text)
29 | # import pdb; pdb.set_trace()
30 | phs = []
31 | ph_groups = []
32 | for t in tokenized:
33 | if not t.startswith("#"):
34 | ph_groups.append([t])
35 | else:
36 | ph_groups[-1].append(t.replace("#", ""))
37 |
38 | phones = []
39 | tones = []
40 | word2ph = []
41 | # print(ph_groups)
42 | for group in ph_groups:
43 | w = "".join(group)
44 | phone_len = 0
45 | word_len = len(group)
46 | if w == '[UNK]':
47 | phone_list = ['UNK']
48 | else:
49 | phone_list = list(filter(lambda p: p != " ", fr_to_ipa.fr2ipa(w)))
50 |
51 | for ph in phone_list:
52 | phones.append(ph)
53 | tones.append(0)
54 | phone_len += 1
55 | aaa = distribute_phone(phone_len, word_len)
56 | word2ph += aaa
57 | # print(phone_list, aaa)
58 | # print('=' * 10)
59 |
60 | if pad_start_end:
61 | phones = ["_"] + phones + ["_"]
62 | tones = [0] + tones + [0]
63 | word2ph = [1] + word2ph + [1]
64 | return phones, tones, word2ph
65 |
66 | def get_bert_feature(text, word2ph, device=None):
67 | from text import french_bert
68 | return french_bert.get_bert_feature(text, word2ph, device=device)
69 |
70 | if __name__ == "__main__":
71 | ori_text = 'Ce service gratuit est“”"" 【disponible》 en chinois 【simplifié] et autres 123'
72 | # ori_text = "Ils essayaient vainement de faire comprendre à ma mère qu'avec les cent mille francs que m'avait laissé mon père,"
73 | # print(ori_text)
74 | text = text_normalize(ori_text)
75 | print(text)
76 | phoneme = fr_to_ipa.fr2ipa(text)
77 | print(phoneme)
78 |
79 |
80 | from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
81 | from text.cleaner_multiling import unicleaners
82 |
83 | def text_normalize(text):
84 | text = unicleaners(text, cased=True, lang='fr')
85 | return text
86 |
87 | # print(ori_text)
88 | text = text_normalize(ori_text)
89 | print(text)
90 | phonemizer = MultiPhonemizer({"fr-fr": "espeak"})
91 | # phonemizer.lang_to_phonemizer['fr'].keep_stress = True
92 | # phonemizer.lang_to_phonemizer['fr'].use_espeak_phonemes = True
93 | phoneme = phonemizer.phonemize(text, separator="", language='fr-fr')
94 | print(phoneme)
--------------------------------------------------------------------------------
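Note on melo/text/french.py above: `g2p` regroups BERT sub-tokens into words, phonemizes each word with `fr2ipa`, and uses `distribute_phone` to spread the word's phone count as evenly as possible over its sub-tokens; the resulting `word2ph` list is what later aligns BERT token features with phones in `french_bert.get_bert_feature`. A standalone sketch of just the distribution step (same logic as the function above):

    # Sketch: how distribute_phone spreads phones across sub-tokens.
    # 7 phones over a word split into 3 BERT sub-tokens -> [3, 2, 2], so
    # word2ph records how many phones each sub-token "owns".
    def distribute_phone(n_phone, n_word):
        phones_per_word = [0] * n_word
        for _ in range(n_phone):
            min_index = phones_per_word.index(min(phones_per_word))
            phones_per_word[min_index] += 1
        return phones_per_word

    print(distribute_phone(7, 3))  # [3, 2, 2]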
/melo/text/french_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 | model_id = 'dbmdz/bert-base-french-europeana-cased'
6 | tokenizer = AutoTokenizer.from_pretrained(model_id)
7 | model = None
8 |
9 | def get_bert_feature(text, word2ph, device=None):
10 | global model
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if model is None:
20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to(
21 | device
22 | )
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = model(**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |
30 | assert inputs["input_ids"].shape[-1] == len(word2ph)
31 | word2phone = word2ph
32 | phone_level_feature = []
33 | for i in range(len(word2phone)):
34 | repeat_feature = res[i].repeat(word2phone[i], 1)
35 | phone_level_feature.append(repeat_feature)
36 |
37 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
38 |
39 | return phone_level_feature.T
40 |
--------------------------------------------------------------------------------
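Note on melo/text/french_bert.py above: `get_bert_feature` takes the third-from-last hidden layer and repeats each token's vector `word2ph[i]` times, so the returned matrix has one column per phone. A minimal sketch of that expansion step alone, using dummy tensors in place of the BERT hidden states:

    # Sketch: the word2ph expansion from get_bert_feature, with dummy features.
    import torch

    res = torch.randn(4, 8)   # pretend hidden states: 4 tokens, hidden dim 8
    word2ph = [1, 3, 2, 1]    # phones owned by each token (7 phones in total)

    phone_level_feature = torch.cat(
        [res[i].repeat(word2ph[i], 1) for i in range(len(word2ph))], dim=0
    )
    print(phone_level_feature.T.shape)  # torch.Size([8, 7]) -> (hidden_dim, n_phones)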
/melo/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 |
6 | models = {}
7 | tokenizers = {}
8 | def get_bert_feature(text, word2ph, device=None, model_id='tohoku-nlp/bert-base-japanese-v3'):
9 | global model
10 | global tokenizer
11 |
12 | if (
13 | sys.platform == "darwin"
14 | and torch.backends.mps.is_available()
15 | and device == "cpu"
16 | ):
17 | device = "mps"
18 | if not device:
19 | device = "cuda"
20 | if model_id not in models:
21 | model = AutoModelForMaskedLM.from_pretrained(model_id).to(
22 | device
23 | )
24 | models[model_id] = model
25 | tokenizer = AutoTokenizer.from_pretrained(model_id)
26 | tokenizers[model_id] = tokenizer
27 | else:
28 | model = models[model_id]
29 | tokenizer = tokenizers[model_id]
30 |
31 |
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | tokenized = tokenizer.tokenize(text)
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = model(**inputs, output_hidden_states=True)
38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
39 |
40 | assert inputs["input_ids"].shape[-1] == len(word2ph), f"{inputs['input_ids'].shape[-1]}/{len(word2ph)}"
41 | word2phone = word2ph
42 | phone_level_feature = []
43 | for i in range(len(word2phone)):
44 | repeat_feature = res[i].repeat(word2phone[i], 1)
45 | phone_level_feature.append(repeat_feature)
46 |
47 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
48 |
49 | return phone_level_feature.T
50 |
--------------------------------------------------------------------------------
/melo/text/ko_dictionary.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Add the word you want to the dictionary.
3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
4 |
5 |
6 | english_dictionary = {
7 | "KOREA": "코리아",
8 | "IDOL": "아이돌",
9 | "IT": "아이티",
10 | "IQ": "아이큐",
11 | "UP": "업",
12 | "DOWN": "다운",
13 | "PC": "피씨",
14 | "CCTV": "씨씨티비",
15 | "SNS": "에스엔에스",
16 | "AI": "에이아이",
17 | "CEO": "씨이오",
18 | "A": "에이",
19 | "B": "비",
20 | "C": "씨",
21 | "D": "디",
22 | "E": "이",
23 | "F": "에프",
24 | "G": "지",
25 | "H": "에이치",
26 | "I": "아이",
27 | "J": "제이",
28 | "K": "케이",
29 | "L": "엘",
30 | "M": "엠",
31 | "N": "엔",
32 | "O": "오",
33 | "P": "피",
34 | "Q": "큐",
35 | "R": "알",
36 | "S": "에스",
37 | "T": "티",
38 | "U": "유",
39 | "V": "브이",
40 | "W": "더블유",
41 | "X": "엑스",
42 | "Y": "와이",
43 | "Z": "제트",
44 | }
45 |
--------------------------------------------------------------------------------
/melo/text/korean.py:
--------------------------------------------------------------------------------
1 | # Convert Korean text to phonemes.
2 | # Adapted from the Japanese phonemizer (which targets Julius: https://github.com/julius-speech/segmentation-kit).
3 | import re
4 | import unicodedata
5 |
6 | from transformers import AutoTokenizer
7 |
8 | from . import punctuation, symbols
9 |
10 |
11 | from num2words import num2words
12 | from melo.text.ko_dictionary import english_dictionary, etc_dictionary
13 | from anyascii import anyascii
14 | from jamo import hangul_to_jamo
15 |
16 | def normalize(text):
17 | text = text.strip()
18 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
19 | text = normalize_with_dictionary(text, etc_dictionary)
20 | text = normalize_english(text)
21 | text = text.lower()
22 | return text
23 |
24 |
25 | def normalize_with_dictionary(text, dic):
26 | if any(key in text for key in dic.keys()):
27 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
28 | return pattern.sub(lambda x: dic[x.group()], text)
29 | return text
30 |
31 |
32 | def normalize_english(text):
33 | def fn(m):
34 | word = m.group()
35 | if word in english_dictionary:
36 | return english_dictionary.get(word)
37 | return word
38 |
39 | text = re.sub("([A-Za-z]+)", fn, text)
40 | return text
41 |
42 |
43 | g2p_kr = None
44 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
45 | """
46 |
47 | The input and output values look the same, but they are different in Unicode.
48 |
49 | example :
50 |
51 | input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
52 | output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
53 |
54 | """
55 | global g2p_kr # pylint: disable=global-statement
56 | if g2p_kr is None:
57 | from g2pkk import G2p
58 |
59 | g2p_kr = G2p()
60 |
61 | if character == "english":
62 | from anyascii import anyascii
63 | text = normalize(text)
64 | text = g2p_kr(text)
65 | text = anyascii(text)
66 | return text
67 |
68 | text = normalize(text)
69 | text = g2p_kr(text)
70 | text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
71 | return "".join(text)
72 |
73 | def text_normalize(text):
74 | # res = unicodedata.normalize("NFKC", text)
75 | # res = japanese_convert_numbers_to_words(res)
76 | # # res = "".join([i for i in res if is_japanese_character(i)])
77 | # res = replace_punctuation(res)
78 | text = normalize(text)
79 | return text
80 |
81 |
82 | def distribute_phone(n_phone, n_word):
83 | phones_per_word = [0] * n_word
84 | for task in range(n_phone):
85 | min_tasks = min(phones_per_word)
86 | min_index = phones_per_word.index(min_tasks)
87 | phones_per_word[min_index] += 1
88 | return phones_per_word
89 |
90 |
91 |
92 | # tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
93 |
94 | model_id = 'kykim/bert-kor-base'
95 | tokenizer = AutoTokenizer.from_pretrained(model_id)
96 |
97 | def g2p(norm_text):
98 | tokenized = tokenizer.tokenize(norm_text)
99 | phs = []
100 | ph_groups = []
101 | for t in tokenized:
102 | if not t.startswith("#"):
103 | ph_groups.append([t])
104 | else:
105 | ph_groups[-1].append(t.replace("#", ""))
106 | word2ph = []
107 | for group in ph_groups:
108 | text = ""
109 | for ch in group:
110 | text += ch
111 | if text == '[UNK]':
112 | phs += ['_']
113 | word2ph += [1]
114 | continue
115 | elif text in punctuation:
116 | phs += [text]
117 | word2ph += [1]
118 | continue
119 | # import pdb; pdb.set_trace()
120 | # phonemes = japanese_text_to_phonemes(text)
121 | # text = g2p_kr(text)
122 | phonemes = korean_text_to_phonemes(text)
123 | # import pdb; pdb.set_trace()
124 | # # phonemes = [i for i in phonemes if i in symbols]
125 | # for i in phonemes:
126 | # assert i in symbols, (group, norm_text, tokenized, i)
127 | phone_len = len(phonemes)
128 | word_len = len(group)
129 |
130 | aaa = distribute_phone(phone_len, word_len)
131 | assert len(aaa) == word_len
132 | word2ph += aaa
133 |
134 | phs += phonemes
135 | phones = ["_"] + phs + ["_"]
136 | tones = [0 for i in phones]
137 | word2ph = [1] + word2ph + [1]
138 | assert len(word2ph) == len(tokenized) + 2
139 | return phones, tones, word2ph
140 |
141 | def get_bert_feature(text, word2ph, device='cuda'):
142 | from . import japanese_bert
143 | return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id)
144 |
145 |
146 | if __name__ == "__main__":
147 | # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
148 | from text.symbols import symbols
149 | text = "전 제 일의 가치와 폰타인 대중들이 한 일의 의미를 잘 압니다. 앞으로도 전 제 일에 자부심을 갖고 살아갈 겁니다"
150 | import json
151 |
152 | # genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json'))
153 | genshin_data = json.load(open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json'))
154 | from tqdm import tqdm
155 | new_symbols = []
156 | for key, item in tqdm(genshin_data.items()):
157 | texts = item.get('voiceContent', '')
158 | if isinstance(texts, list):
159 | texts = ','.join(texts)
160 | if texts is None:
161 | continue
162 | if len(texts) == 0:
163 | continue
164 |
165 | text = text_normalize(texts)
166 | phones, tones, word2ph = g2p(text)
167 | bert = get_bert_feature(text, word2ph)
168 | import pdb; pdb.set_trace()
169 | for ph in phones:
170 | if ph not in symbols and ph not in new_symbols:
171 | new_symbols.append(ph)
172 | print('update!, now symbols:')
173 | print(new_symbols)
174 | with open('korean_symbol.txt', 'w') as f:
175 | f.write(f'{new_symbols}')
176 |
177 |
178 |
179 | # if __name__ == '__main__':
180 | # from pykakasi import kakasi
181 | # # Initialize kakasi object
182 | # kakasi = kakasi()
183 |
184 | # # Set options for converting Chinese characters to Katakana
185 | # kakasi.setMode("J", "H") # Chinese to Katakana
186 | # kakasi.setMode("K", "H") # Hiragana to Katakana
187 |
188 | # # Convert Chinese characters to Katakana
189 | # conv = kakasi.getConverter()
190 | # katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?') # Replace with your Chinese text
191 |
192 | # print(katakana_text) # Output: ニーハオセカイ
--------------------------------------------------------------------------------
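Note on melo/text/korean.py above: `korean_text_to_phonemes` runs the g2pkk G2P on normalized text and then decomposes the result into individual jamo, which serve as the phone symbols; `g2p` then aligns those phones to BERT sub-tokens with `distribute_phone`, mirroring the French pipeline. A short sketch of the jamo decomposition step the function relies on (requires the `jamo` package; the g2pkk step is skipped here):

    # Sketch: hangul_to_jamo splits composed syllables into leading/vowel/trailing
    # jamo, which korean_text_to_phonemes uses as its phone inventory.
    from jamo import hangul_to_jamo

    print(list(hangul_to_jamo("하늘")))  # ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']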
/melo/text/opencpop-strict.txt:
--------------------------------------------------------------------------------
1 | a AA a
2 | ai AA ai
3 | an AA an
4 | ang AA ang
5 | ao AA ao
6 | ba b a
7 | bai b ai
8 | ban b an
9 | bang b ang
10 | bao b ao
11 | bei b ei
12 | ben b en
13 | beng b eng
14 | bi b i
15 | bian b ian
16 | biao b iao
17 | bie b ie
18 | bin b in
19 | bing b ing
20 | bo b o
21 | bu b u
22 | ca c a
23 | cai c ai
24 | can c an
25 | cang c ang
26 | cao c ao
27 | ce c e
28 | cei c ei
29 | cen c en
30 | ceng c eng
31 | cha ch a
32 | chai ch ai
33 | chan ch an
34 | chang ch ang
35 | chao ch ao
36 | che ch e
37 | chen ch en
38 | cheng ch eng
39 | chi ch ir
40 | chong ch ong
41 | chou ch ou
42 | chu ch u
43 | chua ch ua
44 | chuai ch uai
45 | chuan ch uan
46 | chuang ch uang
47 | chui ch ui
48 | chun ch un
49 | chuo ch uo
50 | ci c i0
51 | cong c ong
52 | cou c ou
53 | cu c u
54 | cuan c uan
55 | cui c ui
56 | cun c un
57 | cuo c uo
58 | da d a
59 | dai d ai
60 | dan d an
61 | dang d ang
62 | dao d ao
63 | de d e
64 | dei d ei
65 | den d en
66 | deng d eng
67 | di d i
68 | dia d ia
69 | dian d ian
70 | diao d iao
71 | die d ie
72 | ding d ing
73 | diu d iu
74 | dong d ong
75 | dou d ou
76 | du d u
77 | duan d uan
78 | dui d ui
79 | dun d un
80 | duo d uo
81 | e EE e
82 | ei EE ei
83 | en EE en
84 | eng EE eng
85 | er EE er
86 | fa f a
87 | fan f an
88 | fang f ang
89 | fei f ei
90 | fen f en
91 | feng f eng
92 | fo f o
93 | fou f ou
94 | fu f u
95 | ga g a
96 | gai g ai
97 | gan g an
98 | gang g ang
99 | gao g ao
100 | ge g e
101 | gei g ei
102 | gen g en
103 | geng g eng
104 | gong g ong
105 | gou g ou
106 | gu g u
107 | gua g ua
108 | guai g uai
109 | guan g uan
110 | guang g uang
111 | gui g ui
112 | gun g un
113 | guo g uo
114 | ha h a
115 | hai h ai
116 | han h an
117 | hang h ang
118 | hao h ao
119 | he h e
120 | hei h ei
121 | hen h en
122 | heng h eng
123 | hong h ong
124 | hou h ou
125 | hu h u
126 | hua h ua
127 | huai h uai
128 | huan h uan
129 | huang h uang
130 | hui h ui
131 | hun h un
132 | huo h uo
133 | ji j i
134 | jia j ia
135 | jian j ian
136 | jiang j iang
137 | jiao j iao
138 | jie j ie
139 | jin j in
140 | jing j ing
141 | jiong j iong
142 | jiu j iu
143 | ju j v
144 | jv j v
145 | juan j van
146 | jvan j van
147 | jue j ve
148 | jve j ve
149 | jun j vn
150 | jvn j vn
151 | ka k a
152 | kai k ai
153 | kan k an
154 | kang k ang
155 | kao k ao
156 | ke k e
157 | kei k ei
158 | ken k en
159 | keng k eng
160 | kong k ong
161 | kou k ou
162 | ku k u
163 | kua k ua
164 | kuai k uai
165 | kuan k uan
166 | kuang k uang
167 | kui k ui
168 | kun k un
169 | kuo k uo
170 | la l a
171 | lai l ai
172 | lan l an
173 | lang l ang
174 | lao l ao
175 | le l e
176 | lei l ei
177 | leng l eng
178 | li l i
179 | lia l ia
180 | lian l ian
181 | liang l iang
182 | liao l iao
183 | lie l ie
184 | lin l in
185 | ling l ing
186 | liu l iu
187 | lo l o
188 | long l ong
189 | lou l ou
190 | lu l u
191 | luan l uan
192 | lun l un
193 | luo l uo
194 | lv l v
195 | lve l ve
196 | ma m a
197 | mai m ai
198 | man m an
199 | mang m ang
200 | mao m ao
201 | me m e
202 | mei m ei
203 | men m en
204 | meng m eng
205 | mi m i
206 | mian m ian
207 | miao m iao
208 | mie m ie
209 | min m in
210 | ming m ing
211 | miu m iu
212 | mo m o
213 | mou m ou
214 | mu m u
215 | na n a
216 | nai n ai
217 | nan n an
218 | nang n ang
219 | nao n ao
220 | ne n e
221 | nei n ei
222 | nen n en
223 | neng n eng
224 | ni n i
225 | nian n ian
226 | niang n iang
227 | niao n iao
228 | nie n ie
229 | nin n in
230 | ning n ing
231 | niu n iu
232 | nong n ong
233 | nou n ou
234 | nu n u
235 | nuan n uan
236 | nun n un
237 | nuo n uo
238 | nv n v
239 | nve n ve
240 | o OO o
241 | ou OO ou
242 | pa p a
243 | pai p ai
244 | pan p an
245 | pang p ang
246 | pao p ao
247 | pei p ei
248 | pen p en
249 | peng p eng
250 | pi p i
251 | pian p ian
252 | piao p iao
253 | pie p ie
254 | pin p in
255 | ping p ing
256 | po p o
257 | pou p ou
258 | pu p u
259 | qi q i
260 | qia q ia
261 | qian q ian
262 | qiang q iang
263 | qiao q iao
264 | qie q ie
265 | qin q in
266 | qing q ing
267 | qiong q iong
268 | qiu q iu
269 | qu q v
270 | qv q v
271 | quan q van
272 | qvan q van
273 | que q ve
274 | qve q ve
275 | qun q vn
276 | qvn q vn
277 | ran r an
278 | rang r ang
279 | rao r ao
280 | re r e
281 | ren r en
282 | reng r eng
283 | ri r ir
284 | rong r ong
285 | rou r ou
286 | ru r u
287 | rua r ua
288 | ruan r uan
289 | rui r ui
290 | run r un
291 | ruo r uo
292 | sa s a
293 | sai s ai
294 | san s an
295 | sang s ang
296 | sao s ao
297 | se s e
298 | sen s en
299 | seng s eng
300 | sha sh a
301 | shai sh ai
302 | shan sh an
303 | shang sh ang
304 | shao sh ao
305 | she sh e
306 | shei sh ei
307 | shen sh en
308 | sheng sh eng
309 | shi sh ir
310 | shou sh ou
311 | shu sh u
312 | shua sh ua
313 | shuai sh uai
314 | shuan sh uan
315 | shuang sh uang
316 | shui sh ui
317 | shun sh un
318 | shuo sh uo
319 | si s i0
320 | song s ong
321 | sou s ou
322 | su s u
323 | suan s uan
324 | sui s ui
325 | sun s un
326 | suo s uo
327 | ta t a
328 | tai t ai
329 | tan t an
330 | tang t ang
331 | tao t ao
332 | te t e
333 | tei t ei
334 | teng t eng
335 | ti t i
336 | tian t ian
337 | tiao t iao
338 | tie t ie
339 | ting t ing
340 | tong t ong
341 | tou t ou
342 | tu t u
343 | tuan t uan
344 | tui t ui
345 | tun t un
346 | tuo t uo
347 | wa w a
348 | wai w ai
349 | wan w an
350 | wang w ang
351 | wei w ei
352 | wen w en
353 | weng w eng
354 | wo w o
355 | wu w u
356 | xi x i
357 | xia x ia
358 | xian x ian
359 | xiang x iang
360 | xiao x iao
361 | xie x ie
362 | xin x in
363 | xing x ing
364 | xiong x iong
365 | xiu x iu
366 | xu x v
367 | xv x v
368 | xuan x van
369 | xvan x van
370 | xue x ve
371 | xve x ve
372 | xun x vn
373 | xvn x vn
374 | ya y a
375 | yan y En
376 | yang y ang
377 | yao y ao
378 | ye y E
379 | yi y i
380 | yin y in
381 | ying y ing
382 | yo y o
383 | yong y ong
384 | you y ou
385 | yu y v
386 | yv y v
387 | yuan y van
388 | yvan y van
389 | yue y ve
390 | yve y ve
391 | yun y vn
392 | yvn y vn
393 | za z a
394 | zai z ai
395 | zan z an
396 | zang z ang
397 | zao z ao
398 | ze z e
399 | zei z ei
400 | zen z en
401 | zeng z eng
402 | zha zh a
403 | zhai zh ai
404 | zhan zh an
405 | zhang zh ang
406 | zhao zh ao
407 | zhe zh e
408 | zhei zh ei
409 | zhen zh en
410 | zheng zh eng
411 | zhi zh ir
412 | zhong zh ong
413 | zhou zh ou
414 | zhu zh u
415 | zhua zh ua
416 | zhuai zh uai
417 | zhuan zh uan
418 | zhuang zh uang
419 | zhui zh ui
420 | zhun zh un
421 | zhuo zh uo
422 | zi z i0
423 | zong z ong
424 | zou z ou
425 | zu z u
426 | zuan z uan
427 | zui z ui
428 | zun z un
429 | zuo z uo
430 |
--------------------------------------------------------------------------------
/melo/text/spanish.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import re
4 |
5 | from . import symbols
6 | from .es_phonemizer import cleaner as es_cleaner
7 | from .es_phonemizer import es_to_ipa
8 | from transformers import AutoTokenizer
9 |
10 |
11 | def distribute_phone(n_phone, n_word):
12 | phones_per_word = [0] * n_word
13 | for task in range(n_phone):
14 | min_tasks = min(phones_per_word)
15 | min_index = phones_per_word.index(min_tasks)
16 | phones_per_word[min_index] += 1
17 | return phones_per_word
18 |
19 | def text_normalize(text):
20 | text = es_cleaner.spanish_cleaners(text)
21 | return text
22 |
23 | def post_replace_ph(ph):
24 | rep_map = {
25 | ":": ",",
26 | ";": ",",
27 | ",": ",",
28 | "。": ".",
29 | "!": "!",
30 | "?": "?",
31 | "\n": ".",
32 | "·": ",",
33 | "、": ",",
34 | "...": "…"
35 | }
36 | if ph in rep_map.keys():
37 | ph = rep_map[ph]
38 | if ph in symbols:
39 | return ph
40 | if ph not in symbols:
41 | ph = "UNK"
42 | return ph
43 |
44 | def refine_ph(phn):
45 | tone = 0
46 | if re.search(r"\d$", phn):
47 | tone = int(phn[-1]) + 1
48 | phn = phn[:-1]
49 | return phn.lower(), tone
50 |
51 |
52 | def refine_syllables(syllables):
53 | tones = []
54 | phonemes = []
55 | for phn_list in syllables:
56 | for i in range(len(phn_list)):
57 | phn = phn_list[i]
58 | phn, tone = refine_ph(phn)
59 | phonemes.append(phn)
60 | tones.append(tone)
61 | return phonemes, tones
62 |
63 |
64 | # model_id = 'bert-base-uncased'
65 | model_id = 'dccuchile/bert-base-spanish-wwm-uncased'
66 | tokenizer = AutoTokenizer.from_pretrained(model_id)
67 |
68 | def g2p(text, pad_start_end=True, tokenized=None):
69 | if tokenized is None:
70 | tokenized = tokenizer.tokenize(text)
71 | # import pdb; pdb.set_trace()
72 | phs = []
73 | ph_groups = []
74 | for t in tokenized:
75 | if not t.startswith("#"):
76 | ph_groups.append([t])
77 | else:
78 | ph_groups[-1].append(t.replace("#", ""))
79 |
80 | phones = []
81 | tones = []
82 | word2ph = []
83 | # print(ph_groups)
84 | for group in ph_groups:
85 | w = "".join(group)
86 | phone_len = 0
87 | word_len = len(group)
88 | if w == '[UNK]':
89 | phone_list = ['UNK']
90 | else:
91 | phone_list = list(filter(lambda p: p != " ", es_to_ipa.es2ipa(w)))
92 |
93 | for ph in phone_list:
94 | phones.append(ph)
95 | tones.append(0)
96 | phone_len += 1
97 | aaa = distribute_phone(phone_len, word_len)
98 | word2ph += aaa
99 | # print(phone_list, aaa)
100 | # print('=' * 10)
101 |
102 | if pad_start_end:
103 | phones = ["_"] + phones + ["_"]
104 | tones = [0] + tones + [0]
105 | word2ph = [1] + word2ph + [1]
106 | return phones, tones, word2ph
107 |
108 | def get_bert_feature(text, word2ph, device=None):
109 |     from . import spanish_bert
110 | return spanish_bert.get_bert_feature(text, word2ph, device=device)
111 |
112 | if __name__ == "__main__":
113 | text = "en nuestros tiempos estos dos pueblos ilustres empiezan a curarse, gracias sólo a la sana y vigorosa higiene de 1789."
114 | # print(text)
115 | text = text_normalize(text)
116 | print(text)
117 | phones, tones, word2ph = g2p(text)
118 | bert = get_bert_feature(text, word2ph)
119 | print(phones)
120 | print(len(phones), tones, sum(word2ph), bert.shape)
121 |
122 |
123 |
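124 | # Note on the g2p outputs above (illustrative): with pad_start_end=True,
125 | # len(word2ph) == len(tokenized) + 2 (the leading/trailing "_" pads line up with
126 | # [CLS]/[SEP]) and sum(word2ph) == len(phones), which is exactly the alignment
127 | # get_bert_feature relies on when expanding token-level features to phones.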
--------------------------------------------------------------------------------
/melo/text/spanish_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 | model_id = 'dccuchile/bert-base-spanish-wwm-uncased'
6 | tokenizer = AutoTokenizer.from_pretrained(model_id)
7 | model = None
8 |
9 | def get_bert_feature(text, word2ph, device=None):
10 | global model
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if model is None:
20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to(
21 | device
22 | )
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = model(**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |
30 | assert inputs["input_ids"].shape[-1] == len(word2ph)
31 | word2phone = word2ph
32 | phone_level_feature = []
33 | for i in range(len(word2phone)):
34 | repeat_feature = res[i].repeat(word2phone[i], 1)
35 | phone_level_feature.append(repeat_feature)
36 |
37 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
38 |
39 | return phone_level_feature.T
40 |
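41 | # Usage sketch (illustrative): word2ph needs one entry per BERT token, including
42 | # [CLS] and [SEP] (this is what the assert above enforces). The returned tensor has
43 | # shape (hidden_size, sum(word2ph)), i.e. one feature column per phoneme, e.g.:
44 | #     feature = get_bert_feature(text, word2ph, device='cpu')  # hypothetical call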
--------------------------------------------------------------------------------
/melo/text/symbols.py:
--------------------------------------------------------------------------------
1 | # punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | punctuation = ["!", "?", "…", ",", ".", "'", "-", "¿", "¡"]
3 | pu_symbols = punctuation + ["SP", "UNK"]
4 | pad = "_"
5 |
6 | # chinese
7 | zh_symbols = [
8 | "E",
9 | "En",
10 | "a",
11 | "ai",
12 | "an",
13 | "ang",
14 | "ao",
15 | "b",
16 | "c",
17 | "ch",
18 | "d",
19 | "e",
20 | "ei",
21 | "en",
22 | "eng",
23 | "er",
24 | "f",
25 | "g",
26 | "h",
27 | "i",
28 | "i0",
29 | "ia",
30 | "ian",
31 | "iang",
32 | "iao",
33 | "ie",
34 | "in",
35 | "ing",
36 | "iong",
37 | "ir",
38 | "iu",
39 | "j",
40 | "k",
41 | "l",
42 | "m",
43 | "n",
44 | "o",
45 | "ong",
46 | "ou",
47 | "p",
48 | "q",
49 | "r",
50 | "s",
51 | "sh",
52 | "t",
53 | "u",
54 | "ua",
55 | "uai",
56 | "uan",
57 | "uang",
58 | "ui",
59 | "un",
60 | "uo",
61 | "v",
62 | "van",
63 | "ve",
64 | "vn",
65 | "w",
66 | "x",
67 | "y",
68 | "z",
69 | "zh",
70 | "AA",
71 | "EE",
72 | "OO",
73 | ]
74 | num_zh_tones = 6
75 |
76 | # japanese
77 | ja_symbols = [
78 | "N",
79 | "a",
80 | "a:",
81 | "b",
82 | "by",
83 | "ch",
84 | "d",
85 | "dy",
86 | "e",
87 | "e:",
88 | "f",
89 | "g",
90 | "gy",
91 | "h",
92 | "hy",
93 | "i",
94 | "i:",
95 | "j",
96 | "k",
97 | "ky",
98 | "m",
99 | "my",
100 | "n",
101 | "ny",
102 | "o",
103 | "o:",
104 | "p",
105 | "py",
106 | "q",
107 | "r",
108 | "ry",
109 | "s",
110 | "sh",
111 | "t",
112 | "ts",
113 | "ty",
114 | "u",
115 | "u:",
116 | "w",
117 | "y",
118 | "z",
119 | "zy",
120 | ]
121 | num_ja_tones = 1
122 |
123 | # English
124 | en_symbols = [
125 | "aa",
126 | "ae",
127 | "ah",
128 | "ao",
129 | "aw",
130 | "ay",
131 | "b",
132 | "ch",
133 | "d",
134 | "dh",
135 | "eh",
136 | "er",
137 | "ey",
138 | "f",
139 | "g",
140 | "hh",
141 | "ih",
142 | "iy",
143 | "jh",
144 | "k",
145 | "l",
146 | "m",
147 | "n",
148 | "ng",
149 | "ow",
150 | "oy",
151 | "p",
152 | "r",
153 | "s",
154 | "sh",
155 | "t",
156 | "th",
157 | "uh",
158 | "uw",
159 | "V",
160 | "w",
161 | "y",
162 | "z",
163 | "zh",
164 | ]
165 | num_en_tones = 4
166 |
167 | # Korean
168 | kr_symbols = ['ᄌ', 'ᅥ', 'ᆫ', 'ᅦ', 'ᄋ', 'ᅵ', 'ᄅ', 'ᅴ', 'ᄀ', 'ᅡ', 'ᄎ', 'ᅪ', 'ᄑ', 'ᅩ', 'ᄐ', 'ᄃ', 'ᅢ', 'ᅮ', 'ᆼ', 'ᅳ', 'ᄒ', 'ᄆ', 'ᆯ', 'ᆷ', 'ᄂ', 'ᄇ', 'ᄉ', 'ᆮ', 'ᄁ', 'ᅬ', 'ᅣ', 'ᄄ', 'ᆨ', 'ᄍ', 'ᅧ', 'ᄏ', 'ᆸ', 'ᅭ', '(', 'ᄊ', ')', 'ᅲ', 'ᅨ', 'ᄈ', 'ᅱ', 'ᅯ', 'ᅫ', 'ᅰ', 'ᅤ', '~', '\\', '[', ']', '/', '^', ':', 'ㄸ', '*']
169 | num_kr_tones = 1
170 |
171 | # Spanish
172 | es_symbols = [
173 | "N",
174 | "Q",
175 | "a",
176 | "b",
177 | "d",
178 | "e",
179 | "f",
180 | "g",
181 | "h",
182 | "i",
183 | "j",
184 | "k",
185 | "l",
186 | "m",
187 | "n",
188 | "o",
189 | "p",
190 | "s",
191 | "t",
192 | "u",
193 | "v",
194 | "w",
195 | "x",
196 | "y",
197 | "z",
198 | "ɑ",
199 | "æ",
200 | "ʃ",
201 | "ʑ",
202 | "ç",
203 | "ɯ",
204 | "ɪ",
205 | "ɔ",
206 | "ɛ",
207 | "ɹ",
208 | "ð",
209 | "ə",
210 | "ɫ",
211 | "ɥ",
212 | "ɸ",
213 | "ʊ",
214 | "ɾ",
215 | "ʒ",
216 | "θ",
217 | "β",
218 | "ŋ",
219 | "ɦ",
220 | "ɡ",
221 | "r",
222 | "ɲ",
223 | "ʝ",
224 | "ɣ",
225 | "ʎ",
226 | "ˈ",
227 | "ˌ",
228 | "ː"
229 | ]
230 | num_es_tones = 1
231 |
232 | # French
233 | fr_symbols = [
234 | "\u0303",
235 | "œ",
236 | "ø",
237 | "ʁ",
238 | "ɒ",
239 | "ʌ",
240 | "ɜ",
241 | "ɐ"
242 | ]
243 | num_fr_tones = 1
244 |
245 | # German
246 | de_symbols = [
247 | "ʏ",
248 | "̩"
249 | ]
250 | num_de_tones = 1
251 |
252 | # Russian
253 | ru_symbols = [
254 | "ɭ",
255 | "ʲ",
256 | "ɕ",
257 | "\"",
258 | "ɵ",
259 | "^",
260 | "ɬ"
261 | ]
262 | num_ru_tones = 1
263 |
264 | # combine all symbols
265 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols + kr_symbols + es_symbols + fr_symbols + de_symbols + ru_symbols))
266 | symbols = [pad] + normal_symbols + pu_symbols
267 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
268 |
269 | # combine all tones
270 | num_tones = num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones + num_fr_tones + num_de_tones + num_ru_tones
271 |
272 | # language maps
273 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2, "ZH_MIX_EN": 3, 'KR': 4, 'ES': 5, 'SP': 5, 'FR': 6}
274 | num_languages = len(language_id_map.keys())
275 |
276 | language_tone_start_map = {
277 | "ZH": 0,
278 | "ZH_MIX_EN": 0,
279 | "JP": num_zh_tones,
280 | "EN": num_zh_tones + num_ja_tones,
281 | 'KR': num_zh_tones + num_ja_tones + num_en_tones,
282 | "ES": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones,
283 | "SP": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones,
284 | "FR": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones,
285 | }
286 |
287 | if __name__ == "__main__":
288 | a = set(zh_symbols)
289 | b = set(en_symbols)
290 | print(sorted(a & b))
291 |
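292 | # Usage sketch (illustrative): downstream code maps a cleaned phone/tone pair into
293 | # the shared vocabulary roughly as
294 | #     phone_id = symbols.index(phone)
295 | #     tone_id  = language_tone_start_map[language] + tone
296 | # so each language occupies its own contiguous slice of the num_tones-wide tone space.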
--------------------------------------------------------------------------------
/melo/train.sh:
--------------------------------------------------------------------------------
1 | CONFIG=$1
2 | GPUS=$2
3 | MODEL_NAME=$(basename "$(dirname $CONFIG)")
4 |
5 | PORT=10902
6 |
7 | while : # auto-resume: training sometimes crashes due to a gloo bug on some GPUs
8 | do
9 | torchrun --nproc_per_node=$GPUS \
10 | --master_port=$PORT \
11 | train.py --c $CONFIG --model $MODEL_NAME
12 |
13 | for PID in $(ps -aux | grep $CONFIG | grep python | awk '{print $2}')
14 | do
15 | echo $PID
16 | kill -9 $PID
17 | done
18 | sleep 30
19 | done
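20 | 
21 | # Usage (illustrative): bash train.sh <path/to/config.json> <num_gpus>
22 | # MODEL_NAME is taken from the config's parent directory name, so keep each
23 | # experiment's config in its own folder.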
--------------------------------------------------------------------------------
/melo/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(
13 | inputs,
14 | unnormalized_widths,
15 | unnormalized_heights,
16 | unnormalized_derivatives,
17 | inverse=False,
18 | tails=None,
19 | tail_bound=1.0,
20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22 | min_derivative=DEFAULT_MIN_DERIVATIVE,
23 | ):
24 | if tails is None:
25 | spline_fn = rational_quadratic_spline
26 | spline_kwargs = {}
27 | else:
28 | spline_fn = unconstrained_rational_quadratic_spline
29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30 |
31 | outputs, logabsdet = spline_fn(
32 | inputs=inputs,
33 | unnormalized_widths=unnormalized_widths,
34 | unnormalized_heights=unnormalized_heights,
35 | unnormalized_derivatives=unnormalized_derivatives,
36 | inverse=inverse,
37 | min_bin_width=min_bin_width,
38 | min_bin_height=min_bin_height,
39 | min_derivative=min_derivative,
40 | **spline_kwargs
41 | )
42 | return outputs, logabsdet
43 |
44 |
45 | def searchsorted(bin_locations, inputs, eps=1e-6):
46 | bin_locations[..., -1] += eps
47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48 |
49 |
50 | def unconstrained_rational_quadratic_spline(
51 | inputs,
52 | unnormalized_widths,
53 | unnormalized_heights,
54 | unnormalized_derivatives,
55 | inverse=False,
56 | tails="linear",
57 | tail_bound=1.0,
58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60 | min_derivative=DEFAULT_MIN_DERIVATIVE,
61 | ):
62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63 | outside_interval_mask = ~inside_interval_mask
64 |
65 | outputs = torch.zeros_like(inputs)
66 | logabsdet = torch.zeros_like(inputs)
67 |
68 | if tails == "linear":
69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70 | constant = np.log(np.exp(1 - min_derivative) - 1)
71 | unnormalized_derivatives[..., 0] = constant
72 | unnormalized_derivatives[..., -1] = constant
73 |
74 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
75 | logabsdet[outside_interval_mask] = 0
76 | else:
77 | raise RuntimeError("{} tails are not implemented.".format(tails))
78 |
79 | (
80 | outputs[inside_interval_mask],
81 | logabsdet[inside_interval_mask],
82 | ) = rational_quadratic_spline(
83 | inputs=inputs[inside_interval_mask],
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 | inverse=inverse,
88 | left=-tail_bound,
89 | right=tail_bound,
90 | bottom=-tail_bound,
91 | top=tail_bound,
92 | min_bin_width=min_bin_width,
93 | min_bin_height=min_bin_height,
94 | min_derivative=min_derivative,
95 | )
96 |
97 | return outputs, logabsdet
98 |
99 |
100 | def rational_quadratic_spline(
101 | inputs,
102 | unnormalized_widths,
103 | unnormalized_heights,
104 | unnormalized_derivatives,
105 | inverse=False,
106 | left=0.0,
107 | right=1.0,
108 | bottom=0.0,
109 | top=1.0,
110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
112 | min_derivative=DEFAULT_MIN_DERIVATIVE,
113 | ):
114 | if torch.min(inputs) < left or torch.max(inputs) > right:
115 | raise ValueError("Input to a transform is not within its domain")
116 |
117 | num_bins = unnormalized_widths.shape[-1]
118 |
119 | if min_bin_width * num_bins > 1.0:
120 | raise ValueError("Minimal bin width too large for the number of bins")
121 | if min_bin_height * num_bins > 1.0:
122 | raise ValueError("Minimal bin height too large for the number of bins")
123 |
124 | widths = F.softmax(unnormalized_widths, dim=-1)
125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
126 | cumwidths = torch.cumsum(widths, dim=-1)
127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
128 | cumwidths = (right - left) * cumwidths + left
129 | cumwidths[..., 0] = left
130 | cumwidths[..., -1] = right
131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
132 |
133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
134 |
135 | heights = F.softmax(unnormalized_heights, dim=-1)
136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
137 | cumheights = torch.cumsum(heights, dim=-1)
138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
139 | cumheights = (top - bottom) * cumheights + bottom
140 | cumheights[..., 0] = bottom
141 | cumheights[..., -1] = top
142 | heights = cumheights[..., 1:] - cumheights[..., :-1]
143 |
144 | if inverse:
145 | bin_idx = searchsorted(cumheights, inputs)[..., None]
146 | else:
147 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
148 |
149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
151 |
152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
153 | delta = heights / widths
154 | input_delta = delta.gather(-1, bin_idx)[..., 0]
155 |
156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
158 |
159 | input_heights = heights.gather(-1, bin_idx)[..., 0]
160 |
161 | if inverse:
162 | a = (inputs - input_cumheights) * (
163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
164 | ) + input_heights * (input_delta - input_derivatives)
165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * (
166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
167 | )
168 | c = -input_delta * (inputs - input_cumheights)
169 |
170 | discriminant = b.pow(2) - 4 * a * c
171 | assert (discriminant >= 0).all()
172 |
173 | root = (2 * c) / (-b - torch.sqrt(discriminant))
174 | outputs = root * input_bin_widths + input_cumwidths
175 |
176 | theta_one_minus_theta = root * (1 - root)
177 | denominator = input_delta + (
178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
179 | * theta_one_minus_theta
180 | )
181 | derivative_numerator = input_delta.pow(2) * (
182 | input_derivatives_plus_one * root.pow(2)
183 | + 2 * input_delta * theta_one_minus_theta
184 | + input_derivatives * (1 - root).pow(2)
185 | )
186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
187 |
188 | return outputs, -logabsdet
189 | else:
190 | theta = (inputs - input_cumwidths) / input_bin_widths
191 | theta_one_minus_theta = theta * (1 - theta)
192 |
193 | numerator = input_heights * (
194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
195 | )
196 | denominator = input_delta + (
197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
198 | * theta_one_minus_theta
199 | )
200 | outputs = input_cumheights + numerator / denominator
201 |
202 | derivative_numerator = input_delta.pow(2) * (
203 | input_derivatives_plus_one * theta.pow(2)
204 | + 2 * input_delta * theta_one_minus_theta
205 | + input_derivatives * (1 - theta).pow(2)
206 | )
207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
208 |
209 | return outputs, logabsdet
210 |
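211 | 
212 | if __name__ == "__main__":
213 |     # Minimal round-trip sketch (illustrative, not used by training): the forward
214 |     # and inverse splines should compose to the identity, and their log|det| terms
215 |     # should cancel (up to float32 error).
216 |     torch.manual_seed(0)
217 |     x = torch.rand(4, 16) * 2 - 1  # inputs inside the default [-1, 1] tail bound
218 |     w = torch.randn(4, 16, 10)     # unnormalized bin widths (num_bins = 10)
219 |     h = torch.randn(4, 16, 10)     # unnormalized bin heights
220 |     d = torch.randn(4, 16, 9)      # unnormalized derivatives (num_bins - 1)
221 |     y, ld = piecewise_rational_quadratic_transform(x, w, h, d, tails="linear")
222 |     x_rec, ld_inv = piecewise_rational_quadratic_transform(y, w, h, d, inverse=True, tails="linear")
223 |     print((x - x_rec).abs().max().item(), (ld + ld_inv).abs().max().item())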
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | txtsplit
2 | torch
3 | torchaudio
4 | cached_path
5 | transformers==4.27.4
6 | num2words==0.5.12
7 | unidic_lite==1.0.8
8 | unidic==1.1.0
9 | mecab-python3==1.0.9
10 | pykakasi==2.2.1
11 | fugashi==1.3.0
12 | g2p_en==2.1.0
13 | anyascii==0.3.2
14 | jamo==0.4.1
15 | gruut[de,es,fr]==2.2.3
16 | g2pkk>=0.1.1
17 | librosa==0.9.1
18 | pydub==0.25.1
19 | eng_to_ipa==0.0.2
20 | inflect==7.0.0
21 | unidecode==1.3.7
22 | pypinyin==0.50.0
23 | cn2an==0.5.22
24 | jieba==0.42.1
25 | gradio
26 | langid==1.1.6
27 | tqdm
28 | tensorboard==2.16.2
29 | loguru==0.7.2
30 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup, find_packages
3 | from setuptools.command.develop import develop
4 | from setuptools.command.install import install
5 |
6 |
7 | cwd = os.path.dirname(os.path.abspath(__file__))
8 |
9 | with open('requirements.txt') as f:
10 | reqs = f.read().splitlines()
11 | class PostInstallCommand(install):
12 | """Post-installation for installation mode."""
13 | def run(self):
14 | install.run(self)
15 | os.system('python -m unidic download')
16 |
17 |
18 | class PostDevelopCommand(develop):
19 | """Post-installation for development mode."""
20 | def run(self):
21 | develop.run(self)
22 | os.system('python -m unidic download')
23 |
24 | setup(
25 | name='melotts',
26 | version='0.1.2',
27 | packages=find_packages(),
28 | include_package_data=True,
29 | install_requires=reqs,
30 | package_data={
31 | '': ['*.txt', 'cmudict_*'],
32 |     },
33 |     cmdclass={
34 |         'install': PostInstallCommand,
35 |         'develop': PostDevelopCommand,
36 |     },
37 |     entry_points={
38 |         "console_scripts": [
39 |             "melotts = melo.main:main",
40 |             "melo = melo.main:main",
41 |             "melo-ui = melo.app:main",
42 |         ],
43 |     },
44 | )
45 | 
--------------------------------------------------------------------------------
/test/basetts_test_resources/en_egs_text.txt:
--------------------------------------------------------------------------------
1 | Did you ever hear a folk tale about a giant turtle?
2 | Can you name five cars that were popular in the 1970s?
3 | May I ask what's your favorite university and why?
4 | Well, have you ever experienced violence in your life?
5 | Have you ever imposed restrictions?
6 | Did you ever feel guilty for not providing enough care for your pet?
7 | Would you prefer barbecue-flavored chips or plain chips?
8 | Are contractions common in English?
9 | Well, have you ever seen a slam poetry competition?
10 | Am I correct in assuming that bilateral trade agreements favor developed countries?
11 | Are there any scientific theories on why love exists in humans?
12 | Well, do you think figure skating is harder than gymnastics?
13 | Can you tell me if the apartment has a balcony or not?
14 | Have you ever overcome a challenging obstacle positively?
15 | Could you elaborate on the meaning behind that quote?
16 | Shall seniors receive higher taxes?
17 | Do you think adding a liquid flavor to coffee ruins it?
18 | Well, in our conversation about the restaurant, how would you review it overall?
19 | Have you consistently followed through with goals?
20 | Can pilots hear passengers coughing?
21 | Well, have you tried rainbow sprinkles?
22 | Are there any golden retrievers at the local animal shelter?
23 | Have you seen Tyler?
24 | Had you ever deployed to Mars?
25 | Well, have you ever felt intimidated by your competition's tactics?
26 | Are there any specific rules about when you can continue?
27 | Can you describe Antarctica's temperatures?
28 | May I ask, have you ever tasted a bloody mary before?
29 | Did anyone mention the order yet?
30 | Are automatic transmissions more fuel efficient?
31 | Shall we discuss the impact of self-control on personal success?
32 | Have you traveled internationally this May?
33 | Well, have you ever tried shrimp ceviche?
34 | Have you ever seen an act of extraordinary courage in person?
35 | Have you ever wondered how proceed affects the outcome of a project?
36 | Have you calculated the mean weight of all the participants?
37 | Should we bring confetti to the parade?
38 | Do influencers control behavior?
39 | Shall we discuss the price of the new car lease?
40 | Had Nice ever been your home?
41 | Have you ever encountered a gifted child who struggled academically?
42 | Can everyone work together?
43 | Did you know how long an ostrich can survive without water?
44 | Do nurses in long-term care facilities receive adequate training for dementia care?
45 | Has separation ever felt liberating?
46 | Would you prefer a flexible or fixed schedule for work?
47 | Does pension plan have rollover?
48 | Has Vital's mission expanded beyond health supplements?
49 | Have you ever witnessed a bombing attack?
50 | May I predict the outcome of the election based on polls?
51 | Do you think strict parenting leads to more successful children later in life?
52 | Shall we explore nearby parks?
53 | Are there any ways to verify the credibility of online reviews?
54 | Have you ever witnessed a roundabout accident?
55 | Well, upon reflection, do we really want sushi?
56 | Well, have you ever experienced workplace harassment?
57 | Do you think it's sure that the rain will stop soon?
58 | Would you say distance affects relationships?
59 | Can we truly deny the existence of higher power?
60 | Do you think crop yields will be affected by the drought?
61 | Do you think the backup plan is good enough?
62 | Can you tell me, meanwhile, what happened while I was gone?
63 | Did the wise old owl speak?
64 | Well, have you ever been to a retreat that truly transformed you?
65 | Have you ever had to calculate the exact measurements for a recipe?
66 | Can warning signs prevent accidents while driving on icy roads?
67 | Do you think the current job market offers equal opportunity?
68 | Have you ever analyzed your own dreams?
69 | May I ask if colonialism affected your ancestry?
70 | Well, what chest exercises target the upper pecs?
71 | Are there occasionally unexpected consequences of honesty?
72 | Do you think the new restaurant is overpriced?
73 | Do critics take into account audience preferences?
74 | Has translation technology reached a point where it can accurately translate idioms?
75 | Have you ever been to a music festival in another country?
76 | Do you think our taste in food is genetic?
77 | Are you a hopeless romantic at heart?
78 | Shall we explore abandoned urban places?
79 | Does agency promote individualism?
80 | Well, what implementing strategies?
81 | Have you ever noticed the smallest detail that changed your perspective?
82 | Have you ever seen a normal ghost?
83 | Have you ever considered the considerable effort?
84 | Are there holistic chronic cure?
85 | Did unemployment rates change recently?
86 | Does change come from within or without?
87 | Does the length of the patent term affect innovation rates?
88 | Can Junior play basketball?
89 | Shall we analyze the data?
90 | Have you ever tried the Szechuan cuisine before?
91 | Had you ever debated a controversial topic before?
92 | Have you ever analyzed case?
93 | Is it true that stripping originated in ancient Egypt or Greece?
94 | Have you ever dyed your hair a crazy color?
95 | Shall we compare the top-rated pizza places in our city?
96 | May people in different countries play soccer?
97 | Well, have you recycled?
98 | Shall we precisely measure ingredients?
99 | Can you embrace someone you don't love?
--------------------------------------------------------------------------------
/test/basetts_test_resources/es_egs_text.txt:
--------------------------------------------------------------------------------
1 | El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.
2 | Las estrellas bailan en la noche, creando un espectáculo celestial que despierta el alma.
3 | Las majestuosas montañas se alzan en silencio, guardianas inmutables del tiempo que pasa.
4 | El amor, como un suave perfume, envuelve nuestros corazones con un calor reconfortante.
5 | El susurro suave del viento atraviesa los campos de lavanda, llevándose consigo el aroma de la Provenza.
6 | El resplandor de la luna baña la ciudad dormida en una luz mística.
7 | Las calles empedradas revelan historias antiguas, cada piedra llevando el peso del pasado.
8 | La risa de los niños resuena como una melodía encantada en el suave aire de la primavera.
9 | Los jardines floridos estallan con colores vibrantes, creando un cuadro viviente de la naturaleza.
10 | Las olas acarician suavemente la playa, dejando tras de sí huellas efímeras en la arena.
11 | La Torre Eiffel se yergue con orgullo, testigo silencioso del amor eterno en París.
12 | Las mariposas danzan entre las flores, creando una coreografía grácil en el jardín.
13 | Los animados cafés resuenan con conversaciones apasionadas y el embriagador aroma del café recién molido.
14 | Los ríos serpenteantes atraviesan el campo, reflejando el cielo azul en sus aguas tranquilas.
15 | Los imponentes castillos cuentan historias de caballeros y princesas en un pasado lejano.
16 | Los viñedos se extienden hasta donde alcanza la vista, sus filas ordenadas testimonio de la antigua tradición vinícola.
17 | Las risas resuenan en las estrechas callejuelas, despertando la vieja ciudad de su quietud.
18 | Los campos de girasoles saludan al sol con sus caras doradas, un mar de oro bajo un cielo azul.
19 | Las notas melódicas de un acordeón flotan en el aire, capturando la esencia musical de las calles parisinas.
20 | Las cumbres nevadas de los Alpes brillan bajo la luz de la luna, un paisaje invernal de ensueño.
--------------------------------------------------------------------------------
/test/basetts_test_resources/fr_egs_text.txt:
--------------------------------------------------------------------------------
1 | La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.
2 | Les étoiles dansent dans la nuit, créant un spectacle céleste qui éveille l'âme.
3 | Les montagnes majestueuses se dressent en silence, gardiennes immuables du temps qui passe.
4 | L'amour, tel un doux parfum, enveloppe nos cœurs d'une chaleur réconfortante.
5 | Le doux murmure du vent traverse les champs de lavande, emportant avec lui le parfum de la Provence.
6 | La lueur de la lune baigne la ville endormie dans une lumière mystique.
7 | Les ruelles pavées révèlent des histoires anciennes, chaque pierre portant le poids du passé.
8 | Le rire des enfants résonne comme une mélodie enchantée dans l'air doux du printemps.
9 | Les jardins fleuris éclatent de couleurs vives, créant un tableau vivant de la nature.
10 | Les vagues caressent doucement la plage, laissant derrière elles des traces éphémères dans le sable.
11 | La Tour Eiffel se dresse fièrement, témoin silencieux de l'amour éternel à Paris.
12 | Les papillons dansent parmi les fleurs, créant une chorégraphie gracieuse dans le jardin.
13 | Les cafés animés résonnent de conversations passionnées et du parfum enivrant du café fraîchement moulu.
14 | Les rivières sinueuses traversent la campagne, reflétant le ciel azur dans leurs eaux calmes.
15 | Les châteaux imposants racontent des contes de chevaliers et de princesses dans un passé lointain.
16 | Les vignobles s'étendent à perte de vue, leurs rangées ordonnées témoignant du savoir-faire viticole ancestral.
17 | Les éclats de rire résonnent dans les ruelles étroites, réveillant la vieille ville de sa quiétude.
18 | Les champs de tournesols saluent le soleil avec leurs visages dorés, une mer d'or sous un ciel d'azur.
19 | Les notes mélodieuses d'un accordéon flottent dans l'air, capturant l'essence musicale des rues parisiennes.
20 | Les sommets enneigés des Alpes brillent sous la lumière de la lune, un paysage hivernal féérique.
--------------------------------------------------------------------------------
/test/basetts_test_resources/jp_egs_text.txt:
--------------------------------------------------------------------------------
1 | 彼は毎朝ジョギングをして体を健康に保っています。
2 | 私たちは来年、友人たちと一緒にヨーロッパ旅行を計画しています。
3 | 新しいレストランで美味しい料理を試すことが楽しみです。
4 | 彼女の絵は情熱と芸術性が溢れていて、見る人を魅了します。
5 | 最近、忙しさに追われていて、ゆっくり休む時間がありません。
6 | 日本の文化は多様で魅力的であり、世界中から注目されています。
7 | 彼の犬は忠実で賢く、家族にとって大切な存在です。
8 | 私の友達は常に私をサポートしてくれる信頼できる存在です。
9 | 家族と一緒に過ごす時間は、私にとって何よりも大切です。
10 | 彼の夢は大きく、努力と決意でそれを実現しようとしています。
--------------------------------------------------------------------------------
/test/basetts_test_resources/kr_egs_text.txt:
--------------------------------------------------------------------------------
1 | 안녕하세요! 오늘은 날씨가 정말 좋네요.
2 | 한국 음식을 먹어보고 싶어요. 불고기랑 김치찌개가 제가 좋아하는 음식이에요.
3 | 요즘에는 한국 드라마를 자주 보고 있어요. 정말 재미있어요.
4 | 한글을 배우는 것이 재미있어요. 조금씩 읽고 쓸 수 있게 되고 있어요.
5 | 친구들과 함께 한국 여행을 계획 중이에요. 서울과 부산을 방문할 예정이에요.
--------------------------------------------------------------------------------
/test/basetts_test_resources/zh_mix_en_egs_text.txt:
--------------------------------------------------------------------------------
1 | 人工智能是一种非常适合和促进自上而下集中控制的技术,而加密货币则是一种完全关注自下而上分散合作的技术。
2 | Web 3的一个目标是支持艺术家。
3 | 欢迎来到Web 3与A6Z,一个由团队打造的构建下一代互联网的节目。
4 | 我最喜欢的fruit是苹果。
5 | 今天我们要学习Python programming。
6 | 她在library看书。
7 | 你喜欢听pop music吗?
8 | 今天下午,我们准备去shopping mall购物,然后晚上去看一场movie。
9 | 我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。
10 | 在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。
11 | 今天天气真不错,我们去Paris吃蒸汽海鲜吧!
--------------------------------------------------------------------------------
/test/test_base_model_tts_package.py:
--------------------------------------------------------------------------------
1 | from melo.api import TTS
2 | import os
3 | import glob
4 | import sys
5 |
6 |
7 | language = sys.argv[1]
8 | model = TTS(language=language)
9 |
10 | speaker_ids = model.hps.data.spk2id
11 | speakers = list(speaker_ids.keys())
12 |
13 | root_folder = language.lower()
14 | if 'zh' in root_folder:
15 | texts = open('basetts_test_resources/zh_mix_en_egs_text.txt', 'r').readlines()
16 | language = 'ZH_MIX_EN'
17 | elif 'es' in root_folder:
18 | texts = open('basetts_test_resources/es_egs_text.txt', 'r').readlines()
19 | language = 'SP'
20 | elif 'fr' in root_folder:
21 | texts = open('basetts_test_resources/fr_egs_text.txt', 'r').readlines()
22 | language = 'FR'
23 | elif 'en' in root_folder:
24 | texts = open('basetts_test_resources/en_egs_text.txt', 'r').readlines()
25 | # texts = ["Boss? You're not my boss, you're just a sad little person who likes to hide behind a computer screen and pretend you have power over others. "]
26 | language = 'EN'
27 | elif 'jp' in root_folder:
28 | texts = open('basetts_test_resources/jp_egs_text.txt', 'r').readlines()
29 | language = 'JP'
30 | elif 'kr' in root_folder:
31 | texts = open('basetts_test_resources/kr_egs_text.txt', 'r').readlines()
32 | language = 'KR'
33 | else:
34 | raise NotImplementedError()
35 |
36 | save_dir = os.path.join('basetts_outputs_package', root_folder.split('/')[-1])
37 |
38 | for speed in [1.0]:
39 | for speaker in speakers:
40 | for sent_id, text in enumerate(texts):
41 | output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav'
42 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
43 | model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed)
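44 | 
45 | # Usage (illustrative): run from the test/ directory, e.g. `python test_base_model_tts_package.py EN`,
46 | # since the egs text files above are opened with paths relative to this folder.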
--------------------------------------------------------------------------------
/test/test_base_model_tts_package_from_S3.py:
--------------------------------------------------------------------------------
1 | from melo.api import TTS
2 | import os
3 | import glob
4 | import sys
5 |
6 |
7 | language = sys.argv[1]
8 | model = TTS(language=language, use_hf=False)
9 |
10 | speaker_ids = model.hps.data.spk2id
11 | speakers = list(speaker_ids.keys())
12 |
13 | root_folder = language.lower()
14 | if 'zh' in root_folder:
15 | texts = open('basetts_test_resources/zh_mix_en_egs_text.txt', 'r').readlines()
16 | language = 'ZH_MIX_EN'
17 | elif 'es' in root_folder:
18 | texts = open('basetts_test_resources/es_egs_text.txt', 'r').readlines()
19 | language = 'SP'
20 | elif 'fr' in root_folder:
21 | texts = open('basetts_test_resources/fr_egs_text.txt', 'r').readlines()
22 | language = 'FR'
23 | elif 'en' in root_folder:
24 | texts = open('basetts_test_resources/en_egs_text.txt', 'r').readlines()
25 | # texts = ["Boss? You're not my boss, you're just a sad little person who likes to hide behind a computer screen and pretend you have power over others. "]
26 | language = 'EN'
27 | elif 'jp' in root_folder:
28 | texts = open('basetts_test_resources/jp_egs_text.txt', 'r').readlines()
29 | language = 'JP'
30 | elif 'kr' in root_folder:
31 | texts = open('basetts_test_resources/kr_egs_text.txt', 'r').readlines()
32 | language = 'KR'
33 | else:
34 | raise NotImplementedError()
35 |
36 | save_dir = os.path.join('basetts_outputs_package_from_S3', root_folder.split('/')[-1])
37 |
38 | for speed in [1.0]:
39 | for speaker in speakers:
40 | for sent_id, text in enumerate(texts):
41 | output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav'
42 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
43 | model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed)
--------------------------------------------------------------------------------