├── .github └── workflows │ └── pypi.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── install.md ├── quick_use.md └── training.md ├── logo.png ├── melo ├── __init__.py ├── api.py ├── app.py ├── attentions.py ├── commons.py ├── configs │ └── config.json ├── data │ └── example │ │ └── metadata.list ├── data_utils.py ├── download_utils.py ├── infer.py ├── init_downloads.py ├── losses.py ├── main.py ├── mel_processing.py ├── models.py ├── modules.py ├── monotonic_align │ ├── __init__.py │ └── core.py ├── preprocess_text.py ├── split_utils.py ├── text │ ├── __init__.py │ ├── chinese.py │ ├── chinese_bert.py │ ├── chinese_mix.py │ ├── cleaner.py │ ├── cleaner_multiling.py │ ├── cmudict.rep │ ├── cmudict_cache.pickle │ ├── english.py │ ├── english_bert.py │ ├── english_utils │ │ ├── __init__.py │ │ ├── abbreviations.py │ │ ├── number_norm.py │ │ └── time_norm.py │ ├── es_phonemizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cleaner.py │ │ ├── es_symbols.json │ │ ├── es_symbols.txt │ │ ├── es_symbols_v2.json │ │ ├── es_to_ipa.py │ │ ├── example_ipa.txt │ │ ├── gruut_wrapper.py │ │ ├── punctuation.py │ │ ├── spanish_symbols.txt │ │ └── test.ipynb │ ├── fr_phonemizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cleaner.py │ │ ├── en_symbols.json │ │ ├── example_ipa.txt │ │ ├── fr_symbols.json │ │ ├── fr_to_ipa.py │ │ ├── french_abbreviations.py │ │ ├── french_symbols.txt │ │ ├── gruut_wrapper.py │ │ └── punctuation.py │ ├── french.py │ ├── french_bert.py │ ├── japanese.py │ ├── japanese_bert.py │ ├── ko_dictionary.py │ ├── korean.py │ ├── opencpop-strict.txt │ ├── spanish.py │ ├── spanish_bert.py │ ├── symbols.py │ └── tone_sandhi.py ├── train.py ├── train.sh ├── transforms.py └── utils.py ├── requirements.txt ├── setup.py └── test ├── basetts_test_resources ├── en_egs_text.txt ├── es_egs_text.txt ├── fr_egs_text.txt ├── jp_egs_text.txt ├── kr_egs_text.txt └── zh_mix_en_egs_text.txt ├── test_base_model_tts_package.py └── test_base_model_tts_package_from_S3.py /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m ensurepip --upgrade 33 | pip install build 34 | - name: Build package 35 | run: python -m build 36 | - name: Publish package 37 | uses: pypa/gh-action-pypi-publish@release/v1.8 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | basetts_outputs_use_bert/ 4 | basetts_outputs/ 5 | multilingual_ckpts 6 | basetts_outputs_package/ 7 | build/ 8 | *.egg-info/ 9 | 10 | *.zip 11 | *.wav -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /app 3 | COPY . /app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | build-essential libsndfile1 \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install -e . 10 | RUN python -m unidic download 11 | RUN python melo/init_downloads.py 12 | 13 | CMD ["python", "./melo/app.py", "--host", "0.0.0.0", "--port", "8888"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 MyShell.ai 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | [myshell-ai/MeloTTS | Trendshift badge] 5 |
6 | 7 | ## Introduction 8 | MeloTTS is a **high-quality multi-lingual** text-to-speech library by [MIT](https://www.mit.edu/) and [MyShell.ai](https://myshell.ai). Supported languages include: 9 | 10 | | Language | Example | 11 | | --- | --- | 12 | | English (American) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-US/speed_1.0/sent_000.wav) | 13 | | English (British) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-BR/speed_1.0/sent_000.wav) | 14 | | English (Indian) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN_INDIA/speed_1.0/sent_000.wav) | 15 | | English (Australian) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-AU/speed_1.0/sent_000.wav) | 16 | | English (Default) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/en/EN-Default/speed_1.0/sent_000.wav) | 17 | | Spanish | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/es/ES/speed_1.0/sent_000.wav) | 18 | | French | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/fr/FR/speed_1.0/sent_000.wav) | 19 | | Chinese (mix EN) | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/zh/ZH/speed_1.0/sent_008.wav) | 20 | | Japanese | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/jp/JP/speed_1.0/sent_000.wav) | 21 | | Korean | [Link](https://myshell-public-repo-host.s3.amazonaws.com/myshellttsbase/examples/kr/KR/speed_1.0/sent_000.wav) | 22 | 23 | Some other features include: 24 | - The Chinese speaker supports `mixed Chinese and English`. 25 | - Fast enough for `CPU real-time inference`. 26 | 27 | ## Usage 28 | - [Use without Installation](docs/quick_use.md) 29 | - [Install and Use Locally](docs/install.md) 30 | - [Training on Custom Dataset](docs/training.md) 31 | 32 | The Python API and model cards can be found in [this repo](https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#python-api) or on [HuggingFace](https://huggingface.co/myshell-ai). 33 | 34 | **Contributing** 35 | 36 | If you find this work useful, please consider contributing to this repo. 37 | 38 | - Many thanks to [@fakerybakery](https://github.com/fakerybakery) for adding the Web UI and CLI part. 39 | 40 | ## Authors 41 | 42 | - [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University 43 | - [Xumin Yu](https://yuxumin.github.io) at Tsinghua University 44 | - [Zengyi Qin](https://www.qinzy.tech) (project lead) at MIT and MyShell 45 | 46 | **Citation** 47 | ``` 48 | @software{zhao2024melo, 49 | author={Zhao, Wenliang and Yu, Xumin and Qin, Zengyi}, 50 | title = {MeloTTS: High-quality Multi-lingual Multi-accent Text-to-Speech}, 51 | url = {https://github.com/myshell-ai/MeloTTS}, 52 | year = {2023} 53 | } 54 | ``` 55 | 56 | ## License 57 | 58 | This library is under MIT License, which means it is free for both commercial and non-commercial use. 59 | 60 | ## Acknowledgements 61 | 62 | This implementation is based on [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), [VITS2](https://github.com/daniilrobnikov/vits2) and [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2). We appreciate their awesome work. 
63 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | ## Install and Use Locally 2 | 3 | ### Table of Content 4 | - [Linux and macOS Install](#linux-and-macos-install) 5 | - [Docker Install for Windows and macOS](#docker-install) 6 | - [Usage](#usage) 7 | - [Web UI](#webui) 8 | - [CLI](#cli) 9 | - [Python API](#python-api) 10 | 11 | ### Linux and macOS Install 12 | The repo is developed and tested on `Ubuntu 20.04` and `Python 3.9`. 13 | ```bash 14 | git clone https://github.com/myshell-ai/MeloTTS.git 15 | cd MeloTTS 16 | pip install -e . 17 | python -m unidic download 18 | ``` 19 | If you encountered issues in macOS install, try the [Docker Install](#docker-install) 20 | 21 | ### Docker Install 22 | To avoid compatibility issues, for Windows users and some macOS users, we suggest to run via Docker. Ensure that [you have Docker installed](https://docs.docker.com/engine/install/). 23 | 24 | **Build Docker** 25 | 26 | This could take a few minutes. 27 | ```bash 28 | git clone https://github.com/myshell-ai/MeloTTS.git 29 | cd MeloTTS 30 | docker build -t melotts . 31 | ``` 32 | 33 | **Run Docker** 34 | ```bash 35 | docker run -it -p 8888:8888 melotts 36 | ``` 37 | If your local machine has GPU, then you can choose to run: 38 | ```bash 39 | docker run --gpus all -it -p 8888:8888 melotts 40 | ``` 41 | Then open [http://localhost:8888](http://localhost:8888) in your browser to use the app. 42 | 43 | ## Usage 44 | 45 | ### WebUI 46 | 47 | The WebUI supports muliple languages and voices. First, follow the installation steps. Then, simply run: 48 | 49 | ```bash 50 | melo-ui 51 | # Or: python melo/app.py 52 | ``` 53 | 54 | ### CLI 55 | 56 | You may use the MeloTTS CLI to interact with MeloTTS. The CLI may be invoked using either `melotts` or `melo`. Here are some examples: 57 | 58 | **Read English text:** 59 | 60 | ```bash 61 | melo "Text to read" output.wav 62 | ``` 63 | 64 | **Specify a language:** 65 | 66 | ```bash 67 | melo "Text to read" output.wav --language EN 68 | ``` 69 | 70 | **Specify a speaker:** 71 | 72 | ```bash 73 | melo "Text to read" output.wav --language EN --speaker EN-US 74 | melo "Text to read" output.wav --language EN --speaker EN-AU 75 | ``` 76 | 77 | The available speakers are: `EN-Default`, `EN-US`, `EN-BR`, `EN_INDIA` `EN-AU`. 78 | 79 | **Specify a speed:** 80 | 81 | ```bash 82 | melo "Text to read" output.wav --language EN --speaker EN-US --speed 1.5 83 | melo "Text to read" output.wav --speed 1.5 84 | ``` 85 | 86 | **Use a different language:** 87 | 88 | ```bash 89 | melo "text-to-speech 领域近年来发展迅速" zh.wav -l ZH 90 | ``` 91 | 92 | **Load from a file:** 93 | 94 | ```bash 95 | melo file.txt out.wav --file 96 | ``` 97 | 98 | The full API documentation may be found using: 99 | 100 | ```bash 101 | melo --help 102 | ``` 103 | 104 | ### Python API 105 | 106 | #### English with Multiple Accents 107 | 108 | ```python 109 | from melo.api import TTS 110 | 111 | # Speed is adjustable 112 | speed = 1.0 113 | 114 | # CPU is sufficient for real-time inference. 115 | # You can set it manually to 'cpu' or 'cuda' or 'cuda:0' or 'mps' 116 | device = 'auto' # Will automatically use GPU if available 117 | 118 | # English 119 | text = "Did you ever hear a folk tale about a giant turtle?" 
120 | model = TTS(language='EN', device=device) 121 | speaker_ids = model.hps.data.spk2id 122 | 123 | # American accent 124 | output_path = 'en-us.wav' 125 | model.tts_to_file(text, speaker_ids['EN-US'], output_path, speed=speed) 126 | 127 | # British accent 128 | output_path = 'en-br.wav' 129 | model.tts_to_file(text, speaker_ids['EN-BR'], output_path, speed=speed) 130 | 131 | # Indian accent 132 | output_path = 'en-india.wav' 133 | model.tts_to_file(text, speaker_ids['EN_INDIA'], output_path, speed=speed) 134 | 135 | # Australian accent 136 | output_path = 'en-au.wav' 137 | model.tts_to_file(text, speaker_ids['EN-AU'], output_path, speed=speed) 138 | 139 | # Default accent 140 | output_path = 'en-default.wav' 141 | model.tts_to_file(text, speaker_ids['EN-Default'], output_path, speed=speed) 142 | 143 | ``` 144 | 145 | #### Spanish 146 | ```python 147 | from melo.api import TTS 148 | 149 | # Speed is adjustable 150 | speed = 1.0 151 | 152 | # CPU is sufficient for real-time inference. 153 | # You can also change to cuda:0 154 | device = 'cpu' 155 | 156 | text = "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante." 157 | model = TTS(language='ES', device=device) 158 | speaker_ids = model.hps.data.spk2id 159 | 160 | output_path = 'es.wav' 161 | model.tts_to_file(text, speaker_ids['ES'], output_path, speed=speed) 162 | ``` 163 | 164 | #### French 165 | 166 | ```python 167 | from melo.api import TTS 168 | 169 | # Speed is adjustable 170 | speed = 1.0 171 | device = 'cpu' # or cuda:0 172 | 173 | text = "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante." 174 | model = TTS(language='FR', device=device) 175 | speaker_ids = model.hps.data.spk2id 176 | 177 | output_path = 'fr.wav' 178 | model.tts_to_file(text, speaker_ids['FR'], output_path, speed=speed) 179 | ``` 180 | 181 | #### Chinese 182 | 183 | ```python 184 | from melo.api import TTS 185 | 186 | # Speed is adjustable 187 | speed = 1.0 188 | device = 'cpu' # or cuda:0 189 | 190 | text = "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。" 191 | model = TTS(language='ZH', device=device) 192 | speaker_ids = model.hps.data.spk2id 193 | 194 | output_path = 'zh.wav' 195 | model.tts_to_file(text, speaker_ids['ZH'], output_path, speed=speed) 196 | ``` 197 | 198 | #### Japanese 199 | 200 | ```python 201 | from melo.api import TTS 202 | 203 | # Speed is adjustable 204 | speed = 1.0 205 | device = 'cpu' # or cuda:0 206 | 207 | text = "彼は毎朝ジョギングをして体を健康に保っています。" 208 | model = TTS(language='JP', device=device) 209 | speaker_ids = model.hps.data.spk2id 210 | 211 | output_path = 'jp.wav' 212 | model.tts_to_file(text, speaker_ids['JP'], output_path, speed=speed) 213 | ``` 214 | 215 | #### Korean 216 | 217 | ```python 218 | from melo.api import TTS 219 | 220 | # Speed is adjustable 221 | speed = 1.0 222 | device = 'cpu' # or cuda:0 223 | 224 | text = "안녕하세요! 오늘은 날씨가 정말 좋네요." 225 | model = TTS(language='KR', device=device) 226 | speaker_ids = model.hps.data.spk2id 227 | 228 | output_path = 'kr.wav' 229 | model.tts_to_file(text, speaker_ids['KR'], output_path, speed=speed) 230 | ``` 231 | -------------------------------------------------------------------------------- /docs/quick_use.md: -------------------------------------------------------------------------------- 1 | ## Use MeloTTS without Installation 2 | 3 | **Quick Demo** 4 | 5 | - [Official live demo](https://app.myshell.ai/bot/UN77N3/1709094629) on Myshell. 
6 | - Hugging Face Space [live demo](https://huggingface.co/spaces/mrfakename/MeloTTS). 7 | 8 | **Use on MyShell** 9 | 10 | There are hundreds of TTS models on MyShell, much more than MeloTTS. For example: 11 | 12 | English 13 | - [gentle British male voice](https://app.myshell.ai/widget/nIfamm) 14 | - [cheerful young female voice](https://app.myshell.ai/widget/AjIjqy) 15 | - [sultry and robust male voice](https://app.myshell.ai/widget/zQJJN3) 16 | 17 | Spanish 18 | - [voz femenina adorable](https://app.myshell.ai/widget/buIZBf) 19 | - [voz masculina joven](https://app.myshell.ai/widget/rayuiy) 20 | - [voz de niña inmadura](https://app.myshell.ai/widget/mYFV3e) 21 | 22 | French 23 | - [voix adorable de fille](https://app.myshell.ai/widget/3IfEfy) 24 | - [voix douce masculine](https://app.myshell.ai/widget/IRR3M3) 25 | - [voix douce féminine](https://app.myshell.ai/widget/NRbaUj) 26 | 27 | German 28 | - [sanfte Männerstimme](https://app.myshell.ai/widget/JFnAn2) 29 | - [sanfte Frauenstimme](https://app.myshell.ai/widget/MrU7Nb) 30 | - [unreife Mädchenstimme](https://app.myshell.ai/widget/UFbYBj) 31 | 32 | Portuguese 33 | - [voz feminina nítida](https://app.myshell.ai/widget/VzMb6j) 34 | - [voz de menino imaturo](https://app.myshell.ai/widget/nAzeei) 35 | - [voz masculina sóbria](https://app.myshell.ai/widget/JZRNJz) 36 | 37 | Russian 38 | - [зрелый женский голос](https://app.myshell.ai/widget/6byMZ3) 39 | - [зрелый мужской голос](https://app.myshell.ai/widget/NB7jmm) 40 | 41 | Chinese 42 | - [甜美女声](https://app.myshell.ai/widget/ymeUjm) 43 | - [青年男声](https://app.myshell.ai/widget/NZnERb) 44 | 45 | More can be found at the widget center of [MyShell.ai](https://app.myshell.ai/robot-workshop). 46 | -------------------------------------------------------------------------------- /docs/training.md: -------------------------------------------------------------------------------- 1 | ## Training 2 | 3 | Before training, please install MeloTTS in dev mode and go to the `melo` folder. 4 | ``` 5 | pip install -e . 6 | cd melo 7 | ``` 8 | 9 | ### Data Preparation 10 | To train a TTS model, we need to prepare the audio files and a metadata file. We recommend using 44100Hz audio files and the metadata file should have the following format: 11 | 12 | ``` 13 | path/to/audio_001.wav ||| 14 | path/to/audio_002.wav ||| 15 | ``` 16 | The transcribed text can be obtained by ASR model, (e.g., [whisper](https://github.com/openai/whisper)). An example metadata can be found in `data/example/metadata.list` 17 | 18 | We can then run the preprocessing code: 19 | ``` 20 | python preprocess_text.py --metadata data/example/metadata.list 21 | ``` 22 | A config file `data/example/config.json` will be generated. Feel free to edit some hyper-parameters in that config file (for example, you may decrease the batch size if you have encountered the CUDA out-of-memory issue). 23 | 24 | ### Training 25 | The training can be launched by: 26 | ``` 27 | bash train.sh 28 | ``` 29 | 30 | We have found for some machine the training will sometimes crash due to an [issue](https://github.com/pytorch/pytorch/issues/2530) of gloo. Therefore, we add an auto-resume wrapper in the `train.sh`. 
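For reference, the wrapper is essentially a restart loop: whenever training exits abnormally, it is relaunched so that it can resume from the latest saved checkpoint instead of starting over. The sketch below is illustrative only and is not the verbatim contents of `train.sh` (the actual invocation and its flags may differ):
```
# illustrative auto-resume loop, not the exact train.sh
while true; do
    python train.py "$@"      # launch (or relaunch) training with the arguments passed to the wrapper
    [ $? -eq 0 ] && break     # a clean exit ends the loop
    echo "Training crashed, restarting in 10 seconds..."
    sleep 10                  # otherwise pause briefly and try again
done
```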
31 | 32 | ### Inference 33 | Simply run: 34 | ``` 35 | python infer.py --text "" -m /path/to/checkpoint/G_.pth -o 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/logo.png -------------------------------------------------------------------------------- /melo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/__init__.py -------------------------------------------------------------------------------- /melo/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import torch 5 | import librosa 6 | import soundfile 7 | import torchaudio 8 | import numpy as np 9 | import torch.nn as nn 10 | from tqdm import tqdm 11 | import torch 12 | 13 | from . import utils 14 | from . import commons 15 | from .models import SynthesizerTrn 16 | from .split_utils import split_sentence 17 | from .mel_processing import spectrogram_torch, spectrogram_torch_conv 18 | from .download_utils import load_or_download_config, load_or_download_model 19 | 20 | class TTS(nn.Module): 21 | def __init__(self, 22 | language, 23 | device='auto', 24 | use_hf=True, 25 | config_path=None, 26 | ckpt_path=None): 27 | super().__init__() 28 | if device == 'auto': 29 | device = 'cpu' 30 | if torch.cuda.is_available(): device = 'cuda' 31 | if torch.backends.mps.is_available(): device = 'mps' 32 | if 'cuda' in device: 33 | assert torch.cuda.is_available() 34 | 35 | # config_path = 36 | hps = load_or_download_config(language, use_hf=use_hf, config_path=config_path) 37 | 38 | num_languages = hps.num_languages 39 | num_tones = hps.num_tones 40 | symbols = hps.symbols 41 | 42 | model = SynthesizerTrn( 43 | len(symbols), 44 | hps.data.filter_length // 2 + 1, 45 | hps.train.segment_size // hps.data.hop_length, 46 | n_speakers=hps.data.n_speakers, 47 | num_tones=num_tones, 48 | num_languages=num_languages, 49 | **hps.model, 50 | ).to(device) 51 | 52 | model.eval() 53 | self.model = model 54 | self.symbol_to_id = {s: i for i, s in enumerate(symbols)} 55 | self.hps = hps 56 | self.device = device 57 | 58 | # load state_dict 59 | checkpoint_dict = load_or_download_model(language, device, use_hf=use_hf, ckpt_path=ckpt_path) 60 | self.model.load_state_dict(checkpoint_dict['model'], strict=True) 61 | 62 | language = language.split('_')[0] 63 | self.language = 'ZH_MIX_EN' if language == 'ZH' else language # we support a ZH_MIX_EN model 64 | 65 | @staticmethod 66 | def audio_numpy_concat(segment_data_list, sr, speed=1.): 67 | audio_segments = [] 68 | for segment_data in segment_data_list: 69 | audio_segments += segment_data.reshape(-1).tolist() 70 | audio_segments += [0] * int((sr * 0.05) / speed) 71 | audio_segments = np.array(audio_segments).astype(np.float32) 72 | return audio_segments 73 | 74 | @staticmethod 75 | def split_sentences_into_pieces(text, language, quiet=False): 76 | texts = split_sentence(text, language_str=language) 77 | if not quiet: 78 | print(" > Text split to sentences.") 79 | print('\n'.join(texts)) 80 | print(" > ===========================") 81 | return texts 82 | 83 | def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, 
speed=1.0, pbar=None, format=None, position=None, quiet=False,): 84 | language = self.language 85 | texts = self.split_sentences_into_pieces(text, language, quiet) 86 | audio_list = [] 87 | if pbar: 88 | tx = pbar(texts) 89 | else: 90 | if position: 91 | tx = tqdm(texts, position=position) 92 | elif quiet: 93 | tx = texts 94 | else: 95 | tx = tqdm(texts) 96 | for t in tx: 97 | if language in ['EN', 'ZH_MIX_EN']: 98 | t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t) 99 | device = self.device 100 | bert, ja_bert, phones, tones, lang_ids = utils.get_text_for_tts_infer(t, language, self.hps, device, self.symbol_to_id) 101 | with torch.no_grad(): 102 | x_tst = phones.to(device).unsqueeze(0) 103 | tones = tones.to(device).unsqueeze(0) 104 | lang_ids = lang_ids.to(device).unsqueeze(0) 105 | bert = bert.to(device).unsqueeze(0) 106 | ja_bert = ja_bert.to(device).unsqueeze(0) 107 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 108 | del phones 109 | speakers = torch.LongTensor([speaker_id]).to(device) 110 | audio = self.model.infer( 111 | x_tst, 112 | x_tst_lengths, 113 | speakers, 114 | tones, 115 | lang_ids, 116 | bert, 117 | ja_bert, 118 | sdp_ratio=sdp_ratio, 119 | noise_scale=noise_scale, 120 | noise_scale_w=noise_scale_w, 121 | length_scale=1. / speed, 122 | )[0][0, 0].data.cpu().float().numpy() 123 | del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers 124 | # 125 | audio_list.append(audio) 126 | torch.cuda.empty_cache() 127 | audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed) 128 | 129 | if output_path is None: 130 | return audio 131 | else: 132 | if format: 133 | soundfile.write(output_path, audio, self.hps.data.sampling_rate, format=format) 134 | else: 135 | soundfile.write(output_path, audio, self.hps.data.sampling_rate) 136 | -------------------------------------------------------------------------------- /melo/app.py: -------------------------------------------------------------------------------- 1 | # WebUI by mrfakename 2 | # Demo also available on HF Spaces: https://huggingface.co/spaces/mrfakename/MeloTTS 3 | import gradio as gr 4 | import os, torch, io 5 | # os.system('python -m unidic download') 6 | print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.") 7 | from melo.api import TTS 8 | speed = 1.0 9 | import tempfile 10 | import click 11 | device = 'auto' 12 | models = { 13 | 'EN': TTS(language='EN', device=device), 14 | 'ES': TTS(language='ES', device=device), 15 | 'FR': TTS(language='FR', device=device), 16 | 'ZH': TTS(language='ZH', device=device), 17 | 'JP': TTS(language='JP', device=device), 18 | 'KR': TTS(language='KR', device=device), 19 | } 20 | speaker_ids = models['EN'].hps.data.spk2id 21 | 22 | default_text_dict = { 23 | 'EN': 'The field of text-to-speech has seen rapid development recently.', 24 | 'ES': 'El campo de la conversión de texto a voz ha experimentado un rápido desarrollo recientemente.', 25 | 'FR': 'Le domaine de la synthèse vocale a connu un développement rapide récemment', 26 | 'ZH': 'text-to-speech 领域近年来发展迅速', 27 | 'JP': 'テキスト読み上げの分野は最近急速な発展を遂げています', 28 | 'KR': '최근 텍스트 음성 변환 분야가 급속도로 발전하고 있습니다.', 29 | } 30 | 31 | def synthesize(speaker, text, speed, language, progress=gr.Progress()): 32 | bio = io.BytesIO() 33 | models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, pbar=progress.tqdm, format='wav') 34 | return bio.getvalue() 35 | def load_speakers(language, text): 36 | if text in 
list(default_text_dict.values()): 37 | newtext = default_text_dict[language] 38 | else: 39 | newtext = text 40 | return gr.update(value=list(models[language].hps.data.spk2id.keys())[0], choices=list(models[language].hps.data.spk2id.keys())), newtext 41 | with gr.Blocks() as demo: 42 | gr.Markdown('# MeloTTS WebUI\n\nA WebUI for MeloTTS.') 43 | with gr.Group(): 44 | speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker') 45 | language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN') 46 | speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1) 47 | text = gr.Textbox(label="Text to speak", value=default_text_dict['EN']) 48 | language.input(load_speakers, inputs=[language, text], outputs=[speaker, text]) 49 | btn = gr.Button('Synthesize', variant='primary') 50 | aud = gr.Audio(interactive=False) 51 | btn.click(synthesize, inputs=[speaker, text, speed, language], outputs=[aud]) 52 | gr.Markdown('WebUI by [mrfakename](https://twitter.com/realmrfakename).') 53 | @click.command() 54 | @click.option('--share', '-s', is_flag=True, show_default=True, default=False, help="Expose a publicly-accessible shared Gradio link usable by anyone with the link. Only share the link with people you trust.") 55 | @click.option('--host', '-h', default=None) 56 | @click.option('--port', '-p', type=int, default=None) 57 | def main(share, host, port): 58 | demo.queue(api_open=False).launch(show_api=False, share=share, server_name=host, server_port=port) 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /melo/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | layer = pad_shape[::-1] 18 | pad_shape = [item for sublist in layer for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * 
ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 68 | position = torch.arange(length, dtype=torch.float) 69 | num_timescales = channels // 2 70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 71 | num_timescales - 1 72 | ) 73 | inv_timescales = min_timescale * torch.exp( 74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 75 | ) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | layer = pad_shape[::-1] 112 | pad_shape = [item for sublist in layer for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | 134 | b, _, t_y, t_x = mask.shape 135 | cum_duration = torch.cumsum(duration, -1) 136 | 137 | cum_duration_flat = cum_duration.view(b * t_x) 138 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 139 | path = path.view(b, t_x, t_y) 140 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 141 | path = path.unsqueeze(1).transpose(2, 3) * mask 142 | return path 143 | 144 | 145 | def clip_grad_value_(parameters, clip_value, norm_type=2): 146 | if isinstance(parameters, torch.Tensor): 147 | parameters = [parameters] 148 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 149 | norm_type = float(norm_type) 150 | if clip_value is not None: 151 | clip_value = float(clip_value) 152 | 153 | total_norm = 0 154 | for p in parameters: 155 | param_norm = p.grad.data.norm(norm_type) 156 | total_norm += param_norm.item() ** norm_type 157 | if clip_value is not None: 158 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 159 | total_norm = total_norm ** (1.0 / 
norm_type) 160 | return total_norm 161 | -------------------------------------------------------------------------------- /melo/configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 52, 6 | "epochs": 10000, 7 | "learning_rate": 0.0003, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 6, 14 | "fp16_run": false, 15 | "lr_decay": 0.999875, 16 | "segment_size": 16384, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "skip_optimizer": true 22 | }, 23 | "data": { 24 | "training_files": "", 25 | "validation_files": "", 26 | "max_wav_value": 32768.0, 27 | "sampling_rate": 44100, 28 | "filter_length": 2048, 29 | "hop_length": 512, 30 | "win_length": 2048, 31 | "n_mel_channels": 128, 32 | "mel_fmin": 0.0, 33 | "mel_fmax": null, 34 | "add_blank": true, 35 | "n_speakers": 256, 36 | "cleaned_text": true, 37 | "spk2id": {} 38 | }, 39 | "model": { 40 | "use_spk_conditioned_encoder": true, 41 | "use_noise_scaled_mas": true, 42 | "use_mel_posterior_encoder": false, 43 | "use_duration_discriminator": true, 44 | "inter_channels": 192, 45 | "hidden_channels": 192, 46 | "filter_channels": 768, 47 | "n_heads": 2, 48 | "n_layers": 6, 49 | "n_layers_trans_flow": 3, 50 | "kernel_size": 3, 51 | "p_dropout": 0.1, 52 | "resblock": "1", 53 | "resblock_kernel_sizes": [ 54 | 3, 55 | 7, 56 | 11 57 | ], 58 | "resblock_dilation_sizes": [ 59 | [ 60 | 1, 61 | 3, 62 | 5 63 | ], 64 | [ 65 | 1, 66 | 3, 67 | 5 68 | ], 69 | [ 70 | 1, 71 | 3, 72 | 5 73 | ] 74 | ], 75 | "upsample_rates": [ 76 | 8, 77 | 8, 78 | 2, 79 | 2, 80 | 2 81 | ], 82 | "upsample_initial_channel": 512, 83 | "upsample_kernel_sizes": [ 84 | 16, 85 | 16, 86 | 8, 87 | 2, 88 | 2 89 | ], 90 | "n_layers_q": 3, 91 | "use_spectral_norm": false, 92 | "gin_channels": 256 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /melo/data/example/metadata.list: -------------------------------------------------------------------------------- 1 | data/example/wavs/000.wav|EN-default|EN|Well, there are always new trends and styles emerging in the fashion world, but I think some of the biggest trends at the moment include sustainability and ethical fashion, streetwear and athleisure, and oversized and deconstructed silhouettes. 2 | data/example/wavs/001.wav|EN-default|EN|Many designers and brands are focusing on creating more environmentally-friendly and socially responsible clothing, while others are incorporating elements of sportswear and casual wear into their collections. 3 | data/example/wavs/002.wav|EN-default|EN|And there's a growing interest in looser, more relaxed shapes and unconventional materials and finishes. 4 | data/example/wavs/003.wav|EN-default|EN|That's really insightful. 5 | data/example/wavs/004.wav|EN-default|EN|What do you think are some of the benefits of following fashion trends? 6 | data/example/wavs/005.wav|EN-default|EN|Well, I think one of the main benefits of following fashion trends is that it can be a way to express your creativity, personality, and individuality. 7 | data/example/wavs/006.wav|EN-default|EN|Fashion can be a powerful tool for self-expression and can help you feel more confident and comfortable in your own skin. 
8 | data/example/wavs/007.wav|EN-default|EN|Additionally, staying up-to-date with fashion trends can help you develop your own sense of style and learn how to put together outfits that make you look and feel great. 9 | data/example/wavs/008.wav|EN-default|EN|That's a great point. 10 | data/example/wavs/009.wav|EN-default|EN|Do you think it's important to stay on top of the latest fashion trends, or is it more important to focus on timeless style? 11 | data/example/wavs/010.wav|EN-default|EN|I think it's really up to each individual to decide what approach to fashion works best for them. 12 | data/example/wavs/011.wav|EN-default|EN|Some people prefer to stick with classic, timeless styles that never go out of fashion, while others enjoy experimenting with new and innovative trends. 13 | data/example/wavs/012.wav|EN-default|EN|Ultimately, fashion is about personal expression and there's no right or wrong way to approach it. 14 | data/example/wavs/013.wav|EN-default|EN|The most important thing is to wear what makes you feel good and confident. 15 | data/example/wavs/014.wav|EN-default|EN|I completely agree. 16 | data/example/wavs/015.wav|EN-default|EN|Some popular ones that come to mind are oversized blazers, statement sleeves, printed maxi dresses, and chunky sneakers. 17 | data/example/wavs/016.wav|EN-default|EN|It's been really interesting chatting with you about fashion. 18 | data/example/wavs/017.wav|EN-default|EN|That's a good point. 19 | data/example/wavs/018.wav|EN-default|EN|What do you think are some current fashion trends that are popular right now? 20 | data/example/wavs/019.wav|EN-default|EN|There are so many trends happening right now, it's hard to keep track of them all! 21 | -------------------------------------------------------------------------------- /melo/download_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from . 
import utils 4 | from cached_path import cached_path 5 | from huggingface_hub import hf_hub_download 6 | 7 | DOWNLOAD_CKPT_URLS = { 8 | 'EN': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN/checkpoint.pth', 9 | 'EN_V2': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN_V2/checkpoint.pth', 10 | 'FR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/FR/checkpoint.pth', 11 | 'JP': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/JP/checkpoint.pth', 12 | 'ES': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ES/checkpoint.pth', 13 | 'ZH': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ZH/checkpoint.pth', 14 | 'KR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/KR/checkpoint.pth', 15 | } 16 | 17 | DOWNLOAD_CONFIG_URLS = { 18 | 'EN': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN/config.json', 19 | 'EN_V2': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/EN_V2/config.json', 20 | 'FR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/FR/config.json', 21 | 'JP': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/JP/config.json', 22 | 'ES': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ES/config.json', 23 | 'ZH': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/ZH/config.json', 24 | 'KR': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/KR/config.json', 25 | } 26 | 27 | PRETRAINED_MODELS = { 28 | 'G.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/G.pth', 29 | 'D.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/D.pth', 30 | 'DUR.pth': 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers/pretrained/DUR.pth', 31 | } 32 | 33 | LANG_TO_HF_REPO_ID = { 34 | 'EN': 'myshell-ai/MeloTTS-English', 35 | 'EN_V2': 'myshell-ai/MeloTTS-English-v2', 36 | 'EN_NEWEST': 'myshell-ai/MeloTTS-English-v3', 37 | 'FR': 'myshell-ai/MeloTTS-French', 38 | 'JP': 'myshell-ai/MeloTTS-Japanese', 39 | 'ES': 'myshell-ai/MeloTTS-Spanish', 40 | 'ZH': 'myshell-ai/MeloTTS-Chinese', 41 | 'KR': 'myshell-ai/MeloTTS-Korean', 42 | } 43 | 44 | def load_or_download_config(locale, use_hf=True, config_path=None): 45 | if config_path is None: 46 | language = locale.split('-')[0].upper() 47 | if use_hf: 48 | assert language in LANG_TO_HF_REPO_ID 49 | config_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="config.json") 50 | else: 51 | assert language in DOWNLOAD_CONFIG_URLS 52 | config_path = cached_path(DOWNLOAD_CONFIG_URLS[language]) 53 | return utils.get_hparams_from_file(config_path) 54 | 55 | def load_or_download_model(locale, device, use_hf=True, ckpt_path=None): 56 | if ckpt_path is None: 57 | language = locale.split('-')[0].upper() 58 | if use_hf: 59 | assert language in LANG_TO_HF_REPO_ID 60 | ckpt_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="checkpoint.pth") 61 | else: 62 | assert language in DOWNLOAD_CKPT_URLS 63 | ckpt_path = cached_path(DOWNLOAD_CKPT_URLS[language]) 64 | return torch.load(ckpt_path, map_location=device) 65 | 66 | def load_pretrain_model(): 67 | return [cached_path(url) for url in PRETRAINED_MODELS.values()] 68 | -------------------------------------------------------------------------------- /melo/infer.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | from melo.api import TTS 4 | 5 | 6 | 7 | @click.command() 8 | @click.option('--ckpt_path', '-m', type=str, default=None, help="Path to the checkpoint file") 9 | @click.option('--text', '-t', type=str, default=None, help="Text to speak") 10 | @click.option('--language', '-l', type=str, default="EN", help="Language of the model") 11 | @click.option('--output_dir', '-o', type=str, default="outputs", help="Path to the output") 12 | def main(ckpt_path, text, language, output_dir): 13 | if ckpt_path is None: 14 | raise ValueError("The model_path must be specified") 15 | 16 | config_path = os.path.join(os.path.dirname(ckpt_path), 'config.json') 17 | model = TTS(language=language, config_path=config_path, ckpt_path=ckpt_path) 18 | 19 | for spk_name, spk_id in model.hps.data.spk2id.items(): 20 | save_path = f'{output_dir}/{spk_name}/output.wav' 21 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 22 | model.tts_to_file(text, spk_id, save_path) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /melo/init_downloads.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | if __name__ == '__main__': 4 | 5 | from melo.api import TTS 6 | device = 'auto' 7 | models = { 8 | 'EN': TTS(language='EN', device=device), 9 | 'ES': TTS(language='ES', device=device), 10 | 'FR': TTS(language='FR', device=device), 11 | 'ZH': TTS(language='ZH', device=device), 12 | 'JP': TTS(language='JP', device=device), 13 | 'KR': TTS(language='KR', device=device), 14 | } -------------------------------------------------------------------------------- /melo/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /melo/main.py: -------------------------------------------------------------------------------- 1 | import click 2 | import warnings 3 | import os 4 | 5 | 6 | 
@click.command 7 | @click.argument('text') 8 | @click.argument('output_path') 9 | @click.option("--file", '-f', is_flag=True, show_default=True, default=False, help="Text is a file") 10 | @click.option('--language', '-l', default='EN', help='Language, defaults to English', type=click.Choice(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], case_sensitive=False)) 11 | @click.option('--speaker', '-spk', default='EN-Default', help='Speaker ID, only for English, leave empty for default, ignored if not English. If English, defaults to "EN-Default"', type=click.Choice(['EN-Default', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU'])) 12 | @click.option('--speed', '-s', default=1.0, help='Speed, defaults to 1.0', type=float) 13 | @click.option('--device', '-d', default='auto', help='Device, defaults to auto') 14 | def main(text, file, output_path, language, speaker, speed, device): 15 | if file: 16 | if not os.path.exists(text): 17 | raise FileNotFoundError(f'Trying to load text from file due to --file/-f flag, but file not found. Remove the --file/-f flag to pass a string.') 18 | else: 19 | with open(text) as f: 20 | text = f.read().strip() 21 | if text == '': 22 | raise ValueError('You entered empty text or the file you passed was empty.') 23 | language = language.upper() 24 | if language == '': language = 'EN' 25 | if speaker == '': speaker = None 26 | if (not language == 'EN') and speaker: 27 | warnings.warn('You specified a speaker but the language is English.') 28 | from melo.api import TTS 29 | model = TTS(language=language, device=device) 30 | speaker_ids = model.hps.data.spk2id 31 | if language == 'EN': 32 | if not speaker: speaker = 'EN-Default' 33 | spkr = speaker_ids[speaker] 34 | else: 35 | spkr = speaker_ids[list(speaker_ids.keys())[0]] 36 | model.tts_to_file(text, spkr, output_path, speed=speed) 37 | -------------------------------------------------------------------------------- /melo/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | import librosa 4 | from librosa.filters import mel as librosa_mel_fn 5 | 6 | MAX_WAV_VALUE = 32768.0 7 | 8 | 9 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 10 | """ 11 | PARAMS 12 | ------ 13 | C: compression factor 14 | """ 15 | return torch.log(torch.clamp(x, min=clip_val) * C) 16 | 17 | 18 | def dynamic_range_decompression_torch(x, C=1): 19 | """ 20 | PARAMS 21 | ------ 22 | C: compression factor used to compress 23 | """ 24 | return torch.exp(x) / C 25 | 26 | 27 | def spectral_normalize_torch(magnitudes): 28 | output = dynamic_range_compression_torch(magnitudes) 29 | return output 30 | 31 | 32 | def spectral_de_normalize_torch(magnitudes): 33 | output = dynamic_range_decompression_torch(magnitudes) 34 | return output 35 | 36 | 37 | mel_basis = {} 38 | hann_window = {} 39 | 40 | 41 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 42 | if torch.min(y) < -1.1: 43 | print("min value is ", torch.min(y)) 44 | if torch.max(y) > 1.1: 45 | print("max value is ", torch.max(y)) 46 | 47 | global hann_window 48 | dtype_device = str(y.dtype) + "_" + str(y.device) 49 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 50 | if wnsize_dtype_device not in hann_window: 51 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 52 | dtype=y.dtype, device=y.device 53 | ) 54 | 55 | y = torch.nn.functional.pad( 56 | y.unsqueeze(1), 57 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 58 | mode="reflect", 59 | ) 
60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft( 63 | y, 64 | n_fft, 65 | hop_length=hop_size, 66 | win_length=win_size, 67 | window=hann_window[wnsize_dtype_device], 68 | center=center, 69 | pad_mode="reflect", 70 | normalized=False, 71 | onesided=True, 72 | return_complex=False, 73 | ) 74 | 75 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 76 | return spec 77 | 78 | 79 | def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False): 80 | global hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 83 | if wnsize_dtype_device not in hann_window: 84 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 85 | 86 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 87 | 88 | # ******************** original ************************# 89 | # y = y.squeeze(1) 90 | # spec1 = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 91 | # center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 92 | 93 | # ******************** ConvSTFT ************************# 94 | freq_cutoff = n_fft // 2 + 1 95 | fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft))) 96 | forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1]) 97 | forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float() 98 | 99 | import torch.nn.functional as F 100 | 101 | # if center: 102 | # signal = F.pad(y[:, None, None, :], (n_fft // 2, n_fft // 2, 0, 0), mode = 'reflect').squeeze(1) 103 | assert center is False 104 | 105 | forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride = hop_size) 106 | spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim = -1) 107 | 108 | 109 | # ******************** Verification ************************# 110 | spec1 = torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 111 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 112 | assert torch.allclose(spec1, spec2, atol=1e-4) 113 | 114 | spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6) 115 | return spec 116 | 117 | 118 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 119 | global mel_basis 120 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 121 | fmax_dtype_device = str(fmax) + "_" + dtype_device 122 | if fmax_dtype_device not in mel_basis: 123 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 124 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 125 | dtype=spec.dtype, device=spec.device 126 | ) 127 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 128 | spec = spectral_normalize_torch(spec) 129 | return spec 130 | 131 | 132 | def mel_spectrogram_torch( 133 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 134 | ): 135 | global mel_basis, hann_window 136 | dtype_device = str(y.dtype) + "_" + str(y.device) 137 | fmax_dtype_device = str(fmax) + "_" + dtype_device 138 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 139 | if fmax_dtype_device not in mel_basis: 140 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, 
fmin=fmin, fmax=fmax) 141 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 142 | dtype=y.dtype, device=y.device 143 | ) 144 | if wnsize_dtype_device not in hann_window: 145 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 146 | dtype=y.dtype, device=y.device 147 | ) 148 | 149 | y = torch.nn.functional.pad( 150 | y.unsqueeze(1), 151 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 152 | mode="reflect", 153 | ) 154 | y = y.squeeze(1) 155 | 156 | spec = torch.stft( 157 | y, 158 | n_fft, 159 | hop_length=hop_size, 160 | win_length=win_size, 161 | window=hann_window[wnsize_dtype_device], 162 | center=center, 163 | pad_mode="reflect", 164 | normalized=False, 165 | onesided=True, 166 | return_complex=False, 167 | ) 168 | 169 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 170 | 171 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 172 | spec = spectral_normalize_torch(spec) 173 | 174 | return spec 175 | -------------------------------------------------------------------------------- /melo/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | 7 | def maximum_path(neg_cent, mask): 8 | device = neg_cent.device 9 | dtype = neg_cent.dtype 10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 11 | path = zeros(neg_cent.shape, dtype=int32) 12 | 13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 16 | return from_numpy(path).to(device=device, dtype=dtype) 17 | -------------------------------------------------------------------------------- /melo/monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit( 5 | numba.void( 6 | numba.int32[:, :, ::1], 7 | numba.float32[:, :, ::1], 8 | numba.int32[::1], 9 | numba.int32[::1], 10 | ), 11 | nopython=True, 12 | nogil=True, 13 | ) 14 | def maximum_path_jit(paths, values, t_ys, t_xs): 15 | b = paths.shape[0] 16 | max_neg_val = -1e9 17 | for i in range(int(b)): 18 | path = paths[i] 19 | value = values[i] 20 | t_y = t_ys[i] 21 | t_x = t_xs[i] 22 | 23 | v_prev = v_cur = 0.0 24 | index = t_x - 1 25 | 26 | for y in range(t_y): 27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 28 | if x == y: 29 | v_cur = max_neg_val 30 | else: 31 | v_cur = value[y - 1, x] 32 | if x == 0: 33 | if y == 0: 34 | v_prev = 0.0 35 | else: 36 | v_prev = max_neg_val 37 | else: 38 | v_prev = value[y - 1, x - 1] 39 | value[y, x] += max(v_prev, v_cur) 40 | 41 | for y in range(t_y - 1, -1, -1): 42 | path[y, index] = 1 43 | if index != 0 and ( 44 | index == y or value[y - 1, index] < value[y - 1, index - 1] 45 | ): 46 | index = index - 1 47 | -------------------------------------------------------------------------------- /melo/preprocess_text.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | from random import shuffle 4 | from typing import Optional 5 | 6 | from tqdm import tqdm 7 | import click 8 | from text.cleaner import clean_text_bert 9 | import os 10 | import torch 11 | from text.symbols import symbols, num_languages, num_tones 12 | 13 | @click.command() 14 | @click.option( 15 | "--metadata", 16 | default="data/example/metadata.list", 
17 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 18 | ) 19 | @click.option("--cleaned-path", default=None) 20 | @click.option("--train-path", default=None) 21 | @click.option("--val-path", default=None) 22 | @click.option( 23 | "--config_path", 24 | default="configs/config.json", 25 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 26 | ) 27 | @click.option("--val-per-spk", default=4) 28 | @click.option("--max-val-total", default=8) 29 | @click.option("--clean/--no-clean", default=True) 30 | def main( 31 | metadata: str, 32 | cleaned_path: Optional[str], 33 | train_path: str, 34 | val_path: str, 35 | config_path: str, 36 | val_per_spk: int, 37 | max_val_total: int, 38 | clean: bool, 39 | ): 40 | if train_path is None: 41 | train_path = os.path.join(os.path.dirname(metadata), 'train.list') 42 | if val_path is None: 43 | val_path = os.path.join(os.path.dirname(metadata), 'val.list') 44 | out_config_path = os.path.join(os.path.dirname(metadata), 'config.json') 45 | 46 | if cleaned_path is None: 47 | cleaned_path = metadata + ".cleaned" 48 | 49 | if clean: 50 | out_file = open(cleaned_path, "w", encoding="utf-8") 51 | new_symbols = [] 52 | for line in tqdm(open(metadata, encoding="utf-8").readlines()): 53 | try: 54 | utt, spk, language, text = line.strip().split("|") 55 | norm_text, phones, tones, word2ph, bert = clean_text_bert(text, language, device='cuda:0') 56 | for ph in phones: 57 | if ph not in symbols and ph not in new_symbols: 58 | new_symbols.append(ph) 59 | print('update!, now symbols:') 60 | print(new_symbols) 61 | with open(f'{language}_symbol.txt', 'w') as f: 62 | f.write(f'{new_symbols}') 63 | 64 | assert len(phones) == len(tones) 65 | assert len(phones) == sum(word2ph) 66 | out_file.write( 67 | "{}|{}|{}|{}|{}|{}|{}\n".format( 68 | utt, 69 | spk, 70 | language, 71 | norm_text, 72 | " ".join(phones), 73 | " ".join([str(i) for i in tones]), 74 | " ".join([str(i) for i in word2ph]), 75 | ) 76 | ) 77 | bert_path = utt.replace(".wav", ".bert.pt") 78 | os.makedirs(os.path.dirname(bert_path), exist_ok=True) 79 | torch.save(bert.cpu(), bert_path) 80 | except Exception as error: 81 | print("err!", line, error) 82 | 83 | out_file.close() 84 | 85 | metadata = cleaned_path 86 | 87 | spk_utt_map = defaultdict(list) 88 | spk_id_map = {} 89 | current_sid = 0 90 | 91 | with open(metadata, encoding="utf-8") as f: 92 | for line in f.readlines(): 93 | utt, spk, language, text, phones, tones, word2ph = line.strip().split("|") 94 | spk_utt_map[spk].append(line) 95 | 96 | if spk not in spk_id_map.keys(): 97 | spk_id_map[spk] = current_sid 98 | current_sid += 1 99 | 100 | train_list = [] 101 | val_list = [] 102 | 103 | for spk, utts in spk_utt_map.items(): 104 | shuffle(utts) 105 | val_list += utts[:val_per_spk] 106 | train_list += utts[val_per_spk:] 107 | 108 | if len(val_list) > max_val_total: 109 | train_list += val_list[max_val_total:] 110 | val_list = val_list[:max_val_total] 111 | 112 | with open(train_path, "w", encoding="utf-8") as f: 113 | for line in train_list: 114 | f.write(line) 115 | 116 | with open(val_path, "w", encoding="utf-8") as f: 117 | for line in val_list: 118 | f.write(line) 119 | 120 | config = json.load(open(config_path, encoding="utf-8")) 121 | config["data"]["spk2id"] = spk_id_map 122 | 123 | config["data"]["training_files"] = train_path 124 | config["data"]["validation_files"] = val_path 125 | config["data"]["n_speakers"] = len(spk_id_map) 126 | config["num_languages"] = num_languages 127 | config["num_tones"] = num_tones 128 | 
config["symbols"] = symbols 129 | 130 | with open(out_config_path, "w", encoding="utf-8") as f: 131 | json.dump(config, f, indent=2, ensure_ascii=False) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /melo/split_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import glob 4 | import numpy as np 5 | import soundfile as sf 6 | import torchaudio 7 | import re 8 | 9 | def split_sentence(text, min_len=10, language_str='EN'): 10 | if language_str in ['EN', 'FR', 'ES', 'SP']: 11 | sentences = split_sentences_latin(text, min_len=min_len) 12 | else: 13 | sentences = split_sentences_zh(text, min_len=min_len) 14 | return sentences 15 | 16 | 17 | def split_sentences_latin(text, min_len=10): 18 | text = re.sub('[。!?;]', '.', text) 19 | text = re.sub('[,]', ',', text) 20 | text = re.sub('[“”]', '"', text) 21 | text = re.sub('[‘’]', "'", text) 22 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text) 23 | return [item.strip() for item in txtsplit(text, 256, 512) if item.strip()] 24 | 25 | 26 | def split_sentences_zh(text, min_len=10): 27 | text = re.sub('[。!?;]', '.', text) 28 | text = re.sub('[,]', ',', text) 29 | # 将文本中的换行符、空格和制表符替换为空格 30 | text = re.sub('[\n\t ]+', ' ', text) 31 | # 在标点符号后添加一个空格 32 | text = re.sub('([,.!?;])', r'\1 $#!', text) 33 | # 分隔句子并去除前后空格 34 | # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)] 35 | sentences = [s.strip() for s in text.split('$#!')] 36 | if len(sentences[-1]) == 0: del sentences[-1] 37 | 38 | new_sentences = [] 39 | new_sent = [] 40 | count_len = 0 41 | for ind, sent in enumerate(sentences): 42 | new_sent.append(sent) 43 | count_len += len(sent) 44 | if count_len > min_len or ind == len(sentences) - 1: 45 | count_len = 0 46 | new_sentences.append(' '.join(new_sent)) 47 | new_sent = [] 48 | return merge_short_sentences_zh(new_sentences) 49 | 50 | 51 | def merge_short_sentences_en(sens): 52 | """Avoid short sentences by merging them with the following sentence. 53 | 54 | Args: 55 | List[str]: list of input sentences. 56 | 57 | Returns: 58 | List[str]: list of output sentences. 59 | """ 60 | sens_out = [] 61 | for s in sens: 62 | # If the previous sentense is too short, merge them with 63 | # the current sentence. 64 | if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2: 65 | sens_out[-1] = sens_out[-1] + " " + s 66 | else: 67 | sens_out.append(s) 68 | try: 69 | if len(sens_out[-1].split(" ")) <= 2: 70 | sens_out[-2] = sens_out[-2] + " " + sens_out[-1] 71 | sens_out.pop(-1) 72 | except: 73 | pass 74 | return sens_out 75 | 76 | 77 | def merge_short_sentences_zh(sens): 78 | # return sens 79 | """Avoid short sentences by merging them with the following sentence. 80 | 81 | Args: 82 | List[str]: list of input sentences. 83 | 84 | Returns: 85 | List[str]: list of output sentences. 86 | """ 87 | sens_out = [] 88 | for s in sens: 89 | # If the previous sentense is too short, merge them with 90 | # the current sentence. 
91 | if len(sens_out) > 0 and len(sens_out[-1]) <= 2: 92 | sens_out[-1] = sens_out[-1] + " " + s 93 | else: 94 | sens_out.append(s) 95 | try: 96 | if len(sens_out[-1]) <= 2: 97 | sens_out[-2] = sens_out[-2] + " " + sens_out[-1] 98 | sens_out.pop(-1) 99 | except: 100 | pass 101 | return sens_out 102 | 103 | 104 | 105 | def txtsplit(text, desired_length=100, max_length=200): 106 | """Split text it into chunks of a desired length trying to keep sentences intact.""" 107 | text = re.sub(r'\n\n+', '\n', text) 108 | text = re.sub(r'\s+', ' ', text) 109 | text = re.sub(r'[""]', '"', text) 110 | text = re.sub(r'([,.?!])', r'\1 ', text) 111 | text = re.sub(r'\s+', ' ', text) 112 | 113 | rv = [] 114 | in_quote = False 115 | current = "" 116 | split_pos = [] 117 | pos = -1 118 | end_pos = len(text) - 1 119 | def seek(delta): 120 | nonlocal pos, in_quote, current 121 | is_neg = delta < 0 122 | for _ in range(abs(delta)): 123 | if is_neg: 124 | pos -= 1 125 | current = current[:-1] 126 | else: 127 | pos += 1 128 | current += text[pos] 129 | if text[pos] == '"': 130 | in_quote = not in_quote 131 | return text[pos] 132 | def peek(delta): 133 | p = pos + delta 134 | return text[p] if p < end_pos and p >= 0 else "" 135 | def commit(): 136 | nonlocal rv, current, split_pos 137 | rv.append(current) 138 | current = "" 139 | split_pos = [] 140 | while pos < end_pos: 141 | c = seek(1) 142 | if len(current) >= max_length: 143 | if len(split_pos) > 0 and len(current) > (desired_length / 2): 144 | d = pos - split_pos[-1] 145 | seek(-d) 146 | else: 147 | while c not in '!?.\n ' and pos > 0 and len(current) > desired_length: 148 | c = seek(-1) 149 | commit() 150 | elif not in_quote and (c in '!?\n' or (c in '.,' and peek(1) in '\n ')): 151 | while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.': 152 | c = seek(1) 153 | split_pos.append(pos) 154 | if len(current) >= desired_length: 155 | commit() 156 | elif in_quote and peek(1) == '"' and peek(2) in '\n ': 157 | seek(2) 158 | split_pos.append(pos) 159 | rv.append(current) 160 | rv = [s.strip() for s in rv] 161 | rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)] 162 | return rv 163 | 164 | 165 | if __name__ == '__main__': 166 | zh_text = "好的,我来给你讲一个故事吧。从前有一个小姑娘,她叫做小红。小红非常喜欢在森林里玩耍,她经常会和她的小伙伴们一起去探险。有一天,小红和她的小伙伴们走到了森林深处,突然遇到了一只凶猛的野兽。小红的小伙伴们都吓得不敢动弹,但是小红并没有被吓倒,她勇敢地走向野兽,用她的智慧和勇气成功地制服了野兽,保护了她的小伙伴们。从那以后,小红变得更加勇敢和自信,成为了她小伙伴们心中的英雄。" 167 | en_text = "I didn’t know what to do. I said please kill her because it would be better than being kidnapped,” Ben, whose surname CNN is not using for security concerns, said on Wednesday. “It’s a nightmare. I said ‘please kill her, don’t take her there.’" 168 | sp_text = "¡Claro! ¿En qué tema te gustaría que te hable en español? Puedo proporcionarte información o conversar contigo sobre una amplia variedad de temas, desde cultura y comida hasta viajes y tecnología. ¿Tienes alguna preferencia en particular?" 169 | fr_text = "Bien sûr ! En quelle matière voudriez-vous que je vous parle en français ? Je peux vous fournir des informations ou discuter avec vous sur une grande variété de sujets, que ce soit la culture, la nourriture, les voyages ou la technologie. Avez-vous une préférence particulière ?" 
170 | 171 | print(split_sentence(zh_text, language_str='ZH')) 172 | print(split_sentence(en_text, language_str='EN')) 173 | print(split_sentence(sp_text, language_str='SP')) 174 | print(split_sentence(fr_text, language_str='FR')) 175 | -------------------------------------------------------------------------------- /melo/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language, symbol_to_id=None): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | symbol_to_id_map = symbol_to_id if symbol_to_id else _symbol_to_id 15 | phones = [symbol_to_id_map[symbol] for symbol in cleaned_text] 16 | tone_start = language_tone_start_map[language] 17 | tones = [i + tone_start for i in tones] 18 | lang_id = language_id_map[language] 19 | lang_ids = [lang_id for i in phones] 20 | return phones, tones, lang_ids 21 | 22 | 23 | def get_bert(norm_text, word2ph, language, device): 24 | from .chinese_bert import get_bert_feature as zh_bert 25 | from .english_bert import get_bert_feature as en_bert 26 | from .japanese_bert import get_bert_feature as jp_bert 27 | from .chinese_mix import get_bert_feature as zh_mix_en_bert 28 | from .spanish_bert import get_bert_feature as sp_bert 29 | from .french_bert import get_bert_feature as fr_bert 30 | from .korean import get_bert_feature as kr_bert 31 | 32 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert, 'ZH_MIX_EN': zh_mix_en_bert, 33 | 'FR': fr_bert, 'SP': sp_bert, 'ES': sp_bert, "KR": kr_bert} 34 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 35 | return bert 36 | -------------------------------------------------------------------------------- /melo/text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | from .symbols import punctuation 8 | from .tone_sandhi import ToneSandhi 9 | 10 | current_file_path = os.path.dirname(__file__) 11 | pinyin_to_symbol_map = { 12 | line.split("\t")[0]: line.strip().split("\t")[1] 13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 14 | } 15 | 16 | import jieba.posseg as psg 17 | 18 | 19 | rep_map = { 20 | ":": ",", 21 | ";": ",", 22 | ",": ",", 23 | "。": ".", 24 | "!": "!", 25 | "?": "?", 26 | "\n": ".", 27 | "·": ",", 28 | "、": ",", 29 | "...": "…", 30 | "$": ".", 31 | "“": "'", 32 | "”": "'", 33 | "‘": "'", 34 | "’": "'", 35 | "(": "'", 36 | ")": "'", 37 | "(": "'", 38 | ")": "'", 39 | "《": "'", 40 | "》": "'", 41 | "【": "'", 42 | "】": "'", 43 | "[": "'", 44 | "]": "'", 45 | "—": "-", 46 | "~": "-", 47 | "~": "-", 48 | "「": "'", 49 | "」": "'", 50 | } 51 | 52 | tone_modifier = ToneSandhi() 53 | 54 | 55 | def replace_punctuation(text): 56 | text = text.replace("嗯", "恩").replace("呣", "母") 57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 58 | 59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 60 | 61 | replaced_text = re.sub( 62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 63 | ) 64 | 65 | return replaced_text 66 | 67 | 68 | def g2p(text): 69 | pattern = 
r"(?<=[{0}])\s*".format("".join(punctuation)) 70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 71 | phones, tones, word2ph = _g2p(sentences) 72 | assert sum(word2ph) == len(phones) 73 | assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 74 | phones = ["_"] + phones + ["_"] 75 | tones = [0] + tones + [0] 76 | word2ph = [1] + word2ph + [1] 77 | return phones, tones, word2ph 78 | 79 | 80 | def _get_initials_finals(word): 81 | initials = [] 82 | finals = [] 83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 84 | orig_finals = lazy_pinyin( 85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 86 | ) 87 | for c, v in zip(orig_initials, orig_finals): 88 | initials.append(c) 89 | finals.append(v) 90 | return initials, finals 91 | 92 | 93 | def _g2p(segments): 94 | phones_list = [] 95 | tones_list = [] 96 | word2ph = [] 97 | for seg in segments: 98 | # Replace all English words in the sentence 99 | seg = re.sub("[a-zA-Z]+", "", seg) 100 | seg_cut = psg.lcut(seg) 101 | initials = [] 102 | finals = [] 103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 104 | for word, pos in seg_cut: 105 | if pos == "eng": 106 | import pdb; pdb.set_trace() 107 | continue 108 | sub_initials, sub_finals = _get_initials_finals(word) 109 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 110 | initials.append(sub_initials) 111 | finals.append(sub_finals) 112 | 113 | # assert len(sub_initials) == len(sub_finals) == len(word) 114 | initials = sum(initials, []) 115 | finals = sum(finals, []) 116 | # 117 | for c, v in zip(initials, finals): 118 | raw_pinyin = c + v 119 | # NOTE: post process for pypinyin outputs 120 | # we discriminate i, ii and iii 121 | if c == v: 122 | assert c in punctuation 123 | phone = [c] 124 | tone = "0" 125 | word2ph.append(1) 126 | else: 127 | v_without_tone = v[:-1] 128 | tone = v[-1] 129 | 130 | pinyin = c + v_without_tone 131 | assert tone in "12345" 132 | 133 | if c: 134 | # 多音节 135 | v_rep_map = { 136 | "uei": "ui", 137 | "iou": "iu", 138 | "uen": "un", 139 | } 140 | if v_without_tone in v_rep_map.keys(): 141 | pinyin = c + v_rep_map[v_without_tone] 142 | else: 143 | # 单音节 144 | pinyin_rep_map = { 145 | "ing": "ying", 146 | "i": "yi", 147 | "in": "yin", 148 | "u": "wu", 149 | } 150 | if pinyin in pinyin_rep_map.keys(): 151 | pinyin = pinyin_rep_map[pinyin] 152 | else: 153 | single_rep_map = { 154 | "v": "yu", 155 | "e": "e", 156 | "i": "y", 157 | "u": "w", 158 | } 159 | if pinyin[0] in single_rep_map.keys(): 160 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 161 | 162 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 163 | phone = pinyin_to_symbol_map[pinyin].split(" ") 164 | word2ph.append(len(phone)) 165 | 166 | phones_list += phone 167 | tones_list += [int(tone)] * len(phone) 168 | return phones_list, tones_list, word2ph 169 | 170 | 171 | def text_normalize(text): 172 | numbers = re.findall(r"\d+(?:\.?\d+)?", text) 173 | for number in numbers: 174 | text = text.replace(number, cn2an.an2cn(number), 1) 175 | text = replace_punctuation(text) 176 | return text 177 | 178 | 179 | def get_bert_feature(text, word2ph, device=None): 180 | from text import chinese_bert 181 | 182 | return chinese_bert.get_bert_feature(text, word2ph, device=device) 183 | 184 | 185 | if __name__ == "__main__": 186 | from text.chinese_bert import get_bert_feature 187 | 188 | text = "啊!chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏" 189 | text = text_normalize(text) 
190 | print(text) 191 | phones, tones, word2ph = g2p(text) 192 | bert = get_bert_feature(text, word2ph) 193 | 194 | print(phones, tones, word2ph, bert.shape) 195 | 196 | 197 | # # 示例用法 198 | # text = "这是一个示例文本:,你好!这是一个测试...." 199 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 200 | -------------------------------------------------------------------------------- /melo/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | 6 | # model_id = 'hfl/chinese-roberta-wwm-ext-large' 7 | local_path = "./bert/chinese-roberta-wwm-ext-large" 8 | 9 | 10 | tokenizers = {} 11 | models = {} 12 | 13 | def get_bert_feature(text, word2ph, device=None, model_id='hfl/chinese-roberta-wwm-ext-large'): 14 | if model_id not in models: 15 | models[model_id] = AutoModelForMaskedLM.from_pretrained( 16 | model_id 17 | ).to(device) 18 | tokenizers[model_id] = AutoTokenizer.from_pretrained(model_id) 19 | model = models[model_id] 20 | tokenizer = tokenizers[model_id] 21 | 22 | if ( 23 | sys.platform == "darwin" 24 | and torch.backends.mps.is_available() 25 | and device == "cpu" 26 | ): 27 | device = "mps" 28 | if not device: 29 | device = "cuda" 30 | 31 | with torch.no_grad(): 32 | inputs = tokenizer(text, return_tensors="pt") 33 | for i in inputs: 34 | inputs[i] = inputs[i].to(device) 35 | res = model(**inputs, output_hidden_states=True) 36 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 37 | # import pdb; pdb.set_trace() 38 | # assert len(word2ph) == len(text) + 2 39 | word2phone = word2ph 40 | phone_level_feature = [] 41 | for i in range(len(word2phone)): 42 | repeat_feature = res[i].repeat(word2phone[i], 1) 43 | phone_level_feature.append(repeat_feature) 44 | 45 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 46 | return phone_level_feature.T 47 | 48 | 49 | if __name__ == "__main__": 50 | import torch 51 | 52 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 53 | word2phone = [ 54 | 1, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 1, 63 | 2, 64 | 2, 65 | 1, 66 | 2, 67 | 2, 68 | 2, 69 | 2, 70 | 2, 71 | 1, 72 | 1, 73 | 2, 74 | 2, 75 | 1, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 2, 86 | 1, 87 | 2, 88 | 2, 89 | 2, 90 | 2, 91 | 1, 92 | ] 93 | 94 | # 计算总帧数 95 | total_frames = sum(word2phone) 96 | print(word_level_feature.shape) 97 | print(word2phone) 98 | phone_level_feature = [] 99 | for i in range(len(word2phone)): 100 | print(word_level_feature[i].shape) 101 | 102 | # 对每个词重复word2phone[i]次 103 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 104 | phone_level_feature.append(repeat_feature) 105 | 106 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 107 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 108 | -------------------------------------------------------------------------------- /melo/text/chinese_mix.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | # from text.symbols import punctuation 8 | from .symbols import language_tone_start_map 9 | from .tone_sandhi import ToneSandhi 10 | from .english import g2p as g2p_en 11 | from transformers import AutoTokenizer 12 | 13 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 14 | current_file_path = os.path.dirname(__file__) 15 | pinyin_to_symbol_map = { 16 | 
line.split("\t")[0]: line.strip().split("\t")[1] 17 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 18 | } 19 | 20 | import jieba.posseg as psg 21 | 22 | 23 | rep_map = { 24 | ":": ",", 25 | ";": ",", 26 | ",": ",", 27 | "。": ".", 28 | "!": "!", 29 | "?": "?", 30 | "\n": ".", 31 | "·": ",", 32 | "、": ",", 33 | "...": "…", 34 | "$": ".", 35 | "“": "'", 36 | "”": "'", 37 | "‘": "'", 38 | "’": "'", 39 | "(": "'", 40 | ")": "'", 41 | "(": "'", 42 | ")": "'", 43 | "《": "'", 44 | "》": "'", 45 | "【": "'", 46 | "】": "'", 47 | "[": "'", 48 | "]": "'", 49 | "—": "-", 50 | "~": "-", 51 | "~": "-", 52 | "「": "'", 53 | "」": "'", 54 | } 55 | 56 | tone_modifier = ToneSandhi() 57 | 58 | 59 | def replace_punctuation(text): 60 | text = text.replace("嗯", "恩").replace("呣", "母") 61 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 62 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 63 | replaced_text = re.sub(r"[^\u4e00-\u9fa5_a-zA-Z\s" + "".join(punctuation) + r"]+", "", replaced_text) 64 | replaced_text = re.sub(r"[\s]+", " ", replaced_text) 65 | 66 | return replaced_text 67 | 68 | 69 | def g2p(text, impl='v2'): 70 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 71 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 72 | if impl == 'v1': 73 | _func = _g2p 74 | elif impl == 'v2': 75 | _func = _g2p_v2 76 | else: 77 | raise NotImplementedError() 78 | phones, tones, word2ph = _func(sentences) 79 | assert sum(word2ph) == len(phones) 80 | # assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 81 | phones = ["_"] + phones + ["_"] 82 | tones = [0] + tones + [0] 83 | word2ph = [1] + word2ph + [1] 84 | return phones, tones, word2ph 85 | 86 | 87 | def _get_initials_finals(word): 88 | initials = [] 89 | finals = [] 90 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 91 | orig_finals = lazy_pinyin( 92 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 93 | ) 94 | for c, v in zip(orig_initials, orig_finals): 95 | initials.append(c) 96 | finals.append(v) 97 | return initials, finals 98 | 99 | model_id = 'bert-base-multilingual-uncased' 100 | tokenizer = AutoTokenizer.from_pretrained(model_id) 101 | def _g2p(segments): 102 | phones_list = [] 103 | tones_list = [] 104 | word2ph = [] 105 | for seg in segments: 106 | # Replace all English words in the sentence 107 | # seg = re.sub("[a-zA-Z]+", "", seg) 108 | seg_cut = psg.lcut(seg) 109 | initials = [] 110 | finals = [] 111 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 112 | for word, pos in seg_cut: 113 | if pos == "eng": 114 | initials.append(['EN_WORD']) 115 | finals.append([word]) 116 | else: 117 | sub_initials, sub_finals = _get_initials_finals(word) 118 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 119 | initials.append(sub_initials) 120 | finals.append(sub_finals) 121 | 122 | # assert len(sub_initials) == len(sub_finals) == len(word) 123 | initials = sum(initials, []) 124 | finals = sum(finals, []) 125 | # 126 | for c, v in zip(initials, finals): 127 | if c == 'EN_WORD': 128 | tokenized_en = tokenizer.tokenize(v) 129 | phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en) 130 | # apply offset to tones_en 131 | tones_en = [t + language_tone_start_map['EN'] for t in tones_en] 132 | phones_list += phones_en 133 | tones_list += tones_en 134 | word2ph += word2ph_en 135 | else: 136 | raw_pinyin = c + v 137 | # NOTE: post 
process for pypinyin outputs 138 | # we discriminate i, ii and iii 139 | if c == v: 140 | assert c in punctuation 141 | phone = [c] 142 | tone = "0" 143 | word2ph.append(1) 144 | else: 145 | v_without_tone = v[:-1] 146 | tone = v[-1] 147 | 148 | pinyin = c + v_without_tone 149 | assert tone in "12345" 150 | 151 | if c: 152 | # 多音节 153 | v_rep_map = { 154 | "uei": "ui", 155 | "iou": "iu", 156 | "uen": "un", 157 | } 158 | if v_without_tone in v_rep_map.keys(): 159 | pinyin = c + v_rep_map[v_without_tone] 160 | else: 161 | # 单音节 162 | pinyin_rep_map = { 163 | "ing": "ying", 164 | "i": "yi", 165 | "in": "yin", 166 | "u": "wu", 167 | } 168 | if pinyin in pinyin_rep_map.keys(): 169 | pinyin = pinyin_rep_map[pinyin] 170 | else: 171 | single_rep_map = { 172 | "v": "yu", 173 | "e": "e", 174 | "i": "y", 175 | "u": "w", 176 | } 177 | if pinyin[0] in single_rep_map.keys(): 178 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 179 | 180 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 181 | phone = pinyin_to_symbol_map[pinyin].split(" ") 182 | word2ph.append(len(phone)) 183 | 184 | phones_list += phone 185 | tones_list += [int(tone)] * len(phone) 186 | return phones_list, tones_list, word2ph 187 | 188 | 189 | def text_normalize(text): 190 | numbers = re.findall(r"\d+(?:\.?\d+)?", text) 191 | for number in numbers: 192 | text = text.replace(number, cn2an.an2cn(number), 1) 193 | text = replace_punctuation(text) 194 | return text 195 | 196 | 197 | def get_bert_feature(text, word2ph, device): 198 | from . import chinese_bert 199 | return chinese_bert.get_bert_feature(text, word2ph, model_id='bert-base-multilingual-uncased', device=device) 200 | 201 | from .chinese import _g2p as _chinese_g2p 202 | def _g2p_v2(segments): 203 | spliter = '#$&^!@' 204 | 205 | phones_list = [] 206 | tones_list = [] 207 | word2ph = [] 208 | 209 | for text in segments: 210 | assert spliter not in text 211 | # replace all english words 212 | text = re.sub('([a-zA-Z\s]+)', lambda x: f'{spliter}{x.group(1)}{spliter}', text) 213 | texts = text.split(spliter) 214 | texts = [t for t in texts if len(t) > 0] 215 | 216 | 217 | for text in texts: 218 | if re.match('[a-zA-Z\s]+', text): 219 | # english 220 | tokenized_en = tokenizer.tokenize(text) 221 | phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en) 222 | # apply offset to tones_en 223 | tones_en = [t + language_tone_start_map['EN'] for t in tones_en] 224 | phones_list += phones_en 225 | tones_list += tones_en 226 | word2ph += word2ph_en 227 | else: 228 | phones_zh, tones_zh, word2ph_zh = _chinese_g2p([text]) 229 | phones_list += phones_zh 230 | tones_list += tones_zh 231 | word2ph += word2ph_zh 232 | return phones_list, tones_list, word2ph 233 | 234 | 235 | 236 | if __name__ == "__main__": 237 | # from text.chinese_bert import get_bert_feature 238 | 239 | text = "NFT啊!chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏" 240 | text = '我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。' 241 | text = '今天下午,我们准备去shopping mall购物,然后晚上去看一场movie。' 242 | text = '我们现在 also 能够 help 很多公司 use some machine learning 的 algorithms 啊!' 243 | text = text_normalize(text) 244 | print(text) 245 | phones, tones, word2ph = g2p(text, impl='v2') 246 | bert = get_bert_feature(text, word2ph, device='cuda:0') 247 | print(phones) 248 | import pdb; pdb.set_trace() 249 | 250 | 251 | # # 示例用法 252 | # text = "这是一个示例文本:,你好!这是一个测试...." 
253 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 254 | -------------------------------------------------------------------------------- /melo/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, english, chinese_mix, korean, french, spanish 2 | from . import cleaned_text_to_sequence 3 | import copy 4 | 5 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english, 'ZH_MIX_EN': chinese_mix, 'KR': korean, 6 | 'FR': french, 'SP': spanish, 'ES': spanish} 7 | 8 | 9 | def clean_text(text, language): 10 | language_module = language_module_map[language] 11 | norm_text = language_module.text_normalize(text) 12 | phones, tones, word2ph = language_module.g2p(norm_text) 13 | return norm_text, phones, tones, word2ph 14 | 15 | 16 | def clean_text_bert(text, language, device=None): 17 | language_module = language_module_map[language] 18 | norm_text = language_module.text_normalize(text) 19 | phones, tones, word2ph = language_module.g2p(norm_text) 20 | 21 | word2ph_bak = copy.deepcopy(word2ph) 22 | for i in range(len(word2ph)): 23 | word2ph[i] = word2ph[i] * 2 24 | word2ph[0] += 1 25 | bert = language_module.get_bert_feature(norm_text, word2ph, device=device) 26 | 27 | return norm_text, phones, tones, word2ph_bak, bert 28 | 29 | 30 | def text_to_sequence(text, language): 31 | norm_text, phones, tones, word2ph = clean_text(text, language) 32 | return cleaned_text_to_sequence(phones, tones, language) 33 | 34 | 35 | if __name__ == "__main__": 36 | pass -------------------------------------------------------------------------------- /melo/text/cleaner_multiling.py: -------------------------------------------------------------------------------- 1 | """Set of default text cleaners""" 2 | # TODO: pick the cleaner for languages dynamically 3 | 4 | import re 5 | 6 | # Regular expression matching whitespace: 7 | _whitespace_re = re.compile(r"\s+") 8 | 9 | rep_map = { 10 | ":": ",", 11 | ";": ",", 12 | ",": ",", 13 | "。": ".", 14 | "!": "!", 15 | "?": "?", 16 | "\n": ".", 17 | "·": ",", 18 | "、": ",", 19 | "...": ".", 20 | "…": ".", 21 | "$": ".", 22 | "“": "'", 23 | "”": "'", 24 | "‘": "'", 25 | "’": "'", 26 | "(": "'", 27 | ")": "'", 28 | "(": "'", 29 | ")": "'", 30 | "《": "'", 31 | "》": "'", 32 | "【": "'", 33 | "】": "'", 34 | "[": "'", 35 | "]": "'", 36 | "—": "", 37 | "~": "-", 38 | "~": "-", 39 | "「": "'", 40 | "」": "'", 41 | } 42 | 43 | def replace_punctuation(text): 44 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 45 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 46 | return replaced_text 47 | 48 | def lowercase(text): 49 | return text.lower() 50 | 51 | 52 | def collapse_whitespace(text): 53 | return re.sub(_whitespace_re, " ", text).strip() 54 | 55 | def remove_punctuation_at_begin(text): 56 | return re.sub(r'^[,.!?]+', '', text) 57 | 58 | def remove_aux_symbols(text): 59 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text) 60 | return text 61 | 62 | 63 | def replace_symbols(text, lang="en"): 64 | """Replace symbols based on the lenguage tag. 65 | 66 | Args: 67 | text: 68 | Input text. 69 | lang: 70 | Lenguage identifier. ex: "en", "fr", "pt", "ca". 
71 | 72 | Returns: 73 | The modified text 74 | example: 75 | input args: 76 | text: "si l'avi cau, diguem-ho" 77 | lang: "ca" 78 | Output: 79 | text: "si lavi cau, diguemho" 80 | """ 81 | text = text.replace(";", ",") 82 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "") 83 | text = text.replace(":", ",") 84 | if lang == "en": 85 | text = text.replace("&", " and ") 86 | elif lang == "fr": 87 | text = text.replace("&", " et ") 88 | elif lang == "pt": 89 | text = text.replace("&", " e ") 90 | elif lang == "ca": 91 | text = text.replace("&", " i ") 92 | text = text.replace("'", "") 93 | elif lang== "es": 94 | text=text.replace("&","y") 95 | text = text.replace("'", "") 96 | return text 97 | 98 | def unicleaners(text, cased=False, lang='en'): 99 | """Basic pipeline for Portuguese text. There is no need to expand abbreviation and 100 | numbers, phonemizer already does that""" 101 | if not cased: 102 | text = lowercase(text) 103 | text = replace_punctuation(text) 104 | text = replace_symbols(text, lang=lang) 105 | text = remove_aux_symbols(text) 106 | text = remove_punctuation_at_begin(text) 107 | text = collapse_whitespace(text) 108 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text) 109 | return text 110 | 111 | -------------------------------------------------------------------------------- /melo/text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /melo/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from . 
import symbols 7 | 8 | from .english_utils.abbreviations import expand_abbreviations 9 | from .english_utils.time_norm import expand_time_english 10 | from .english_utils.number_norm import normalize_numbers 11 | from .japanese import distribute_phone 12 | 13 | from transformers import AutoTokenizer 14 | 15 | current_file_path = os.path.dirname(__file__) 16 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 17 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 18 | _g2p = G2p() 19 | 20 | arpa = { 21 | "AH0", 22 | "S", 23 | "AH1", 24 | "EY2", 25 | "AE2", 26 | "EH0", 27 | "OW2", 28 | "UH0", 29 | "NG", 30 | "B", 31 | "G", 32 | "AY0", 33 | "M", 34 | "AA0", 35 | "F", 36 | "AO0", 37 | "ER2", 38 | "UH1", 39 | "IY1", 40 | "AH2", 41 | "DH", 42 | "IY0", 43 | "EY1", 44 | "IH0", 45 | "K", 46 | "N", 47 | "W", 48 | "IY2", 49 | "T", 50 | "AA1", 51 | "ER1", 52 | "EH2", 53 | "OY0", 54 | "UH2", 55 | "UW1", 56 | "Z", 57 | "AW2", 58 | "AW1", 59 | "V", 60 | "UW2", 61 | "AA2", 62 | "ER", 63 | "AW0", 64 | "UW0", 65 | "R", 66 | "OW1", 67 | "EH1", 68 | "ZH", 69 | "AE0", 70 | "IH2", 71 | "IH", 72 | "Y", 73 | "JH", 74 | "P", 75 | "AY1", 76 | "EY0", 77 | "OY2", 78 | "TH", 79 | "HH", 80 | "D", 81 | "ER0", 82 | "CH", 83 | "AO1", 84 | "AE1", 85 | "AO2", 86 | "OY1", 87 | "AY2", 88 | "IH1", 89 | "OW0", 90 | "L", 91 | "SH", 92 | } 93 | 94 | 95 | def post_replace_ph(ph): 96 | rep_map = { 97 | ":": ",", 98 | ";": ",", 99 | ",": ",", 100 | "。": ".", 101 | "!": "!", 102 | "?": "?", 103 | "\n": ".", 104 | "·": ",", 105 | "、": ",", 106 | "...": "…", 107 | "v": "V", 108 | } 109 | if ph in rep_map.keys(): 110 | ph = rep_map[ph] 111 | if ph in symbols: 112 | return ph 113 | if ph not in symbols: 114 | ph = "UNK" 115 | return ph 116 | 117 | 118 | def read_dict(): 119 | g2p_dict = {} 120 | start_line = 49 121 | with open(CMU_DICT_PATH) as f: 122 | line = f.readline() 123 | line_index = 1 124 | while line: 125 | if line_index >= start_line: 126 | line = line.strip() 127 | word_split = line.split(" ") 128 | word = word_split[0] 129 | 130 | syllable_split = word_split[1].split(" - ") 131 | g2p_dict[word] = [] 132 | for syllable in syllable_split: 133 | phone_split = syllable.split(" ") 134 | g2p_dict[word].append(phone_split) 135 | 136 | line_index = line_index + 1 137 | line = f.readline() 138 | 139 | return g2p_dict 140 | 141 | 142 | def cache_dict(g2p_dict, file_path): 143 | with open(file_path, "wb") as pickle_file: 144 | pickle.dump(g2p_dict, pickle_file) 145 | 146 | 147 | def get_dict(): 148 | if os.path.exists(CACHE_PATH): 149 | with open(CACHE_PATH, "rb") as pickle_file: 150 | g2p_dict = pickle.load(pickle_file) 151 | else: 152 | g2p_dict = read_dict() 153 | cache_dict(g2p_dict, CACHE_PATH) 154 | 155 | return g2p_dict 156 | 157 | 158 | eng_dict = get_dict() 159 | 160 | 161 | def refine_ph(phn): 162 | tone = 0 163 | if re.search(r"\d$", phn): 164 | tone = int(phn[-1]) + 1 165 | phn = phn[:-1] 166 | return phn.lower(), tone 167 | 168 | 169 | def refine_syllables(syllables): 170 | tones = [] 171 | phonemes = [] 172 | for phn_list in syllables: 173 | for i in range(len(phn_list)): 174 | phn = phn_list[i] 175 | phn, tone = refine_ph(phn) 176 | phonemes.append(phn) 177 | tones.append(tone) 178 | return phonemes, tones 179 | 180 | 181 | def text_normalize(text): 182 | text = text.lower() 183 | text = expand_time_english(text) 184 | text = normalize_numbers(text) 185 | text = expand_abbreviations(text) 186 | return text 187 | 188 | model_id = 'bert-base-uncased' 189 | tokenizer = 
AutoTokenizer.from_pretrained(model_id) 190 | def g2p_old(text): 191 | tokenized = tokenizer.tokenize(text) 192 | # import pdb; pdb.set_trace() 193 | phones = [] 194 | tones = [] 195 | words = re.split(r"([,;.\-\?\!\s+])", text) 196 | for w in words: 197 | if w.upper() in eng_dict: 198 | phns, tns = refine_syllables(eng_dict[w.upper()]) 199 | phones += phns 200 | tones += tns 201 | else: 202 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 203 | for ph in phone_list: 204 | if ph in arpa: 205 | ph, tn = refine_ph(ph) 206 | phones.append(ph) 207 | tones.append(tn) 208 | else: 209 | phones.append(ph) 210 | tones.append(0) 211 | # todo: implement word2ph 212 | word2ph = [1 for i in phones] 213 | 214 | phones = [post_replace_ph(i) for i in phones] 215 | return phones, tones, word2ph 216 | 217 | def g2p(text, pad_start_end=True, tokenized=None): 218 | if tokenized is None: 219 | tokenized = tokenizer.tokenize(text) 220 | # import pdb; pdb.set_trace() 221 | phs = [] 222 | ph_groups = [] 223 | for t in tokenized: 224 | if not t.startswith("#"): 225 | ph_groups.append([t]) 226 | else: 227 | ph_groups[-1].append(t.replace("#", "")) 228 | 229 | phones = [] 230 | tones = [] 231 | word2ph = [] 232 | for group in ph_groups: 233 | w = "".join(group) 234 | phone_len = 0 235 | word_len = len(group) 236 | if w.upper() in eng_dict: 237 | phns, tns = refine_syllables(eng_dict[w.upper()]) 238 | phones += phns 239 | tones += tns 240 | phone_len += len(phns) 241 | else: 242 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 243 | for ph in phone_list: 244 | if ph in arpa: 245 | ph, tn = refine_ph(ph) 246 | phones.append(ph) 247 | tones.append(tn) 248 | else: 249 | phones.append(ph) 250 | tones.append(0) 251 | phone_len += 1 252 | aaa = distribute_phone(phone_len, word_len) 253 | word2ph += aaa 254 | phones = [post_replace_ph(i) for i in phones] 255 | 256 | if pad_start_end: 257 | phones = ["_"] + phones + ["_"] 258 | tones = [0] + tones + [0] 259 | word2ph = [1] + word2ph + [1] 260 | return phones, tones, word2ph 261 | 262 | def get_bert_feature(text, word2ph, device=None): 263 | from text import english_bert 264 | 265 | return english_bert.get_bert_feature(text, word2ph, device=device) 266 | 267 | if __name__ == "__main__": 268 | # print(get_dict()) 269 | # print(eng_word_to_phoneme("hello")) 270 | from text.english_bert import get_bert_feature 271 | text = "In this paper, we propose 1 DSPGAN, a N-F-T GAN-based universal vocoder." 
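    # Added note: text_normalize() lowercases the sentence and expands times, numbers
    # and abbreviations (so "1" becomes "one") before g2p() tokenizes it with the
    # bert-base-uncased tokenizer and looks each word piece up in the CMU dictionary,
    # falling back to g2p_en for out-of-vocabulary words.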
272 | text = text_normalize(text) 273 | phones, tones, word2ph = g2p(text) 274 | import pdb; pdb.set_trace() 275 | bert = get_bert_feature(text, word2ph) 276 | 277 | print(phones, tones, word2ph, bert.shape) 278 | 279 | # all_phones = set() 280 | # for k, syllables in eng_dict.items(): 281 | # for group in syllables: 282 | # for ph in group: 283 | # all_phones.add(ph) 284 | # print(all_phones) 285 | -------------------------------------------------------------------------------- /melo/text/english_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | model_id = 'bert-base-uncased' 6 | tokenizer = AutoTokenizer.from_pretrained(model_id) 7 | model = None 8 | 9 | def get_bert_feature(text, word2ph, device=None): 10 | global model 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if model is None: 20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 21 | device 22 | ) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = model(**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert inputs["input_ids"].shape[-1] == len(word2ph) 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | -------------------------------------------------------------------------------- /melo/text/english_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/english_utils/__init__.py -------------------------------------------------------------------------------- /melo/text/english_utils/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in english: 4 | abbreviations_en = [ 5 | (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("mrs", "misess"), 8 | ("mr", "mister"), 9 | ("dr", "doctor"), 10 | ("st", "saint"), 11 | ("co", "company"), 12 | ("jr", "junior"), 13 | ("maj", "major"), 14 | ("gen", "general"), 15 | ("drs", "doctors"), 16 | ("rev", "reverend"), 17 | ("lt", "lieutenant"), 18 | ("hon", "honorable"), 19 | ("sgt", "sergeant"), 20 | ("capt", "captain"), 21 | ("esq", "esquire"), 22 | ("ltd", "limited"), 23 | ("col", "colonel"), 24 | ("ft", "fort"), 25 | ] 26 | ] 27 | 28 | def expand_abbreviations(text, lang="en"): 29 | if lang == "en": 30 | _abbreviations = abbreviations_en 31 | else: 32 | raise NotImplementedError() 33 | for regex, replacement in _abbreviations: 34 | text = re.sub(regex, replacement, text) 35 | return text -------------------------------------------------------------------------------- /melo/text/english_utils/number_norm.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | from typing import Dict 5 | 6 | import inflect 7 | 8 | _inflect = inflect.engine() 9 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 10 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 11 | _currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"-?[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def __expand_currency(value: str, inflection: Dict[float, str]) -> str: 25 | parts = value.replace(",", "").split(".") 26 | if len(parts) > 2: 27 | return f"{value} {inflection[2]}" # Unexpected format 28 | text = [] 29 | integer = int(parts[0]) if parts[0] else 0 30 | if integer > 0: 31 | integer_unit = inflection.get(integer, inflection[2]) 32 | text.append(f"{integer} {integer_unit}") 33 | fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0 34 | if fraction > 0: 35 | fraction_unit = inflection.get(fraction / 100, inflection[0.02]) 36 | text.append(f"{fraction} {fraction_unit}") 37 | if len(text) == 0: 38 | return f"zero {inflection[2]}" 39 | return " ".join(text) 40 | 41 | 42 | def _expand_currency(m: "re.Match") -> str: 43 | currencies = { 44 | "$": { 45 | 0.01: "cent", 46 | 0.02: "cents", 47 | 1: "dollar", 48 | 2: "dollars", 49 | }, 50 | "€": { 51 | 0.01: "cent", 52 | 0.02: "cents", 53 | 1: "euro", 54 | 2: "euros", 55 | }, 56 | "£": { 57 | 0.01: "penny", 58 | 0.02: "pence", 59 | 1: "pound sterling", 60 | 2: "pounds sterling", 61 | }, 62 | "¥": { 63 | # TODO rin 64 | 0.02: "sen", 65 | 2: "yen", 66 | }, 67 | } 68 | unit = m.group(1) 69 | currency = currencies[unit] 70 | value = m.group(2) 71 | return __expand_currency(value, currency) 72 | 73 | 74 | def _expand_ordinal(m): 75 | return _inflect.number_to_words(m.group(0)) 76 | 77 | 78 | def _expand_number(m): 79 | num = int(m.group(0)) 80 | if 1000 < num < 3000: 81 | if num == 2000: 82 | return "two thousand" 83 | if 2000 < num < 2010: 84 | return "two thousand " + _inflect.number_to_words(num % 100) 85 | if num % 100 == 0: 86 | return _inflect.number_to_words(num // 100) + " hundred" 87 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 88 | return _inflect.number_to_words(num, andword="") 89 | 90 | 91 | def normalize_numbers(text): 92 | text = re.sub(_comma_number_re, _remove_commas, text) 93 | text = re.sub(_currency_re, 
_expand_currency, text) 94 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 95 | text = re.sub(_ordinal_re, _expand_ordinal, text) 96 | text = re.sub(_number_re, _expand_number, text) 97 | return text -------------------------------------------------------------------------------- /melo/text/english_utils/time_norm.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | 5 | _inflect = inflect.engine() 6 | 7 | _time_re = re.compile( 8 | r"""\b 9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours 10 | : 11 | ([0-5][0-9]) # minutes 12 | \s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm 13 | \b""", 14 | re.IGNORECASE | re.X, 15 | ) 16 | 17 | 18 | def _expand_num(n: int) -> str: 19 | return _inflect.number_to_words(n) 20 | 21 | 22 | def _expand_time_english(match: "re.Match") -> str: 23 | hour = int(match.group(1)) 24 | past_noon = hour >= 12 25 | time = [] 26 | if hour > 12: 27 | hour -= 12 28 | elif hour == 0: 29 | hour = 12 30 | past_noon = True 31 | time.append(_expand_num(hour)) 32 | 33 | minute = int(match.group(6)) 34 | if minute > 0: 35 | if minute < 10: 36 | time.append("oh") 37 | time.append(_expand_num(minute)) 38 | am_pm = match.group(7) 39 | if am_pm is None: 40 | time.append("p m" if past_noon else "a m") 41 | else: 42 | time.extend(list(am_pm.replace(".", ""))) 43 | return " ".join(time) 44 | 45 | 46 | def expand_time_english(text: str) -> str: 47 | return re.sub(_time_re, _expand_time_english, text) -------------------------------------------------------------------------------- /melo/text/es_phonemizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/es_phonemizer/__init__.py -------------------------------------------------------------------------------- /melo/text/es_phonemizer/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import List, Tuple 3 | 4 | from .punctuation import Punctuation 5 | 6 | 7 | class BasePhonemizer(abc.ABC): 8 | """Base phonemizer class 9 | 10 | Phonemization follows the following steps: 11 | 1. Preprocessing: 12 | - remove empty lines 13 | - remove punctuation 14 | - keep track of punctuation marks 15 | 16 | 2. Phonemization: 17 | - convert text to phonemes 18 | 19 | 3. Postprocessing: 20 | - join phonemes 21 | - restore punctuation marks 22 | 23 | Args: 24 | language (str): 25 | Language used by the phonemizer. 26 | 27 | punctuations (List[str]): 28 | List of punctuation marks to be preserved. 29 | 30 | keep_puncs (bool): 31 | Whether to preserve punctuation marks or not. 
32 | """ 33 | 34 | def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): 35 | # ensure the backend is installed on the system 36 | if not self.is_available(): 37 | raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover 38 | 39 | # ensure the backend support the requested language 40 | self._language = self._init_language(language) 41 | 42 | # setup punctuation processing 43 | self._keep_puncs = keep_puncs 44 | self._punctuator = Punctuation(punctuations) 45 | 46 | def _init_language(self, language): 47 | """Language initialization 48 | 49 | This method may be overloaded in child classes (see Segments backend) 50 | 51 | """ 52 | if not self.is_supported_language(language): 53 | raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") 54 | return language 55 | 56 | @property 57 | def language(self): 58 | """The language code configured to be used for phonemization""" 59 | return self._language 60 | 61 | @staticmethod 62 | @abc.abstractmethod 63 | def name(): 64 | """The name of the backend""" 65 | ... 66 | 67 | @classmethod 68 | @abc.abstractmethod 69 | def is_available(cls): 70 | """Returns True if the backend is installed, False otherwise""" 71 | ... 72 | 73 | @classmethod 74 | @abc.abstractmethod 75 | def version(cls): 76 | """Return the backend version as a tuple (major, minor, patch)""" 77 | ... 78 | 79 | @staticmethod 80 | @abc.abstractmethod 81 | def supported_languages(): 82 | """Return a dict of language codes -> name supported by the backend""" 83 | ... 84 | 85 | def is_supported_language(self, language): 86 | """Returns True if `language` is supported by the backend""" 87 | return language in self.supported_languages() 88 | 89 | @abc.abstractmethod 90 | def _phonemize(self, text, separator): 91 | """The main phonemization method""" 92 | 93 | def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: 94 | """Preprocess the text before phonemization 95 | 96 | 1. remove spaces 97 | 2. remove punctuation 98 | 99 | Override this if you need a different behaviour 100 | """ 101 | text = text.strip() 102 | if self._keep_puncs: 103 | # a tuple (text, punctuation marks) 104 | return self._punctuator.strip_to_restore(text) 105 | return [self._punctuator.strip(text)], [] 106 | 107 | def _phonemize_postprocess(self, phonemized, punctuations) -> str: 108 | """Postprocess the raw phonemized output 109 | 110 | Override this if you need a different behaviour 111 | """ 112 | if self._keep_puncs: 113 | return self._punctuator.restore(phonemized, punctuations)[0] 114 | return phonemized[0] 115 | 116 | def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument 117 | """Returns the `text` phonemized for the given language 118 | 119 | Args: 120 | text (str): 121 | Text to be phonemized. 122 | 123 | separator (str): 124 | string separator used between phonemes. Default to '_'. 
125 | 126 | Returns: 127 | (str): Phonemized text 128 | """ 129 | text, punctuations = self._phonemize_preprocess(text) 130 | phonemized = [] 131 | for t in text: 132 | p = self._phonemize(t, separator) 133 | phonemized.append(p) 134 | phonemized = self._phonemize_postprocess(phonemized, punctuations) 135 | return phonemized 136 | 137 | def print_logs(self, level: int = 0): 138 | indent = "\t" * level 139 | print(f"{indent}| > phoneme language: {self.language}") 140 | print(f"{indent}| > phoneme backend: {self.name()}") -------------------------------------------------------------------------------- /melo/text/es_phonemizer/cleaner.py: -------------------------------------------------------------------------------- 1 | """Set of default text cleaners""" 2 | # TODO: pick the cleaner for languages dynamically 3 | 4 | import re 5 | 6 | # Regular expression matching whitespace: 7 | _whitespace_re = re.compile(r"\s+") 8 | 9 | rep_map = { 10 | ":": ",", 11 | ";": ",", 12 | ",": ",", 13 | "。": ".", 14 | "!": "!", 15 | "?": "?", 16 | "\n": ".", 17 | "·": ",", 18 | "、": ",", 19 | "...": ".", 20 | "…": ".", 21 | "$": ".", 22 | "“": "'", 23 | "”": "'", 24 | "‘": "'", 25 | "’": "'", 26 | "(": "'", 27 | ")": "'", 28 | "(": "'", 29 | ")": "'", 30 | "《": "'", 31 | "》": "'", 32 | "【": "'", 33 | "】": "'", 34 | "[": "'", 35 | "]": "'", 36 | "—": "", 37 | "~": "-", 38 | "~": "-", 39 | "「": "'", 40 | "」": "'", 41 | } 42 | 43 | def replace_punctuation(text): 44 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 45 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 46 | return replaced_text 47 | 48 | def lowercase(text): 49 | return text.lower() 50 | 51 | 52 | def collapse_whitespace(text): 53 | return re.sub(_whitespace_re, " ", text).strip() 54 | 55 | def remove_punctuation_at_begin(text): 56 | return re.sub(r'^[,.!?]+', '', text) 57 | 58 | def remove_aux_symbols(text): 59 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text) 60 | return text 61 | 62 | 63 | def replace_symbols(text, lang="en"): 64 | """Replace symbols based on the lenguage tag. 65 | 66 | Args: 67 | text: 68 | Input text. 69 | lang: 70 | Lenguage identifier. ex: "en", "fr", "pt", "ca". 71 | 72 | Returns: 73 | The modified text 74 | example: 75 | input args: 76 | text: "si l'avi cau, diguem-ho" 77 | lang: "ca" 78 | Output: 79 | text: "si lavi cau, diguemho" 80 | """ 81 | text = text.replace(";", ",") 82 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "") 83 | text = text.replace(":", ",") 84 | if lang == "en": 85 | text = text.replace("&", " and ") 86 | elif lang == "fr": 87 | text = text.replace("&", " et ") 88 | elif lang == "pt": 89 | text = text.replace("&", " e ") 90 | elif lang == "ca": 91 | text = text.replace("&", " i ") 92 | text = text.replace("'", "") 93 | elif lang== "es": 94 | text=text.replace("&","y") 95 | text = text.replace("'", "") 96 | return text 97 | 98 | def spanish_cleaners(text): 99 | """Basic pipeline for Portuguese text. 
There is no need to expand abbreviation and 100 | numbers, phonemizer already does that""" 101 | text = lowercase(text) 102 | text = replace_symbols(text, lang="es") 103 | text = replace_punctuation(text) 104 | text = remove_aux_symbols(text) 105 | text = remove_punctuation_at_begin(text) 106 | text = collapse_whitespace(text) 107 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text) 108 | return text 109 | 110 | -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_symbols.json: -------------------------------------------------------------------------------- 1 | { 2 | "symbols": [ 3 | "_", 4 | ",", 5 | ".", 6 | "!", 7 | "?", 8 | "-", 9 | "~", 10 | "\u2026", 11 | "N", 12 | "Q", 13 | "a", 14 | "b", 15 | "d", 16 | "e", 17 | "f", 18 | "g", 19 | "h", 20 | "i", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "o", 27 | "p", 28 | "s", 29 | "t", 30 | "u", 31 | "v", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "\u0251", 37 | "\u00e6", 38 | "\u0283", 39 | "\u0291", 40 | "\u00e7", 41 | "\u026f", 42 | "\u026a", 43 | "\u0254", 44 | "\u025b", 45 | "\u0279", 46 | "\u00f0", 47 | "\u0259", 48 | "\u026b", 49 | "\u0265", 50 | "\u0278", 51 | "\u028a", 52 | "\u027e", 53 | "\u0292", 54 | "\u03b8", 55 | "\u03b2", 56 | "\u014b", 57 | "\u0266", 58 | "\u207c", 59 | "\u02b0", 60 | "`", 61 | "^", 62 | "#", 63 | "*", 64 | "=", 65 | "\u02c8", 66 | "\u02cc", 67 | "\u2192", 68 | "\u2193", 69 | "\u2191", 70 | " ", 71 | "\u0263", 72 | "\u0261", 73 | "r", 74 | "\u0272", 75 | "\u029d", 76 | "\u028e", 77 | "\u02d0" 78 | ] 79 | } -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_symbols.txt: -------------------------------------------------------------------------------- 1 | _,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɡrɲʝɣʎː—¿¡ -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_symbols_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "symbols": [ 3 | "_", 4 | ",", 5 | ".", 6 | "!", 7 | "?", 8 | "-", 9 | "~", 10 | "\u2026", 11 | "N", 12 | "Q", 13 | "a", 14 | "b", 15 | "d", 16 | "e", 17 | "f", 18 | "g", 19 | "h", 20 | "i", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "o", 27 | "p", 28 | "s", 29 | "t", 30 | "u", 31 | "v", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "\u0251", 37 | "\u00e6", 38 | "\u0283", 39 | "\u0291", 40 | "\u00e7", 41 | "\u026f", 42 | "\u026a", 43 | "\u0254", 44 | "\u025b", 45 | "\u0279", 46 | "\u00f0", 47 | "\u0259", 48 | "\u026b", 49 | "\u0265", 50 | "\u0278", 51 | "\u028a", 52 | "\u027e", 53 | "\u0292", 54 | "\u03b8", 55 | "\u03b2", 56 | "\u014b", 57 | "\u0266", 58 | "\u207c", 59 | "\u02b0", 60 | "`", 61 | "^", 62 | "#", 63 | "*", 64 | "=", 65 | "\u02c8", 66 | "\u02cc", 67 | "\u2192", 68 | "\u2193", 69 | "\u2191", 70 | " ", 71 | "\u0261", 72 | "r", 73 | "\u0272", 74 | "\u029d", 75 | "\u0263", 76 | "\u028e", 77 | "\u02d0", 78 | 79 | "\u2014", 80 | "\u00bf", 81 | "\u00a1" 82 | ] 83 | } -------------------------------------------------------------------------------- /melo/text/es_phonemizer/es_to_ipa.py: -------------------------------------------------------------------------------- 1 | from .cleaner import spanish_cleaners 2 | from .gruut_wrapper import Gruut 3 | 4 | def es2ipa(text): 5 | e = Gruut(language="es-es", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 6 | # text = spanish_cleaners(text) 7 | phonemes = e.phonemize(text, 
separator="") 8 | return phonemes 9 | 10 | 11 | if __name__ == '__main__': 12 | print(es2ipa('¿Y a quién echaría de menos, en el mundo si no fuese a vos?')) -------------------------------------------------------------------------------- /melo/text/es_phonemizer/gruut_wrapper.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from typing import List 3 | 4 | import gruut 5 | from gruut_ipa import IPA # pip install gruut_ipa 6 | 7 | from .base import BasePhonemizer 8 | from .punctuation import Punctuation 9 | 10 | # Table for str.translate to fix gruut/TTS phoneme mismatch 11 | GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") 12 | 13 | 14 | class Gruut(BasePhonemizer): 15 | """Gruut wrapper for G2P 16 | 17 | Args: 18 | language (str): 19 | Valid language code for the used backend. 20 | 21 | punctuations (str): 22 | Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`. 23 | 24 | keep_puncs (bool): 25 | If true, keep the punctuations after phonemization. Defaults to True. 26 | 27 | use_espeak_phonemes (bool): 28 | If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False. 29 | 30 | keep_stress (bool): 31 | If true, keep the stress characters after phonemization. Defaults to False. 32 | 33 | Example: 34 | 35 | >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut 36 | >>> phonemizer = Gruut('en-us') 37 | >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|") 38 | 'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?' 39 | """ 40 | 41 | def __init__( 42 | self, 43 | language: str, 44 | punctuations=Punctuation.default_puncs(), 45 | keep_puncs=True, 46 | use_espeak_phonemes=False, 47 | keep_stress=False, 48 | ): 49 | super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) 50 | self.use_espeak_phonemes = use_espeak_phonemes 51 | self.keep_stress = keep_stress 52 | 53 | @staticmethod 54 | def name(): 55 | return "gruut" 56 | 57 | def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument 58 | """Convert input text to phonemes. 59 | 60 | Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters 61 | that constitude a single sound. 62 | 63 | It doesn't affect 🐸TTS since it individually converts each character to token IDs. 64 | 65 | Examples:: 66 | "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ` 67 | 68 | Args: 69 | text (str): 70 | Text to be converted to phonemes. 71 | 72 | tie (bool, optional) : When True use a '͡' character between 73 | consecutive characters of a single phoneme. Else separate phoneme 74 | with '_'. This option requires espeak>=1.49. Default to False. 
75 | """ 76 | ph_list = [] 77 | for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes): 78 | for word in sentence: 79 | if word.is_break: 80 | # Use actual character for break phoneme (e.g., comma) 81 | if ph_list: 82 | # Join with previous word 83 | ph_list[-1].append(word.text) 84 | else: 85 | # First word is punctuation 86 | ph_list.append([word.text]) 87 | elif word.phonemes: 88 | # Add phonemes for word 89 | word_phonemes = [] 90 | 91 | for word_phoneme in word.phonemes: 92 | if not self.keep_stress: 93 | # Remove primary/secondary stress 94 | word_phoneme = IPA.without_stress(word_phoneme) 95 | 96 | word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE) 97 | 98 | if word_phoneme: 99 | # Flatten phonemes 100 | word_phonemes.extend(word_phoneme) 101 | 102 | if word_phonemes: 103 | ph_list.append(word_phonemes) 104 | 105 | ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list] 106 | ph = f"{separator} ".join(ph_words) 107 | return ph 108 | 109 | def _phonemize(self, text, separator): 110 | return self.phonemize_gruut(text, separator, tie=False) 111 | 112 | def is_supported_language(self, language): 113 | """Returns True if `language` is supported by the backend""" 114 | return gruut.is_language_supported(language) 115 | 116 | @staticmethod 117 | def supported_languages() -> List: 118 | """Get a dictionary of supported languages. 119 | 120 | Returns: 121 | List: List of language codes. 122 | """ 123 | return list(gruut.get_supported_languages()) 124 | 125 | def version(self): 126 | """Get the version of the used backend. 127 | 128 | Returns: 129 | str: Version of the used backend. 130 | """ 131 | return gruut.__version__ 132 | 133 | @classmethod 134 | def is_available(cls): 135 | """Return true if ESpeak is available else false""" 136 | return importlib.util.find_spec("gruut") is not None 137 | 138 | 139 | if __name__ == "__main__": 140 | from es_to_ipa import es2ipa 141 | import json 142 | 143 | e = Gruut(language="es-es", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 144 | symbols = [ 145 | "_", 146 | ",", 147 | ".", 148 | "!", 149 | "?", 150 | "-", 151 | "~", 152 | "\u2026", 153 | "N", 154 | "Q", 155 | "a", 156 | "b", 157 | "d", 158 | "e", 159 | "f", 160 | "g", 161 | "h", 162 | "i", 163 | "j", 164 | "k", 165 | "l", 166 | "m", 167 | "n", 168 | "o", 169 | "p", 170 | "s", 171 | "t", 172 | "u", 173 | "v", 174 | "w", 175 | "x", 176 | "y", 177 | "z", 178 | "\u0251", 179 | "\u00e6", 180 | "\u0283", 181 | "\u0291", 182 | "\u00e7", 183 | "\u026f", 184 | "\u026a", 185 | "\u0254", 186 | "\u025b", 187 | "\u0279", 188 | "\u00f0", 189 | "\u0259", 190 | "\u026b", 191 | "\u0265", 192 | "\u0278", 193 | "\u028a", 194 | "\u027e", 195 | "\u0292", 196 | "\u03b8", 197 | "\u03b2", 198 | "\u014b", 199 | "\u0266", 200 | "\u207c", 201 | "\u02b0", 202 | "`", 203 | "^", 204 | "#", 205 | "*", 206 | "=", 207 | "\u02c8", 208 | "\u02cc", 209 | "\u2192", 210 | "\u2193", 211 | "\u2191", 212 | " ", 213 | ] 214 | with open('./text/es_phonemizer/spanish_text.txt', 'r') as f: 215 | lines = f.readlines() 216 | 217 | 218 | used_sym = [] 219 | not_existed_sym = [] 220 | phonemes = [] 221 | 222 | for line in lines[:400]: 223 | text = line.split('|')[-1].strip() 224 | ipa = es2ipa(text) 225 | phonemes.append(ipa + '\n') 226 | for s in ipa: 227 | if s not in symbols: 228 | if s not in not_existed_sym: 229 | print(f'not_existed char: {s}') 230 | not_existed_sym.append(s) 231 | else: 232 | if s not in used_sym: 233 | # print(f'used char: {s}') 234 | 
used_sym.append(s) 235 | 236 | print(used_sym) 237 | print(not_existed_sym) 238 | 239 | 240 | with open('./text/es_phonemizer/es_symbols.txt', 'w') as g: 241 | g.writelines(symbols + not_existed_sym) 242 | 243 | with open('./text/es_phonemizer/example_ipa.txt', 'w') as g: 244 | g.writelines(phonemes) 245 | 246 | data = {'symbols': symbols + not_existed_sym} 247 | with open('./text/es_phonemizer/es_symbols_v2.json', 'w') as f: 248 | json.dump(data, f, indent=4) 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /melo/text/es_phonemizer/punctuation.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import re 3 | from enum import Enum 4 | 5 | import six 6 | 7 | _DEF_PUNCS = ';:,.!?¡¿—…"«»“”' 8 | 9 | _PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"]) 10 | 11 | 12 | class PuncPosition(Enum): 13 | """Enum for the punctuations positions""" 14 | 15 | BEGIN = 0 16 | END = 1 17 | MIDDLE = 2 18 | ALONE = 3 19 | 20 | 21 | class Punctuation: 22 | """Handle punctuations in text. 23 | 24 | Just strip punctuations from text or strip and restore them later. 25 | 26 | Args: 27 | puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`. 28 | 29 | Example: 30 | >>> punc = Punctuation() 31 | >>> punc.strip("This is. example !") 32 | 'This is example' 33 | 34 | >>> text_striped, punc_map = punc.strip_to_restore("This is. example !") 35 | >>> ' '.join(text_striped) 36 | 'This is example' 37 | 38 | >>> text_restored = punc.restore(text_striped, punc_map) 39 | >>> text_restored[0] 40 | 'This is. example !' 41 | """ 42 | 43 | def __init__(self, puncs: str = _DEF_PUNCS): 44 | self.puncs = puncs 45 | 46 | @staticmethod 47 | def default_puncs(): 48 | """Return default set of punctuations.""" 49 | return _DEF_PUNCS 50 | 51 | @property 52 | def puncs(self): 53 | return self._puncs 54 | 55 | @puncs.setter 56 | def puncs(self, value): 57 | if not isinstance(value, six.string_types): 58 | raise ValueError("[!] Punctuations must be of type str.") 59 | self._puncs = "".join(list(dict.fromkeys(list(value)))) # remove duplicates without changing the oreder 60 | self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+") 61 | 62 | def strip(self, text): 63 | """Remove all the punctuations by replacing with `space`. 64 | 65 | Args: 66 | text (str): The text to be processed. 67 | 68 | Example:: 69 | 70 | "This is. example !" -> "This is example " 71 | """ 72 | return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip() 73 | 74 | def strip_to_restore(self, text): 75 | """Remove punctuations from text to restore them later. 76 | 77 | Args: 78 | text (str): The text to be processed. 79 | 80 | Examples :: 81 | 82 | "This is. example !" 
-> [["This is", "example"], [".", "!"]] 83 | 84 | """ 85 | text, puncs = self._strip_to_restore(text) 86 | return text, puncs 87 | 88 | def _strip_to_restore(self, text): 89 | """Auxiliary method for Punctuation.preserve()""" 90 | matches = list(re.finditer(self.puncs_regular_exp, text)) 91 | if not matches: 92 | return [text], [] 93 | # the text is only punctuations 94 | if len(matches) == 1 and matches[0].group() == text: 95 | return [], [_PUNC_IDX(text, PuncPosition.ALONE)] 96 | # build a punctuation map to be used later to restore punctuations 97 | puncs = [] 98 | for match in matches: 99 | position = PuncPosition.MIDDLE 100 | if match == matches[0] and text.startswith(match.group()): 101 | position = PuncPosition.BEGIN 102 | elif match == matches[-1] and text.endswith(match.group()): 103 | position = PuncPosition.END 104 | puncs.append(_PUNC_IDX(match.group(), position)) 105 | # convert str text to a List[str], each item is separated by a punctuation 106 | splitted_text = [] 107 | for idx, punc in enumerate(puncs): 108 | split = text.split(punc.punc) 109 | prefix, suffix = split[0], punc.punc.join(split[1:]) 110 | splitted_text.append(prefix) 111 | # if the text does not end with a punctuation, add it to the last item 112 | if idx == len(puncs) - 1 and len(suffix) > 0: 113 | splitted_text.append(suffix) 114 | text = suffix 115 | while splitted_text[0] == '': 116 | splitted_text = splitted_text[1:] 117 | return splitted_text, puncs 118 | 119 | @classmethod 120 | def restore(cls, text, puncs): 121 | """Restore punctuation in a text. 122 | 123 | Args: 124 | text (str): The text to be processed. 125 | puncs (List[str]): The list of punctuations map to be used for restoring. 126 | 127 | Examples :: 128 | 129 | ['This is', 'example'], ['.', '!'] -> "This is. example!" 130 | 131 | """ 132 | return cls._restore(text, puncs, 0) 133 | 134 | @classmethod 135 | def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements 136 | """Auxiliary method for Punctuation.restore()""" 137 | if not puncs: 138 | return text 139 | 140 | # nothing have been phonemized, returns the puncs alone 141 | if not text: 142 | return ["".join(m.punc for m in puncs)] 143 | 144 | current = puncs[0] 145 | 146 | if current.position == PuncPosition.BEGIN: 147 | return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num) 148 | 149 | if current.position == PuncPosition.END: 150 | return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1) 151 | 152 | if current.position == PuncPosition.ALONE: 153 | return [current.mark] + cls._restore(text, puncs[1:], num + 1) 154 | 155 | # POSITION == MIDDLE 156 | if len(text) == 1: # pragma: nocover 157 | # a corner case where the final part of an intermediate 158 | # mark (I) has not been phonemized 159 | return cls._restore([text[0] + current.punc], puncs[1:], num) 160 | 161 | return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) 162 | 163 | 164 | # if __name__ == "__main__": 165 | # punc = Punctuation() 166 | # text = "This is. This is, example!" 
167 | 168 | # print(punc.strip(text)) 169 | 170 | # split_text, puncs = punc.strip_to_restore(text) 171 | # print(split_text, " ---- ", puncs) 172 | 173 | # restored_text = punc.restore(split_text, puncs) 174 | # print(restored_text) -------------------------------------------------------------------------------- /melo/text/es_phonemizer/spanish_symbols.txt: -------------------------------------------------------------------------------- 1 | dˌaβˈiðkopeɾfjl unθsbmtʃwɛxɪŋʊɣɡrɲʝʎː -------------------------------------------------------------------------------- /melo/text/es_phonemizer/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ImportError", 10 | "evalue": "attempted relative import with no known parent package", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 15 | "\u001b[1;32m/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb Cell 1\u001b[0m line \u001b[0;36m5\n\u001b[1;32m 3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\u001b[39m,\u001b[39m \u001b[39msys\u001b[39;00m\n\u001b[1;32m 4\u001b[0m sys\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mappend(\u001b[39m'\u001b[39m\u001b[39m/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mes_to_ipa\u001b[39;00m \u001b[39mimport\u001b[39;00m es2ipa\n\u001b[1;32m 9\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39msplit_sentences_en\u001b[39m(text, min_len\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m):\n\u001b[1;32m 10\u001b[0m \u001b[39m# 将文本中的换行符、空格和制表符替换为空格\u001b[39;00m\n\u001b[1;32m 11\u001b[0m text \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\u001b[39m'\u001b[39m\u001b[39m[\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m ]+\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m \u001b[39m\u001b[39m'\u001b[39m, text)\n", 16 | "File \u001b[0;32m/data/workspace/Bert-VITS2/text/es_phonemizer/es_to_ipa.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mcleaner\u001b[39;00m \u001b[39mimport\u001b[39;00m spanish_cleaners\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mgruut_wrapper\u001b[39;00m \u001b[39mimport\u001b[39;00m Gruut\n\u001b[1;32m 4\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mes2ipa\u001b[39m(text):\n", 17 | "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "import re\n", 23 | "import os\n", 24 | "import os, sys\n", 25 | "sys.path.append('/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/')\n", 26 | "from es_to_ipa import es2ipa\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "def split_sentences_en(text, min_len=10):\n", 31 | " # 将文本中的换行符、空格和制表符替换为空格\n", 32 | " text = re.sub('[\\n\\t ]+', ' ', text)\n", 33 | " # 在标点符号后添加一个空格\n", 34 | " text = re.sub('([¿—¡])', r'\\1 $#!', text)\n", 35 | " # 分隔句子并去除前后空格\n", 36 | " \n", 37 | " sentences = [s.strip() for s in text.split(' $#!')]\n", 38 | " if len(sentences[-1]) == 0: del sentences[-1]\n", 39 | "\n", 40 | " new_sentences = []\n", 41 | " new_sent = []\n", 42 | " for ind, sent in 
enumerate(sentences):\n", 43 | " if sent in ['¿', '—', '¡']:\n", 44 | " new_sent.append(sent)\n", 45 | " else:\n", 46 | " new_sent.append(es2ipa(sent))\n", 47 | " \n", 48 | " \n", 49 | " new_sentences = ''.join(new_sent)\n", 50 | "\n", 51 | " return new_sentences" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "'—¿aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'" 63 | ] 64 | }, 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "split_sentences_en('—¿Habéis estado casada alguna vez?')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "'aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "es2ipa('—¿Habéis estado casada alguna vez?')" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "base", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.8.18" 119 | }, 120 | "orig_nbformat": 4 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myshell-ai/MeloTTS/209145371cff8fc3bd60d7be902ea69cbdb7965a/melo/text/fr_phonemizer/__init__.py -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import List, Tuple 3 | 4 | from .punctuation import Punctuation 5 | 6 | 7 | class BasePhonemizer(abc.ABC): 8 | """Base phonemizer class 9 | 10 | Phonemization follows the following steps: 11 | 1. Preprocessing: 12 | - remove empty lines 13 | - remove punctuation 14 | - keep track of punctuation marks 15 | 16 | 2. Phonemization: 17 | - convert text to phonemes 18 | 19 | 3. Postprocessing: 20 | - join phonemes 21 | - restore punctuation marks 22 | 23 | Args: 24 | language (str): 25 | Language used by the phonemizer. 26 | 27 | punctuations (List[str]): 28 | List of punctuation marks to be preserved. 29 | 30 | keep_puncs (bool): 31 | Whether to preserve punctuation marks or not. 
32 | """ 33 | 34 | def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): 35 | # ensure the backend is installed on the system 36 | if not self.is_available(): 37 | raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover 38 | 39 | # ensure the backend support the requested language 40 | self._language = self._init_language(language) 41 | 42 | # setup punctuation processing 43 | self._keep_puncs = keep_puncs 44 | self._punctuator = Punctuation(punctuations) 45 | 46 | def _init_language(self, language): 47 | """Language initialization 48 | 49 | This method may be overloaded in child classes (see Segments backend) 50 | 51 | """ 52 | if not self.is_supported_language(language): 53 | raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") 54 | return language 55 | 56 | @property 57 | def language(self): 58 | """The language code configured to be used for phonemization""" 59 | return self._language 60 | 61 | @staticmethod 62 | @abc.abstractmethod 63 | def name(): 64 | """The name of the backend""" 65 | ... 66 | 67 | @classmethod 68 | @abc.abstractmethod 69 | def is_available(cls): 70 | """Returns True if the backend is installed, False otherwise""" 71 | ... 72 | 73 | @classmethod 74 | @abc.abstractmethod 75 | def version(cls): 76 | """Return the backend version as a tuple (major, minor, patch)""" 77 | ... 78 | 79 | @staticmethod 80 | @abc.abstractmethod 81 | def supported_languages(): 82 | """Return a dict of language codes -> name supported by the backend""" 83 | ... 84 | 85 | def is_supported_language(self, language): 86 | """Returns True if `language` is supported by the backend""" 87 | return language in self.supported_languages() 88 | 89 | @abc.abstractmethod 90 | def _phonemize(self, text, separator): 91 | """The main phonemization method""" 92 | 93 | def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: 94 | """Preprocess the text before phonemization 95 | 96 | 1. remove spaces 97 | 2. remove punctuation 98 | 99 | Override this if you need a different behaviour 100 | """ 101 | text = text.strip() 102 | if self._keep_puncs: 103 | # a tuple (text, punctuation marks) 104 | return self._punctuator.strip_to_restore(text) 105 | return [self._punctuator.strip(text)], [] 106 | 107 | def _phonemize_postprocess(self, phonemized, punctuations) -> str: 108 | """Postprocess the raw phonemized output 109 | 110 | Override this if you need a different behaviour 111 | """ 112 | if self._keep_puncs: 113 | return self._punctuator.restore(phonemized, punctuations)[0] 114 | return phonemized[0] 115 | 116 | def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument 117 | """Returns the `text` phonemized for the given language 118 | 119 | Args: 120 | text (str): 121 | Text to be phonemized. 122 | 123 | separator (str): 124 | string separator used between phonemes. Default to '_'. 
125 | 126 | Returns: 127 | (str): Phonemized text 128 | """ 129 | text, punctuations = self._phonemize_preprocess(text) 130 | phonemized = [] 131 | for t in text: 132 | p = self._phonemize(t, separator) 133 | phonemized.append(p) 134 | phonemized = self._phonemize_postprocess(phonemized, punctuations) 135 | return phonemized 136 | 137 | def print_logs(self, level: int = 0): 138 | indent = "\t" * level 139 | print(f"{indent}| > phoneme language: {self.language}") 140 | print(f"{indent}| > phoneme backend: {self.name()}") -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/cleaner.py: -------------------------------------------------------------------------------- 1 | """Set of default text cleaners""" 2 | # TODO: pick the cleaner for languages dynamically 3 | 4 | import re 5 | from .french_abbreviations import abbreviations_fr 6 | 7 | # Regular expression matching whitespace: 8 | _whitespace_re = re.compile(r"\s+") 9 | 10 | 11 | rep_map = { 12 | ":": ",", 13 | ";": ",", 14 | ",": ",", 15 | "。": ".", 16 | "!": "!", 17 | "?": "?", 18 | "\n": ".", 19 | "·": ",", 20 | "、": ",", 21 | "...": ".", 22 | "…": ".", 23 | "$": ".", 24 | "“": "", 25 | "”": "", 26 | "‘": "", 27 | "’": "", 28 | "(": "", 29 | ")": "", 30 | "(": "", 31 | ")": "", 32 | "《": "", 33 | "》": "", 34 | "【": "", 35 | "】": "", 36 | "[": "", 37 | "]": "", 38 | "—": "", 39 | "~": "-", 40 | "~": "-", 41 | "「": "", 42 | "」": "", 43 | "¿" : "", 44 | "¡" : "" 45 | } 46 | 47 | 48 | def replace_punctuation(text): 49 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 50 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 51 | return replaced_text 52 | 53 | def expand_abbreviations(text, lang="fr"): 54 | if lang == "fr": 55 | _abbreviations = abbreviations_fr 56 | for regex, replacement in _abbreviations: 57 | text = re.sub(regex, replacement, text) 58 | return text 59 | 60 | 61 | def lowercase(text): 62 | return text.lower() 63 | 64 | 65 | def collapse_whitespace(text): 66 | return re.sub(_whitespace_re, " ", text).strip() 67 | 68 | def remove_punctuation_at_begin(text): 69 | return re.sub(r'^[,.!?]+', '', text) 70 | 71 | def remove_aux_symbols(text): 72 | text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text) 73 | return text 74 | 75 | 76 | def replace_symbols(text, lang="en"): 77 | """Replace symbols based on the lenguage tag. 78 | 79 | Args: 80 | text: 81 | Input text. 82 | lang: 83 | Lenguage identifier. ex: "en", "fr", "pt", "ca". 84 | 85 | Returns: 86 | The modified text 87 | example: 88 | input args: 89 | text: "si l'avi cau, diguem-ho" 90 | lang: "ca" 91 | Output: 92 | text: "si lavi cau, diguemho" 93 | """ 94 | text = text.replace(";", ",") 95 | text = text.replace("-", " ") if lang != "ca" else text.replace("-", "") 96 | text = text.replace(":", ",") 97 | if lang == "en": 98 | text = text.replace("&", " and ") 99 | elif lang == "fr": 100 | text = text.replace("&", " et ") 101 | elif lang == "pt": 102 | text = text.replace("&", " e ") 103 | elif lang == "ca": 104 | text = text.replace("&", " i ") 105 | text = text.replace("'", "") 106 | elif lang== "es": 107 | text=text.replace("&","y") 108 | text = text.replace("'", "") 109 | return text 110 | 111 | def french_cleaners(text): 112 | """Pipeline for French text. 
There is no need to expand numbers, phonemizer already does that""" 113 | text = expand_abbreviations(text, lang="fr") 114 | # text = lowercase(text) # as we use the cased bert 115 | text = replace_punctuation(text) 116 | text = replace_symbols(text, lang="fr") 117 | text = remove_aux_symbols(text) 118 | text = remove_punctuation_at_begin(text) 119 | text = collapse_whitespace(text) 120 | text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text) 121 | return text 122 | 123 | -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/en_symbols.json: -------------------------------------------------------------------------------- 1 | {"symbols": [ 2 | "_", 3 | ",", 4 | ".", 5 | "!", 6 | "?", 7 | "-", 8 | "~", 9 | "\u2026", 10 | "N", 11 | "Q", 12 | "a", 13 | "b", 14 | "d", 15 | "e", 16 | "f", 17 | "g", 18 | "h", 19 | "i", 20 | "j", 21 | "k", 22 | "l", 23 | "m", 24 | "n", 25 | "o", 26 | "p", 27 | "s", 28 | "t", 29 | "u", 30 | "v", 31 | "w", 32 | "x", 33 | "y", 34 | "z", 35 | "\u0251", 36 | "\u00e6", 37 | "\u0283", 38 | "\u0291", 39 | "\u00e7", 40 | "\u026f", 41 | "\u026a", 42 | "\u0254", 43 | "\u025b", 44 | "\u0279", 45 | "\u00f0", 46 | "\u0259", 47 | "\u026b", 48 | "\u0265", 49 | "\u0278", 50 | "\u028a", 51 | "\u027e", 52 | "\u0292", 53 | "\u03b8", 54 | "\u03b2", 55 | "\u014b", 56 | "\u0266", 57 | "\u207c", 58 | "\u02b0", 59 | "`", 60 | "^", 61 | "#", 62 | "*", 63 | "=", 64 | "\u02c8", 65 | "\u02cc", 66 | "\u2192", 67 | "\u2193", 68 | "\u2191", 69 | " ", 70 | "ɣ", 71 | "ɡ", 72 | "r", 73 | "ɲ", 74 | "ʝ", 75 | "ʎ", 76 | "ː" 77 | ] 78 | } -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/fr_symbols.json: -------------------------------------------------------------------------------- 1 | { 2 | "symbols": [ 3 | "_", 4 | ",", 5 | ".", 6 | "!", 7 | "?", 8 | "-", 9 | "~", 10 | "\u2026", 11 | "N", 12 | "Q", 13 | "a", 14 | "b", 15 | "d", 16 | "e", 17 | "f", 18 | "g", 19 | "h", 20 | "i", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "o", 27 | "p", 28 | "s", 29 | "t", 30 | "u", 31 | "v", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "\u0251", 37 | "\u00e6", 38 | "\u0283", 39 | "\u0291", 40 | "\u00e7", 41 | "\u026f", 42 | "\u026a", 43 | "\u0254", 44 | "\u025b", 45 | "\u0279", 46 | "\u00f0", 47 | "\u0259", 48 | "\u026b", 49 | "\u0265", 50 | "\u0278", 51 | "\u028a", 52 | "\u027e", 53 | "\u0292", 54 | "\u03b8", 55 | "\u03b2", 56 | "\u014b", 57 | "\u0266", 58 | "\u207c", 59 | "\u02b0", 60 | "`", 61 | "^", 62 | "#", 63 | "*", 64 | "=", 65 | "\u02c8", 66 | "\u02cc", 67 | "\u2192", 68 | "\u2193", 69 | "\u2191", 70 | " ", 71 | "\u0263", 72 | "\u0261", 73 | "r", 74 | "\u0272", 75 | "\u029d", 76 | "\u028e", 77 | "\u02d0", 78 | 79 | "\u0303", 80 | "\u0153", 81 | "\u00f8", 82 | "\u0281", 83 | "\u0252", 84 | "\u028c", 85 | "\u2014", 86 | "\u025c", 87 | "\u0250" 88 | ] 89 | } -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/fr_to_ipa.py: -------------------------------------------------------------------------------- 1 | from .cleaner import french_cleaners 2 | from .gruut_wrapper import Gruut 3 | 4 | 5 | def remove_consecutive_t(input_str): 6 | result = [] 7 | count = 0 8 | 9 | for char in input_str: 10 | if char == 't': 11 | count += 1 12 | else: 13 | if count < 3: 14 | result.extend(['t'] * count) 15 | count = 0 16 | result.append(char) 17 | 18 | if count < 3: 19 | result.extend(['t'] * count) 20 | 21 | return ''.join(result) 22 | 23 | def fr2ipa(text): 24 | 
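    # Descriptive note: fr2ipa builds the gruut wrapper with espeak lexicons,
    # stress and punctuation kept, phonemizes with an empty separator, and
    # then drops any run of three or more consecutive 't' characters via
    # remove_consecutive_t() before returning the IPA string.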
e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 25 | # text = french_cleaners(text) 26 | phonemes = e.phonemize(text, separator="") 27 | # print(phonemes) 28 | phonemes = remove_consecutive_t(phonemes) 29 | # print(phonemes) 30 | return phonemes -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/french_abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in french: 4 | abbreviations_fr = [ 5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("M", "monsieur"), 8 | ("Mlle", "mademoiselle"), 9 | ("Mlles", "mesdemoiselles"), 10 | ("Mme", "Madame"), 11 | ("Mmes", "Mesdames"), 12 | ("N.B", "nota bene"), 13 | ("M", "monsieur"), 14 | ("p.c.q", "parce que"), 15 | ("Pr", "professeur"), 16 | ("qqch", "quelque chose"), 17 | ("rdv", "rendez-vous"), 18 | ("max", "maximum"), 19 | ("min", "minimum"), 20 | ("no", "numéro"), 21 | ("adr", "adresse"), 22 | ("dr", "docteur"), 23 | ("st", "saint"), 24 | ("co", "companie"), 25 | ("jr", "junior"), 26 | ("sgt", "sergent"), 27 | ("capt", "capitain"), 28 | ("col", "colonel"), 29 | ("av", "avenue"), 30 | ("av. J.-C", "avant Jésus-Christ"), 31 | ("apr. J.-C", "après Jésus-Christ"), 32 | ("art", "article"), 33 | ("boul", "boulevard"), 34 | ("c.-à-d", "c’est-à-dire"), 35 | ("etc", "et cetera"), 36 | ("ex", "exemple"), 37 | ("excl", "exclusivement"), 38 | ("boul", "boulevard"), 39 | ] 40 | ] + [ 41 | (re.compile("\\b%s" % x[0]), x[1]) 42 | for x in [ 43 | ("Mlle", "mademoiselle"), 44 | ("Mlles", "mesdemoiselles"), 45 | ("Mme", "Madame"), 46 | ("Mmes", "Mesdames"), 47 | ] 48 | ] -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/french_symbols.txt: -------------------------------------------------------------------------------- 1 | _,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɣɡrɲʝʎː̃œøʁɒʌ—ɜɐ -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/gruut_wrapper.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from typing import List 3 | 4 | import gruut 5 | from gruut_ipa import IPA # pip install gruut_ipa 6 | 7 | from .base import BasePhonemizer 8 | from .punctuation import Punctuation 9 | 10 | # Table for str.translate to fix gruut/TTS phoneme mismatch 11 | GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") 12 | 13 | 14 | class Gruut(BasePhonemizer): 15 | """Gruut wrapper for G2P 16 | 17 | Args: 18 | language (str): 19 | Valid language code for the used backend. 20 | 21 | punctuations (str): 22 | Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`. 23 | 24 | keep_puncs (bool): 25 | If true, keep the punctuations after phonemization. Defaults to True. 26 | 27 | use_espeak_phonemes (bool): 28 | If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False. 29 | 30 | keep_stress (bool): 31 | If true, keep the stress characters after phonemization. Defaults to False. 32 | 33 | Example: 34 | 35 | >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut 36 | >>> phonemizer = Gruut('en-us') 37 | >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|") 38 | 'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?' 
39 | """ 40 | 41 | def __init__( 42 | self, 43 | language: str, 44 | punctuations=Punctuation.default_puncs(), 45 | keep_puncs=True, 46 | use_espeak_phonemes=False, 47 | keep_stress=False, 48 | ): 49 | super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) 50 | self.use_espeak_phonemes = use_espeak_phonemes 51 | self.keep_stress = keep_stress 52 | 53 | @staticmethod 54 | def name(): 55 | return "gruut" 56 | 57 | def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument 58 | """Convert input text to phonemes. 59 | 60 | Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters 61 | that constitude a single sound. 62 | 63 | It doesn't affect 🐸TTS since it individually converts each character to token IDs. 64 | 65 | Examples:: 66 | "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ` 67 | 68 | Args: 69 | text (str): 70 | Text to be converted to phonemes. 71 | 72 | tie (bool, optional) : When True use a '͡' character between 73 | consecutive characters of a single phoneme. Else separate phoneme 74 | with '_'. This option requires espeak>=1.49. Default to False. 75 | """ 76 | ph_list = [] 77 | for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes): 78 | for word in sentence: 79 | if word.is_break: 80 | # Use actual character for break phoneme (e.g., comma) 81 | if ph_list: 82 | # Join with previous word 83 | ph_list[-1].append(word.text) 84 | else: 85 | # First word is punctuation 86 | ph_list.append([word.text]) 87 | elif word.phonemes: 88 | # Add phonemes for word 89 | word_phonemes = [] 90 | 91 | for word_phoneme in word.phonemes: 92 | if not self.keep_stress: 93 | # Remove primary/secondary stress 94 | word_phoneme = IPA.without_stress(word_phoneme) 95 | 96 | word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE) 97 | 98 | if word_phoneme: 99 | # Flatten phonemes 100 | word_phonemes.extend(word_phoneme) 101 | 102 | if word_phonemes: 103 | ph_list.append(word_phonemes) 104 | 105 | ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list] 106 | ph = f"{separator} ".join(ph_words) 107 | return ph 108 | 109 | def _phonemize(self, text, separator): 110 | return self.phonemize_gruut(text, separator, tie=False) 111 | 112 | def is_supported_language(self, language): 113 | """Returns True if `language` is supported by the backend""" 114 | return gruut.is_language_supported(language) 115 | 116 | @staticmethod 117 | def supported_languages() -> List: 118 | """Get a dictionary of supported languages. 119 | 120 | Returns: 121 | List: List of language codes. 122 | """ 123 | return list(gruut.get_supported_languages()) 124 | 125 | def version(self): 126 | """Get the version of the used backend. 127 | 128 | Returns: 129 | str: Version of the used backend. 
130 | """ 131 | return gruut.__version__ 132 | 133 | @classmethod 134 | def is_available(cls): 135 | """Return true if ESpeak is available else false""" 136 | return importlib.util.find_spec("gruut") is not None 137 | 138 | 139 | if __name__ == "__main__": 140 | from cleaner import french_cleaners 141 | import json 142 | 143 | e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True) 144 | symbols = [ # en + sp 145 | "_", 146 | ",", 147 | ".", 148 | "!", 149 | "?", 150 | "-", 151 | "~", 152 | "\u2026", 153 | "N", 154 | "Q", 155 | "a", 156 | "b", 157 | "d", 158 | "e", 159 | "f", 160 | "g", 161 | "h", 162 | "i", 163 | "j", 164 | "k", 165 | "l", 166 | "m", 167 | "n", 168 | "o", 169 | "p", 170 | "s", 171 | "t", 172 | "u", 173 | "v", 174 | "w", 175 | "x", 176 | "y", 177 | "z", 178 | "\u0251", 179 | "\u00e6", 180 | "\u0283", 181 | "\u0291", 182 | "\u00e7", 183 | "\u026f", 184 | "\u026a", 185 | "\u0254", 186 | "\u025b", 187 | "\u0279", 188 | "\u00f0", 189 | "\u0259", 190 | "\u026b", 191 | "\u0265", 192 | "\u0278", 193 | "\u028a", 194 | "\u027e", 195 | "\u0292", 196 | "\u03b8", 197 | "\u03b2", 198 | "\u014b", 199 | "\u0266", 200 | "\u207c", 201 | "\u02b0", 202 | "`", 203 | "^", 204 | "#", 205 | "*", 206 | "=", 207 | "\u02c8", 208 | "\u02cc", 209 | "\u2192", 210 | "\u2193", 211 | "\u2191", 212 | " ", 213 | "ɣ", 214 | "ɡ", 215 | "r", 216 | "ɲ", 217 | "ʝ", 218 | "ʎ", 219 | "ː" 220 | ] 221 | with open('/home/xumin/workspace/VITS-Training-Multiling/230715_fr/metadata.txt', 'r') as f: 222 | lines = f.readlines() 223 | 224 | 225 | used_sym = [] 226 | not_existed_sym = [] 227 | phonemes = [] 228 | 229 | for line in lines: 230 | text = line.split('|')[-1].strip() 231 | text = french_cleaners(text) 232 | ipa = e.phonemize(text, separator="") 233 | phonemes.append(ipa) 234 | for s in ipa: 235 | if s not in symbols: 236 | if s not in not_existed_sym: 237 | print(f'not_existed char: {s}') 238 | not_existed_sym.append(s) 239 | else: 240 | if s not in used_sym: 241 | # print(f'used char: {s}') 242 | used_sym.append(s) 243 | 244 | print(used_sym) 245 | print(not_existed_sym) 246 | 247 | 248 | with open('./text/fr_phonemizer/french_symbols.txt', 'w') as g: 249 | g.writelines(symbols + not_existed_sym) 250 | 251 | with open('./text/fr_phonemizer/example_ipa.txt', 'w') as g: 252 | g.writelines(phonemes) 253 | 254 | data = {'symbols': symbols + not_existed_sym} 255 | 256 | with open('./text/fr_phonemizer/fr_symbols.json', 'w') as f: 257 | json.dump(data, f, indent=4) 258 | 259 | -------------------------------------------------------------------------------- /melo/text/fr_phonemizer/punctuation.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import re 3 | from enum import Enum 4 | 5 | import six 6 | 7 | _DEF_PUNCS = ';:,.!?¡¿—…"«»“”' 8 | 9 | _PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"]) 10 | 11 | 12 | class PuncPosition(Enum): 13 | """Enum for the punctuations positions""" 14 | 15 | BEGIN = 0 16 | END = 1 17 | MIDDLE = 2 18 | ALONE = 3 19 | 20 | 21 | class Punctuation: 22 | """Handle punctuations in text. 23 | 24 | Just strip punctuations from text or strip and restore them later. 25 | 26 | Args: 27 | puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`. 28 | 29 | Example: 30 | >>> punc = Punctuation() 31 | >>> punc.strip("This is. example !") 32 | 'This is example' 33 | 34 | >>> text_striped, punc_map = punc.strip_to_restore("This is. 
example !") 35 | >>> ' '.join(text_striped) 36 | 'This is example' 37 | 38 | >>> text_restored = punc.restore(text_striped, punc_map) 39 | >>> text_restored[0] 40 | 'This is. example !' 41 | """ 42 | 43 | def __init__(self, puncs: str = _DEF_PUNCS): 44 | self.puncs = puncs 45 | 46 | @staticmethod 47 | def default_puncs(): 48 | """Return default set of punctuations.""" 49 | return _DEF_PUNCS 50 | 51 | @property 52 | def puncs(self): 53 | return self._puncs 54 | 55 | @puncs.setter 56 | def puncs(self, value): 57 | if not isinstance(value, six.string_types): 58 | raise ValueError("[!] Punctuations must be of type str.") 59 | self._puncs = "".join(list(dict.fromkeys(list(value)))) # remove duplicates without changing the oreder 60 | self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+") 61 | 62 | def strip(self, text): 63 | """Remove all the punctuations by replacing with `space`. 64 | 65 | Args: 66 | text (str): The text to be processed. 67 | 68 | Example:: 69 | 70 | "This is. example !" -> "This is example " 71 | """ 72 | return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip() 73 | 74 | def strip_to_restore(self, text): 75 | """Remove punctuations from text to restore them later. 76 | 77 | Args: 78 | text (str): The text to be processed. 79 | 80 | Examples :: 81 | 82 | "This is. example !" -> [["This is", "example"], [".", "!"]] 83 | 84 | """ 85 | text, puncs = self._strip_to_restore(text) 86 | return text, puncs 87 | 88 | def _strip_to_restore(self, text): 89 | """Auxiliary method for Punctuation.preserve()""" 90 | matches = list(re.finditer(self.puncs_regular_exp, text)) 91 | if not matches: 92 | return [text], [] 93 | # the text is only punctuations 94 | if len(matches) == 1 and matches[0].group() == text: 95 | return [], [_PUNC_IDX(text, PuncPosition.ALONE)] 96 | # build a punctuation map to be used later to restore punctuations 97 | puncs = [] 98 | for match in matches: 99 | position = PuncPosition.MIDDLE 100 | if match == matches[0] and text.startswith(match.group()): 101 | position = PuncPosition.BEGIN 102 | elif match == matches[-1] and text.endswith(match.group()): 103 | position = PuncPosition.END 104 | puncs.append(_PUNC_IDX(match.group(), position)) 105 | # convert str text to a List[str], each item is separated by a punctuation 106 | splitted_text = [] 107 | for idx, punc in enumerate(puncs): 108 | split = text.split(punc.punc) 109 | prefix, suffix = split[0], punc.punc.join(split[1:]) 110 | splitted_text.append(prefix) 111 | # if the text does not end with a punctuation, add it to the last item 112 | if idx == len(puncs) - 1 and len(suffix) > 0: 113 | splitted_text.append(suffix) 114 | text = suffix 115 | return splitted_text, puncs 116 | 117 | @classmethod 118 | def restore(cls, text, puncs): 119 | """Restore punctuation in a text. 120 | 121 | Args: 122 | text (str): The text to be processed. 123 | puncs (List[str]): The list of punctuations map to be used for restoring. 124 | 125 | Examples :: 126 | 127 | ['This is', 'example'], ['.', '!'] -> "This is. example!" 
128 | 129 | """ 130 | return cls._restore(text, puncs, 0) 131 | 132 | @classmethod 133 | def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements 134 | """Auxiliary method for Punctuation.restore()""" 135 | if not puncs: 136 | return text 137 | 138 | # nothing have been phonemized, returns the puncs alone 139 | if not text: 140 | return ["".join(m.punc for m in puncs)] 141 | 142 | current = puncs[0] 143 | 144 | if current.position == PuncPosition.BEGIN: 145 | return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num) 146 | 147 | if current.position == PuncPosition.END: 148 | return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1) 149 | 150 | if current.position == PuncPosition.ALONE: 151 | return [current.mark] + cls._restore(text, puncs[1:], num + 1) 152 | 153 | # POSITION == MIDDLE 154 | if len(text) == 1: # pragma: nocover 155 | # a corner case where the final part of an intermediate 156 | # mark (I) has not been phonemized 157 | return cls._restore([text[0] + current.punc], puncs[1:], num) 158 | 159 | return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) 160 | 161 | 162 | # if __name__ == "__main__": 163 | # punc = Punctuation() 164 | # text = "This is. This is, example!" 165 | 166 | # print(punc.strip(text)) 167 | 168 | # split_text, puncs = punc.strip_to_restore(text) 169 | # print(split_text, " ---- ", puncs) 170 | 171 | # restored_text = punc.restore(split_text, puncs) 172 | # print(restored_text) -------------------------------------------------------------------------------- /melo/text/french.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | 5 | from . import symbols 6 | from .fr_phonemizer import cleaner as fr_cleaner 7 | from .fr_phonemizer import fr_to_ipa 8 | from transformers import AutoTokenizer 9 | 10 | 11 | def distribute_phone(n_phone, n_word): 12 | phones_per_word = [0] * n_word 13 | for task in range(n_phone): 14 | min_tasks = min(phones_per_word) 15 | min_index = phones_per_word.index(min_tasks) 16 | phones_per_word[min_index] += 1 17 | return phones_per_word 18 | 19 | def text_normalize(text): 20 | text = fr_cleaner.french_cleaners(text) 21 | return text 22 | 23 | model_id = 'dbmdz/bert-base-french-europeana-cased' 24 | tokenizer = AutoTokenizer.from_pretrained(model_id) 25 | 26 | def g2p(text, pad_start_end=True, tokenized=None): 27 | if tokenized is None: 28 | tokenized = tokenizer.tokenize(text) 29 | # import pdb; pdb.set_trace() 30 | phs = [] 31 | ph_groups = [] 32 | for t in tokenized: 33 | if not t.startswith("#"): 34 | ph_groups.append([t]) 35 | else: 36 | ph_groups[-1].append(t.replace("#", "")) 37 | 38 | phones = [] 39 | tones = [] 40 | word2ph = [] 41 | # print(ph_groups) 42 | for group in ph_groups: 43 | w = "".join(group) 44 | phone_len = 0 45 | word_len = len(group) 46 | if w == '[UNK]': 47 | phone_list = ['UNK'] 48 | else: 49 | phone_list = list(filter(lambda p: p != " ", fr_to_ipa.fr2ipa(w))) 50 | 51 | for ph in phone_list: 52 | phones.append(ph) 53 | tones.append(0) 54 | phone_len += 1 55 | aaa = distribute_phone(phone_len, word_len) 56 | word2ph += aaa 57 | # print(phone_list, aaa) 58 | # print('=' * 10) 59 | 60 | if pad_start_end: 61 | phones = ["_"] + phones + ["_"] 62 | tones = [0] + tones + [0] 63 | word2ph = [1] + word2ph + [1] 64 | return phones, tones, word2ph 65 | 66 | def get_bert_feature(text, word2ph, device=None): 67 | from text import french_bert 68 | return 
french_bert.get_bert_feature(text, word2ph, device=device) 69 | 70 | if __name__ == "__main__": 71 | ori_text = 'Ce service gratuit est“”"" 【disponible》 en chinois 【simplifié] et autres 123' 72 | # ori_text = "Ils essayaient vainement de faire comprendre à ma mère qu'avec les cent mille francs que m'avait laissé mon père," 73 | # print(ori_text) 74 | text = text_normalize(ori_text) 75 | print(text) 76 | phoneme = fr_to_ipa.fr2ipa(text) 77 | print(phoneme) 78 | 79 | 80 | from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer 81 | from text.cleaner_multiling import unicleaners 82 | 83 | def text_normalize(text): 84 | text = unicleaners(text, cased=True, lang='fr') 85 | return text 86 | 87 | # print(ori_text) 88 | text = text_normalize(ori_text) 89 | print(text) 90 | phonemizer = MultiPhonemizer({"fr-fr": "espeak"}) 91 | # phonemizer.lang_to_phonemizer['fr'].keep_stress = True 92 | # phonemizer.lang_to_phonemizer['fr'].use_espeak_phonemes = True 93 | phoneme = phonemizer.phonemize(text, separator="", language='fr-fr') 94 | print(phoneme) -------------------------------------------------------------------------------- /melo/text/french_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | model_id = 'dbmdz/bert-base-french-europeana-cased' 6 | tokenizer = AutoTokenizer.from_pretrained(model_id) 7 | model = None 8 | 9 | def get_bert_feature(text, word2ph, device=None): 10 | global model 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if model is None: 20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 21 | device 22 | ) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = model(**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert inputs["input_ids"].shape[-1] == len(word2ph) 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | -------------------------------------------------------------------------------- /melo/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | 6 | models = {} 7 | tokenizers = {} 8 | def get_bert_feature(text, word2ph, device=None, model_id='tohoku-nlp/bert-base-japanese-v3'): 9 | global model 10 | global tokenizer 11 | 12 | if ( 13 | sys.platform == "darwin" 14 | and torch.backends.mps.is_available() 15 | and device == "cpu" 16 | ): 17 | device = "mps" 18 | if not device: 19 | device = "cuda" 20 | if model_id not in models: 21 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 22 | device 23 | ) 24 | models[model_id] = model 25 | tokenizer = AutoTokenizer.from_pretrained(model_id) 26 | tokenizers[model_id] = tokenizer 27 | else: 28 | model = models[model_id] 29 | tokenizer = tokenizers[model_id] 30 | 31 | 32 | with torch.no_grad(): 33 | inputs = tokenizer(text, return_tensors="pt") 34 | tokenized = 
tokenizer.tokenize(text) 35 | for i in inputs: 36 | inputs[i] = inputs[i].to(device) 37 | res = model(**inputs, output_hidden_states=True) 38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 39 | 40 | assert inputs["input_ids"].shape[-1] == len(word2ph), f"{inputs['input_ids'].shape[-1]}/{len(word2ph)}" 41 | word2phone = word2ph 42 | phone_level_feature = [] 43 | for i in range(len(word2phone)): 44 | repeat_feature = res[i].repeat(word2phone[i], 1) 45 | phone_level_feature.append(repeat_feature) 46 | 47 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 48 | 49 | return phone_level_feature.T 50 | -------------------------------------------------------------------------------- /melo/text/ko_dictionary.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Add the word you want to the dictionary. 3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} 4 | 5 | 6 | english_dictionary = { 7 | "KOREA": "코리아", 8 | "IDOL": "아이돌", 9 | "IT": "아이티", 10 | "IQ": "아이큐", 11 | "UP": "업", 12 | "DOWN": "다운", 13 | "PC": "피씨", 14 | "CCTV": "씨씨티비", 15 | "SNS": "에스엔에스", 16 | "AI": "에이아이", 17 | "CEO": "씨이오", 18 | "A": "에이", 19 | "B": "비", 20 | "C": "씨", 21 | "D": "디", 22 | "E": "이", 23 | "F": "에프", 24 | "G": "지", 25 | "H": "에이치", 26 | "I": "아이", 27 | "J": "제이", 28 | "K": "케이", 29 | "L": "엘", 30 | "M": "엠", 31 | "N": "엔", 32 | "O": "오", 33 | "P": "피", 34 | "Q": "큐", 35 | "R": "알", 36 | "S": "에스", 37 | "T": "티", 38 | "U": "유", 39 | "V": "브이", 40 | "W": "더블유", 41 | "X": "엑스", 42 | "Y": "와이", 43 | "Z": "제트", 44 | } 45 | -------------------------------------------------------------------------------- /melo/text/korean.py: -------------------------------------------------------------------------------- 1 | # Convert Japanese text to phonemes which is 2 | # compatible with Julius https://github.com/julius-speech/segmentation-kit 3 | import re 4 | import unicodedata 5 | 6 | from transformers import AutoTokenizer 7 | 8 | from . import punctuation, symbols 9 | 10 | 11 | from num2words import num2words 12 | from melo.text.ko_dictionary import english_dictionary, etc_dictionary 13 | from anyascii import anyascii 14 | from jamo import hangul_to_jamo 15 | 16 | def normalize(text): 17 | text = text.strip() 18 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text) 19 | text = normalize_with_dictionary(text, etc_dictionary) 20 | text = normalize_english(text) 21 | text = text.lower() 22 | return text 23 | 24 | 25 | def normalize_with_dictionary(text, dic): 26 | if any(key in text for key in dic.keys()): 27 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys())) 28 | return pattern.sub(lambda x: dic[x.group()], text) 29 | return text 30 | 31 | 32 | def normalize_english(text): 33 | def fn(m): 34 | word = m.group() 35 | if word in english_dictionary: 36 | return english_dictionary.get(word) 37 | return word 38 | 39 | text = re.sub("([A-Za-z]+)", fn, text) 40 | return text 41 | 42 | 43 | g2p_kr = None 44 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str: 45 | """ 46 | 47 | The input and output values look the same, but they are different in Unicode. 
48 | 49 | example : 50 | 51 | input = '하늘' (Unicode : \ud558\ub298), (하 + 늘) 52 | output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ) 53 | 54 | """ 55 | global g2p_kr # pylint: disable=global-statement 56 | if g2p_kr is None: 57 | from g2pkk import G2p 58 | 59 | g2p_kr = G2p() 60 | 61 | if character == "english": 62 | from anyascii import anyascii 63 | text = normalize(text) 64 | text = g2p_kr(text) 65 | text = anyascii(text) 66 | return text 67 | 68 | text = normalize(text) 69 | text = g2p_kr(text) 70 | text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ'] 71 | return "".join(text) 72 | 73 | def text_normalize(text): 74 | # res = unicodedata.normalize("NFKC", text) 75 | # res = japanese_convert_numbers_to_words(res) 76 | # # res = "".join([i for i in res if is_japanese_character(i)]) 77 | # res = replace_punctuation(res) 78 | text = normalize(text) 79 | return text 80 | 81 | 82 | def distribute_phone(n_phone, n_word): 83 | phones_per_word = [0] * n_word 84 | for task in range(n_phone): 85 | min_tasks = min(phones_per_word) 86 | min_index = phones_per_word.index(min_tasks) 87 | phones_per_word[min_index] += 1 88 | return phones_per_word 89 | 90 | 91 | 92 | # tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3') 93 | 94 | model_id = 'kykim/bert-kor-base' 95 | tokenizer = AutoTokenizer.from_pretrained(model_id) 96 | 97 | def g2p(norm_text): 98 | tokenized = tokenizer.tokenize(norm_text) 99 | phs = [] 100 | ph_groups = [] 101 | for t in tokenized: 102 | if not t.startswith("#"): 103 | ph_groups.append([t]) 104 | else: 105 | ph_groups[-1].append(t.replace("#", "")) 106 | word2ph = [] 107 | for group in ph_groups: 108 | text = "" 109 | for ch in group: 110 | text += ch 111 | if text == '[UNK]': 112 | phs += ['_'] 113 | word2ph += [1] 114 | continue 115 | elif text in punctuation: 116 | phs += [text] 117 | word2ph += [1] 118 | continue 119 | # import pdb; pdb.set_trace() 120 | # phonemes = japanese_text_to_phonemes(text) 121 | # text = g2p_kr(text) 122 | phonemes = korean_text_to_phonemes(text) 123 | # import pdb; pdb.set_trace() 124 | # # phonemes = [i for i in phonemes if i in symbols] 125 | # for i in phonemes: 126 | # assert i in symbols, (group, norm_text, tokenized, i) 127 | phone_len = len(phonemes) 128 | word_len = len(group) 129 | 130 | aaa = distribute_phone(phone_len, word_len) 131 | assert len(aaa) == word_len 132 | word2ph += aaa 133 | 134 | phs += phonemes 135 | phones = ["_"] + phs + ["_"] 136 | tones = [0 for i in phones] 137 | word2ph = [1] + word2ph + [1] 138 | assert len(word2ph) == len(tokenized) + 2 139 | return phones, tones, word2ph 140 | 141 | def get_bert_feature(text, word2ph, device='cuda'): 142 | from . import japanese_bert 143 | return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id) 144 | 145 | 146 | if __name__ == "__main__": 147 | # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 148 | from text.symbols import symbols 149 | text = "전 제 일의 가치와 폰타인 대중들이 한 일의 의미를 잘 압니다. 
앞으로도 전 제 일에 자부심을 갖고 살아갈 겁니다" 150 | import json 151 | 152 | # genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json')) 153 | genshin_data = json.load(open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json')) 154 | from tqdm import tqdm 155 | new_symbols = [] 156 | for key, item in tqdm(genshin_data.items()): 157 | texts = item.get('voiceContent', '') 158 | if isinstance(texts, list): 159 | texts = ','.join(texts) 160 | if texts is None: 161 | continue 162 | if len(texts) == 0: 163 | continue 164 | 165 | text = text_normalize(text) 166 | phones, tones, word2ph = g2p(text) 167 | bert = get_bert_feature(text, word2ph) 168 | import pdb; pdb.set_trace() 169 | for ph in phones: 170 | if ph not in symbols and ph not in new_symbols: 171 | new_symbols.append(ph) 172 | print('update!, now symbols:') 173 | print(new_symbols) 174 | with open('korean_symbol.txt', 'w') as f: 175 | f.write(f'{new_symbols}') 176 | 177 | 178 | 179 | # if __name__ == '__main__': 180 | # from pykakasi import kakasi 181 | # # Initialize kakasi object 182 | # kakasi = kakasi() 183 | 184 | # # Set options for converting Chinese characters to Katakana 185 | # kakasi.setMode("J", "H") # Chinese to Katakana 186 | # kakasi.setMode("K", "H") # Hiragana to Katakana 187 | 188 | # # Convert Chinese characters to Katakana 189 | # conv = kakasi.getConverter() 190 | # katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?') # Replace with your Chinese text 191 | 192 | # print(katakana_text) # Output: ニーハオセカイ -------------------------------------------------------------------------------- /melo/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 
| hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 
351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /melo/text/spanish.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | 5 | from . import symbols 6 | from .es_phonemizer import cleaner as es_cleaner 7 | from .es_phonemizer import es_to_ipa 8 | from transformers import AutoTokenizer 9 | 10 | 11 | def distribute_phone(n_phone, n_word): 12 | phones_per_word = [0] * n_word 13 | for task in range(n_phone): 14 | min_tasks = min(phones_per_word) 15 | min_index = phones_per_word.index(min_tasks) 16 | phones_per_word[min_index] += 1 17 | return phones_per_word 18 | 19 | def text_normalize(text): 20 | text = es_cleaner.spanish_cleaners(text) 21 | return text 22 | 23 | def post_replace_ph(ph): 24 | rep_map = { 25 | ":": ",", 26 | ";": ",", 27 | ",": ",", 28 | "。": ".", 29 | "!": "!", 30 | "?": "?", 31 | "\n": ".", 32 | "·": ",", 33 | "、": ",", 34 | "...": "…" 35 | } 36 | if ph in rep_map.keys(): 37 | ph = rep_map[ph] 38 | if ph in symbols: 39 | return ph 40 | if ph not in symbols: 41 | ph = "UNK" 42 | return ph 43 | 44 | def refine_ph(phn): 45 | tone = 0 46 | if re.search(r"\d$", phn): 47 | tone = int(phn[-1]) + 1 48 | phn = phn[:-1] 49 | return phn.lower(), tone 50 | 51 | 52 | def refine_syllables(syllables): 53 | tones = [] 54 | phonemes = [] 55 | for phn_list in syllables: 56 | for i in range(len(phn_list)): 57 | phn = phn_list[i] 58 | phn, tone = refine_ph(phn) 59 | phonemes.append(phn) 60 | tones.append(tone) 61 | return phonemes, tones 62 | 63 | 64 | # model_id = 'bert-base-uncased' 65 | model_id = 'dccuchile/bert-base-spanish-wwm-uncased' 66 | tokenizer = AutoTokenizer.from_pretrained(model_id) 67 | 68 | def g2p(text, pad_start_end=True, tokenized=None): 69 | if tokenized is None: 70 | tokenized = tokenizer.tokenize(text) 71 | # import pdb; pdb.set_trace() 72 | phs = [] 73 | ph_groups = [] 74 | for t in tokenized: 75 | if not t.startswith("#"): 76 | ph_groups.append([t]) 77 | else: 78 | ph_groups[-1].append(t.replace("#", "")) 79 | 80 | phones = [] 81 | tones = [] 82 | word2ph = [] 83 | # print(ph_groups) 84 | for group in ph_groups: 85 | w = "".join(group) 86 | phone_len = 0 87 | word_len = len(group) 88 | if w == '[UNK]': 89 | phone_list = ['UNK'] 90 | 
else: 91 | phone_list = list(filter(lambda p: p != " ", es_to_ipa.es2ipa(w))) 92 | 93 | for ph in phone_list: 94 | phones.append(ph) 95 | tones.append(0) 96 | phone_len += 1 97 | aaa = distribute_phone(phone_len, word_len) 98 | word2ph += aaa 99 | # print(phone_list, aaa) 100 | # print('=' * 10) 101 | 102 | if pad_start_end: 103 | phones = ["_"] + phones + ["_"] 104 | tones = [0] + tones + [0] 105 | word2ph = [1] + word2ph + [1] 106 | return phones, tones, word2ph 107 | 108 | def get_bert_feature(text, word2ph, device=None): 109 | from text import spanish_bert 110 | return spanish_bert.get_bert_feature(text, word2ph, device=device) 111 | 112 | if __name__ == "__main__": 113 | text = "en nuestros tiempos estos dos pueblos ilustres empiezan a curarse, gracias sólo a la sana y vigorosa higiene de 1789." 114 | # print(text) 115 | text = text_normalize(text) 116 | print(text) 117 | phones, tones, word2ph = g2p(text) 118 | bert = get_bert_feature(text, word2ph) 119 | print(phones) 120 | print(len(phones), tones, sum(word2ph), bert.shape) 121 | 122 | 123 | -------------------------------------------------------------------------------- /melo/text/spanish_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | model_id = 'dccuchile/bert-base-spanish-wwm-uncased' 6 | tokenizer = AutoTokenizer.from_pretrained(model_id) 7 | model = None 8 | 9 | def get_bert_feature(text, word2ph, device=None): 10 | global model 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if model is None: 20 | model = AutoModelForMaskedLM.from_pretrained(model_id).to( 21 | device 22 | ) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = model(**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert inputs["input_ids"].shape[-1] == len(word2ph) 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | -------------------------------------------------------------------------------- /melo/text/symbols.py: -------------------------------------------------------------------------------- 1 | # punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | punctuation = ["!", "?", "…", ",", ".", "'", "-", "¿", "¡"] 3 | pu_symbols = punctuation + ["SP", "UNK"] 4 | pad = "_" 5 | 6 | # chinese 7 | zh_symbols = [ 8 | "E", 9 | "En", 10 | "a", 11 | "ai", 12 | "an", 13 | "ang", 14 | "ao", 15 | "b", 16 | "c", 17 | "ch", 18 | "d", 19 | "e", 20 | "ei", 21 | "en", 22 | "eng", 23 | "er", 24 | "f", 25 | "g", 26 | "h", 27 | "i", 28 | "i0", 29 | "ia", 30 | "ian", 31 | "iang", 32 | "iao", 33 | "ie", 34 | "in", 35 | "ing", 36 | "iong", 37 | "ir", 38 | "iu", 39 | "j", 40 | "k", 41 | "l", 42 | "m", 43 | "n", 44 | "o", 45 | "ong", 46 | "ou", 47 | "p", 48 | "q", 49 | "r", 50 | "s", 51 | "sh", 52 | "t", 53 | "u", 54 | "ua", 55 | "uai", 56 | "uan", 57 | "uang", 58 | "ui", 59 | "un", 60 | "uo", 61 | "v", 62 | "van", 63 | "ve", 64 | "vn", 65 | "w", 66 | "x", 67 | "y", 68 | "z", 69 | "zh", 70 | "AA", 
71 | "EE", 72 | "OO", 73 | ] 74 | num_zh_tones = 6 75 | 76 | # japanese 77 | ja_symbols = [ 78 | "N", 79 | "a", 80 | "a:", 81 | "b", 82 | "by", 83 | "ch", 84 | "d", 85 | "dy", 86 | "e", 87 | "e:", 88 | "f", 89 | "g", 90 | "gy", 91 | "h", 92 | "hy", 93 | "i", 94 | "i:", 95 | "j", 96 | "k", 97 | "ky", 98 | "m", 99 | "my", 100 | "n", 101 | "ny", 102 | "o", 103 | "o:", 104 | "p", 105 | "py", 106 | "q", 107 | "r", 108 | "ry", 109 | "s", 110 | "sh", 111 | "t", 112 | "ts", 113 | "ty", 114 | "u", 115 | "u:", 116 | "w", 117 | "y", 118 | "z", 119 | "zy", 120 | ] 121 | num_ja_tones = 1 122 | 123 | # English 124 | en_symbols = [ 125 | "aa", 126 | "ae", 127 | "ah", 128 | "ao", 129 | "aw", 130 | "ay", 131 | "b", 132 | "ch", 133 | "d", 134 | "dh", 135 | "eh", 136 | "er", 137 | "ey", 138 | "f", 139 | "g", 140 | "hh", 141 | "ih", 142 | "iy", 143 | "jh", 144 | "k", 145 | "l", 146 | "m", 147 | "n", 148 | "ng", 149 | "ow", 150 | "oy", 151 | "p", 152 | "r", 153 | "s", 154 | "sh", 155 | "t", 156 | "th", 157 | "uh", 158 | "uw", 159 | "V", 160 | "w", 161 | "y", 162 | "z", 163 | "zh", 164 | ] 165 | num_en_tones = 4 166 | 167 | # Korean 168 | kr_symbols = ['ᄌ', 'ᅥ', 'ᆫ', 'ᅦ', 'ᄋ', 'ᅵ', 'ᄅ', 'ᅴ', 'ᄀ', 'ᅡ', 'ᄎ', 'ᅪ', 'ᄑ', 'ᅩ', 'ᄐ', 'ᄃ', 'ᅢ', 'ᅮ', 'ᆼ', 'ᅳ', 'ᄒ', 'ᄆ', 'ᆯ', 'ᆷ', 'ᄂ', 'ᄇ', 'ᄉ', 'ᆮ', 'ᄁ', 'ᅬ', 'ᅣ', 'ᄄ', 'ᆨ', 'ᄍ', 'ᅧ', 'ᄏ', 'ᆸ', 'ᅭ', '(', 'ᄊ', ')', 'ᅲ', 'ᅨ', 'ᄈ', 'ᅱ', 'ᅯ', 'ᅫ', 'ᅰ', 'ᅤ', '~', '\\', '[', ']', '/', '^', ':', 'ㄸ', '*'] 169 | num_kr_tones = 1 170 | 171 | # Spanish 172 | es_symbols = [ 173 | "N", 174 | "Q", 175 | "a", 176 | "b", 177 | "d", 178 | "e", 179 | "f", 180 | "g", 181 | "h", 182 | "i", 183 | "j", 184 | "k", 185 | "l", 186 | "m", 187 | "n", 188 | "o", 189 | "p", 190 | "s", 191 | "t", 192 | "u", 193 | "v", 194 | "w", 195 | "x", 196 | "y", 197 | "z", 198 | "ɑ", 199 | "æ", 200 | "ʃ", 201 | "ʑ", 202 | "ç", 203 | "ɯ", 204 | "ɪ", 205 | "ɔ", 206 | "ɛ", 207 | "ɹ", 208 | "ð", 209 | "ə", 210 | "ɫ", 211 | "ɥ", 212 | "ɸ", 213 | "ʊ", 214 | "ɾ", 215 | "ʒ", 216 | "θ", 217 | "β", 218 | "ŋ", 219 | "ɦ", 220 | "ɡ", 221 | "r", 222 | "ɲ", 223 | "ʝ", 224 | "ɣ", 225 | "ʎ", 226 | "ˈ", 227 | "ˌ", 228 | "ː" 229 | ] 230 | num_es_tones = 1 231 | 232 | # French 233 | fr_symbols = [ 234 | "\u0303", 235 | "œ", 236 | "ø", 237 | "ʁ", 238 | "ɒ", 239 | "ʌ", 240 | "ɜ", 241 | "ɐ" 242 | ] 243 | num_fr_tones = 1 244 | 245 | # German 246 | de_symbols = [ 247 | "ʏ", 248 | "̩" 249 | ] 250 | num_de_tones = 1 251 | 252 | # Russian 253 | ru_symbols = [ 254 | "ɭ", 255 | "ʲ", 256 | "ɕ", 257 | "\"", 258 | "ɵ", 259 | "^", 260 | "ɬ" 261 | ] 262 | num_ru_tones = 1 263 | 264 | # combine all symbols 265 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols + kr_symbols + es_symbols + fr_symbols + de_symbols + ru_symbols)) 266 | symbols = [pad] + normal_symbols + pu_symbols 267 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 268 | 269 | # combine all tones 270 | num_tones = num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones + num_fr_tones + num_de_tones + num_ru_tones 271 | 272 | # language maps 273 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2, "ZH_MIX_EN": 3, 'KR': 4, 'ES': 5, 'SP': 5 ,'FR': 6} 274 | num_languages = len(language_id_map.keys()) 275 | 276 | language_tone_start_map = { 277 | "ZH": 0, 278 | "ZH_MIX_EN": 0, 279 | "JP": num_zh_tones, 280 | "EN": num_zh_tones + num_ja_tones, 281 | 'KR': num_zh_tones + num_ja_tones + num_en_tones, 282 | "ES": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones, 283 | "SP": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones, 284 | "FR": 
num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones, 285 | } 286 | 287 | if __name__ == "__main__": 288 | a = set(zh_symbols) 289 | b = set(en_symbols) 290 | print(sorted(a & b)) 291 | -------------------------------------------------------------------------------- /melo/train.sh: -------------------------------------------------------------------------------- 1 | CONFIG=$1 2 | GPUS=$2 3 | MODEL_NAME=$(basename "$(dirname $CONFIG)") 4 | 5 | PORT=10902 6 | 7 | while : # auto-resume: the code sometimes crash due to bug of gloo on some gpus 8 | do 9 | torchrun --nproc_per_node=$GPUS \ 10 | --master_port=$PORT \ 11 | train.py --c $CONFIG --model $MODEL_NAME 12 | 13 | for PID in $(ps -aux | grep $CONFIG | grep python | awk '{print $2}') 14 | do 15 | echo $PID 16 | kill -9 $PID 17 | done 18 | sleep 30 19 | done -------------------------------------------------------------------------------- /melo/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = 
root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | txtsplit 2 | torch 3 | torchaudio 4 | cached_path 5 | transformers==4.27.4 6 | num2words==0.5.12 7 | unidic_lite==1.0.8 8 | unidic==1.1.0 9 | mecab-python3==1.0.9 10 | pykakasi==2.2.1 11 | fugashi==1.3.0 12 | g2p_en==2.1.0 13 | anyascii==0.3.2 14 | jamo==0.4.1 15 | gruut[de,es,fr]==2.2.3 16 | g2pkk>=0.1.1 17 | librosa==0.9.1 18 | pydub==0.25.1 19 | eng_to_ipa==0.0.2 20 | inflect==7.0.0 21 | unidecode==1.3.7 22 | pypinyin==0.50.0 23 | cn2an==0.5.22 24 | jieba==0.42.1 25 | gradio 26 | langid==1.1.6 27 | tqdm 28 | tensorboard==2.16.2 29 | loguru==0.7.2 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | from setuptools.command.develop import develop 4 | from setuptools.command.install import install 5 | 6 | 7 | cwd = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | with open('requirements.txt') as f: 10 | reqs = f.read().splitlines() 11 | class PostInstallCommand(install): 12 | """Post-installation for installation mode.""" 13 | def run(self): 14 | install.run(self) 15 | os.system('python -m unidic download') 16 | 17 | 18 | class PostDevelopCommand(develop): 19 | """Post-installation for development mode.""" 20 | def run(self): 21 | develop.run(self) 22 | os.system('python -m unidic download') 23 | 24 | setup( 25 | name='melotts', 26 | version='0.1.2', 27 | packages=find_packages(), 28 | include_package_data=True, 29 | install_requires=reqs, 30 | package_data={ 31 | '': ['*.txt', 'cmudict_*'], 32 | }, 33 | entry_points={ 34 | "console_scripts": [ 35 | "melotts = melo.main:main", 36 | "melo = melo.main:main", 37 | "melo-ui = melo.app:main", 38 | ], 39 | }, 40 | ) 41 | -------------------------------------------------------------------------------- /test/basetts_test_resources/en_egs_text.txt: -------------------------------------------------------------------------------- 1 | Did you ever hear a folk tale about a giant turtle? 
2 | Can you name five cars that were popular in the 1970s? 3 | May I ask what's your favorite university and why? 4 | Well, have you ever experienced violence in your life? 5 | Have you ever imposed restrictions? 6 | Did you ever feel guilty for not providing enough care for your pet? 7 | Would you prefer barbecue-flavored chips or plain chips? 8 | Are contractions common in English? 9 | Well, have you ever seen a slam poetry competition? 10 | Am I correct in assuming that bilateral trade agreements favor developed countries? 11 | Are there any scientific theories on why love exists in humans? 12 | Well, do you think figure skating is harder than gymnastics? 13 | Can you tell me if the apartment has a balcony or not? 14 | Have you ever overcome a challenging obstacle positively? 15 | Could you elaborate on the meaning behind that quote? 16 | Shall seniors receive higher taxes? 17 | Do you think adding a liquid flavor to coffee ruins it? 18 | Well, in our conversation about the restaurant, how would you review it overall? 19 | Have you consistently followed through with goals? 20 | Can pilots hear passengers coughing? 21 | Well, have you tried rainbow sprinkles? 22 | Are there any golden retrievers at the local animal shelter? 23 | Have you seen Tyler? 24 | Had you ever deployed to Mars? 25 | Well, have you ever felt intimidated by your competition's tactics? 26 | Are there any specific rules about when you can continue? 27 | Can you describe Antarctica's temperatures? 28 | May I ask, have you ever tasted a bloody mary before? 29 | Did anyone mention the order yet? 30 | Are automatic transmissions more fuel efficient? 31 | Shall we discuss the impact of self-control on personal success? 32 | Have you traveled internationally this May? 33 | Well, have you ever tried shrimp ceviche? 34 | Have you ever seen an act of extraordinary courage in person? 35 | Have you ever wondered how proceed affects the outcome of a project? 36 | Have you calculated the mean weight of all the participants? 37 | Should we bring confetti to the parade? 38 | Do influencers control behavior? 39 | Shall we discuss the price of the new car lease? 40 | Had Nice ever been your home? 41 | Have you ever encountered a gifted child who struggled academically? 42 | Can everyone work together? 43 | Did you know how long an ostrich can survive without water? 44 | Do nurses in long-term care facilities receive adequate training for dementia care? 45 | Has separation ever felt liberating? 46 | Would you prefer a flexible or fixed schedule for work? 47 | Does pension plan have rollover? 48 | Has Vital's mission expanded beyond health supplements? 49 | Have you ever witnessed a bombing attack? 50 | May I predict the outcome of the election based on polls? 51 | Do you think strict parenting leads to more successful children later in life? 52 | Shall we explore nearby parks? 53 | Are there any ways to verify the credibility of online reviews? 54 | Have you ever witnessed a roundabout accident? 55 | Well, upon reflection, do we really want sushi? 56 | Well, have you ever experienced workplace harassment? 57 | Do you think it's sure that the rain will stop soon? 58 | Would you say distance affects relationships? 59 | Can we truly deny the existence of higher power? 60 | Do you think crop yields will be affected by the drought? 61 | Do you think the backup plan is good enough? 62 | Can you tell me, meanwhile, what happened while I was gone? 63 | Did the wise old owl speak? 
64 | Well, have you ever been to a retreat that truly transformed you? 65 | Have you ever had to calculate the exact measurements for a recipe? 66 | Can warning signs prevent accidents while driving on icy roads? 67 | Do you think the current job market offers equal opportunity? 68 | Have you ever analyzed your own dreams? 69 | May I ask if colonialism affected your ancestry? 70 | Well, what chest exercises target the upper pecs? 71 | Are there occasionally unexpected consequences of honesty? 72 | Do you think the new restaurant is overpriced? 73 | Do critics take into account audience preferences? 74 | Has translation technology reached a point where it can accurately translate idioms? 75 | Have you ever been to a music festival in another country? 76 | Do you think our taste in food is genetic? 77 | Are you a hopeless romantic at heart? 78 | Shall we explore abandoned urban places? 79 | Does agency promote individualism? 80 | Well, what implementing strategies? 81 | Have you ever noticed the smallest detail that changed your perspective? 82 | Have you ever seen a normal ghost? 83 | Have you ever considered the considerable effort? 84 | Are there holistic chronic cure? 85 | Did unemployment rates change recently? 86 | Does change come from within or without? 87 | Does the length of the patent term affect innovation rates? 88 | Can Junior play basketball? 89 | Shall we analyze the data? 90 | Have you ever tried the Szechuan cuisine before? 91 | Had you ever debated a controversial topic before? 92 | Have you ever analyzed case? 93 | Is it true that stripping originated in ancient Egypt or Greece? 94 | Have you ever dyed your hair a crazy color? 95 | Shall we compare the top-rated pizza places in our city? 96 | May people in different countries play soccer? 97 | Well, have you recycled? 98 | Shall we precisely measure ingredients? 99 | Can you embrace someone you don't love? -------------------------------------------------------------------------------- /test/basetts_test_resources/es_egs_text.txt: -------------------------------------------------------------------------------- 1 | El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante. 2 | Las estrellas bailan en la noche, creando un espectáculo celestial que despierta el alma. 3 | Las majestuosas montañas se alzan en silencio, guardianas inmutables del tiempo que pasa. 4 | El amor, como un suave perfume, envuelve nuestros corazones con un calor reconfortante. 5 | El susurro suave del viento atraviesa los campos de lavanda, llevándose consigo el aroma de la Provenza. 6 | El resplandor de la luna baña la ciudad dormida en una luz mística. 7 | Las calles empedradas revelan historias antiguas, cada piedra llevando el peso del pasado. 8 | La risa de los niños resuena como una melodía encantada en el suave aire de la primavera. 9 | Los jardines floridos estallan con colores vibrantes, creando un cuadro viviente de la naturaleza. 10 | Las olas acarician suavemente la playa, dejando tras de sí huellas efímeras en la arena. 11 | La Torre Eiffel se yergue con orgullo, testigo silencioso del amor eterno en París. 12 | Las mariposas danzan entre las flores, creando una coreografía grácil en el jardín. 13 | Los animados cafés resuenan con conversaciones apasionadas y el embriagador aroma del café recién molido. 14 | Los ríos serpenteantes atraviesan el campo, reflejando el cielo azul en sus aguas tranquilas. 15 | Los imponentes castillos cuentan historias de caballeros y princesas en un pasado lejano. 
16 | Los viñedos se extienden hasta donde alcanza la vista, sus filas ordenadas testimonio de la antigua tradición vinícola. 17 | Las risas resuenan en las estrechas callejuelas, despertando la vieja ciudad de su quietud. 18 | Los campos de girasoles saludan al sol con sus caras doradas, un mar de oro bajo un cielo azul. 19 | Las notas melódicas de un acordeón flotan en el aire, capturando la esencia musical de las calles parisinas. 20 | Las cumbres nevadas de los Alpes brillan bajo la luz de la luna, un paisaje invernal de ensueño. -------------------------------------------------------------------------------- /test/basetts_test_resources/fr_egs_text.txt: -------------------------------------------------------------------------------- 1 | La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante. 2 | Les étoiles dansent dans la nuit, créant un spectacle céleste qui éveille l'âme. 3 | Les montagnes majestueuses se dressent en silence, gardiennes immuables du temps qui passe. 4 | L'amour, tel un doux parfum, enveloppe nos cœurs d'une chaleur réconfortante. 5 | Le doux murmure du vent traverse les champs de lavande, emportant avec lui le parfum de la Provence. 6 | La lueur de la lune baigne la ville endormie dans une lumière mystique. 7 | Les ruelles pavées révèlent des histoires anciennes, chaque pierre portant le poids du passé. 8 | Le rire des enfants résonne comme une mélodie enchantée dans l'air doux du printemps. 9 | Les jardins fleuris éclatent de couleurs vives, créant un tableau vivant de la nature. 10 | Les vagues caressent doucement la plage, laissant derrière elles des traces éphémères dans le sable. 11 | La Tour Eiffel se dresse fièrement, témoin silencieux de l'amour éternel à Paris. 12 | Les papillons dansent parmi les fleurs, créant une chorégraphie gracieuse dans le jardin. 13 | Les cafés animés résonnent de conversations passionnées et du parfum enivrant du café fraîchement moulu. 14 | Les rivières sinueuses traversent la campagne, reflétant le ciel azur dans leurs eaux calmes. 15 | Les châteaux imposants racontent des contes de chevaliers et de princesses dans un passé lointain. 16 | Les vignobles s'étendent à perte de vue, leurs rangées ordonnées témoignant du savoir-faire viticole ancestral. 17 | Les éclats de rire résonnent dans les ruelles étroites, réveillant la vieille ville de sa quiétude. 18 | Les champs de tournesols saluent le soleil avec leurs visages dorés, une mer d'or sous un ciel d'azur. 19 | Les notes mélodieuses d'un accordéon flottent dans l'air, capturant l'essence musicale des rues parisiennes. 20 | Les sommets enneigés des Alpes brillent sous la lumière de la lune, un paysage hivernal féérique. -------------------------------------------------------------------------------- /test/basetts_test_resources/jp_egs_text.txt: -------------------------------------------------------------------------------- 1 | 彼は毎朝ジョギングをして体を健康に保っています。 2 | 私たちは来年、友人たちと一緒にヨーロッパ旅行を計画しています。 3 | 新しいレストランで美味しい料理を試すことが楽しみです。 4 | 彼女の絵は情熱と芸術性が溢れていて、見る人を魅了します。 5 | 最近、忙しさに追われていて、ゆっくり休む時間がありません。 6 | 日本の文化は多様で魅力的であり、世界中から注目されています。 7 | 彼の犬は忠実で賢く、家族にとって大切な存在です。 8 | 私の友達は常に私をサポートしてくれる信頼できる存在です。 9 | 家族と一緒に過ごす時間は、私にとって何よりも大切です。 10 | 彼の夢は大きく、努力と決意でそれを実現しようとしています。 -------------------------------------------------------------------------------- /test/basetts_test_resources/kr_egs_text.txt: -------------------------------------------------------------------------------- 1 | 안녕하세요! 오늘은 날씨가 정말 좋네요. 2 | 한국 음식을 먹어보고 싶어요. 불고기랑 김치찌개가 제가 좋아하는 음식이에요. 
3 | 요즘에는 한국 드라마를 자주 보고 있어요. 정말 재미있어요. 4 | 한글을 배우는 것이 재미있어요. 조금씩 읽고 쓸 수 있게 되고 있어요. 5 | 친구들과 함께 한국 여행을 계획 중이에요. 서울과 부산을 방문할 예정이에요., -------------------------------------------------------------------------------- /test/basetts_test_resources/zh_mix_en_egs_text.txt: -------------------------------------------------------------------------------- 1 | 人工智能是一种非常适合和促进自上而下集中控制的技术,而加密货币则是一种完全关注自下而上分散合作的技术。 2 | Web 3的一个目标是支持艺术家。 3 | 欢迎来到Web 3与A6Z,一个由团队打造的构建下一代互联网的节目。 4 | 我最喜欢的fruit是苹果。 5 | 今天我们要学习Python programming。 6 | 她在library看书。 7 | 你喜欢听pop music吗? 8 | 今天下午,我们准备去shopping mall购物,然后晚上去看一场movie。 9 | 我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。 10 | 在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。 11 | 今天天气真不错,我们去Paris吃蒸汽海鲜吧!, -------------------------------------------------------------------------------- /test/test_base_model_tts_package.py: -------------------------------------------------------------------------------- 1 | from melo.api import TTS 2 | import os 3 | import glob 4 | import sys 5 | 6 | 7 | language = sys.argv[1] 8 | model = TTS(language=language) 9 | 10 | speaker_ids = model.hps.data.spk2id 11 | speakers = list(speaker_ids.keys()) 12 | 13 | root_folder = language.lower() 14 | if 'zh' in root_folder: 15 | texts = open('basetts_test_resources/zh_mix_en_egs_text.txt', 'r').readlines() 16 | language = 'ZH_MIX_EN' 17 | elif 'es' in root_folder: 18 | texts = open('basetts_test_resources/es_egs_text.txt', 'r').readlines() 19 | language = 'SP' 20 | elif 'fr' in root_folder: 21 | texts = open('basetts_test_resources/fr_egs_text.txt', 'r').readlines() 22 | language = 'FR' 23 | elif 'en' in root_folder: 24 | texts = open('basetts_test_resources/en_egs_text.txt', 'r').readlines() 25 | # texts = ["Boss? You're not my boss, you're just a sad little person who likes to hide behind a computer screen and pretend you have power over others. 
"] 26 | language = 'EN' 27 | elif 'jp' in root_folder: 28 | texts = open('basetts_test_resources/jp_egs_text.txt', 'r').readlines() 29 | language = 'JP' 30 | elif 'kr' in root_folder: 31 | texts = open('basetts_test_resources/kr_egs_text.txt', 'r').readlines() 32 | language = 'KR' 33 | else: 34 | raise NotImplementedError() 35 | 36 | save_dir = os.path.join('basetts_outputs_package', root_folder.split('/')[-1]) 37 | 38 | for speed in [1.0]: 39 | for speaker in speakers: 40 | for sent_id, text in enumerate(texts): 41 | output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav' 42 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 43 | model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed) -------------------------------------------------------------------------------- /test/test_base_model_tts_package_from_S3.py: -------------------------------------------------------------------------------- 1 | from melo.api import TTS 2 | import os 3 | import glob 4 | import sys 5 | 6 | 7 | language = sys.argv[1] 8 | model = TTS(language=language, use_hf=False) 9 | 10 | speaker_ids = model.hps.data.spk2id 11 | speakers = list(speaker_ids.keys()) 12 | 13 | root_folder = language.lower() 14 | if 'zh' in root_folder: 15 | texts = open('basetts_test_resources/zh_mix_en_egs_text.txt', 'r').readlines() 16 | language = 'ZH_MIX_EN' 17 | elif 'es' in root_folder: 18 | texts = open('basetts_test_resources/es_egs_text.txt', 'r').readlines() 19 | language = 'SP' 20 | elif 'fr' in root_folder: 21 | texts = open('basetts_test_resources/fr_egs_text.txt', 'r').readlines() 22 | language = 'FR' 23 | elif 'en' in root_folder: 24 | texts = open('basetts_test_resources/en_egs_text.txt', 'r').readlines() 25 | # texts = ["Boss? You're not my boss, you're just a sad little person who likes to hide behind a computer screen and pretend you have power over others. "] 26 | language = 'EN' 27 | elif 'jp' in root_folder: 28 | texts = open('basetts_test_resources/jp_egs_text.txt', 'r').readlines() 29 | language = 'JP' 30 | elif 'kr' in root_folder: 31 | texts = open('basetts_test_resources/kr_egs_text.txt', 'r').readlines() 32 | language = 'KR' 33 | else: 34 | raise NotImplementedError() 35 | 36 | save_dir = os.path.join('basetts_outputs_package_from_S3', root_folder.split('/')[-1]) 37 | 38 | for speed in [1.0]: 39 | for speaker in speakers: 40 | for sent_id, text in enumerate(texts): 41 | output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav' 42 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 43 | model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed) --------------------------------------------------------------------------------