├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README-zh.md ├── README.md ├── app.py ├── audiosample ├── demo.mp3 ├── fastspeech2.mp3 └── tacotron2.mp3 ├── demo.wav ├── setup.cfg ├── setup.py ├── templates └── index.html └── zhtts ├── __init__.py ├── asset ├── baker_mapper.json ├── fastspeech2_quan.tflite ├── mb_melgan.tflite └── tacotron2_quan.tflite ├── tensorflow_tts ├── __init__.py └── processor │ ├── __init__.py │ ├── baker.py │ ├── base_processor.py │ └── cn_tn.py └── tts.py /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | refe 3 | .vscode 4 | demo.py 5 | model 6 | run_test.py 7 | test.txt 8 | eval.txt 9 | tts_one_sentence.py 10 | zhtts/asset/tacotron2.tflite 11 | notes.md 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | pip-wheel-metadata/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 jackiexiao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include zhtts/asset * -------------------------------------------------------------------------------- /README-zh.md: -------------------------------------------------------------------------------- 1 | # ZhTTS 2 | [English](https://github.com/Jackiexiao/zhtts/blob/main/README.md) 3 | 4 | 在CPU上实时运行的中文语音合成系统(一个简单的示例,使用 Fastspeech2 + MbMelGan),但总体效果离"能用"还有很大差距,供大家参考 5 | 6 | > 实时率 RTF:fastspeech2 为 0.2(CPU:Intel(R) Core(TM) i5-7200U @ 2.50GHz,采样率 24kHz),tacotron2 为 1.6 7 | 8 | 这个项目**主要依赖**于 [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS),做了非常简单的改进: 9 | 10 | * tflite 模型来源于 [colab](https://colab.research.google.com/drive/1Ma3MIcSdLsOxqOKcN1MlElncYMhrOg3J?usp=sharing),感谢 [@azraelkuan](https://github.com/azraelkuan) 11 | * 在标点符号处停顿 12 | * 增加了简单的文本正则化(数字转汉字)TN (Text Normalization),使用 [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization) 13 | 14 | ## 合成效果 15 | text = "2020年,这是一个开源的端到端中文语音合成系统" 16 | 17 | * [zhtts synthesis mp3](https://shimo.im/docs/tcXPY9pdrdRdwqk6/) 18 | 19 | 20 | ## 安装 21 | ``` 22 | pip install zhtts 23 | ``` 24 | 或者克隆本仓库后执行 `pip install .` 25 | 26 | ## 使用 27 | ```python 28 | import zhtts 29 | 30 | text = "2020年,这是一个开源的端到端中文语音合成系统" 31 | tts = zhtts.TTS() # use fastspeech2 by default 32 | 33 | tts.text2wav(text, "demo.wav") 34 | >>> Save wav to demo.wav 35 | 36 | tts.frontend(text) 37 | >>> ('二零二零年,这是一个开源的端到端中文语音合成系统', 'sil ^ er4 #0 l ing2 #0 ^ er4 #0 l ing2 #0 n ian2 #0 #3 zh e4 #0 sh iii4 #0 ^ i2 #0 g e4 #0 k ai1 #0 ^ van2 #0 d e5 #0 d uan1 #0 d ao4 #0 d uan1 #0 zh ong1 #0 ^ uen2 #0 ^ v3 #0 ^ in1 #0 h e2 #0 ch eng2 #0 x i4 #0 t ong3 sil') 38 | 39 | tts.synthesis(text) 40 | >>> array([0., 0., 0., ..., 0., 0., 0.], dtype=float32) 41 | ``` 42 | 43 | ### 网页 api demo 44 | 下载这个项目,先 `pip install flask`,然后 45 | ``` 46 | python app.py 47 | ``` 48 | * 访问 http://localhost:5000 可以直接进行语音合成交互 49 | * 对 http://localhost:5000/api/tts?text=your%20sentence 发起 HTTP GET 请求,返回 WAV 音频: 50 | 51 | ```sh 52 | $ curl -o "helloworld.wav" "http://localhost:5000/api/tts?text=%E4%BD%A0%E5%A5%BD%E4%B8%96%E7%95%8C" 53 | ``` 54 | `%E4%BD%A0%E5%A5%BD%E4%B8%96%E7%95%8C` 是"你好,世界!"的 URL 编码 55 | 56 | ## 使用 Tacotron2 模型 57 | 某些情况下 Tacotron2 合成效果会好一点,不过合成速度会慢不少 58 | ```python 59 | import zhtts 60 | tts = zhtts.TTS(text2mel_name="TACOTRON") 61 | # tts = zhtts.TTS(text2mel_name="FASTSPEECH2") 62 | ``` 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Recommendation: use [PaddleSpeech for Chinese speech synthesis](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/README_cn.md#%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B) 2 | 3 | # ZhTTS 4 | [中文](https://github.com/Jackiexiao/zhtts/blob/main/README-zh.md) 5 | 6 | A demo Chinese text-to-speech system that runs on a CPU in real time (fastspeech2 + mbmelgan). 7 | 8 | > RTF (real-time factor): 0.2 for fastspeech2 on an Intel(R) Core(TM) i5-7200U CPU @ 2.50GHz generating 24 kHz audio; 1.6 for tacotron2. 9 | 10 | This repo is **mainly based on** [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS), with a few small improvements: 11 | 12 | * tflite models come from [colab](https://colab.research.google.com/drive/1Ma3MIcSdLsOxqOKcN1MlElncYMhrOg3J?usp=sharing), thanks to [@azraelkuan](https://github.com/azraelkuan) 13 | * add pauses at punctuation 14 | * add TN (Text Normalization) from [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization) 15 | 16 | ## demo wav 17 | text = "2020年,这是一个开源的端到端中文语音合成系统" 18 | 19 | [zhtts synthesis mp3](https://shimo.im/docs/tcXPY9pdrdRdwqk6/) 20 | 21 | ## Install 22 | ``` 23 | pip install zhtts 24 | ``` 25 | or clone this repo, then `pip install .` 26 | 27 | ## Usage 28 | ```python 29 | import zhtts 30 | 31 | text = "2020年,这是一个开源的端到端中文语音合成系统" 32 | tts = zhtts.TTS() # use fastspeech2 by default 33 | 34 | tts.text2wav(text, "demo.wav") 35 | >>> Save wav to demo.wav 36 | 37 | tts.frontend(text) 38 | >>> ('二零二零年,这是一个开源的端到端中文语音合成系统', 'sil ^ er4 #0 l ing2 #0 ^ er4 #0 l ing2 #0 n ian2 #0 #3 zh e4 #0 sh iii4 #0 ^ i2 #0 g e4 #0 k ai1 #0 ^ van2 #0 d e5 #0 d uan1 #0 d ao4 #0 d uan1 #0 zh ong1 #0 ^ uen2 #0 ^ v3 #0 ^ in1 #0 h e2 #0 ch eng2 #0 x i4 #0 t ong3 sil') 39 | 40 | tts.synthesis(text) 41 | >>> array([0., 0., 0., ..., 0., 0., 0.], dtype=float32) 42 | ``` 43 |
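`synthesis()` returns the raw 24 kHz float32 waveform as a numpy array; if you want to write it out yourself rather than via `text2wav`, a minimal sketch (this mirrors what `app.py` below does with scipy):

```python
import zhtts
from scipy.io import wavfile

tts = zhtts.TTS()
audio = tts.synthesis("2020年,这是一个开源的端到端中文语音合成系统")
wavfile.write("demo.wav", 24000, audio)  # 24 kHz mono float32 WAV
```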
44 | ### web api demo 45 | clone this repo, run `pip install flask` first, then 46 | ``` 47 | python app.py 48 | ``` 49 | * visit http://localhost:5000 for tts interaction 50 | * do an HTTP GET at http://localhost:5000/api/tts?text=your%20sentence to get WAV audio back: 51 | 52 | ```sh 53 | $ curl -o "helloworld.wav" "http://localhost:5000/api/tts?text=%E4%BD%A0%E5%A5%BD%E4%B8%96%E7%95%8C" 54 | ``` 55 | `%E4%BD%A0%E5%A5%BD%E4%B8%96%E7%95%8C` is the URL-encoded form of "你好,世界!"
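The same request from Python, using only the standard library (assumes `app.py` from this repo is running locally, as above):

```python
import urllib.parse
import urllib.request

text = "你好,世界!"
url = "http://localhost:5000/api/tts?text=" + urllib.parse.quote(text)
with urllib.request.urlopen(url) as resp, open("helloworld.wav", "wb") as f:
    f.write(resp.read())  # the endpoint responds with audio/wav bytes
```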
56 | 57 | ## Use tacotron2 instead of fastspeech2 58 | Audio generated by the tacotron2 model sounds better than fastspeech2's, but tacotron2 is much slower. To use tacotron2, change the code: 59 | ```python 60 | import zhtts 61 | tts = zhtts.TTS(text2mel_name="TACOTRON") 62 | # tts = zhtts.TTS(text2mel_name="FASTSPEECH2") 63 | ``` 64 |
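The RTF figures quoted above are wall-clock synthesis time divided by the duration of the generated audio; a minimal way to reproduce the measurement (assumes the 24 kHz output rate used throughout this repo):

```python
import time

import zhtts

tts = zhtts.TTS()
text = "2020年,这是一个开源的端到端中文语音合成系统"
start = time.time()
audio = tts.synthesis(text)
elapsed = time.time() - start
print("RTF:", elapsed / (len(audio) / 24000))  # synthesis time / audio duration
```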
-------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from flask import Flask, Response, render_template, request 4 | from scipy.io import wavfile 5 | 6 | from zhtts import TTS 7 | 8 | # from flask_cors import CORS 9 | 10 | tts = TTS(text2mel_name="FASTSPEECH2") 11 | # tts = TTS(text2mel_name="TACOTRON") 12 | 13 | app = Flask(__name__) 14 | # CORS(app) 15 | 16 | @app.route("/api/tts") 17 | def api_tts(): 18 | text = request.args.get("text", "").strip() 19 | audio = tts.synthesis(text) 20 | 21 | # write the 24 kHz WAV into an in-memory buffer and return it directly 22 | with io.BytesIO() as out: 23 | wavfile.write(out, 24000, audio) 24 | return Response(out.getvalue(), mimetype="audio/wav") 25 | 26 | @app.route("/") 27 | def index(): 28 | return render_template("index.html") 29 | 30 | 31 | if __name__ == "__main__": 32 | app.run(host="0.0.0.0", port=5000) 33 | -------------------------------------------------------------------------------- /audiosample/demo.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/audiosample/demo.mp3 -------------------------------------------------------------------------------- /audiosample/fastspeech2.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/audiosample/fastspeech2.mp3 -------------------------------------------------------------------------------- /audiosample/tacotron2.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/audiosample/tacotron2.mp3 -------------------------------------------------------------------------------- /demo.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/demo.wav -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name='zhtts', 6 | version='0.0.1', 7 | packages=find_packages(), 8 | url='https://github.com/jackiexiao/zhtts', 9 | license='MIT', 10 | author='jackiexiao', 11 | author_email='707610215@qq.com', 12 | description="A demo Chinese Text to Speech system that runs on CPU", 13 | long_description=open("README.md", 'r', encoding='utf-8').read(), 14 | long_description_content_type="text/markdown", 15 | include_package_data=True, 16 | install_requires=( 17 | "tensorflow-cpu>=2.4.0", 18 | "numpy", 19 | "scipy", 20 | "pypinyin", 21 | "dataclasses" 22 | ), 23 | classifiers=( 24 | 'License :: OSI Approved :: MIT License', 25 | 'Programming Language :: Python :: 3.6', 26 | ) 27 | ) 28 | -------------------------------------------------------------------------------- /templates/index.html: --------------------------------------------------------------------------------
[The HTML here was stripped to bare text during extraction. What survives: the page title "Zhtts - Text2Speech Demo", a "Fork me on GitHub" ribbon, the headings "基于TensorFlowTTS的中文TTS-Demo" (Chinese TTS demo based on TensorFlowTTS) and "实时中文语音合成样例" (real-time Chinese speech synthesis samples), and an inline page script (original lines 51-87).]
49 | 50 | 51 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /zhtts/__init__.py: -------------------------------------------------------------------------------- 1 | from .tts import TTS -------------------------------------------------------------------------------- /zhtts/asset/baker_mapper.json: -------------------------------------------------------------------------------- 1 | {"symbol_to_id": {"pad": 0, "sil": 1, "#0": 2, "#1": 3, "#2": 4, "#3": 5, "^": 6, "b": 7, "c": 8, "ch": 9, "d": 10, "f": 11, "g": 12, "h": 13, "j": 14, "k": 15, "l": 16, "m": 17, "n": 18, "p": 19, "q": 20, "r": 21, "s": 22, "sh": 23, "t": 24, "x": 25, "z": 26, "zh": 27, "a1": 28, "a2": 29, "a3": 30, "a4": 31, "a5": 32, "ai1": 33, "ai2": 34, "ai3": 35, "ai4": 36, "ai5": 37, "an1": 38, "an2": 39, "an3": 40, "an4": 41, "an5": 42, "ang1": 43, "ang2": 44, "ang3": 45, "ang4": 46, "ang5": 47, "ao1": 48, "ao2": 49, "ao3": 50, "ao4": 51, "ao5": 52, "e1": 53, "e2": 54, "e3": 55, "e4": 56, "e5": 57, "ei1": 58, "ei2": 59, "ei3": 60, "ei4": 61, "ei5": 62, "en1": 63, "en2": 64, "en3": 65, "en4": 66, "en5": 67, "eng1": 68, "eng2": 69, "eng3": 70, "eng4": 71, "eng5": 72, "er1": 73, "er2": 74, "er3": 75, "er4": 76, "er5": 77, "i1": 78, "i2": 79, "i3": 80, "i4": 81, "i5": 82, "ia1": 83, "ia2": 84, "ia3": 85, "ia4": 86, "ia5": 87, "ian1": 88, "ian2": 89, "ian3": 90, "ian4": 91, "ian5": 92, "iang1": 93, "iang2": 94, "iang3": 95, "iang4": 96, "iang5": 97, "iao1": 98, "iao2": 99, "iao3": 100, "iao4": 101, "iao5": 102, "ie1": 103, "ie2": 104, "ie3": 105, "ie4": 106, "ie5": 107, "ii1": 108, "ii2": 109, "ii3": 110, "ii4": 111, "ii5": 112, "iii1": 113, "iii2": 114, "iii3": 115, "iii4": 116, "iii5": 117, "in1": 118, "in2": 119, "in3": 120, "in4": 121, "in5": 122, "ing1": 123, "ing2": 124, "ing3": 125, "ing4": 126, "ing5": 127, "iong1": 128, "iong2": 129, "iong3": 130, "iong4": 131, "iong5": 132, "iou1": 133, "iou2": 134, "iou3": 135, "iou4": 136, "iou5": 137, "o1": 138, "o2": 139, "o3": 140, "o4": 141, "o5": 142, "ong1": 143, "ong2": 144, "ong3": 145, "ong4": 146, "ong5": 147, "ou1": 148, "ou2": 149, "ou3": 150, "ou4": 151, "ou5": 152, "u1": 153, "u2": 154, "u3": 155, "u4": 156, "u5": 157, "ua1": 158, "ua2": 159, "ua3": 160, "ua4": 161, "ua5": 162, "uai1": 163, "uai2": 164, "uai3": 165, "uai4": 166, "uai5": 167, "uan1": 168, "uan2": 169, "uan3": 170, "uan4": 171, "uan5": 172, "uang1": 173, "uang2": 174, "uang3": 175, "uang4": 176, "uang5": 177, "uei1": 178, "uei2": 179, "uei3": 180, "uei4": 181, "uei5": 182, "uen1": 183, "uen2": 184, "uen3": 185, "uen4": 186, "uen5": 187, "ueng1": 188, "ueng2": 189, "ueng3": 190, "ueng4": 191, "ueng5": 192, "uo1": 193, "uo2": 194, "uo3": 195, "uo4": 196, "uo5": 197, "v1": 198, "v2": 199, "v3": 200, "v4": 201, "v5": 202, "van1": 203, "van2": 204, "van3": 205, "van4": 206, "van5": 207, "ve1": 208, "ve2": 209, "ve3": 210, "ve4": 211, "ve5": 212, "vn1": 213, "vn2": 214, "vn3": 215, "vn4": 216, "vn5": 217, "eos": 218}, "id_to_symbol": {"0": "pad", "1": "sil", "2": "#0", "3": "#1", "4": "#2", "5": "#3", "6": "^", "7": "b", "8": "c", "9": "ch", "10": "d", "11": "f", "12": "g", "13": "h", "14": "j", "15": "k", "16": "l", "17": "m", "18": "n", "19": "p", "20": "q", "21": "r", "22": "s", "23": "sh", "24": "t", "25": "x", "26": "z", "27": "zh", "28": "a1", "29": "a2", "30": "a3", "31": "a4", "32": "a5", "33": "ai1", "34": "ai2", "35": "ai3", "36": "ai4", "37": "ai5", "38": "an1", "39": "an2", "40": "an3", "41": "an4", "42": "an5", "43": "ang1", "44": "ang2", 
"45": "ang3", "46": "ang4", "47": "ang5", "48": "ao1", "49": "ao2", "50": "ao3", "51": "ao4", "52": "ao5", "53": "e1", "54": "e2", "55": "e3", "56": "e4", "57": "e5", "58": "ei1", "59": "ei2", "60": "ei3", "61": "ei4", "62": "ei5", "63": "en1", "64": "en2", "65": "en3", "66": "en4", "67": "en5", "68": "eng1", "69": "eng2", "70": "eng3", "71": "eng4", "72": "eng5", "73": "er1", "74": "er2", "75": "er3", "76": "er4", "77": "er5", "78": "i1", "79": "i2", "80": "i3", "81": "i4", "82": "i5", "83": "ia1", "84": "ia2", "85": "ia3", "86": "ia4", "87": "ia5", "88": "ian1", "89": "ian2", "90": "ian3", "91": "ian4", "92": "ian5", "93": "iang1", "94": "iang2", "95": "iang3", "96": "iang4", "97": "iang5", "98": "iao1", "99": "iao2", "100": "iao3", "101": "iao4", "102": "iao5", "103": "ie1", "104": "ie2", "105": "ie3", "106": "ie4", "107": "ie5", "108": "ii1", "109": "ii2", "110": "ii3", "111": "ii4", "112": "ii5", "113": "iii1", "114": "iii2", "115": "iii3", "116": "iii4", "117": "iii5", "118": "in1", "119": "in2", "120": "in3", "121": "in4", "122": "in5", "123": "ing1", "124": "ing2", "125": "ing3", "126": "ing4", "127": "ing5", "128": "iong1", "129": "iong2", "130": "iong3", "131": "iong4", "132": "iong5", "133": "iou1", "134": "iou2", "135": "iou3", "136": "iou4", "137": "iou5", "138": "o1", "139": "o2", "140": "o3", "141": "o4", "142": "o5", "143": "ong1", "144": "ong2", "145": "ong3", "146": "ong4", "147": "ong5", "148": "ou1", "149": "ou2", "150": "ou3", "151": "ou4", "152": "ou5", "153": "u1", "154": "u2", "155": "u3", "156": "u4", "157": "u5", "158": "ua1", "159": "ua2", "160": "ua3", "161": "ua4", "162": "ua5", "163": "uai1", "164": "uai2", "165": "uai3", "166": "uai4", "167": "uai5", "168": "uan1", "169": "uan2", "170": "uan3", "171": "uan4", "172": "uan5", "173": "uang1", "174": "uang2", "175": "uang3", "176": "uang4", "177": "uang5", "178": "uei1", "179": "uei2", "180": "uei3", "181": "uei4", "182": "uei5", "183": "uen1", "184": "uen2", "185": "uen3", "186": "uen4", "187": "uen5", "188": "ueng1", "189": "ueng2", "190": "ueng3", "191": "ueng4", "192": "ueng5", "193": "uo1", "194": "uo2", "195": "uo3", "196": "uo4", "197": "uo5", "198": "v1", "199": "v2", "200": "v3", "201": "v4", "202": "v5", "203": "van1", "204": "van2", "205": "van3", "206": "van4", "207": "van5", "208": "ve1", "209": "ve2", "210": "ve3", "211": "ve4", "212": "ve5", "213": "vn1", "214": "vn2", "215": "vn3", "216": "vn4", "217": "vn5", "218": "eos"}, "speakers_map": {"baker": 0}, "processor_name": "BakerProcessor", "pinyin_dict": {"a": ["^", "a"], "ai": ["^", "ai"], "an": ["^", "an"], "ang": ["^", "ang"], "ao": ["^", "ao"], "ba": ["b", "a"], "bai": ["b", "ai"], "ban": ["b", "an"], "bang": ["b", "ang"], "bao": ["b", "ao"], "be": ["b", "e"], "bei": ["b", "ei"], "ben": ["b", "en"], "beng": ["b", "eng"], "bi": ["b", "i"], "bian": ["b", "ian"], "biao": ["b", "iao"], "bie": ["b", "ie"], "bin": ["b", "in"], "bing": ["b", "ing"], "bo": ["b", "o"], "bu": ["b", "u"], "ca": ["c", "a"], "cai": ["c", "ai"], "can": ["c", "an"], "cang": ["c", "ang"], "cao": ["c", "ao"], "ce": ["c", "e"], "cen": ["c", "en"], "ceng": ["c", "eng"], "cha": ["ch", "a"], "chai": ["ch", "ai"], "chan": ["ch", "an"], "chang": ["ch", "ang"], "chao": ["ch", "ao"], "che": ["ch", "e"], "chen": ["ch", "en"], "cheng": ["ch", "eng"], "chi": ["ch", "iii"], "chong": ["ch", "ong"], "chou": ["ch", "ou"], "chu": ["ch", "u"], "chua": ["ch", "ua"], "chuai": ["ch", "uai"], "chuan": ["ch", "uan"], "chuang": ["ch", "uang"], "chui": ["ch", "uei"], "chun": ["ch", "uen"], "chuo": 
["ch", "uo"], "ci": ["c", "ii"], "cong": ["c", "ong"], "cou": ["c", "ou"], "cu": ["c", "u"], "cuan": ["c", "uan"], "cui": ["c", "uei"], "cun": ["c", "uen"], "cuo": ["c", "uo"], "da": ["d", "a"], "dai": ["d", "ai"], "dan": ["d", "an"], "dang": ["d", "ang"], "dao": ["d", "ao"], "de": ["d", "e"], "dei": ["d", "ei"], "den": ["d", "en"], "deng": ["d", "eng"], "di": ["d", "i"], "dia": ["d", "ia"], "dian": ["d", "ian"], "diao": ["d", "iao"], "die": ["d", "ie"], "ding": ["d", "ing"], "diu": ["d", "iou"], "dong": ["d", "ong"], "dou": ["d", "ou"], "du": ["d", "u"], "duan": ["d", "uan"], "dui": ["d", "uei"], "dun": ["d", "uen"], "duo": ["d", "uo"], "e": ["^", "e"], "ei": ["^", "ei"], "en": ["^", "en"], "ng": ["^", "en"], "eng": ["^", "eng"], "er": ["^", "er"], "fa": ["f", "a"], "fan": ["f", "an"], "fang": ["f", "ang"], "fei": ["f", "ei"], "fen": ["f", "en"], "feng": ["f", "eng"], "fo": ["f", "o"], "fou": ["f", "ou"], "fu": ["f", "u"], "ga": ["g", "a"], "gai": ["g", "ai"], "gan": ["g", "an"], "gang": ["g", "ang"], "gao": ["g", "ao"], "ge": ["g", "e"], "gei": ["g", "ei"], "gen": ["g", "en"], "geng": ["g", "eng"], "gong": ["g", "ong"], "gou": ["g", "ou"], "gu": ["g", "u"], "gua": ["g", "ua"], "guai": ["g", "uai"], "guan": ["g", "uan"], "guang": ["g", "uang"], "gui": ["g", "uei"], "gun": ["g", "uen"], "guo": ["g", "uo"], "ha": ["h", "a"], "hai": ["h", "ai"], "han": ["h", "an"], "hang": ["h", "ang"], "hao": ["h", "ao"], "he": ["h", "e"], "hei": ["h", "ei"], "hen": ["h", "en"], "heng": ["h", "eng"], "hong": ["h", "ong"], "hou": ["h", "ou"], "hu": ["h", "u"], "hua": ["h", "ua"], "huai": ["h", "uai"], "huan": ["h", "uan"], "huang": ["h", "uang"], "hui": ["h", "uei"], "hun": ["h", "uen"], "huo": ["h", "uo"], "ji": ["j", "i"], "jia": ["j", "ia"], "jian": ["j", "ian"], "jiang": ["j", "iang"], "jiao": ["j", "iao"], "jie": ["j", "ie"], "jin": ["j", "in"], "jing": ["j", "ing"], "jiong": ["j", "iong"], "jiu": ["j", "iou"], "ju": ["j", "v"], "juan": ["j", "van"], "jue": ["j", "ve"], "jun": ["j", "vn"], "ka": ["k", "a"], "kai": ["k", "ai"], "kan": ["k", "an"], "kang": ["k", "ang"], "kao": ["k", "ao"], "ke": ["k", "e"], "kei": ["k", "ei"], "ken": ["k", "en"], "keng": ["k", "eng"], "kong": ["k", "ong"], "kou": ["k", "ou"], "ku": ["k", "u"], "kua": ["k", "ua"], "kuai": ["k", "uai"], "kuan": ["k", "uan"], "kuang": ["k", "uang"], "kui": ["k", "uei"], "kun": ["k", "uen"], "kuo": ["k", "uo"], "la": ["l", "a"], "lai": ["l", "ai"], "lan": ["l", "an"], "lang": ["l", "ang"], "lao": ["l", "ao"], "le": ["l", "e"], "lei": ["l", "ei"], "leng": ["l", "eng"], "li": ["l", "i"], "lia": ["l", "ia"], "lian": ["l", "ian"], "liang": ["l", "iang"], "liao": ["l", "iao"], "lie": ["l", "ie"], "lin": ["l", "in"], "ling": ["l", "ing"], "liu": ["l", "iou"], "lo": ["l", "o"], "long": ["l", "ong"], "lou": ["l", "ou"], "lu": ["l", "u"], "lv": ["l", "v"], "luan": ["l", "uan"], "lve": ["l", "ve"], "lue": ["l", "ve"], "lun": ["l", "uen"], "luo": ["l", "uo"], "ma": ["m", "a"], "mai": ["m", "ai"], "man": ["m", "an"], "mang": ["m", "ang"], "mao": ["m", "ao"], "me": ["m", "e"], "mei": ["m", "ei"], "men": ["m", "en"], "meng": ["m", "eng"], "mi": ["m", "i"], "mian": ["m", "ian"], "miao": ["m", "iao"], "mie": ["m", "ie"], "min": ["m", "in"], "ming": ["m", "ing"], "miu": ["m", "iou"], "mo": ["m", "o"], "mou": ["m", "ou"], "mu": ["m", "u"], "na": ["n", "a"], "nai": ["n", "ai"], "nan": ["n", "an"], "nang": ["n", "ang"], "nao": ["n", "ao"], "ne": ["n", "e"], "nei": ["n", "ei"], "nen": ["n", "en"], "neng": ["n", "eng"], "ni": ["n", "i"], "nia": ["n", "ia"], 
"nian": ["n", "ian"], "niang": ["n", "iang"], "niao": ["n", "iao"], "nie": ["n", "ie"], "nin": ["n", "in"], "ning": ["n", "ing"], "niu": ["n", "iou"], "nong": ["n", "ong"], "nou": ["n", "ou"], "nu": ["n", "u"], "nv": ["n", "v"], "nuan": ["n", "uan"], "nve": ["n", "ve"], "nue": ["n", "ve"], "nuo": ["n", "uo"], "o": ["^", "o"], "ou": ["^", "ou"], "pa": ["p", "a"], "pai": ["p", "ai"], "pan": ["p", "an"], "pang": ["p", "ang"], "pao": ["p", "ao"], "pe": ["p", "e"], "pei": ["p", "ei"], "pen": ["p", "en"], "peng": ["p", "eng"], "pi": ["p", "i"], "pian": ["p", "ian"], "piao": ["p", "iao"], "pie": ["p", "ie"], "pin": ["p", "in"], "ping": ["p", "ing"], "po": ["p", "o"], "pou": ["p", "ou"], "pu": ["p", "u"], "qi": ["q", "i"], "qia": ["q", "ia"], "qian": ["q", "ian"], "qiang": ["q", "iang"], "qiao": ["q", "iao"], "qie": ["q", "ie"], "qin": ["q", "in"], "qing": ["q", "ing"], "qiong": ["q", "iong"], "qiu": ["q", "iou"], "qu": ["q", "v"], "quan": ["q", "van"], "que": ["q", "ve"], "qun": ["q", "vn"], "ran": ["r", "an"], "rang": ["r", "ang"], "rao": ["r", "ao"], "re": ["r", "e"], "ren": ["r", "en"], "reng": ["r", "eng"], "ri": ["r", "iii"], "rong": ["r", "ong"], "rou": ["r", "ou"], "ru": ["r", "u"], "rua": ["r", "ua"], "ruan": ["r", "uan"], "rui": ["r", "uei"], "run": ["r", "uen"], "ruo": ["r", "uo"], "sa": ["s", "a"], "sai": ["s", "ai"], "san": ["s", "an"], "sang": ["s", "ang"], "sao": ["s", "ao"], "se": ["s", "e"], "sen": ["s", "en"], "seng": ["s", "eng"], "sha": ["sh", "a"], "shai": ["sh", "ai"], "shan": ["sh", "an"], "shang": ["sh", "ang"], "shao": ["sh", "ao"], "she": ["sh", "e"], "shei": ["sh", "ei"], "shen": ["sh", "en"], "sheng": ["sh", "eng"], "shi": ["sh", "iii"], "shou": ["sh", "ou"], "shu": ["sh", "u"], "shua": ["sh", "ua"], "shuai": ["sh", "uai"], "shuan": ["sh", "uan"], "shuang": ["sh", "uang"], "shui": ["sh", "uei"], "shun": ["sh", "uen"], "shuo": ["sh", "uo"], "si": ["s", "ii"], "song": ["s", "ong"], "sou": ["s", "ou"], "su": ["s", "u"], "suan": ["s", "uan"], "sui": ["s", "uei"], "sun": ["s", "uen"], "suo": ["s", "uo"], "ta": ["t", "a"], "tai": ["t", "ai"], "tan": ["t", "an"], "tang": ["t", "ang"], "tao": ["t", "ao"], "te": ["t", "e"], "tei": ["t", "ei"], "teng": ["t", "eng"], "ti": ["t", "i"], "tian": ["t", "ian"], "tiao": ["t", "iao"], "tie": ["t", "ie"], "ting": ["t", "ing"], "tong": ["t", "ong"], "tou": ["t", "ou"], "tu": ["t", "u"], "tuan": ["t", "uan"], "tui": ["t", "uei"], "tun": ["t", "uen"], "tuo": ["t", "uo"], "wa": ["^", "ua"], "wai": ["^", "uai"], "wan": ["^", "uan"], "wang": ["^", "uang"], "wei": ["^", "uei"], "wen": ["^", "uen"], "weng": ["^", "ueng"], "wo": ["^", "uo"], "wu": ["^", "u"], "xi": ["x", "i"], "xia": ["x", "ia"], "xian": ["x", "ian"], "xiang": ["x", "iang"], "xiao": ["x", "iao"], "xie": ["x", "ie"], "xin": ["x", "in"], "xing": ["x", "ing"], "xiong": ["x", "iong"], "xiu": ["x", "iou"], "xu": ["x", "v"], "xuan": ["x", "van"], "xue": ["x", "ve"], "xun": ["x", "vn"], "ya": ["^", "ia"], "yan": ["^", "ian"], "yang": ["^", "iang"], "yao": ["^", "iao"], "ye": ["^", "ie"], "yi": ["^", "i"], "yin": ["^", "in"], "ying": ["^", "ing"], "yo": ["^", "iou"], "yong": ["^", "iong"], "you": ["^", "iou"], "yu": ["^", "v"], "yuan": ["^", "van"], "yue": ["^", "ve"], "yun": ["^", "vn"], "za": ["z", "a"], "zai": ["z", "ai"], "zan": ["z", "an"], "zang": ["z", "ang"], "zao": ["z", "ao"], "ze": ["z", "e"], "zei": ["z", "ei"], "zen": ["z", "en"], "zeng": ["z", "eng"], "zha": ["zh", "a"], "zhai": ["zh", "ai"], "zhan": ["zh", "an"], "zhang": ["zh", "ang"], "zhao": ["zh", "ao"], "zhe": ["zh", 
"e"], "zhei": ["zh", "ei"], "zhen": ["zh", "en"], "zheng": ["zh", "eng"], "zhi": ["zh", "iii"], "zhong": ["zh", "ong"], "zhou": ["zh", "ou"], "zhu": ["zh", "u"], "zhua": ["zh", "ua"], "zhuai": ["zh", "uai"], "zhuan": ["zh", "uan"], "zhuang": ["zh", "uang"], "zhui": ["zh", "uei"], "zhun": ["zh", "uen"], "zhuo": ["zh", "uo"], "zi": ["z", "ii"], "zong": ["z", "ong"], "zou": ["z", "ou"], "zu": ["z", "u"], "zuan": ["z", "uan"], "zui": ["z", "uei"], "zun": ["z", "uen"], "zuo": ["z", "uo"]}} -------------------------------------------------------------------------------- /zhtts/asset/fastspeech2_quan.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/zhtts/asset/fastspeech2_quan.tflite -------------------------------------------------------------------------------- /zhtts/asset/mb_melgan.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/zhtts/asset/mb_melgan.tflite -------------------------------------------------------------------------------- /zhtts/asset/tacotron2_quan.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/zhtts/asset/tacotron2_quan.tflite -------------------------------------------------------------------------------- /zhtts/tensorflow_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackiexiao/zhtts/f96d084506aaeb3d1bcdd066525aad891ef66949/zhtts/tensorflow_tts/__init__.py -------------------------------------------------------------------------------- /zhtts/tensorflow_tts/processor/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_processor import BaseProcessor 2 | from .baker import BakerProcessor -------------------------------------------------------------------------------- /zhtts/tensorflow_tts/processor/baker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2020 TensorFlowTTS Team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Perform preprocessing and raw feature extraction for Baker dataset.""" 16 | 17 | import os 18 | import re 19 | from typing import Dict, List, Union, Tuple, Any 20 | 21 | # import librosa 22 | import numpy as np 23 | # import soundfile as sf 24 | from dataclasses import dataclass, field 25 | from pypinyin import Style 26 | from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin 27 | from pypinyin.converter import DefaultConverter 28 | from pypinyin.core import Pinyin 29 | from .base_processor import BaseProcessor 30 | from .cn_tn import NSWNormalizer 31 | 32 | _pad = ["pad"] 33 | _eos = ["eos"] 34 | _pause = ["sil", "#0", "#1", "#2", "#3"] 35 | 36 | _initials = [ 37 | "^", 38 | "b", 39 | "c", 40 | "ch", 41 | "d", 42 | "f", 43 | "g", 44 | "h", 45 | "j", 46 | "k", 47 | "l", 48 | "m", 49 | "n", 50 | "p", 51 | "q", 52 | "r", 53 | "s", 54 | "sh", 55 | "t", 56 | "x", 57 | "z", 58 | "zh", 59 | ] 60 | 61 | _tones = ["1", "2", "3", "4", "5"] 62 | 63 | _finals = [ 64 | "a", 65 | "ai", 66 | "an", 67 | "ang", 68 | "ao", 69 | "e", 70 | "ei", 71 | "en", 72 | "eng", 73 | "er", 74 | "i", 75 | "ia", 76 | "ian", 77 | "iang", 78 | "iao", 79 | "ie", 80 | "ii", 81 | "iii", 82 | "in", 83 | "ing", 84 | "iong", 85 | "iou", 86 | "o", 87 | "ong", 88 | "ou", 89 | "u", 90 | "ua", 91 | "uai", 92 | "uan", 93 | "uang", 94 | "uei", 95 | "uen", 96 | "ueng", 97 | "uo", 98 | "v", 99 | "van", 100 | "ve", 101 | "vn", 102 | ] 103 | 104 | 105 | ALPHA_PHONE_DICT = { 106 | 'A': ['EE', 'EI1'], 107 | 'B': ['B', 'I4'], 108 | 'C': ['S', 'I1'], 109 | 'D': ['D', 'I4'], 110 | 'E': ['II', 'I4'], 111 | 'F': ['EE', 'EI2', 'F', 'U5'], 112 | 'G': ['J', 'I4'], 113 | 'H': ['EE', 'EI1', 'Q', 'U1'], 114 | 'I': ['AA', 'AI4'], 115 | 'J': ['J', 'IE4'], 116 | 'K': ['K', 'IE4'], 117 | 'L': ['EE', 'EI2', 'L', 'E5'], 118 | 'M': ['EE', 'EI2', 'M', 'ENG5'], 119 | 'N': ['EE', 'EN1'], 120 | 'O': ['OO', 'OU1'], 121 | 'P': ['P', 'I1'], 122 | 'Q': ['Q', 'OU1'], 123 | 'R': ['AA', 'AI1', 'EE', 'ER5'], 124 | 'S': ['EE', 'EI2', 'S', 'IY1'], 125 | 'T': ['T', 'I4'], 126 | 'U': ['II', 'IU1'], 127 | 'V': ['UU', 'UI1'], 128 | 'W': ['D', 'A2', 'B', 'U5', 'L', 'IU5'], 129 | 'X': ['EE', 'EI2', 'K', 'IE5', 'S', 'IY1'], 130 | 'Y': ['UU', 'UAI1'], 131 | 'Z': ['Z', 'E1']} 132 | 133 | _alpha_phones = [] 134 | [_alpha_phones.extend(i) for i in ALPHA_PHONE_DICT.values()] 135 | 136 | # BAKER_SYMBOLS = _pad + _pause + _initials + [i + j for i in _finals for j in _tones] + _eos + _alpha_phones 137 | # TODO 等待支持英文字母 138 | BAKER_SYMBOLS = _pad + _pause + _initials + [i + j for i in _finals for j in _tones] + _eos 139 | 140 | PINYIN_DICT = { 141 | "a": ("^", "a"), 142 | "ai": ("^", "ai"), 143 | "an": ("^", "an"), 144 | "ang": ("^", "ang"), 145 | "ao": ("^", "ao"), 146 | "ba": ("b", "a"), 147 | "bai": ("b", "ai"), 148 | "ban": ("b", "an"), 149 | "bang": ("b", "ang"), 150 | "bao": ("b", "ao"), 151 | "be": ("b", "e"), 152 | "bei": ("b", "ei"), 153 | "ben": ("b", "en"), 154 | "beng": ("b", "eng"), 155 | "bi": ("b", "i"), 156 | "bian": ("b", "ian"), 157 | "biao": ("b", "iao"), 158 | "bie": ("b", "ie"), 159 | "bin": ("b", "in"), 160 | "bing": ("b", "ing"), 161 | "bo": ("b", "o"), 162 | "bu": ("b", "u"), 163 | "ca": ("c", "a"), 164 | "cai": ("c", "ai"), 165 | "can": ("c", "an"), 166 | "cang": ("c", "ang"), 167 | "cao": ("c", "ao"), 168 | "ce": ("c", "e"), 169 | "cen": ("c", "en"), 170 | "ceng": ("c", "eng"), 171 | "cha": ("ch", "a"), 172 | "chai": ("ch", "ai"), 173 | "chan": ("ch", "an"), 174 | "chang": ("ch", "ang"), 175 | "chao": ("ch", "ao"), 176 | "che": ("ch", "e"), 177 
| "chen": ("ch", "en"), 178 | "cheng": ("ch", "eng"), 179 | "chi": ("ch", "iii"), 180 | "chong": ("ch", "ong"), 181 | "chou": ("ch", "ou"), 182 | "chu": ("ch", "u"), 183 | "chua": ("ch", "ua"), 184 | "chuai": ("ch", "uai"), 185 | "chuan": ("ch", "uan"), 186 | "chuang": ("ch", "uang"), 187 | "chui": ("ch", "uei"), 188 | "chun": ("ch", "uen"), 189 | "chuo": ("ch", "uo"), 190 | "ci": ("c", "ii"), 191 | "cong": ("c", "ong"), 192 | "cou": ("c", "ou"), 193 | "cu": ("c", "u"), 194 | "cuan": ("c", "uan"), 195 | "cui": ("c", "uei"), 196 | "cun": ("c", "uen"), 197 | "cuo": ("c", "uo"), 198 | "da": ("d", "a"), 199 | "dai": ("d", "ai"), 200 | "dan": ("d", "an"), 201 | "dang": ("d", "ang"), 202 | "dao": ("d", "ao"), 203 | "de": ("d", "e"), 204 | "dei": ("d", "ei"), 205 | "den": ("d", "en"), 206 | "deng": ("d", "eng"), 207 | "di": ("d", "i"), 208 | "dia": ("d", "ia"), 209 | "dian": ("d", "ian"), 210 | "diao": ("d", "iao"), 211 | "die": ("d", "ie"), 212 | "ding": ("d", "ing"), 213 | "diu": ("d", "iou"), 214 | "dong": ("d", "ong"), 215 | "dou": ("d", "ou"), 216 | "du": ("d", "u"), 217 | "duan": ("d", "uan"), 218 | "dui": ("d", "uei"), 219 | "dun": ("d", "uen"), 220 | "duo": ("d", "uo"), 221 | "e": ("^", "e"), 222 | "ei": ("^", "ei"), 223 | "en": ("^", "en"), 224 | "ng": ("^", "en"), 225 | "eng": ("^", "eng"), 226 | "er": ("^", "er"), 227 | "fa": ("f", "a"), 228 | "fan": ("f", "an"), 229 | "fang": ("f", "ang"), 230 | "fei": ("f", "ei"), 231 | "fen": ("f", "en"), 232 | "feng": ("f", "eng"), 233 | "fo": ("f", "o"), 234 | "fou": ("f", "ou"), 235 | "fu": ("f", "u"), 236 | "ga": ("g", "a"), 237 | "gai": ("g", "ai"), 238 | "gan": ("g", "an"), 239 | "gang": ("g", "ang"), 240 | "gao": ("g", "ao"), 241 | "ge": ("g", "e"), 242 | "gei": ("g", "ei"), 243 | "gen": ("g", "en"), 244 | "geng": ("g", "eng"), 245 | "gong": ("g", "ong"), 246 | "gou": ("g", "ou"), 247 | "gu": ("g", "u"), 248 | "gua": ("g", "ua"), 249 | "guai": ("g", "uai"), 250 | "guan": ("g", "uan"), 251 | "guang": ("g", "uang"), 252 | "gui": ("g", "uei"), 253 | "gun": ("g", "uen"), 254 | "guo": ("g", "uo"), 255 | "ha": ("h", "a"), 256 | "hai": ("h", "ai"), 257 | "han": ("h", "an"), 258 | "hang": ("h", "ang"), 259 | "hao": ("h", "ao"), 260 | "he": ("h", "e"), 261 | "hei": ("h", "ei"), 262 | "hen": ("h", "en"), 263 | "heng": ("h", "eng"), 264 | "hong": ("h", "ong"), 265 | "hou": ("h", "ou"), 266 | "hu": ("h", "u"), 267 | "hua": ("h", "ua"), 268 | "huai": ("h", "uai"), 269 | "huan": ("h", "uan"), 270 | "huang": ("h", "uang"), 271 | "hui": ("h", "uei"), 272 | "hun": ("h", "uen"), 273 | "huo": ("h", "uo"), 274 | "ji": ("j", "i"), 275 | "jia": ("j", "ia"), 276 | "jian": ("j", "ian"), 277 | "jiang": ("j", "iang"), 278 | "jiao": ("j", "iao"), 279 | "jie": ("j", "ie"), 280 | "jin": ("j", "in"), 281 | "jing": ("j", "ing"), 282 | "jiong": ("j", "iong"), 283 | "jiu": ("j", "iou"), 284 | "ju": ("j", "v"), 285 | "juan": ("j", "van"), 286 | "jue": ("j", "ve"), 287 | "jun": ("j", "vn"), 288 | "ka": ("k", "a"), 289 | "kai": ("k", "ai"), 290 | "kan": ("k", "an"), 291 | "kang": ("k", "ang"), 292 | "kao": ("k", "ao"), 293 | "ke": ("k", "e"), 294 | "kei": ("k", "ei"), 295 | "ken": ("k", "en"), 296 | "keng": ("k", "eng"), 297 | "kong": ("k", "ong"), 298 | "kou": ("k", "ou"), 299 | "ku": ("k", "u"), 300 | "kua": ("k", "ua"), 301 | "kuai": ("k", "uai"), 302 | "kuan": ("k", "uan"), 303 | "kuang": ("k", "uang"), 304 | "kui": ("k", "uei"), 305 | "kun": ("k", "uen"), 306 | "kuo": ("k", "uo"), 307 | "la": ("l", "a"), 308 | "lai": ("l", "ai"), 309 | "lan": ("l", "an"), 310 | "lang": 
("l", "ang"), 311 | "lao": ("l", "ao"), 312 | "le": ("l", "e"), 313 | "lei": ("l", "ei"), 314 | "leng": ("l", "eng"), 315 | "li": ("l", "i"), 316 | "lia": ("l", "ia"), 317 | "lian": ("l", "ian"), 318 | "liang": ("l", "iang"), 319 | "liao": ("l", "iao"), 320 | "lie": ("l", "ie"), 321 | "lin": ("l", "in"), 322 | "ling": ("l", "ing"), 323 | "liu": ("l", "iou"), 324 | "lo": ("l", "o"), 325 | "long": ("l", "ong"), 326 | "lou": ("l", "ou"), 327 | "lu": ("l", "u"), 328 | "lv": ("l", "v"), 329 | "luan": ("l", "uan"), 330 | "lve": ("l", "ve"), 331 | "lue": ("l", "ve"), 332 | "lun": ("l", "uen"), 333 | "luo": ("l", "uo"), 334 | "ma": ("m", "a"), 335 | "mai": ("m", "ai"), 336 | "man": ("m", "an"), 337 | "mang": ("m", "ang"), 338 | "mao": ("m", "ao"), 339 | "me": ("m", "e"), 340 | "mei": ("m", "ei"), 341 | "men": ("m", "en"), 342 | "meng": ("m", "eng"), 343 | "mi": ("m", "i"), 344 | "mian": ("m", "ian"), 345 | "miao": ("m", "iao"), 346 | "mie": ("m", "ie"), 347 | "min": ("m", "in"), 348 | "ming": ("m", "ing"), 349 | "miu": ("m", "iou"), 350 | "mo": ("m", "o"), 351 | "mou": ("m", "ou"), 352 | "mu": ("m", "u"), 353 | "na": ("n", "a"), 354 | "nai": ("n", "ai"), 355 | "nan": ("n", "an"), 356 | "nang": ("n", "ang"), 357 | "nao": ("n", "ao"), 358 | "ne": ("n", "e"), 359 | "nei": ("n", "ei"), 360 | "nen": ("n", "en"), 361 | "neng": ("n", "eng"), 362 | "ni": ("n", "i"), 363 | "nia": ("n", "ia"), 364 | "nian": ("n", "ian"), 365 | "niang": ("n", "iang"), 366 | "niao": ("n", "iao"), 367 | "nie": ("n", "ie"), 368 | "nin": ("n", "in"), 369 | "ning": ("n", "ing"), 370 | "niu": ("n", "iou"), 371 | "nong": ("n", "ong"), 372 | "nou": ("n", "ou"), 373 | "nu": ("n", "u"), 374 | "nv": ("n", "v"), 375 | "nuan": ("n", "uan"), 376 | "nve": ("n", "ve"), 377 | "nue": ("n", "ve"), 378 | "nuo": ("n", "uo"), 379 | "o": ("^", "o"), 380 | "ou": ("^", "ou"), 381 | "pa": ("p", "a"), 382 | "pai": ("p", "ai"), 383 | "pan": ("p", "an"), 384 | "pang": ("p", "ang"), 385 | "pao": ("p", "ao"), 386 | "pe": ("p", "e"), 387 | "pei": ("p", "ei"), 388 | "pen": ("p", "en"), 389 | "peng": ("p", "eng"), 390 | "pi": ("p", "i"), 391 | "pian": ("p", "ian"), 392 | "piao": ("p", "iao"), 393 | "pie": ("p", "ie"), 394 | "pin": ("p", "in"), 395 | "ping": ("p", "ing"), 396 | "po": ("p", "o"), 397 | "pou": ("p", "ou"), 398 | "pu": ("p", "u"), 399 | "qi": ("q", "i"), 400 | "qia": ("q", "ia"), 401 | "qian": ("q", "ian"), 402 | "qiang": ("q", "iang"), 403 | "qiao": ("q", "iao"), 404 | "qie": ("q", "ie"), 405 | "qin": ("q", "in"), 406 | "qing": ("q", "ing"), 407 | "qiong": ("q", "iong"), 408 | "qiu": ("q", "iou"), 409 | "qu": ("q", "v"), 410 | "quan": ("q", "van"), 411 | "que": ("q", "ve"), 412 | "qun": ("q", "vn"), 413 | "ran": ("r", "an"), 414 | "rang": ("r", "ang"), 415 | "rao": ("r", "ao"), 416 | "re": ("r", "e"), 417 | "ren": ("r", "en"), 418 | "reng": ("r", "eng"), 419 | "ri": ("r", "iii"), 420 | "rong": ("r", "ong"), 421 | "rou": ("r", "ou"), 422 | "ru": ("r", "u"), 423 | "rua": ("r", "ua"), 424 | "ruan": ("r", "uan"), 425 | "rui": ("r", "uei"), 426 | "run": ("r", "uen"), 427 | "ruo": ("r", "uo"), 428 | "sa": ("s", "a"), 429 | "sai": ("s", "ai"), 430 | "san": ("s", "an"), 431 | "sang": ("s", "ang"), 432 | "sao": ("s", "ao"), 433 | "se": ("s", "e"), 434 | "sen": ("s", "en"), 435 | "seng": ("s", "eng"), 436 | "sha": ("sh", "a"), 437 | "shai": ("sh", "ai"), 438 | "shan": ("sh", "an"), 439 | "shang": ("sh", "ang"), 440 | "shao": ("sh", "ao"), 441 | "she": ("sh", "e"), 442 | "shei": ("sh", "ei"), 443 | "shen": ("sh", "en"), 444 | "sheng": ("sh", "eng"), 445 | 
"shi": ("sh", "iii"), 446 | "shou": ("sh", "ou"), 447 | "shu": ("sh", "u"), 448 | "shua": ("sh", "ua"), 449 | "shuai": ("sh", "uai"), 450 | "shuan": ("sh", "uan"), 451 | "shuang": ("sh", "uang"), 452 | "shui": ("sh", "uei"), 453 | "shun": ("sh", "uen"), 454 | "shuo": ("sh", "uo"), 455 | "si": ("s", "ii"), 456 | "song": ("s", "ong"), 457 | "sou": ("s", "ou"), 458 | "su": ("s", "u"), 459 | "suan": ("s", "uan"), 460 | "sui": ("s", "uei"), 461 | "sun": ("s", "uen"), 462 | "suo": ("s", "uo"), 463 | "ta": ("t", "a"), 464 | "tai": ("t", "ai"), 465 | "tan": ("t", "an"), 466 | "tang": ("t", "ang"), 467 | "tao": ("t", "ao"), 468 | "te": ("t", "e"), 469 | "tei": ("t", "ei"), 470 | "teng": ("t", "eng"), 471 | "ti": ("t", "i"), 472 | "tian": ("t", "ian"), 473 | "tiao": ("t", "iao"), 474 | "tie": ("t", "ie"), 475 | "ting": ("t", "ing"), 476 | "tong": ("t", "ong"), 477 | "tou": ("t", "ou"), 478 | "tu": ("t", "u"), 479 | "tuan": ("t", "uan"), 480 | "tui": ("t", "uei"), 481 | "tun": ("t", "uen"), 482 | "tuo": ("t", "uo"), 483 | "wa": ("^", "ua"), 484 | "wai": ("^", "uai"), 485 | "wan": ("^", "uan"), 486 | "wang": ("^", "uang"), 487 | "wei": ("^", "uei"), 488 | "wen": ("^", "uen"), 489 | "weng": ("^", "ueng"), 490 | "wo": ("^", "uo"), 491 | "wu": ("^", "u"), 492 | "xi": ("x", "i"), 493 | "xia": ("x", "ia"), 494 | "xian": ("x", "ian"), 495 | "xiang": ("x", "iang"), 496 | "xiao": ("x", "iao"), 497 | "xie": ("x", "ie"), 498 | "xin": ("x", "in"), 499 | "xing": ("x", "ing"), 500 | "xiong": ("x", "iong"), 501 | "xiu": ("x", "iou"), 502 | "xu": ("x", "v"), 503 | "xuan": ("x", "van"), 504 | "xue": ("x", "ve"), 505 | "xun": ("x", "vn"), 506 | "ya": ("^", "ia"), 507 | "yan": ("^", "ian"), 508 | "yang": ("^", "iang"), 509 | "yao": ("^", "iao"), 510 | "ye": ("^", "ie"), 511 | "yi": ("^", "i"), 512 | "yin": ("^", "in"), 513 | "ying": ("^", "ing"), 514 | "yo": ("^", "iou"), 515 | "yong": ("^", "iong"), 516 | "you": ("^", "iou"), 517 | "yu": ("^", "v"), 518 | "yuan": ("^", "van"), 519 | "yue": ("^", "ve"), 520 | "yun": ("^", "vn"), 521 | "za": ("z", "a"), 522 | "zai": ("z", "ai"), 523 | "zan": ("z", "an"), 524 | "zang": ("z", "ang"), 525 | "zao": ("z", "ao"), 526 | "ze": ("z", "e"), 527 | "zei": ("z", "ei"), 528 | "zen": ("z", "en"), 529 | "zeng": ("z", "eng"), 530 | "zha": ("zh", "a"), 531 | "zhai": ("zh", "ai"), 532 | "zhan": ("zh", "an"), 533 | "zhang": ("zh", "ang"), 534 | "zhao": ("zh", "ao"), 535 | "zhe": ("zh", "e"), 536 | "zhei": ("zh", "ei"), 537 | "zhen": ("zh", "en"), 538 | "zheng": ("zh", "eng"), 539 | "zhi": ("zh", "iii"), 540 | "zhong": ("zh", "ong"), 541 | "zhou": ("zh", "ou"), 542 | "zhu": ("zh", "u"), 543 | "zhua": ("zh", "ua"), 544 | "zhuai": ("zh", "uai"), 545 | "zhuan": ("zh", "uan"), 546 | "zhuang": ("zh", "uang"), 547 | "zhui": ("zh", "uei"), 548 | "zhun": ("zh", "uen"), 549 | "zhuo": ("zh", "uo"), 550 | "zi": ("z", "ii"), 551 | "zong": ("z", "ong"), 552 | "zou": ("z", "ou"), 553 | "zu": ("z", "u"), 554 | "zuan": ("z", "uan"), 555 | "zui": ("z", "uei"), 556 | "zun": ("z", "uen"), 557 | "zuo": ("z", "uo"), 558 | } 559 | 560 | 561 | zh_pattern = re.compile("[\u4e00-\u9fa5]") 562 | alpha_pattern = re.compile(r"[a-zA-Z]") 563 | 564 | 565 | def is_zh(word): 566 | global zh_pattern 567 | match = zh_pattern.search(word) 568 | return match is not None 569 | 570 | def is_alpha(word): 571 | global alpha_pattern 572 | match = alpha_pattern.search(word) 573 | return match is not None 574 | 575 | 576 | class MyConverter(NeutralToneWith5Mixin, DefaultConverter): 577 | pass 578 | 579 | 580 | @dataclass 581 | class 
BakerProcessor(BaseProcessor): 582 | 583 | pinyin_dict: Dict[str, Tuple[str, str]] = field(default_factory=lambda: PINYIN_DICT) 584 | cleaner_names: str = None 585 | target_rate: int = 24000 586 | speaker_name: str = "baker" 587 | 588 | def __post_init__(self): 589 | super().__post_init__() 590 | self.pinyin_parser = self.get_pinyin_parser() 591 | 592 | def setup_eos_token(self): 593 | return _eos[0] 594 | 595 | def create_items(self): 596 | items = [] 597 | if self.data_dir: 598 | with open( 599 | os.path.join(self.data_dir, "ProsodyLabeling/000001-010000.txt"), 600 | encoding="utf-8", 601 | ) as ttf: 602 | lines = ttf.readlines() 603 | for idx in range(0, len(lines), 2): 604 | utt_id, chn_char = lines[idx].strip().split() # e.g. [100001, 中文] 605 | pinyin = lines[idx + 1].strip().split() # e.g. ['zhong1', 'wen2'] 606 | phonemes = self.get_phoneme_from_char_and_pinyin(chn_char, pinyin) 607 | wav_path = os.path.join(self.data_dir, "Wave", "%s.wav" % utt_id) 608 | items.append( 609 | [" ".join(phonemes), wav_path, utt_id, self.speaker_name] 610 | ) 611 | self.items = items 612 | 613 | def get_phoneme_from_char_and_pinyin(self, chn_char, pinyin): 614 | # we do not need #4, use sil to replace it 615 | chn_char = chn_char.replace("#4", "") 616 | char_len = len(chn_char) 617 | i, j = 0, 0 618 | result = ["sil"] 619 | while i < char_len: 620 | cur_char = chn_char[i] 621 | if is_zh(cur_char): 622 | if pinyin[j][:-1] == 'n': # special case: 嗯 can be romanized as bare 'n'; map it to 'en' 623 | pinyin[j] = 'en' + pinyin[j][-1] 624 | if pinyin[j][:-1] not in self.pinyin_dict: # handle erhua (rhotacized finals, 儿化音) 625 | assert chn_char[i + 1] == "儿", f"current_char : {cur_char}, next_char: {chn_char[i+1]}, cur_pinyin: {pinyin[j]}" 626 | assert pinyin[j][-2] == "r" 627 | tone = pinyin[j][-1] 628 | a = pinyin[j][:-2] 629 | a1, a2 = self.pinyin_dict[a] 630 | result += [a1, a2 + tone, "er5"] 631 | if i + 2 < char_len and chn_char[i + 2] != "#": 632 | result.append("#0") 633 | 634 | i += 2 635 | j += 1 636 | else: 637 | tone = pinyin[j][-1] 638 | a = pinyin[j][:-1] 639 | a1, a2 = self.pinyin_dict[a] # e.g. a="wen" -> a1="^", a2="uen" 640 | result += [a1, a2 + tone] # result = [zh, ong1, ^, uen2] 641 | 642 | if i + 1 < char_len and chn_char[i + 1] != "#": # append a #0 after each character 643 | result.append("#0") 644 | 645 | i += 1 646 | j += 1 647 | # TODO support English alpha 648 | # elif is_alpha(cur_char): 649 | # result += ALPHA_PHONE_DICT[cur_char.upper()] 650 | # if i + 1 < char_len and chn_char[i + 1] not in "#、,。!?:" : # append a #0 after each character 651 | # result.append("#0") 652 | # i += 1 653 | # j += 1 # baker alpha dataset "ABC" in pinyin 654 | elif cur_char == "#": 655 | result.append(chn_char[i : i + 2]) 656 | i += 2 657 | # elif cur_char in "、,。!?:": # insert a pause at punctuation 658 | # result.pop() # drop the preceding #0 659 | # result.append("#3") 660 | # i += 1 661 | else: 662 | # ignore the unknown char 663 | # result.append(chn_char[i]) 664 | i += 1 665 | if result[-1] == "#0": # replace a trailing #0 with sil 666 | result = result[:-1] 667 | if result[-1] != "sil": 668 | result.append("sil") 669 | assert j == len(pinyin) 670 | return result 671 |
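To make the conversion concrete: given a `BakerProcessor` instance `processor` (hypothetical name), a call in the Baker prosody-label format consumed by `create_items` behaves like this (output traced against `PINYIN_DICT` above):

```python
# characters plus their pinyin from the paired label line
processor.get_phoneme_from_char_and_pinyin("中文", ["zhong1", "wen2"])
# -> ['sil', 'zh', 'ong1', '#0', '^', 'uen2', 'sil']
```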
672 | def get_one_sample(self, item): 673 | text, wav_file, utt_id, speaker_name = item 674 | 675 | # normalize audio signal to be [-1, 1]; soundfile already does this. NOTE: used only for dataset preprocessing and requires the librosa/soundfile imports commented out at the top of this file. 676 | audio, rate = sf.read(wav_file) 677 | audio = audio.astype(np.float32) 678 | if rate != self.target_rate: 679 | assert rate > self.target_rate 680 | audio = librosa.resample(audio, rate, self.target_rate) 681 | 682 | # convert text to ids 683 | try: 684 | text_ids = np.asarray(self.text_to_sequence(text), np.int32) 685 | except Exception as e: 686 | print(e, utt_id, text) 687 | return None 688 | 689 | sample = { 690 | "raw_text": text, 691 | "text_ids": text_ids, 692 | "audio": audio, 693 | "utt_id": str(int(utt_id)), 694 | "speaker_name": speaker_name, 695 | "rate": self.target_rate, 696 | } 697 | 698 | return sample 699 | 700 | def get_pinyin_parser(self): 701 | my_pinyin = Pinyin(MyConverter()) 702 | pinyin = my_pinyin.pinyin 703 | return pinyin 704 | 705 | 706 | def text_to_phone(self, text): 707 | """ return string like 'sil c e4 #0 sh iii4 #0 ^ uen2 #0 b en3 sil' """ 708 | text = NSWNormalizer(text.strip()).normalize() 709 | pinyin = self.pinyin_parser(text, style=Style.TONE3, errors="ignore") 710 | new_pinyin = [] 711 | for x in pinyin: 712 | x = "".join(x) 713 | if "#" not in x: 714 | new_pinyin.append(x) 715 | phonemes = self.get_phoneme_from_char_and_pinyin(text, new_pinyin) # phoneme seq: a list of strings like ['sil', 'c', 'e4', '#0', 'sh', 'iii4', '#0', '^', 'uen2', '#0', 'b', 'en3', 'sil'] 716 | phones = " ".join(phonemes) 717 | return text, phones 718 | 719 | def text_to_sequence(self, text, inference=False): 720 | """ string 'sil c e4 #0 sh iii4 #0 ^ uen2 #0 b en3 sil' to list[int], use mapper.json symbol_to_id """ 721 | if inference: 722 | _, phones = self.text_to_phone(text) 723 | else: 724 | phones = text # input is already a phone string like the docstring example 725 | 726 | sequence = [] 727 | for symbol in phones.split(): 728 | idx = self.symbol_to_id[symbol] 729 | sequence.append(idx) 730 | 731 | # add eos token 732 | sequence += [self.eos_id] 733 | return sequence 734 |
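For orientation, a minimal inference-time use of this processor (a sketch: it assumes the packaged mapper at `zhtts/asset/baker_mapper.json` and a working directory at the repo root; `zhtts/tts.py` presumably wires it up the same way):

```python
from zhtts.tensorflow_tts.processor import BakerProcessor

processor = BakerProcessor(
    data_dir=None,  # no dataset is needed at inference time
    loaded_mapper_path="zhtts/asset/baker_mapper.json",
)

normalized, phones = processor.text_to_phone("你好世界")
ids = processor.text_to_sequence("你好世界", inference=True)  # ends with the eos id
```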
-------------------------------------------------------------------------------- /zhtts/tensorflow_tts/processor/base_processor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2020 TensorFlowTTS Team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Base processor for all processors.""" 16 | 17 | import abc 18 | import json 19 | import os 20 | from typing import Dict, List, Union 21 | 22 | from dataclasses import dataclass, field 23 | 24 | 25 | class DataProcessorError(Exception): 26 | pass 27 | 28 | 29 | @dataclass 30 | class BaseProcessor(abc.ABC): 31 | data_dir: str 32 | symbols: List[str] = field(default_factory=list) 33 | speakers_map: Dict[str, int] = field(default_factory=dict) 34 | train_f_name: str = "train.txt" 35 | delimiter: str = "|" 36 | positions = { 37 | "file": 0, 38 | "text": 1, 39 | "speaker_name": 2, 40 | } # positions of file, text, speaker_name after splitting a line 41 | f_extension: str = ".wav" 42 | saved_mapper_path: str = None 43 | loaded_mapper_path: str = None 44 | # extras 45 | items: List[List[str]] = field(default_factory=list) # text, wav_path, speaker_name 46 | symbol_to_id: Dict[str, int] = field(default_factory=dict) 47 | id_to_symbol: Dict[int, str] = field(default_factory=dict) 48 | 49 | def __post_init__(self): 50 | 51 | if self.loaded_mapper_path is not None: 52 | self._load_mapper(loaded_path=self.loaded_mapper_path) 53 | if self.setup_eos_token(): 54 | self.add_symbol( 55 | self.setup_eos_token() 56 | ) # in case this eos token is not yet present in the symbols list. 57 | self.eos_id = self.symbol_to_id[self.setup_eos_token()] 58 | return 59 | 60 | if self.symbols.__len__() < 1: 61 | raise DataProcessorError("Symbols list is empty but mapper isn't loaded") 62 | 63 | self.create_items() 64 | self.create_speaker_map() 65 | self.reverse_speaker = {v: k for k, v in self.speakers_map.items()} 66 | self.create_symbols() 67 | if self.saved_mapper_path is not None: 68 | self._save_mapper(saved_path=self.saved_mapper_path) 69 | 70 | # processor name. useful for AutoProcessor 71 | self._processor_name = type(self).__name__ 72 | 73 | if self.setup_eos_token(): 74 | self.add_symbol( 75 | self.setup_eos_token() 76 | ) # in case this eos token is not yet present in the symbols list. 77 | self.eos_id = self.symbol_to_id[self.setup_eos_token()] 78 | 79 | def __getattr__(self, name: str) -> Union[str, int]: 80 | if "_id" in name: # map symbol to id 81 | return self.symbol_to_id[name.replace("_id", "")] 82 | return self.symbol_to_id[name] # map symbol to value 83 | 84 | def create_speaker_map(self): 85 | """ 86 | Create speaker map for dataset. 87 | """ 88 | sp_id = 0 89 | for i in self.items: 90 | speaker_name = i[-1] 91 | if speaker_name not in self.speakers_map: 92 | self.speakers_map[speaker_name] = sp_id 93 | sp_id += 1 94 | 95 | def get_speaker_id(self, name: str) -> int: 96 | return self.speakers_map[name] 97 | 98 | def get_speaker_name(self, speaker_id: int) -> str: 99 | return {v: k for k, v in self.speakers_map.items()}[speaker_id] # speakers_map is name -> id, so invert it 100 | 101 | def create_symbols(self): 102 | self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)} 103 | self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)} 104 | 105 | def create_items(self): 106 | """ 107 | Create items from the training file. 108 | Item structure: text, wav_file_path, speaker_name. 109 | Note that speaker_name must come last. 110 | """ 111 | with open( 112 | os.path.join(self.data_dir, self.train_f_name), mode="r", encoding="utf-8" 113 | ) as f: 114 | for line in f: 115 | parts = line.strip().split(self.delimiter) 116 | wav_path = os.path.join(self.data_dir, parts[self.positions["file"]]) 117 | wav_path = ( 118 | wav_path + self.f_extension 119 | if wav_path[-len(self.f_extension) :] != self.f_extension 120 | else wav_path 121 | ) 122 | text = parts[self.positions["text"]] 123 | speaker_name = parts[self.positions["speaker_name"]] 124 | self.items.append([text, wav_path, speaker_name]) 125 | 126 | def add_symbol(self, symbol: Union[str, list]): 127 | if isinstance(symbol, str): 128 | if symbol in self.symbol_to_id: 129 | return 130 | self.symbols.append(symbol) 131 | symbol_id = len(self.symbol_to_id) 132 | self.symbol_to_id[symbol] = symbol_id 133 | self.id_to_symbol[symbol_id] = symbol 134 | 135 | elif isinstance(symbol, list): 136 | for i in symbol: 137 | self.add_symbol(i) 138 | else: 139 | raise ValueError("symbol must be a string or a list of strings.") 140 | 141 | @abc.abstractmethod 142 | def get_one_sample(self, item): 143 | """Get one sample from dataset items. 144 | Args: 145 | item: one item in Dataset items. 146 | Dataset items may include (raw_text, speaker_id, wav_path, ...) 147 | 148 | Returns: 149 | sample (dict): dictionary containing all features used later in preprocessing. 150 | """ 151 | sample = { 152 | "raw_text": None, 153 | "text_ids": None, 154 | "audio": None, 155 | "utt_id": None, 156 | "speaker_name": None, 157 | "rate": None, 158 | } 159 | return sample 160 | 161 | @abc.abstractmethod 162 | def text_to_sequence(self, text: str): 163 | return [] 164 | 165 | @abc.abstractmethod 166 | def setup_eos_token(self): 167 | """Return eos symbol of type string.""" 168 | return "eos" 169 | 170 | def convert_symbols_to_ids(self, symbols: Union[str, list]): 171 | sequence = [] 172 | if isinstance(symbols, str): 173 | sequence.append(self.symbol_to_id[symbols]) 174 | return sequence 175 | elif isinstance(symbols, list): 176 | for s in symbols: 177 | if isinstance(s, str): 178 | sequence.append(self.symbol_to_id[s]) 179 | else: 180 | raise ValueError("All elements of symbols must be strings.") 181 | else: 182 | raise ValueError("symbols must be a string or a list of strings.") 183 | 184 | return sequence 185 | 186 | def _load_mapper(self, loaded_path: str = None): 187 | """ 188 | Load all needed mappers from file. 189 | """ 190 | loaded_path = ( 191 | os.path.join(self.data_dir, "mapper.json") 192 | if loaded_path is None 193 | else loaded_path 194 | ) 195 | with open(loaded_path, "r") as f: 196 | data = json.load(f) 197 | self.speakers_map = data["speakers_map"] 198 | self.symbol_to_id = data["symbol_to_id"] 199 | self.id_to_symbol = {int(k): v for k, v in data["id_to_symbol"].items()} 200 | self._processor_name = data["processor_name"] 201 | 202 | # other keys 203 | all_data_keys = data.keys() 204 | for key in all_data_keys: 205 | if key not in ["speakers_map", "symbol_to_id", "id_to_symbol"]: 206 | setattr(self, key, data[key]) 207 |
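For reference, the mapper file that `_load_mapper` reads (and `_save_mapper` below writes) carries four required top-level keys, mirroring the structure of `zhtts/asset/baker_mapper.json` above; a toy mapper with a made-up symbol subset:

```python
import json

toy_mapper = {
    "symbol_to_id": {"pad": 0, "sil": 1, "a1": 2, "eos": 3},
    "id_to_symbol": {"0": "pad", "1": "sil", "2": "a1", "3": "eos"},  # JSON keys are strings
    "speakers_map": {"baker": 0},
    "processor_name": "BakerProcessor",
}
with open("toy_mapper.json", "w") as f:
    json.dump(toy_mapper, f)
# extra keys (e.g. "pinyin_dict") are restored as attributes by _load_mapper
```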
208 | def _save_mapper(self, saved_path: str = None, extra_attrs_to_save: dict = None): 209 | """ 210 | Save all needed mappers to file. 211 | """ 212 | saved_path = ( 213 | os.path.join(self.data_dir, "mapper.json") 214 | if saved_path is None 215 | else saved_path 216 | ) 217 | with open(saved_path, "w") as f: 218 | full_mapper = { 219 | "symbol_to_id": self.symbol_to_id, 220 | "id_to_symbol": self.id_to_symbol, 221 | "speakers_map": self.speakers_map, 222
| "processor_name": self._processor_name, 223 | } 224 | if extra_attrs_to_save: 225 | full_mapper = {**full_mapper, **extra_attrs_to_save} 226 | json.dump(full_mapper, f) 227 | -------------------------------------------------------------------------------- /zhtts/tensorflow_tts/processor/cn_tn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | # Authors: 4 | # 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git) 5 | # 2019.9 Jiayu DU 6 | # 7 | # requirements: 8 | # - python 3.X 9 | # notes: python 2.X WILL fail or produce misleading results 10 | 11 | import sys, os, argparse, codecs, string, re 12 | 13 | # ================================================================================ # 14 | # basic constant 15 | # ================================================================================ # 16 | CHINESE_DIGIS = u'零一二三四五六七八九' 17 | BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖' 18 | BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖' 19 | SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万' 20 | SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬' 21 | LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载' 22 | LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載' 23 | SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万' 24 | SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬' 25 | 26 | ZERO_ALT = u'〇' 27 | ONE_ALT = u'幺' 28 | TWO_ALTS = [u'两', u'兩'] 29 | 30 | POSITIVE = [u'正', u'正'] 31 | NEGATIVE = [u'负', u'負'] 32 | POINT = [u'点', u'點'] 33 | # PLUS = [u'加', u'加'] 34 | # SIL = [u'杠', u'槓'] 35 | 36 | # 中文数字系统类型 37 | NUMBERING_TYPES = ['low', 'mid', 'high'] 38 | 39 | CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \ 40 | '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)' 41 | CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' 42 | COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \ 43 | '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \ 44 | '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \ 45 | '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \ 46 | '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \ 47 | '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)' 48 | 49 | # punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git) 50 | CHINESE_PUNC_STOP = '!?。。' 51 | CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏' 52 | CHINESE_PUNC_OTHER = '·〈〉-' 53 | CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + CHINESE_PUNC_OTHER 54 | 55 | # ================================================================================ # 56 | # basic class 57 | # ================================================================================ # 58 | class ChineseChar(object): 59 | """ 60 | 中文字符 61 | 每个字符对应简体和繁体, 62 | e.g. 简体 = '负', 繁体 = '負' 63 | 转换时可转换为简体或繁体 64 | """ 65 | 66 | def __init__(self, simplified, traditional): 67 | self.simplified = simplified 68 | self.traditional = traditional 69 | #self.__repr__ = self.__str__ 70 | 71 | def __str__(self): 72 | return self.simplified or self.traditional or None 73 | 74 | def __repr__(self): 75 | return self.__str__() 76 | 77 | 78 | class ChineseNumberUnit(ChineseChar): 79 | """ 80 | 中文数字/数位字符 81 | 每个字符除繁简体外还有一个额外的大写字符 82 | e.g. 
78 | class ChineseNumberUnit(ChineseChar):
79 |     """
80 |     Chinese number unit character (digit position).
81 |     Besides the simplified/traditional forms, each character also has an extra 'capital' (banker's) form,
82 |     e.g. '陆' and '陸'.
83 |     """
84 | 
85 |     def __init__(self, power, simplified, traditional, big_s, big_t):
86 |         super(ChineseNumberUnit, self).__init__(simplified, traditional)
87 |         self.power = power
88 |         self.big_s = big_s
89 |         self.big_t = big_t
90 | 
91 |     def __str__(self):
92 |         return '10^{}'.format(self.power)
93 | 
94 |     @classmethod
95 |     def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
96 | 
97 |         if small_unit:
98 |             return ChineseNumberUnit(power=index + 1,
99 |                                      simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1])
100 |         elif numbering_type == NUMBERING_TYPES[0]:
101 |             return ChineseNumberUnit(power=index + 8,
102 |                                      simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
103 |         elif numbering_type == NUMBERING_TYPES[1]:
104 |             return ChineseNumberUnit(power=(index + 2) * 4,
105 |                                      simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
106 |         elif numbering_type == NUMBERING_TYPES[2]:
107 |             return ChineseNumberUnit(power=pow(2, index + 3),
108 |                                      simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
109 |         else:
110 |             raise ValueError(
111 |                 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
112 | 
113 | 
114 | class ChineseNumberDigit(ChineseChar):
115 |     """
116 |     Chinese digit character.
117 |     """
118 | 
119 |     def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
120 |         super(ChineseNumberDigit, self).__init__(simplified, traditional)
121 |         self.value = value
122 |         self.big_s = big_s
123 |         self.big_t = big_t
124 |         self.alt_s = alt_s
125 |         self.alt_t = alt_t
126 | 
127 |     def __str__(self):
128 |         return str(self.value)
129 | 
130 |     @classmethod
131 |     def create(cls, i, v):
132 |         return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
133 | 
134 | 
135 | class ChineseMath(ChineseChar):
136 |     """
137 |     Chinese math symbol character.
138 |     """
139 | 
140 |     def __init__(self, simplified, traditional, symbol, expression=None):
141 |         super(ChineseMath, self).__init__(simplified, traditional)
142 |         self.symbol = symbol
143 |         self.expression = expression
144 |         self.big_s = simplified
145 |         self.big_t = traditional
146 | 
147 | 
148 | CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
149 | 
150 | 
151 | class NumberSystem(object):
152 |     """
153 |     Chinese numeral system.
154 |     """
155 |     pass
156 | 
157 | 
158 | class MathSymbol(object):
159 |     """
160 |     Math symbols (simplified/traditional) used in the Chinese numeral system, e.g.
161 |     positive = ['正', '正']
162 |     negative = ['负', '負']
163 |     point = ['点', '點']
164 |     """
165 | 
166 |     def __init__(self, positive, negative, point):
167 |         self.positive = positive
168 |         self.negative = negative
169 |         self.point = point
170 | 
171 |     def __iter__(self):
172 |         for v in self.__dict__.values():
173 |             yield v
174 | 
175 | 
176 | # class OtherSymbol(object):
177 | #     """
178 | #     other symbols
179 | #     """
180 | #
181 | #     def __init__(self, sil):
182 | #         self.sil = sil
183 | #
184 | #     def __iter__(self):
185 | #         for v in self.__dict__.values():
186 | #             yield v
187 | 
188 | 
189 | # ================================================================================ #
190 | # basic utils
191 | # ================================================================================ #
192 | def create_system(numbering_type=NUMBERING_TYPES[1]):
193 |     """
194 |     Create the numeral system for the given numbering type (default: 'mid').
195 |     NUMBERING_TYPES = ['low', 'mid', 'high']:
196 |         low:  '兆' = '亿' * '十'  = $10^{9}$,  '京' = '兆' * '十', etc.
197 |         mid:  '兆' = '亿' * '万'  = $10^{12}$, '京' = '兆' * '万', etc.
198 |         high: '兆' = '亿' * '亿'  = $10^{16}$, '京' = '兆' * '兆', etc.
199 |     Returns the corresponding numeral system.
200 |     """
201 | 
202 |     # chinese number units of '亿' and larger
203 |     all_larger_units = zip(
204 |         LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
205 |     larger_units = [CNU.create(i, v, numbering_type, False)
206 |                     for i, v in enumerate(all_larger_units)]
207 |     # chinese number units of '十, 百, 千, 万'
208 |     all_smaller_units = zip(
209 |         SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
210 |     smaller_units = [CNU.create(i, v, small_unit=True)
211 |                      for i, v in enumerate(all_smaller_units)]
212 |     # digits
213 |     chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
214 |                         BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
215 |     digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
216 |     digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
217 |     digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
218 |     digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
219 | 
220 |     # symbols
221 |     positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
222 |     negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
223 |     point_cn = CM(POINT[0], POINT[1], '.', lambda x,
224 |                   y: float(str(x) + '.' + str(y)))
225 |     # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
226 |     system = NumberSystem()
227 |     system.units = smaller_units + larger_units
228 |     system.digits = digits
229 |     system.math = MathSymbol(positive_cn, negative_cn, point_cn)
230 |     # system.symbols = OtherSymbol(sil_cn)
231 |     return system
232 | 
233 | 
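As a quick, hedged sanity check of the unit powers created above (this assumes cn_tn.py is importable; the printed values are hand-traced from the `create` logic):

```python
# Unit powers produced by create_system, per numbering type (illustrative).
sys_mid = create_system('mid')
print([(str(u), u.power) for u in sys_mid.units[:6]])
# -> [('10^1', 1), ('10^2', 2), ('10^3', 3), ('10^4', 4), ('10^8', 8), ('10^12', 12)]
#    i.e. 十, 百, 千, 万, then 亿 = 10^8 and 兆 = 10^12 under 'mid'

sys_high = create_system('high')
print([u.power for u in sys_high.units[4:7]])
# -> [8, 16, 32]   # under 'high', each larger unit squares the previous one
```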
234 | def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
235 | 
236 |     def get_symbol(char, system):
237 |         for u in system.units:
238 |             if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
239 |                 return u
240 |         for d in system.digits:
241 |             if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
242 |                 return d
243 |         for m in system.math:
244 |             if char in [m.traditional, m.simplified]:
245 |                 return m
246 | 
247 |     def string2symbols(chinese_string, system):
248 |         int_string, dec_string = chinese_string, ''
249 |         for p in [system.math.point.simplified, system.math.point.traditional]:
250 |             if p in chinese_string:
251 |                 int_string, dec_string = chinese_string.split(p)
252 |                 break
253 |         return [get_symbol(c, system) for c in int_string], \
254 |                [get_symbol(c, system) for c in dec_string]
255 | 
256 |     def correct_symbols(integer_symbols, system):
257 |         """
258 |         e.g. 一百八 to 一百八十
259 |         e.g. 一亿一千三百万 to 一亿 一千万 三百万
260 |         """
261 | 
262 |         if integer_symbols and isinstance(integer_symbols[0], CNU):
263 |             if integer_symbols[0].power == 1:
264 |                 integer_symbols = [system.digits[1]] + integer_symbols
265 | 
266 |         if len(integer_symbols) > 1:
267 |             if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
268 |                 integer_symbols.append(
269 |                     CNU(integer_symbols[-2].power - 1, None, None, None, None))
270 | 
271 |         result = []
272 |         unit_count = 0
273 |         for s in integer_symbols:
274 |             if isinstance(s, CND):
275 |                 result.append(s)
276 |                 unit_count = 0
277 |             elif isinstance(s, CNU):
278 |                 current_unit = CNU(s.power, None, None, None, None)
279 |                 unit_count += 1
280 | 
281 |             if unit_count == 1:
282 |                 result.append(current_unit)
283 |             elif unit_count > 1:
284 |                 for i in range(len(result)):
285 |                     if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
286 |                         result[-i - 1] = CNU(result[-i - 1].power +
287 |                                              current_unit.power, None, None, None, None)
288 |         return result
289 | 
290 |     def compute_value(integer_symbols):
291 |         """
292 |         Compute the value.
293 |         When the current unit is larger than the previous unit, the current unit multiplies all previous units.
294 |         e.g. '两千万' = 2000 * 10000, not 2000 + 10000
295 |         """
296 |         value = [0]
297 |         last_power = 0
298 |         for s in integer_symbols:
299 |             if isinstance(s, CND):
300 |                 value[-1] = s.value
301 |             elif isinstance(s, CNU):
302 |                 value[-1] *= pow(10, s.power)
303 |                 if s.power > last_power:
304 |                     value[:-1] = list(map(lambda v: v *
305 |                                           pow(10, s.power), value[:-1]))
306 |                 last_power = s.power
307 |                 value.append(0)
308 |         return sum(value)
309 | 
310 |     system = create_system(numbering_type)
311 |     int_part, dec_part = string2symbols(chinese_string, system)
312 |     int_part = correct_symbols(int_part, system)
313 |     int_str = str(compute_value(int_part))
314 |     dec_str = ''.join([str(d.value) for d in dec_part])
315 |     if dec_part:
316 |         return '{0}.{1}'.format(int_str, dec_str)
317 |     else:
318 |         return int_str
319 | 
320 | 
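A few worked examples of `chn2num`; the outputs are hand-traced from the logic above rather than taken from the repo's tests:

```python
print(chn2num('一百二十三'))  # -> '123'
print(chn2num('一百八'))      # -> '180'       (correct_symbols expands the trailing digit)
print(chn2num('两千万'))      # -> '20000000'  ('两' is matched via the alt form of 2)
print(chn2num('三点一四'))    # -> '3.14'      (integer and decimal parts split at '点')
```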
321 | def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
322 |             traditional=False, alt_zero=False, alt_one=False, alt_two=True,
323 |             use_zeros=True, use_units=True):
324 | 
325 |     def get_value(value_string, use_zeros=True):
326 | 
327 |         striped_string = value_string.lstrip('0')
328 | 
329 |         # record nothing if all zeros
330 |         if not striped_string:
331 |             return []
332 | 
333 |         # record one digit
334 |         elif len(striped_string) == 1:
335 |             if use_zeros and len(value_string) != len(striped_string):
336 |                 return [system.digits[0], system.digits[int(striped_string)]]
337 |             else:
338 |                 return [system.digits[int(striped_string)]]
339 | 
340 |         # recursively record multiple digits
341 |         else:
342 |             result_unit = next(u for u in reversed(
343 |                 system.units) if u.power < len(striped_string))
344 |             result_string = value_string[:-result_unit.power]
345 |             return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:])
346 | 
347 |     system = create_system(numbering_type)
348 | 
349 |     int_dec = number_string.split('.')
350 |     if len(int_dec) == 1:
351 |         int_string = int_dec[0]
352 |         dec_string = ""
353 |     elif len(int_dec) == 2:
354 |         int_string = int_dec[0]
355 |         dec_string = int_dec[1]
356 |     else:
357 |         raise ValueError(
358 |             "invalid input num string with more than one dot: {}".format(number_string))
359 | 
360 |     if use_units and len(int_string) > 1:
361 |         result_symbols = get_value(int_string)
362 |     else:
363 |         result_symbols = [system.digits[int(c)] for c in int_string]
364 |     dec_symbols = [system.digits[int(c)] for c in dec_string]
365 |     if dec_string:
366 |         result_symbols += [system.math.point] + dec_symbols
367 | 
368 |     if alt_two:
369 |         liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t,
370 |                     system.digits[2].big_s, system.digits[2].big_t)
371 |         for i, v in enumerate(result_symbols):
372 |             if isinstance(v, CND) and v.value == 2:
373 |                 next_symbol = result_symbols[i +
374 |                                              1] if i < len(result_symbols) - 1 else None
375 |                 previous_symbol = result_symbols[i - 1] if i > 0 else None
376 |                 if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
377 |                     if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
378 |                         result_symbols[i] = liang
379 | 
380 |     # if big is True, '两' will not be used and `alt_two` has no effect on the output
381 |     if big:
382 |         attr_name = 'big_'
383 |         if traditional:
384 |             attr_name += 't'
385 |         else:
386 |             attr_name += 's'
387 |     else:
388 |         if traditional:
389 |             attr_name = 'traditional'
390 |         else:
391 |             attr_name = 'simplified'
392 | 
393 |     result = ''.join([getattr(s, attr_name) for s in result_symbols])
394 | 
395 |     # if not use_zeros:
396 |     #     result = result.strip(getattr(system.digits[0], attr_name))
397 | 
398 |     if alt_zero:
399 |         result = result.replace(
400 |             getattr(system.digits[0], attr_name), system.digits[0].alt_s)
401 | 
402 |     if alt_one:
403 |         result = result.replace(
404 |             getattr(system.digits[1], attr_name), system.digits[1].alt_s)
405 | 
406 |     for i, p in enumerate(POINT):
407 |         if result.startswith(p):
408 |             return CHINESE_DIGIS[0] + result
409 | 
410 |     # ^10, 11, .., 19
411 |     if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
412 |                                           SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
413 |             result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
414 |         result = result[1:]
415 | 
416 |     return result
417 | 
418 | 
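And the reverse direction, `num2chn`; outputs again hand-traced from the code above, not authoritative test values:

```python
print(num2chn('123'))                  # -> '一百二十三'
print(num2chn('14'))                   # -> '十四'      (the leading '一十' is trimmed for 10-19)
print(num2chn('205'))                  # -> '两百零五'  (alt_two=True swaps '二' before units >= 百)
print(num2chn('2000', alt_two=False))  # -> '二千'
print(num2chn('3.14'))                 # -> '三点一四'
```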
419 | # ================================================================================ #
420 | # different types of rewriters
421 | # ================================================================================ #
422 | class Cardinal:
423 |     """
424 |     CARDINAL class
425 |     """
426 | 
427 |     def __init__(self, cardinal=None, chntext=None):
428 |         self.cardinal = cardinal
429 |         self.chntext = chntext
430 | 
431 |     def chntext2cardinal(self):
432 |         return chn2num(self.chntext)
433 | 
434 |     def cardinal2chntext(self):
435 |         return num2chn(self.cardinal)
436 | 
437 | class Digit:
438 |     """
439 |     DIGIT class
440 |     """
441 | 
442 |     def __init__(self, digit=None, chntext=None):
443 |         self.digit = digit
444 |         self.chntext = chntext
445 | 
446 |     # def chntext2digit(self):
447 |     #     return chn2num(self.chntext)
448 | 
449 |     def digit2chntext(self):
450 |         return num2chn(self.digit, alt_two=False, use_units=False)
451 | 
452 | 
453 | class TelePhone:
454 |     """
455 |     TELEPHONE class
456 |     """
457 | 
458 |     def __init__(self, telephone=None, raw_chntext=None, chntext=None):
459 |         self.telephone = telephone
460 |         self.raw_chntext = raw_chntext
461 |         self.chntext = chntext
462 | 
463 |     # def chntext2telephone(self):
464 |     #     sil_parts = self.raw_chntext.split('<SIL>')
465 |     #     self.telephone = '-'.join([
466 |     #         str(chn2num(p)) for p in sil_parts
467 |     #     ])
468 |     #     return self.telephone
469 | 
470 |     def telephone2chntext(self, fixed=False):
471 | 
472 |         if fixed:
473 |             sil_parts = self.telephone.split('-')
474 |             self.raw_chntext = '<SIL>'.join([
475 |                 num2chn(part, alt_two=False, use_units=False) for part in sil_parts
476 |             ])
477 |             self.chntext = self.raw_chntext.replace('<SIL>', '')
478 |         else:
479 |             sp_parts = self.telephone.strip('+').split()
480 |             self.raw_chntext = '<SP>'.join([
481 |                 num2chn(part, alt_two=False, use_units=False) for part in sp_parts
482 |             ])
483 |             self.chntext = self.raw_chntext.replace('<SP>', '')
484 |         return self.chntext
485 | 
486 | 
487 | class Fraction:
488 |     """
489 |     FRACTION class
490 |     """
491 | 
492 |     def __init__(self, fraction=None, chntext=None):
493 |         self.fraction = fraction
494 |         self.chntext = chntext
495 | 
496 |     def chntext2fraction(self):
497 |         denominator, numerator = self.chntext.split('分之')
498 |         return chn2num(numerator) + '/' + chn2num(denominator)
499 | 
500 |     def fraction2chntext(self):
501 |         numerator, denominator = self.fraction.split('/')
502 |         return num2chn(denominator) + '分之' + num2chn(numerator)
503 | 
504 | 
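The rewriter classes are thin wrappers over `chn2num`/`num2chn`; a few hand-traced examples (treat the expected outputs as a sketch, not a test suite):

```python
print(Cardinal(cardinal='12345').cardinal2chntext())  # -> '一万两千三百四十五'
print(Digit(digit='2021').digit2chntext())            # -> '二零二一' (digit by digit, no units)
print(Fraction(fraction='2/3').fraction2chntext())    # -> '三分之二' (denominator read first)
print(TelePhone(telephone='0595-23865596').telephone2chntext(fixed=True))
# -> '零五九五二三八六五五九六' (each dash-separated part read digit by digit)
```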
505 | class Date:
506 |     """
507 |     DATE class
508 |     """
509 | 
510 |     def __init__(self, date=None, chntext=None):
511 |         self.date = date
512 |         self.chntext = chntext
513 | 
514 |     # def chntext2date(self):
515 |     #     chntext = self.chntext
516 |     #     try:
517 |     #         year, other = chntext.strip().split('年', maxsplit=1)
518 |     #         year = Digit(chntext=year).digit2chntext() + '年'
519 |     #     except ValueError:
520 |     #         other = chntext
521 |     #         year = ''
522 |     #     if other:
523 |     #         try:
524 |     #             month, day = other.strip().split('月', maxsplit=1)
525 |     #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
526 |     #         except ValueError:
527 |     #             day = chntext
528 |     #             month = ''
529 |     #         if day:
530 |     #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
531 |     #     else:
532 |     #         month = ''
533 |     #         day = ''
534 |     #     date = year + month + day
535 |     #     self.date = date
536 |     #     return self.date
537 | 
538 |     def date2chntext(self):
539 |         date = self.date
540 |         try:
541 |             year, other = date.strip().split('年', 1)
542 |             year = Digit(digit=year).digit2chntext() + '年'
543 |         except ValueError:
544 |             other = date
545 |             year = ''
546 |         if other:
547 |             try:
548 |                 month, day = other.strip().split('月', 1)
549 |                 month = Cardinal(cardinal=month).cardinal2chntext() + '月'
550 |             except ValueError:
551 |                 day = date
552 |                 month = ''
553 |             if day:
554 |                 day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
555 |         else:
556 |             month = ''
557 |             day = ''
558 |         chntext = year + month + day
559 |         self.chntext = chntext
560 |         return self.chntext
561 | 
562 | 
563 | class Money:
564 |     """
565 |     MONEY class
566 |     """
567 | 
568 |     def __init__(self, money=None, chntext=None):
569 |         self.money = money
570 |         self.chntext = chntext
571 | 
572 |     # def chntext2money(self):
573 |     #     return self.money
574 | 
575 |     def money2chntext(self):
576 |         money = self.money
577 |         pattern = re.compile(r'(\d+(\.\d+)?)')
578 |         matchers = pattern.findall(money)
579 |         if matchers:
580 |             for matcher in matchers:
581 |                 money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
582 |         self.chntext = money
583 |         return self.chntext
584 | 
585 | 
586 | class Percentage:
587 |     """
588 |     PERCENTAGE class
589 |     """
590 | 
591 |     def __init__(self, percentage=None, chntext=None):
592 |         self.percentage = percentage
593 |         self.chntext = chntext
594 | 
595 |     def chntext2percentage(self):
596 |         return chn2num(self.chntext.strip().strip('百分之')) + '%'
597 | 
598 |     def percentage2chntext(self):
599 |         return '百分之' + num2chn(self.percentage.strip().strip('%'))
600 | 
601 | 
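Likewise for dates, money, and percentages; the expected outputs below are hand-traced from the classes above:

```python
print(Date(date='2020年3月15日').date2chntext())             # -> '二零二零年三月十五日'
print(Money(money='12.5元').money2chntext())                 # -> '十二点五元'
print(Percentage(percentage='80.03%').percentage2chntext())  # -> '百分之八十点零三'
```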
602 | # ================================================================================ #
603 | # NSW Normalizer
604 | # ================================================================================ #
605 | class NSWNormalizer:
606 |     def __init__(self, raw_text):
607 |         self.raw_text = '^' + raw_text + '$'
608 |         self.norm_text = ''
609 | 
610 |     def _particular(self):
611 |         text = self.norm_text
612 |         pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
613 |         matchers = pattern.findall(text)
614 |         if matchers:
615 |             # print('particular')
616 |             for matcher in matchers:
617 |                 text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)
618 |         self.norm_text = text
619 |         return self.norm_text
620 | 
621 |     def normalize(self):
622 |         text = self.raw_text
623 | 
624 |         # normalize dates
625 |         pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
626 |         matchers = pattern.findall(text)
627 |         if matchers:
628 |             # print('date')
629 |             for matcher in matchers:
630 |                 text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
631 | 
632 |         # normalize money
633 |         pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
634 |         matchers = pattern.findall(text)
635 |         if matchers:
636 |             # print('money')
637 |             for matcher in matchers:
638 |                 text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
639 | 
640 |         # normalize landline / mobile phone numbers
641 |         # mobile
642 |         # http://www.jihaoba.com/news/show/13680
643 |         # China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
644 |         # China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
645 |         # China Telecom: 133, 153, 189, 180, 181, 177
646 |         pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
647 |         matchers = pattern.findall(text)
648 |         if matchers:
649 |             # print('telephone')
650 |             for matcher in matchers:
651 |                 text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
652 |         # landline
653 |         pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
654 |         matchers = pattern.findall(text)
655 |         if matchers:
656 |             # print('fixed telephone')
657 |             for matcher in matchers:
658 |                 text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
659 | 
660 |         # normalize fractions
661 |         pattern = re.compile(r"(\d+/\d+)")
662 |         matchers = pattern.findall(text)
663 |         if matchers:
664 |             # print('fraction')
665 |             for matcher in matchers:
666 |                 text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
667 | 
668 |         # normalize percentages
669 |         text = text.replace('%', '%')
670 |         pattern = re.compile(r"(\d+(\.\d+)?%)")
671 |         matchers = pattern.findall(text)
672 |         if matchers:
673 |             # print('percentage')
674 |             for matcher in matchers:
675 |                 text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
676 | 
677 |         # normalize cardinal + quantifier
678 |         pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
679 |         matchers = pattern.findall(text)
680 |         if matchers:
681 |             # print('cardinal+quantifier')
682 |             for matcher in matchers:
683 |                 text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
684 | 
685 |         # normalize digit sequences (e.g. serial numbers)
686 |         pattern = re.compile(r"(\d{4,32})")
687 |         matchers = pattern.findall(text)
688 |         if matchers:
689 |             # print('digit')
690 |             for matcher in matchers:
691 |                 text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
692 | 
693 |         # normalize plain cardinals
694 |         pattern = re.compile(r"(\d+(\.\d+)?)")
695 |         matchers = pattern.findall(text)
696 |         if matchers:
697 |             # print('cardinal')
698 |             for matcher in matchers:
699 |                 text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
700 | 
701 |         self.norm_text = text
702 |         self._particular()
703 | 
704 |         return self.norm_text.lstrip('^').rstrip('$')
705 | 
706 | 
707 | def nsw_test_case(raw_text):
708 |     print('I:' + raw_text)
709 |     print('O:' + NSWNormalizer(raw_text).normalize())
710 |     print('')
711 | 
712 | 
713 | def nsw_test():
714 |     nsw_test_case('固话:0595-23865596或23880880。')
715 |     nsw_test_case('固话:0595-23865596或23880880。')
716 |     nsw_test_case('手机:+86 19859213959或15659451527。')
717 |     nsw_test_case('分数:32477/76391。')
718 |     nsw_test_case('百分数:80.03%。')
719 |     nsw_test_case('编号:31520181154418。')
720 |     nsw_test_case('纯数:2983.07克或12345.60米。')
721 |     nsw_test_case('日期:1999年2月20日或09年3月15号。')
722 |     nsw_test_case('金钱:12块5,34.5元,20.1万')
723 |     nsw_test_case('特殊:O2O或B2C。')
724 |     nsw_test_case('3456万吨')
725 |     nsw_test_case('2938个')
726 |     nsw_test_case('938')
727 |     nsw_test_case('今天吃了115个小笼包231个馒头')
728 |     nsw_test_case('有62%的概率')
729 | 
730 | 
731 | if __name__ == '__main__':
732 |     # nsw_test()
733 | 
734 |     p = argparse.ArgumentParser()
735 |     p.add_argument('ifile', help='input filename, assumes utf-8 encoding')
736 |     p.add_argument('ofile', help='output filename')
737 |     p.add_argument('--to_upper', action='store_true', help='convert to upper case')
738 |     p.add_argument('--to_lower', action='store_true', help='convert to lower case')
739 |     p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
740 |     p.add_argument('--log_interval', type=int, default=100000, help='log interval in number of processed lines')
741 |     args = p.parse_args()
742 | 
743 |     ifile = codecs.open(args.ifile, 'r', 'utf8')
744 |     ofile = codecs.open(args.ofile, 'w+', 'utf8')
745 | 
746 |     n = 0
747 |     for l in ifile:
748 |         key = ''
749 |         text = ''
750 |         if args.has_key:
751 |             cols = l.split(maxsplit=1)
752 |             key = cols[0]
753 |             if len(cols) == 2:
754 |                 text = cols[1].strip()
755 |             else:
756 |                 text = ''
757 |         else:
758 |             text = l.strip()
759 | 
760 |         # cases
761 |         if args.to_upper and args.to_lower:
762 |             sys.stderr.write('cn_tn.py: to_upper OR to_lower?')
763 |             exit(1)
764 |         if args.to_upper:
765 |             text = text.upper()
766 |         if args.to_lower:
767 |             text = text.lower()
768 | 
769 |         # NSW (Non-Standard-Word) normalization
770 |         text = NSWNormalizer(text).normalize()
771 | 
772 |         # punctuation removal
773 |         old_chars = CHINESE_PUNC_LIST + string.punctuation  # includes all CN and EN punctuations
774 |         new_chars = ' ' * len(old_chars)
775 |         del_chars = ''
776 |         text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
777 | 
778 |         #
779 |         if args.has_key:
780 |             ofile.write(key + '\t' + text + '\n')
781 |         else:
782 |             if text.strip() != '':  # skip empty lines in pure text format (without Kaldi's utt key)
783 |                 ofile.write(text + '\n')
784 | 
785 |         n += 1
786 |         if n % args.log_interval == 0:
787 |             sys.stderr.write("cn_tn.py: {} lines done.\n".format(n))
788 |             sys.stderr.flush()
789 | 
790 |     sys.stderr.write("cn_tn.py: {} lines done in total.\n".format(n))
791 |     sys.stderr.flush()
792 | 
793 |     ifile.close()
794 |     ofile.close()
795 | 
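Putting it together, `normalize()` applies the rewriters in a fixed order (dates, money, phone numbers, fractions, percentages, number + quantifier, digit strings, plain cardinals) and finally `_particular()` restores letter-2-letter tokens. A hedged end-to-end sketch, with outputs hand-traced from that pipeline:

```python
print(NSWNormalizer('今天吃了5个苹果').normalize())  # -> '今天吃了五个苹果'
print(NSWNormalizer('有62%的概率').normalize())      # -> '有百分之六十二的概率'
print(NSWNormalizer('O2O或B2C。').normalize())       # -> 'O2O或B2C。' (restored by _particular)
```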
-------------------------------------------------------------------------------- /zhtts/tts.py: --------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from pathlib import Path
4 | import tensorflow as tf
5 | #import tflite_runtime.interpreter as tflite
6 | from scipy.io import wavfile
7 | import re
8 | 
9 | from .tensorflow_tts.processor import BakerProcessor
10 | 
11 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
12 | ASSET_DIR = Path(__file__).parent / "asset"
13 | 
14 | def split_sens(text):
15 |     """ Split text into sentences, keeping each separator attached to the text on its left.
16 | 
17 |     Args:
18 |         text (str):
19 | 
20 |     Returns:
21 |         list[str]: split sentences
22 | 
23 |     Examples:
24 |         >>> split_sens("中文:语音,合成!系统\n")
25 |         ['中文:语音,', '合成!', '系统']
26 |     """
27 |     texts = re.split(r";", re.sub(r"([、,。!?])", r"\1;", text.strip()))
28 |     return [x for x in texts if x]
29 | 
30 | class TTS():
31 |     def __init__(self, text2mel_name="FASTSPEECH2"):
32 |         """text2mel_name: ["FASTSPEECH2", "TACOTRON"] """
33 |         self.sample_rate = 24000
34 |         self.processor = BakerProcessor(
35 |             data_dir=None, loaded_mapper_path=ASSET_DIR / "baker_mapper.json")
36 |         self.text2mel_name = text2mel_name
37 |         if text2mel_name == "FASTSPEECH2":
38 |             self.acoustic = tf.lite.Interpreter(model_path=str(ASSET_DIR / 'fastspeech2_quan.tflite'))
39 |         elif text2mel_name == "TACOTRON":
40 |             self.acoustic = tf.lite.Interpreter(model_path=str(ASSET_DIR / 'tacotron2_quan.tflite'))
41 |         else:
42 |             raise ValueError(f"unsupported text2mel_name: {text2mel_name}")
43 |         self.vocoder = tf.lite.Interpreter(model_path=str(ASSET_DIR / 'mb_melgan.tflite'))
44 | 
{text2mel_name}") 43 | self.vocoder = tf.lite.Interpreter(model_path=str(ASSET_DIR / 'mb_melgan.tflite')) 44 | 45 | def prepare_input(self, input_ids): 46 | input_ids = np.expand_dims(np.array(input_ids, np.int32), 0) 47 | if self.text2mel_name == "TACOTRON": 48 | return (input_ids, 49 | np.array([input_ids.shape[1]], np.int32), 50 | np.array([0], np.int32),) 51 | elif self.text2mel_name == "FASTSPEECH2": 52 | return (input_ids, 53 | np.array([0], np.int32), 54 | np.array([1.0], np.float32), 55 | np.array([1.0], np.float32), 56 | np.array([1.0], np.float32),) 57 | 58 | def text2mel(self, input_text): 59 | input_details = self.acoustic.get_input_details() 60 | output_details = self.acoustic.get_output_details() 61 | input_ids = self.processor.text_to_sequence(input_text, inference=True) 62 | 63 | self.acoustic.resize_tensor_input( 64 | input_details[0]['index'], [1, len(input_ids)]) 65 | self.acoustic.allocate_tensors() 66 | 67 | input_data = self.prepare_input(input_ids) 68 | for i, detail in enumerate(input_details): 69 | self.acoustic.set_tensor(detail['index'], input_data[i]) 70 | self.acoustic.invoke() 71 | 72 | return self.acoustic.get_tensor(output_details[1]['index']) 73 | 74 | def mel2audio(self, mel): 75 | input_details = self.vocoder.get_input_details() 76 | output_details = self.vocoder.get_output_details() 77 | self.vocoder.resize_tensor_input(input_details[0]['index'], mel.shape) 78 | self.vocoder.allocate_tensors() 79 | self.vocoder.set_tensor(input_details[0]['index'], mel) 80 | self.vocoder.invoke() 81 | 82 | return self.vocoder.get_tensor(output_details[0]['index'])[0, :, 0] 83 | 84 | def synthesis(self, text, sil_time=0.2): 85 | """ synthesis text to audio 86 | 87 | Args: 88 | text (str) 89 | sil_time (float): silence duration between two wav 90 | Returns: 91 | ndarray: audio 92 | """ 93 | audios = [] 94 | texts = split_sens(text) 95 | silence = np.zeros(int(sil_time * self.sample_rate), dtype=np.float32) # 添加静音 96 | for i, text in enumerate(texts): 97 | print(f"index: {i}, text: {text}") 98 | print(f"frontend info: {self.frontend(text)}") 99 | # print(self.processor.text_to_sequence(text, inference=True)) 100 | mel = self.text2mel(text) 101 | audio = self.mel2audio(mel) 102 | if self.text2mel_name == "TACOTRON": 103 | audio = audio[:-2048] # tacotron will generate noise at the end 104 | audios.append(audio) 105 | if i < len(texts)-1: 106 | audios.append(silence) 107 | return np.concatenate(audios) 108 | 109 | def frontend(self, text): 110 | """ return normalize_text, phoneme_seq for debug 111 | 112 | Args: 113 | text (str) 114 | Returns: 115 | (tuple): tuple containing: 116 | 117 | normalize_text (str): text after text_normalize 118 | phoneme (str): " ".join(phones) 119 | """ 120 | return self.processor.text_to_phone(text) 121 | 122 | def text2wav(self, text, wavpath): 123 | """synthesis text and save to wavfile""" 124 | audio = self.synthesis(text) 125 | 126 | wavfile.write(wavpath, self.sample_rate, audio) 127 | print(f"Save wav to {wavpath}") 128 | 129 | --------------------------------------------------------------------------------