├── Dockerfile ├── LICENSE ├── README.md ├── docker-compose.yml ├── ref_audios ├── default_zh.txt ├── default_zh.wav ├── leijun.txt ├── leijun.wav ├── wukong.txt └── wukong.wav ├── requirements.txt ├── tests ├── long_input.txt ├── test.sh └── test_cosyvoice.sh ├── tts_frontend.py └── tts_server.py /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /app 3 | COPY requirements.txt tts_server.py tts_frontend.py ref_audios/ . 4 | RUN pip install --no-cache-dir -r requirements.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Triton-OpenAI-Speech 2 | OpenAI-Compatible Frontend for Triton Inference ASR/TTS Server 3 | 4 | ### Quick Start 5 | Before starting, launch one of the supported ASR/TTS services using Docker Compose. 6 | | Model Repo | Supported | 7 | | --- | -- | 8 | | [Spark-TTS](https://github.com/SparkAudio/Spark-TTS/tree/main/runtime/triton_trtllm) | Yes | 9 | |[F5-TTS](https://github.com/SWivid/F5-TTS/tree/main/src/f5_tts/runtime/triton_trtllm)| Yes | 10 | |[Cosyvoice2](https://github.com/FunAudioLLM/CosyVoice/tree/main/runtime/triton_trtllm)| Yes | 11 | 12 | Then, launch the OpenAI-compatible API bridge server. 
13 | ```sh 14 | docker compose up 15 | ``` 16 | 17 | ### Simple Test 18 | ```sh 19 | bash tests/test.sh 20 | ``` 21 | ### Usage 22 | 23 | ``` 24 | tts_server.py [-h] [--host HOST] [--port PORT] [--url URL] 25 | [--ref_audios_dir REF_AUDIOS_DIR] 26 | [--default_sample_rate DEFAULT_SAMPLE_RATE] 27 | 28 | options: 29 | -h, --help show this help message and exit 30 | --host HOST Host to bind the server to 31 | --port PORT Port to bind the server to 32 | --url URL Triton server URL 33 | --ref_audios_dir REF_AUDIOS_DIR 34 | Path to reference audio files 35 | --default_sample_rate DEFAULT_SAMPLE_RATE 36 | Default sample rate 37 | ``` -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | api-server: 3 | build: . 4 | container_name: openai_api_server 5 | ports: 6 | - "8080:8080" 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" 9 | command: > 10 | /bin/bash -c "python tts_server.py --url http://host.docker.internal:8000 --ref_audios_dir ./" 11 | 12 | -------------------------------------------------------------------------------- /ref_audios/default_zh.txt: -------------------------------------------------------------------------------- 1 | 吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。 -------------------------------------------------------------------------------- /ref_audios/default_zh.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuekaizhang/Triton-OpenAI-Speech/e4f5743a137c3fd0ba703f9b6b0791778d060437/ref_audios/default_zh.wav -------------------------------------------------------------------------------- /ref_audios/leijun.txt: -------------------------------------------------------------------------------- 1 | 大家好!今天给大家带来一款重磅产品,性能提升了80%,但是价格只要友商的一半。 
-------------------------------------------------------------------------------- /ref_audios/leijun.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuekaizhang/Triton-OpenAI-Speech/e4f5743a137c3fd0ba703f9b6b0791778d060437/ref_audios/leijun.wav -------------------------------------------------------------------------------- /ref_audios/wukong.txt: -------------------------------------------------------------------------------- 1 | 俺老孙的金箍棒,打遍天下无敌手!什么妖魔鬼怪,统统都不在话下。 -------------------------------------------------------------------------------- /ref_audios/wukong.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuekaizhang/Triton-OpenAI-Speech/e4f5743a137c3fd0ba703f9b6b0791778d060437/ref_audios/wukong.wav -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | soundfile 3 | numpy 4 | fastapi 5 | uvicorn[standard] 6 | pydantic 7 | python-multipart # Needed by FastAPI for form data, good to include 8 | WeTextProcessing 9 | inflect -------------------------------------------------------------------------------- /tests/long_input.txt: -------------------------------------------------------------------------------- 1 | 南京的美食实在太多了!我可以根据不同的类别推荐一些: 2 | 3 | 1. 经典的南京小吃: 4 | 5 | 鸭血粉丝汤: 这是南京最具代表性的美食,汤鲜味美,鸭血嫩滑,粉丝爽口。推荐:尹氏鸭血粉丝汤、刘鸣记鸭血粉丝汤。 6 | 小笼包: 南京的小笼包皮薄馅大,汤汁鲜美。推荐:南京大牌档、回味鸭血粉丝汤(也有小笼包)。 7 | 牛肉锅贴: 外皮酥脆,内馅鲜嫩多汁。推荐:李记牛肉锅贴、韩氏锅贴。 8 | 梅花糕: 软糯香甜,带有淡淡的梅花香气。推荐:连香斋。 9 | 赤豆元宵: 南京元宵的特色是用赤豆做馅料,香甜软糯。推荐:宏fie元宵。 10 | 盐水鸭: 南京盐水鸭皮脆肉嫩,味道鲜美。推荐:南京大牌档、尹氏盐水鸭。 11 | 臭豆腐: 南京的臭豆腐炸得金黄酥脆,外脆内嫩,配上蒜蓉辣椒酱,味道独特。推荐:夫子庙附近的臭豆腐摊。 12 | 2. 本地特色菜: 13 | 14 | 板鸭: 与盐水鸭不同,板鸭是经过腌制和风干的,风味独特。 15 | 清炖狮子头: 肥而不腻,入口即化。 16 | 松鼠桂鱼: 外形美观,酸甜可口。 17 | 金陵烤鸭: 南京烤鸭以色泽红润,皮脆肉嫩为特点。 18 | 啤酒鸭: 用啤酒炖制的鸭肉,鲜香入味。 19 | 3. 
夫子庙小吃街: 20 | 21 | 夫子庙小吃街汇集了各种南京小吃,可以一次性品尝到很多美食,但通常价格会稍高,而且人流量大。 22 | 4. 餐厅推荐: 23 | 24 | 南京大牌档: 一家综合性的餐厅,可以品尝到各种南京特色菜和小吃。 25 | 尹氏鸭血粉丝汤: 老字号,鸭血粉丝汤味道正宗。 26 | 刘鸣记鸭血粉丝汤: 也是一家知名的鸭血粉丝汤店。 27 | 秋林里: 比较有情调的本地菜餐厅。 28 | 5. 一些其他推荐: 29 | 30 | **甘肃刀削面:**虽然不是南京本地的,但很多南京人喜欢吃。 31 | 老门东的小吃: 老门东是南京的文化街区,有很多小吃店,可以尝试一些当地特色小吃。 32 | 温馨提示: 33 | 34 | 南京的美食很多集中在夫子庙、新街口、老门东等区域。 35 | 可以根据自己的口味和喜好选择不同的美食。 36 | 在夫子庙小吃街等热门地点,要注意保管好自己的财物。 37 | 一些老字号的餐厅可能需要排队。 38 | 希望这些推荐能帮助你更好地了解南京的美食! 你如果对某一种美食或者某个区域感兴趣,我可以提供更详细的信息。 祝你旅途愉快,吃得开心! -------------------------------------------------------------------------------- /tests/test.sh: -------------------------------------------------------------------------------- 1 | # OPENAI_API_KEY=sk- 2 | # OPENAI_API_BASE="https://aihubmix.com/v1" 3 | # curl $OPENAI_API_BASE/audio/speech \ 4 | # -H "Content-Type: application/json" \ 5 | # -d '{ 6 | # "model": "tts-1", 7 | # "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 8 | # "voice": "coral" 9 | # }' \ 10 | # --output output.wav 11 | 12 | OPENAI_API_BASE="http://localhost:8080" 13 | 14 | curl $OPENAI_API_BASE/audio/speech \ 15 | -H "Content-Type: application/json" \ 16 | -d '{ 17 | "model": "spark_tts", 18 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 19 | "voice": "default_zh", 20 | "response_format": "wav" 21 | }' \ 22 | --output output.wav 23 | 24 | curl $OPENAI_API_BASE/audio/speech \ 25 | -H "Content-Type: application/json" \ 26 | -d '{ 27 | "model": "spark_tts", 28 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 29 | "voice": "wukong", 30 | "response_format": "wav" 31 | }' \ 32 | --output output2.wav 33 | 34 | curl $OPENAI_API_BASE/audio/speech \ 35 | -H "Content-Type: application/json" \ 36 | -d '{ 37 | "model": "spark_tts", 38 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 39 | "voice": "leijun", 40 | "response_format": "wav" 41 | }' \ 42 | --output output3.wav 43 | 44 | # output3 from pcm 45 | curl $OPENAI_API_BASE/audio/speech \ 46 | -H "Content-Type: application/json" \ 47 | -d '{ 48 | "model": "spark_tts", 49 | 
"input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 50 | "voice": "leijun", 51 | "response_format": "pcm" 52 | }' | \ 53 | sox -t raw -r 16000 -e signed-integer -b 16 -c 1 - output3_from_pcm.wav 54 | 55 | # load input from long_input.txt 56 | input=$(cat long_input.txt) 57 | # Construct JSON payload using jq 58 | json_payload=$(jq -n --arg input_text "$input" '{model: "spark_tts", input: $input_text, voice: "default_zh", response_format: "wav"}') 59 | 60 | curl $OPENAI_API_BASE/audio/speech \ 61 | -H "Content-Type: application/json" \ 62 | -d "$json_payload" \ 63 | --output output4.wav 64 | 65 | json_payload=$(jq -n --arg input_text "$input" '{model: "spark_tts", input: $input_text, voice: "default_zh", response_format: "pcm"}') 66 | curl $OPENAI_API_BASE/audio/speech \ 67 | -H "Content-Type: application/json" \ 68 | -d "$json_payload" | \ 69 | sox -t raw -r 16000 -e signed-integer -b 16 -c 1 - output4_from_pcm.wav -------------------------------------------------------------------------------- /tests/test_cosyvoice.sh: -------------------------------------------------------------------------------- 1 | # python3 tts_server.py --url http://localhost:8000 --ref_audios_dir ./ref_audios/ --port 10086 --default_sample_rate 24000 2 | OPENAI_API_BASE="http://localhost:10086" 3 | 4 | curl $OPENAI_API_BASE/audio/speech \ 5 | -H "Content-Type: application/json" \ 6 | -d '{ 7 | "model": "cosyvoice2", 8 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 9 | "voice": "default_zh", 10 | "response_format": "wav" 11 | }' \ 12 | --output output.wav 13 | 14 | curl $OPENAI_API_BASE/audio/speech \ 15 | -H "Content-Type: application/json" \ 16 | -d '{ 17 | "model": "cosyvoice2", 18 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 19 | "voice": "wukong", 20 | "response_format": "wav" 21 | }' \ 22 | --output output2.wav 23 | 24 | curl $OPENAI_API_BASE/audio/speech \ 25 | -H "Content-Type: application/json" \ 26 | -d '{ 27 | "model": "cosyvoice2", 28 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 29 | 
"voice": "leijun", 30 | "response_format": "wav" 31 | }' \ 32 | --output output3.wav 33 | 34 | # output3 from pcm 35 | curl $OPENAI_API_BASE/audio/speech \ 36 | -H "Content-Type: application/json" \ 37 | -d '{ 38 | "model": "cosyvoice2", 39 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 40 | "voice": "leijun", 41 | "response_format": "pcm" 42 | }' | \ 43 | sox -t raw -r 24000 -e signed-integer -b 16 -c 1 - output3_from_pcm.wav 44 | 45 | # load input from long_input.txt 46 | input=$(cat long_input.txt) 47 | # Construct JSON payload using jq 48 | json_payload=$(jq -n --arg input_text "$input" '{model: "cosyvoice2", input: $input_text, voice: "default_zh", response_format: "wav"}') 49 | 50 | curl $OPENAI_API_BASE/audio/speech \ 51 | -H "Content-Type: application/json" \ 52 | -d "$json_payload" \ 53 | --output output4.wav 54 | 55 | json_payload=$(jq -n --arg input_text "$input" '{model: "cosyvoice2", input: $input_text, voice: "default_zh", response_format: "pcm"}') 56 | curl $OPENAI_API_BASE/audio/speech \ 57 | -H "Content-Type: application/json" \ 58 | -d "$json_payload" | \ 59 | sox -t raw -r 24000 -e signed-integer -b 16 -c 1 - output4_from_pcm.wav -------------------------------------------------------------------------------- /tts_frontend.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import regex
import inflect
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer

# Any run of CJK unified ideographs; used to route text to the zh/en pipeline.
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')


class TextNormalizer:
    """Text-normalization front end for TTS.

    Routes input through a Chinese or English normalizer (WeTextProcessing)
    depending on whether it contains CJK characters, then splits it into
    synthesis-sized sentence chunks.
    """

    def __init__(self):
        # remove_erhua/full_to_half kept off so Chinese text is normalized
        # conservatively; overwrite_cache rebuilds the tn cache on startup.
        self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
        self.en_tn_model = EnNormalizer()
        self.inflect_parser = inflect.engine()

    def text_normalize(self, text, split=True, text_frontend=True):
        """Normalize text for TTS.

        Modified from https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/cli/frontend.py

        Args:
            text: Raw input text.
            split: When True return a list of sentence chunks, otherwise the
                normalized string.
            text_frontend: When False, skip normalization entirely.

        Returns:
            list[str] if ``split`` is True, else str.
        """
        if text_frontend is False:
            return [text] if split is True else text
        text = text.strip()
        text = remove_asterisk(text)
        if contains_chinese(text):
            text = self.zh_tn_model.normalize(text)
            text = text.replace("\n", "")
            text = replace_blank(text)
            text = replace_corner_mark(text)
            text = text.replace(".", "。")
            text = text.replace(" - ", ",")
            text = remove_bracket(text)
            # A trailing comma/pause mark would leave a dangling chunk; close
            # the paragraph with a full stop instead.
            text = re.sub(r'[,,、]+$', '。', text)
            texts = list(split_paragraph(text, "zh", token_max_n=50,
                                         token_min_n=30, merge_len=10, comma_split=False))
        else:
            text = self.en_tn_model.normalize(text)
            text = spell_out_number(text, self.inflect_parser)
            texts = list(split_paragraph(text, "en", token_max_n=50,
                                         token_min_n=30, merge_len=10, comma_split=False))
        texts = [i for i in texts if not is_only_punctuation(i)]
        return texts if split is True else text


def contains_chinese(text):
    """Return True if ``text`` contains at least one CJK ideograph."""
    return bool(chinese_char_pattern.search(text))


def replace_corner_mark(text):
    """Spell out superscript square/cube marks for Chinese TTS."""
    text = text.replace('²', '平方')
    text = text.replace('³', '立方')
    return text


def remove_asterisk(text):
    """Drop '*' characters (markdown bullets/emphasis carry no pronunciation)."""
    return text.replace('*', '')


def remove_bracket(text):
    """Remove bracket/backtick characters that carry no pronunciation."""
    text = text.replace('(', '').replace(')', '')
    text = text.replace('【', '').replace('】', '')
    # Note: the original chained the same backtick replace twice; one is enough.
    text = text.replace('`', '')
    text = text.replace("——", " ")
    return text


def spell_out_number(text: str, inflect_parser):
    """Replace every run of Arabic digits with its English spelling."""
    new_text = []
    st = None  # start index of the digit run currently being scanned
    for i, c in enumerate(text):
        if not c.isdigit():
            if st is not None:
                new_text.append(inflect_parser.number_to_words(text[st:i]))
                st = None
            new_text.append(c)
        else:
            if st is None:
                st = i
    if st is not None:  # flush a trailing digit run
        new_text.append(inflect_parser.number_to_words(text[st:]))
    return ''.join(new_text)


# split paragraph logic:
# 1. per sentence max len token_max_n, min len token_min_n, merge if last
#    sentence len less than merge_len
# 2. sentence length is characters for zh, words for en
# 3. split sentences at punctuation
def split_paragraph(text: str, lang="zh", token_max_n=80, token_min_n=60,
                    merge_len=20, comma_split=False):
    """Split a paragraph into TTS-sized sentence chunks.

    Sentences are cut at punctuation, then packed greedily until a chunk
    exceeds ``token_max_n`` (and is at least ``token_min_n``); a short final
    chunk (< ``merge_len``) is merged into the previous one.

    Returns:
        list[str] of chunks; empty list for empty input.
    """
    def calc_utt_length(_text: str):
        # Characters for Chinese, whitespace-separated words for English.
        return len(_text) if lang == "zh" else len(_text.split())

    def should_merge(_text: str):
        return calc_utt_length(_text) < merge_len

    # Bug fix: empty input used to raise IndexError on ``text[-1]`` below.
    if not text:
        return []

    if lang == "zh":
        pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';']
    else:
        pounc = ['.', '?', '!', ';', ':']
    if comma_split:
        pounc.extend([',', ','])

    # Guarantee the text ends with sentence-final punctuation so the loop
    # below emits the last sentence.
    if text[-1] not in pounc:
        text += "。" if lang == "zh" else "."

    st = 0
    utts = []
    for i, c in enumerate(text):
        if c in pounc:
            if len(text[st:i]) > 0:
                utts.append(text[st:i] + c)
            # Keep a closing quote attached to the sentence it terminates.
            # Bug fix: guard on ``utts`` — text that starts with punctuation
            # followed by a quote used to pop an empty list.
            if i + 1 < len(text) and text[i + 1] in ['"', '”'] and utts:
                tmp = utts.pop(-1)
                utts.append(tmp + text[i + 1])
                st = i + 2
            else:
                st = i + 1

    final_utts = []
    cur_utt = ""
    for utt in utts:
        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
            final_utts.append(cur_utt)
            cur_utt = ""
        cur_utt = cur_utt + utt
    if len(cur_utt) > 0:
        if should_merge(cur_utt) and len(final_utts) != 0:
            # Short tail: merge into the previous chunk.
            final_utts[-1] = final_utts[-1] + cur_utt
        else:
            final_utts.append(cur_utt)

    return final_utts


def replace_blank(text: str):
    """Drop spaces between CJK characters; keep spaces between ASCII tokens.

    Bug fix: neighbour lookups are now bounds-checked — a trailing space used
    to raise IndexError on ``text[i + 1]`` and a leading space compared
    against ``text[-1]`` (Python wrap-around) instead of "no neighbour".
    """
    out_str = []
    for i, c in enumerate(text):
        if c == " ":
            # Keep the space only when it separates two non-space ASCII
            # characters (e.g. between English words inside Chinese text).
            if (0 < i < len(text) - 1 and
                    text[i + 1].isascii() and text[i + 1] != " " and
                    text[i - 1].isascii() and text[i - 1] != " "):
                out_str.append(c)
        else:
            out_str.append(c)
    return "".join(out_str)


def is_only_punctuation(text):
    """Return True when ``text`` is empty or is punctuation/symbols only."""
    # \p{P} (punctuation) and \p{S} (symbols) need the third-party ``regex``
    # module; the stdlib ``re`` does not support Unicode property classes.
    punctuation_pattern = r'^[\p{P}\p{S}]*$'
    return bool(regex.fullmatch(punctuation_pattern, text))


if __name__ == "__main__":
    text_normalizer = TextNormalizer()
    text = open("tests/long_input.txt", "r").read()
    print(text)
    print(text_normalizer.text_normalize(text))


# --------------------------------------------------------------------------- #
# /tts_server.py
# --------------------------------------------------------------------------- #
import requests
import soundfile as sf
import json
import numpy as np
import argparse
import io
import os
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import uvicorn
from pathlib import Path
from typing import Optional

from tts_frontend import TextNormalizer


def register_voice(ref_audios_path):
    """Scan ``ref_audios_path`` and build the voice lookup table.

    Each ``<name>.wav`` / ``<name>.mp3`` must be accompanied by a
    ``<name>.txt`` transcript of the reference audio.

    Returns:
        dict: voice name -> {"reference_audio": path, "reference_text": str}.
    """
    VOICE_CONFIG = {}
    for ref_audio in os.listdir(ref_audios_path):
        if ref_audio.endswith(".wav") or ref_audio.endswith(".mp3"):
            voice_name = Path(ref_audio).stem
            # Bug fix: derive the transcript path from the stem. The old
            # ``replace(".wav", ".txt")`` left ".mp3" names untouched and read
            # the audio binary itself as the "transcript". Also close the
            # file handle via a context manager.
            transcript_path = os.path.join(ref_audios_path, voice_name + ".txt")
            with open(transcript_path, "r") as f:
                reference_text = f.read()
            VOICE_CONFIG[voice_name] = {
                "reference_audio": os.path.join(ref_audios_path, ref_audio),
                "reference_text": reference_text,
            }
    return VOICE_CONFIG


class TTSRequest(BaseModel):
    """Body of the OpenAI-compatible POST /audio/speech request."""
    model: str                            # Triton model name, e.g. "spark_tts" / "cosyvoice2"
    input: str                            # text to synthesize
    voice: str                            # key into VOICE_CONFIG (reference-audio stem)
    instructions: Optional[str] = None    # accepted for OpenAI compatibility; unused
    response_format: Optional[str] = "pcm"  # "pcm" (raw s16le stream) or "wav"


def prepare_tts_request(
    waveform,
    reference_text,
    target_text,
):
    """Build the Triton KServe v2 HTTP-inference payload for one sentence.

    Args:
        waveform: 1-D float32 numpy array of reference-audio samples.
        reference_text: transcript of the reference audio.
        target_text: sentence to synthesize.

    Returns:
        dict ready to be sent as JSON to ``/v2/models/<model>/infer``.
    """
    assert len(waveform.shape) == 1, "waveform should be 1D"
    lengths = np.array([[len(waveform)]], dtype=np.int32)
    samples = waveform.reshape(1, -1).astype(np.float32)

    data = {
        "inputs": [
            {
                "name": "reference_wav",
                "shape": samples.shape,
                "datatype": "FP32",
                "data": samples.tolist(),
            },
            {
                "name": "reference_wav_len",
                "shape": lengths.shape,
                "datatype": "INT32",
                "data": lengths.tolist(),
            },
            {
                "name": "reference_text",
                "shape": [1, 1],
                "datatype": "BYTES",
                "data": [reference_text],
            },
            {
                "name": "target_text",
                "shape": [1, 1],
                "datatype": "BYTES",
                "data": [target_text],
            },
        ]
    }
    return data
app = FastAPI()
# One shared normalizer instance; building ZhNormalizer is expensive.
text_normalizer = TextNormalizer()

# Per-sentence Triton inference timeout in seconds. Generous, because long
# sentences can take a while to synthesize.
TRITON_REQUEST_TIMEOUT_S = 300


async def _stream_audio_generator(request_data: TTSRequest):
    """Yield one int16 numpy array of synthesized audio per normalized sentence.

    Voice existence and reference-audio sanity checks are performed by the
    ``/audio/speech`` endpoint before this generator is consumed.
    """
    config = VOICE_CONFIG[request_data.voice]
    reference_audio_path = config["reference_audio"]
    reference_text = config["reference_text"]
    # The OpenAI "model" field maps directly onto the Triton model name.
    triton_url = f"{TRITON_SERVER_URL}/v2/models/{request_data.model}/infer"
    target_text_list = text_normalizer.text_normalize(request_data.input)

    try:
        # Read the reference audio once and reuse it for every sentence.
        waveform, sr = sf.read(reference_audio_path)
        # prepare_tts_request expects float32 samples.
        samples = np.array(waveform, dtype=np.float32)

        for target_text in target_text_list:
            print(f"Generating audio array for: {target_text}")
            triton_request_data = prepare_tts_request(samples, reference_text, target_text)

            try:
                # NOTE(review): requests.post is blocking and stalls the event
                # loop while Triton synthesizes; consider httpx or
                # run_in_executor if concurrent clients matter.
                # Bug fix: a timeout is now set — previously the Timeout
                # handler below was unreachable and a hung Triton server
                # blocked the stream forever.
                rsp = requests.post(
                    triton_url,
                    headers={"Content-Type": "application/json"},
                    json=triton_request_data,
                    timeout=TRITON_REQUEST_TIMEOUT_S,
                )
                rsp.raise_for_status()  # raise for 4xx/5xx
                result = rsp.json()

                if "error" in result:
                    print(f"Triton server error for text '{target_text}': {result['error']}")
                    continue  # skip this sentence, keep streaming the rest

                if not result.get("outputs") or not result["outputs"][0].get("data"):
                    print(f"Invalid response structure from Triton for text '{target_text}'")
                    continue  # skip this invalid response

                audio_data = result["outputs"][0]["data"]
                # Triton is assumed to return float32 samples in [-1, 1].
                audio_array = np.array(audio_data, dtype=np.float32)

                # Convert to 16-bit PCM.
                audio_array = np.clip(audio_array, -1.0, 1.0)
                pcm_data = (audio_array * 32767).astype(np.int16)

                yield pcm_data  # yield the numpy array directly

            except requests.exceptions.Timeout:
                print(f"Triton request timed out for text '{target_text[:50]}...'")
                raise HTTPException(status_code=504, detail="Triton server request timed out during streaming.")
            except requests.exceptions.RequestException as e:
                print(f"Could not connect to Triton server for text '{target_text[:50]}...': {e}")
                raise HTTPException(status_code=503, detail=f"Could not connect to Triton server during streaming: {e}")
            except Exception as e:
                print(f"An unexpected error occurred processing text '{target_text[:50]}...': {str(e)}")
                raise HTTPException(status_code=502, detail=f"An unexpected error occurred during streaming: {str(e)}")

        print("Finished generating all sentence arrays.")

    except sf.SoundFileError as e:
        print(f"Error reading reference audio within generator: {e}")
        raise HTTPException(status_code=501, detail=f"Could not read reference audio file during streaming: {reference_audio_path}")
    except HTTPException:
        # Bug fix: re-raise as-is so the specific 502/503/504 raised above
        # survive instead of being masked into a generic 500 below.
        raise
    except Exception as e:
        print(f"Unexpected error at start of/during generator execution: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected generator error: {str(e)}")


@app.post("/audio/speech")
async def generate_speech(request_data: TTSRequest):
    """OpenAI-compatible speech endpoint.

    Streams raw PCM (s16le, mono, DEFAULT_SAMPLE_RATE) when
    ``response_format`` is "pcm"; otherwise buffers every sentence and
    returns one complete WAV file.
    """
    # --- Initial checks: voice exists, reference audio is readable ---------
    if request_data.voice not in VOICE_CONFIG:
        raise HTTPException(status_code=400, detail=f"Voice '{request_data.voice}' not found.")

    config = VOICE_CONFIG[request_data.voice]
    reference_audio_path = config["reference_audio"]

    try:
        if not os.path.exists(reference_audio_path):
            raise FileNotFoundError
        info = sf.info(reference_audio_path)
        if info.samplerate != 16000:
            raise HTTPException(status_code=500, detail=f"Reference audio sample rate ({info.samplerate}) does not match expected 16000. Resampling not implemented yet.")
    except FileNotFoundError:
        raise HTTPException(status_code=501, detail=f"Reference audio file not found: {reference_audio_path}")
    except sf.SoundFileError:
        raise HTTPException(status_code=501, detail=f"Could not read reference audio file info (invalid format or corrupt?): {reference_audio_path}")
    except HTTPException:
        # Bug fix: let the sample-rate HTTPException above propagate instead
        # of being re-wrapped by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error checking reference audio file: {str(e)}")

    # --- Resolve response format (missing/unknown -> "wav") ----------------
    if request_data.response_format:
        response_format = request_data.response_format.lower()
        if response_format not in ["pcm", "wav"]:
            response_format = "wav"
    else:
        response_format = "wav"

    if response_format == "pcm":
        print("Streaming raw PCM audio.")
        media_type = f"audio/L16;rate={DEFAULT_SAMPLE_RATE};channels=1"

        async def pcm_byte_stream_generator():
            """Consume numpy arrays from the main generator and yield bytes."""
            async for pcm_array in _stream_audio_generator(request_data):
                yield pcm_array.tobytes()
            print("Finished streaming PCM bytes.")

        return StreamingResponse(pcm_byte_stream_generator(), media_type=media_type)

    elif response_format == "wav":
        print("Generating buffered WAV file.")
        all_audio_arrays = []
        try:
            async for pcm_array in _stream_audio_generator(request_data):
                all_audio_arrays.append(pcm_array)
        except HTTPException as e:
            # The generator raised a specific HTTP error; pass it through.
            raise e
        except Exception as e:
            print(f"Unexpected error collecting audio arrays for WAV: {e}")
            raise HTTPException(status_code=500, detail=f"Error generating full WAV file: {str(e)}")

        if not all_audio_arrays:
            # No sentence produced audio; return a minimal valid empty WAV
            # rather than erroring out.
            print("No audio data generated, returning empty WAV.")
            wav_buffer = io.BytesIO()
            sf.write(wav_buffer, np.array([], dtype=np.int16), DEFAULT_SAMPLE_RATE, format='WAV', subtype='PCM_16')
            wav_buffer.seek(0)
            return StreamingResponse(wav_buffer, media_type="audio/wav")

        try:
            final_audio_array = np.concatenate(all_audio_arrays)
            print(f"Concatenated audio array shape: {final_audio_array.shape}")

            wav_buffer = io.BytesIO()
            sf.write(wav_buffer, final_audio_array, DEFAULT_SAMPLE_RATE, format='WAV', subtype='PCM_16')
            wav_buffer.seek(0)

            print("Returning complete WAV file.")
            return StreamingResponse(wav_buffer, media_type="audio/wav")
        except Exception as e:
            print(f"Error concatenating or writing WAV file: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to create final WAV file: {str(e)}")

    else:
        # Unreachable after the normalization above; kept as a defensive guard.
        raise HTTPException(status_code=400, detail=f"Unsupported response_format: '{request_data.response_format}'. Supported formats: 'pcm', 'wav'.")
Supported formats: 'pcm', 'wav'.") 232 | 233 | 234 | if __name__ == "__main__": 235 | parser = argparse.ArgumentParser() 236 | parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind the server to") 237 | parser.add_argument("--port", type=int, default=8080, help="Port to bind the server to") 238 | parser.add_argument("--url", type=str, default="http://localhost:8000", help="Triton server URL") 239 | parser.add_argument("--ref_audios_dir", type=str, default="./ref_audios", help="Path to reference audio files") 240 | parser.add_argument("--default_sample_rate", type=int, default=16000, help="Default sample rate") 241 | args = parser.parse_args() 242 | 243 | VOICE_CONFIG = register_voice(args.ref_audios_dir) 244 | TRITON_SERVER_URL = args.url 245 | DEFAULT_SAMPLE_RATE = args.default_sample_rate 246 | REF_AUDIO_BASE_PATH = args.ref_audios_dir 247 | args = parser.parse_args() 248 | 249 | print(f"Starting FastAPI server on {args.host}:{args.port}") 250 | print(f"Using Triton server at {TRITON_SERVER_URL}") 251 | voice_list = list(VOICE_CONFIG.keys()) 252 | if len(voice_list) == 0: 253 | raise ValueError("No voice found in the reference audio directory") 254 | print(f"Available voices: {voice_list}") 255 | 256 | uvicorn.run(app, host=args.host, port=args.port) --------------------------------------------------------------------------------