├── Dockerfile ├── LICENSE ├── README.md ├── docker-compose.yml ├── ref_audios ├── default_zh.txt ├── default_zh.wav ├── leijun.txt ├── leijun.wav ├── wukong.txt └── wukong.wav ├── requirements.txt ├── tests ├── long_input.txt ├── test.sh └── test_cosyvoice.sh ├── tts_frontend.py └── tts_server.py /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /app 3 | COPY requirements.txt tts_server.py tts_frontend.py ref_audios/ . 4 | RUN pip install --no-cache-dir -r requirements.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Triton-OpenAI-Speech 2 | OpenAI-Compatible Frontend for Triton Inference ASR/TTS Server 3 | 4 | ### Quick Start 5 | Before starting, launch one of the supported ASR/TTS services using Docker Compose. 6 | | Model Repo | Supported | 7 | | --- | -- | 8 | | [Spark-TTS](https://github.com/SparkAudio/Spark-TTS/tree/main/runtime/triton_trtllm) | Yes | 9 | |[F5-TTS](https://github.com/SWivid/F5-TTS/tree/main/src/f5_tts/runtime/triton_trtllm)| Yes | 10 | |[Cosyvoice2](https://github.com/FunAudioLLM/CosyVoice/tree/main/runtime/triton_trtllm)| Yes | 11 | 12 | Then, launch the OpenAI-compatible API bridge server. 
13 | ```sh 14 | docker compose up 15 | ``` 16 | 17 | ### Simple Test 18 | ```sh 19 | bash tests/test.sh 20 | ``` 21 | ### Usage 22 | 23 | ``` 24 | tts_server.py [-h] [--host HOST] [--port PORT] [--url URL] 25 | [--ref_audios_dir REF_AUDIOS_DIR] 26 | [--default_sample_rate DEFAULT_SAMPLE_RATE] 27 | 28 | options: 29 | -h, --help show this help message and exit 30 | --host HOST Host to bind the server to 31 | --port PORT Port to bind the server to 32 | --url URL Triton server URL 33 | --ref_audios_dir REF_AUDIOS_DIR 34 | Path to reference audio files 35 | --default_sample_rate DEFAULT_SAMPLE_RATE 36 | Default sample rate 37 | ``` -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | api-server: 3 | build: . 4 | container_name: openai_api_server 5 | ports: 6 | - "8080:8080" 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" 9 | command: > 10 | /bin/bash -c "python tts_server.py --url http://host.docker.internal:8000 --ref_audios_dir ./" 11 | 12 | -------------------------------------------------------------------------------- /ref_audios/default_zh.txt: -------------------------------------------------------------------------------- 1 | 吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。 -------------------------------------------------------------------------------- /ref_audios/default_zh.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuekaizhang/Triton-OpenAI-Speech/e4f5743a137c3fd0ba703f9b6b0791778d060437/ref_audios/default_zh.wav -------------------------------------------------------------------------------- /ref_audios/leijun.txt: -------------------------------------------------------------------------------- 1 | 大家好!今天给大家带来一款重磅产品,性能提升了80%,但是价格只要友商的一半。 
-------------------------------------------------------------------------------- /ref_audios/leijun.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuekaizhang/Triton-OpenAI-Speech/e4f5743a137c3fd0ba703f9b6b0791778d060437/ref_audios/leijun.wav -------------------------------------------------------------------------------- /ref_audios/wukong.txt: -------------------------------------------------------------------------------- 1 | 俺老孙的金箍棒,打遍天下无敌手!什么妖魔鬼怪,统统都不在话下。 -------------------------------------------------------------------------------- /ref_audios/wukong.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuekaizhang/Triton-OpenAI-Speech/e4f5743a137c3fd0ba703f9b6b0791778d060437/ref_audios/wukong.wav -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | soundfile 3 | numpy 4 | fastapi 5 | uvicorn[standard] 6 | pydantic 7 | python-multipart # Needed by FastAPI for form data, good to include 8 | WeTextProcessing 9 | inflect -------------------------------------------------------------------------------- /tests/long_input.txt: -------------------------------------------------------------------------------- 1 | 南京的美食实在太多了!我可以根据不同的类别推荐一些: 2 | 3 | 1. 经典的南京小吃: 4 | 5 | 鸭血粉丝汤: 这是南京最具代表性的美食,汤鲜味美,鸭血嫩滑,粉丝爽口。推荐:尹氏鸭血粉丝汤、刘鸣记鸭血粉丝汤。 6 | 小笼包: 南京的小笼包皮薄馅大,汤汁鲜美。推荐:南京大牌档、回味鸭血粉丝汤(也有小笼包)。 7 | 牛肉锅贴: 外皮酥脆,内馅鲜嫩多汁。推荐:李记牛肉锅贴、韩氏锅贴。 8 | 梅花糕: 软糯香甜,带有淡淡的梅花香气。推荐:连香斋。 9 | 赤豆元宵: 南京元宵的特色是用赤豆做馅料,香甜软糯。推荐:宏fie元宵。 10 | 盐水鸭: 南京盐水鸭皮脆肉嫩,味道鲜美。推荐:南京大牌档、尹氏盐水鸭。 11 | 臭豆腐: 南京的臭豆腐炸得金黄酥脆,外脆内嫩,配上蒜蓉辣椒酱,味道独特。推荐:夫子庙附近的臭豆腐摊。 12 | 2. 本地特色菜: 13 | 14 | 板鸭: 与盐水鸭不同,板鸭是经过腌制和风干的,风味独特。 15 | 清炖狮子头: 肥而不腻,入口即化。 16 | 松鼠桂鱼: 外形美观,酸甜可口。 17 | 金陵烤鸭: 南京烤鸭以色泽红润,皮脆肉嫩为特点。 18 | 啤酒鸭: 用啤酒炖制的鸭肉,鲜香入味。 19 | 3. 
夫子庙小吃街: 20 | 21 | 夫子庙小吃街汇集了各种南京小吃,可以一次性品尝到很多美食,但通常价格会稍高,而且人流量大。 22 | 4. 餐厅推荐: 23 | 24 | 南京大牌档: 一家综合性的餐厅,可以品尝到各种南京特色菜和小吃。 25 | 尹氏鸭血粉丝汤: 老字号,鸭血粉丝汤味道正宗。 26 | 刘鸣记鸭血粉丝汤: 也是一家知名的鸭血粉丝汤店。 27 | 秋林里: 比较有情调的本地菜餐厅。 28 | 5. 一些其他推荐: 29 | 30 | **甘肃刀削面:**虽然不是南京本地的,但很多南京人喜欢吃。 31 | 老门东的小吃: 老门东是南京的文化街区,有很多小吃店,可以尝试一些当地特色小吃。 32 | 温馨提示: 33 | 34 | 南京的美食很多集中在夫子庙、新街口、老门东等区域。 35 | 可以根据自己的口味和喜好选择不同的美食。 36 | 在夫子庙小吃街等热门地点,要注意保管好自己的财物。 37 | 一些老字号的餐厅可能需要排队。 38 | 希望这些推荐能帮助你更好地了解南京的美食! 你如果对某一种美食或者某个区域感兴趣,我可以提供更详细的信息。 祝你旅途愉快,吃得开心! -------------------------------------------------------------------------------- /tests/test.sh: -------------------------------------------------------------------------------- 1 | # OPENAI_API_KEY=sk- 2 | # OPENAI_API_BASE="https://aihubmix.com/v1" 3 | # curl $OPENAI_API_BASE/audio/speech \ 4 | # -H "Content-Type: application/json" \ 5 | # -d '{ 6 | # "model": "tts-1", 7 | # "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 8 | # "voice": "coral" 9 | # }' \ 10 | # --output output.wav 11 | 12 | OPENAI_API_BASE="http://localhost:8080" 13 | 14 | curl $OPENAI_API_BASE/audio/speech \ 15 | -H "Content-Type: application/json" \ 16 | -d '{ 17 | "model": "spark_tts", 18 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 19 | "voice": "default_zh", 20 | "response_format": "wav" 21 | }' \ 22 | --output output.wav 23 | 24 | curl $OPENAI_API_BASE/audio/speech \ 25 | -H "Content-Type: application/json" \ 26 | -d '{ 27 | "model": "spark_tts", 28 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 29 | "voice": "wukong", 30 | "response_format": "wav" 31 | }' \ 32 | --output output2.wav 33 | 34 | curl $OPENAI_API_BASE/audio/speech \ 35 | -H "Content-Type: application/json" \ 36 | -d '{ 37 | "model": "spark_tts", 38 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 39 | "voice": "leijun", 40 | "response_format": "wav" 41 | }' \ 42 | --output output3.wav 43 | 44 | # output3 from pcm 45 | curl $OPENAI_API_BASE/audio/speech \ 46 | -H "Content-Type: application/json" \ 47 | -d '{ 48 | "model": "spark_tts", 49 | 
"input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 50 | "voice": "leijun", 51 | "response_format": "pcm" 52 | }' | \ 53 | sox -t raw -r 16000 -e signed-integer -b 16 -c 1 - output3_from_pcm.wav 54 | 55 | # load input from long_input.txt 56 | input=$(cat long_input.txt) 57 | # Construct JSON payload using jq 58 | json_payload=$(jq -n --arg input_text "$input" '{model: "spark_tts", input: $input_text, voice: "default_zh", response_format: "wav"}') 59 | 60 | curl $OPENAI_API_BASE/audio/speech \ 61 | -H "Content-Type: application/json" \ 62 | -d "$json_payload" \ 63 | --output output4.wav 64 | 65 | json_payload=$(jq -n --arg input_text "$input" '{model: "spark_tts", input: $input_text, voice: "default_zh", response_format: "pcm"}') 66 | curl $OPENAI_API_BASE/audio/speech \ 67 | -H "Content-Type: application/json" \ 68 | -d "$json_payload" | \ 69 | sox -t raw -r 16000 -e signed-integer -b 16 -c 1 - output4_from_pcm.wav -------------------------------------------------------------------------------- /tests/test_cosyvoice.sh: -------------------------------------------------------------------------------- 1 | # python3 tts_server.py --url http://localhost:8000 --ref_audios_dir ./ref_audios/ --port 10086 --default_sample_rate 24000 2 | OPENAI_API_BASE="http://localhost:10086" 3 | 4 | curl $OPENAI_API_BASE/audio/speech \ 5 | -H "Content-Type: application/json" \ 6 | -d '{ 7 | "model": "cosyvoice2", 8 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 9 | "voice": "default_zh", 10 | "response_format": "wav" 11 | }' \ 12 | --output output.wav 13 | 14 | curl $OPENAI_API_BASE/audio/speech \ 15 | -H "Content-Type: application/json" \ 16 | -d '{ 17 | "model": "cosyvoice2", 18 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 19 | "voice": "wukong", 20 | "response_format": "wav" 21 | }' \ 22 | --output output2.wav 23 | 24 | curl $OPENAI_API_BASE/audio/speech \ 25 | -H "Content-Type: application/json" \ 26 | -d '{ 27 | "model": "cosyvoice2", 28 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 29 | 
"voice": "leijun", 30 | "response_format": "wav" 31 | }' \ 32 | --output output3.wav 33 | 34 | # output3 from pcm 35 | curl $OPENAI_API_BASE/audio/speech \ 36 | -H "Content-Type: application/json" \ 37 | -d '{ 38 | "model": "cosyvoice2", 39 | "input": "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。", 40 | "voice": "leijun", 41 | "response_format": "pcm" 42 | }' | \ 43 | sox -t raw -r 24000 -e signed-integer -b 16 -c 1 - output3_from_pcm.wav 44 | 45 | # load input from long_input.txt 46 | input=$(cat long_input.txt) 47 | # Construct JSON payload using jq 48 | json_payload=$(jq -n --arg input_text "$input" '{model: "cosyvoice2", input: $input_text, voice: "default_zh", response_format: "wav"}') 49 | 50 | curl $OPENAI_API_BASE/audio/speech \ 51 | -H "Content-Type: application/json" \ 52 | -d "$json_payload" \ 53 | --output output4.wav 54 | 55 | json_payload=$(jq -n --arg input_text "$input" '{model: "cosyvoice2", input: $input_text, voice: "default_zh", response_format: "pcm"}') 56 | curl $OPENAI_API_BASE/audio/speech \ 57 | -H "Content-Type: application/json" \ 58 | -d "$json_payload" | \ 59 | sox -t raw -r 24000 -e signed-integer -b 16 -c 1 - output4_from_pcm.wav -------------------------------------------------------------------------------- /tts_frontend.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import regex
import inflect
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer

# Any run of CJK unified ideographs; used to route text to the zh/en pipeline.
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')


class TextNormalizer:
    """Text-normalization front end for TTS.

    Routes input through a Chinese or English normalizer (WeTextProcessing)
    depending on whether it contains CJK characters, then splits it into
    synthesis-sized sentence chunks.
    """

    def __init__(self):
        # remove_erhua/full_to_half kept off so Chinese text is normalized
        # conservatively; overwrite_cache rebuilds the tn cache on startup.
        self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
        self.en_tn_model = EnNormalizer()
        self.inflect_parser = inflect.engine()

    def text_normalize(self, text, split=True, text_frontend=True):
        """Normalize text for TTS.

        Modified from https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/cli/frontend.py

        Args:
            text: Raw input text.
            split: When True return a list of sentence chunks, otherwise the
                normalized string.
            text_frontend: When False, skip normalization entirely.

        Returns:
            list[str] if ``split`` is True, else str.
        """
        if text_frontend is False:
            return [text] if split is True else text
        text = text.strip()
        text = remove_asterisk(text)
        if contains_chinese(text):
            text = self.zh_tn_model.normalize(text)
            text = text.replace("\n", "")
            text = replace_blank(text)
            text = replace_corner_mark(text)
            text = text.replace(".", "。")
            text = text.replace(" - ", ",")
            text = remove_bracket(text)
            # A trailing comma/pause mark would leave a dangling chunk; close
            # the paragraph with a full stop instead.
            text = re.sub(r'[,,、]+$', '。', text)
            texts = list(split_paragraph(text, "zh", token_max_n=50,
                                         token_min_n=30, merge_len=10, comma_split=False))
        else:
            text = self.en_tn_model.normalize(text)
            text = spell_out_number(text, self.inflect_parser)
            texts = list(split_paragraph(text, "en", token_max_n=50,
                                         token_min_n=30, merge_len=10, comma_split=False))
        texts = [i for i in texts if not is_only_punctuation(i)]
        return texts if split is True else text


def contains_chinese(text):
    """Return True if ``text`` contains at least one CJK ideograph."""
    return bool(chinese_char_pattern.search(text))


def replace_corner_mark(text):
    """Spell out superscript square/cube marks for Chinese TTS."""
    text = text.replace('²', '平方')
    text = text.replace('³', '立方')
    return text


def remove_asterisk(text):
    """Drop '*' characters (markdown bullets/emphasis carry no pronunciation)."""
    return text.replace('*', '')


def remove_bracket(text):
    """Remove bracket/backtick characters that carry no pronunciation."""
    text = text.replace('(', '').replace(')', '')
    text = text.replace('【', '').replace('】', '')
    # Note: the original chained the same backtick replace twice; one is enough.
    text = text.replace('`', '')
    text = text.replace("——", " ")
    return text


def spell_out_number(text: str, inflect_parser):
    """Replace every run of Arabic digits with its English spelling."""
    new_text = []
    st = None  # start index of the digit run currently being scanned
    for i, c in enumerate(text):
        if not c.isdigit():
            if st is not None:
                new_text.append(inflect_parser.number_to_words(text[st:i]))
                st = None
            new_text.append(c)
        else:
            if st is None:
                st = i
    if st is not None:  # flush a trailing digit run
        new_text.append(inflect_parser.number_to_words(text[st:]))
    return ''.join(new_text)


# split paragraph logic:
# 1. per sentence max len token_max_n, min len token_min_n, merge if last
#    sentence len less than merge_len
# 2. sentence length is characters for zh, words for en
# 3. split sentences at punctuation
def split_paragraph(text: str, lang="zh", token_max_n=80, token_min_n=60,
                    merge_len=20, comma_split=False):
    """Split a paragraph into TTS-sized sentence chunks.

    Sentences are cut at punctuation, then packed greedily until a chunk
    exceeds ``token_max_n`` (and is at least ``token_min_n``); a short final
    chunk (< ``merge_len``) is merged into the previous one.

    Returns:
        list[str] of chunks; empty list for empty input.
    """
    def calc_utt_length(_text: str):
        # Characters for Chinese, whitespace-separated words for English.
        return len(_text) if lang == "zh" else len(_text.split())

    def should_merge(_text: str):
        return calc_utt_length(_text) < merge_len

    # Bug fix: empty input used to raise IndexError on ``text[-1]`` below.
    if not text:
        return []

    if lang == "zh":
        pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';']
    else:
        pounc = ['.', '?', '!', ';', ':']
    if comma_split:
        pounc.extend([',', ','])

    # Guarantee the text ends with sentence-final punctuation so the loop
    # below emits the last sentence.
    if text[-1] not in pounc:
        text += "。" if lang == "zh" else "."

    st = 0
    utts = []
    for i, c in enumerate(text):
        if c in pounc:
            if len(text[st:i]) > 0:
                utts.append(text[st:i] + c)
            # Keep a closing quote attached to the sentence it terminates.
            # Bug fix: guard on ``utts`` — text that starts with punctuation
            # followed by a quote used to pop an empty list.
            if i + 1 < len(text) and text[i + 1] in ['"', '”'] and utts:
                tmp = utts.pop(-1)
                utts.append(tmp + text[i + 1])
                st = i + 2
            else:
                st = i + 1

    final_utts = []
    cur_utt = ""
    for utt in utts:
        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
            final_utts.append(cur_utt)
            cur_utt = ""
        cur_utt = cur_utt + utt
    if len(cur_utt) > 0:
        if should_merge(cur_utt) and len(final_utts) != 0:
            # Short tail: merge into the previous chunk.
            final_utts[-1] = final_utts[-1] + cur_utt
        else:
            final_utts.append(cur_utt)

    return final_utts


def replace_blank(text: str):
    """Drop spaces between CJK characters; keep spaces between ASCII tokens.

    Bug fix: neighbour lookups are now bounds-checked — a trailing space used
    to raise IndexError on ``text[i + 1]`` and a leading space compared
    against ``text[-1]`` (Python wrap-around) instead of "no neighbour".
    """
    out_str = []
    for i, c in enumerate(text):
        if c == " ":
            # Keep the space only when it separates two non-space ASCII
            # characters (e.g. between English words inside Chinese text).
            if (0 < i < len(text) - 1 and
                    text[i + 1].isascii() and text[i + 1] != " " and
                    text[i - 1].isascii() and text[i - 1] != " "):
                out_str.append(c)
        else:
            out_str.append(c)
    return "".join(out_str)


def is_only_punctuation(text):
    """Return True when ``text`` is empty or is punctuation/symbols only."""
    # \p{P} (punctuation) and \p{S} (symbols) need the third-party ``regex``
    # module; the stdlib ``re`` does not support Unicode property classes.
    punctuation_pattern = r'^[\p{P}\p{S}]*$'
    return bool(regex.fullmatch(punctuation_pattern, text))


if __name__ == "__main__":
    text_normalizer = TextNormalizer()
    text = open("tests/long_input.txt", "r").read()
    print(text)
    print(text_normalizer.text_normalize(text))


# --------------------------------------------------------------------------- #
# /tts_server.py
# --------------------------------------------------------------------------- #
import requests
import soundfile as sf
import json
import numpy as np
import argparse
import io
import os
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import uvicorn
from pathlib import Path
from typing import Optional

from tts_frontend import TextNormalizer


def register_voice(ref_audios_path):
    """Scan ``ref_audios_path`` and build the voice lookup table.

    Each ``<name>.wav`` / ``<name>.mp3`` must be accompanied by a
    ``<name>.txt`` transcript of the reference audio.

    Returns:
        dict: voice name -> {"reference_audio": path, "reference_text": str}.
    """
    VOICE_CONFIG = {}
    for ref_audio in os.listdir(ref_audios_path):
        if ref_audio.endswith(".wav") or ref_audio.endswith(".mp3"):
            voice_name = Path(ref_audio).stem
            # Bug fix: derive the transcript path from the stem. The old
            # ``replace(".wav", ".txt")`` left ".mp3" names untouched and read
            # the audio binary itself as the "transcript". Also close the
            # file handle via a context manager.
            transcript_path = os.path.join(ref_audios_path, voice_name + ".txt")
            with open(transcript_path, "r") as f:
                reference_text = f.read()
            VOICE_CONFIG[voice_name] = {
                "reference_audio": os.path.join(ref_audios_path, ref_audio),
                "reference_text": reference_text,
            }
    return VOICE_CONFIG


class TTSRequest(BaseModel):
    """Body of the OpenAI-compatible POST /audio/speech request."""
    model: str                            # Triton model name, e.g. "spark_tts" / "cosyvoice2"
    input: str                            # text to synthesize
    voice: str                            # key into VOICE_CONFIG (reference-audio stem)
    instructions: Optional[str] = None    # accepted for OpenAI compatibility; unused
    response_format: Optional[str] = "pcm"  # "pcm" (raw s16le stream) or "wav"


def prepare_tts_request(
    waveform,
    reference_text,
    target_text,
):
    """Build the Triton KServe v2 HTTP-inference payload for one sentence.

    Args:
        waveform: 1-D float32 numpy array of reference-audio samples.
        reference_text: transcript of the reference audio.
        target_text: sentence to synthesize.

    Returns:
        dict ready to be sent as JSON to ``/v2/models/<model>/infer``.
    """
    assert len(waveform.shape) == 1, "waveform should be 1D"
    lengths = np.array([[len(waveform)]], dtype=np.int32)
    samples = waveform.reshape(1, -1).astype(np.float32)

    data = {
        "inputs": [
            {
                "name": "reference_wav",
                "shape": samples.shape,
                "datatype": "FP32",
                "data": samples.tolist(),
            },
            {
                "name": "reference_wav_len",
                "shape": lengths.shape,
                "datatype": "INT32",
                "data": lengths.tolist(),
            },
            {
                "name": "reference_text",
                "shape": [1, 1],
                "datatype": "BYTES",
                "data": [reference_text],
            },
            {
                "name": "target_text",
                "shape": [1, 1],
                "datatype": "BYTES",
                "data": [target_text],
            },
        ]
    }
    return data
app = FastAPI()
# One shared normalizer instance; building ZhNormalizer is expensive.
text_normalizer = TextNormalizer()

# Per-sentence Triton inference timeout in seconds. Generous, because long
# sentences can take a while to synthesize.
TRITON_REQUEST_TIMEOUT_S = 300


async def _stream_audio_generator(request_data: TTSRequest):
    """Yield one int16 numpy array of synthesized audio per normalized sentence.

    Voice existence and reference-audio sanity checks are performed by the
    ``/audio/speech`` endpoint before this generator is consumed.
    """
    config = VOICE_CONFIG[request_data.voice]
    reference_audio_path = config["reference_audio"]
    reference_text = config["reference_text"]
    # The OpenAI "model" field maps directly onto the Triton model name.
    triton_url = f"{TRITON_SERVER_URL}/v2/models/{request_data.model}/infer"
    target_text_list = text_normalizer.text_normalize(request_data.input)

    try:
        # Read the reference audio once and reuse it for every sentence.
        waveform, sr = sf.read(reference_audio_path)
        # prepare_tts_request expects float32 samples.
        samples = np.array(waveform, dtype=np.float32)

        for target_text in target_text_list:
            print(f"Generating audio array for: {target_text}")
            triton_request_data = prepare_tts_request(samples, reference_text, target_text)

            try:
                # NOTE(review): requests.post is blocking and stalls the event
                # loop while Triton synthesizes; consider httpx or
                # run_in_executor if concurrent clients matter.
                # Bug fix: a timeout is now set — previously the Timeout
                # handler below was unreachable and a hung Triton server
                # blocked the stream forever.
                rsp = requests.post(
                    triton_url,
                    headers={"Content-Type": "application/json"},
                    json=triton_request_data,
                    timeout=TRITON_REQUEST_TIMEOUT_S,
                )
                rsp.raise_for_status()  # raise for 4xx/5xx
                result = rsp.json()

                if "error" in result:
                    print(f"Triton server error for text '{target_text}': {result['error']}")
                    continue  # skip this sentence, keep streaming the rest

                if not result.get("outputs") or not result["outputs"][0].get("data"):
                    print(f"Invalid response structure from Triton for text '{target_text}'")
                    continue  # skip this invalid response

                audio_data = result["outputs"][0]["data"]
                # Triton is assumed to return float32 samples in [-1, 1].
                audio_array = np.array(audio_data, dtype=np.float32)

                # Convert to 16-bit PCM.
                audio_array = np.clip(audio_array, -1.0, 1.0)
                pcm_data = (audio_array * 32767).astype(np.int16)

                yield pcm_data  # yield the numpy array directly

            except requests.exceptions.Timeout:
                print(f"Triton request timed out for text '{target_text[:50]}...'")
                raise HTTPException(status_code=504, detail="Triton server request timed out during streaming.")
            except requests.exceptions.RequestException as e:
                print(f"Could not connect to Triton server for text '{target_text[:50]}...': {e}")
                raise HTTPException(status_code=503, detail=f"Could not connect to Triton server during streaming: {e}")
            except Exception as e:
                print(f"An unexpected error occurred processing text '{target_text[:50]}...': {str(e)}")
                raise HTTPException(status_code=502, detail=f"An unexpected error occurred during streaming: {str(e)}")

        print("Finished generating all sentence arrays.")

    except sf.SoundFileError as e:
        print(f"Error reading reference audio within generator: {e}")
        raise HTTPException(status_code=501, detail=f"Could not read reference audio file during streaming: {reference_audio_path}")
    except HTTPException:
        # Bug fix: re-raise as-is so the specific 502/503/504 raised above
        # survive instead of being masked into a generic 500 below.
        raise
    except Exception as e:
        print(f"Unexpected error at start of/during generator execution: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected generator error: {str(e)}")


@app.post("/audio/speech")
async def generate_speech(request_data: TTSRequest):
    """OpenAI-compatible speech endpoint.

    Streams raw PCM (s16le, mono, DEFAULT_SAMPLE_RATE) when
    ``response_format`` is "pcm"; otherwise buffers every sentence and
    returns one complete WAV file.
    """
    # --- Initial checks: voice exists, reference audio is readable ---------
    if request_data.voice not in VOICE_CONFIG:
        raise HTTPException(status_code=400, detail=f"Voice '{request_data.voice}' not found.")

    config = VOICE_CONFIG[request_data.voice]
    reference_audio_path = config["reference_audio"]

    try:
        if not os.path.exists(reference_audio_path):
            raise FileNotFoundError
        info = sf.info(reference_audio_path)
        if info.samplerate != 16000:
            raise HTTPException(status_code=500, detail=f"Reference audio sample rate ({info.samplerate}) does not match expected 16000. Resampling not implemented yet.")
    except FileNotFoundError:
        raise HTTPException(status_code=501, detail=f"Reference audio file not found: {reference_audio_path}")
    except sf.SoundFileError:
        raise HTTPException(status_code=501, detail=f"Could not read reference audio file info (invalid format or corrupt?): {reference_audio_path}")
    except HTTPException:
        # Bug fix: let the sample-rate HTTPException above propagate instead
        # of being re-wrapped by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error checking reference audio file: {str(e)}")

    # --- Resolve response format (missing/unknown -> "wav") ----------------
    if request_data.response_format:
        response_format = request_data.response_format.lower()
        if response_format not in ["pcm", "wav"]:
            response_format = "wav"
    else:
        response_format = "wav"

    if response_format == "pcm":
        print("Streaming raw PCM audio.")
        media_type = f"audio/L16;rate={DEFAULT_SAMPLE_RATE};channels=1"

        async def pcm_byte_stream_generator():
            """Consume numpy arrays from the main generator and yield bytes."""
            async for pcm_array in _stream_audio_generator(request_data):
                yield pcm_array.tobytes()
            print("Finished streaming PCM bytes.")

        return StreamingResponse(pcm_byte_stream_generator(), media_type=media_type)

    elif response_format == "wav":
        print("Generating buffered WAV file.")
        all_audio_arrays = []
        try:
            async for pcm_array in _stream_audio_generator(request_data):
                all_audio_arrays.append(pcm_array)
        except HTTPException as e:
            # The generator raised a specific HTTP error; pass it through.
            raise e
        except Exception as e:
            print(f"Unexpected error collecting audio arrays for WAV: {e}")
            raise HTTPException(status_code=500, detail=f"Error generating full WAV file: {str(e)}")

        if not all_audio_arrays:
            # No sentence produced audio; return a minimal valid empty WAV
            # rather than erroring out.
            print("No audio data generated, returning empty WAV.")
            wav_buffer = io.BytesIO()
            sf.write(wav_buffer, np.array([], dtype=np.int16), DEFAULT_SAMPLE_RATE, format='WAV', subtype='PCM_16')
            wav_buffer.seek(0)
            return StreamingResponse(wav_buffer, media_type="audio/wav")

        try:
            final_audio_array = np.concatenate(all_audio_arrays)
            print(f"Concatenated audio array shape: {final_audio_array.shape}")

            wav_buffer = io.BytesIO()
            sf.write(wav_buffer, final_audio_array, DEFAULT_SAMPLE_RATE, format='WAV', subtype='PCM_16')
            wav_buffer.seek(0)

            print("Returning complete WAV file.")
            return StreamingResponse(wav_buffer, media_type="audio/wav")
        except Exception as e:
            print(f"Error concatenating or writing WAV file: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to create final WAV file: {str(e)}")

    else:
        # Unreachable after the normalization above; kept as a defensive guard.
        raise HTTPException(status_code=400, detail=f"Unsupported response_format: '{request_data.response_format}'. Supported formats: 'pcm', 'wav'.")
Supported formats: 'pcm', 'wav'.") 232 | 233 | 234 | if __name__ == "__main__": 235 | parser = argparse.ArgumentParser() 236 | parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind the server to") 237 | parser.add_argument("--port", type=int, default=8080, help="Port to bind the server to") 238 | parser.add_argument("--url", type=str, default="http://localhost:8000", help="Triton server URL") 239 | parser.add_argument("--ref_audios_dir", type=str, default="./ref_audios", help="Path to reference audio files") 240 | parser.add_argument("--default_sample_rate", type=int, default=16000, help="Default sample rate") 241 | args = parser.parse_args() 242 | 243 | VOICE_CONFIG = register_voice(args.ref_audios_dir) 244 | TRITON_SERVER_URL = args.url 245 | DEFAULT_SAMPLE_RATE = args.default_sample_rate 246 | REF_AUDIO_BASE_PATH = args.ref_audios_dir 247 | args = parser.parse_args() 248 | 249 | print(f"Starting FastAPI server on {args.host}:{args.port}") 250 | print(f"Using Triton server at {TRITON_SERVER_URL}") 251 | voice_list = list(VOICE_CONFIG.keys()) 252 | if len(voice_list) == 0: 253 | raise ValueError("No voice found in the reference audio directory") 254 | print(f"Available voices: {voice_list}") 255 | 256 | uvicorn.run(app, host=args.host, port=args.port) --------------------------------------------------------------------------------