├── .github └── workflows │ └── build-and-push-to-ghcr.yml ├── .gitignore ├── LICENSE ├── README.md ├── demo.py ├── server ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.cuda121 ├── main.py ├── requirements.txt └── requirements_cpu.txt └── test ├── default_speaker.json ├── requirements.txt └── test_streaming.py /.github/workflows/build-and-push-to-ghcr.yml: -------------------------------------------------------------------------------- 1 | name: Build and push to GHCR 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | jobs: 7 | build-and-push-to-ghcr-cuda118: 8 | runs-on: ubuntu-22.04 9 | steps: 10 | - 11 | name: Checkout 12 | uses: actions/checkout@v3 13 | 14 | - 15 | name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v3 17 | 18 | - name: 'Login to GitHub Container Registry' 19 | run: | 20 | set -xe 21 | docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io 22 | 23 | - name: 'Remove cache' 24 | run: | 25 | sudo rm -rf /usr/share/dotnet 26 | sudo rm -rf /opt/ghc 27 | sudo rm -rf "/usr/local/share/boost" 28 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 29 | 30 | - name: Build only for PR Cuda 11.8 31 | if: github.ref != 'refs/heads/main' 32 | uses: docker/build-push-action@v5 33 | with: 34 | context: "{{defaultContext}}:server" 35 | file: Dockerfile 36 | push: false # Do not push image for PR 37 | cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }} 38 | cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }} 39 | 40 | - name: Build and Push image Cuda 11.8 41 | if: github.ref == 'refs/heads/main' 42 | uses: docker/build-push-action@v5 43 | with: 44 | context: "{{defaultContext}}:server" 45 | file: Dockerfile 46 | push: true # Push if merged 47 | cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest 48 | cache-to: 
type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest 49 | tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }} 50 | #build-args: 51 | 52 | build-and-push-to-ghcr-cuda121: 53 | runs-on: ubuntu-22.04 54 | steps: 55 | - 56 | name: Checkout 57 | uses: actions/checkout@v3 58 | 59 | - 60 | name: Set up Docker Buildx 61 | uses: docker/setup-buildx-action@v3 62 | 63 | - name: 'Login to GitHub Container Registry' 64 | run: | 65 | set -xe 66 | docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io 67 | 68 | - name: 'Remove cache' 69 | run: | 70 | sudo rm -rf /usr/share/dotnet 71 | sudo rm -rf /opt/ghc 72 | sudo rm -rf "/usr/local/share/boost" 73 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 74 | 75 | - name: Build only for PR cuda 12.1 76 | if: github.ref != 'refs/heads/main' 77 | uses: docker/build-push-action@v5 78 | with: 79 | context: "{{defaultContext}}:server" 80 | file: Dockerfile.cuda121 81 | push: false # Do not push image for PR 82 | cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }} 83 | cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }} 84 | 85 | - name: Build and Push image cuda 12.1 86 | if: github.ref == 'refs/heads/main' 87 | uses: docker/build-push-action@v5 88 | with: 89 | context: "{{defaultContext}}:server" 90 | file: Dockerfile.cuda121 91 | push: true # Push if merged 92 | cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121 93 | cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121 94 | tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-${{ github.sha }} 95 | #build-args: 96 | build-and-push-to-ghcr-cpu: 97 | runs-on: 
ubuntu-22.04 98 | steps: 99 | - 100 | name: Checkout 101 | uses: actions/checkout@v3 102 | 103 | - 104 | name: Set up Docker Buildx 105 | uses: docker/setup-buildx-action@v3 106 | 107 | - name: 'Login to GitHub Container Registry' 108 | run: | 109 | set -xe 110 | docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io 111 | 112 | - name: 'Remove cache' 113 | run: | 114 | sudo rm -rf /usr/share/dotnet 115 | sudo rm -rf /opt/ghc 116 | sudo rm -rf "/usr/local/share/boost" 117 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 118 | 119 | - name: Build only for PR CPU 120 | if: github.ref != 'refs/heads/main' 121 | uses: docker/build-push-action@v5 122 | with: 123 | context: "{{defaultContext}}:server" 124 | file: Dockerfile.cpu 125 | push: false # Do not push image for PR 126 | cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }} # PR cache ref must match the cpu cache-to ref below 127 | cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }} 128 | 129 | - name: Build and Push image CPU 130 | if: github.ref == 'refs/heads/main' 131 | uses: docker/build-push-action@v5 132 | with: 133 | context: "{{defaultContext}}:server" 134 | file: Dockerfile.cpu 135 | push: true # Push if merged 136 | cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu 137 | cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu 138 | tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cpu, ghcr.io/coqui-ai/xtts-streaming-server:main-cpu-${{ github.sha }} 139 | #build-args: 140 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | demo_outputs -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. 
"Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. 
Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. 
Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. 
Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. 
Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. 
However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XTTS streaming server 2 | *Warning: XTTS-streaming-server doesn't support concurrent streaming requests, it's a demo server, not meant for production.* 3 | 4 | https://github.com/coqui-ai/xtts-streaming-server/assets/17219561/7220442a-e88a-4288-8a73-608c4b39d06c 5 | 6 | 7 | ## 1) Run the server 8 | 9 | ### Use a pre-built image 10 | 11 | CUDA 12.1: 12 | 13 | ```bash 14 | $ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121 15 | ``` 16 | 17 | CUDA 11.8 (for older cards): 18 | 19 | ```bash 20 | $ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest 21 | ``` 22 | 23 | CPU (not recommended): 24 | 25 | ```bash 26 | $ docker run -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cpu 27 | ``` 28 | 29 | Run with a fine-tuned model: 30 | 31 | Make sure the model folder `/path/to/model/folder` contains the following files: 32 | - `config.json` 33 | - `model.pth` 34 | - `vocab.json` 35 | 36 | ```bash 37 | $ docker run -v /path/to/model/folder:/app/tts_models --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest 38 | ``` 39 | 40 | Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to 41 | the terms of the [CPML license](https://coqui.ai/cpml). (Fine-tuned XTTS models also are under the [CPML license](https://coqui.ai/cpml)) 42 | 43 | ### Build the image yourself 44 | 45 | To build the Docker container with PyTorch 2.1 and CUDA 11.8: 46 | 47 | `DOCKERFILE` may be `Dockerfile`, `Dockerfile.cpu`, `Dockerfile.cuda121`, or your own custom Dockerfile. 
48 | 49 | ```bash 50 | $ git clone git@github.com:coqui-ai/xtts-streaming-server.git 51 | $ cd xtts-streaming-server/server 52 | $ docker build -t xtts-stream . -f DOCKERFILE 53 | $ docker run --gpus all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 xtts-stream 54 | ``` 55 | 56 | Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to 57 | the terms of the [CPML license](https://coqui.ai/cpml). (Fine-tuned XTTS models also are under the [CPML license](https://coqui.ai/cpml)) 58 | 59 | ## 2) Testing the running server 60 | 61 | Once your Docker container is running, you can test that it's working properly. You will need to run the following code from a fresh terminal. 62 | 63 | ### Clone `xtts-streaming-server` if you haven't already 64 | 65 | ```bash 66 | $ git clone git@github.com:coqui-ai/xtts-streaming-server.git 67 | ``` 68 | 69 | ### Using the gradio demo 70 | 71 | ```bash 72 | $ cd xtts-streaming-server 73 | $ python -m pip install -r test/requirements.txt 74 | $ python demo.py 75 | ``` 76 | 77 | ### Using the test script 78 | 79 | ```bash 80 | $ cd xtts-streaming-server/test 81 | $ python -m pip install -r requirements.txt 82 | $ python test_streaming.py 83 | ``` 84 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import requests 3 | import base64 4 | import tempfile 5 | import json 6 | import os 7 | 8 | 9 | SERVER_URL = 'http://localhost:8000' 10 | OUTPUT = "./demo_outputs" 11 | cloned_speakers = {} 12 | 13 | print("Preparing file structure...") 14 | if not os.path.exists(OUTPUT): 15 | os.mkdir(OUTPUT) 16 | os.mkdir(os.path.join(OUTPUT, "cloned_speakers")) 17 | os.mkdir(os.path.join(OUTPUT, "generated_audios")) 18 | elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")): 19 | print("Loading existing cloned speakers...") 20 | for file in os.listdir(os.path.join(OUTPUT, 
"cloned_speakers")): 21 | if file.endswith(".json"): 22 | with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp: 23 | cloned_speakers[file[:-5]] = json.load(fp) 24 | print("Available cloned speakers:", ", ".join(cloned_speakers.keys())) 25 | 26 | try: 27 | print("Getting metadata from server ...") 28 | LANUGAGES = requests.get(SERVER_URL + "/languages").json() 29 | print("Available languages:", ", ".join(LANUGAGES)) 30 | STUDIO_SPEAKERS = requests.get(SERVER_URL + "/studio_speakers").json() 31 | print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys())) 32 | except: 33 | raise Exception("Please make sure the server is running first.") 34 | 35 | 36 | def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names): 37 | files = {"wav_file": ("reference.wav", open(upload_file, "rb"))} 38 | embeddings = requests.post(SERVER_URL + "/clone_speaker", files=files).json() 39 | with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp: 40 | json.dump(embeddings, fp) 41 | cloned_speakers[clone_speaker_name] = embeddings 42 | cloned_speaker_names.append(clone_speaker_name) 43 | return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown.update(choices=cloned_speaker_names) 44 | 45 | def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang): 46 | embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom] 47 | generated_audio = requests.post( 48 | SERVER_URL + "/tts", 49 | json={ 50 | "text": text, 51 | "language": lang, 52 | "speaker_embedding": embeddings["speaker_embedding"], 53 | "gpt_cond_latent": embeddings["gpt_cond_latent"] 54 | } 55 | ).content 56 | generated_audio_path = os.path.join("demo_outputs", "generated_audios", next(tempfile._get_candidate_names()) + ".wav") 57 | with open(generated_audio_path, "wb") as fp: 58 | fp.write(base64.b64decode(generated_audio)) 59 | return fp.name 60 | 61 | with gr.Blocks() 
as demo: 62 | cloned_speaker_names = gr.State(list(cloned_speakers.keys())) 63 | with gr.Tab("TTS"): 64 | with gr.Column() as row4: 65 | with gr.Row() as col4: 66 | speaker_name_studio = gr.Dropdown( 67 | label="Studio speaker", 68 | choices=STUDIO_SPEAKERS.keys(), 69 | value="Asya Anara" if "Asya Anara" in STUDIO_SPEAKERS.keys() else None, 70 | ) 71 | speaker_name_custom = gr.Dropdown( 72 | label="Cloned speaker", 73 | choices=cloned_speaker_names.value, 74 | value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None, 75 | ) 76 | speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio") 77 | with gr.Column() as col2: 78 | lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="en") 79 | text = gr.Textbox(label="text", value="A quick brown fox jumps over the lazy dog.") 80 | tts_button = gr.Button(value="TTS") 81 | with gr.Column() as col3: 82 | generated_audio = gr.Audio(label="Generated audio", autoplay=True) 83 | with gr.Tab("Clone a new speaker"): 84 | with gr.Column() as col1: 85 | upload_file = gr.Audio(label="Upload reference audio", type="filepath") 86 | clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker") 87 | clone_button = gr.Button(value="Clone speaker") 88 | 89 | clone_button.click( 90 | fn=clone_speaker, 91 | inputs=[upload_file, clone_speaker_name, cloned_speaker_names], 92 | outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom], 93 | ) 94 | 95 | tts_button.click( 96 | fn=tts, 97 | inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang], 98 | outputs=[generated_audio], 99 | ) 100 | 101 | if __name__ == "__main__": 102 | print("Warming up server...") 103 | with open("test/default_speaker.json", "r") as fp: 104 | warmup_speaker = json.load(fp) 105 | resp = requests.post( 106 | SERVER_URL + "/tts", 107 | json={ 108 | "text": "This is a warmup request.", 109 | "language": "en", 110 | "speaker_embedding": 
warmup_speaker["speaker_embedding"], 111 | "gpt_cond_latent": warmup_speaker["gpt_cond_latent"], 112 | } 113 | ) 114 | resp.raise_for_status() 115 | print("Starting the demo...") 116 | demo.launch( 117 | share=False, 118 | debug=False, 119 | server_port=3009, 120 | server_name="0.0.0.0", 121 | ) 122 | -------------------------------------------------------------------------------- /server/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel 2 | ARG DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends -y sox libsox-fmt-all curl wget gcc git git-lfs build-essential libaio-dev libsndfile1 ssh ffmpeg && \ 6 | apt-get clean && apt-get -y autoremove 7 | 8 | WORKDIR /app 9 | COPY requirements.txt . 10 | RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \ 11 | && python -m pip cache purge 12 | 13 | RUN python -m unidic download 14 | RUN mkdir -p /app/tts_models 15 | 16 | COPY main.py . 17 | ENV NVIDIA_DISABLE_REQUIRE=1 18 | 19 | ENV NUM_THREADS=2 20 | EXPOSE 80 21 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"] 22 | -------------------------------------------------------------------------------- /server/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM python:3.11.7 2 | ARG DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends -y sox libsox-fmt-all curl wget gcc git git-lfs build-essential libaio-dev libsndfile1 ssh ffmpeg && \ 6 | apt-get clean && apt-get -y autoremove 7 | 8 | WORKDIR /app 9 | COPY requirements_cpu.txt . 10 | RUN python -m pip install --use-deprecated=legacy-resolver -r requirements_cpu.txt \ 11 | && python -m pip cache purge 12 | 13 | RUN python -m unidic download 14 | RUN mkdir -p /app/tts_models 15 | 16 | COPY main.py . 
"""XTTS streaming server: FastAPI endpoints for speaker cloning and speech synthesis."""
import base64
import io
import os
import tempfile
import wave
import torch
import numpy as np
from typing import List
from pydantic import BaseModel

from fastapi import FastAPI, UploadFile, Body
from fastapi.responses import StreamingResponse

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager

# os.cpu_count() may return None; fall back to a single thread in that case.
torch.set_num_threads(int(os.environ.get("NUM_THREADS", os.cpu_count() or 1)))
device = torch.device("cuda" if os.environ.get("USE_CPU", "0") == "0" else "cpu")
# A torch.device never compares equal to a plain string, so inspect `.type`;
# comparing against "cuda" directly silently disabled this guard.
if device.type == "cuda" and not torch.cuda.is_available():
    raise RuntimeError("CUDA device unavailable, please use Dockerfile.cpu instead.")

custom_model_path = os.environ.get("CUSTOM_MODEL_PATH", "/app/tts_models")

if os.path.isfile(os.path.join(custom_model_path, "config.json")):
    model_path = custom_model_path
    print("Loading custom model from", model_path, flush=True)
else:
    print("Loading default model", flush=True)
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    print("Downloading XTTS Model:", model_name, flush=True)
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS Model downloaded", flush=True)

print("Loading XTTS", flush=True)
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
# DeepSpeed is requested only when running on a CUDA device.
model.load_checkpoint(
    config,
    checkpoint_dir=model_path,
    eval=True,
    use_deepspeed=device.type == "cuda",
)
model.to(device)
print("XTTS Loaded.", flush=True)

print("Running XTTS Server ...", flush=True)

##### Run fastapi #####
app = FastAPI(
    title="XTTS Streaming server",
    description="""XTTS Streaming server""",
    version="0.0.1",
    docs_url="/",
)


@app.post("/clone_speaker")
def predict_speaker(wav_file: UploadFile):
    """Compute conditioning inputs from reference audio file.

    The upload is written to a temporary file because
    ``model.get_conditioning_latents`` is given a file path. The file is
    closed before the model reads it and is removed afterwards.

    Returns:
        A dict with ``gpt_cond_latent`` and ``speaker_embedding`` converted
        to (nested) lists of half-precision values.
    """
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp.write(wav_file.file.read())
        temp_audio_name = temp.name
    try:
        with torch.inference_mode():
            gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
                temp_audio_name
            )
    finally:
        # Never leave uploaded audio behind, even when the model call fails.
        os.remove(temp_audio_name)
    return {
        "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
        "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
    }


def postprocess(wav):
    """Convert a waveform tensor (or list of tensors) to int16 PCM samples.

    A list is concatenated along dim 0 first. The result is moved to a CPU
    NumPy array, given a leading axis, clipped to [-1, 1] and scaled by 32767.
    """
    if isinstance(wav, list):
        wav = torch.cat(wav, dim=0)
    wav = wav.clone().detach().cpu().numpy()
    wav = wav[None, : int(wav.shape[0])]
    wav = np.clip(wav, -1, 1)
    wav = (wav * 32767).astype(np.int16)
    return wav


def encode_audio_common(
    frame_input, encode_base64=True, sample_rate=24000, sample_width=2, channels=1
):
    """Wrap raw PCM frames in a WAV container.

    Args:
        frame_input: Raw frame bytes written after the WAV header.
        encode_base64: When true return a base64 ``str``; otherwise return
            the raw WAV ``bytes``.
        sample_rate: Frame rate stored in the header.
        sample_width: Sample width in bytes stored in the header.
        channels: Channel count stored in the header.
    """
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)

    wav_buf.seek(0)
    if encode_base64:
        return base64.b64encode(wav_buf.getbuffer()).decode("utf-8")
    return wav_buf.read()


def _conditioning_tensors(parsed_input):
    """Build the (gpt_cond_latent, speaker_embedding) tensors from a request model."""
    # Flat embedding list -> shape (1, N, 1).
    speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
    # Latent rows are regrouped to width 1024 -> shape (1, T, 1024).
    gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
    return gpt_cond_latent, speaker_embedding


class StreamingInputs(BaseModel):
    """Request body for ``/tts_stream``."""

    speaker_embedding: List[float]
    gpt_cond_latent: List[List[float]]
    text: str
    language: str
    # When true, a header-only WAV is emitted before the first audio chunk.
    add_wav_header: bool = True
    # Sent as a string and converted with int() before use.
    stream_chunk_size: str = "20"


def predict_streaming_generator(parsed_input: StreamingInputs):
    """Yield synthesized audio for *parsed_input* as a stream of byte chunks.

    If ``add_wav_header`` is set, a WAV header with zero frames is yielded
    just before the first chunk; every chunk is then yielded as raw int16
    PCM bytes.
    """
    gpt_cond_latent, speaker_embedding = _conditioning_tensors(parsed_input)
    text = parsed_input.text
    language = parsed_input.language

    stream_chunk_size = int(parsed_input.stream_chunk_size)
    add_wav_header = parsed_input.add_wav_header

    chunks = model.inference_stream(
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
        stream_chunk_size=stream_chunk_size,
        enable_text_splitting=True,
    )

    for i, chunk in enumerate(chunks):
        chunk = postprocess(chunk)
        if i == 0 and add_wav_header:
            yield encode_audio_common(b"", encode_base64=False)
        yield chunk.tobytes()


@app.post("/tts_stream")
def predict_streaming_endpoint(parsed_input: StreamingInputs):
    """Stream synthesized speech back to the client as ``audio/wav``."""
    return StreamingResponse(
        predict_streaming_generator(parsed_input),
        media_type="audio/wav",
    )


class TTSInputs(BaseModel):
    """Request body for ``/tts``."""

    speaker_embedding: List[float]
    gpt_cond_latent: List[List[float]]
    text: str
    language: str


@app.post("/tts")
def predict_speech(parsed_input: TTSInputs):
    """Synthesize the whole utterance and return it as a base64-encoded WAV string."""
    gpt_cond_latent, speaker_embedding = _conditioning_tensors(parsed_input)
    text = parsed_input.text
    language = parsed_input.language

    out = model.inference(
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
    )

    wav = postprocess(torch.tensor(out["wav"]))

    return encode_audio_common(wav.tobytes())


@app.get("/studio_speakers")
def get_speakers():
    """Return the model's built-in speakers, keyed by name.

    Each value holds ``speaker_embedding`` and ``gpt_cond_latent`` as
    (nested) lists. An empty dict is returned when the model exposes no
    ``speaker_manager.speakers``.
    """
    if hasattr(model, "speaker_manager") and hasattr(model.speaker_manager, "speakers"):
        return {
            speaker: {
                "speaker_embedding": latents["speaker_embedding"].cpu().squeeze().half().tolist(),
                "gpt_cond_latent": latents["gpt_cond_latent"].cpu().squeeze().half().tolist(),
            }
            for speaker, latents in model.speaker_manager.speakers.items()
        }
    else:
        return {}


@app.get("/languages")
def get_languages():
    """Return the language list from the loaded model config."""
    return config.languages
"""Command-line client that streams speech from the XTTS server into ffplay/ffmpeg."""
import argparse
import json
import shutil
import subprocess
import sys
import time
from typing import Iterator

import requests


def is_installed(lib_name: str) -> bool:
    """Return True when an executable named *lib_name* is found on PATH."""
    return shutil.which(lib_name) is not None


def save(audio: bytes, filename: str) -> None:
    """Write *audio* to *filename* in binary mode."""
    with open(filename, "wb") as f:
        f.write(audio)


def stream_ffplay(audio_stream, output_file, save=True):
    """Pipe *audio_stream* into ffplay (playback) or ffmpeg (save to file).

    Args:
        audio_stream: Iterable of byte chunks; ``None`` items are skipped.
        output_file: Destination passed to ffmpeg when *save* is true.
        save: When true, encode to *output_file* with ffmpeg; otherwise play
            the stream with ffplay.
    """
    if not save:
        ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "1024", "-autoexit", "-"]
    else:
        print("Saving to ", output_file)
        ffplay_cmd = ["ffmpeg", "-probesize", "1024", "-i", "-", output_file]

    ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE)
    try:
        for chunk in audio_stream:
            if chunk is not None:
                ffplay_proc.stdin.write(chunk)
    finally:
        # Close the pipe and reap the child even if the stream raised,
        # so the player is not left waiting on an open stdin.
        try:
            ffplay_proc.stdin.close()
        finally:
            ffplay_proc.wait()


def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
    """POST a synthesis request to ``/tts_stream`` and yield the audio bytes.

    Timing information is printed to stderr. On a non-200 response the error
    body is printed and the process exits with status 1.
    """
    start = time.perf_counter()
    # Build a fresh payload instead of mutating the caller's speaker dict.
    payload = {
        **speaker,
        "text": text,
        "language": language,
        # you can reduce it to get faster response, but degrade quality
        "stream_chunk_size": stream_chunk_size,
    }
    res = requests.post(
        f"{server_url}/tts_stream",
        json=payload,
        stream=True,
    )
    end = time.perf_counter()
    print(f"Time to make POST: {end-start}s", file=sys.stderr)

    if res.status_code != 200:
        print("Error:", res.text)
        sys.exit(1)

    first = True
    for chunk in res.iter_content(chunk_size=512):
        if first:
            end = time.perf_counter()
            print(f"Time to first chunk: {end-start}s", file=sys.stderr)
            first = False
        if chunk:
            yield chunk

    print("⏱️ response.elapsed:", res.elapsed)


def get_speaker(ref_audio, server_url):
    """Upload *ref_audio* to ``/clone_speaker`` and return the decoded JSON reply.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Keep the file open only for the duration of the upload.
    with open(ref_audio, "rb") as audio_file:
        files = {"wav_file": ("reference.wav", audio_file)}
        response = requests.post(f"{server_url}/clone_speaker", files=files)
    response.raise_for_status()
    return response.json()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        default="It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
        help="text input for TTS"
    )
    parser.add_argument(
        "--language",
        default="en",
        help="Language to use default is 'en' (English)"
    )
    parser.add_argument(
        "--output_file",
        default=None,
        help="Save TTS output to given filename"
    )
    parser.add_argument(
        "--ref_file",
        default=None,
        help="Reference audio file to use, when not given will use default"
    )
    parser.add_argument(
        "--server_url",
        default="http://localhost:8000",
        help="Server url http://localhost:8000 default, change to your server location "
    )
    parser.add_argument(
        "--stream_chunk_size",
        default="20",
        help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality"
    )
    args = parser.parse_args()

    if args.ref_file is not None:
        print("Computing the latents for a new reference...")
        speaker = get_speaker(args.ref_file, args.server_url)
    else:
        # The bundled default speaker is only needed when no reference is given.
        with open("./default_speaker.json", "r") as file:
            speaker = json.load(file)

    stream_ffplay(
        tts(
            args.text,
            speaker,
            args.language,
            args.server_url,
            args.stream_chunk_size
        ),
        args.output_file,
        save=bool(args.output_file)
    )