├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── __init__.py ├── musicgen_hf_nodes.py ├── musicgen_nodes.py ├── requirements.txt ├── requirements_windows.txt ├── tacotron_nodes.py ├── tortoise_nodes.py ├── util.py ├── util_nodes.py └── valle_x_nodes.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "include/hifi-gan"] 2 | path = include/hifi-gan 3 | url = https://github.com/justinjohn0306/hifi-gan.git 4 | [submodule "include/tacotron2"] 5 | path = include/tacotron2 6 | url = https://github.com/justinjohn0306/TTS-TT2.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 
49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. 
"Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 
174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 
234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 
296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 
414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. 
The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-audio 2 | 3 | generative audio tools for ComfyUI. highly experimental—expect things to break and/or change frequently or not at all. 4 | 5 | **NOTE**: for the foreseeable future, i will be unable to continue working on this extension. please consider forking this repository! 6 | 7 | 8 | ## features 9 | - [tacotron2 text-to-speech](https://github.com/NVIDIA/tacotron2) 10 | - uses justinjohn0306's forks of [tacotron2](https://github.com/justinjohn0306/TTS-TT2/) and [hifi-gan](https://github.com/justinjohn0306/hifi-gan/) 11 | - [musicgen text-to-music + audiogen text-to-sound](https://facebookresearch.github.io/audiocraft/docs/MUSICGEN.html) 12 | - audiocraft and transformers implementations 13 | - supports audio continuation, unconditional generation 14 | - [tortoise text-to-speech](https://github.com/neonbjb/tortoise-tts) 15 | - [vall-e x text-to-speech](https://github.com/Plachtaa/VALL-E-X) 16 | - uses [korakoe's fork](https://github.com/korakoe/VALL-E-X) 17 | - [voicefixer](https://github.com/voicefixer/voicefixer) 18 | - audio utility nodes 19 | - save audio, convert audio 20 | 21 | ## installation 22 | ```shell 23 | # TORCH_CUDA_INDEX_URL=https://download.pytorch.org/whl/cu118 # for cuda 11.8 24 | TORCH_CUDA_INDEX_URL=https://download.pytorch.org/whl/cu121 # for cuda 12.1 25 | 26 | cd ComfyUI/custom_nodes 27 | git clone https://github.com/eigenpunk/ComfyUI-audio 28 | cd ComfyUI-audio 29 | 30 | # for linux 31 | pip install -r requirements.txt --extra-index-url $TORCH_CUDA_INDEX_URL 32 | 33 | # for windows 34 | pip install -r requirements_windows.txt --extra-index-url $TORCH_CUDA_INDEX_URL 35 | ``` 36 | 37 | this extension is developed and tested on a Linux-based OS. i've not yet been able to get the extension fully working on Windows, so 38 | expect some difficulty if that is your platform. i've not tested the extension on macOS at all. 
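note: the tacotron2 and hifi-gan nodes import code from the git submodules under `include/` (see `.gitmodules`), so they will likely fail to load if the clone above was made without submodules. a minimal sketch of the extra step, assuming you are still inside the `ComfyUI-audio` checkout:

```shell
# fetch the include/tacotron2 and include/hifi-gan submodules the tacotron nodes import from
git submodule update --init --recursive
```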
39 | 40 | ## would be nice to have maybe 41 | - audio uploads 42 | - audio previews 43 | - prompt weights for text-to-music/audio 44 | - stereo musicgen 45 | - multi-band diffusion 46 | - more/faster tts model support 47 | - [vits](https://huggingface.co/docs/transformers/model_doc/vits)? 48 | - ~~[tacotron2](https://github.com/NVIDIA/tacotron2)~~ 49 | - ~~[vall-e x](https://github.com/Plachtaa/VALL-E-X)~~ 50 | - ??? 51 | - split generator nodes by model stages 52 | - e.g. tortoise: 53 | - autoregressor 54 | - clvp/cvvp 55 | - spectrogram diffusion 56 | - e.g. musicgen: 57 | - t5 text encode 58 | - encodec audio encode 59 | - generate with decoder 60 | - more audio generation models 61 | - magnet, etc 62 | - demucs 63 | - ~~audiogen~~ 64 | 65 | 66 | *NOTE*: this work is solely a personal project; its development is not supported/sponsored by any past/present employer or any other external organization. 67 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import folder_paths 2 | 3 | from .util_nodes import ( 4 | NODE_CLASS_MAPPINGS as UTIL_NODE_CLASS_MAPPINGS, 5 | NODE_DISPLAY_NAME_MAPPINGS as UTIL_NODE_DISPLAY_NAME_MAPPINGS, 6 | ) 7 | 8 | 9 | try: 10 | from .musicgen_nodes import ( 11 | NODE_CLASS_MAPPINGS as MGAC_NODE_CLASS_MAPPINGS, 12 | NODE_DISPLAY_NAME_MAPPINGS as MGAC_NODE_DISPLAY_NAME_MAPPINGS, 13 | ) 14 | except Exception as e: 15 | print(f"WARNING: ComfyUI-audio failed to import musicgen nodes; reason: {e}") 16 | MGAC_NODE_CLASS_MAPPINGS = {} 17 | MGAC_NODE_DISPLAY_NAME_MAPPINGS = {} 18 | 19 | 20 | try: 21 | from .musicgen_hf_nodes import ( 22 | NODE_CLASS_MAPPINGS as MGHF_NODE_CLASS_MAPPINGS, 23 | NODE_DISPLAY_NAME_MAPPINGS as MGHF_NODE_DISPLAY_NAME_MAPPINGS, 24 | ) 25 | except Exception as e: 26 | print(f"WARNING: ComfyUI-audio failed to import musicgen_hf nodes; reason: {e}") 27 | MGHF_NODE_CLASS_MAPPINGS = {} 28 | MGHF_NODE_DISPLAY_NAME_MAPPINGS = {} 29 | 30 | 31 | try: 32 | from .tortoise_nodes import ( 33 | NODE_CLASS_MAPPINGS as TORTOISE_NODE_CLASS_MAPPINGS, 34 | NODE_DISPLAY_NAME_MAPPINGS as TORTOISE_NODE_DISPLAY_NAME_MAPPINGS, 35 | ) 36 | except Exception as e: 37 | print(f"WARNING: ComfyUI-audio failed to import tortoise nodes; reason: {e}") 38 | TORTOISE_NODE_CLASS_MAPPINGS = {} 39 | TORTOISE_NODE_DISPLAY_NAME_MAPPINGS = {} 40 | 41 | 42 | try: 43 | from .valle_x_nodes import ( 44 | NODE_CLASS_MAPPINGS as VEX_NODE_CLASS_MAPPINGS, 45 | NODE_DISPLAY_NAME_MAPPINGS as VEX_NODE_DISPLAY_MAPPINGS, 46 | ) 47 | except Exception as e: 48 | print(f"WARNING: ComfyUI-audio failed to import vall_e_x; reason: {e}") 49 | VEX_NODE_CLASS_MAPPINGS = {} 50 | VEX_NODE_DISPLAY_MAPPINGS = {} 51 | 52 | 53 | try: 54 | from .tacotron_nodes import ( 55 | NODE_CLASS_MAPPINGS as TT2_NODE_CLASS_MAPPINGS, 56 | NODE_DISPLAY_NAME_MAPPINGS as TT2_NODE_DISPLAY_NAME_MAPPINGS, 57 | ) 58 | except Exception as e: 59 | print(f"WARNING: ComfyUI-audio failed to import tacotron nodes; reason: {e}") 60 | TT2_NODE_CLASS_MAPPINGS = {} 61 | TT2_NODE_DISPLAY_NAME_MAPPINGS = {} 62 | 63 | 64 | NODE_CLASS_MAPPINGS = { 65 | **UTIL_NODE_CLASS_MAPPINGS, 66 | **MGAC_NODE_CLASS_MAPPINGS, 67 | **MGHF_NODE_CLASS_MAPPINGS, 68 | **TORTOISE_NODE_CLASS_MAPPINGS, 69 | **VEX_NODE_CLASS_MAPPINGS, 70 | **TT2_NODE_CLASS_MAPPINGS, 71 | } 72 | NODE_DISPLAY_NAME_MAPPINGS = { 73 | **UTIL_NODE_DISPLAY_NAME_MAPPINGS, 74 | **MGAC_NODE_DISPLAY_NAME_MAPPINGS, 75 | **MGHF_NODE_DISPLAY_NAME_MAPPINGS, 76 | 
**TORTOISE_NODE_DISPLAY_NAME_MAPPINGS, 77 | **VEX_NODE_DISPLAY_MAPPINGS, 78 | **TT2_NODE_DISPLAY_NAME_MAPPINGS, 79 | } 80 | -------------------------------------------------------------------------------- /musicgen_hf_nodes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Optional 3 | 4 | from transformers import MusicgenForConditionalGeneration, MusicgenProcessor 5 | 6 | from .util import do_cleanup, object_to, obj_on_device, on_device, tensors_to_cpu, tensors_to 7 | from .musicgen_nodes import MODEL_NAMES as _ACM_MODEL_NAMES 8 | 9 | 10 | # remove unsupported audiogen models from list 11 | MODEL_NAMES = [x for x in _ACM_MODEL_NAMES if "audiogen" not in x] 12 | 13 | 14 | class MusicgenHFLoader: 15 | def __init__(self): 16 | self.model = None 17 | self.processor = None 18 | 19 | @classmethod 20 | def INPUT_TYPES(cls): 21 | return {"required": {"model_name": (MODEL_NAMES,)}} 22 | 23 | RETURN_NAMES = ("musicgen_hf_model", "sample_rate") 24 | RETURN_TYPES = ("MUSICGEN_HF_MODEL", "INT") 25 | FUNCTION = "load" 26 | CATEGORY = "audio" 27 | 28 | def load(self, model_name: str): 29 | if self.model is not None: 30 | self.model = object_to(self.model, empty_cuda_cache=False) 31 | self.processor = object_to(self.processor, empty_cuda_cache=False) 32 | del self.model, self.processor 33 | do_cleanup() 34 | print("MusicgenHFLoader: unloaded model") 35 | 36 | print(f"MusicgenHFLoader: loading {model_name}") 37 | model_name = "facebook/" + model_name 38 | self.processor = MusicgenProcessor.from_pretrained(model_name) 39 | self.model = MusicgenForConditionalGeneration.from_pretrained(model_name) 40 | return (self.model, self.processor), self.model.config.audio_encoder.sampling_rate # also return the sample rate declared as the node's second output 41 | 42 | 43 | MILLISECONDS_PER_TOKEN = 20 # musicgen's decoder emits ~50 tokens per second of audio, i.e. one token per 20 ms 44 | 45 | 46 | class MusicgenHFGenerate: 47 | @classmethod 48 | def INPUT_TYPES(cls): 49 | return { 50 | "required": { 51 | "model": ("MUSICGEN_HF_MODEL",), 52 | "text": ("STRING", {"multiline": True, "default": ""}), 53 | "batch_size": ("INT", {"default": 1, "min": 1}), 54 | "duration": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 300.0, "step": 0.01}), 55 | "cfg": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), 56 | "top_k": ("INT", {"default": 0, "min": 0, "max": 10000, "step": 1}), 57 | "top_p": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}), 58 | "temperature": ("FLOAT", {"default": 1.0, "min": 0.001, "max": 10.0, "step": 0.001}), 59 | "seed": ("INT", {"default": 0, "min": 0}), 60 | }, 61 | "optional": {"audio": ("AUDIO",)}, 62 | } 63 | 64 | RETURN_TYPES = ("AUDIO",) 65 | FUNCTION = "generate" 66 | CATEGORY = "audio" 67 | 68 | def generate( 69 | self, 70 | model: tuple[MusicgenForConditionalGeneration, MusicgenProcessor], 71 | text: str = "", 72 | batch_size: int = 1, 73 | duration: float = 10.0, 74 | cfg: float = 1.0, 75 | top_k: int = 0, 76 | top_p: float = 1.0, 77 | temperature: float = 1.0, 78 | seed: int = 0, 79 | audio: Optional[dict] = None, # ComfyUI AUDIO dict with "waveform" and "sample_rate" keys, not a bare tensor 80 | ): 81 | device = "cuda" if torch.cuda.is_available() else "cpu" 82 | sr = model[0].config.audio_encoder.sampling_rate 83 | 84 | # empty string = unconditional generation 85 | if text == "": 86 | text = None 87 | 88 | max_new_tokens = int(duration * 1000.0 / MILLISECONDS_PER_TOKEN) 89 | 90 | with ( 91 | torch.random.fork_rng(), 92 | obj_on_device(model[1], dst=device, verbose_move=True) as p, 93 | on_device(model[0], dst=device) as m, 94 | ): 95 | torch.manual_seed(seed) 96 | 97 | # create conditioning inputs for models: using encodec for audio, t5 for
text 98 | if audio is not None or text is not None: 99 | text_input = [text] * batch_size if text is not None else text 100 | audio_input = ( 101 | [x.squeeze().numpy() for x in audio["waveform"]] if audio is not None else audio 102 | ) 103 | inputs = p( 104 | text=text_input, 105 | audio=audio_input, 106 | sampling_rate=sr, 107 | padding=True, 108 | return_tensors="pt", 109 | ) 110 | print(inputs) 111 | else: 112 | m: MusicgenForConditionalGeneration 113 | inputs = m.get_unconditional_inputs(batch_size) 114 | inputs.encoder_outputs = inputs.encoder_outputs[0] # wacky crap 115 | print(inputs) 116 | cfg = inputs.guidance_scale 117 | 118 | # move to device, remove redundant guidance scale 119 | inputs = dict(inputs) 120 | inputs = tensors_to(inputs, device) 121 | inputs.pop("guidance_scale", None) 122 | 123 | samples = m.generate( 124 | **inputs, 125 | max_new_tokens=max_new_tokens, 126 | temperature=temperature, 127 | top_k=top_k, 128 | top_p=top_p, 129 | guidance_scale=cfg 130 | ) 131 | inputs = tensors_to_cpu(inputs) 132 | del inputs 133 | 134 | samples = samples.unsqueeze(1) if samples.dim() == 2 else samples # dim() is a method call; add a channel axis when generate returns 2D output 135 | do_cleanup() 136 | 137 | return {"waveform": samples.cpu(), "sample_rate": sr}, 138 | 139 | 140 | # A dictionary that contains all nodes you want to export with their names 141 | # NOTE: names should be globally unique 142 | NODE_CLASS_MAPPINGS = { 143 | "MusicgenHFGenerate": MusicgenHFGenerate, 144 | "MusicgenHFLoader": MusicgenHFLoader, 145 | } 146 | 147 | # A dictionary that contains the friendly/humanly readable titles for the nodes 148 | NODE_DISPLAY_NAME_MAPPINGS = { 149 | "MusicgenHFGenerate": "Musicgen (HF) Generator", 150 | "MusicgenHFLoader": "Musicgen (HF) Loader", 151 | } 152 | -------------------------------------------------------------------------------- /musicgen_nodes.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | import torch 4 | from audiocraft.models import AudioGen, MusicGen 5 | 6 | from .util import do_cleanup, object_to, obj_on_device, tensors_to, tensors_to_cpu 7 | 8 | 9 | MODEL_NAMES = [ 10 | "musicgen-small", 11 | "musicgen-medium", 12 | "musicgen-melody", 13 | "musicgen-large", 14 | "musicgen-melody-large", 15 | "musicgen-stereo-small", 16 | "musicgen-stereo-medium", 17 | "musicgen-stereo-melody", 18 | "musicgen-stereo-large", 19 | "musicgen-stereo-melody-large", 20 | "audiogen-medium", 21 | ] 22 | 23 | 24 | class MusicgenLoader: 25 | def __init__(self): 26 | self.model = None 27 | self.name = None 28 | 29 | @classmethod 30 | def INPUT_TYPES(s): 31 | return {"required": {"model_name": (MODEL_NAMES,)}} 32 | 33 | RETURN_NAMES = ("musicgen_model", "sample_rate") 34 | RETURN_TYPES = ("MUSICGEN_MODEL", "INT") 35 | FUNCTION = "load" 36 | CATEGORY = "audio" 37 | 38 | def load(self, model_name: str): 39 | self.unload() 40 | 41 | print(f"MusicgenLoader: loading {model_name}") 42 | 43 | self.name = "facebook/" + model_name 44 | model_class = AudioGen if "audiogen" in self.name else MusicGen 45 | 46 | self.model = model_class.get_pretrained(self.name) 47 | sr = self.model.sample_rate 48 | return self.model, sr 49 | 50 | def unload(self): 51 | if self.model is not None: 52 | # force move to cpu, delete/collect, clear cache 53 | self.model = object_to(self.model, empty_cuda_cache=False) 54 | del self.model 55 | do_cleanup() 56 | print("MusicgenLoader: unloaded model") 57 | 58 | 59 | class MusicgenGenerate: 60 | @classmethod 61 | def 
INPUT_TYPES(s): 62 | return { 63 | "required": { 64 | "model": ("MUSICGEN_MODEL",), 65 | "text": ("STRING", {"default": "", "multiline": True}), 66 | "batch_size": ("INT", {"default": 1, "min": 1}), 67 | "duration": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 300.0, "step": 0.01}), 68 | "cfg": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), 69 | "top_k": ("INT", {"default": 250, "min": 0, "max": 10000, "step": 1}), 70 | "top_p": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), 71 | "temperature": ("FLOAT", {"default": 1.0, "min": 0.001, "step": 0.001}), 72 | "seed": ("INT", {"default": 0, "min": 0}), 73 | }, 74 | "optional": {"audio": ("AUDIO",)}, 75 | } 76 | 77 | RETURN_TYPES = ("AUDIO",) 78 | FUNCTION = "generate" 79 | CATEGORY = "audio" 80 | 81 | def generate( 82 | self, 83 | model: Union[AudioGen, MusicGen], 84 | text: str = "", 85 | batch_size: int = 1, 86 | duration: float = 10.0, 87 | cfg: float = 1.0, 88 | top_k: int = 250, 89 | top_p: float = 0.0, 90 | temperature: float = 1.0, 91 | seed: int = 0, 92 | audio = None, 93 | ): 94 | device = "cuda" if torch.cuda.is_available() else "cpu" 95 | # empty string = unconditional generation 96 | if text == "": 97 | text = None 98 | 99 | model.set_generation_params( 100 | top_k=top_k, 101 | top_p=top_p, 102 | temperature=temperature, 103 | duration=duration, 104 | cfg_coef=cfg, 105 | ) 106 | with torch.random.fork_rng(), obj_on_device(model, dst=device, verbose_move=True) as m: 107 | torch.manual_seed(seed) 108 | text_input = [text] * batch_size 109 | if audio is not None: 110 | # do continuation with input audio and (optional) text prompting 111 | audio_in = audio["waveform"] 112 | 113 | if audio_in.shape[0] < batch_size: 114 | # (try to) expand batch if smaller than requested 115 | audio_in = audio_in.expand(batch_size, -1, -1) 116 | elif audio_in.shape[0] > batch_size: 117 | # truncate batch if larger than requested 118 | audio_in = audio_in[:batch_size] 119 | 120 | audio_input = tensors_to(audio_in, device) 121 | audio_out = m.generate_continuation(audio_input, model.sample_rate, text_input, progress=True) 122 | elif text is not None: 123 | # do text-to-music 124 | audio_out = m.generate(text_input, progress=True) 125 | else: 126 | # do unconditional music generation 127 | audio_out = m.generate_unconditional(batch_size, progress=True) 128 | 129 | audio_out = tensors_to_cpu(audio_out) 130 | 131 | do_cleanup() 132 | return {"waveform": audio_out, "sample_rate": model.sample_rate}, 133 | 134 | 135 | NODE_CLASS_MAPPINGS = { 136 | "MusicgenGenerate": MusicgenGenerate, 137 | "MusicgenLoader": MusicgenLoader, 138 | } 139 | 140 | NODE_DISPLAY_NAME_MAPPINGS = { 141 | "MusicgenGenerate": "Musicgen Generator", 142 | "MusicgenLoader": "Musicgen Loader", 143 | } 144 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft 2 | git+https://github.com/korakoe/VALL-E-X#egg=vall_e_x 3 | tortoise-tts @ https://github.com/rsxdalv/tortoise-tts/releases/download/v3.0.1/tortoise_tts-3.0.1-py3-none-any.whl 4 | voicefixer 5 | deepspeed 6 | resampy -------------------------------------------------------------------------------- /requirements_windows.txt: -------------------------------------------------------------------------------- 1 | git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft 2 | 
git+https://github.com/korakoe/VALL-E-X#egg=vall_e_x 3 | git+https://github.com/neonbjb/tortoise-tts 4 | voicefixer 5 | resampy -------------------------------------------------------------------------------- /tacotron_nodes.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | from glob import glob 5 | 6 | import torch 7 | 8 | 9 | base_incl_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "include") 10 | 11 | sys.path = [ 12 | os.path.join(base_incl_path, "hifi-gan"), 13 | ] + sys.path 14 | 15 | from denoiser import Denoiser as HifiGANDenoiser 16 | from env import AttrDict 17 | from meldataset import mel_spectrogram, MAX_WAV_VALUE 18 | from models import Generator as HifiGAN 19 | 20 | sys.path = [ 21 | os.path.join(base_incl_path, "tacotron2"), 22 | os.path.join(base_incl_path, "tacotron2", "waveglow"), 23 | ] + sys.path 24 | 25 | from hparams import create_hparams 26 | from model import Tacotron2 27 | from train import load_model 28 | from text import text_to_sequence 29 | from denoiser import Denoiser as WaveGlowDenoiser 30 | 31 | 32 | from .util import do_cleanup, get_device, models_dir, object_to, obj_on_device 33 | 34 | BIGINT = 2 ** 32 35 | 36 | MODELS_PATH = os.path.join(models_dir, "tacotron2") 37 | WAVEGLOW_MODELS_PATH = os.path.join(models_dir, "waveglow") 38 | HIFIGAN_MODELS_PATH = os.path.join(models_dir, "hifigan") 39 | os.makedirs(MODELS_PATH, exist_ok=True) 40 | os.makedirs(WAVEGLOW_MODELS_PATH, exist_ok=True) 41 | os.makedirs(HIFIGAN_MODELS_PATH, exist_ok=True) 42 | 43 | MODELS = { 44 | x.removeprefix(MODELS_PATH)[1:]: x 45 | for x in sorted(glob(os.path.join(MODELS_PATH, "*.pt"))) 46 | } 47 | WAVEGLOW_MODELS = { 48 | x.removeprefix(WAVEGLOW_MODELS_PATH)[1:]: x 49 | for x in sorted(glob(os.path.join(WAVEGLOW_MODELS_PATH, "*"))) 50 | } 51 | HIFIGAN_MODELS = { 52 | x.removeprefix(HIFIGAN_MODELS_PATH)[1:]: x 53 | for x in sorted(glob(os.path.join(HIFIGAN_MODELS_PATH, "*"))) 54 | } 55 | HIFIGAN_CONFIGS = { 56 | os.path.basename(x): x 57 | for x in glob(os.path.join(base_incl_path, "hifi-gan", "config_*.json")) 58 | } 59 | 60 | 61 | class Tacotron2Loader: 62 | """ 63 | loads a Tacotron2 model 64 | """ 65 | def __init__(self): 66 | self.model = None 67 | 68 | @classmethod 69 | def INPUT_TYPES(cls): 70 | return { 71 | "required": {"model_name": (list(MODELS.keys()),),} 72 | } 73 | 74 | RETURN_NAMES = ("tt2_model", "sample_rate") 75 | RETURN_TYPES = ("TT2_MODEL", "INT") 76 | FUNCTION = "load" 77 | CATEGORY = "audio" 78 | 79 | def load(self, model_name): 80 | if self.model is not None: 81 | self.model = object_to(self.model, empty_cuda_cache=False) 82 | del self.model 83 | do_cleanup() 84 | print("Tacotron2Loader: unloaded model") 85 | 86 | print("Tacotron2Loader: loading model") 87 | hparams = create_hparams() 88 | hparams.sampling_rate = 22050 89 | path = MODELS[model_name] 90 | 91 | self.model = load_model(hparams) 92 | sd = torch.load(path, map_location="cpu")["state_dict"] 93 | self.model.load_state_dict(sd) 94 | self.model.device = "cpu" 95 | self.model.eval().half() 96 | 97 | return self.model, hparams.sampling_rate, 98 | 99 | 100 | class WaveGlowLoader: 101 | """ 102 | loads a WaveGlow model 103 | """ 104 | def __init__(self): 105 | self.model = None 106 | self.denoiser = None 107 | 108 | @classmethod 109 | def INPUT_TYPES(cls): 110 | return {"required": {"model_name": (list(WAVEGLOW_MODELS.keys()),),}} 111 | 112 | RETURN_TYPES = ("WAVEGLOW_MODEL",) 113 | FUNCTION = "load" 114 | 
CATEGORY = "audio" 115 | 116 | def load(self, model_name): 117 | if self.model is not None: 118 | self.model = object_to(self.model, empty_cuda_cache=False) 119 | self.denoiser = object_to(self.denoiser, empty_cuda_cache=False) 120 | del self.model, self.denoiser 121 | do_cleanup() 122 | print("WaveGlowLoader: unloaded model") 123 | 124 | print("WaveGlowLoader: loading model") 125 | path = WAVEGLOW_MODELS[model_name] 126 | 127 | self.model = torch.load(path, map_location="cpu")["model"] 128 | self.model.eval().half() 129 | for k in self.model.convinv: 130 | k.float() 131 | self.denoiser = WaveGlowDenoiser(self.model) 132 | 133 | return (self.model, self.denoiser), 134 | 135 | 136 | class HifiGANLoader: 137 | """ 138 | loads a HifiGAN model 139 | """ 140 | def __init__(self): 141 | self.model = None 142 | self.denoiser = None 143 | 144 | @classmethod 145 | def INPUT_TYPES(cls): 146 | return { 147 | "required": { 148 | "model_name": (list(HIFIGAN_MODELS.keys()),), 149 | "config": (list(HIFIGAN_CONFIGS.keys()),), 150 | } 151 | } 152 | 153 | RETURN_TYPES = ("HIFIGAN_MODEL",) 154 | FUNCTION = "load" 155 | CATEGORY = "audio" 156 | 157 | def load(self, model_name, config): 158 | if self.model is not None: 159 | self.model = object_to(self.model, empty_cuda_cache=False) 160 | self.denoiser = object_to(self.denoiser, empty_cuda_cache=False) 161 | del self.model, self.denoiser 162 | do_cleanup() 163 | print("HifiGANLoader: unloaded model") 164 | 165 | print("HifiGANLoader: loading model") 166 | 167 | with open(HIFIGAN_CONFIGS[config], "r") as f: 168 | cfg = AttrDict(json.load(f)) 169 | 170 | path = HIFIGAN_MODELS[model_name] 171 | 172 | # model insists on choosing device itself 173 | device = HifiGANDenoiser.device 174 | self.model = HifiGAN(cfg).to(device) 175 | 176 | sd = torch.load(path, map_location=device)["generator"] 177 | self.model.load_state_dict(sd) 178 | self.model.eval() 179 | self.model.remove_weight_norm() 180 | 181 | self.denoiser = HifiGANDenoiser(self.model, mode="normal") 182 | 183 | self.model.cpu() 184 | self.denoiser.cpu() 185 | self.model.device = "cpu" 186 | self.denoiser.device = "cpu" 187 | 188 | return (self.model, self.denoiser, cfg), 189 | 190 | 191 | class Tacotron2Generate: 192 | """ 193 | generates speech mels from text using Tacotron2 194 | """ 195 | @classmethod 196 | def INPUT_TYPES(cls): 197 | return { 198 | "required": { 199 | "model": ("TT2_MODEL",), 200 | "text": ("STRING", {"default": "hello world", "multiline": True}), 201 | "seed": ("INT", {"default": 0, "min": 0}), 202 | }, 203 | } 204 | 205 | RETURN_NAMES = ("mel_outputs", "postnet_outputs") 206 | RETURN_TYPES = ("MELS", "MELS") 207 | FUNCTION = "generate" 208 | CATEGORY = "audio" 209 | 210 | def generate( 211 | self, 212 | model: Tacotron2, 213 | text: str = "", 214 | seed: int = 0, 215 | ): 216 | device = get_device() 217 | 218 | sequence = text_to_sequence(text, ['basic_cleaners']) 219 | 220 | with ( 221 | torch.no_grad(), 222 | torch.random.fork_rng(), 223 | obj_on_device(model, dst=device, verbose_move=True) as m 224 | ): 225 | prev_device = m.device 226 | m.device = device 227 | torch.manual_seed(seed) 228 | sequence = torch.tensor(sequence, dtype=torch.long).unsqueeze(0).to(device) 229 | mel_outputs, mel_outputs_postnet, *_ = m.inference(sequence) 230 | m.device = prev_device 231 | 232 | do_cleanup() 233 | return mel_outputs, mel_outputs_postnet 234 | 235 | 236 | class WaveGlowApply: 237 | @classmethod 238 | def INPUT_TYPES(cls): 239 | return { 240 | "required": { 241 | "mels": ("MELS",), 242 | 
"model": ("WAVEGLOW_MODEL",), 243 | "sigma": ("FLOAT", {"default": 1.0, "min": 0.0}), 244 | "denoiser_strength": ("FLOAT", {"default": 0.06, "min": 0}), 245 | }, 246 | } 247 | 248 | RETURN_TYPES = ("AUDIO",) 249 | FUNCTION = "apply" 250 | CATEGORY = "audio" 251 | 252 | def apply( 253 | self, 254 | mels, 255 | model, 256 | sigma: float = 1.0, 257 | denoiser_strength: float = 0.06, 258 | ): 259 | device = get_device() 260 | waveglow, denoiser = model 261 | 262 | with ( 263 | torch.no_grad(), 264 | torch.random.fork_rng(), 265 | obj_on_device(waveglow, dst=device, verbose_move=True) as wg, 266 | obj_on_device(denoiser, dst=device, verbose_move=True) as dn, 267 | ): 268 | prev_device = wg.device 269 | wg.device = dn.device = device 270 | 271 | mels = mels.to(device) 272 | audio = wg.infer(mels, sigma=sigma) 273 | mels.cpu() 274 | 275 | if denoiser_strength != 0.0: 276 | audio = dn(audio, denoiser_strength=denoiser_strength) 277 | audio = audio.cpu().unbind(0) 278 | wg.device = dn.device = prev_device 279 | 280 | do_cleanup() 281 | return {"waveform": audio, "sample_rate": 22050}, # TODO: don't hardcode this 282 | 283 | 284 | class HifiGANApply: 285 | @classmethod 286 | def INPUT_TYPES(cls): 287 | return { 288 | "required": { 289 | "mels": ("MELS",), 290 | "model": ("HIFIGAN_MODEL",), 291 | "denoiser_strength": ("FLOAT", {"default": 0.06, "min": 0.0, "step": 0.001}), 292 | }, 293 | } 294 | 295 | RETURN_TYPES = ("AUDIO",) 296 | FUNCTION = "apply" 297 | CATEGORY = "audio" 298 | 299 | def apply(self, mels, model, denoiser_strength: float = 0.06): 300 | device = get_device() 301 | hifigan, denoiser, cfg = model 302 | 303 | with ( 304 | torch.no_grad(), 305 | torch.random.fork_rng(), 306 | obj_on_device(hifigan, dst=device, verbose_move=True) as hg, 307 | obj_on_device(denoiser, dst=device, verbose_move=True) as dn, 308 | ): 309 | prev_device = hg.device 310 | hg.device = dn.device = device 311 | 312 | mels = mels.to(device) 313 | audio = hg(mels.float()) 314 | mels.cpu() 315 | 316 | if denoiser_strength != 0.0: 317 | audio *= MAX_WAV_VALUE 318 | audio = dn(audio.squeeze(1), denoiser_strength) 319 | audio /= MAX_WAV_VALUE 320 | 321 | audio = audio.cpu() 322 | hg.device = dn.device = prev_device 323 | 324 | do_cleanup() 325 | return {"waveform": audio, "sample_rate": cfg.sample_rate}, 326 | 327 | 328 | class ToMelSpectrogram: 329 | @classmethod 330 | def INPUT_TYPES(cls): 331 | return { 332 | "required": { 333 | "audio": ("AUDIO",), 334 | "n_fft": ("INT", {"default": 1024, "min": 1, "max": BIGINT}), 335 | "n_mels": ("INT", {"default": 80, "min": 1}), 336 | "hop_len": ("INT", {"default": 256, "min": 1, "max": BIGINT}), 337 | "win_len": ("INT", {"default": 1024, "min":1, "max": BIGINT}), 338 | "fmin": ("INT", {"default": 0, "min": 0, "max": BIGINT}), 339 | "fmax": ("INT", {"default": 8000, "min": 0, "max": BIGINT}), 340 | }, 341 | } 342 | 343 | RETURN_TYPES = ("MELS",) 344 | FUNCTION = "apply" 345 | CATEGORY = "audio" 346 | 347 | def apply(self, audio, n_fft: int, n_mels: int, hop_len: int, win_len: int, fmin: int, fmax: int): 348 | sample_rate = audio["sample_rate"] 349 | with torch.no_grad(): 350 | mels = [mel_spectrogram(clip, n_fft, n_mels, sample_rate, hop_len, win_len, fmin, fmax) for clip in audio["waveform"].unbind(0)] 351 | mels = torch.cat(mels, 0) 352 | 353 | do_cleanup() 354 | return mels, 355 | 356 | 357 | class HifiGANModelParams: 358 | @classmethod 359 | def INPUT_TYPES(cls): 360 | return { 361 | "required": {"model": ("HIFIGAN_MODEL",)}, 362 | } 363 | 364 | RETURN_NAMES = ("sr", 
"n_mels", "n_fft", "hop_len", "win_len", "fmin", "fmax") 365 | RETURN_TYPES = ("INT", "INT", "INT", "INT", "INT", "INT", "INT") 366 | FUNCTION = "get" 367 | CATEGORY = "audio" 368 | 369 | def get(self, model): 370 | *_, cfg = model 371 | return cfg.sampling_rate, cfg.num_mels, cfg.n_fft, cfg.hop_size, cfg.win_size, cfg.fmin, cfg.fmax 372 | 373 | 374 | NODE_CLASS_MAPPINGS = { 375 | "Tacotron2Loader": Tacotron2Loader, 376 | "Tacotron2Generate": Tacotron2Generate, 377 | "HifiGANLoader": HifiGANLoader, 378 | "HifiGANModelParams": HifiGANModelParams, 379 | "HifiGANApply": HifiGANApply, 380 | "WaveGlowLoader": WaveGlowLoader, 381 | "WaveGlowApply": WaveGlowApply, 382 | "ToMelSpectrogram": ToMelSpectrogram, 383 | } 384 | 385 | NODE_DISPLAY_NAME_MAPPINGS = { 386 | "Tacotron2Loader": "Tacotron2 Loader", 387 | "Tacotron2Generate": "Tacotron2 Generator", 388 | "HifiGANLoader": "HifiGAN Loader", 389 | "HifiGANModelParams": "Get HifiGAN Model Parameters", 390 | "HifiGANApply": "Apply HifiGAN", 391 | "WaveGlowLoader": "WaveGlow Loader", 392 | "WaveGlowApply": "Apply WaveGlow", 393 | "ToMelSpectrogram": "Audio to Mel Spectrogram", 394 | } 395 | -------------------------------------------------------------------------------- /tortoise_nodes.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from tortoise.api import TextToSpeech, pick_best_batch_size_for_gpu 6 | from tortoise.api_fast import TextToSpeech as FastTextToSpeech 7 | from tortoise.models.cvvp import CVVP 8 | from tortoise.utils.audio import get_voices, load_voice 9 | 10 | from .util import do_cleanup, get_device, models_dir, object_to, obj_on_device 11 | 12 | 13 | MODELS_PATH = os.path.join(models_dir, "tortoise") 14 | VOICES_PATH = os.path.join(MODELS_PATH, "voices") 15 | os.makedirs(VOICES_PATH, exist_ok=True) 16 | 17 | VOICES = get_voices(extra_voice_dirs=[VOICES_PATH]) 18 | 19 | 20 | def _load_cvvp(self): 21 | from urllib.request import urlretrieve 22 | from tortoise.api import MODELS 23 | self.cvvp = CVVP( 24 | model_dim=512, 25 | transformer_heads=8, 26 | dropout=0, 27 | mel_codes=8192, 28 | conditioning_enc_depth=8, 29 | cond_mask_percentage=0, 30 | speech_enc_depth=8, 31 | speech_mask_percentage=0, 32 | latent_multiplier=1, 33 | ) 34 | self.cvvp.eval() 35 | ckpt_path = os.path.join(MODELS_PATH, "cvvp.pth") 36 | if not os.path.exists(ckpt_path): 37 | urlretrieve(MODELS["cvvp.pth"], ckpt_path) 38 | cvvp_sd = torch.load(ckpt_path, map_location="cpu") 39 | self.cvvp.load_state_dict(cvvp_sd) 40 | 41 | 42 | class TextToSpeech(TextToSpeech): 43 | load_cvvp = _load_cvvp 44 | 45 | 46 | class FastTextToSpeech(FastTextToSpeech): 47 | load_cvvp = _load_cvvp 48 | def tts( 49 | self, text, voice_samples=None, k=1, verbose=True, use_deterministic_seed=None, 50 | # autoregressive generation parameters follow 51 | num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, 52 | top_p=.8, max_mel_tokens=500, 53 | # CVVP parameters follow 54 | cvvp_amount=.0, 55 | **hf_generate_kwargs, 56 | ): 57 | """function adapted from the original tortoise implementation by neonbjb.""" 58 | self.deterministic_state(seed=use_deterministic_seed) 59 | 60 | text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device) 61 | text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary. 62 | assert text_tokens.shape[-1] < 400, 'Too much text provided. 
Break the text up into separate segments and re-try inference.' 63 | if voice_samples is not None: 64 | auto_conditioning = self.get_conditioning_latents(voice_samples, return_mels=False) 65 | else: 66 | auto_conditioning = self.get_random_conditioning_latents() 67 | auto_conditioning = auto_conditioning.to(self.device) 68 | 69 | with torch.no_grad(): 70 | if verbose: 71 | print("Generating autoregressive samples..") 72 | with torch.autocast( 73 | device_type="cuda" , dtype=torch.float16, enabled=self.half 74 | ): 75 | codes = self.autoregressive.inference_speech( 76 | auto_conditioning, 77 | text_tokens, 78 | top_k=num_autoregressive_samples, 79 | top_p=top_p, 80 | temperature=temperature, 81 | do_sample=True, 82 | num_beams=1, 83 | num_return_sequences=1, 84 | length_penalty=float(length_penalty), 85 | repetition_penalty=float(repetition_penalty), 86 | output_attentions=False, 87 | output_hidden_states=True, 88 | **hf_generate_kwargs, 89 | ) 90 | gpt_latents = self.autoregressive( 91 | auto_conditioning.repeat(k, 1), 92 | text_tokens.repeat(k, 1), 93 | torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), 94 | codes.repeat(k, 1), 95 | torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device), 96 | return_latent=True, 97 | clip_inputs=False 98 | ) 99 | if verbose: 100 | print("generating audio..") 101 | wav_gen = self.hifi_decoder.inference(gpt_latents.to(self.device), auto_conditioning) 102 | return wav_gen.cpu() 103 | 104 | 105 | class TortoiseTTSLoader: 106 | """ 107 | loads the Tortoise TTS "model", which is actually just the tortoise tts api 108 | """ 109 | def __init__(self): 110 | self.model = None 111 | 112 | @classmethod 113 | def INPUT_TYPES(cls): 114 | return { 115 | "required": { 116 | "kv_cache": ("BOOLEAN", {"default": True}), 117 | "half": ("BOOLEAN", {"default": False}), 118 | "use_deepspeed": ("BOOLEAN", {"default": False}), 119 | "use_fast_api": ("BOOLEAN", {"default": False}), 120 | } 121 | } 122 | 123 | RETURN_NAMES = ("tortoise_tts_model", "sample_rate") 124 | RETURN_TYPES = ("TORTOISE_TTS", "INT") 125 | FUNCTION = "load" 126 | CATEGORY = "audio" 127 | 128 | def load(self, kv_cache=True, half=False, use_deepspeed=False, use_fast_api=False): 129 | if self.model is not None: 130 | self.model = object_to(self.model, empty_cuda_cache=False) 131 | del self.model 132 | do_cleanup() 133 | print("TortoiseTTSLoader: unloaded model") 134 | 135 | print("TortoiseTTSLoader: loading model") 136 | if use_fast_api: 137 | print( 138 | "TortoiseTTSLoader: using fast api; please note that diffusion, CLVP, and CVVP controls will " 139 | "not be used, num_autoregressive_samples is fixed to 50, and max_mel_tokens will be ignored." 140 | ) 141 | ctor = FastTextToSpeech if use_fast_api else TextToSpeech 142 | self.model = ctor( 143 | models_dir=MODELS_PATH, 144 | half=half, 145 | kv_cache=kv_cache, 146 | use_deepspeed=use_deepspeed, 147 | ) 148 | 149 | return self.model, 24000 150 | 151 | 152 | class TortoiseTTSGenerate: 153 | """ 154 | generates speech from text using tortoise. custom voices are supported; just add short clips of speech to a 155 | subdirectory of "ComfyUI/models/tortoise/voices". 
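

    a rough usage sketch outside of a ComfyUI graph (illustrative only; assumes the
    tortoise weights are already present under "ComfyUI/models/tortoise"):

        model, sample_rate = TortoiseTTSLoader().load(kv_cache=True)
        audio, = TortoiseTTSGenerate().generate(model, text="hello world", voice="random")
        # audio is {"waveform": <batch x channels x samples tensor>, "sample_rate": 24000}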
156 | """ 157 | @classmethod 158 | def INPUT_TYPES(cls): 159 | return { 160 | "required": { 161 | "model": ("TORTOISE_TTS",), 162 | "voice": (["random", *list(VOICES.keys())],), 163 | "text": ("STRING", {"default": "hello world", "multiline": True}), 164 | "batch_size": ("INT", {"default": 1, "min": 1}), 165 | "num_autoregressive_samples": ("INT", {"default": 20, "min": 0, "max": 10000, "step": 1}), 166 | "autoregressive_batch_size": ("INT", {"default": 0, "min": 0, "max": 1024, "step": 1}), 167 | "temperature": ("FLOAT", {"default": 0.8, "min": 0.001, "step": 0.001}), 168 | "length_penalty": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.001}), 169 | "repetition_penalty": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 10.0, "step": 0.001}), 170 | "top_p": ("FLOAT", {"default": 0.8, "min": 0.001, "max": 1.0, "step": 0.001}), 171 | "max_mel_tokens": ("INT", {"default": 500, "min": 1, "max": 600, "step": 1}), 172 | "cvvp_amount": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), 173 | "diffusion_steps": ("INT", {"default": 20, "min": 0, "max": 4000}), 174 | "cond_free": ("BOOLEAN", {"default": True}), 175 | "cond_free_k": ("FLOAT", {"default": 2.0, "min": 0.0, "step": 0.01}), 176 | "diffusion_temperature": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}), 177 | "seed": ("INT", {"default": 0, "min": 0}), 178 | }, 179 | } 180 | 181 | RETURN_TYPES = ("AUDIO",) 182 | FUNCTION = "generate" 183 | CATEGORY = "audio" 184 | 185 | def generate( 186 | self, 187 | model: TextToSpeech, 188 | text: str = "", 189 | voice: str = "random", 190 | batch_size: int = 1, 191 | num_autoregressive_samples: int = 80, 192 | autoregressive_batch_size: int = 8, 193 | temperature: float = 1.0, 194 | length_penalty: float = 1.0, 195 | repetition_penalty: float = 2.0, 196 | top_p: float = 1.0, 197 | max_mel_tokens: int = 500, 198 | cvvp_amount: float = 0.0, 199 | diffusion_steps: int = 100, 200 | cond_free: bool = False, 201 | cond_free_k: float = 0.0, 202 | diffusion_temperature: float = 1.0, 203 | seed: int = 0, 204 | ): 205 | device = get_device() 206 | voice_samples, voice_latents = load_voice(voice, extra_voice_dirs=[VOICES_PATH]) 207 | 208 | if autoregressive_batch_size == 0: 209 | autoregressive_batch_size = pick_best_batch_size_for_gpu() 210 | 211 | model.autoregressive_batch_size = autoregressive_batch_size 212 | 213 | diffusion_kwargs = { 214 | "diffusion_iterations": diffusion_steps, 215 | "cond_free": cond_free, 216 | "cond_free_k": cond_free_k, 217 | "diffusion_temperature": diffusion_temperature, 218 | } if not isinstance(model, FastTextToSpeech) else {} 219 | 220 | with ( 221 | torch.random.fork_rng(), 222 | obj_on_device(model, dst=device, exclude={"rlg_auto", "rlg_diffusion"}, verbose_move=True) as m 223 | ): 224 | prev_device = m.device 225 | m.device = device 226 | torch.manual_seed(seed) 227 | audio_out = m.tts( 228 | text, 229 | voice_samples=voice_samples, 230 | conditioning_latents=voice_latents, 231 | k=batch_size, 232 | verbose=True, 233 | num_autoregressive_samples=num_autoregressive_samples, 234 | temperature=float(temperature), 235 | length_penalty=float(length_penalty), 236 | repetition_penalty=float(repetition_penalty), 237 | top_p=top_p, 238 | max_mel_tokens=max_mel_tokens, 239 | cvvp_amount=cvvp_amount, 240 | use_deterministic_seed=seed, 241 | **diffusion_kwargs, 242 | ) 243 | 244 | if isinstance(audio_out, list): 245 | lengths = [x.shape[-1] for x in audio_out] 246 | max_len = max(lengths) 247 | audio_out = [F.pad(x, [0, max_len - 
x.shape[-1]]) for x in audio_out] 248 | audio_out = torch.cat(audio_out, dim=0) 249 | else: 250 | audio_out = audio_out.view(1, 1, -1) 251 | 252 | m.device = prev_device 253 | 254 | do_cleanup() 255 | return {"waveform": audio_out, "sample_rate": 24000}, 256 | 257 | 258 | NODE_CLASS_MAPPINGS = { 259 | "TortoiseTTSGenerate": TortoiseTTSGenerate, 260 | "TortoiseTTSLoader": TortoiseTTSLoader, 261 | } 262 | 263 | NODE_DISPLAY_NAME_MAPPINGS = { 264 | "TortoiseTTSGenerate": "Tortoise TTS Generator", 265 | "TortoiseTTSLoader": "Tortoise TTS Loader", 266 | } 267 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import os 3 | from contextlib import contextmanager 4 | 5 | import torch 6 | from torch.nn.functional import pad 7 | 8 | 9 | # TODO: this sucks 10 | COMFY_PATH = os.path.realpath(os.path.join(os.path.dirname(__file__), "..", "..")) 11 | 12 | from folder_paths import ( 13 | models_dir, 14 | get_output_directory, 15 | get_temp_directory, 16 | get_save_image_path, 17 | ) 18 | 19 | 20 | def do_cleanup(cuda_cache=True): 21 | gc.collect() 22 | if cuda_cache: 23 | torch.cuda.empty_cache() 24 | 25 | 26 | def get_device(): 27 | return "cuda" if torch.cuda.is_available() else "cpu" 28 | 29 | 30 | def tensors_to(tensors, device): 31 | if isinstance(tensors, torch.Tensor): 32 | return tensors.to(device) 33 | if hasattr(tensors, "__dict__"): 34 | return object_to(tensors, device, empty_cuda_cache=False) 35 | if isinstance(tensors, (list, tuple)): 36 | return [tensors_to(x, device) for x in tensors] 37 | if isinstance(tensors, dict): 38 | return {k: tensors_to(v, device) for k, v in tensors.items()} 39 | if isinstance(tensors, set): 40 | return {tensors_to(x, device) for x in tensors} 41 | return tensors 42 | 43 | 44 | def tensors_to_cuda(tensors): 45 | return tensors_to(tensors, "cuda") 46 | 47 | 48 | def tensors_to_cpu(tensors): 49 | return tensors_to(tensors, "cpu") 50 | 51 | 52 | def object_to(obj, device=None, exclude=None, empty_cuda_cache=True, verbose=False): 53 | """ 54 | recurse through an object and move any pytorch tensors/parameters/modules to the given device. 55 | if device is None, cpu is used by default. if the device is a CUDA device and empty_cuda_cache is 56 | enabled, this will also free unused CUDA memory cached by pytorch. 
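

    illustrative round trip (the excluded attribute name here is hypothetical):

        model = object_to(model, "cuda", exclude={"frozen_head"}, verbose=True)
        # ... run inference ...
        model = object_to(model, "cpu")  # move everything back to system memory

    the obj_on_device() context manager defined below wraps this move/restore pattern.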
57 | """ 58 | 59 | if not hasattr(obj, "__dict__"): 60 | return obj 61 | 62 | classname = type(obj).__name__ 63 | exclude = exclude or set() 64 | device = device or "cpu" 65 | 66 | def _move_and_recurse(o, name=""): 67 | child_moved = False 68 | for k, v in vars(o).items(): 69 | moved = False 70 | cur_name = f"{name}.{k}" if name != "" else k 71 | if cur_name in exclude: 72 | continue 73 | if isinstance(v, (torch.nn.Module, torch.nn.Parameter, torch.Tensor)): 74 | setattr(o, k, v.to(device)) 75 | moved = True 76 | elif hasattr(v, "__dict__"): 77 | v, moved = _move_and_recurse(v, name=cur_name) 78 | if moved: setattr(o, k, v) 79 | if verbose and moved: 80 | print(f"moved {classname}.{cur_name} to {device}") 81 | child_moved |= moved 82 | return o, child_moved 83 | 84 | if isinstance(obj, torch.nn.Module): 85 | obj = obj.to(device) 86 | 87 | obj, _ = _move_and_recurse(obj) 88 | if "cuda" in device and empty_cuda_cache: 89 | torch.cuda.empty_cache() 90 | return obj 91 | 92 | 93 | @contextmanager 94 | def obj_on_device(model, src="cpu", dst="cuda", exclude=None, empty_cuda_cache=True, verbose_move=False): 95 | model = object_to(model, dst, exclude=exclude, empty_cuda_cache=empty_cuda_cache, verbose=verbose_move) 96 | yield model 97 | model = object_to(model, src, exclude=exclude, empty_cuda_cache=empty_cuda_cache, verbose=verbose_move) 98 | 99 | 100 | @contextmanager 101 | def on_device(model, src="cpu", dst="cuda", empty_cuda_cache=True, **kwargs): 102 | model = model.to(dst) 103 | yield model 104 | model = model.to(src) 105 | if empty_cuda_cache: 106 | torch.cuda.empty_cache() 107 | 108 | 109 | def stack_audio_tensors(tensors, mode="pad"): 110 | # assert all(len(x.shape) == 2 for x in tensors) 111 | sizes = [x.shape[-1] for x in tensors] 112 | 113 | if mode in {"pad_l", "pad_r", "pad"}: 114 | # pad input tensors to be equal length 115 | dst_size = max(sizes) 116 | stack_tensors = ( 117 | [pad(x, pad=(0, dst_size - x.shape[-1])) for x in tensors] 118 | if mode == "pad_r" 119 | else [pad(x, pad=(dst_size - x.shape[-1], 0)) for x in tensors] 120 | ) 121 | elif mode in {"trunc_l", "trunc_r", "trunc"}: 122 | # truncate input tensors to be equal length 123 | dst_size = min(sizes) 124 | stack_tensors = ( 125 | [x[:, x.shape[-1] - dst_size:] for x in tensors] 126 | if mode == "trunc_r" 127 | else [x[:, :dst_size] for x in tensors] 128 | ) 129 | else: 130 | assert False, 'unknown mode "{pad}"' 131 | 132 | return torch.stack(stack_tensors) 133 | -------------------------------------------------------------------------------- /util_nodes.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import shutil 5 | import subprocess 6 | import librosa 7 | import torch 8 | from torch import hann_window 9 | 10 | import numpy as np 11 | import scipy 12 | import resampy 13 | import torchaudio 14 | import torchaudio.functional as TAF 15 | from PIL import Image 16 | 17 | from comfy.cli_args import args 18 | 19 | from .util import ( 20 | do_cleanup, 21 | get_device, 22 | get_output_directory, 23 | get_temp_directory, 24 | get_save_image_path, 25 | on_device, 26 | ) 27 | 28 | 29 | # filters that only require width 30 | FILTER_WINDOWS = { 31 | x.__name__.split(".")[-1]: x for x in [ 32 | scipy.signal.windows.boxcar, 33 | scipy.signal.windows.triang, 34 | scipy.signal.windows.blackman, 35 | scipy.signal.windows.hamming, 36 | scipy.signal.windows.hann, 37 | scipy.signal.windows.bartlett, 38 | scipy.signal.windows.flattop, 39 | 
scipy.signal.windows.parzen, 40 | scipy.signal.windows.bohman, 41 | scipy.signal.windows.blackmanharris, 42 | scipy.signal.windows.nuttall, 43 | scipy.signal.windows.barthann, 44 | scipy.signal.windows.cosine, 45 | scipy.signal.windows.exponential, 46 | scipy.signal.windows.tukey, 47 | scipy.signal.windows.taylor, 48 | scipy.signal.windows.lanczos, 49 | ] 50 | } 51 | MAX_WAV_VALUE = 32768.0 52 | 53 | 54 | def find_end_of_clip(x): 55 | x_mono = x.sum(dim=0) 56 | k = len(x_mono) - 1 57 | while k > 0 and x_mono[k] == 0.0: 58 | k -= 1 59 | return k + 1 60 | 61 | 62 | class NormalizeAudio: 63 | @classmethod 64 | def INPUT_TYPES(cls): 65 | return { 66 | "required": { 67 | "audio": ("AUDIO",), 68 | "power": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.01}) 69 | } 70 | } 71 | 72 | RETURN_TYPES = ("AUDIO",) 73 | FUNCTION = "normalize_audio" 74 | CATEGORY = "audio" 75 | 76 | def normalize_audio(self, audio, power): 77 | clip = audio["waveform"] 78 | normed_clip = clip * (1.0 / clip.abs().max(dim=-1, keepdim=True)[0]) ** power 79 | return {"waveform": normed_clip, "sample_rate": audio["sample_rate"]}, 80 | 81 | 82 | class ClipAudio: 83 | @classmethod 84 | def INPUT_TYPES(cls): 85 | return { 86 | "required": { 87 | "audio": ("AUDIO",), 88 | "from_s": ("FLOAT", {"default": 0.0, "step": 0.001}), 89 | "to_s": ("FLOAT", {"default": 0.0, "step": 0.001}), 90 | } 91 | } 92 | 93 | RETURN_TYPES = ("AUDIO",) 94 | FUNCTION = "clip_audio" 95 | CATEGORY = "audio" 96 | 97 | def clip_audio(self, audio, from_s, to_s): 98 | sr = audio["sample_rate"] 99 | from_sample = int(from_s * sr) 100 | to_sample = int(to_s * sr) 101 | return {"waveform": audio["waveform"][..., from_sample:to_sample], "sample_rate": sr}, 102 | 103 | 104 | class TrimAudio: 105 | @classmethod 106 | def INPUT_TYPES(cls): 107 | return { 108 | "required": { 109 | "audio": ("AUDIO",), 110 | "s_from_start": ("FLOAT", {"default": 0.0, "step": 0.001}), 111 | "s_from_end": ("FLOAT", {"default": 0.0, "step": 0.001}), 112 | } 113 | } 114 | 115 | RETURN_TYPES = ("AUDIO",) 116 | FUNCTION = "clip_audio" 117 | CATEGORY = "audio" 118 | 119 | def clip_audio(self, audio, s_from_start, s_from_end): 120 | sr = audio["sample_rate"] 121 | from_sample = int(s_from_start * sr) 122 | to_sample = (int(s_from_end * sr) + 1) 123 | return {"waveform": audio["waveform"][..., from_sample:-to_sample], "sample_rate": sr}, 124 | 125 | 126 | class TrimAudioSamples: 127 | @classmethod 128 | def INPUT_TYPES(cls): 129 | return { 130 | "required": { 131 | "audio": ("AUDIO",), 132 | "from_start": ("INT", {"default": 0, "min": 0, "max": 2 ** 32, "step": 1}), 133 | "from_end": ("INT", {"default": 0, "min": 0, "max": 2 ** 32, "step": 1}), 134 | } 135 | } 136 | 137 | RETURN_TYPES = ("AUDIO",) 138 | FUNCTION = "clip_audio" 139 | CATEGORY = "audio" 140 | 141 | def clip_audio(self, audio, from_start, from_end): 142 | from_sample = from_start 143 | to_sample = from_end + 1 144 | return {"waveform": audio["waveform"][..., from_sample:-to_sample], "sample_rate": audio["sample_rate"]}, 145 | 146 | 147 | class FlattenAudioBatch: 148 | """ 149 | flatten a batch of audio into a single audio tensor 150 | """ 151 | @classmethod 152 | def INPUT_TYPES(cls): 153 | return {"required": {"audio_batch": ("AUDIO",)}} 154 | 155 | RETURN_TYPES = ("AUDIO",) 156 | FUNCTION = "concat_audio" 157 | CATEGORY = "audio" 158 | 159 | def concat_audio(self, audio_batch): 160 | audio = audio_batch["waveform"] 161 | n, c, t = audio.shape 162 | audio = audio.permute(0, 2, 1) 163 | return {"waveform":
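            # (n, c, t) -> (1, c, n*t): lay the batch items end-to-end along the time axis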
audio.reshape(1, -1, c).permute(0, 2, 1), "sample_rate": audio_batch["sample_rate"]}, 164 | 165 | 166 | class ConcatAudio: 167 | """ 168 | concatenate two batches of audio along their time dimensions 169 | 170 | mismatched batch sizes are not supported unless one of the batches is size 1: if a batch has only 171 | one item it will be repeated to match the size of the other batch if necessary. 172 | """ 173 | @classmethod 174 | def INPUT_TYPES(cls): 175 | return { 176 | "required": { 177 | "batch1": ("AUDIO",), 178 | "batch2": ("AUDIO",), 179 | } 180 | } 181 | 182 | RETURN_TYPES = ("AUDIO",) 183 | FUNCTION = "concat_audio" 184 | CATEGORY = "audio" 185 | 186 | def concat_audio(self, batch1, batch2): 187 | # TODO: validate that the sample rates are the same 188 | b1 = batch1["waveform"] 189 | b2 = batch2["waveform"] 190 | 191 | if len(b1) == 1 and len(b2) != 1: 192 | b1 = b1.expand(len(b2), -1, -1) 193 | elif len(b2) == 1 and len(b1) != 1: 194 | b2 = b2.expand(len(b1), -1, -1) 195 | 196 | return {"waveform": torch.concat([b1, b2], dim=-1), "sample_rate": batch1["sample_rate"]}, 197 | 198 | 199 | class BatchAudio: 200 | """ 201 | combine two AUDIO batches together. 202 | """ 203 | @classmethod 204 | def INPUT_TYPES(cls): 205 | return { 206 | "required": { 207 | "batch1": ("AUDIO",), 208 | "batch2": ("AUDIO",), 209 | } 210 | } 211 | 212 | RETURN_TYPES = ("AUDIO",) 213 | FUNCTION = "batch_audio" 214 | CATEGORY = "audio" 215 | 216 | def batch_audio(self, batch1, batch2): 217 | batch = torch.cat([batch1["waveform"], batch2["waveform"]], dim=0) 218 | return {"waveform": batch, "sample_rate": batch1["sample_rate"]}, 219 | 220 | 221 | class ConvertAudio: 222 | """ 223 | convert audio sample rate and/or number of channels 224 | """ 225 | def __init__(self): 226 | pass 227 | 228 | @classmethod 229 | def INPUT_TYPES(cls): 230 | return { 231 | "required": { 232 | "audio": ("AUDIO",), 233 | "to_rate": ("INT", {"default": 32000, "min": 1, "max": 2 ** 32}), 234 | "to_channels": ("INT", {"default": 1, "min": 1, "max": 2, "step": 1}), 235 | } 236 | } 237 | 238 | RETURN_TYPES = ("AUDIO",) 239 | FUNCTION = "convert" 240 | CATEGORY = "audio" 241 | 242 | def convert(self, audio, to_rate, to_channels): 243 | from_rate = audio["sample_rate"] 244 | waveform = audio["waveform"] 245 | waveform = TAF.resample(waveform, from_rate, to_rate) 246 | if to_channels == 1: 247 | waveform = waveform.mean(dim=1, keepdim=True) 248 | elif to_channels == 2 and waveform.shape[1] == 1: 249 | waveform = waveform.expand(-1, to_channels, -1) 250 | 251 | return {"waveform": waveform, "sample_rate": to_rate}, 252 | 253 | 254 | class ResampleAudio: 255 | @classmethod 256 | def INPUT_TYPES(cls): 257 | return { 258 | "required": { 259 | "audio": ("AUDIO",), 260 | "from_rate": ("INT", {"default": 44100, "min": 1, "max": 2 ** 32}), 261 | "to_rate": ("INT", {"default": 32000, "min": 1, "max": 2 ** 32}), 262 | "filter": (["sinc_window", "kaiser_best", "kaiser_fast"], ), 263 | "window": (list(FILTER_WINDOWS.keys()),), 264 | "num_zeros": ("INT", {"default": 64, "min": 1, "max": 2 ** 32}) 265 | } 266 | } 267 | 268 | RETURN_TYPES = ("AUDIO",) 269 | FUNCTION = "convert" 270 | CATEGORY = "audio" 271 | 272 | def convert(self, audio, from_rate, to_rate, filter, window, num_zeros): 273 | converted = [] 274 | w = FILTER_WINDOWS[window] 275 | for clip in audio["waveform"]: 276 | new_clip = resampy.resample(clip.numpy(), from_rate, to_rate, filter=filter, window=w, num_zeros=num_zeros, parallel=False) 277 | converted.append(torch.from_numpy(new_clip)) 278 |
return {"waveform": torch.stack(converted, dim=0), "sample_rate": to_rate}, 279 | 280 | 281 | def logyscale(img_array): 282 | height, width = img_array.shape 283 | 284 | def _remap(y, x): 285 | return min(int(math.log(y + 1) * height / math.log(height)), height - 1), min(x, width - 1) 286 | v_remap = np.vectorize(_remap) 287 | 288 | y, x = np.meshgrid(np.arange(height), np.arange(width), indexing="ij") 289 | indices = v_remap(y, x) 290 | img_array = img_array[indices] 291 | 292 | return img_array 293 | 294 | 295 | class SpectrogramImage: 296 | """ 297 | create spectrogram images from audio. 298 | """ 299 | @classmethod 300 | def INPUT_TYPES(cls): 301 | return { 302 | "required": { 303 | "audio": ("AUDIO",), 304 | "n_fft": ("INT", {"default": 200}), 305 | "hop_len": ("INT", {"default": 50}), 306 | "win_len": ("INT", {"default": 100}), 307 | "power": ("FLOAT", {"default": 1.0}), 308 | "normalized": ("BOOLEAN", {"default": False}), 309 | "logy": ("BOOLEAN", {"default": True}), 310 | "width": ("INT", {"default": 640, "min": 0}), 311 | "height": ("INT", {"default": 320, "min": 0}), 312 | }, 313 | } 314 | 315 | RETURN_TYPES = ("IMAGE",) 316 | FUNCTION = "make_spectrogram" 317 | OUTPUT_NODE = True 318 | CATEGORY = "audio" 319 | 320 | def make_spectrogram( 321 | self, 322 | audio, 323 | n_fft=400, 324 | hop_len=50, 325 | win_len=100, 326 | power=1.0, 327 | normalized=False, 328 | logy=True, 329 | width=640, 330 | height=320, 331 | ): 332 | hop_len = n_fft // 4 if hop_len == 0 else hop_len 333 | win_len = n_fft if win_len == 0 else win_len 334 | 335 | waveform_batch = audio["waveform"] 336 | results = [] 337 | for clip in waveform_batch: 338 | end_sample = find_end_of_clip(clip) 339 | spectro = TAF.spectrogram( 340 | clip[..., :end_sample], 341 | 0, 342 | window=hann_window(win_len), 343 | n_fft=n_fft, 344 | hop_length=hop_len, 345 | win_length=win_len, 346 | power=power, 347 | normalized=normalized, 348 | center=True, 349 | pad_mode="reflect", 350 | onesided=True, 351 | ) # yields a 1xCxT tensor 352 | spectro = spectro[0].squeeze().flip(0) # CxT 353 | if logy: 354 | spectro = clip.new_tensor(logyscale(spectro.numpy())) 355 | results.append( 356 | torch.nn.functional.interpolate(spectro[None, None], (height, width), mode="bilinear") 357 | if width != 0 and height != 0 358 | else spectro[None, None] 359 | ) 360 | results = torch.cat(results, dim=0).permute(0, 2, 3, 1).expand(-1, -1, -1, 3) 361 | return results, 362 | 363 | 364 | class BlendAudio: 365 | @classmethod 366 | def INPUT_TYPES(cls): 367 | return { 368 | "required": { 369 | "audio_to": ("AUDIO",), 370 | "audio_from": ("AUDIO",), 371 | "audio_to_strength": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01}), 372 | } 373 | } 374 | 375 | RETURN_TYPES = ("AUDIO",) 376 | FUNCTION = "blend" 377 | CATEGORY = "audio" 378 | 379 | def blend(self, audio_to, audio_from, audio_to_strength): 380 | a_to = audio_to["waveform"] 381 | a_from = audio_from["waveform"] 382 | a_to = a_to.float() * MAX_WAV_VALUE 383 | a_from = a_from.float() * MAX_WAV_VALUE 384 | to_n = a_to.shape[-1] 385 | from_n = a_from.shape[-1] 386 | 387 | if to_n > from_n: 388 | leftover = a_to[..., from_n:] 389 | a_to = a_to[..., :from_n] 390 | elif from_n > to_n: 391 | leftover = a_from[..., to_n:] 392 | a_from = a_from[..., :to_n] 393 | else: 394 | leftover = torch.empty(0, dtype=torch.float) 395 | 396 | new_a = audio_to_strength * a_to + (1 - audio_to_strength) * a_from 397 | blended_audio = torch.cat((new_a, leftover), dim=-1) / MAX_WAV_VALUE 398 | 399 | return 
{"waveform": blended_audio, "sample_rate": audio_to["sample_rate"]}, 400 | 401 | 402 | class InvertPhase: 403 | @classmethod 404 | def INPUT_TYPES(cls): 405 | return { 406 | "required": { 407 | "audio": ("AUDIO",), 408 | } 409 | } 410 | 411 | RETURN_TYPES = ("AUDIO",) 412 | FUNCTION = "invert" 413 | CATEGORY = "audio" 414 | 415 | def invert(self, audio): 416 | return {"waveform": -audio["waveform"], "sample_rate": audio["sample_rate"]}, 417 | 418 | 419 | class FilterAudio: 420 | @classmethod 421 | def INPUT_TYPES(cls): 422 | return { 423 | "required": { 424 | "audio": ("AUDIO",), 425 | "numtaps": ("INT", {"default": 101, "min": 1, "max": 2 ** 32}), 426 | "cutoff": ("INT", {"default": 10500, "min": 1, "max": 2 ** 32}), 427 | "width": ("INT", {"default": 0, "min": 0, "max": 2 ** 32}), 428 | "window": (list(FILTER_WINDOWS.keys()),), 429 | "pass_zero": ("BOOLEAN", {"default": True}), 430 | "scale": ("BOOLEAN", {"default": True}), 431 | "fs": ("INT", {"default": 32000, "min": 1, "max": 2 ** 32}), 432 | } 433 | } 434 | 435 | RETURN_TYPES = ("AUDIO",) 436 | FUNCTION = "filter_audio" 437 | CATEGORY = "audio" 438 | 439 | def filter_audio(self, audio, numtaps, cutoff, width, window, pass_zero, scale, fs): 440 | if width == 0: 441 | width = None 442 | 443 | filtered = [] 444 | f = scipy.signal.firwin(numtaps, cutoff, width=width, window=window, pass_zero=pass_zero, scale=scale, fs=fs) 445 | for clip in audio["waveform"]: 446 | filtered_clip = scipy.signal.lfilter(f, [1.0], clip.numpy() * MAX_WAV_VALUE) 447 | filtered.append(torch.from_numpy(filtered_clip / MAX_WAV_VALUE).float()) 448 | 449 | return {"waveform": torch.stack(filtered, dim=0), "sample_rate": audio["sample_rate"]}, 450 | 451 | 452 | class CombineImageWithAudio: 453 | """ 454 | combine an image and audio into a video clip. 
455 | """ 456 | def __init__(self): 457 | self.output_dir = get_output_directory() 458 | self.output_type = "output" 459 | self.prefix_append = "" 460 | 461 | @classmethod 462 | def INPUT_TYPES(cls): 463 | return { 464 | "required": { 465 | "image": ("IMAGE",), 466 | "audio": ("AUDIO",), 467 | "file_format": (["webm", "mp4"],), 468 | "filename_prefix": ("STRING", {"default": "ComfyUI"}), 469 | }, 470 | } 471 | 472 | RETURN_TYPES = () 473 | FUNCTION = "save_image_with_audio" 474 | OUTPUT_NODE = True 475 | CATEGORY = "audio" 476 | 477 | def save_image_with_audio(self, image, audio, file_format, filename_prefix): 478 | filename_prefix += self.prefix_append 479 | sr = audio["sample_rate"] 480 | full_outdir, base_fname, count, subdir, filename_prefix = get_save_image_path( 481 | filename_prefix, self.output_dir 482 | ) 483 | 484 | audio_results = [] 485 | video_results = [] 486 | 487 | waveform = audio["waveform"] 488 | for image_tensor, clip in zip(image, waveform): 489 | name = f"{base_fname}_{count:05}_" 490 | tmp_dir = get_temp_directory() 491 | 492 | wav_basename = f"{name}.wav" 493 | wav_fname = os.path.join(full_outdir, wav_basename) 494 | end_sample = find_end_of_clip(clip) 495 | torchaudio.save(wav_fname, clip[..., :end_sample], sr, format="wav") 496 | 497 | image = image_tensor.mul(255.0).clip(0, 255).byte().numpy() 498 | image = Image.fromarray(image) 499 | 500 | image_basename = f"{name}.png" 501 | image_fname = os.path.join(tmp_dir, image_basename) 502 | image.save(image_fname, compress_level=4) 503 | 504 | video_basename = f"{name}.{file_format}" 505 | video_fname = os.path.join(full_outdir, video_basename) 506 | 507 | proc_args = [ 508 | shutil.which("ffmpeg"), "-y", "-i", image_fname, "-i", str(wav_fname) 509 | ] 510 | if file_format == "webm": 511 | proc_args += ["-c:v", "vp8", "-c:a", "opus", "-strict", "-2", video_fname] 512 | else: # file_format == "mp4" 513 | proc_args += ["-pix_fmt", "yuv420p", video_fname] 514 | 515 | subprocess.run(proc_args) 516 | 517 | audio_results.append({ 518 | "filename": wav_basename, 519 | "format": "audio/wav", 520 | "subfolder": subdir, 521 | "type": "output", 522 | }) 523 | video_results.append({ 524 | "filename": video_basename, 525 | "format": "video/webm" if file_format == "webm" else "video/mpeg", 526 | "subfolder": subdir, 527 | "type": "output", 528 | }) 529 | count += 1 530 | 531 | return {"ui": {"audio": audio_results, "video": video_results}} 532 | 533 | 534 | class ApplyVoiceFixer: 535 | def __init__(self): 536 | self.model = None 537 | 538 | @classmethod 539 | def INPUT_TYPES(cls): 540 | return { 541 | "required": 542 | { 543 | "audio": ("AUDIO",), 544 | "mode": ("INT", {"default": 0, "min": 0, "max": 2}), 545 | }, 546 | } 547 | 548 | FUNCTION = "apply" 549 | RETURN_TYPES = ("AUDIO",) 550 | CATEGORY = "audio" 551 | 552 | def apply(self, audio, mode): 553 | device = get_device() 554 | if self.model is None: 555 | from voicefixer import VoiceFixer 556 | self.model = VoiceFixer() 557 | 558 | results = [] 559 | with on_device(self.model, dst=device) as model: 560 | for clip in audio["waveform"]: 561 | output = model.restore_inmem(clip.squeeze(0).numpy(), cuda=device == "cuda", mode=mode) 562 | results.append(clip.new_tensor(output)) 563 | 564 | do_cleanup() 565 | return {"waveform": torch.stack(results), "sample_rate": audio["sample_rate"]}, 566 | 567 | 568 | class TrimSilence: 569 | @classmethod 570 | def INPUT_TYPES(cls): 571 | return { 572 | "required": { 573 | "audio": ("AUDIO",), 574 | "top_db": ("FLOAT", {"default": 0.0}), 575 | } 
576 | } 577 | 578 | FUNCTION = "trim" 579 | RETURN_TYPES = ("AUDIO",) 580 | CATEGORY = "audio" 581 | 582 | def trim(self, audio, top_db=6.0): 583 | if audio["waveform"].shape[0] != 1: 584 | raise ValueError("Can only trim one audio clip at a time") 585 | trimmed_clip, _ = librosa.effects.trim(audio["waveform"], top_db=top_db, frame_length=256, hop_length=128) 586 | return {"waveform": trimmed_clip, "sample_rate": audio["sample_rate"]}, 587 | 588 | 589 | class AudioSampleRate: 590 | @classmethod 591 | def INPUT_TYPES(cls): 592 | return { 593 | "required": { 594 | "audio": ("AUDIO",), 595 | } 596 | } 597 | 598 | FUNCTION = "get_sample_rate" 599 | RETURN_TYPES = ("INT",) 600 | CATEGORY = "audio" 601 | 602 | def get_sample_rate(self, audio): 603 | return audio["sample_rate"], 604 | 605 | 606 | NODE_CLASS_MAPPINGS = { 607 | "ConvertAudio": ConvertAudio, 608 | "FilterAudio": FilterAudio, 609 | "ResampleAudio": ResampleAudio, 610 | "ClipAudioRegion": ClipAudio, 611 | "InvertAudioPhase": InvertPhase, 612 | "TrimAudio": TrimAudio, 613 | "TrimAudioSamples": TrimAudioSamples, 614 | "ConcatAudio": ConcatAudio, 615 | "BlendAudio": BlendAudio, 616 | "BatchAudio": BatchAudio, 617 | "FlattenAudioBatch": FlattenAudioBatch, 618 | "SpectrogramImage": SpectrogramImage, 619 | "CombineImageWithAudio": CombineImageWithAudio, 620 | "ApplyVoiceFixer": ApplyVoiceFixer, 621 | "TrimSilence": TrimSilence, 622 | "NormalizeAudio": NormalizeAudio, 623 | "AudioSampleRate": AudioSampleRate, 624 | } 625 | 626 | NODE_DISPLAY_NAME_MAPPINGS = { 627 | "ConvertAudio": "Convert Audio", 628 | "FilterAudio": "Filter Audio", 629 | "ResampleAudio": "Resample Audio", 630 | "ClipAudioRegion": "Clip Audio Region", 631 | "InvertAudioPhase": "Invert Audio Phase", 632 | "TrimAudio": "Trim Audio", 633 | "TrimAudioSamples": "Trim Audio (by samples)", 634 | "ConcatAudio": "Concatenate Audio", 635 | "BlendAudio": "Blend Audio", 636 | "BatchAudio": "Batch Audio", 637 | "FlattenAudioBatch": "Flatten Audio Batch", 638 | "SpectrogramImage": "Spectrogram Image", 639 | "CombineImageWithAudio": "Combine Image with Audio", 640 | "ApplyVoiceFixer": "Apply VoiceFixer", 641 | "TrimSilence": "Trim Silence", 642 | "NormalizeAudio": "Normalize Audio", 643 | "AudioSampleRate": "Get Audio Sample Rate", 644 | } 645 | -------------------------------------------------------------------------------- /valle_x_nodes.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from glob import glob 3 | import os 4 | import sys 5 | from urllib.request import urlretrieve 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | from .util import ( 12 | models_dir, 13 | do_cleanup, 14 | object_to, 15 | obj_on_device, 16 | on_device, 17 | tensors_to, 18 | ) 19 | 20 | import langid 21 | from audiocraft.data.audio_utils import normalize_loudness 22 | # from vallex.data import AudioTokenizer, tokenize_audio 23 | from encodec.model import EncodecModel 24 | from vallex.data.collation import get_text_token_collater, TextTokenCollater 25 | from vallex.models.vallex import VALLE 26 | from vallex.utils.g2p import PhonemeBpeTokenizer 27 | from vallex.utils.generation import url as VALLEX_CKPT_URL 28 | from vallex.utils.macros import * 29 | from vallex.utils.prompt_making import make_transcript 30 | from vocos import Vocos 31 | 32 | 33 | MODELS_PATH = os.path.join(models_dir, "vall_e_x") 34 | VOICES_PATH = os.path.join(MODELS_PATH, "voices") 35 | os.makedirs(VOICES_PATH, exist_ok=True) 36 | 37 | VOICES = { 38 
| os.path.splitext(os.path.basename(x))[0]: x 39 | for x in sorted(glob(os.path.join(VOICES_PATH, "*.npz"))) 40 | } 41 | ACCENTS = ["none", *list(lang2token.keys())] 42 | 43 | VALLEX_CKPT_PATH = os.path.join(MODELS_PATH, "vallex-checkpoint.pt") 44 | VALLEX_TOKENIZER_PATH = os.path.join(MODELS_PATH, "bpe_69.json") 45 | VALLEX_TOKENIZER_URL = "https://raw.githubusercontent.com/korakoe/VALL-E-X/main/vallex/utils/g2p/bpe_69.json" 46 | VALLEX_VOICEPROMPTS = ["null", *VOICES] 47 | 48 | 49 | @dataclass 50 | class VALLEXModel: 51 | valle: VALLE 52 | encodec: EncodecModel 53 | vocos: Vocos 54 | tokenizer: PhonemeBpeTokenizer 55 | collater: TextTokenCollater 56 | 57 | 58 | # NOTE: the following function is adapted from Plachtaa's implementation of VALL-E X: 59 | # https://github.com/Plachtaa/VALL-E-X 60 | 61 | 62 | @torch.no_grad() 63 | def generate_audio( 64 | model, 65 | text_prompt, 66 | voice_prompt, 67 | language="auto", 68 | accent="no-accent", 69 | topk=100, 70 | temperature=1.0, 71 | best_of=8, 72 | length_penalty=1.0, 73 | use_vocos=True, 74 | device=None, 75 | ): 76 | valle: VALLE = model.valle 77 | vocoder = model.vocos if use_vocos else model.encodec 78 | text_tokenizer = model.tokenizer 79 | text_collater = model.collater 80 | 81 | text = text_prompt.replace("\n", "").strip(" ") 82 | 83 | # detect language 84 | if language == "auto": 85 | language = langid.classify(text)[0] 86 | lang_token = lang2token[language] 87 | lang = token2lang[lang_token] 88 | text = lang_token + text + lang_token 89 | 90 | # load prompt 91 | audio_prompts, text_prompts, lang_pr = voice_prompt 92 | 93 | enroll_x_lens = text_prompts.shape[-1] 94 | phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip()) 95 | text_tokens, text_tokens_lens = text_collater([phone_tokens]) 96 | text_tokens = torch.cat([text_prompts, text_tokens], dim=-1) 97 | text_tokens_lens += enroll_x_lens 98 | 99 | # accent control 100 | lang = lang if accent == "no-accent" else accent 101 | encoded_frames = valle.inference( 102 | text_tokens.to(device), 103 | text_tokens_lens.to(device), 104 | audio_prompts.to(device), 105 | enroll_x_lens=enroll_x_lens, 106 | top_k=topk, 107 | temperature=temperature, 108 | prompt_language=lang_pr, 109 | text_language=langs if accent == "no-accent" else lang, 110 | best_of=best_of, 111 | length_penalty=length_penalty, 112 | ) 113 | 114 | # decode 115 | if use_vocos: 116 | frames = encoded_frames.permute(2, 0, 1) 117 | features = vocoder.codes_to_features(frames) 118 | samples = vocoder.decode(features, bandwidth_id=torch.tensor([2], device=device)) 119 | else: 120 | samples = vocoder.decode([(encoded_frames.transpose(2, 1), None)]) 121 | 122 | return samples.squeeze().cpu().numpy() 123 | 124 | 125 | class VALLEXLoader: 126 | def __init__(self): 127 | self.model = None 128 | 129 | @classmethod 130 | def INPUT_TYPES(cls): 131 | return {"required": {}} 132 | 133 | RETURN_NAMES = ("vallex_model", "sample_rate") 134 | RETURN_TYPES = ("VALLEX_MODEL", "INT") 135 | FUNCTION = "load" 136 | CATEGORY = "audio" 137 | 138 | def load(self): 139 | if self.model is not None: 140 | self.model = object_to(self.model, "cpu") 141 | del self.model 142 | do_cleanup() 143 | print("VALLEXLoader: unloaded models") 144 | 145 | print("VALLEXLoader: loading models") 146 | 147 | if not os.path.exists(VALLEX_CKPT_PATH): 148 | print("fetching VALL-E X checkpoint...", end="") 149 | urlretrieve(VALLEX_CKPT_URL, VALLEX_CKPT_PATH) 150 | print("done.") 151 | 152 | if not os.path.exists(VALLEX_TOKENIZER_PATH): 153 | print("fetching 
VALL-E X phoneme tokenizer...", end="") 154 | urlretrieve(VALLEX_TOKENIZER_URL, VALLEX_TOKENIZER_PATH) 155 | print("done.") 156 | 157 | valle = VALLE( 158 | N_DIM, 159 | NUM_HEAD, 160 | NUM_LAYERS, 161 | norm_first=True, 162 | add_prenet=False, 163 | prefix_mode=PREFIX_MODE, 164 | share_embedding=True, 165 | nar_scale_factor=1.0, 166 | prepend_bos=True, 167 | num_quantizers=NUM_QUANTIZERS, 168 | ) 169 | ckpt = torch.load(VALLEX_CKPT_PATH, map_location="cpu") 170 | valle.load_state_dict(ckpt["model"], strict=True) 171 | valle.eval() 172 | 173 | encodec = EncodecModel.encodec_model_24khz() 174 | encodec.eval() 175 | 176 | vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz") 177 | vocos.eval() 178 | 179 | tokenizer = PhonemeBpeTokenizer(VALLEX_TOKENIZER_PATH) 180 | 181 | model = VALLEXModel(valle, encodec, vocos, tokenizer, get_text_token_collater()) 182 | sr = 24000 183 | 184 | do_cleanup() 185 | return model, sr 186 | 187 | 188 | class VALLEXGenerator: 189 | @classmethod 190 | def INPUT_TYPES(cls): 191 | return { 192 | "required": { 193 | "model": ("VALLEX_MODEL",), 194 | "voice_prompt": ("VALLEX_VPROMPT",), 195 | "text_prompt": ("STRING", {"default": "", "multiline": True}), 196 | "language": (["auto", *list(lang2token.keys())],), 197 | "accent": (ACCENTS,), 198 | "temperature": ("FLOAT", {"default": 1.0, "min": 0.001, "step": 0.001}), 199 | "topk": ("INT", {"default": 100, "step": 1}), 200 | "best_of": ("INT", {"default": 8}), 201 | "length_penalty": ("FLOAT", {"default": 1.0, "min": 0.0, "step": 0.001}), 202 | "seed": ("INT", {"default": 0, "min": 0}), 203 | } 204 | } 205 | 206 | RETURN_NAMES = ("audio",) 207 | RETURN_TYPES = ("AUDIO",) 208 | FUNCTION = "generate" 209 | CATEGORY = "audio" 210 | 211 | def generate( 212 | self, 213 | model, 214 | voice_prompt, 215 | text_prompt: str = "", 216 | language: str = "auto", 217 | accent: str = "none", 218 | temperature: float = 1.0, 219 | topk: int = 100, 220 | best_of: int = 8, 221 | length_penalty: float = 1.0, 222 | seed: int = 0, 223 | ): 224 | device = "cuda" if torch.cuda.is_available() else "cpu" 225 | 226 | accent = "no-accent" if accent == "none" else accent 227 | 228 | with torch.random.fork_rng(), obj_on_device(model, dst=device) as m: 229 | torch.manual_seed(seed) 230 | audio = generate_audio( 231 | m, 232 | text_prompt, 233 | voice_prompt, 234 | language=language, 235 | accent=accent, 236 | topk=-topk, 237 | temperature=temperature, 238 | best_of=best_of, 239 | length_penalty=length_penalty, 240 | device=device, 241 | ) 242 | 243 | do_cleanup() 244 | return {"waveform": normalize_loudness(torch.from_numpy(audio).unsqueeze(0), 24000, loudness_compressor=True).unsqueeze(0), "sample_rate": 24000}, 245 | 246 | 247 | class VALLEXVoicePromptLoader: 248 | @classmethod 249 | def INPUT_TYPES(cls): 250 | return { 251 | "required": { 252 | "voice": (VALLEX_VOICEPROMPTS,), 253 | } 254 | } 255 | 256 | RETURN_TYPES = ("VALLEX_VPROMPT",) 257 | FUNCTION = "load_prompt" 258 | CATEGORY = "audio" 259 | 260 | def load_prompt(self, voice): 261 | if voice != "null": 262 | name = VOICES[voice] 263 | prompt_path = name 264 | if not os.path.exists(prompt_path): 265 | prompt_path = os.path.join(VOICES_PATH, "presets", name + ".npz") 266 | if not os.path.exists(prompt_path): 267 | prompt_path = os.path.join(VOICES_PATH, "customs", name + ".npz") 268 | if not os.path.exists(prompt_path): 269 | raise ValueError(f"Cannot find prompt {name}") 270 | prompt_data = np.load(prompt_path) 271 | audio_prompts = prompt_data["audio_tokens"] 272 | text_prompts = prompt_data["text_tokens"] 273 | lang_pr =
prompt_data["lang_code"] 274 | lang_pr = code2lang[int(lang_pr)] 275 | 276 | # numpy to tensor 277 | audio_prompts = torch.tensor(audio_prompts).type(torch.int32) 278 | text_prompts = torch.tensor(text_prompts).type(torch.int32) 279 | else: 280 | audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32) 281 | text_prompts = torch.zeros([1, 0]).type(torch.int32) 282 | lang_pr = "en" 283 | 284 | return (audio_prompts, text_prompts, lang_pr), 285 | 286 | 287 | class VALLEXVoicePromptGenerator: 288 | @classmethod 289 | def INPUT_TYPES(cls): 290 | return { 291 | "required": { 292 | "model": ("VALLEX_MODEL",), 293 | "transcript": ("STRING", {"default": "", "multiline": True}), 294 | "audio": ("AUDIO",), 295 | } 296 | } 297 | 298 | RETURN_TYPES = ("VALLEX_VPROMPT",) 299 | FUNCTION = "make_prompt" 300 | CATEGORY = "audio" 301 | 302 | def make_prompt(self, model, audio, transcript=None): 303 | encodec: EncodecModel = model.encodec 304 | tokenizer: PhonemeBpeTokenizer = model.tokenizer 305 | text_collater: TextTokenCollater = model.collater 306 | 307 | device = "cuda" if torch.cuda.is_available() else "cpu" 308 | wav_pr = audio["waveform"] 309 | 310 | if wav_pr.size(0) == 2: 311 | wav_pr = wav_pr.mean(0, keepdim=True) 312 | 313 | wav_pr = wav_pr.unsqueeze(0) 314 | 315 | text, lang = make_transcript("_temp_prompt", wav_pr, encodec.sample_rate, transcript) 316 | 317 | with torch.no_grad(), on_device(encodec, dst=device) as e, obj_on_device(tokenizer, dst=device) as t: 318 | # tokenize audio 319 | wav_pr = wav_pr.to(device) 320 | encoded_frames = e.encode(wav_pr) 321 | audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu() 322 | 323 | # tokenize text 324 | phonemes, _ = t.tokenize(text=f"{text}".strip()) 325 | text_tokens, _ = text_collater([phonemes]) 326 | wav_pr = wav_pr.cpu() 327 | 328 | do_cleanup() 329 | 330 | return (audio_tokens, text_tokens, lang), 331 | 332 | 333 | NODE_CLASS_MAPPINGS = { 334 | "VALLEXLoader": VALLEXLoader, 335 | "VALLEXGenerator": VALLEXGenerator, 336 | "VALLEXVoicePromptLoader": VALLEXVoicePromptLoader, 337 | "VALLEXVoicePromptFromAudio": VALLEXVoicePromptGenerator, 338 | } 339 | NODE_DISPLAY_NAME_MAPPINGS = { 340 | "VALLEXLoader": "VALL-E X Loader", 341 | "VALLEXGenerator": "VALL-E X Generator", 342 | "VALLEXVoicePromptLoader": "VALL-E X Voice Prompt Loader", 343 | "VALLEXVoicePromptFromAudio": "VALL-E X Voice Prompt from Audio", 344 | } 345 | --------------------------------------------------------------------------------