├── .gitignore ├── LICENSE ├── README.md ├── WavCraft-chat.py ├── WavCraft.py ├── assets ├── duck_quacking_in_water.wav ├── overview.png └── wavcraft_icon.jpg ├── scripts ├── check_watermark.py ├── continue_service.sh ├── kill_services.py ├── setup_envs.sh └── start_services.sh ├── services ├── audiocraft_service.py ├── audioldm_service.py ├── audiosep_service.py ├── audiosr_service.py ├── start_audiocraft.sh ├── start_audioldm.sh ├── start_audiosep.sh ├── start_audiosr.sh ├── start_wavmark.sh └── wavmark_service.py ├── venvs ├── audiocraft.yml ├── audioldm.yml ├── audiosr.yml └── wavcraft.yml └── wavcraft ├── __init__.py ├── apis.py ├── configs.yaml ├── ffmpeg_engineer.py ├── mistral_api.py ├── pipeline.py ├── prompts ├── text_to_code.prompt └── text_to_followup.prompt ├── utils.py └── voice_preset ├── npz ├── child_boy.npz ├── cnn_male_speaker.npz ├── elder_morgen.npz ├── news_female_speaker.npz ├── news_female_speaker_outside.npz └── news_male_speaker.npz └── voice_map.json /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | bin/ 3 | services_logs/ 4 | output/ 5 | # ext/* 6 | .empty/ 7 | scripts/chatgpt.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. 
More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. 
Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. 
The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. 
identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. 
To the extent possible, if any provision of this Public License is
403 | deemed unenforceable, it shall be automatically reformed to the
404 | minimum extent necessary to make it enforceable. If the provision
405 | cannot be reformed, it shall be severed from this Public License
406 | without affecting the enforceability of the remaining terms and
407 | conditions.
408 |
409 | c. No term or condition of this Public License will be waived and no
410 | failure to comply consented to unless expressly agreed to by the
411 | Licensor.
412 |
413 | d. Nothing in this Public License constitutes or may be interpreted
414 | as a limitation upon, or waiver of, any privileges and immunities
415 | that apply to the Licensor or You, including from the legal
416 | processes of any jurisdiction or authority.
417 |
418 | =======================================================================
419 |
420 | Creative Commons is not a party to its public
421 | licenses. Notwithstanding, Creative Commons may elect to apply one of
422 | its public licenses to material it publishes and in those instances
423 | will be considered the “Licensor.” The text of the Creative Commons
424 | public licenses is dedicated to the public domain under the CC0 Public
425 | Domain Dedication. Except for the limited purpose of indicating that
426 | material is shared under a Creative Commons public license or as
427 | otherwise permitted by the Creative Commons policies published at
428 | creativecommons.org/policies, Creative Commons does not authorize the
429 | use of the trademark "Creative Commons" or any other trademark or logo
430 | of Creative Commons without its prior written consent including,
431 | without limitation, in connection with any unauthorized modifications
432 | to any of its public licenses or any other arrangements,
433 | understandings, or agreements concerning use of licensed material. For
434 | the avoidance of doubt, this paragraph does not form part of the
435 | public licenses.
436 |
437 | Creative Commons may be contacted at creativecommons.org.
438 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🌊WavCraft
2 |
3 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2403.09527) [![demo](https://img.shields.io/badge/Notion-Demo_Page-blue)](https://first-riddle-7e9.notion.site/WavCraft-Demo-40c079fc82ca411ca0520b9d65abd3f5)
4 |
5 | Generate and edit audio with a simple sentence.
6 |
7 | This repo currently supports:
8 |
9 | * text-guided audio editing: edit the content of a given audio clip (or clips) conditioned on text input
10 | * text-guided audio generation: create an audio clip given text input
11 | * audio scriptwriting: prompt WavCraft with a scene setting and let the model write the script and create the sound for you
12 | * watermark detection: check whether an audio file was synthesized by WavCraft
13 |
14 | ## Change log
15 |
16 | **2024-05-06**: Support open LLMs (the Mistral family) in WavCraft.
17 |
18 | **2024-03-20**: Add watermarking to the system.
19 |
20 | ## Content
21 |
22 | - [Usage](#usage)
23 | - [Installation](#installation)
24 | - [Audio editing using a single line](#audio-editing-using-a-single-line)
25 | - [Audio editing via interaction](#audio-editing-via-interaction)
26 | - [Check if an audio file is generated/modified by WavCraft](#check-if-an-audio-file-is-generatedmodified-by-wavcraft)
27 | - [Approach](#approach)
28 | - [Acknowledgments](#acknowledgments)
29 | - [Citing](#citing)
30 |
31 | ## Usage
32 |
33 | ### Installation
34 |
35 | ```
36 | source scripts/setup_envs.sh
37 | ```
38 |
39 | ### Configure environment
40 |
41 | ```bash
42 | export OPENAI_KEY=YOUR_OPENAI_KEY
43 | export HF_KEY=YOUR_HF_KEY
44 | ```
45 |
46 | ### Launch deep learning models locally
47 |
48 | ```bash
49 | source scripts/start_services.sh
50 | ```
51 |
52 | ## Play with WavCraft
53 |
54 | ### Audio editing using a single line
55 |
56 | ```
57 | python3 WavCraft.py basic -f \
58 | --input-wav assets/duck_quacking_in_water.wav \
59 | --input-text "Add dog barking."
60 | ```
61 |
62 | ### Audio editing via interaction
63 |
64 | ```
65 | python3 WavCraft-chat.py basic -f -c
66 | [New session is created]
67 | Add audio file(s) (each file starts with '+'): +assets/duck_quacking_in_water.wav
68 | Enter your instruction (input `EXIT` to exit the process): "Add dog barking"
69 |
70 | ```
71 |
72 | ### Check if an audio file is generated/modified by WavCraft
73 |
74 | ```
75 | python3 scripts/check_watermark.py --wav-path /path/to/audio/file
76 | ```
77 |
78 | ### Use open LLMs for generation/editing
79 | ```
80 | python3 WavCraft.py basic -f \
81 | --input-wav assets/duck_quacking_in_water.wav \
82 | --input-text "Add dog barking." \
83 | --model 'mistralai/Mistral-7B-Instruct-v0.2'
84 | ```
85 |
86 | ## Approach
87 |
88 | WavCraft is an LLM-driven agent for audio content creation and editing. It uses an LLM to connect various expert audio models and DSP functions. An overview of the WavCraft architecture is shown below:
89 |
90 | ![overview](assets/overview.png)
91 |
92 | ## Disclaimer
93 |
94 | This repository is for **research purposes only**. We are not responsible for audio generated or edited using semantics created by this model. In addition, anyone who uses WavCraft must NOT disable the watermarking techniques in any way.
95 |
96 | ## Acknowledgments
97 |
98 | We appreciate [WavJourney](https://github.com/Audio-AGI/WavJourney), [AudioCraft](https://github.com/facebookresearch/audiocraft), [AudioSep](https://github.com/Audio-AGI/AudioSep), [AudioSR](https://github.com/haoheliu/versatile_audio_super_resolution), [AudioLDM](https://github.com/haoheliu/AudioLDM), [WavMark](https://github.com/wavmark/wavmark) for their amazing work.
99 |
100 | ## Citing
101 |
102 | If you find our work helpful, please cite it as:
103 | ```
104 | @misc{liang2024wavcraft,
105 | title={WavCraft: Audio Editing and Generation with Large Language Models},
106 | author={Jinhua Liang and Huan Zhang and Haohe Liu and Yin Cao and Qiuqiang Kong and Xubo Liu and Wenwu Wang and Mark D.
Plumbley and Huy Phan and Emmanouil Benetos},
107 | year={2024},
108 | eprint={2403.09527},
109 | archivePrefix={arXiv},
110 | primaryClass={eess.AS}
111 | }
112 | ```
113 |
--------------------------------------------------------------------------------
/WavCraft-chat.py:
--------------------------------------------------------------------------------
1 | import time
2 | import argparse
3 |
4 | import wavcraft.utils as utils
5 | import wavcraft.pipeline as pipeline
6 |
7 | parser = argparse.ArgumentParser()
8 | sub_parsers = parser.add_subparsers(dest="mode")
9 | # Basic mode
10 | basic_parser = sub_parsers.add_parser("basic")
11 | basic_parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
12 | basic_parser.add_argument('-c', '--chat', action='store_true', help='Chat with WavCraft')
13 | basic_parser.add_argument('--session-id', type=str, default='', help='session ID; if left empty, the system will allocate one')
14 | # Inspiration mode
15 | inspire_parser = sub_parsers.add_parser("inspiration")
16 | inspire_parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
17 | inspire_parser.add_argument('-c', '--chat', action='store_true', help='Chat with WavCraft')
18 | inspire_parser.add_argument('--session-id', type=str, default='', help='session ID; if left empty, the system will allocate one')
19 |
20 | args = parser.parse_args()
21 |
22 | if args.mode in ("basic", "inspiration"):
23 |     session_id = pipeline.init_session(args.session_id)
24 |     print(f"Session {session_id} is created.")
25 |
26 |     api_key = utils.get_api_key()
27 |     assert api_key is not None, "Please set OPENAI_KEY in your environment variables."
28 |
29 |     input_wav = []
30 |
31 |     while True:
32 |         this_turn_wav = input("Add audio file(s) (each file starts with '+'): ")
33 |         input_text = input("Enter your instruction (input `EXIT` to exit the process): ")
34 |
35 |         if input_text == "EXIT":
36 |             print("WavCraft session completed.")
37 |             break
38 |
39 |         if args.full:
40 |             this_turn_wav = this_turn_wav.split('+')
41 |             this_turn_wav = [wav.strip().strip("'").strip("\"") for wav in this_turn_wav if len(wav) > 0]
42 |             input_wav.extend(this_turn_wav)
43 |
44 |         pipeline.full_steps(session_id, input_wav, input_text, api_key, mode=args.mode)
--------------------------------------------------------------------------------
/WavCraft.py:
--------------------------------------------------------------------------------
1 | import time
2 | import argparse
3 |
4 | import wavcraft.utils as utils
5 | import wavcraft.pipeline as pipeline
6 |
7 | parser = argparse.ArgumentParser()
8 | sub_parsers = parser.add_subparsers(dest="mode", help='Type of WavCraft to use')
9 | # Basic mode
10 | basic_parser = sub_parsers.add_parser("basic")
11 | basic_parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
12 | basic_parser.add_argument('--input-wav', nargs='+', default=[], help='a list of input wave paths')
13 | basic_parser.add_argument('--input-text', type=str, help='input text or text file')
14 | # e.g. "gpt-4-0125-preview"
15 | basic_parser.add_argument('--model', type=str, default="gpt-4", help='LLM to use: an OpenAI model name, or an open LLM such as "mistralai/Mistral-7B-Instruct-v0.2"')
16 | basic_parser.add_argument('--session-id', type=str, default='', help='session ID; if left empty, the system will allocate one')
17 | # Inspiration mode
18 | inspire_parser = sub_parsers.add_parser("inspiration")
19 | inspire_parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
20 | inspire_parser.add_argument('--input-wav', nargs='+', default=[], help='a list of input wave paths')
21 | inspire_parser.add_argument('--input-text', type=str, help='input text or text file')
22 | inspire_parser.add_argument('--model', type=str, default="gpt-4", help='LLM to use: an OpenAI model name, or an open LLM such as "mistralai/Mistral-7B-Instruct-v0.2"')
23 | inspire_parser.add_argument('--session-id', type=str, default='', help='session ID; if left empty, the system will allocate one')
24 |
25 | args = parser.parse_args()
26 |
27 | if args.mode in ("basic", "inspiration"):
28 |     if args.full:
29 |         input_text = args.input_text
30 |         input_wav = args.input_wav
31 |
32 |         start_time = time.time()
33 |         session_id = pipeline.init_session(args.session_id)
34 |         api_key = utils.get_api_key()
35 |
36 |         assert api_key is not None, "Please set OPENAI_KEY in your environment variables."
37 |
38 |         print(f"Session {session_id} is created.")
39 |
40 |         pipeline.full_steps(session_id, input_wav, input_text, api_key, model=args.model, mode=args.mode)
41 |         end_time = time.time()
42 |
43 |         print(f"Audio editor took {end_time - start_time:.2f} seconds to complete.")
44 |
--------------------------------------------------------------------------------
/assets/duck_quacking_in_water.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/assets/duck_quacking_in_water.wav
--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/assets/overview.png
--------------------------------------------------------------------------------
/assets/wavcraft_icon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/assets/wavcraft_icon.jpg
--------------------------------------------------------------------------------
/scripts/check_watermark.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from wavcraft.apis import _DECODE_WATERMARK
3 |
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--wav-path", type=str, help="Path to the audio file.")
7 | args = parser.parse_args()
8 |
9 | _DECODE_WATERMARK(args.wav_path, sample_rate=16000)
--------------------------------------------------------------------------------
/scripts/continue_service.sh:
--------------------------------------------------------------------------------
1 | conda activate AudioEditor
2 | source ./scripts/chatgpt.sh
3 |
4 | mkdir -p services_logs
5 |
6 | export SERVICE_PORT=8088
7 | export SERVICE_URL=127.0.0.1
8 | export MAX_SCRIPT_LINES=999
9 |
10 | export AUDIOCRAFT_SERVICE_PORT=$((${SERVICE_PORT}+1))
11 | export AUDIOSEP_SERVICE_PORT=$((${SERVICE_PORT}+2))
12 | export AUDIOSR_SERVICE_PORT=$((${SERVICE_PORT}+3))
13 | export AUDIOLDM_SERVICE_PORT=$((${SERVICE_PORT}+4))
14 | export WAVMARK_SERVICE_PORT=$((${SERVICE_PORT}+5))
15 |
--------------------------------------------------------------------------------
/scripts/kill_services.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | # Extract the port values for each application
5 | audiocraft_service_port = int(os.environ.get('AUDIOCRAFT_SERVICE_PORT'))
6 | audiosep_service_port = int(os.environ.get('AUDIOSEP_SERVICE_PORT'))
7 | audiosr_service_port = int(os.environ.get('AUDIOSR_SERVICE_PORT'))
8 | audioldm_service_port = int(os.environ.get('AUDIOLDM_SERVICE_PORT'))
9 | wavmark_service_port = int(os.environ.get('WAVMARK_SERVICE_PORT'))
10 |
11 | # Kill whichever process is listening on each service port
12 | os.system(f'kill $(lsof -t -i :{audiocraft_service_port})')
13 | os.system(f'kill $(lsof -t -i :{audiosep_service_port})')
14 | os.system(f'kill $(lsof -t -i :{audiosr_service_port})')
15 | os.system(f'kill $(lsof -t -i :{audioldm_service_port})')
16 | os.system(f'kill $(lsof -t -i :{wavmark_service_port})')
--------------------------------------------------------------------------------
/scripts/setup_envs.sh:
--------------------------------------------------------------------------------
1 | conda env create -f venvs/audiocraft.yml
2 | conda env create -f venvs/audioldm.yml
3 | conda env create -f venvs/audiosr.yml
4 | conda env create -f venvs/wavcraft.yml
5 | # Prepare third-party repos
6 | # Comment out any of these that are unnecessary
7 | mkdir -p ext/
8 | cd ext/
9 |
10 | git clone https://github.com/haoheliu/AudioLDM.git
11 |
12 | git clone https://github.com/Audio-AGI/AudioSep.git
13 |
14 | wget https://uplex.de/audiowmark/releases/audiowmark-0.6.1.tar.gz
15 | tar -xzvf audiowmark-0.6.1.tar.gz
16 | cd audiowmark-0.6.1
17 | ./configure
18 | make
19 | make install
--------------------------------------------------------------------------------
/scripts/start_services.sh:
--------------------------------------------------------------------------------
1 | mkdir -p services_logs
2 |
3 | export SERVICE_PORT=8088
4 | export SERVICE_URL=127.0.0.1
5 | export MAX_SCRIPT_LINES=999
6 |
7 | # Start AudioCraft service
8 | source services/start_audiocraft.sh
9 | # Start AudioSep service
10 | source services/start_audiosep.sh
11 | # Start AudioSR service
12 | source services/start_audiosr.sh
13 | # Start AudioLDM service
14 | source services/start_audioldm.sh
15 | # Start WavMark service
16 | source services/start_wavmark.sh
17 | # WavCraft
18 | conda activate WavCraft
--------------------------------------------------------------------------------
/services/audiocraft_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import yaml
4 | import logging
5 | import torch
6 | import nltk
7 | import torchaudio
8 | import torchaudio.transforms as T
9 | from torchaudio.transforms import SpeedPerturbation
10 |
11 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
12 | from wavcraft.apis import _WRITE_AUDIO, _LOUDNESS_NORM
13 | from wavcraft.utils import fade, get_service_port
14 | from flask import Flask, request, jsonify
15 |
16 |
17 | with open('wavcraft/configs.yaml', 'r') as file:
18 |     config = yaml.safe_load(file)
19 |
20 | # Configure the logging format and level
21 | logging.basicConfig(
22 |     level=logging.INFO,
23 |     format='%(asctime)s - %(levelname)s - %(message)s'
24 | )
25 |
26 | # Create a FileHandler for the log file
27 | os.makedirs('services_logs', exist_ok=True)
28 | log_filename = 'services_logs/Wav-API.log'
29 | file_handler = logging.FileHandler(log_filename, mode='w')
30 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
31 |
32 | # Add the FileHandler to the root logger
33 | logging.getLogger('').addHandler(file_handler)
34 |
35 |
36 | """
37 | Initialize the AudioCraft models here
38 | """
39 | from audiocraft.models import AudioGen, MusicGen
40 | tta_model_size =
config['AudioCraft']['tta_model_size'] 41 | tta_model = AudioGen.get_pretrained(f'facebook/audiogen-{tta_model_size}') 42 | logging.info(f'AudioGen ({tta_model_size}) is loaded ...') 43 | 44 | ttm_model_size = config['AudioCraft']['ttm_model_size'] 45 | ttm_model = MusicGen.get_pretrained(f'facebook/musicgen-{ttm_model_size}') 46 | logging.info(f'MusicGen ({ttm_model_size}) is loaded ...') 47 | 48 | 49 | """ 50 | Initialize the BarkModel here 51 | """ 52 | from transformers import BarkModel, AutoProcessor 53 | import json 54 | 55 | # Load voice map 56 | with open("wavcraft/voice_preset/voice_map.json", 'r') as f: 57 | voice_map = json.load(f) 58 | 59 | SPEED = float(config['Text-to-Speech']['speed']) 60 | speed_perturb = SpeedPerturbation(32000, [SPEED]) 61 | tts_model = BarkModel.from_pretrained("suno/bark") 62 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 63 | tts_model = tts_model.to(device) 64 | tts_model = tts_model.to_bettertransformer() # Flash attention 65 | SAMPLE_RATE = tts_model.generation_config.sample_rate 66 | SEMANTIC_TEMPERATURE = 0.9 67 | COARSE_TEMPERATURE = 0.5 68 | FINE_TEMPERATURE = 0.5 69 | processor = AutoProcessor.from_pretrained("suno/bark") 70 | logging.info('Bark model is loaded ...') 71 | 72 | 73 | app = Flask(__name__) 74 | 75 | 76 | @app.route('/generate_audio', methods=['POST']) 77 | def generate_audio(): 78 | # Receive the text from the POST request 79 | data = request.json 80 | text = data['text'] 81 | length = float(data.get('length', 5.0)) 82 | volume = float(data.get('volume', -35)) 83 | output_wav = data.get('output_wav', 'out.wav') 84 | 85 | logging.info(f'TTA (AudioGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB') 86 | 87 | try: 88 | tta_model.set_generation_params(duration=length) 89 | wav = tta_model.generate([text]) 90 | wav = torchaudio.functional.resample(wav, orig_freq=16000, new_freq=32000) 91 | 92 | wav = wav.squeeze().cpu().detach().numpy() 93 | wav = fade(_LOUDNESS_NORM(wav, volume=volume)) 94 | _WRITE_AUDIO(wav, name=output_wav) 95 | 96 | # Return success message and the filename of the generated audio 97 | return jsonify({'message': f'Text-to-Audio generated successfully | {text}', 'file': output_wav}) 98 | 99 | except Exception as e: 100 | return jsonify({'API error': str(e)}), 500 101 | 102 | 103 | @app.route('/generate_music', methods=['POST']) 104 | def generate_music(): 105 | # Receive the text from the POST request 106 | data = request.json 107 | text = data['text'] 108 | melody_path = data.get('melody', None) 109 | length = float(data.get('length', 5.0)) 110 | volume = float(data.get('volume', -35)) 111 | sample_rate = int(data.get('sr', 32000)) 112 | output_wav = data.get('output_wav', 'out.wav') 113 | 114 | logging.info(f'TTM (MusicGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB') 115 | 116 | 117 | try: 118 | ttm_model.set_generation_params(duration=length) 119 | 120 | if melody_path is None: 121 | print("Use generate") 122 | wav = ttm_model.generate([text]) 123 | 124 | else: 125 | print("Use generate_with_chroma") 126 | melody, sr = torchaudio.load(melody_path) 127 | # Resample the audio if sr does not match sample_rate 128 | if sr != sample_rate: 129 | resampler = T.Resample(sr, sample_rate, dtype=melody.dtype) 130 | melody = resampler(melody) 131 | # Generates using the melody from the given audio and the provided descriptions. 
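            # torchaudio.load returns a (channels, samples) tensor, so melody[None]
            # adds a batch dimension and .expand(1, -1, -1) keeps an explicit
            # (1, channels, samples) view for the single text prompt below.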
132 |             wav = ttm_model.generate_with_chroma([text], melody[None].expand(1, -1, -1), sample_rate)
133 |
134 |         wav = wav[0][0].cpu().detach().numpy()
135 |         wav = fade(_LOUDNESS_NORM(wav, volume=volume))
136 |         _WRITE_AUDIO(wav, name=output_wav)
137 |
138 |         # Return success message and the filename of the generated audio
139 |         return jsonify({'message': f'Text-to-Music generated successfully | {text}', 'file': output_wav})
140 |
141 |     except Exception as e:
142 |         # Return error message if something goes wrong
143 |         return jsonify({'API error': str(e)}), 500
144 |
145 |
146 | @app.route('/generate_speech', methods=['POST'])
147 | def generate_speech():
148 |     # Receive the text from the POST request
149 |     data = request.json
150 |     text = data['text']
151 |     speaker_id = data['speaker_id']
152 |     volume = float(data.get('volume', -35))
153 |     output_wav = data.get('output_wav', 'out.wav')
154 |
155 |     speaker_npz = voice_map[speaker_id]["npz_path"]
156 |
157 |     logging.info(f'TTS (Bark): Speaker: {speaker_id}, Volume: {volume} dB, Prompt: {text}')
158 |
159 |     try:
160 |         # Generate speech sentence-by-sentence with the global Bark model
161 |         text = text.replace('\n', ' ').strip()
162 |         sentences = nltk.sent_tokenize(text)
163 |         silence = torch.zeros(int(0.1 * SAMPLE_RATE), device=device).unsqueeze(0)  # 0.1 second of silence
164 |
165 |         pieces = []
166 |         for sentence in sentences:
167 |             inputs = processor(sentence, voice_preset=speaker_npz).to(device)
168 |             # NOTE: you must run the line below, otherwise you will see the runtime error
169 |             # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
170 |             inputs['history_prompt']['coarse_prompt'] = inputs['history_prompt']['coarse_prompt'].transpose(0, 1).contiguous().transpose(0, 1)
171 |
172 |             with torch.inference_mode():
173 |                 # TODO: min_eos_p?
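                # Bark decodes in three stages (semantic -> coarse -> fine
                # codebooks); the temperatures below set the sampling
                # randomness of each stage independently.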
174 | output = tts_model.generate( 175 | **inputs, 176 | do_sample = True, 177 | semantic_temperature = SEMANTIC_TEMPERATURE, 178 | coarse_temperature = COARSE_TEMPERATURE, 179 | fine_temperature = FINE_TEMPERATURE 180 | ) 181 | 182 | pieces += [output, silence] 183 | 184 | result_audio = torch.cat(pieces, dim=1) 185 | wav_tensor = result_audio.to(dtype=torch.float32).cpu() 186 | wav = torchaudio.functional.resample(wav_tensor, orig_freq=SAMPLE_RATE, new_freq=32000) 187 | wav = speed_perturb(wav.float())[0].squeeze(0) 188 | wav = wav.numpy() 189 | wav = _LOUDNESS_NORM(wav, volume=volume) 190 | _WRITE_AUDIO(wav, name=output_wav) 191 | 192 | # Return success message and the filename of the generated audio 193 | return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav}) 194 | 195 | except Exception as e: 196 | # Return error message if something goes wrong 197 | return jsonify({'API error': str(e)}), 500 198 | 199 | 200 | if __name__ == '__main__': 201 | service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT") 202 | # We disable multithreading to force services to process one request at a time and avoid CUDA OOM 203 | app.run(debug=False, threaded=False, port=service_port) -------------------------------------------------------------------------------- /services/audioldm_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | import math 5 | import logging 6 | import librosa 7 | import numpy as np 8 | from flask import Flask, request, jsonify 9 | from scipy.io.wavfile import write 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 12 | from wavcraft.utils import get_service_port 13 | from audioldm import build_model, super_resolution_and_inpainting 14 | 15 | 16 | CACHE_DIR = os.getenv( 17 | "AUDIOLDM_CACHE_DIR", 18 | os.path.join(os.path.expanduser("~"), ".cache/audioldm")) 19 | 20 | EPS = 1e-5 21 | 22 | with open('wavcraft/configs.yaml', 'r') as file: 23 | config = yaml.safe_load(file) 24 | 25 | # Configure the logging format and level 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format='%(asctime)s - %(levelname)s - %(message)s' 29 | ) 30 | 31 | # Create a FileHandler for the log file 32 | os.makedirs('services_logs', exist_ok=True) 33 | log_filename = 'services_logs/Wav-API.log' 34 | file_handler = logging.FileHandler(log_filename, mode='w') 35 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) 36 | 37 | # Add the FileHandler to the root logger 38 | logging.getLogger('').addHandler(file_handler) 39 | 40 | 41 | audioldm = build_model(model_name=config["AudioLDM"]["model_size"]) 42 | logging.info('AudioLDM is loaded ...') 43 | 44 | 45 | app = Flask(__name__) 46 | 47 | 48 | @app.route('/audio_inpaint', methods=['POST']) 49 | def audio_inpaint(): 50 | # Receive the text from the POST request 51 | data = request.json 52 | wav_path = data['wav_path'] 53 | text = data["text"] 54 | duration = data["duration"] + EPS # avoid zero division 55 | onset = data["onset"] / duration 56 | offset = data["offset"] / duration 57 | 58 | sample_rate = data.get('sample_rate', 32000) 59 | guidance_scale = data.get('guidance_scale', 2.5) 60 | ddim_steps = data.get('ddim_steps', 200) 61 | random_seed = data.get('seed', 42) 62 | output_wav = data.get('output_wav', 'out.wav') 63 | logging.info(f"Inpaint {wav_path} with the input '{text}'...") 64 | 65 | try: 66 | # target_duration = math.ceil(data["duration"] 
/ 2.5) * 2.5
67 |         target_duration = data["duration"]
68 |         waveform = super_resolution_and_inpainting(
69 |             audioldm,
70 |             text,  # The text prompt for inpainting generation
71 |             wav_path,  # This audio will be padded to 10.242 seconds before inpainting
72 |             time_mask_ratio_start_and_end=(onset, offset),  # This is a ratio for inpainting at a scale of 10.242 seconds
73 |             seed=random_seed,
74 |             duration=target_duration,
75 |             guidance_scale=guidance_scale,
76 |             ddim_steps=ddim_steps,
77 |             n_candidate_gen_per_text=1,
78 |             batchsize=1,
79 |         )
80 |
81 |         if sample_rate != 16000:
82 |             waveform = librosa.resample(waveform, orig_sr=16000, target_sr=sample_rate)
83 |         # Write audio to `output_wav` with `sample_rate`
84 |         write(output_wav, sample_rate, np.round(waveform[:int(duration*sample_rate)] * 32767).astype(np.int16))
85 |
86 |         # Return success message and the filename of the generated audio
87 |         return jsonify({'message': f'Successfully infilled {data["onset"]}-{data["offset"]}s of content in {wav_path}'})
88 |
89 |     except Exception as e:
90 |         # Return error message if something goes wrong
91 |         return jsonify({'API error': str(e)}), 500
92 |
93 |
94 | if __name__ == '__main__':
95 |     service_port = get_service_port("AUDIOLDM_SERVICE_PORT")
96 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
97 |     app.run(debug=False, threaded=False, port=service_port)
--------------------------------------------------------------------------------
/services/audiosep_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import yaml
4 | import logging
5 | import torch
6 | import librosa
7 | import numpy as np
8 | from flask import Flask, request, jsonify
9 | from scipy.io.wavfile import write
10 |
11 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
12 | from wavcraft.utils import get_service_port
13 |
14 |
15 | with open('wavcraft/configs.yaml', 'r') as file:
16 |     config = yaml.safe_load(file)
17 |
18 | # Configure the logging format and level
19 | logging.basicConfig(
20 |     level=logging.INFO,
21 |     format='%(asctime)s - %(levelname)s - %(message)s'
22 | )
23 |
24 | # Create a FileHandler for the log file
25 | os.makedirs('services_logs', exist_ok=True)
26 | log_filename = 'services_logs/Wav-API.log'
27 | file_handler = logging.FileHandler(log_filename, mode='w')
28 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
29 |
30 | # Add the FileHandler to the root logger
31 | logging.getLogger('').addHandler(file_handler)
32 |
33 |
34 | """
35 | Initialize the AudioSep model here
36 | """
37 | def inference(model, audio_file, text, output_file, device='cuda', use_chunk=False):
38 |     print(f'Separate audio from [{audio_file}] with textual query [{text}]')
39 |     mixture, fs = librosa.load(audio_file, sr=32000, mono=True)
40 |     with torch.no_grad():
41 |         text = [text]
42 |
43 |         conditions = model.query_encoder.get_query_embed(
44 |             modality='text',
45 |             text=text,
46 |             device=device
47 |         )
48 |
49 |         input_dict = {
50 |             "mixture": torch.Tensor(mixture)[None, None, :].to(device),
51 |             "condition": conditions,
52 |         }
53 |
54 |         if use_chunk:
55 |             foreground = model.ss_model.chunk_inference(input_dict)
56 |             foreground = np.squeeze(foreground)
57 |         else:
58 |             foreground = model.ss_model(input_dict)["waveform"]
59 |             foreground = foreground.squeeze(0).squeeze(0).data.cpu().numpy()
60 |
61 |         background = mixture - foreground
62 |
63 |     filedir, filename = os.path.split(output_file)
64 |     fg_filepath = os.path.join(filedir, "fg_"+filename)
65 |     bg_filepath = os.path.join(filedir, "bg_"+filename)
66 |
67 |     write(fg_filepath, 32000, np.round(foreground * 32767).astype(np.int16))
68 |     print(f'Write separated audio to [{fg_filepath}]')
69 |
70 |     write(bg_filepath, 32000, np.round(background * 32767).astype(np.int16))
71 |     print(f'Write the background audio to [{bg_filepath}]')
72 |
73 | import sys
74 | sys.path.append("ext/AudioSep")
75 | from ss_pipeline import build_audiosep
76 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
77 | ss = build_audiosep(
78 |     config_yaml='ext/AudioSep/config/audiosep_base.yaml',
79 |     checkpoint_path='ext/AudioSep/checkpoint/audiosep_base_4M_steps.ckpt',
80 |     device=device)
81 |
82 | logging.info('AudioSep is loaded ...')
83 |
84 |
85 | app = Flask(__name__)
86 |
87 |
88 | @app.route('/source_separate', methods=['POST'])
89 | def source_separate():
90 |     # Receive the text from the POST request
91 |     data = request.json
92 |     wav_path = data['wav_path']
93 |     text = data["text"]
94 |     output_wav = data.get('output_wav', 'out.wav')
95 |
96 |     logging.info(f"Separate '{text}' from {wav_path} ...")
97 |
98 |     try:
99 |         inference(ss, wav_path, text, output_wav, device)
100 |
101 |         # Return success message and the filename of the generated audio
102 |         return jsonify({'message': f'Successful separation from {wav_path}'})
103 |
104 |     except Exception as e:
105 |         # Return error message if something goes wrong
106 |         return jsonify({'API error': str(e)}), 500
107 |
108 |
109 | if __name__ == '__main__':
110 |     service_port = get_service_port("AUDIOSEP_SERVICE_PORT")
111 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
112 |     app.run(debug=False, threaded=False, port=service_port)
--------------------------------------------------------------------------------
/services/audiosr_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import yaml
4 | import random
5 | import logging
6 | import torch
7 | import numpy as np
8 | import soundfile as sf
9 | from flask import Flask, request, jsonify
10 | from cog import BasePredictor, Input, Path
11 | from audiosr import build_model
12 | from audiosr import super_resolution as _super_resolution
13 |
14 |
15 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
16 | from wavcraft.apis import _WRITE_AUDIO
17 | from wavcraft.utils import fade, get_service_port
18 |
19 |
20 | os.environ["TOKENIZERS_PARALLELISM"] = "true"
21 | torch.set_float32_matmul_precision("high")
22 |
23 | with open('wavcraft/configs.yaml', 'r') as file:
24 |     config = yaml.safe_load(file)
25 |
26 | # Configure the logging format and level
27 | logging.basicConfig(
28 |     level=logging.INFO,
29 |     format='%(asctime)s - %(levelname)s - %(message)s'
30 | )
31 |
32 | # Create a FileHandler for the log file
33 | os.makedirs('services_logs', exist_ok=True)
34 | log_filename = 'services_logs/Wav-API.log'
35 | file_handler = logging.FileHandler(log_filename, mode='w')
36 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
37 |
38 | # Add the FileHandler to the root logger
39 | logging.getLogger('').addHandler(file_handler)
40 |
41 |
42 | """
43 | Initialize the AudioSR models here
44 | """
45 | class Predictor(BasePredictor):
46 |     def setup(self, model_name="basic", device="auto"):
47 |         self.model_name = model_name
48 |         self.device = device
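        # AudioSR always renders its output at 48 kHz; sf.write below saves at the same rate.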
49 |         self.sr = 48000
50 |         self.audiosr = build_model(model_name=self.model_name, device=self.device)
51 |
52 |     def predict(self,
53 |                 input_file: Path = Input(description="Audio to upsample"),
54 |                 output_path: Path = Input(description="Path to output audio"),
55 |                 ddim_steps: int = Input(description="Number of inference steps", default=50, ge=10, le=500),
56 |                 guidance_scale: float = Input(description="Scale for classifier free guidance", default=3.5, ge=1.0, le=20.0),
57 |                 seed: int = Input(description="Random seed. Leave blank to randomize the seed", default=None),
58 |                 ) -> np.ndarray:
59 |         """Run a single prediction on the model"""
60 |         if seed is None:
61 |             seed = random.randint(0, 2**32 - 1)
62 |             print(f"Setting seed to: {seed}")
63 |
64 |         waveform = _super_resolution(
65 |             self.audiosr,
66 |             input_file,
67 |             seed=seed,
68 |             guidance_scale=guidance_scale,
69 |             ddim_steps=ddim_steps,
70 |             latent_t_per_second=12.8
71 |         )
72 |         out_wav = (waveform[0] * 32767).astype(np.int16).T
73 |
74 |         sf.write(output_path, data=out_wav, samplerate=48000)
75 |         return Path(output_path)
76 |
77 | sr_model = Predictor()
78 | sr_model.setup()
79 | logging.info('AudioSR model is loaded ...')
80 |
81 |
82 | app = Flask(__name__)
83 |
84 |
85 | @app.route('/super_resolution', methods=['POST'])
86 | def super_resolution():
87 |     # Receive the text from the POST request
88 |     data = request.json
89 |     wav_path = data['wav_path']
90 |     ddim_steps = int(data.get('ddim_steps', 50))
91 |     guidance_scale = float(data.get('guidance_scale', 3.5))
92 |     seed = int(data.get('seed', 42))
93 |     output_wav = data.get('output_wav', 'out.wav')
94 |
95 |     logging.info(f"Super resolution: ddim_steps: {ddim_steps}, guidance_scale: {guidance_scale}.")
96 |
97 |     try:
98 |         sr_model.predict(
99 |             wav_path,
100 |             output_path=output_wav,
101 |             ddim_steps=ddim_steps,
102 |             guidance_scale=guidance_scale,
103 |             seed=seed,
104 |         )
105 |         # Return success message and the filename of the generated audio
106 |         return jsonify({'message': 'Audio super resolution generated successfully', 'file': f"{output_wav}"})
107 |
108 |     except Exception as e:
109 |         return jsonify({'API error': str(e)}), 500
110 |
111 |
112 | if __name__ == '__main__':
113 |     service_port = get_service_port("AUDIOSR_SERVICE_PORT")
114 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
115 |     app.run(debug=False, threaded=False, port=service_port)
--------------------------------------------------------------------------------
/services/start_audiocraft.sh:
--------------------------------------------------------------------------------
1 | export AUDIOCRAFT_SERVICE_PORT=$((${SERVICE_PORT}+1))
2 |
3 | conda activate AudioCraft
4 | nohup python3 services/audiocraft_service.py > services_logs/audiocraft.out 2>&1 &
5 | echo "AudioCraft is loaded successfully."
--------------------------------------------------------------------------------
/services/start_audioldm.sh:
--------------------------------------------------------------------------------
1 | export AUDIOLDM_SERVICE_PORT=$((${SERVICE_PORT}+4))
2 |
3 | conda activate AudioInpainting
4 | nohup python3 services/audioldm_service.py > services_logs/audioldm.out 2>&1 &
5 | echo "AudioLDM is loaded successfully."
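Each `start_*.sh` script launches one model as a Flask HTTP service on `SERVICE_PORT` plus a fixed offset (the ports are exported in `scripts/start_services.sh`). As a minimal client sketch of how these endpoints are called, assuming the default `SERVICE_PORT=8088` (so the AudioCraft service listens on 8089) and that the services are already running:

```python
# Hypothetical client for the AudioCraft service; the endpoint name and JSON
# fields mirror the /generate_audio route in services/audiocraft_service.py.
import requests

resp = requests.post(
    "http://127.0.0.1:8089/generate_audio",  # SERVICE_PORT (8088) + 1
    json={
        "text": "dog barking",    # prompt passed to AudioGen
        "length": 5.0,            # duration in seconds
        "volume": -35,            # target loudness in dB
        "output_wav": "out.wav",  # path the service writes the result to
    },
)
print(resp.json())  # on success: a message plus the output file name
```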
--------------------------------------------------------------------------------
/services/start_audiosep.sh:
--------------------------------------------------------------------------------
1 | export AUDIOSEP_SERVICE_PORT=$((${SERVICE_PORT}+2))
2 |
3 | conda activate AudioEditor
4 | nohup python3 services/audiosep_service.py > services_logs/audiosep.out 2>&1 &
5 | echo "AudioSep is loaded successfully."
--------------------------------------------------------------------------------
/services/start_audiosr.sh:
--------------------------------------------------------------------------------
1 | export AUDIOSR_SERVICE_PORT=$((${SERVICE_PORT}+3))
2 |
3 | conda activate AudioSR
4 | nohup python3 services/audiosr_service.py > services_logs/audiosr.out 2>&1 &
5 | echo "AudioSR is loaded successfully."
--------------------------------------------------------------------------------
/services/start_wavmark.sh:
--------------------------------------------------------------------------------
1 | export WAVMARK_SERVICE_PORT=$((${SERVICE_PORT}+5))
2 |
3 | conda activate AudioEditor
4 | nohup python3 services/wavmark_service.py > services_logs/wavmark_service.out 2>&1 &
5 | echo "WavMark is loaded successfully."
--------------------------------------------------------------------------------
/services/wavmark_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import yaml
4 | import logging
5 | import librosa
6 | import soundfile
7 | import torch
8 | import wavmark
9 | import numpy as np
10 | from flask import Flask, request, jsonify
11 | from scipy.io.wavfile import write
12 |
13 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
14 | from wavcraft.utils import get_service_port
15 |
16 |
17 | with open('wavcraft/configs.yaml', 'r') as file:
18 |     config = yaml.safe_load(file)
19 |
20 | # Configure the logging format and level
21 | logging.basicConfig(
22 |     level=logging.INFO,
23 |     format='%(asctime)s - %(levelname)s - %(message)s'
24 | )
25 |
26 | # Create a FileHandler for the log file
27 | os.makedirs('services_logs', exist_ok=True)
28 | log_filename = 'services_logs/Wav-API.log'
29 | file_handler = logging.FileHandler(log_filename, mode='w')
30 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
31 |
32 | # Add the FileHandler to the root logger
33 | logging.getLogger('').addHandler(file_handler)
34 |
35 |
36 | # Audio watermark payload reserved by WavCraft
37 | payload = np.array([0,1,0,1,0,1,1,1,0,1,0,0,0,0,1,1])
38 | model = wavmark.load_model().to("cuda" if torch.cuda.is_available() else 'cpu')
39 | logging.info('WavMark is loaded ...')
40 |
41 |
42 | app = Flask(__name__)
43 |
44 |
45 | @app.route('/audio_watermark', methods=['POST'])
46 | def audio_watermark():
47 |     # Receive the text from the POST request
48 |     data = request.json
49 |     wav_path = data['wav_path']
50 |     sample_rate = data.get('sample_rate', 16000)
51 |     action = data.get('action', "encode")
52 |     output_wav = data.get('output_wav', 'out.wav')
53 |     logging.info(f"{action} watermark with {wav_path}...")
54 |
55 |     # The audio should be a single-channel 16kHz wav; you can read it using soundfile:
56 |     signal, sr = soundfile.read(wav_path)
57 |     assert sr == sample_rate, "WavMark uses 16kHz audio only!"
58 | # Otherwise, you can use the following function to convert the host audio to single-channel 16kHz format: 59 | # from wavmark.utils import file_reader 60 | # signal = file_reader.read_as_single_channel(wav_path, aim_sr=sample_rate) 61 | 62 | try: 63 | assert action in ("encode", "decode") 64 | if action == "encode": 65 | watermarked_signal, _ = wavmark.encode_watermark(model, signal, payload, show_progress=True) 66 | # you can save it as a new wav: 67 | soundfile.write(output_wav, watermarked_signal, sample_rate) 68 | else: 69 | payload_decoded, _ = wavmark.decode_watermark(model, signal, show_progress=True) 70 | confidence_score = (payload == payload_decoded).mean() * 100 # percentage of matching payload bits 71 | if confidence_score < 50: 72 | logging.info(f"Audio file {wav_path} is not generated by WavCraft.") 73 | else: 74 | logging.info(f"Audio file {wav_path} is generated by WavCraft.") 75 | 76 | # Return success message and the filename of the generated audio 77 | return jsonify({'message': f"Successful {action} watermark with {wav_path}..."}) 78 | 79 | except Exception as e: 80 | # Return error message if something goes wrong 81 | return jsonify({'API error': str(e)}), 500 82 | 83 | 84 | if __name__ == '__main__': 85 | service_port = get_service_port("WAVMARK_SERVICE_PORT") 86 | # We disable multithreading to force services to process one request at a time and avoid CUDA OOM 87 | app.run(debug=False, threaded=False, port=service_port) -------------------------------------------------------------------------------- /venvs/audiocraft.yml: -------------------------------------------------------------------------------- 1 | name: AudioCraft 2 | channels: 3 | - nvidia/label/cuda-11.8.0 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=2_gnu 9 | - bzip2=1.0.8=h7f98852_4 10 | - ca-certificates=2023.05.30=h06a4308_0 11 | - cuda-cccl=11.8.89=0 12 | - cuda-command-line-tools=11.8.0=0 13 | - cuda-compiler=11.8.0=0 14 | - cuda-cudart=11.8.89=0 15 | - cuda-cudart-dev=11.8.89=0 16 | - cuda-cuobjdump=11.8.86=0 17 | - cuda-cupti=11.8.87=0 18 | - cuda-cuxxfilt=11.8.86=0 19 | - cuda-documentation=11.8.86=0 20 | - cuda-driver-dev=11.8.89=0 21 | - cuda-gdb=11.8.86=0 22 | - cuda-libraries=11.8.0=0 23 | - cuda-libraries-dev=11.8.0=0 24 | - cuda-memcheck=11.8.86=0 25 | - cuda-nsight=11.8.86=0 26 | - cuda-nsight-compute=11.8.0=0 27 | - cuda-nvcc=11.8.89=0 28 | - cuda-nvdisasm=11.8.86=0 29 | - cuda-nvml-dev=11.8.86=0 30 | - cuda-nvprof=11.8.87=0 31 | - cuda-nvprune=11.8.86=0 32 | - cuda-nvrtc=11.8.89=0 33 | - cuda-nvrtc-dev=11.8.89=0 34 | - cuda-nvtx=11.8.86=0 35 | - cuda-nvvp=11.8.87=0 36 | - cuda-profiler-api=11.8.86=0 37 | - cuda-sanitizer-api=11.8.86=0 38 | - cuda-toolkit=11.8.0=0 39 | - cuda-tools=11.8.0=0 40 | - cuda-visual-tools=11.8.0=0 41 | - gds-tools=1.4.0.31=0 42 | - ld_impl_linux-64=2.40=h41732ed_0 43 | - libcublas=11.11.3.6=0 44 | - libcublas-dev=11.11.3.6=0 45 | - libcufft=10.9.0.58=0 46 | - libcufft-dev=10.9.0.58=0 47 | - libcufile=1.4.0.31=0 48 | - libcufile-dev=1.4.0.31=0 49 | - libcurand=10.3.0.86=0 50 | - libcurand-dev=10.3.0.86=0 51 | - libcusolver=11.4.1.48=0 52 | - libcusolver-dev=11.4.1.48=0 53 | - libcusparse=11.7.5.86=0 54 | - libcusparse-dev=11.7.5.86=0 55 | - libffi=3.4.2=h7f98852_5 56 | - libgcc-ng=13.1.0=he5830b7_0 57 | - libgomp=13.1.0=he5830b7_0 58 | - libnpp=11.8.0.86=0 59 | - libnpp-dev=11.8.0.86=0 60 | - libnsl=2.0.0=h7f98852_0 61 | - libnvjpeg=11.9.0.86=0 62 | - libnvjpeg-dev=11.9.0.86=0 63 | - libsqlite=3.42.0=h2797004_0 64 | -
libuuid=2.38.1=h0b41bf4_0 65 | - libzlib=1.2.13=hd590300_5 66 | - ncurses=6.4=hcb278e6_0 67 | - nsight-compute=2022.3.0.22=0 68 | - openssl=3.1.1=hd590300_1 69 | - pip=23.1.2=pyhd8ed1ab_0 70 | - python=3.8.17=he550d4f_0_cpython 71 | - readline=8.2=h8228510_1 72 | - setuptools=68.0.0=pyhd8ed1ab_0 73 | - tk=8.6.12=h27826a3_0 74 | - wheel=0.40.0=pyhd8ed1ab_0 75 | - xz=5.2.6=h166bdaf_0 76 | - pip: 77 | - aiofiles==23.1.0 78 | - aiohttp==3.8.4 79 | - aiosignal==1.3.1 80 | - altair==5.0.1 81 | - antlr4-python3-runtime==4.9.3 82 | - anyio==3.7.1 83 | - async-timeout==4.0.2 84 | - attrs==23.1.0 85 | - audiocraft==1.0.0 86 | - audiomentations==0.34.1 87 | - audioread==3.0.0 88 | - av==10.0.0 89 | - blinker==1.6.2 90 | - blis==0.7.9 91 | - catalogue==2.0.8 92 | - certifi==2023.5.7 93 | - cffi==1.15.1 94 | - charset-normalizer==3.2.0 95 | - click==8.1.5 96 | - cloudpickle==2.2.1 97 | - cmake==3.26.4 98 | - cog==0.8.6 99 | - coloredlogs==15.0.1 100 | - colorlog==6.7.0 101 | - confection==0.1.0 102 | - contourpy==1.1.0 103 | - cycler==0.12.1 104 | - cymem==2.0.7 105 | - cython==0.29.36 106 | - datasets==2.14.6 107 | - demucs==4.0.0 108 | - diffq==0.2.4 109 | - dill==0.3.7 110 | - distro==1.8.0 111 | - docopt==0.6.2 112 | - dora-search==0.1.12 113 | - einops==0.6.1 114 | - encodec==0.1.1 115 | - exceptiongroup==1.1.2 116 | - fastapi==0.98.0 117 | - ffmpy==0.3.0 118 | - filelock==3.12.2 119 | - flashy==0.0.2 120 | - flask==2.3.2 121 | - fonttools==4.41.0 122 | - frozenlist==1.4.0 123 | - fsspec==2023.6.0 124 | - future==0.18.3 125 | - gradio==3.36.1 126 | - gradio-client==0.2.9 127 | - h11==0.14.0 128 | - httpcore==0.17.3 129 | - httptools==0.6.1 130 | - httpx==0.24.1 131 | - huggingface-hub==0.16.4 132 | - humanfriendly==10.0 133 | - hydra-colorlog==1.2.0 134 | - hydra-core==1.3.2 135 | - idna==3.4 136 | - importlib-metadata==6.8.0 137 | - importlib-resources==6.0.0 138 | - itsdangerous==2.1.2 139 | - joblib==1.3.1 140 | - jsonschema==4.18.3 141 | - jsonschema-specifications==2023.6.1 142 | - julius==0.2.7 143 | - kiwisolver==1.4.4 144 | - lameenc==1.5.1 145 | - langcodes==3.3.0 146 | - lazy-loader==0.3 147 | - librosa==0.10.0.post2 148 | - lightning-utilities==0.9.0 149 | - linkify-it-py==2.0.2 150 | - lit==16.0.6 151 | - llvmlite==0.40.1 152 | - markdown-it-py==2.2.0 153 | - markupsafe==2.1.3 154 | - matplotlib==3.7.2 155 | - mdit-py-plugins==0.3.3 156 | - mdurl==0.1.2 157 | - mpmath==1.3.0 158 | - msgpack==1.0.5 159 | - multidict==6.0.4 160 | - multiprocess==0.70.15 161 | - murmurhash==1.0.9 162 | - mypy-extensions==1.0.0 163 | - networkx==3.1 164 | - nltk==3.8.1 165 | - num2words==0.5.12 166 | - numba==0.57.1 167 | - numpy==1.24.4 168 | - nvidia-cublas-cu11==11.10.3.66 169 | - nvidia-cuda-cupti-cu11==11.7.101 170 | - nvidia-cuda-nvrtc-cu11==11.7.99 171 | - nvidia-cuda-runtime-cu11==11.7.99 172 | - nvidia-cudnn-cu11==8.5.0.96 173 | - nvidia-cufft-cu11==10.9.0.58 174 | - nvidia-curand-cu11==10.2.10.91 175 | - nvidia-cusolver-cu11==11.4.0.1 176 | - nvidia-cusparse-cu11==11.7.4.91 177 | - nvidia-nccl-cu11==2.14.3 178 | - nvidia-nvtx-cu11==11.7.91 179 | - omegaconf==2.3.0 180 | - openai==1.3.6 181 | - openunmix==1.2.1 182 | - optimum==1.14.0 183 | - orjson==3.9.2 184 | - packaging==23.1 185 | - pandas==2.0.3 186 | - pathy==0.10.2 187 | - pillow==10.0.0 188 | - pkgutil-resolve-name==1.3.10 189 | - pooch==1.6.0 190 | - preshed==3.0.8 191 | - protobuf==3.20.3 192 | - pyarrow==14.0.1 193 | - pydantic==1.10.11 194 | - pydub==0.25.1 195 | - pygments==2.15.1 196 | - pyloudnorm==0.1.1 197 | - pyparsing==3.0.9 
198 | - pyre-extensions==0.0.29 199 | - python-dotenv==1.0.0 200 | - python-multipart==0.0.6 201 | - pytz==2023.3 202 | - pyyaml==6.0.1 203 | - referencing==0.29.1 204 | - regex==2023.6.3 205 | - requests==2.31.0 206 | - retrying==1.3.4 207 | - rpds-py==0.8.10 208 | - safetensors==0.3.1 209 | - scikit-learn==1.3.0 210 | - scipy==1.10.1 211 | - semantic-version==2.10.0 212 | - sentencepiece==0.1.99 213 | - smart-open==6.3.0 214 | - sniffio==1.3.0 215 | - soundfile==0.12.1 216 | - soxr==0.3.5 217 | - spacy==3.5.2 218 | - spacy-legacy==3.0.12 219 | - spacy-loggers==1.0.4 220 | - srsly==2.4.6 221 | - starlette==0.27.0 222 | - structlog==23.2.0 223 | - submitit==1.4.5 224 | - sympy==1.12 225 | - thinc==8.1.10 226 | - threadpoolctl==3.2.0 227 | - tokenizers==0.13.3 228 | - toolz==0.12.0 229 | - torch==2.0.1 230 | - torchaudio==2.0.2 231 | - torchmetrics==1.0.1 232 | - tqdm==4.65.0 233 | - transformers==4.31.0 234 | - treetable==0.2.5 235 | - triton==2.0.0 236 | - typer==0.7.0 237 | - typing-extensions==4.7.1 238 | - typing-inspect==0.9.0 239 | - tzdata==2023.3 240 | - uc-micro-py==1.0.2 241 | - urllib3==2.0.3 242 | - uvicorn==0.22.0 243 | - uvloop==0.19.0 244 | - wasabi==1.1.2 245 | - watchfiles==0.21.0 246 | - websockets==11.0.3 247 | - werkzeug==2.3.6 248 | - xformers==0.0.20 249 | - xxhash==3.4.1 250 | - yarl==1.9.2 251 | - zipp==3.16.2 252 | prefix: /homes/jl009/.conda/envs/AudioCraft 253 | 254 | -------------------------------------------------------------------------------- /venvs/audioldm.yml: -------------------------------------------------------------------------------- 1 | name: AudioInpainting 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - blas=1.0=mkl 9 | - brotli-python=1.0.9=py38heb0550a_2 10 | - bzip2=1.0.8=h7b6447c_0 11 | - ca-certificates=2023.08.22=h06a4308_0 12 | - certifi=2023.11.17=py38h06a4308_0 13 | - click=8.1.7=py38h06a4308_0 14 | - cryptography=41.0.3=py38h130f0dd_0 15 | - cuda-cudart=11.7.99=0 16 | - cuda-cupti=11.7.101=0 17 | - cuda-libraries=11.7.1=0 18 | - cuda-nvrtc=11.7.99=0 19 | - cuda-nvtx=11.7.91=0 20 | - cuda-runtime=11.7.1=0 21 | - ffmpeg=4.3=hf484d3e_0 22 | - flask=2.2.2=py38h06a4308_0 23 | - freetype=2.11.0=h70c0345_0 24 | - giflib=5.2.1=h7b6447c_0 25 | - gmp=6.2.1=h295c915_3 26 | - gnutls=3.6.15=he1e5248_0 27 | - intel-openmp=2021.4.0=h06a4308_3561 28 | - itsdangerous=2.0.1=pyhd3eb1b0_0 29 | - jinja2=3.1.2=py38h06a4308_0 30 | - jpeg=9e=h7f8727e_0 31 | - lame=3.100=h7b6447c_0 32 | - lcms2=2.12=h3be6417_0 33 | - ld_impl_linux-64=2.38=h1181459_1 34 | - libcublas=11.10.3.66=0 35 | - libcufft=10.7.2.124=h4fbf590_0 36 | - libcufile=1.8.1.2=0 37 | - libcurand=10.3.4.101=0 38 | - libcusolver=11.4.0.1=0 39 | - libcusparse=11.7.4.91=0 40 | - libffi=3.3=he6710b0_2 41 | - libgcc-ng=9.1.0=hdf63c60_0 42 | - libiconv=1.16=h7f8727e_2 43 | - libidn2=2.3.2=h7f8727e_0 44 | - libnpp=11.7.4.75=0 45 | - libnvjpeg=11.8.0.2=0 46 | - libpng=1.6.37=hbc83047_0 47 | - libstdcxx-ng=9.1.0=hdf63c60_0 48 | - libtasn1=4.16.0=h27cfd23_0 49 | - libtiff=4.2.0=h2818925_1 50 | - libunistring=0.9.10=h27cfd23_0 51 | - libwebp=1.2.2=h55f646e_0 52 | - libwebp-base=1.2.2=h7f8727e_0 53 | - lz4-c=1.9.3=h295c915_1 54 | - mkl=2021.4.0=h06a4308_640 55 | - mkl-service=2.4.0=py38h7f8727e_0 56 | - mkl_fft=1.3.1=py38hd3c417c_0 57 | - mkl_random=1.2.2=py38h51133e4_0 58 | - ncurses=6.3=h7f8727e_2 59 | - nettle=3.7.3=hbbd107a_1 60 | - numpy-base=1.22.3=py38hf524024_0 61 | - openh264=2.1.1=h4ff587b_0 62 | - openssl=1.1.1w=h7f8727e_0 63 | - 
pip=23.3.1=py38h06a4308_0 64 | - pycparser=2.21=pyhd3eb1b0_0 65 | - pyopenssl=23.2.0=py38h06a4308_0 66 | - pysocks=1.7.1=py38h06a4308_0 67 | - python=3.8.13=h12debd9_0 68 | - pytorch=1.13.1=py3.8_cuda11.7_cudnn8.5.0_0 69 | - pytorch-cuda=11.7=h778d358_5 70 | - pytorch-mutex=1.0=cuda 71 | - readline=8.1.2=h7f8727e_1 72 | - requests=2.31.0=py38h06a4308_0 73 | - setuptools=68.0.0=py38h06a4308_0 74 | - six=1.16.0=pyhd3eb1b0_1 75 | - sqlite=3.38.5=hc218d9a_0 76 | - tk=8.6.12=h1ccaba5_0 77 | - torchaudio=0.13.1=py38_cu117 78 | - torchvision=0.14.1=py38_cu117 79 | - typing_extensions=4.7.1=py38h06a4308_0 80 | - werkzeug=2.2.3=py38h06a4308_0 81 | - wheel=0.41.2=py38h06a4308_0 82 | - xz=5.2.5=h7f8727e_1 83 | - zlib=1.2.12=h7f8727e_2 84 | - zstd=1.5.2=ha4553b6_0 85 | - pip: 86 | - aiofiles==23.2.1 87 | - altair==5.1.2 88 | - annotated-types==0.6.0 89 | - anyio==3.7.1 90 | - attrs==23.1.0 91 | - audioldm==0.1.1 92 | - audiomentations==0.34.1 93 | - audioread==3.0.1 94 | - cffi==1.16.0 95 | - chardet==5.2.0 96 | - charset-normalizer==3.3.2 97 | - colorama==0.4.6 98 | - contourpy==1.1.1 99 | - cycler==0.12.1 100 | - einops==0.7.0 101 | - exceptiongroup==1.2.0 102 | - fastapi==0.104.1 103 | - ffmpy==0.3.1 104 | - filelock==3.13.1 105 | - fonttools==4.45.1 106 | - fsspec==2023.10.0 107 | - ftfy==6.1.3 108 | - future==0.18.3 109 | - gradio==4.7.1 110 | - gradio-client==0.7.0 111 | - h11==0.14.0 112 | - httpcore==1.0.2 113 | - httpx==0.25.2 114 | - huggingface-hub==0.19.4 115 | - idna==3.6 116 | - importlib-metadata==6.8.0 117 | - importlib-resources==6.1.1 118 | - joblib==1.3.2 119 | - jsonschema==4.20.0 120 | - jsonschema-specifications==2023.11.1 121 | - kiwisolver==1.4.5 122 | - librosa==0.9.2 123 | - llvmlite==0.41.1 124 | - markdown-it-py==3.0.0 125 | - markupsafe==2.1.3 126 | - matplotlib==3.7.4 127 | - mdurl==0.1.2 128 | - mpmath==1.3.0 129 | - networkx==3.1 130 | - numba==0.58.1 131 | - numpy==1.23.5 132 | - nvidia-cublas-cu12==12.1.3.1 133 | - nvidia-cuda-cupti-cu12==12.1.105 134 | - nvidia-cuda-nvrtc-cu12==12.1.105 135 | - nvidia-cuda-runtime-cu12==12.1.105 136 | - nvidia-cudnn-cu12==8.9.2.26 137 | - nvidia-cufft-cu12==11.0.2.54 138 | - nvidia-curand-cu12==10.3.2.106 139 | - nvidia-cusolver-cu12==11.4.5.107 140 | - nvidia-cusparse-cu12==12.1.0.106 141 | - nvidia-nccl-cu12==2.18.1 142 | - nvidia-nvjitlink-cu12==12.3.101 143 | - nvidia-nvtx-cu12==12.1.105 144 | - orjson==3.9.10 145 | - packaging==23.2 146 | - pandas==2.0.3 147 | - pillow==10.1.0 148 | - pkgutil-resolve-name==1.3.10 149 | - platformdirs==4.0.0 150 | - pooch==1.8.0 151 | - progressbar==2.5 152 | - pydantic==2.5.2 153 | - pydantic-core==2.14.5 154 | - pydub==0.25.1 155 | - pygments==2.17.2 156 | - pyloudnorm==0.1.1 157 | - pyparsing==3.1.1 158 | - python-multipart==0.0.6 159 | - pytz==2023.3.post1 160 | - pyyaml==6.0.1 161 | - referencing==0.31.0 162 | - regex==2023.10.3 163 | - retrying==1.3.4 164 | - rich==13.7.0 165 | - rpds-py==0.13.1 166 | - safetensors==0.4.1 167 | - scikit-learn==1.3.2 168 | - scipy==1.10.1 169 | - semantic-version==2.10.0 170 | - shellingham==1.5.4 171 | - sniffio==1.3.0 172 | - soundfile==0.12.1 173 | - soxr==0.3.7 174 | - starlette==0.27.0 175 | - sympy==1.12 176 | - threadpoolctl==3.2.0 177 | - tokenizers==0.13.3 178 | - tomlkit==0.12.0 179 | - toolz==0.12.0 180 | - torchlibrosa==0.0.9 181 | - tqdm==4.66.1 182 | - transformers==4.29.0 183 | - triton==2.1.0 184 | - typer==0.9.0 185 | - typing-extensions==4.8.0 186 | - tzdata==2023.3 187 | - urllib3==2.1.0 188 | - uvicorn==0.24.0.post1 189 | - 
wcwidth==0.2.12 190 | - websockets==11.0.3 191 | - zipp==3.17.0 192 | prefix: /homes/jl009/.conda/envs/AudioInpainting 193 | 194 | -------------------------------------------------------------------------------- /venvs/audiosr.yml: -------------------------------------------------------------------------------- 1 | name: AudioSR 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - blas=1.0=mkl 9 | - brotlipy=0.7.0=py39h27cfd23_1003 10 | - bzip2=1.0.8=h7b6447c_0 11 | - ca-certificates=2023.08.22=h06a4308_0 12 | - certifi=2023.7.22=py39h06a4308_0 13 | - cryptography=41.0.3=py39h130f0dd_0 14 | - cuda-cudart=11.8.89=0 15 | - cuda-cupti=11.8.87=0 16 | - cuda-libraries=11.8.0=0 17 | - cuda-nvrtc=11.8.89=0 18 | - cuda-nvtx=11.8.86=0 19 | - cuda-runtime=11.8.0=0 20 | - ffmpeg=4.3=hf484d3e_0 21 | - freetype=2.11.0=h70c0345_0 22 | - giflib=5.2.1=h7b6447c_0 23 | - gmp=6.2.1=h295c915_3 24 | - gmpy2=2.1.2=py39heeb90bb_0 25 | - gnutls=3.6.15=he1e5248_0 26 | - idna=3.4=py39h06a4308_0 27 | - intel-openmp=2021.4.0=h06a4308_3561 28 | - jinja2=3.1.2=py39h06a4308_0 29 | - jpeg=9e=h7f8727e_0 30 | - lame=3.100=h7b6447c_0 31 | - lcms2=2.12=h3be6417_0 32 | - ld_impl_linux-64=2.38=h1181459_1 33 | - libcublas=11.11.3.6=0 34 | - libcufft=10.9.0.58=0 35 | - libcufile=1.8.0.34=0 36 | - libcurand=10.3.4.52=0 37 | - libcusolver=11.4.1.48=0 38 | - libcusparse=11.7.5.86=0 39 | - libffi=3.3=he6710b0_2 40 | - libgcc-ng=9.1.0=hdf63c60_0 41 | - libiconv=1.16=h7f8727e_2 42 | - libidn2=2.3.2=h7f8727e_0 43 | - libnpp=11.8.0.86=0 44 | - libnvjpeg=11.9.0.86=0 45 | - libpng=1.6.37=hbc83047_0 46 | - libstdcxx-ng=9.1.0=hdf63c60_0 47 | - libtasn1=4.16.0=h27cfd23_0 48 | - libtiff=4.2.0=h2818925_1 49 | - libunistring=0.9.10=h27cfd23_0 50 | - libwebp=1.2.2=h55f646e_0 51 | - libwebp-base=1.2.2=h7f8727e_0 52 | - lz4-c=1.9.3=h295c915_1 53 | - mkl=2021.4.0=h06a4308_640 54 | - mkl-service=2.4.0=py39h7f8727e_0 55 | - mkl_fft=1.3.1=py39hd3c417c_0 56 | - mkl_random=1.2.2=py39h51133e4_0 57 | - mpc=1.1.0=h10f8cd9_1 58 | - mpfr=4.0.2=hb69a4c5_1 59 | - mpmath=1.3.0=py39h06a4308_0 60 | - ncurses=6.3=h7f8727e_2 61 | - nettle=3.7.3=hbbd107a_1 62 | - openh264=2.1.1=h4ff587b_0 63 | - openssl=1.1.1w=h7f8727e_0 64 | - pip=23.3=py39h06a4308_0 65 | - pycparser=2.21=pyhd3eb1b0_0 66 | - pyopenssl=23.2.0=py39h06a4308_0 67 | - pysocks=1.7.1=py39h06a4308_0 68 | - python=3.9.12=h12debd9_1 69 | - pytorch=2.0.1=py3.9_cuda11.8_cudnn8.7.0_0 70 | - pytorch-cuda=11.8=h7e8668a_5 71 | - pytorch-mutex=1.0=cuda 72 | - readline=8.1.2=h7f8727e_1 73 | - requests=2.31.0=py39h06a4308_0 74 | - setuptools=68.0.0=py39h06a4308_0 75 | - six=1.16.0=pyhd3eb1b0_1 76 | - sqlite=3.38.5=hc218d9a_0 77 | - tk=8.6.12=h1ccaba5_0 78 | - torchtriton=2.0.0=py39 79 | - wheel=0.41.2=py39h06a4308_0 80 | - xz=5.2.5=h7f8727e_1 81 | - zlib=1.2.12=h7f8727e_2 82 | - zstd=1.5.2=ha4553b6_0 83 | - pip: 84 | - aiofiles==23.2.1 85 | - altair==5.1.2 86 | - annotated-types==0.6.0 87 | - anyio==3.7.1 88 | - attrs==23.1.0 89 | - audiomentations==0.34.1 90 | - audioread==3.0.1 91 | - audiosr==0.0.6 92 | - babel==2.13.1 93 | - blinker==1.7.0 94 | - cffi==1.16.0 95 | - chardet==5.2.0 96 | - charset-normalizer==3.3.2 97 | - click==8.1.7 98 | - clldutils==3.20.0 99 | - cmake==3.27.7 100 | - cog==0.8.6 101 | - colorama==0.4.6 102 | - colorlog==6.7.0 103 | - contourpy==1.2.0 104 | - csvw==3.1.3 105 | - cycler==0.12.1 106 | - decorator==5.1.1 107 | - dlinfo==1.2.1 108 | - einops==0.7.0 109 | - exceptiongroup==1.1.3 110 | - fastapi==0.104.1 111 | - ffmpy==0.3.1 112 | 
- filelock==3.13.1 113 | - flask==3.0.0 114 | - fonttools==4.44.0 115 | - fsspec==2023.10.0 116 | - ftfy==6.1.1 117 | - future==0.18.3 118 | - gradio==4.2.0 119 | - gradio-client==0.7.0 120 | - h11==0.14.0 121 | - httpcore==1.0.2 122 | - httptools==0.6.1 123 | - httpx==0.25.1 124 | - huggingface-hub==0.19.0 125 | - importlib-metadata==6.8.0 126 | - importlib-resources==6.1.1 127 | - isodate==0.6.1 128 | - itsdangerous==2.1.2 129 | - joblib==1.3.2 130 | - jsonschema==4.19.2 131 | - jsonschema-specifications==2023.7.1 132 | - kiwisolver==1.4.5 133 | - language-tags==1.2.0 134 | - librosa==0.9.2 135 | - lit==17.0.4 136 | - llvmlite==0.41.1 137 | - lxml==4.9.3 138 | - markdown==3.5.1 139 | - markdown-it-py==3.0.0 140 | - markupsafe==2.1.3 141 | - matplotlib==3.8.1 142 | - mdurl==0.1.2 143 | - networkx==3.2.1 144 | - numba==0.58.1 145 | - numpy==1.23.5 146 | - nvidia-cublas-cu11==11.10.3.66 147 | - nvidia-cublas-cu12==12.1.3.1 148 | - nvidia-cuda-cupti-cu11==11.7.101 149 | - nvidia-cuda-cupti-cu12==12.1.105 150 | - nvidia-cuda-nvrtc-cu11==11.7.99 151 | - nvidia-cuda-nvrtc-cu12==12.1.105 152 | - nvidia-cuda-runtime-cu11==11.7.99 153 | - nvidia-cuda-runtime-cu12==12.1.105 154 | - nvidia-cudnn-cu11==8.5.0.96 155 | - nvidia-cudnn-cu12==8.9.2.26 156 | - nvidia-cufft-cu11==10.9.0.58 157 | - nvidia-cufft-cu12==11.0.2.54 158 | - nvidia-curand-cu11==10.2.10.91 159 | - nvidia-curand-cu12==10.3.2.106 160 | - nvidia-cusolver-cu11==11.4.0.1 161 | - nvidia-cusolver-cu12==11.4.5.107 162 | - nvidia-cusparse-cu11==11.7.4.91 163 | - nvidia-cusparse-cu12==12.1.0.106 164 | - nvidia-nccl-cu11==2.14.3 165 | - nvidia-nccl-cu12==2.18.1 166 | - nvidia-nvjitlink-cu12==12.3.52 167 | - nvidia-nvtx-cu11==11.7.91 168 | - nvidia-nvtx-cu12==12.1.105 169 | - orjson==3.9.10 170 | - packaging==23.2 171 | - pandas==2.1.3 172 | - phonemizer==3.2.1 173 | - pillow==10.1.0 174 | - platformdirs==4.0.0 175 | - pooch==1.8.0 176 | - progressbar==2.5 177 | - pydantic==2.5.0 178 | - pydantic-core==2.14.1 179 | - pydub==0.25.1 180 | - pygments==2.16.1 181 | - pylatexenc==2.10 182 | - pyloudnorm==0.1.1 183 | - pyparsing==3.1.1 184 | - python-dateutil==2.8.2 185 | - python-dotenv==1.0.0 186 | - python-multipart==0.0.6 187 | - pytz==2023.3.post1 188 | - pyyaml==6.0.1 189 | - rdflib==7.0.0 190 | - referencing==0.30.2 191 | - regex==2023.10.3 192 | - resampy==0.4.2 193 | - retrying==1.3.4 194 | - rfc3986==1.5.0 195 | - rich==13.6.0 196 | - rpds-py==0.12.0 197 | - safetensors==0.4.0 198 | - scikit-learn==1.3.2 199 | - scipy==1.11.3 200 | - segments==2.2.1 201 | - semantic-version==2.10.0 202 | - shellingham==1.5.4 203 | - sniffio==1.3.0 204 | - soundfile==0.12.1 205 | - soxr==0.3.7 206 | - starlette==0.27.0 207 | - structlog==23.2.0 208 | - sympy==1.12 209 | - tabulate==0.9.0 210 | - threadpoolctl==3.2.0 211 | - timm==0.9.10 212 | - tokenizers==0.13.3 213 | - tomlkit==0.12.0 214 | - toolz==0.12.0 215 | - torch==2.0.1 216 | - torchaudio==2.1.0 217 | - torchlibrosa==0.1.0 218 | - torchvision==0.16.0 219 | - tqdm==4.66.1 220 | - transformers==4.30.2 221 | - triton==2.0.0 222 | - typer==0.9.0 223 | - typing-extensions==4.8.0 224 | - tzdata==2023.3 225 | - unidecode==1.3.7 226 | - uritemplate==4.1.1 227 | - urllib3==2.0.7 228 | - uvicorn==0.24.0.post1 229 | - uvloop==0.19.0 230 | - watchfiles==0.21.0 231 | - wcwidth==0.2.9 232 | - websockets==11.0.3 233 | - werkzeug==3.0.1 234 | - zipp==3.17.0 235 | prefix: /homes/jl009/.conda/envs/AudioSR 236 | 237 | -------------------------------------------------------------------------------- /venvs/wavcraft.yml: 
-------------------------------------------------------------------------------- 1 | name: WavCraft 2 | channels: 3 | - nvidia/label/cuda-11.8.0 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=2_gnu 9 | - bzip2=1.0.8=h7f98852_4 10 | - ca-certificates=2023.05.30=h06a4308_0 11 | - cuda-cccl=11.8.89=0 12 | - cuda-command-line-tools=11.8.0=0 13 | - cuda-compiler=11.8.0=0 14 | - cuda-cudart=11.8.89=0 15 | - cuda-cudart-dev=11.8.89=0 16 | - cuda-cuobjdump=11.8.86=0 17 | - cuda-cupti=11.8.87=0 18 | - cuda-cuxxfilt=11.8.86=0 19 | - cuda-documentation=11.8.86=0 20 | - cuda-driver-dev=11.8.89=0 21 | - cuda-gdb=11.8.86=0 22 | - cuda-libraries=11.8.0=0 23 | - cuda-libraries-dev=11.8.0=0 24 | - cuda-memcheck=11.8.86=0 25 | - cuda-nsight=11.8.86=0 26 | - cuda-nsight-compute=11.8.0=0 27 | - cuda-nvcc=11.8.89=0 28 | - cuda-nvdisasm=11.8.86=0 29 | - cuda-nvml-dev=11.8.86=0 30 | - cuda-nvprof=11.8.87=0 31 | - cuda-nvprune=11.8.86=0 32 | - cuda-nvrtc=11.8.89=0 33 | - cuda-nvrtc-dev=11.8.89=0 34 | - cuda-nvtx=11.8.86=0 35 | - cuda-nvvp=11.8.87=0 36 | - cuda-profiler-api=11.8.86=0 37 | - cuda-sanitizer-api=11.8.86=0 38 | - cuda-toolkit=11.8.0=0 39 | - cuda-tools=11.8.0=0 40 | - cuda-visual-tools=11.8.0=0 41 | - gds-tools=1.4.0.31=0 42 | - ld_impl_linux-64=2.40=h41732ed_0 43 | - libcublas=11.11.3.6=0 44 | - libcublas-dev=11.11.3.6=0 45 | - libcufft=10.9.0.58=0 46 | - libcufft-dev=10.9.0.58=0 47 | - libcufile=1.4.0.31=0 48 | - libcufile-dev=1.4.0.31=0 49 | - libcurand=10.3.0.86=0 50 | - libcurand-dev=10.3.0.86=0 51 | - libcusolver=11.4.1.48=0 52 | - libcusolver-dev=11.4.1.48=0 53 | - libcusparse=11.7.5.86=0 54 | - libcusparse-dev=11.7.5.86=0 55 | - libffi=3.4.2=h7f98852_5 56 | - libgcc-ng=13.1.0=he5830b7_0 57 | - libgomp=13.1.0=he5830b7_0 58 | - libnpp=11.8.0.86=0 59 | - libnpp-dev=11.8.0.86=0 60 | - libnsl=2.0.0=h7f98852_0 61 | - libnvjpeg=11.9.0.86=0 62 | - libnvjpeg-dev=11.9.0.86=0 63 | - libsqlite=3.42.0=h2797004_0 64 | - libuuid=2.38.1=h0b41bf4_0 65 | - libzlib=1.2.13=hd590300_5 66 | - ncurses=6.4=hcb278e6_0 67 | - nsight-compute=2022.3.0.22=0 68 | - openssl=3.1.1=hd590300_1 69 | - pip=23.1.2=pyhd8ed1ab_0 70 | - python=3.8.17=he550d4f_0_cpython 71 | - readline=8.2=h8228510_1 72 | - setuptools=68.0.0=pyhd8ed1ab_0 73 | - tk=8.6.12=h27826a3_0 74 | - wheel=0.40.0=pyhd8ed1ab_0 75 | - xz=5.2.6=h166bdaf_0 76 | - pip: 77 | - aiofiles==23.1.0 78 | - aiohttp==3.8.4 79 | - aiosignal==1.3.1 80 | - altair==5.0.1 81 | - antlr4-python3-runtime==4.9.3 82 | - anyio==3.7.1 83 | - async-timeout==4.0.2 84 | - attrs==23.1.0 85 | - audiomentations==0.34.1 86 | - audioread==3.0.0 87 | - av==10.0.0 88 | - blinker==1.6.2 89 | - blis==0.7.9 90 | - braceexpand==0.1.7 91 | - catalogue==2.0.8 92 | - certifi==2023.5.7 93 | - cffi==1.15.1 94 | - charset-normalizer==3.2.0 95 | - click==8.1.5 96 | - cloudpickle==2.2.1 97 | - cmake==3.26.4 98 | - colorlog==6.7.0 99 | - confection==0.1.0 100 | - contourpy==1.1.0 101 | - cycler==0.12.1 102 | - cymem==2.0.7 103 | - cython==0.29.36 104 | - demucs==4.0.0 105 | - diffq==0.2.4 106 | - distro==1.8.0 107 | - docopt==0.6.2 108 | - dora-search==0.1.12 109 | - einops==0.6.1 110 | - encodec==0.1.1 111 | - exceptiongroup==1.1.2 112 | - fastapi==0.100.0 113 | - ffmpy==0.3.0 114 | - filelock==3.12.2 115 | - flashy==0.0.2 116 | - flask==2.3.2 117 | - fonttools==4.41.0 118 | - frozenlist==1.4.0 119 | - fsspec==2023.6.0 120 | - ftfy==6.1.1 121 | - future==0.18.3 122 | - gradio==3.36.1 123 | - gradio-client==0.7.0 124 | - h11==0.14.0 125 | - 
h5py==3.10.0 126 | - httpcore==0.17.3 127 | - httpx==0.24.1 128 | - huggingface-hub==0.16.4 129 | - hydra-colorlog==1.2.0 130 | - hydra-core==1.3.2 131 | - idna==3.4 132 | - importlib-metadata==6.8.0 133 | - importlib-resources==6.0.0 134 | - itsdangerous==2.1.2 135 | - joblib==1.3.1 136 | - jsonschema==4.18.3 137 | - jsonschema-specifications==2023.6.1 138 | - julius==0.2.7 139 | - kiwisolver==1.4.4 140 | - lameenc==1.5.1 141 | - langcodes==3.3.0 142 | - lazy-loader==0.3 143 | - librosa==0.10.0.post2 144 | - lightning==2.1.1 145 | - lightning-utilities==0.9.0 146 | - linkify-it-py==2.0.2 147 | - lit==16.0.6 148 | - llvmlite==0.40.1 149 | - markdown-it-py==2.2.0 150 | - markupsafe==2.1.3 151 | - matplotlib==3.7.2 152 | - mdit-py-plugins==0.3.3 153 | - mdurl==0.1.2 154 | - mpmath==1.3.0 155 | - msgpack==1.0.5 156 | - multidict==6.0.4 157 | - murmurhash==1.0.9 158 | - mypy-extensions==1.0.0 159 | - networkx==3.1 160 | - num2words==0.5.12 161 | - numba==0.57.1 162 | - numpy==1.23.0 163 | - nvidia-cublas-cu11==11.10.3.66 164 | - nvidia-cuda-cupti-cu11==11.7.101 165 | - nvidia-cuda-nvrtc-cu11==11.7.99 166 | - nvidia-cuda-runtime-cu11==11.7.99 167 | - nvidia-cudnn-cu11==8.5.0.96 168 | - nvidia-cufft-cu11==10.9.0.58 169 | - nvidia-curand-cu11==10.2.10.91 170 | - nvidia-cusolver-cu11==11.4.0.1 171 | - nvidia-cusparse-cu11==11.7.4.91 172 | - nvidia-nccl-cu11==2.14.3 173 | - nvidia-nvtx-cu11==11.7.91 174 | - omegaconf==2.3.0 175 | - openai==0.28.0 176 | - openunmix==1.2.1 177 | - orjson==3.9.2 178 | - packaging==23.1 179 | - pandas==2.0.3 180 | - pathy==0.10.2 181 | - pillow==10.0.0 182 | - pkgutil-resolve-name==1.3.10 183 | - pooch==1.6.0 184 | - preshed==3.0.8 185 | - protobuf==4.25.0 186 | - pybind11==2.11.1 187 | - pydantic==1.10.11 188 | - pydub==0.25.1 189 | - pygments==2.15.1 190 | - pyloudnorm==0.1.1 191 | - pyparsing==3.0.9 192 | - pyre-extensions==0.0.29 193 | - pyroomacoustics==0.6.0 194 | - python-multipart==0.0.6 195 | - pytorch-lightning==2.1.1 196 | - pytz==2023.3 197 | - pyyaml==6.0.1 198 | - referencing==0.29.1 199 | - regex==2023.6.3 200 | - requests==2.31.0 201 | - retrying==1.3.4 202 | - rpds-py==0.8.10 203 | - safetensors==0.3.1 204 | - scikit-learn==1.3.0 205 | - scipy==1.10.1 206 | - semantic-version==2.10.0 207 | - sentencepiece==0.1.99 208 | - smart-open==6.3.0 209 | - sniffio==1.3.0 210 | - soundfile==0.12.1 211 | - soxr==0.3.5 212 | - spacy==3.5.2 213 | - spacy-legacy==3.0.12 214 | - spacy-loggers==1.0.4 215 | - srsly==2.4.6 216 | - starlette==0.27.0 217 | - submitit==1.4.5 218 | - sympy==1.12 219 | - thinc==8.1.10 220 | - threadpoolctl==3.2.0 221 | - tokenizers==0.13.3 222 | - toolz==0.12.0 223 | - torch==2.0.1 224 | - torchaudio==2.0.2 225 | - torchlibrosa==0.1.0 226 | - torchmetrics==1.0.1 227 | - torchvision==0.15.2 228 | - tqdm==4.65.0 229 | - transformers==4.29.0 230 | - treetable==0.2.5 231 | - triton==2.0.0 232 | - typer==0.7.0 233 | - typing-extensions==4.7.1 234 | - typing-inspect==0.9.0 235 | - tzdata==2023.3 236 | - uc-micro-py==1.0.2 237 | - urllib3==2.0.3 238 | - uvicorn==0.22.0 239 | - wasabi==1.1.2 240 | - wcwidth==0.2.9 241 | - webdataset==0.2.75 242 | - websockets==11.0.3 243 | - werkzeug==2.3.6 244 | - wget==3.2 245 | - xformers==0.0.20 246 | - yarl==1.9.2 247 | - zipp==3.16.2 248 | prefix: /homes/jl009/.conda/envs/AudioEditor 249 | 250 | -------------------------------------------------------------------------------- /wavcraft/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/__init__.py -------------------------------------------------------------------------------- /wavcraft/apis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torchaudio 3 | import requests 4 | import math 5 | import numpy as np 6 | import soundfile as sf 7 | import pyloudnorm as pyln 8 | from scipy.io.wavfile import write 9 | from retrying import retry 10 | from gradio_client import Client 11 | from audiomentations import AddGaussianSNR, LowPassFilter, HighPassFilter, ApplyImpulseResponse, RoomSimulator 12 | 13 | from wavcraft.utils import get_service_port, get_service_url, get_path_from_target_dir, generate_random_series 14 | 15 | 16 | os.environ['OPENBLAS_NUM_THREADS'] = '1' 17 | SAMPLE_RATE = 16000 # 32000 is NOT supported by wavmark 18 | 19 | localhost_addr = get_service_url() 20 | 21 | 22 | def _LOUDNESS_NORM(wav, volume=-25, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 23 | """ 24 | Normalize the waveform and adjust its loudness as per BS.1770. 25 | """ 26 | # peak normalize wav to -10 dB 27 | peak_normalized_wav = pyln.normalize.peak(wav, -10.0) 28 | # measure the loudness first 29 | meter = pyln.Meter(sr) # create BS.1770 meter 30 | loudness = meter.integrated_loudness(peak_normalized_wav) 31 | # loudness normalize wav to the target LUFS given by `volume` 32 | normalized_wav = pyln.normalize.loudness(peak_normalized_wav, loudness, volume) 33 | 34 | return normalized_wav 35 | 36 | 37 | def _READ_AUDIO_NUMPY(wav, sr=SAMPLE_RATE): 38 | """ 39 | Read audio as a numpy array 40 | Returns: 41 | np.array [samples] 42 | """ 43 | waveform, sample_rate = torchaudio.load(wav) 44 | 45 | if sample_rate != sr: 46 | waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=sr) 47 | 48 | wav_numpy = waveform[0].numpy() 49 | 50 | return wav_numpy 51 | 52 | 53 | def _WRITE_AUDIO(wav, name=None, sr=SAMPLE_RATE): 54 | """ 55 | Write audio numpy to .wav file 56 | Params: 57 | wav: np.array [samples] 58 | """ 59 | if name is None: 60 | name = 'output.wav' 61 | 62 | if len(wav.shape) > 1: 63 | wav = wav[0] 64 | 65 | # declipping 66 | max_value = np.max(np.abs(wav)) if wav.size > 0 else 0 67 | if max_value > 1: 68 | wav *= 0.9 / (max_value + 1e-5) 69 | 70 | # write audio 71 | write(name, sr, np.round(wav*32767).astype(np.int16)) 72 | 73 | 74 | def LEN(wav, sr=SAMPLE_RATE): 75 | """ 76 | Returns the duration of audio in seconds. 77 | """ 78 | wav = _READ_AUDIO_NUMPY(wav) 79 | 80 | return len(wav) / sr 81 | 82 | 83 | # def OUTPUT(wav, out_wav="output.wav"): 84 | # output_wav = get_path_from_target_dir(out_wav, wav) 85 | # os.rename(wav, output_wav) 86 | # print(f'Done all processes, result: {output_wav}') 87 | # return output_wav 88 | 89 | 90 | def OUTPUT(wav, out_wav="output.wav", sr=SAMPLE_RATE): 91 | # Add watermark to the generated audio 92 | _tmp_wav = _ENCODE_WATERMARK(wav, sample_rate=sr) 93 | 94 | output_wav = get_path_from_target_dir(out_wav, _tmp_wav) 95 | os.rename(_tmp_wav, output_wav) 96 | print(f'Done all processes, result: {output_wav}') 97 | return output_wav 98 | 99 | 100 | """ DSP modules """ 101 | def SPLIT(wav_path, break_points=[], out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 102 | """ 103 | Split audio into several pieces according to the breakpoints. 104 | Params: 105 | break_points: list[float]: a list of breakpoints (in seconds) 106 | Returns: 107 | Path to output wav file.
108 | """ 109 | # Avoid `break_points` containing 0 110 | break_points = [p for p in break_points if p != 0] 111 | num_pieces = len(break_points) + 1 112 | 113 | prefix = out_wav.split(".")[0] 114 | 115 | wav = _READ_AUDIO_NUMPY(wav_path) 116 | 117 | results = [] 118 | for i in range(num_pieces): 119 | onset = break_points[i - 1] * sr if i > 0 else 0 120 | offset = break_points[i] * sr if i < len(break_points) else len(wav) 121 | 122 | _o_wav = get_path_from_target_dir(prefix+f"_{i}.wav", wav_path) 123 | _WRITE_AUDIO(wav[int(onset):int(offset)], name=_o_wav) 124 | results.append(_o_wav) 125 | 126 | return results 127 | 128 | 129 | def MIX(wavs=[['1.wav', 0.], ['2.wav', 0.]], out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 130 | """ 131 | Mix multiple audio clips by considering their onset time. 132 | Returns: 133 | Path to output wav file. 134 | """ 135 | max_length = max([int(wav[1]*sr + len(_READ_AUDIO_NUMPY(wav[0]))) for wav in wavs]) 136 | template_wav = np.zeros(max_length) 137 | 138 | for wav in wavs: 139 | cur_name, cur_onset = wav 140 | cur_wav = _READ_AUDIO_NUMPY(cur_name) 141 | cur_len = len(cur_wav) 142 | cur_onset = int(cur_onset * sr) 143 | 144 | # mix 145 | template_wav[cur_onset:cur_onset+cur_len] += cur_wav 146 | 147 | out_wav = get_path_from_target_dir(out_wav, wavs[0][0]) 148 | _WRITE_AUDIO(template_wav, name=out_wav) 149 | return out_wav 150 | 151 | 152 | def CAT(wavs, out_wav=generate_random_series()+'.wav'): 153 | """ 154 | Concat multiple audio clips together. 155 | Params: 156 | wavs: List of wav file ['1.wav', '2.wav', ...] 157 | """ 158 | wav_num = len(wavs) 159 | 160 | segment0 = _READ_AUDIO_NUMPY(wavs[0]) 161 | 162 | cat_wav = segment0 163 | 164 | if wav_num > 1: 165 | for i in range(1, wav_num): 166 | next_wav = _READ_AUDIO_NUMPY(wavs[i]) 167 | cat_wav = np.concatenate((cat_wav, next_wav), axis=-1) 168 | 169 | out_wav = get_path_from_target_dir(out_wav, wavs[0]) 170 | _WRITE_AUDIO(cat_wav, name=out_wav) 171 | return out_wav 172 | 173 | 174 | def ADJUST_VOL(wav_path, volume, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 175 | """ 176 | Adjust the volume of waveform by `volume`. 177 | """ 178 | wav, sample_rate = torchaudio.load(wav_path) 179 | 180 | if sample_rate != sr: 181 | wav = torchaudio.functional.resample(wav, orig_freq=sample_rate, new_freq=sr) 182 | 183 | adj_vol_fn = torchaudio.transforms.Vol(gain=volume, gain_type="db") 184 | wav = adj_vol_fn(wav) 185 | 186 | # write audio 187 | wav = wav[0].numpy() # convert to numpy 188 | out_wav = get_path_from_target_dir(out_wav, wav_path) 189 | write(out_wav, sr, np.round(wav*32767).astype(np.int16)) 190 | return out_wav 191 | 192 | 193 | # def INC_VOL(wav_path, volume, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 194 | # """ 195 | # Increase the volume of waveform by `volume`. 196 | # """ 197 | # wav = _READ_AUDIO_NUMPY(wav_path) 198 | # # measure the loudness first 199 | # meter = pyln.Meter(sr) # create BS.1770 meter 200 | # loudness = meter.integrated_loudness(wav) 201 | # # loudness normalize audio to the desired dB LUFS 202 | # volume += loudness 203 | # wav = pyln.normalize.loudness(wav, loudness, volume) 204 | 205 | # # write audio 206 | # out_wav = get_path_from_target_dir(out_wav, wav_path) 207 | # write(out_wav, sr, np.round(wav*32767).astype(np.int16)) 208 | # return out_wav 209 | 210 | 211 | # def DEC_VOL(wav_path, volume, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 212 | # """ 213 | # Decrease the volume of waveform by `volume`.
214 | # """ 215 | # wav = _READ_AUDIO_NUMPY(wav_path) 216 | # # measure the loudness first 217 | # meter = pyln.Meter(sr) # create BS.1770 meter 218 | # loudness = meter.integrated_loudness(wav) 219 | # # loudness normalize audio to the desired dB LUFS 220 | # volume -= loudness 221 | # wav = pyln.normalize.loudness(wav, loudness, volume) 222 | 223 | # # write audio 224 | # out_wav = get_path_from_target_dir(out_wav, wav_path) 225 | # write(out_wav, sr, np.round(wav*32767).astype(np.int16)) 226 | # return out_wav 227 | 228 | 229 | def ADD_NOISE(wav_path, min_snr_db=5.0, max_snr_db=40.0, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 230 | wav = _READ_AUDIO_NUMPY(wav_path) 231 | transform = AddGaussianSNR( 232 | min_snr_db=min_snr_db, 233 | max_snr_db=max_snr_db, 234 | p=1.0 235 | ) 236 | 237 | augmented_sound = transform(wav, sample_rate=sr) 238 | 239 | # write audio 240 | out_wav = get_path_from_target_dir(out_wav, wav_path) 241 | _WRITE_AUDIO(augmented_sound, name=out_wav) 242 | return out_wav 243 | 244 | 245 | def LOW_PASS(wav_path, min_cutoff_freq=150.0, max_cutoff_freq=7500.0, min_rolloff=12, max_rolloff=24, zero_phase=False, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 246 | wav = _READ_AUDIO_NUMPY(wav_path) 247 | transform = LowPassFilter( 248 | min_cutoff_freq=min_cutoff_freq, 249 | max_cutoff_freq=max_cutoff_freq, 250 | min_rolloff=min_rolloff, 251 | max_rolloff=max_rolloff, 252 | zero_phase=zero_phase, 253 | p=1.0 254 | ) 255 | 256 | augmented_sound = transform(wav, sample_rate=sr) 257 | 258 | # write audio 259 | out_wav = get_path_from_target_dir(out_wav, wav_path) 260 | _WRITE_AUDIO(augmented_sound, name=out_wav) 261 | return out_wav 262 | 263 | 264 | def HIGH_PASS(wav_path, min_cutoff_freq=20, max_cutoff_freq=2400, min_rolloff=12, max_rolloff=24, zero_phase=False, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 265 | wav = _READ_AUDIO_NUMPY(wav_path) 266 | transform = HighPassFilter( 267 | min_cutoff_freq=min_cutoff_freq, 268 | max_cutoff_freq=max_cutoff_freq, 269 | min_rolloff=min_rolloff, 270 | max_rolloff=max_rolloff, 271 | zero_phase=zero_phase, 272 | p=1.0 273 | ) 274 | 275 | augmented_sound = transform(wav, sample_rate=sr) 276 | 277 | # write audio 278 | out_wav = get_path_from_target_dir(out_wav, wav_path) 279 | _WRITE_AUDIO(augmented_sound, name=out_wav) 280 | return out_wav 281 | 282 | 283 | def ADD_RIR(wav_path, ir_path=None, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 284 | wav = _READ_AUDIO_NUMPY(wav_path) 285 | 286 | transform = ApplyImpulseResponse(ir_path=ir_path, p=1.0) 287 | augmented_sound = transform(wav, sample_rate=sr) 288 | 289 | # write audio 290 | out_wav = get_path_from_target_dir(out_wav, wav_path) 291 | _WRITE_AUDIO(augmented_sound, name=out_wav) 292 | return out_wav 293 | 294 | 295 | def ROOM_SIMULATE(wav_path, min_size_x=3.6, max_size_x=5.6, 296 | min_size_y=3.6, max_size_y=3.9, 297 | min_size_z=2.4, max_size_z=3.0, 298 | min_absorption_value=0.075, max_absorption_value=0.4, 299 | min_source_x=0.1, max_source_x=3.5, 300 | min_source_y=0.1, max_source_y=2.7, 301 | min_source_z=1.0, max_source_z=2.1, 302 | min_mic_distance=0.15, max_mic_distance=0.35, 303 | min_mic_azimuth=-math.pi, max_mic_azimuth=math.pi, 304 | min_mic_elevation=-math.pi, max_mic_elevation=math.pi, 305 | out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 306 | wav = _READ_AUDIO_NUMPY(wav_path) 307 | 308 | transform = RoomSimulator( 309 | min_size_x=min_size_x, max_size_x=max_size_x, 310 | min_size_y=min_size_y, 
max_size_y=max_size_y, 311 | min_size_z=min_size_z, max_size_z=max_size_z, 312 | min_absorption_value=min_absorption_value, max_absorption_value=max_absorption_value, 313 | min_source_x=min_source_x, max_source_x=max_source_x, 314 | min_source_y=min_source_y, max_source_y=max_source_y, 315 | min_source_z=min_source_z, max_source_z=max_source_z, 316 | min_mic_distance=min_mic_distance, max_mic_distance=max_mic_distance, 317 | min_mic_azimuth=min_mic_azimuth, max_mic_azimuth=max_mic_azimuth, 318 | min_mic_elevation=min_mic_elevation, max_mic_elevation=max_mic_elevation, 319 | p=1.0) 320 | augmented_sound = transform(wav, sample_rate=sr) 321 | 322 | # write audio 323 | out_wav = get_path_from_target_dir(out_wav, wav_path) 324 | _WRITE_AUDIO(augmented_sound, name=out_wav) 325 | return out_wav 326 | 327 | 328 | # def CLIP(wav_path, offset, onset=0, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 329 | # """ 330 | # Clip the audio using onset and offset time. 331 | # Params: 332 | # onset/offset: onset/offset time in seconds. 333 | # Returns: 334 | # Path to output wav file. 335 | # """ 336 | # wav = _READ_AUDIO_NUMPY(wav_path) 337 | 338 | # # Get onset/offset with samples rates 339 | # onset *= SAMPLE_RATE 340 | # offset *= SAMPLE_RATE 341 | # assert 0 <= onset <= offset <= len(wav) 342 | 343 | # out_wav = get_path_from_target_dir(out_wav, wav_path) 344 | # _WRITE_AUDIO(wav[int(onset):int(offset)], name=out_wav) 345 | # return out_wav 346 | 347 | 348 | """ Deep-learning modules """ 349 | @retry(stop_max_attempt_number=5, wait_fixed=2000) 350 | def AU(wav_path, text="write an audio caption describing the sound"): 351 | HF_key = os.environ.get("HF_KEY") 352 | client = Client("https://yuangongfdu-ltu.hf.space/", hf_token=HF_key) 353 | response = client.predict( 354 | wav_path, 355 | text, # pass the caller-supplied prompt instead of re-hardcoding the default 356 | api_name="/predict", 357 | ) 358 | return response 359 | 360 | 361 | @retry(stop_max_attempt_number=5, wait_fixed=2000) 362 | def TTM(text, melody=None, length=10, volume=-28, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE): 363 | service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT") 364 | url = f'http://{localhost_addr}:{service_port}/generate_music' 365 | 366 | # Change the name if the file exists 367 | if os.path.exists(out_wav): 368 | out_wav = generate_random_series() + '.wav' 369 | 370 | data = { 371 | 'text': f'{text}', 372 | 'melody': melody, 373 | 'length': f'{length}', 374 | 'volume': f'{volume}', 375 | 'sample_rate': f'{sr}', 376 | 'output_wav': f'{out_wav}', 377 | } 378 | 379 | response = requests.post(url, json=data) 380 | 381 | if response.status_code == 200: 382 | print('Success:', response.json()['message']) 383 | return out_wav 384 | else: 385 | print('Error:', response.json()['API error']) 386 | raise RuntimeError(response.json()['API error']) 387 | 388 | 389 | @retry(stop_max_attempt_number=5, wait_fixed=2000) 390 | def TTA(text, length=5, volume=-35, out_wav=generate_random_series()+'.wav'): 391 | service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT") 392 | url = f'http://{localhost_addr}:{service_port}/generate_audio' 393 | 394 | # Change the name if the file exists 395 | if os.path.exists(out_wav): 396 | out_wav = generate_random_series() + '.wav' 397 | 398 | data = { 399 | 'text': f'{text}', 400 | 'length': f'{length}', 401 | 'volume': f'{volume}', 402 | 'output_wav': f'{out_wav}', 403 | } 404 | 405 | response = requests.post(url, json=data) 406 | if response.status_code == 200: 407 | print('Success:',
response.json()['message']) 408 | return out_wav 409 | else: 410 | print('Error:', response.json()['API error']) 411 | raise RuntimeError(response.json()['API error']) 412 | 413 | 414 | @retry(stop_max_attempt_number=5, wait_fixed=2000) 415 | def TTS(text, speaker="Male1_En", volume=-20, out_wav=generate_random_series()+'.wav'): 416 | service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT") 417 | url = f'http://{localhost_addr}:{service_port}/generate_speech' 418 | 419 | # Change the name if the file exists 420 | if os.path.exists(out_wav): 421 | out_wav = generate_random_series() + '.wav' 422 | 423 | data = { 424 | 'text': f'{text}', 425 | 'speaker_id': f'{speaker}', 426 | 'volume': f'{volume}', 427 | 'output_wav': f'{out_wav}', 428 | } 429 | 430 | response = requests.post(url, json=data) 431 | 432 | if response.status_code == 200: 433 | print('Success:', response.json()['message']) 434 | return out_wav 435 | else: 436 | print('Error:', response.json()['API error']) 437 | raise RuntimeError(response.json()['API error']) 438 | 439 | 440 | @retry(stop_max_attempt_number=5, wait_fixed=2000) 441 | def SR(wav_path, out_wav=generate_random_series()+'.wav', ddim_steps=50, guidance_scale=3.5, seed=42): 442 | service_port = get_service_port("AUDIOSR_SERVICE_PORT") 443 | url = f'http://{localhost_addr}:{service_port}/super_resolution' 444 | out_wav = get_path_from_target_dir(out_wav, wav_path) 445 | data = { 446 | 'wav_path': f'{wav_path}', 447 | 'ddim_steps': f'{ddim_steps}', 448 | 'guidance_scale': f'{guidance_scale}', 449 | 'seed': f'{seed}', 450 | 'output_wav':f'{out_wav}' 451 | } 452 | response = requests.post(url, json=data) 453 | 454 | if response.status_code == 200: 455 | print('Success:', response.json()['message']) 456 | return out_wav 457 | else: 458 | print('Error:', response.json()['API error']) 459 | raise RuntimeError(response.json()['API error']) 460 | 461 | 462 | # @retry(stop_max_attempt_number=5, wait_fixed=2000) 463 | # def VP(wav_path, out_dir): 464 | # url = f'http://{localhost_addr}:{service_port}/parse_voice' 465 | # data = { 466 | # 'wav_path': f'{wav_path}', 467 | # 'out_dir':f'{out_dir}' 468 | # } 469 | 470 | # response = requests.post(url, json=data) 471 | 472 | # if response.status_code == 200: 473 | # print('Success:', response.json()['message']) 474 | # else: 475 | # print('Error:', response.json()['API error']) 476 | # raise RuntimeError(response.json()['API error']) 477 | 478 | 479 | # @retry(stop_max_attempt_number=5, wait_fixed=2000) 480 | # def EXTRACT(wav_path, text, out_wav=generate_random_series()+'.wav'): 481 | # service_port = get_service_port("AUDIOSEP_SERVICE_PORT") 482 | # url = f'http://{localhost_addr}:{service_port}/source_separate' 483 | # out_wav = get_path_from_target_dir(out_wav, wav_path) 484 | # data = { 485 | # 'wav_path': f'{wav_path}', 486 | # 'text': f'{text}', 487 | # 'output_wav':f'{out_wav}' 488 | # } 489 | 490 | # response = requests.post(url, json=data) 491 | 492 | # if response.status_code == 200: 493 | # filedir, filename = os.path.split(out_wav) 494 | # fg_filepath = os.path.join(filedir, "fg_"+filename) 495 | # bg_filepath = os.path.join(filedir, "bg_"+filename) 496 | # os.rename(fg_filepath, out_wav) 497 | # os.remove(bg_filepath) 498 | # print('Success:', response.json()['message']) 499 | # return out_wav 500 | # else: 501 | # print('Error:', response.json()['API error']) 502 | # raise RuntimeError(response.json()['API error']) 503 | 504 | 505 | # @retry(stop_max_attempt_number=5, wait_fixed=2000) 506 | # def DROP(wav_path, text,
out_wav=generate_random_series()+'.wav'): 507 | # service_port = get_service_port("AUDIOSEP_SERVICE_PORT") 508 | # url = f'http://{localhost_addr}:{service_port}/source_separate' 509 | # out_wav = get_path_from_target_dir(out_wav, wav_path) 510 | # data = { 511 | # 'wav_path': f'{wav_path}', 512 | # 'text': f'{text}', 513 | # 'output_wav':f'{out_wav}' 514 | # } 515 | 516 | # response = requests.post(url, json=data) 517 | 518 | # if response.status_code == 200: 519 | # filedir, filename = os.path.split(out_wav) 520 | # fg_filepath = os.path.join(filedir, "fg_"+filename) 521 | # bg_filepath = os.path.join(filedir, "bg_"+filename) 522 | # os.rename(bg_filepath, out_wav) 523 | # os.remove(fg_filepath) 524 | # print('Success:', response.json()['message']) 525 | # return out_wav 526 | # else: 527 | # print('Error:', response.json()['API error']) 528 | # raise RuntimeError(response.json()['API error']) 529 | 530 | 531 | @retry(stop_max_attempt_number=5, wait_fixed=2000) 532 | def TSS(wav_path, text, out_wav=generate_random_series()+'.wav'): 533 | service_port = get_service_port("AUDIOSEP_SERVICE_PORT") 534 | url = f'http://{localhost_addr}:{service_port}/source_separate' 535 | out_wav = get_path_from_target_dir(out_wav, wav_path) 536 | data = { 537 | 'wav_path': f'{wav_path}', 538 | 'text': f'{text}', 539 | 'output_wav':f'{out_wav}' 540 | } 541 | 542 | response = requests.post(url, json=data) 543 | 544 | if response.status_code == 200: 545 | filedir, filename = os.path.split(out_wav) 546 | fg_filepath = os.path.join(filedir, "fg_"+filename) 547 | bg_filepath = os.path.join(filedir, "bg_"+filename) 548 | print('Success:', response.json()['message']) 549 | return fg_filepath, bg_filepath 550 | else: 551 | print('Error:', response.json()['API error']) 552 | raise RuntimeError(response.json()['API error']) 553 | 554 | 555 | @retry(stop_max_attempt_number=5, wait_fixed=2000) 556 | def INPAINT(wav_path, text, onset, offset, duration, guidance_scale=2.5, ddim_steps=200, random_seed=42, sample_rate=SAMPLE_RATE, out_wav=generate_random_series()+'.wav',): 557 | service_port = get_service_port("AUDIOLDM_SERVICE_PORT") 558 | url = f'http://{localhost_addr}:{service_port}/audio_inpaint' 559 | out_wav = get_path_from_target_dir(out_wav, wav_path) 560 | data = { 561 | 'wav_path': f'{wav_path}', 562 | 'text': f'{text}', 563 | 'onset': onset, 564 | 'offset': offset, 565 | 'duration': duration, 566 | 'output_wav':f'{out_wav}', 567 | # generation settings 568 | 'sample_rate': sample_rate, 569 | 'guidance_scale': guidance_scale, 570 | 'ddim_steps': ddim_steps, 571 | 'random_seed': random_seed, 572 | } 573 | 574 | response = requests.post(url, json=data) 575 | 576 | if response.status_code == 200: 577 | print('Success:', response.json()['message']) 578 | return out_wav 579 | else: 580 | print('Error:', response.json()['API error']) 581 | raise RuntimeError(response.json()['API error']) 582 | 583 | 584 | def _ENCODE_WATERMARK(wav_path, sample_rate=SAMPLE_RATE, out_wav=generate_random_series()+'.wav',): 585 | service_port = get_service_port("WAVMARK_SERVICE_PORT") 586 | url = f'http://{localhost_addr}:{service_port}/audio_watermark' 587 | out_wav = get_path_from_target_dir(out_wav, wav_path) 588 | data = { 589 | 'wav_path': f'{wav_path}', 590 | 'action': "encode", 591 | 'output_wav':f'{out_wav}', 592 | 'sample_rate': sample_rate, 593 | } 594 | 595 | response = requests.post(url, json=data) 596 | 597 | if response.status_code == 200: 598 | print('Success:', response.json()['message']) 599 | return out_wav 600 | else: 
601 | print('Error:', response.json()['API error']) 602 | raise RuntimeError(response.json()['API error']) 603 | 604 | 605 | def _DECODE_WATERMARK(wav_path, sample_rate=SAMPLE_RATE): 606 | service_port = get_service_port("WAVMARK_SERVICE_PORT") 607 | url = f'http://{localhost_addr}:{service_port}/audio_watermark' 608 | data = { 609 | 'wav_path': f'{wav_path}', 610 | 'action': "decode", 611 | 'sample_rate': sample_rate, 612 | } 613 | 614 | response = requests.post(url, json=data) 615 | 616 | if response.status_code == 200: 617 | print('Success:', response.json()['message']) 618 | return wav_path 619 | else: 620 | print('Error:', response.json()['API error']) 621 | raise RuntimeError(response.json()['API error']) 622 | -------------------------------------------------------------------------------- /wavcraft/configs.yaml: -------------------------------------------------------------------------------- 1 | AudioCraft: 2 | # MusicGen 3 | ttm_model_size: melody # [small, medium, large] 4 | # AudioGen 5 | tta_model_size: medium # [medium] 6 | 7 | Text-to-Speech: 8 | # Bark 9 | speed: 1.05 10 | 11 | Speech-Restoration: 12 | # VoiceFixer 13 | Enable: True 14 | 15 | AudioLDM: 16 | model_size: audioldm-m-full -------------------------------------------------------------------------------- /wavcraft/ffmpeg_engineer.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import time 4 | import glob 5 | import pickle 6 | import openai 7 | 8 | 9 | class FFmpegEngineer: 10 | OPENAI_KEY = os.environ.get('OPENAI_KEY') 11 | 12 | def __init__(self, use_openai_cache=False): 13 | self.use_openai_cache = use_openai_cache 14 | self.openai_cache = [] 15 | if self.use_openai_cache: 16 | os.makedirs('cache', exist_ok=True) 17 | for cache_file in glob.glob('cache/*.pkl'): 18 | with open(cache_file, 'rb') as file: 19 | self.openai_cache.append(pickle.load(file)) 20 | 21 | self.history = [ 22 | { 23 | "role": "system", 24 | "content": "You are a helpful assistant."
25 | }, 26 | ] 27 | 28 | def complete(self, prompt, model="gpt-4", api_key=""): 29 | content = self.chat_with_gpt(prompt, model, api_key) 30 | content = self.extract_ffmpeg_command(self.try_extract_content_from_quotes(content)) 31 | self.execute_code(content) 32 | 33 | 34 | def chat_with_gpt(self, prompt, model="gpt-4", api_key=""): 35 | api_key = self.OPENAI_KEY if not api_key else api_key 36 | 37 | if self.use_openai_cache: 38 | filtered_object = list(filter(lambda x: x['prompt'] == prompt, self.openai_cache)) 39 | if len(filtered_object) > 0: 40 | response = filtered_object[0]['response'] 41 | return response 42 | 43 | self.history.append( 44 | { 45 | "role": "user", 46 | "content": prompt 47 | }, 48 | ) 49 | 50 | try: 51 | openai.api_key = api_key 52 | chat = openai.ChatCompletion.create( 53 | model=model, # "gpt-3.5-turbo", 54 | messages=self.history, 55 | ) 56 | response = chat['choices'][0]['message']['content'] 57 | 58 | self.history.append( 59 | { 60 | "role": "system", 61 | "content": response 62 | }, 63 | ) 64 | 65 | finally: 66 | openai.api_key = '' 67 | 68 | if self.use_openai_cache: 69 | cache_obj = { 70 | 'prompt': prompt, 71 | 'response': response, 72 | } 73 | with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache: 74 | pickle.dump(cache_obj, _openai_cache) 75 | self.openai_cache.append(cache_obj) 76 | 77 | return response 78 | 79 | 80 | def reset(self,): 81 | self.history = [] 82 | 83 | 84 | @classmethod 85 | def _extract_substring_with_quotes(cls, input_string, quotes="'''"): 86 | pattern = f"{quotes}(.*?){quotes}" 87 | matches = re.findall(pattern, input_string, re.DOTALL) 88 | return matches 89 | 90 | @classmethod 91 | def extract_ffmpeg_command(cls, input_string): 92 | # Split the string into lines 93 | lines = input_string.split('\n') 94 | 95 | # Find the index where the 'ffmpeg' command starts 96 | start_index = next((i for i, line in enumerate(lines) if 'ffmpeg' in line), None) 97 | 98 | # Extract lines from the start of the 'ffmpeg' command till the end or a specific end pattern 99 | if start_index is not None: 100 | ffmpeg_lines = lines[start_index:] 101 | # ffmpeg_lines = ffmpeg_lines[:end_index] 102 | return '\n'.join(ffmpeg_lines) 103 | else: 104 | return "" 105 | 106 | def try_extract_content_from_quotes(self, content): 107 | if "'''" in content: 108 | return self._extract_substring_with_quotes(content)[0] 109 | elif "```" in content: 110 | return self._extract_substring_with_quotes(content, quotes="```")[0] 111 | else: 112 | return content 113 | 114 | def execute_code(self, content): 115 | os.system(content) 116 | 117 | 118 | """ Test """ 119 | if __name__ == "__main__": 120 | import os 121 | 122 | eng = FFmpegEngineer() 123 | 124 | prompt = "Using bash to check the version of ffmpeg in linux." 
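# (Editorial note, not from the original: for a prompt like the one above,
#  try_extract_content_from_quotes() plus extract_ffmpeg_command() would
#  typically recover a command such as `ffmpeg -version` from the model's
#  reply, which execute_code() then runs via os.system. The exact command
#  depends on the model's response.)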
125 | model = "gpt-4" 126 | OPENAI_KEY = "your_key" # can be set using `${OPENAI_KEY}` from env 127 | 128 | # To examine the functions, you can use the following lines 129 | # response = eng.chat_with_gpt(prompt, model, api_key) 130 | # print(response) 131 | # code = eng.extract_ffmpeg_command(eng.try_extract_content_from_quotes(response)) 132 | # print(code) 133 | # eng.execute_code(code) 134 | # One can replace the above lines with a single call: 135 | eng.complete(prompt, model="gpt-4", api_key=OPENAI_KEY) 136 | -------------------------------------------------------------------------------- /wavcraft/mistral_api.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import List 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | 5 | 6 | class _chatLLM: 7 | def __init__(self, model_id): 8 | self.messages = [] 9 | self.model, self.tokenizer = self.build_model(model_id) 10 | 11 | def create_new_session(self): 12 | self.messages = [] 13 | 14 | 15 | class ChatMistral(_chatLLM): 16 | def __init__(self, model_id="mistralai/Mistral-7B-Instruct-v0.2"): 17 | super().__init__(model_id) 18 | self.device = self.model.device 19 | 20 | 21 | def get_response(self, prompt: str): 22 | self.messages.append({"role": "user", "content": prompt}) 23 | 24 | encodes = self.tokenizer.apply_chat_template(self.messages, return_tensors="pt").to(self.device) 25 | 26 | generated_ids = self.model.generate(encodes, max_new_tokens=1000, do_sample=True) 27 | decoded = self.tokenizer.batch_decode(generated_ids)[-1] 28 | 29 | response = self.extract_response(decoded) 30 | self.messages.append({"role": "assistant", "content": response}) # Update conversation 31 | 32 | return response 33 | 34 | 35 | def build_model(self, model_id="mistralai/Mistral-7B-Instruct-v0.2"): 36 | from transformers import BitsAndBytesConfig 37 | quantization_config = BitsAndBytesConfig( 38 | # 8-bit quantization 39 | load_in_8bit=True, 40 | # 4-bit quantization 41 | # load_in_4bit=True, 42 | # bnb_4bit_quant_type="nf4", 43 | # bnb_4bit_compute_dtype=torch.float16, 44 | ) 45 | 46 | model = AutoModelForCausalLM.from_pretrained( 47 | model_id, 48 | torch_dtype=torch.float16, 49 | # attn_implementation="flash_attention_2", # NOTE: cannot use with V100 50 | quantization_config=quantization_config, 51 | device_map="auto") 52 | 53 | # [Updated 23-04-2024] the Mistral built-in tokenizer has the same function as the Hugging Face tokenizer 54 | # One can stick to the Hugging Face tokenizer UNLESS tool calling is required. 55 | tokenizer = AutoTokenizer.from_pretrained(model_id) 56 | 57 | return model, tokenizer 58 | 59 | 60 | def extract_response(self, responses): 61 | # Split the interaction by "</s>" to separate each round, and take the last non-empty round if any 62 | rounds = [r for r in responses.split("</s>") if r.strip()] 63 | # Split response by "[/INST]" 64 | last_response = rounds[-1].strip().split("[/INST]")[-1].strip() 65 | 66 | return last_response 67 | 68 | if __name__ == "__main__": 69 | llm = ChatMistral() 70 | llm.messages = [ 71 | {"role": "user", "content": "What is your favourite condiment?"}, 72 | {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice.
--------------------------------------------------------------------------------
/wavcraft/pipeline.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import openai
4 | import re
5 | import glob
6 | import pickle
7 | import time
8 | import random
9 | import string
10 | from retrying import retry
11 | from glob import glob  # NOTE: shadows the `glob` module imported above
12 | 
13 | from wavcraft.mistral_api import ChatMistral
14 | import wavcraft.utils as utils
15 | 
16 | 
17 | # Enable this for debugging
18 | USE_OPENAI_CACHE = False
19 | openai_cache = []
20 | if USE_OPENAI_CACHE:
21 |     os.makedirs('cache', exist_ok=True)
22 |     for cache_file in glob('cache/*.pkl'):  # `glob` is the function imported on line 11
23 |         with open(cache_file, 'rb') as file:
24 |             openai_cache.append(pickle.load(file))
25 | 
26 | 
27 | # Global vars
28 | chat_history = []
29 | local_llm = None
30 | 
31 | 
32 | def chat_with_gpt(api_key, model="gpt-4"):
33 |     # model: "gpt-4" or "gpt-3.5-turbo"
34 |     global chat_history
35 | 
36 |     if USE_OPENAI_CACHE:
37 |         filtered_object = list(filter(lambda x: x['prompt'] == chat_history[-1]["content"], openai_cache))
38 |         if len(filtered_object) > 0:
39 |             response = filtered_object[0]['response']
40 |             return response
41 | 
42 |     try:
43 |         openai.api_key = api_key
44 | 
45 |         chat = openai.ChatCompletion.create(
46 |             model=model,
47 |             messages=chat_history,
48 |         )
49 |     finally:
50 |         openai.api_key = ''
51 | 
52 |     if USE_OPENAI_CACHE:
53 |         cache_obj = {
54 |             'prompt': chat_history[-1]["content"],
55 |             'response': chat['choices'][0]['message']['content']
56 |         }
57 |         with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache:
58 |             pickle.dump(cache_obj, _openai_cache)
59 |         openai_cache.append(cache_obj)
60 | 
61 |     chat_history.append({
62 |         "role": "assistant",  # store the model reply as an assistant turn
63 |         "content": chat['choices'][0]['message']['content'],
64 |     })
65 | 
66 |     return chat['choices'][0]['message']['content']
67 | 
68 | 
69 | # Assuming the existence of USE_OPENAI_CACHE, chat_history, and openai_cache, similar to the GPT function
70 | def chat_with_mistral():
71 |     global chat_history
72 | 
73 |     if USE_OPENAI_CACHE:
74 |         filtered_object = list(filter(lambda x: x['prompt'] == chat_history[-1]["content"], openai_cache))
75 |         if len(filtered_object) > 0:
76 |             return filtered_object[0]['response']
77 | 
78 |     global local_llm
79 |     # import ipdb; ipdb.set_trace()
80 |     local_llm.messages = chat_history[:-1]
81 |     try:
82 |         response = local_llm.get_response(chat_history[-1]["content"])
83 |     finally:
84 |         pass
85 | 
86 |     if USE_OPENAI_CACHE:
87 |         cache_obj = {
88 |             'prompt': chat_history[-1]["content"],
89 |             'response': response
90 |         }
91 |         with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache:
92 |             pickle.dump(cache_obj, _openai_cache)
93 |         openai_cache.append(cache_obj)
94 | 
95 |     chat_history.append({
96 |         "role": "assistant",
97 |         "content": response,
98 |     })
99 | 
100 |     return response
101 | 
102 | 
103 | def get_file_content(filename):
104 |     with open(filename, 'r') as file:
105 |         return file.read().strip()
106 | 
107 | 
108 | def write_to_file(filename, content):
109 |     with open(filename, 'w') as file:
110 |         file.write(content)
111 | 
112 | 
113 | def extract_substring_with_quotes(input_string, quotes="'''"):
114 |     pattern = f"{quotes}(.*?){quotes}"
115 |     matches = re.findall(pattern, input_string, re.DOTALL)
116 |     return matches
117 | 
118 | 
119 | def maybe_remove_python_as_prefix(content):
120 |     keyword = "python"
121 | 
122 |     content = content.strip()
123 |     if content.startswith(keyword):
124 |         # Remove the keyword and strip leading/trailing whitespaces
125 |         return content[len(keyword):].strip()
126 |     return content
127 | 
128 | 
129 | def try_extract_content_from_quotes(content):
130 |     if "'''" in content:
131 |         return maybe_remove_python_as_prefix(extract_substring_with_quotes(content)[0])
132 |     elif "```" in content:
133 |         return maybe_remove_python_as_prefix(extract_substring_with_quotes(content, quotes="```")[0])
134 |     else:
135 |         return maybe_remove_python_as_prefix(content)
136 | 
137 | 
138 | def maybe_get_content_from_file(content_or_filename):
139 |     if os.path.exists(content_or_filename):
140 |         with open(content_or_filename, 'r') as file:
141 |             return file.read().strip()
142 |     return content_or_filename
143 | 
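The three helpers above cooperate to turn a raw LLM reply into bare code. A quick illustration (the reply string is invented; run inside this module or import the helpers from `wavcraft.pipeline`):

reply = "Here is the code:\n```python\nWAV0, WAV1 = TSS(INPUT_WAV0, text='dog barking')\n```"

# The fenced block is recovered and the leading "python" language tag is stripped
print(try_extract_content_from_quotes(reply))
# -> WAV0, WAV1 = TSS(INPUT_WAV0, text='dog barking')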
144 | 
145 | # Pipeline Interface Guidelines:
146 | #
147 | # Init calls:
148 | # - Init calls must be made before running the actual steps
149 | # - init_session() is called every time a gradio webpage is loaded
150 | #
151 | # Single Step:
152 | # - takes input (file or content) and an output path as input
153 | # - most of the time just returns output content
154 | #
155 | # Compositional Step:
156 | # - takes session_id as input (once you have the session_id, you have all the paths)
157 | # - runs a series of steps
158 | 
159 | # This is called for every new gradio webpage
160 | 
161 | def init_session(session_id=''):
162 |     def uid8():
163 |         return ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
164 | 
165 |     if session_id == '':
166 |         session_id = f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{uid8()}'
167 |         # create the paths
168 |         os.makedirs(utils.get_session_audio_path(session_id))
169 |         print(f'New session created, session_id={session_id}')
170 |     return session_id
171 | 
172 | 
173 | @retry(stop_max_attempt_number=3)
174 | def _input_text_to_code_with_retry(log_path, api_key, model="gpt-4"):
175 |     print(" trying ...")
176 |     try:
177 |         if "mistral" in model:
178 |             code_response = chat_with_mistral()
179 |         elif "gpt" in model:
180 |             code_response = try_extract_content_from_quotes(chat_with_gpt(api_key, model))
181 |         else:
182 |             raise ValueError(f"Unsupported model: {model}")
183 | 
184 |     except Exception as err:
185 |         global chat_history
186 |         chat_log = f'\n{chat_history}\n\nINPUT ERROR: {err}'
187 |         write_to_file(log_path, chat_log)
188 |         raise err
189 | 
190 |     return code_response
191 | 
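The `@retry(stop_max_attempt_number=3)` decorator from the `retrying` package re-invokes the function whenever it raises, up to three attempts in total, and only then re-raises the last error. A standalone sketch of that behaviour:

from retrying import retry

attempts = {"n": 0}

@retry(stop_max_attempt_number=3)
def flaky():
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise RuntimeError("transient failure")  # triggers a retry
    return "ok"

print(flaky(), "after", attempts["n"], "attempts")  # -> ok after 3 attempts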
192 | 
193 | wav_description = {}
194 | # [Basic] Step 1: input to py code
195 | def input_text_to_code(input_text, output_path, api_key, model="gpt-4"):
196 |     # Declare the input audio in the instruction
197 |     input_description = "Input audio:\n"
198 |     n_input_wavs = len(glob(os.path.join(output_path, 'audio', 'input_*.wav')))
199 |     if n_input_wavs > 0:
200 |         for i in range(n_input_wavs):
201 |             in_wav = f"{output_path.absolute()}/audio/input_{i}.wav"
202 | 
203 |             if in_wav in wav_description:
204 |                 continue
205 |             else:
206 |                 input_description += f"INPUT_WAV{i}\n"
207 |                 # Add a placeholder to the log, as the basic mode does not need to describe the audio
208 |                 wav_description[in_wav] = ""
209 |     else:  # no input wav
210 |         input_description = ""
211 | 
212 |     input_text = maybe_get_content_from_file(input_text)
213 | 
214 |     log_path = output_path / 'chat.log'
215 |     if not os.path.exists(log_path):
216 |         text_to_audio_script_prompt = get_file_content('wavcraft/prompts/text_to_code.prompt')
217 |         prompt = f'{text_to_audio_script_prompt}\n\n{input_description}Instruction:\n{input_text}\nCode:\n'
218 |     else:
219 |         text_to_followup = get_file_content('wavcraft/prompts/text_to_followup.prompt')
220 |         prompt = f'{input_description}{text_to_followup}\n{input_text}'
221 | 
222 |     global chat_history
223 |     chat_history.append({
224 |         "role": "user",
225 |         "content": prompt,
226 |     })
227 | 
228 |     write_to_file(log_path, f"{chat_history}")
229 |     code_response = _input_text_to_code_with_retry(log_path, api_key, model)
230 |     executable_code_filename = output_path / 'audio_executable.py'
231 | 
232 |     write_to_file(executable_code_filename, code_response)
233 |     return code_response
234 | 
235 | 
236 | # [Inspiration] Step 1: input to py code
237 | def input_text_to_code_plus(input_text, output_path, api_key, model="gpt-4"):
238 |     import sys
239 | 
240 |     sys.path.append(os.path.dirname(__file__))
241 |     from wavcraft.apis import AU
242 | 
243 |     # Add descriptions of the input audio to the prompt
244 |     input_description = "Input audio:\n"
245 |     n_input_wavs = len(glob(os.path.join(output_path, 'audio', 'input_*.wav')))
246 |     for i in range(n_input_wavs):
247 |         in_wav = f"{output_path.absolute()}/audio/input_{i}.wav"
248 | 
249 |         if in_wav in wav_description:
250 |             continue
251 |         else:
252 |             response = AU(in_wav, text="write an audio caption describing the sound")
253 |             input_description += f"INPUT_WAV{i}: {response}\n"
254 |             # Add to the log
255 |             wav_description[in_wav] = response
256 | 
257 |     input_text = maybe_get_content_from_file(input_text)
258 | 
259 |     log_path = output_path / 'chat.log'
260 |     if not os.path.exists(log_path):
261 |         text_to_audio_script_prompt = get_file_content('wavcraft/prompts/text_to_code.prompt')
262 |         prompt = f'{text_to_audio_script_prompt}\n\n{input_description}Instruction:\n{input_text}\nCode:\n'
263 |     else:
264 |         text_to_followup = get_file_content('wavcraft/prompts/text_to_followup.prompt')
265 |         prompt = f'{input_description}{text_to_followup}\n{input_text}'
266 | 
267 |     global chat_history
268 |     chat_history.append({
269 |         "role": "user",
270 |         "content": prompt,
271 |     })
272 | 
273 |     write_to_file(log_path, f"{chat_history}")
274 | 
275 |     code_response = _input_text_to_code_with_retry(log_path, api_key, model)
276 |     executable_code_filename = output_path / 'audio_executable.py'
277 | 
278 |     write_to_file(executable_code_filename, code_response)
279 |     return code_response
280 | 
281 | 
282 | # Step 2: py code to final wav
283 | def audio_exe_to_result(code_response, output_path):
284 |     executable_code_filename = output_path / 'audio_executable.py'
285 | 
286 |     # TODO: make this easier to modify
287 |     # Executable file header
288 |     header = "from wavcraft.apis import LEN, OUTPUT, SPLIT, MIX, CAT, ADJUST_VOL, ADD_NOISE, LOW_PASS, HIGH_PASS, ADD_RIR, ROOM_SIMULATE, TTM, TTA, TTS, SR, TSS, INPAINT"
289 | 
290 |     input_claimer = ""
291 |     n_input_wavs = len(glob(os.path.join(output_path, 'audio', 'input_*.wav')))
292 |     for i in range(n_input_wavs):
293 |         in_wav = f"\"{output_path.absolute()}/audio/input_{i}.wav\""
294 |         input_claimer += f"INPUT_WAV{i} = {in_wav}\n"
295 | 
296 |     tail = "OUTPUT(OUTPUT_WAV)"
297 |     code_response = maybe_get_content_from_file(code_response)
298 |     command = f"{header}\n\n\n{input_claimer}{code_response}\n{tail}"
299 |     write_to_file(executable_code_filename, command)
300 | 
301 |     os.system(f'PYTHONPATH=. python {executable_code_filename}')
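For a session with a single input wav, the `audio_executable.py` assembled by `audio_exe_to_result` takes roughly the following shape; the middle section is whatever code the LLM returned (a made-up example is shown here), sandwiched between the fixed import header and the `OUTPUT` tail:

from wavcraft.apis import LEN, OUTPUT, SPLIT, MIX, CAT, ADJUST_VOL, ADD_NOISE, LOW_PASS, HIGH_PASS, ADD_RIR, ROOM_SIMULATE, TTM, TTA, TTS, SR, TSS, INPAINT


INPUT_WAV0 = "/abs/path/output/sessions/<session_id>/audio/input_0.wav"
# --- LLM-generated code (illustrative) ---
WAV0, WAV1 = TSS(INPUT_WAV0, text="dog barking")
OUTPUT_WAV = ADJUST_VOL(WAV0, volume=3)
# --- end of LLM-generated code ---
OUTPUT(OUTPUT_WAV)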
302 | 
303 | 
304 | # Function call used by Gradio: input_text to json
305 | def generate_code(session_id, input_wav, input_text, api_key, model="gpt-4", mode="basic"):
306 |     assert mode in ("basic", "inspiration")
307 | 
308 |     output_path = utils.get_session_path(session_id)
309 |     os.makedirs(output_path, exist_ok=True)
310 |     for i, in_wav in enumerate(input_wav):
311 |         os.system(f"cp {in_wav} {os.path.join(output_path, 'audio', f'input_{i}.wav')}")
312 | 
313 |     # Step 1
314 |     print(f'session_id={session_id}, Step 1: Writing executable code with LLM ...')
315 |     if mode == "basic":
316 |         return input_text_to_code(input_text, output_path, api_key, model)
317 |     else:
318 |         return input_text_to_code_plus(input_text, output_path, api_key, model)
319 | 
320 | 
321 | # Function call used by Gradio: json to result wav
322 | def generate_audio(session_id, code_response):
323 |     output_path = utils.get_session_path(session_id)
324 |     # Step 2
325 |     print(f'session_id={session_id}, Step 2: Start running the Python program...')
326 |     audio_exe_to_result(code_response, output_path)
327 | 
328 | 
329 | # Convenient function call used by wavjourney_cli
330 | def full_steps(session_id, input_wav, input_text, api_key, mode, model="gpt-4"):
331 |     global local_llm, chat_history
332 | 
333 |     if "mistral" in model:
334 |         local_llm = ChatMistral(model_id=model)
335 |     elif "gpt" in model:
336 |         chat_history = [{
337 |             "role": "system",
338 |             "content": "You are a helpful assistant.",
339 |         }]
340 |     else:
341 |         raise ValueError(f"Unsupported model: {model}.")
342 |     code_script = generate_code(session_id, input_wav, input_text, api_key, model=model, mode=mode)
343 |     return generate_audio(session_id, code_script)
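Putting the pieces together, a minimal driver for this module (how a CLI entry point such as WavCraft.py might call it) could look like the sketch below; the input file is one shipped under assets/, the instruction is a placeholder, and the key is read from the environment as elsewhere in the repo:

import wavcraft.pipeline as pipeline
import wavcraft.utils as utils

session_id = pipeline.init_session()
pipeline.full_steps(
    session_id,
    input_wav=["assets/duck_quacking_in_water.wav"],
    input_text="Add a low-pass filter and increase the volume by 3 dB.",
    api_key=utils.get_api_key(),  # reads ${OPENAI_KEY}
    mode="basic",
    model="gpt-4",
)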
--------------------------------------------------------------------------------
/wavcraft/prompts/text_to_code.prompt:
--------------------------------------------------------------------------------
1 | You are a professional audio editor. Try to follow the instruction I give using several predefined tools:
2 | LEN(wav) # returns the duration of `wav` in seconds
3 | MIX(wavs: list[tuple]) # returns the mixture of the input `wavs`
4 | CAT(wavs: list) # returns the concatenated wav using input `wavs`
5 | SPLIT(wav, break_points=list[float]) # returns the split wavs using `break_points`
6 | ADJUST_VOL(wav, volume: int) # returns the wav adjusted by `volume`
7 | TTA(text: str, length: float, volume: int) # returns a generated audio conditioned on `text`
8 | TTM(text: str, melody, length: float, volume: int) # returns a generated music conditioned on `text` and (optional) `melody`
9 | TTS(text: str, volume: int, speaker: str) # returns a generated speech conditioned on `text` and (optional) `speaker`. `speaker` should be in ['Male1_En', 'Male2_En', 'Female1_En', 'Female2_En', 'News_Male_En', 'News_Female_En', 'News_Female_Out_En', 'Child_En', 'Old_Man_En', 'Male1_Zh', 'Male2_Zh', 'Female1_Zh', 'Female2_Zh', 'Male1_Fr', 'Male2_Fr', 'Female1_Fr', 'Female2_Fr', 'Male1_De', 'Male2_De', 'Female1_De', 'Female2_De', 'Male1_Hi', 'Male2_Hi', 'Female1_Hi', 'Female2_Hi', 'Male1_It', 'Male2_It', 'Female1_It', 'Female2_It', 'Male1_Ja', 'Male2_Ja', 'Female1_Ja', 'Female2_Ja', 'Male1_Ko', 'Male2_Ko', 'Female1_Ko', 'Female1_Ru', 'Female2_Ru', 'Male1_Ru', 'Male2_Ru', 'Female1_Es', 'Female2_Es', 'Male1_Es', 'Male2_Es', 'Female1_Tr', 'Female2_Tr', 'Male1_Tr', 'Male2_Tr', 'Male1_Pt', 'Male2_Pt', 'Female1_Pl', 'Female2_Pl', 'Male1_Pl', 'Male2_Pl']
10 | SR(wav, seed: int) # returns a wav upsampled to 48kHz
11 | TSS(wav, text: str) # returns foreground and background wav conditioned on `text`
12 | ADD_NOISE(wav, min_snr_db: float, max_snr_db: float) # returns a generated audio mixed with gaussian noise
13 | LOW_PASS(wav, min_cutoff_freq: float, max_cutoff_freq: float, min_rolloff: int, max_rolloff: int) # returns a generated audio processed by a low-pass filter
14 | HIGH_PASS(wav, min_cutoff_freq: float, max_cutoff_freq: float, min_rolloff: int, max_rolloff: int) # returns a generated audio processed by a high-pass filter
15 | ADD_RIR(wav, ir) # returns a generated audio mixed with a given room impulse response
16 | ROOM_SIMULATE(wav, min_size_x: float, max_size_x: float, min_size_y: float, max_size_y: float, min_size_z: float, max_size_z: float, min_absorption_value: float, max_absorption_value: float, min_source_x: float, max_source_x: float, min_source_y: float, max_source_y: float, min_source_z: float, max_source_z: float, min_mic_distance: float, max_mic_distance: float, min_mic_azimuth: float, max_mic_azimuth: float, min_mic_elevation: float, max_mic_elevation: float) # returns a synthesized audio by mixing the input `wav` with a room-specific synthesized impulse response
17 | INPAINT(wav, text: str, onset: float, offset: float, duration: float) # returns a fixed audio where the part between `onset` and `offset` has been inpainted
18 | 
19 | 
20 | I will give you several examples:
21 | Instruction:
22 | Increase the volume of child speech by 5 dB, decrease the volume of drum by 3 dB, drop the sound of machine sound.
23 | Code:
24 | # Separate the sound of 'child speech' from the mixture and return both 'child speech' and the background sounds
25 | WAV0, WAV1 = TSS(INPUT_WAV0, text="child speech")
26 | # Separate the sound of 'drum' from the mixture and return both 'drum' and the background sounds
27 | WAV2, WAV3 = TSS(WAV1, text="drum")
28 | # Drop the sound of 'machine sound' from the mixture
29 | _, WAV3 = TSS(WAV3, text="machine sound")
30 | # Increase the volume of 'child speech' by 5 dB
31 | WAV0 = ADJUST_VOL(WAV0, volume=5)
32 | # Decrease the volume of 'drum' by 3 dB
33 | WAV2 = ADJUST_VOL(WAV2, volume=-3)
34 | # Mix the resulting sounds together
35 | OUTPUT_WAV = MIX([(WAV0, 0), (WAV2, 0), (WAV3, 0)])
36 | 
37 | Instruction:
38 | Extract 1-5s of the first audio with a low-pass filter to simulate the sound coming from inside a building. Replace male speech with dog barking in the second audio. Upsample the mix.
39 | Code:
40 | # Truncate the sound between 1 s and 5 s
41 | _, WAV0, _ = SPLIT(INPUT_WAV0, break_points=[1, 5])
42 | # Add a low-pass filter
43 | WAV0 = LOW_PASS(WAV0, min_cutoff_freq=300.0, max_cutoff_freq=800.0, min_rolloff=6, max_rolloff=12)
44 | # Extract the sound of 'male speech' from the second audio
45 | WAV1, WAV2 = TSS(INPUT_WAV1, text="male speech")
46 | # Generate the sound of 'dog barking' with the same length as the sound of 'male speech'
47 | WAV3 = TTA(text="dog barking", length=LEN(WAV1), volume=4)
48 | # Combine the sounds by mixing them together
49 | MIXTURE_WAV = MIX([(WAV3, 0), (WAV2, 0), (WAV0, 0)])
50 | # Perform super-resolution on the mixture of sounds
51 | OUTPUT_WAV = SR(MIXTURE_WAV)
52 | 
53 | Instruction:
54 | Isolate train sound in the input audio, apply a high-pass filter and increase the volume by 3 dB. Repeat it five times to simulate a longer train passing.
55 | Code:
56 | # Extract the sound of a train from the audio
57 | WAV0, _ = TSS(INPUT_WAV0, text="train")
58 | # Apply a high-pass filter to reduce low-frequency noise
59 | FILTERED_WAV0 = HIGH_PASS(WAV0, min_cutoff_freq=500.0, max_cutoff_freq=1000.0, min_rolloff=6, max_rolloff=12)
60 | # Increase the volume by 3 dB
61 | FILTERED_WAV0 = ADJUST_VOL(FILTERED_WAV0, volume=3)
62 | # Concatenate the filtered train sound five times
63 | OUTPUT_WAV = CAT([FILTERED_WAV0] * 5)
64 | 
65 | Instruction:
66 | Extract the hammer sound from the first audio, and truncate it from the start to 2 seconds. Remove the sound of baby crying in the second audio, and then decrease the volume by 1 dB. Mix the two audios together, with the second sound beginning at 1 second. Add a reverb effect to the mixture sound using the third audio.
67 | Code:
68 | # Extract the hammer sound from the first audio
69 | WAV0, _ = TSS(INPUT_WAV0, text="hammer")
70 | # Truncate from the start to 2 seconds
71 | WAV0, _ = SPLIT(WAV0, break_points=[2])
72 | # Drop the sound of baby crying in the second audio
73 | _, WAV1 = TSS(INPUT_WAV1, text="baby crying")
74 | # Decrease the volume by 1 dB
75 | WAV1 = ADJUST_VOL(WAV1, volume=-1)
76 | # Mix the output sounds together
77 | MIXED_WAV = MIX([(WAV0, 0), (WAV1, 1)])
78 | # Add a reverb effect using the room impulse response
79 | OUTPUT_WAV = ADD_RIR(MIXED_WAV, ir=INPUT_WAV2)
80 | 
81 | Instruction:
82 | Inpaint the first audio between 2s and 5s with the text "a car passing by with rain falling". Generate a 10s long jazz music piece with the second audio as melody, then mix it with the sound of rain from the first, starting at 3s into the jazz music.
83 | Code:
84 | # Inpaint the first audio between 2s and 5s with the text "a car passing by with rain falling"
85 | WAV0 = INPAINT(INPUT_WAV0, text="a car passing by with rain falling", onset=2, offset=5, duration=LEN(INPUT_WAV0))
86 | # Generate a 10-second jazz music piece
87 | WAV1 = TTM(text="jazz", melody=INPUT_WAV1, length=10.0, volume=5)
88 | # Extract the sound of rain from the inpainted audio
89 | WAV0, _ = TSS(WAV0, text="rain")
90 | # Mix the jazz music with the rain sound, starting the rain at 3 seconds into the music
91 | OUTPUT_WAV = MIX([(WAV0, 3), (WAV1, 0)])
92 | 
93 | Instruction:
94 | Remove wind sound from an outdoor recording. Generate a 5-second saxophone music with happy mood followed by "Bravo". Mix the generated sound with the outdoor recording and simulate the mixture in a small room with high absorption.
95 | Code:
96 | # Drop the sound of wind from the original recording
97 | _, WAV0 = TSS(INPUT_WAV0, text="wind")
98 | # Generate a 5-second saxophone music with a happy mood
99 | WAV1 = TTM(text="happy saxophone", length=5.0, volume=4)
100 | # Generate a speech "Bravo"
101 | WAV2 = TTS("Bravo", volume=5)
102 | # Concatenate the generated sounds together
103 | CONCAT_WAV = CAT([WAV1, WAV2])
104 | # Mix the generated sound with the background sound
105 | MIXED_WAV = MIX([(WAV0, 0), (CONCAT_WAV, 0)])
106 | # Simulate the recording in a small room with high absorption
107 | OUTPUT_WAV = ROOM_SIMULATE(MIXED_WAV, min_size_x=3, max_size_x=4, min_size_y=3, max_size_y=4, min_size_z=2.5, max_size_z=3, min_absorption_value=0.7, max_absorption_value=0.9, min_source_x=1, max_source_x=1.5, min_source_y=1, max_source_y=1.5, min_source_z=1, max_source_z=1.5, min_mic_distance=1, max_mic_distance=1.5, min_mic_azimuth=45, max_mic_azimuth=90, min_mic_elevation=20, max_mic_elevation=30)
--------------------------------------------------------------------------------
/wavcraft/prompts/text_to_followup.prompt:
--------------------------------------------------------------------------------
1 | Regenerate the code by appending the new instruction to the previous instructions. The code must start with the provided audio (e.g., INPUT_WAV0) and cannot take the output from the previous phase (i.e., `OUTPUT_WAV`) as a known input. The new instruction is:
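The few-shot examples above rely on `MIX` interpreting each `(wav, onset)` tuple as "place this signal starting at `onset` seconds". One plausible numpy rendering of that contract — the real implementation lives in wavcraft/apis.py, which is not shown here, and may differ — is:

import numpy as np

def mix(wavs, sr=32000):
    """wavs: list of (1-D numpy array, onset in seconds) tuples."""
    total = max(len(w) + int(onset * sr) for w, onset in wavs)
    out = np.zeros(total, dtype=np.float32)
    for w, onset in wavs:
        start = int(onset * sr)
        out[start:start + len(w)] += w  # overlap-add at the requested onset
    return out

a = np.ones(4, dtype=np.float32)
b = np.ones(2, dtype=np.float32)
print(mix([(a, 0.0), (b, 1.0)], sr=2))  # -> [1. 1. 2. 2.]: b enters 1 s (2 samples) in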
--------------------------------------------------------------------------------
/wavcraft/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import string
4 | import torch
5 | import random
6 | import numpy as np
7 | from pathlib import Path
8 | 
9 | 
10 | #### path related code BEGIN ####
11 | def get_session_path(session_id):
12 |     return Path(f'output/sessions/{session_id}')
13 | 
14 | def get_system_voice_preset_path():
15 |     return Path('data/voice_presets')
16 | 
17 | def get_session_voice_preset_path(session_id):
18 |     return Path(f'{get_session_path(session_id)}/voice_presets')
19 | 
20 | def get_session_audio_path(session_id):
21 |     return Path(f'{get_session_path(session_id)}/audio')
22 | #### path related code END ####
23 | 
24 | def rescale_to_match_energy(segment1, segment2):
25 |     ratio = get_energy_ratio(segment1, segment2)
26 |     rescaled_segment1 = segment1 / ratio
27 |     return rescaled_segment1.numpy()
28 | 
29 | def text_to_abbrev_prompt(input_text):
30 |     return re.sub(r'[^a-zA-Z_]', '', '_'.join(input_text.split()[:5]))
31 | 
32 | def get_energy(x):
33 |     return np.mean(x ** 2)
34 | 
35 | 
36 | def get_energy_ratio(segment1, segment2):
37 |     energy1 = get_energy(segment1)
38 |     energy2 = max(get_energy(segment2), 1e-10)
39 |     ratio = (energy1 / energy2) ** 0.5
40 |     ratio = torch.tensor(ratio)
41 |     ratio = torch.clamp(ratio, 0.02, 50)
42 |     return ratio
43 | 
44 | def fade(audio_data, fade_duration=2, sr=32000):
45 |     audio_duration = audio_data.shape[0] / sr
46 | 
47 |     # automatically choose the fade duration
48 |     if audio_duration >= 8:
49 |         # keep fade_duration at 2
50 |         pass
51 |     else:
52 |         fade_duration = audio_duration / 5
53 | 
54 |     fade_samples = int(sr * fade_duration)
55 |     fade_in = np.linspace(0, 1, fade_samples)
56 |     fade_out = np.linspace(1, 0, fade_samples)
57 | 
58 |     audio_data_fade_in = audio_data[:fade_samples] * fade_in
59 |     audio_data_fade_out = audio_data[-fade_samples:] * fade_out
60 | 
61 |     audio_data_faded = np.concatenate((audio_data_fade_in, audio_data[len(fade_in):-len(fade_out)], audio_data_fade_out))
62 |     return audio_data_faded
63 | 
64 | # def get_key(config='config.yaml'):
65 | #     with open('config.yaml', 'r') as file:
66 | #         config = yaml.safe_load(file)
67 | #     return config['OpenAI-Key'] if 'OpenAI-Key' in config else None
68 | 
69 | def get_service_port(port='SERVICE_PORT'):
70 |     service_port = os.environ.get(port)
71 |     return service_port
72 | 
73 | def get_service_url():
74 |     service_url = os.environ.get('SERVICE_URL')
75 |     return service_url
76 | 
77 | def get_api_key():
78 |     api_key = os.environ.get('OPENAI_KEY')
79 |     return api_key
80 | 
81 | def get_max_script_lines():
82 |     max_lines = int(os.environ.get('MAX_SCRIPT_LINES', 999))
83 |     return max_lines
84 | 
85 | def get_path_from_target_dir(filename, path_or_dir):
86 |     if os.path.isfile(path_or_dir):
87 |         path_or_dir = os.path.dirname(path_or_dir)
88 |     return os.path.join(path_or_dir, filename)
89 | 
90 | def generate_random_series(n=9):
91 |     return ''.join(random.choices(string.ascii_uppercase + string.digits, k=n))
92 | 
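A quick numeric check of `fade`: a 10-second constant signal at a toy sample rate of 4 Hz gets a 2-second linear ramp at each end while the middle is left untouched:

import numpy as np
from wavcraft.utils import fade

x = np.ones(40)                    # 10 s at sr=4, so fade_duration stays at 2 s
y = fade(x, fade_duration=2, sr=4)
print(len(y), y[0], y[-1])         # -> 40 0.0 0.0 (8-sample ramps at both ends)
print(y[8:-8].min())               # -> 1.0 (middle untouched)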
--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/child_boy.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/child_boy.npz
--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/cnn_male_speaker.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/cnn_male_speaker.npz
--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/elder_morgen.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/elder_morgen.npz
--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/news_female_speaker.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/news_female_speaker.npz
--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/news_female_speaker_outside.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/news_female_speaker_outside.npz
--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/news_male_speaker.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/news_male_speaker.npz
--------------------------------------------------------------------------------
/wavcraft/voice_preset/voice_map.json:
--------------------------------------------------------------------------------
1 | {
2 |     "Male1_En": {
3 |         "id": "Male1_En",
4 |         "desc": "A normal male adult voice, British accent; Language: English.",
5 |         "npz_path": "v2/en_speaker_1"
6 |     },
7 |     "Male2_En": {
8 |         "id": "Male2_En",
9 |         "desc": "A normal male adult voice, American accent; Language: English.",
10 |         "npz_path": "v2/en_speaker_6"
11 |     },
12 |     "Female1_En": {
13 |         "id": "Female1_En",
14 |         "desc": "A normal female adult voice, British accent; Language: English.",
15 |         "npz_path": "v2/en_speaker_9"
16 |     },
17 |     "Female2_En": {
18 |         "id": "Female2_En",
19 |         "desc": "A normal female adult voice, American accent; Language: English.",
20 |         "npz_path": "v2/de_speaker_3"
21 |     },
22 |     "News_Male_En": {
23 |         "id": "News_Male_En",
24 |         "desc": "A male voice of a news anchor, suitable for news scenarios; Language: English.",
25 |         "npz_path": "wavcraft/voice_preset/npz/news_male_speaker.npz"
26 |     },
27 |     "News_Female_En": {
28 |         "id": "News_Female_En",
29 |         "desc": "A female voice of a news anchor, suitable for news scenarios; Language: English.",
30 |         "npz_path": "wavcraft/voice_preset/npz/news_female_speaker.npz"
31 |     },
32 |     "News_Female_Out_En": {
33 |         "id": "News_Female_Out_En",
34 |         "desc": "A female voice of an off-site news reporter, suitable for news scenarios; Language: English.",
35 |         "npz_path": "wavcraft/voice_preset/npz/news_female_speaker_outside.npz"
36 |     },
37 |     "Child_En": {
38 |         "id": "Child_En",
39 |         "desc": "A young boy's voice; Language: English.",
40 |         "npz_path": "wavcraft/voice_preset/npz/child_boy.npz"
41 |     },
42 |     "Old_Man_En": {
43 |         "id": "Old_Man_En",
44 |         "desc": "A voice of an old man; Language: English.",
45 |         "npz_path": "wavcraft/voice_preset/npz/elder_morgen.npz"
46 |     },
47 |     "Male1_Zh": {
48 |         "id": "Male1_Zh",
49 |         "desc": "A normal male adult voice; Language: Chinese.",
50 |         "npz_path": "v2/zh_speaker_0"
51 |     },
52 |     "Male2_Zh": {
53 |         "id": "Male2_Zh",
54 |         "desc": "A normal male adult voice; Language: Chinese.",
55 |         "npz_path": "v2/zh_speaker_1"
56 |     },
57 |     "Female1_Zh": {
58 |         "id": "Female1_Zh",
59 |         "desc": "A normal female adult voice; Language: Chinese.",
60 |         "npz_path": "v2/zh_speaker_9"
61 |     },
62 |     "Female2_Zh": {
63 |         "id": "Female2_Zh",
64 |         "desc": "A normal female adult voice; Language: Chinese.",
65 |         "npz_path": "v2/zh_speaker_4"
66 |     },
67 |     "Male1_Fr": {
68 |         "id": "Male1_Fr",
69 |         "desc": "A normal male adult voice; Language: French.",
70 |         "npz_path": "v2/fr_speaker_0"
71 |     },
72 |     "Male2_Fr": {
73 |         "id": "Male2_Fr",
74 |         "desc": "A normal male adult voice; Language: French.",
75 |         "npz_path": "v2/fr_speaker_8"
76 |     },
77 |     "Female1_Fr": {
78 |         "id": "Female1_Fr",
79 |         "desc": "A normal female adult voice; Language: French.",
80 |         "npz_path": "v2/fr_speaker_5"
81 |     },
82 |     "Female2_Fr": {
83 |         "id": "Female2_Fr",
84 |         "desc": "A normal female adult voice; Language: French.",
85 |         "npz_path": "v2/fr_speaker_1"
86 |     },
87 |     "Male1_De": {
88 |         "id": "Male1_De",
89 |         "desc": "A normal male adult voice; Language: German.",
90 |         "npz_path": "v2/de_speaker_0"
91 |     },
92 |     "Male2_De": {
93 |         "id": "Male2_De",
94 |         "desc": "A normal male adult voice; Language: German.",
95 |         "npz_path": "v2/de_speaker_1"
96 |     },
97 |     "Female1_De": {
98 |         "id": "Female1_De",
99 |         "desc": "A normal female adult voice; Language: German.",
100 |         "npz_path": "v2/de_speaker_3"
101 |     },
102 |     "Female2_De": {
103 |         "id": "Female2_De",
104 |         "desc": "A normal female adult voice; Language: German.",
105 |         "npz_path": "v2/de_speaker_8"
106 |     },
107 |     "Male1_Hi": {
108 |         "id": "Male1_Hi",
109 |         "desc": "A normal male adult voice; Language: Hindi.",
110 |         "npz_path": "v2/hi_speaker_5"
111 |     },
112 |     "Male2_Hi": {
113 |         "id": "Male2_Hi",
114 |         "desc": "A normal male adult voice; Language: Hindi.",
normal male adult voice; Language: Hindi.", 115 | "npz_path": "v2/hi_speaker_8" 116 | }, 117 | "Female1_Hi": { 118 | "id": "Female1_Hi", 119 | "desc": "A normal female adult voice; Language: Hindi.", 120 | "npz_path": "v2/hi_speaker_0" 121 | }, 122 | "Female2_Hi": { 123 | "id": "Female2_Hi", 124 | "desc": "A normal female adult voice; Language: Hindi.", 125 | "npz_path": "v2/hi_speaker_3" 126 | }, 127 | "Male1_It": { 128 | "id": "Male1_It", 129 | "desc": "A normal male adult voice; Language: Italian.", 130 | "npz_path": "v2/it_speaker_4" 131 | }, 132 | "Male2_It": { 133 | "id": "Male2_It", 134 | "desc": "A normal male adult voice; Language: Italian.", 135 | "npz_path": "v2/it_speaker_5" 136 | }, 137 | "Female1_It": { 138 | "id": "Female1_It", 139 | "desc": "A normal female adult voice; Language: Italian.", 140 | "npz_path": "v2/it_speaker_7" 141 | }, 142 | "Female2_It": { 143 | "id": "Female2_It", 144 | "desc": "A normal female adult voice; Language: Italian.", 145 | "npz_path": "v2/it_speaker_9" 146 | }, 147 | "Male1_Ja": { 148 | "id": "Male1_Ja", 149 | "desc": "A normal male adult voice; Language: Japanese.", 150 | "npz_path": "v2/ja_speaker_2" 151 | }, 152 | "Male2_Ja": { 153 | "id": "Male2_Ja", 154 | "desc": "A normal male adult voice; Language: Japanese.", 155 | "npz_path": "v2/ja_speaker_6" 156 | }, 157 | "Female1_Ja": { 158 | "id": "Female1_Ja", 159 | "desc": "A normal female adult voice; Language: Japanese.", 160 | "npz_path": "v2/ja_speaker_4" 161 | }, 162 | "Female2_Ja": { 163 | "id": "Female2_Ja", 164 | "desc": "A normal female adult voice; Language: Japanese.", 165 | "npz_path": "v2/ja_speaker_5" 166 | }, 167 | "Male1_Ko": { 168 | "id": "Male1_Ko", 169 | "desc": "A normal male adult voice; Language: Korean.", 170 | "npz_path": "v2/ko_speaker_1" 171 | }, 172 | "Male2_Ko": { 173 | "id": "Male2_Ko", 174 | "desc": "A normal male adult voice; Language: Korean.", 175 | "npz_path": "v2/ko_speaker_2" 176 | }, 177 | "Female1_Ko": { 178 | "id": "Female1_Ko", 179 | "desc": "A normal female adult voice; Language: Korean.", 180 | "npz_path": "v2/ko_speaker_0" 181 | }, 182 | "Female1_Ru": { 183 | "id": "Female1_Ru", 184 | "desc": "A normal female adult voice; Language: Russian.", 185 | "npz_path": "v2/ru_speaker_5" 186 | }, 187 | "Female2_Ru": { 188 | "id": "Female2_Ru", 189 | "desc": "A normal female adult voice; Language: Russian.", 190 | "npz_path": "v2/ru_speaker_6" 191 | }, 192 | "Male1_Ru": { 193 | "id": "Male1_Ru", 194 | "desc": "A normal male adult voice; Language: Russian.", 195 | "npz_path": "v2/ru_speaker_3" 196 | }, 197 | "Male2_Ru": { 198 | "id": "Male2_Ru", 199 | "desc": "A normal male adult voice; Language: Russian.", 200 | "npz_path": "v2/ru_speaker_4" 201 | }, 202 | "Female1_Es": { 203 | "id": "Female1_Es", 204 | "desc": "A normal female adult voice; Language: Spanish.", 205 | "npz_path": "v2/es_speaker_8" 206 | }, 207 | "Female2_Es": { 208 | "id": "Female2_Es", 209 | "desc": "A normal female adult voice; Language: Spanish.", 210 | "npz_path": "v2/es_speaker_9" 211 | }, 212 | "Male1_Es": { 213 | "id": "Male1_Es", 214 | "desc": "A normal male adult voice; Language: Spanish.", 215 | "npz_path": "v2/es_speaker_6" 216 | }, 217 | "Male2_Es": { 218 | "id": "Male2_Es", 219 | "desc": "A normal male adult voice; Language: Spanish.", 220 | "npz_path": "v2/es_speaker_7" 221 | }, 222 | "Female1_Tr": { 223 | "id": "Female1_Tr", 224 | "desc": "A normal female adult voice; Language: Turkish.", 225 | "npz_path": "v2/tr_speaker_4" 226 | }, 227 | "Female2_Tr": { 228 | "id": "Female2_Tr", 229 
| "desc": "A normal female adult voice; Language: Turkish.", 230 | "npz_path": "v2/tr_speaker_5" 231 | }, 232 | "Male1_Tr": { 233 | "id": "Male1_Tr", 234 | "desc": "A normal male adult voice; Language: Turkish.", 235 | "npz_path": "v2/tr_speaker_2" 236 | }, 237 | "Male2_Tr": { 238 | "id": "Male2_Tr", 239 | "desc": "A normal male adult voice; Language: Turkish.", 240 | "npz_path": "v2/tr_speaker_3" 241 | }, 242 | "Male1_Pt": { 243 | "id": "Male1_Pt", 244 | "desc": "A normal male adult voice; Language: Purtuguese.", 245 | "npz_path": "v2/pt_speaker_0" 246 | }, 247 | "Male2_Pt": { 248 | "id": "Male2_Pt", 249 | "desc": "A normal male adult voice; Language: Purtuguese.", 250 | "npz_path": "v2/pt_speaker_1" 251 | }, 252 | "Female1_Pl": { 253 | "id": "Female1_Pl", 254 | "desc": "A normal female adult voice; Language: Polish.", 255 | "npz_path": "v2/pl_speaker_4" 256 | }, 257 | "Female2_Pl": { 258 | "id": "Female2_Pl", 259 | "desc": "A normal female adult voice; Language: Polish.", 260 | "npz_path": "v2/pl_speaker_6" 261 | }, 262 | "Male1_Pl": { 263 | "id": "Male1_Pl", 264 | "desc": "A normal male adult voice; Language: Polish.", 265 | "npz_path": "v2/pl_speaker_5" 266 | }, 267 | "Male2_Pl": { 268 | "id": "Male2_Pl", 269 | "desc": "A normal male adult voice; Language: Polish.", 270 | "npz_path": "v2/pl_speaker_7" 271 | } 272 | } --------------------------------------------------------------------------------