├── .gitignore
├── LICENSE
├── README.md
├── WavCraft-chat.py
├── WavCraft.py
├── assets
    ├── duck_quacking_in_water.wav
    ├── overview.png
    └── wavcraft_icon.jpg
├── scripts
    ├── check_watermark.py
    ├── continue_service.sh
    ├── kill_services.py
    ├── setup_envs.sh
    └── start_services.sh
├── services
    ├── audiocraft_service.py
    ├── audioldm_service.py
    ├── audiosep_service.py
    ├── audiosr_service.py
    ├── start_audiocraft.sh
    ├── start_audioldm.sh
    ├── start_audiosep.sh
    ├── start_audiosr.sh
    ├── start_wavmark.sh
    └── wavmark_service.py
├── venvs
    ├── audiocraft.yml
    ├── audioldm.yml
    ├── audiosr.yml
    └── wavcraft.yml
└── wavcraft
    ├── __init__.py
    ├── apis.py
    ├── configs.yaml
    ├── ffmpeg_engineer.py
    ├── mistral_api.py
    ├── pipeline.py
    ├── prompts
        ├── text_to_code.prompt
        └── text_to_followup.prompt
    ├── utils.py
    └── voice_preset
        ├── npz
            ├── child_boy.npz
            ├── cnn_male_speaker.npz
            ├── elder_morgen.npz
            ├── news_female_speaker.npz
            ├── news_female_speaker_outside.npz
            └── news_male_speaker.npz
        └── voice_map.json


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | bin/
3 | services_logs/
4 | output/
5 | # ext/*
6 | .empty/
7 | scripts/chatgpt.sh


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Attribution-NonCommercial-ShareAlike 4.0 International
  2 | 
  3 | =======================================================================
  4 | 
  5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
  6 | does not provide legal services or legal advice. Distribution of
  7 | Creative Commons public licenses does not create a lawyer-client or
  8 | other relationship. Creative Commons makes its licenses and related
  9 | information available on an "as-is" basis. Creative Commons gives no
 10 | warranties regarding its licenses, any material licensed under their
 11 | terms and conditions, or any related information. Creative Commons
 12 | disclaims all liability for damages resulting from their use to the
 13 | fullest extent possible.
 14 | 
 15 | Using Creative Commons Public Licenses
 16 | 
 17 | Creative Commons public licenses provide a standard set of terms and
 18 | conditions that creators and other rights holders may use to share
 19 | original works of authorship and other material subject to copyright
 20 | and certain other rights specified in the public license below. The
 21 | following considerations are for informational purposes only, are not
 22 | exhaustive, and do not form part of our licenses.
 23 | 
 24 |      Considerations for licensors: Our public licenses are
 25 |      intended for use by those authorized to give the public
 26 |      permission to use material in ways otherwise restricted by
 27 |      copyright and certain other rights. Our licenses are
 28 |      irrevocable. Licensors should read and understand the terms
 29 |      and conditions of the license they choose before applying it.
 30 |      Licensors should also secure all rights necessary before
 31 |      applying our licenses so that the public can reuse the
 32 |      material as expected. Licensors should clearly mark any
 33 |      material not subject to the license. This includes other CC-
 34 |      licensed material, or material used under an exception or
 35 |      limitation to copyright. More considerations for licensors:
 36 |     wiki.creativecommons.org/Considerations_for_licensors
 37 | 
 38 |      Considerations for the public: By using one of our public
 39 |      licenses, a licensor grants the public permission to use the
 40 |      licensed material under specified terms and conditions. If
 41 |      the licensor's permission is not necessary for any reason--for
 42 |      example, because of any applicable exception or limitation to
 43 |      copyright--then that use is not regulated by the license. Our
 44 |      licenses grant only permissions under copyright and certain
 45 |      other rights that a licensor has authority to grant. Use of
 46 |      the licensed material may still be restricted for other
 47 |      reasons, including because others have copyright or other
 48 |      rights in the material. A licensor may make special requests,
 49 |      such as asking that all changes be marked or described.
 50 |      Although not required by our licenses, you are encouraged to
 51 |      respect those requests where reasonable. More considerations
 52 |      for the public:
 53 |     wiki.creativecommons.org/Considerations_for_licensees
 54 | 
 55 | =======================================================================
 56 | 
 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
 58 | Public License
 59 | 
 60 | By exercising the Licensed Rights (defined below), You accept and agree
 61 | to be bound by the terms and conditions of this Creative Commons
 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
 63 | ("Public License"). To the extent this Public License may be
 64 | interpreted as a contract, You are granted the Licensed Rights in
 65 | consideration of Your acceptance of these terms and conditions, and the
 66 | Licensor grants You such rights in consideration of benefits the
 67 | Licensor receives from making the Licensed Material available under
 68 | these terms and conditions.
 69 | 
 70 | 
 71 | Section 1 -- Definitions.
 72 | 
 73 |   a. Adapted Material means material subject to Copyright and Similar
 74 |      Rights that is derived from or based upon the Licensed Material
 75 |      and in which the Licensed Material is translated, altered,
 76 |      arranged, transformed, or otherwise modified in a manner requiring
 77 |      permission under the Copyright and Similar Rights held by the
 78 |      Licensor. For purposes of this Public License, where the Licensed
 79 |      Material is a musical work, performance, or sound recording,
 80 |      Adapted Material is always produced where the Licensed Material is
 81 |      synched in timed relation with a moving image.
 82 | 
 83 |   b. Adapter's License means the license You apply to Your Copyright
 84 |      and Similar Rights in Your contributions to Adapted Material in
 85 |      accordance with the terms and conditions of this Public License.
 86 | 
 87 |   c. BY-NC-SA Compatible License means a license listed at
 88 |      creativecommons.org/compatiblelicenses, approved by Creative
 89 |      Commons as essentially the equivalent of this Public License.
 90 | 
 91 |   d. Copyright and Similar Rights means copyright and/or similar rights
 92 |      closely related to copyright including, without limitation,
 93 |      performance, broadcast, sound recording, and Sui Generis Database
 94 |      Rights, without regard to how the rights are labeled or
 95 |      categorized. For purposes of this Public License, the rights
 96 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
 97 |      Rights.
 98 | 
 99 |   e. Effective Technological Measures means those measures that, in the
100 |      absence of proper authority, may not be circumvented under laws
101 |      fulfilling obligations under Article 11 of the WIPO Copyright
102 |      Treaty adopted on December 20, 1996, and/or similar international
103 |      agreements.
104 | 
105 |   f. Exceptions and Limitations means fair use, fair dealing, and/or
106 |      any other exception or limitation to Copyright and Similar Rights
107 |      that applies to Your use of the Licensed Material.
108 | 
109 |   g. License Elements means the license attributes listed in the name
110 |      of a Creative Commons Public License. The License Elements of this
111 |      Public License are Attribution, NonCommercial, and ShareAlike.
112 | 
113 |   h. Licensed Material means the artistic or literary work, database,
114 |      or other material to which the Licensor applied this Public
115 |      License.
116 | 
117 |   i. Licensed Rights means the rights granted to You subject to the
118 |      terms and conditions of this Public License, which are limited to
119 |      all Copyright and Similar Rights that apply to Your use of the
120 |      Licensed Material and that the Licensor has authority to license.
121 | 
122 |   j. Licensor means the individual(s) or entity(ies) granting rights
123 |      under this Public License.
124 | 
125 |   k. NonCommercial means not primarily intended for or directed towards
126 |      commercial advantage or monetary compensation. For purposes of
127 |      this Public License, the exchange of the Licensed Material for
128 |      other material subject to Copyright and Similar Rights by digital
129 |      file-sharing or similar means is NonCommercial provided there is
130 |      no payment of monetary compensation in connection with the
131 |      exchange.
132 | 
133 |   l. Share means to provide material to the public by any means or
134 |      process that requires permission under the Licensed Rights, such
135 |      as reproduction, public display, public performance, distribution,
136 |      dissemination, communication, or importation, and to make material
137 |      available to the public including in ways that members of the
138 |      public may access the material from a place and at a time
139 |      individually chosen by them.
140 | 
141 |   m. Sui Generis Database Rights means rights other than copyright
142 |      resulting from Directive 96/9/EC of the European Parliament and of
143 |      the Council of 11 March 1996 on the legal protection of databases,
144 |      as amended and/or succeeded, as well as other essentially
145 |      equivalent rights anywhere in the world.
146 | 
147 |   n. You means the individual or entity exercising the Licensed Rights
148 |      under this Public License. Your has a corresponding meaning.
149 | 
150 | 
151 | Section 2 -- Scope.
152 | 
153 |   a. License grant.
154 | 
155 |        1. Subject to the terms and conditions of this Public License,
156 |           the Licensor hereby grants You a worldwide, royalty-free,
157 |           non-sublicensable, non-exclusive, irrevocable license to
158 |           exercise the Licensed Rights in the Licensed Material to:
159 | 
160 |             a. reproduce and Share the Licensed Material, in whole or
161 |                in part, for NonCommercial purposes only; and
162 | 
163 |             b. produce, reproduce, and Share Adapted Material for
164 |                NonCommercial purposes only.
165 | 
166 |        2. Exceptions and Limitations. For the avoidance of doubt, where
167 |           Exceptions and Limitations apply to Your use, this Public
168 |           License does not apply, and You do not need to comply with
169 |           its terms and conditions.
170 | 
171 |        3. Term. The term of this Public License is specified in Section
172 |           6(a).
173 | 
174 |        4. Media and formats; technical modifications allowed. The
175 |           Licensor authorizes You to exercise the Licensed Rights in
176 |           all media and formats whether now known or hereafter created,
177 |           and to make technical modifications necessary to do so. The
178 |           Licensor waives and/or agrees not to assert any right or
179 |           authority to forbid You from making technical modifications
180 |           necessary to exercise the Licensed Rights, including
181 |           technical modifications necessary to circumvent Effective
182 |           Technological Measures. For purposes of this Public License,
183 |           simply making modifications authorized by this Section 2(a)
184 |           (4) never produces Adapted Material.
185 | 
186 |        5. Downstream recipients.
187 | 
188 |             a. Offer from the Licensor -- Licensed Material. Every
189 |                recipient of the Licensed Material automatically
190 |                receives an offer from the Licensor to exercise the
191 |                Licensed Rights under the terms and conditions of this
192 |                Public License.
193 | 
194 |             b. Additional offer from the Licensor -- Adapted Material.
195 |                Every recipient of Adapted Material from You
196 |                automatically receives an offer from the Licensor to
197 |                exercise the Licensed Rights in the Adapted Material
198 |                under the conditions of the Adapter's License You apply.
199 | 
200 |             c. No downstream restrictions. You may not offer or impose
201 |                any additional or different terms or conditions on, or
202 |                apply any Effective Technological Measures to, the
203 |                Licensed Material if doing so restricts exercise of the
204 |                Licensed Rights by any recipient of the Licensed
205 |                Material.
206 | 
207 |        6. No endorsement. Nothing in this Public License constitutes or
208 |           may be construed as permission to assert or imply that You
209 |           are, or that Your use of the Licensed Material is, connected
210 |           with, or sponsored, endorsed, or granted official status by,
211 |           the Licensor or others designated to receive attribution as
212 |           provided in Section 3(a)(1)(A)(i).
213 | 
214 |   b. Other rights.
215 | 
216 |        1. Moral rights, such as the right of integrity, are not
217 |           licensed under this Public License, nor are publicity,
218 |           privacy, and/or other similar personality rights; however, to
219 |           the extent possible, the Licensor waives and/or agrees not to
220 |           assert any such rights held by the Licensor to the limited
221 |           extent necessary to allow You to exercise the Licensed
222 |           Rights, but not otherwise.
223 | 
224 |        2. Patent and trademark rights are not licensed under this
225 |           Public License.
226 | 
227 |        3. To the extent possible, the Licensor waives any right to
228 |           collect royalties from You for the exercise of the Licensed
229 |           Rights, whether directly or through a collecting society
230 |           under any voluntary or waivable statutory or compulsory
231 |           licensing scheme. In all other cases the Licensor expressly
232 |           reserves any right to collect such royalties, including when
233 |           the Licensed Material is used other than for NonCommercial
234 |           purposes.
235 | 
236 | 
237 | Section 3 -- License Conditions.
238 | 
239 | Your exercise of the Licensed Rights is expressly made subject to the
240 | following conditions.
241 | 
242 |   a. Attribution.
243 | 
244 |        1. If You Share the Licensed Material (including in modified
245 |           form), You must:
246 | 
247 |             a. retain the following if it is supplied by the Licensor
248 |                with the Licensed Material:
249 | 
250 |                  i. identification of the creator(s) of the Licensed
251 |                     Material and any others designated to receive
252 |                     attribution, in any reasonable manner requested by
253 |                     the Licensor (including by pseudonym if
254 |                     designated);
255 | 
256 |                 ii. a copyright notice;
257 | 
258 |                iii. a notice that refers to this Public License;
259 | 
260 |                 iv. a notice that refers to the disclaimer of
261 |                     warranties;
262 | 
263 |                  v. a URI or hyperlink to the Licensed Material to the
264 |                     extent reasonably practicable;
265 | 
266 |             b. indicate if You modified the Licensed Material and
267 |                retain an indication of any previous modifications; and
268 | 
269 |             c. indicate the Licensed Material is licensed under this
270 |                Public License, and include the text of, or the URI or
271 |                hyperlink to, this Public License.
272 | 
273 |        2. You may satisfy the conditions in Section 3(a)(1) in any
274 |           reasonable manner based on the medium, means, and context in
275 |           which You Share the Licensed Material. For example, it may be
276 |           reasonable to satisfy the conditions by providing a URI or
277 |           hyperlink to a resource that includes the required
278 |           information.
279 |        3. If requested by the Licensor, You must remove any of the
280 |           information required by Section 3(a)(1)(A) to the extent
281 |           reasonably practicable.
282 | 
283 |   b. ShareAlike.
284 | 
285 |      In addition to the conditions in Section 3(a), if You Share
286 |      Adapted Material You produce, the following conditions also apply.
287 | 
288 |        1. The Adapter's License You apply must be a Creative Commons
289 |           license with the same License Elements, this version or
290 |           later, or a BY-NC-SA Compatible License.
291 | 
292 |        2. You must include the text of, or the URI or hyperlink to, the
293 |           Adapter's License You apply. You may satisfy this condition
294 |           in any reasonable manner based on the medium, means, and
295 |           context in which You Share Adapted Material.
296 | 
297 |        3. You may not offer or impose any additional or different terms
298 |           or conditions on, or apply any Effective Technological
299 |           Measures to, Adapted Material that restrict exercise of the
300 |           rights granted under the Adapter's License You apply.
301 | 
302 | 
303 | Section 4 -- Sui Generis Database Rights.
304 | 
305 | Where the Licensed Rights include Sui Generis Database Rights that
306 | apply to Your use of the Licensed Material:
307 | 
308 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309 |      to extract, reuse, reproduce, and Share all or a substantial
310 |      portion of the contents of the database for NonCommercial purposes
311 |      only;
312 | 
313 |   b. if You include all or a substantial portion of the database
314 |      contents in a database in which You have Sui Generis Database
315 |      Rights, then the database in which You have Sui Generis Database
316 |      Rights (but not its individual contents) is Adapted Material,
317 |      including for purposes of Section 3(b); and
318 | 
319 |   c. You must comply with the conditions in Section 3(a) if You Share
320 |      all or a substantial portion of the contents of the database.
321 | 
322 | For the avoidance of doubt, this Section 4 supplements and does not
323 | replace Your obligations under this Public License where the Licensed
324 | Rights include other Copyright and Similar Rights.
325 | 
326 | 
327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328 | 
329 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339 | 
340 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349 | 
350 |   c. The disclaimer of warranties and limitation of liability provided
351 |      above shall be interpreted in a manner that, to the extent
352 |      possible, most closely approximates an absolute disclaimer and
353 |      waiver of all liability.
354 | 
355 | 
356 | Section 6 -- Term and Termination.
357 | 
358 |   a. This Public License applies for the term of the Copyright and
359 |      Similar Rights licensed here. However, if You fail to comply with
360 |      this Public License, then Your rights under this Public License
361 |      terminate automatically.
362 | 
363 |   b. Where Your right to use the Licensed Material has terminated under
364 |      Section 6(a), it reinstates:
365 | 
366 |        1. automatically as of the date the violation is cured, provided
367 |           it is cured within 30 days of Your discovery of the
368 |           violation; or
369 | 
370 |        2. upon express reinstatement by the Licensor.
371 | 
372 |      For the avoidance of doubt, this Section 6(b) does not affect any
373 |      right the Licensor may have to seek remedies for Your violations
374 |      of this Public License.
375 | 
376 |   c. For the avoidance of doubt, the Licensor may also offer the
377 |      Licensed Material under separate terms or conditions or stop
378 |      distributing the Licensed Material at any time; however, doing so
379 |      will not terminate this Public License.
380 | 
381 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382 |      License.
383 | 
384 | 
385 | Section 7 -- Other Terms and Conditions.
386 | 
387 |   a. The Licensor shall not be bound by any additional or different
388 |      terms or conditions communicated by You unless expressly agreed.
389 | 
390 |   b. Any arrangements, understandings, or agreements regarding the
391 |      Licensed Material not stated herein are separate from and
392 |      independent of the terms and conditions of this Public License.
393 | 
394 | 
395 | Section 8 -- Interpretation.
396 | 
397 |   a. For the avoidance of doubt, this Public License does not, and
398 |      shall not be interpreted to, reduce, limit, restrict, or impose
399 |      conditions on any use of the Licensed Material that could lawfully
400 |      be made without permission under this Public License.
401 | 
402 |   b. To the extent possible, if any provision of this Public License is
403 |      deemed unenforceable, it shall be automatically reformed to the
404 |      minimum extent necessary to make it enforceable. If the provision
405 |      cannot be reformed, it shall be severed from this Public License
406 |      without affecting the enforceability of the remaining terms and
407 |      conditions.
408 | 
409 |   c. No term or condition of this Public License will be waived and no
410 |      failure to comply consented to unless expressly agreed to by the
411 |      Licensor.
412 | 
413 |   d. Nothing in this Public License constitutes or may be interpreted
414 |      as a limitation upon, or waiver of, any privileges and immunities
415 |      that apply to the Licensor or You, including from the legal
416 |      processes of any jurisdiction or authority.
417 | 
418 | =======================================================================
419 | 
420 | Creative Commons is not a party to its public
421 | licenses. Notwithstanding, Creative Commons may elect to apply one of
422 | its public licenses to material it publishes and in those instances
423 | will be considered the “Licensor.” The text of the Creative Commons
424 | public licenses is dedicated to the public domain under the CC0 Public
425 | Domain Dedication. Except for the limited purpose of indicating that
426 | material is shared under a Creative Commons public license or as
427 | otherwise permitted by the Creative Commons policies published at
428 | creativecommons.org/policies, Creative Commons does not authorize the
429 | use of the trademark "Creative Commons" or any other trademark or logo
430 | of Creative Commons without its prior written consent including,
431 | without limitation, in connection with any unauthorized modifications
432 | to any of its public licenses or any other arrangements,
433 | understandings, or agreements concerning use of licensed material. For
434 | the avoidance of doubt, this paragraph does not form part of the
435 | public licenses.
436 | 
437 | Creative Commons may be contacted at creativecommons.org.
438 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 🌊WavCraft
  2 | 
  3 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2403.09527)    [![demo](https://img.shields.io/badge/Notion-Demo_Page-blue)](https://first-riddle-7e9.notion.site/WavCraft-Demo-40c079fc82ca411ca0520b9d65abd3f5)
  4 | 
  5 | Generate and edit the audio with a simple sentence.
  6 | 
  7 | This repo currently support:
  8 | 
  9 | * text-guided audio editing: edit the content of given audio clip(s) conditioned on text input
 10 | * text-guided audio generation: create an audio clip given text input
 11 | * audio scriptwriting: get more inspiration from WavCraft by prompting a script setting and let the model do the scriptwriting and create the sound for you.
 12 | * check if your audio file is synthesized by WavCraft
 13 | 
 14 | ## Change log
 15 | 
 16 | **2024-05-06**: Support openLLMs (MistralAI family) for WavCraft.
 17 | 
 18 | **2024-03-20**: Add watermarking to the system
 19 | 
 20 | ## Content
 21 | 
 22 | - [Usage](##usage)
 23 |   - [Installation](###installation)
 24 |   - [Audio edition using a single line](###audio-edition-using-a-single-line)
 25 |   - [Audio edition via interaction](###audio-edition-via-interaction)
 26 |   - [Check if an audio file is generated/modified by WavCraft](###check_if_an_audio_file_is_generated/modified_by_wavcraft)
 27 | - [Approach](##approach)
 28 | - [Acknowledgments](##acknowledgments)
 29 | - [Citing](##citing)
 30 | 
 31 | ## Usage
 32 | 
 33 | ### Installation
 34 | 
 35 | ```
 36 | source scripts/setup_envs.sh
 37 | ```
 38 | 
 39 | ## Configure environment
 40 | 
 41 | ```bash
 42 | export OPENAI_KEY=YOUR_OPENAI_KEY
 43 | export HF_KEY=YOUR_HF_KEY
 44 | ```
 45 | 
 46 | ## Launch deep learning models on local
 47 | 
 48 | ```bash
 49 | source scripts/start_services.sh
 50 | ```
 51 | 
 52 | ## Play with WavCraft
 53 | 
 54 | ### Audio edition using a single line
 55 | 
 56 | ```
 57 | python3 WavCraft.py basic -f \
 58 | --input-wav assets/duck_quacking_in_water.wav \
 59 | --input-text "Add dog barking."
 60 | ```
 61 | 
 62 | ### Audio edition via interaction
 63 | 
 64 | ```
 65 | python3 WavCraft-chat.py basic -f -c
 66 | [New session is create]
 67 | Add audio files(s) (each file starts with '+'): +assets/duck_quacking_in_water.wav
 68 | Enter your instruction (input `EXIT` to exit the process): "Add dog barking"
 69 | 
 70 | ```
 71 | 
 72 | ### Check if an audio file is generated/modified by WavCraft
 73 | 
 74 | ```
 75 | python3 check_watermark.py --wav-path /path/to/audio/file
 76 | ```
 77 | 
 78 | ### Use openLLMs for generation/editing
 79 | ```
 80 | python3 WavCraft.py basic -f \
 81 | --input-wav assets/duck_quacking_in_water.wav \
 82 | --input-text "Add dog barking." \
 83 | --model 'mistralai/Mistral-7B-Instruct-v0.2'
 84 | ```
 85 | 
 86 | ## Approach
 87 | 
 88 | WavCraft is an LLM-driven agent for audio content creation and editing. It applies LLM to connect various audio expert models and DSP function together. An overview of WavCraft architecture can be found bellow:
 89 | 
 90 | ![overview](assets/overview.png)
 91 | 
 92 | ## Disclaimer
 93 | 
 94 | This repository is for **research purpose only**. We are not responsible for audio generated/edited using semantics created by this model. Also, everyone use WavCraft must NOT disable the watermarking techniques in anyway.
 95 | 
 96 | ## Acknowledgments
 97 | 
 98 | We appreciate [WavJourney](https://github.com/Audio-AGI/WavJourney), [AudioCraft](https://github.com/facebookresearch/audiocraft), [AudioSep](https://github.com/Audio-AGI/AudioSep), [AudioSR](https://github.com/haoheliu/versatile_audio_super_resolution), [AudioLDM](https://github.com/haoheliu/AudioLDM), [WavMark](https://github.com/wavmark/wavmark) for their amazing code work.
 99 | 
100 | ## Citing
101 | 
102 | If you found our work is helpful, please cite our work:
103 | ```
104 | @misc{liang2024wavcraft,
105 |       title={WavCraft: Audio Editing and Generation with Large Language Models}, 
106 |       author={Jinhua Liang and Huan Zhang and Haohe Liu and Yin Cao and Qiuqiang Kong and Xubo Liu and Wenwu Wang and Mark D. Plumbley and Huy Phan and Emmanouil Benetos},
107 |       year={2024},
108 |       eprint={2403.09527},
109 |       archivePrefix={arXiv},
110 |       primaryClass={eess.AS}
111 | }
112 | ```
113 | 


--------------------------------------------------------------------------------
/WavCraft-chat.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import argparse
 3 | 
 4 | import wavcraft.utils as utils
 5 | import wavcraft.pipeline as pipeline
 6 | 
 7 | parser = argparse.ArgumentParser()
 8 | sub_parsers = parser.add_subparsers(dest="mode")
 9 | # Basic mode
10 | basic_parser = sub_parsers.add_parser("basic")
11 | basic_parser.add_argument('-f', '--full', action='store_true', help='Go through the full proces')
12 | basic_parser.add_argument('-c', '--chat', action='store_true', help='Chat with WavCraft.')
13 | basic_parser.add_argument('--session-id', type=str, default='', help='session id, if set to empty, system will allocate an id')
14 | # Inspiration mode
15 | inspire_parser = sub_parsers.add_parser("inspiration")
16 | inspire_parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
17 | inspire_parser.add_argument('-c', '--chat', action='store_true', help='Chat with WavCraft')
18 | inspire_parser.add_argument('--session-id', type=str, default='', help='session id, if set to empty, system will allocate an id')
19 | 
20 | args = parser.parse_args()
21 | 
22 | if args.mode in ("basic", "inspiration"):
23 |     session_id = pipeline.init_session(args.session_id)
24 |     print(f"Session {session_id} is created.")
25 | 
26 |     api_key = utils.get_api_key()
27 |     assert api_key != None, "Please set your openai_key in the environment variable."
28 |     
29 |     input_wav = []
30 | 
31 |     while True:
32 |         this_turn_wav = input("Add audio files(s) (each file starts with '+'): ")
33 |         input_text = input("Enter your instruction (input `EXIT` to exit the process): ")
34 | 
35 |         if input_text == "EXIT":
36 |             print("WavCraft is completed.")
37 |             break
38 |             
39 |         if args.full:
40 |             this_turn_wav = this_turn_wav.split('+')
41 |             this_turn_wav = [wav.strip().strip("'").strip("\"") for wav in this_turn_wav if len(wav) > 0]
42 |             input_wav.extend(this_turn_wav)
43 | 
44 |             pipeline.full_steps(session_id, input_wav, input_text, api_key, mode=args.mode)


--------------------------------------------------------------------------------
/WavCraft.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import argparse
 3 | 
 4 | import wavcraft.utils as utils
 5 | import wavcraft.pipeline as pipeline
 6 | 
 7 | parser = argparse.ArgumentParser()
 8 | sub_parsers = parser.add_subparsers(dest="mode", help='Type of WavCraft to use')
 9 | # Basic mode
10 | basic_parser = sub_parsers.add_parser("basic")
11 | basic_parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
12 | basic_parser.add_argument('--input-wav', nargs='+', default=[], help='a list of input wave paths')
13 | basic_parser.add_argument('--input-text', type=str, help='input text or text file')
14 | # gpt-4-0125-preview
15 | basic_parser.add_argument('--model', type=str, default="gpt-4", help='ChatGPT model.')
16 | basic_parser.add_argument('--session-id', type=str, default='', help='session id, if set to empty, system will allocate an id')
17 | # Inspiration mode
18 | inspire_parser = sub_parsers.add_parser("inspiration")
19 | inspire_parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
20 | inspire_parser.add_argument('--input-wav', nargs='+', default=[], help='a list of input wave paths')
21 | inspire_parser.add_argument('--input-text', type=str, help='input text or text file')
22 | inspire_parser.add_argument('--model', type=str, default="gpt-4", help='ChatGPT model.')
23 | inspire_parser.add_argument('--session-id', type=str, default='', help='session id, if set to empty, system will allocate an id')
24 | 
25 | args = parser.parse_args()
26 | 
27 | if args.mode in ("basic", "inspiration"):
28 |     if args.full:
29 |         input_text = args.input_text
30 |         input_wav = args.input_wav
31 | 
32 |         start_time = time.time()
33 |         session_id = pipeline.init_session(args.session_id)
34 |         api_key = utils.get_api_key()
35 | 
36 |         assert api_key != None, "Please set your openai_key in the environment variable."
37 |         
38 |         print(f"Session {session_id} is created.")
39 | 
40 |         pipeline.full_steps(session_id, input_wav, input_text, api_key, model=args.model, mode=args.mode)
41 |         end_time = time.time()
42 | 
43 |         print(f"Audio editor took {end_time - start_time:.2f} seconds to complete.")
44 | 


--------------------------------------------------------------------------------
/assets/duck_quacking_in_water.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/assets/duck_quacking_in_water.wav


--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/assets/overview.png


--------------------------------------------------------------------------------
/assets/wavcraft_icon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/assets/wavcraft_icon.jpg


--------------------------------------------------------------------------------
/scripts/check_watermark.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from wavcraft.apis import _DECODE_WATERMARK
3 | 
4 | 
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--wav-path", type=str, help="Path to the audio file.")
7 | args = parser.parse_args()
8 | 
9 | _DECODE_WATERMARK(args.wav_path, sample_rate=16000)


--------------------------------------------------------------------------------
/scripts/continue_service.sh:
--------------------------------------------------------------------------------
 1 | conda activate AudioEditor
 2 | source ./scripts/chatgpt.sh
 3 | 
 4 | mkdir -p services_logs
 5 | 
 6 | export SERVICE_PORT=8088
 7 | export SERVICE_URL=127.0.0.1
 8 | export MAX_SCRIPT_LINES=999
 9 | 
10 | export AUDIOCRAFT_SERVICE_PORT=$((${SERVICE_PORT}+1))
11 | export AUDIOSEP_SERVICE_PORT=$((${SERVICE_PORT}+2))
12 | export AUDIOSR_SERVICE_PORT=$((${SERVICE_PORT}+3))
13 | export AUDIOLDM_SERVICE_PORT=$((${SERVICE_PORT}+4))
14 | export WAVMARK_SERVICE_PORT=$((${SERVICE_PORT}+5))
15 | 


--------------------------------------------------------------------------------
/scripts/kill_services.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | # Extract values for each application
 5 | audiocraft_service_port = int(os.environ.get('AUDIOCRAFT_SERVICE_PORT'))
 6 | audiosep_service_port = int(os.environ.get('AUDIOSEP_SERVICE_PORT'))
 7 | audiosr_service_port = int(os.environ.get('AUDIOSR_SERVICE_PORT'))
 8 | audioldm_service_port = int(os.environ.get('AUDIOLDM_SERVICE_PORT'))
 9 | wavmark_service_port = int(os.environ.get('WAVMARK_SERVICE_PORT'))
10 | 
11 | # Execute the commands 
12 | os.system(f'kill $(lsof -t -i :{audiocraft_service_port})')
13 | os.system(f'kill $(lsof -t -i :{audiosep_service_port})')
14 | os.system(f'kill $(lsof -t -i :{audiosr_service_port})')
15 | os.system(f'kill $(lsof -t -i :{audioldm_service_port})')
16 | os.system(f'kill $(lsof -t -i :{wavmark_service_port})')


--------------------------------------------------------------------------------
/scripts/setup_envs.sh:
--------------------------------------------------------------------------------
 1 | conda env create -f venvs/audiocraft.yml
 2 | conda env create -f venvs/audioldm.yml
 3 | conda env create -f venvs/audiosr.yml
 4 | conda env create -f venvs/wavcraft.yml
 5 | # Prepare third-party repos
 6 | # Comment some of them if they are unnecessary
 7 | mkdir ext/
 8 | cd ext/
 9 | 
10 | git clone https://github.com/haoheliu/AudioLDM.git
11 | 
12 | git clone https://github.com/Audio-AGI/AudioSep.git
13 | 
14 | wget https://uplex.de/audiowmark/releases/audiowmark-0.6.1.tar.gz
15 | tar -xzvf audiowmark-0.6.1.tar.gz
16 | cd audiowmark-0.6.1
17 | ./configure
18 | make
19 | make install


--------------------------------------------------------------------------------
/scripts/start_services.sh:
--------------------------------------------------------------------------------
 1 | mkdir -p services_logs
 2 | 
 3 | export SERVICE_PORT=8088
 4 | export SERVICE_URL=127.0.0.1
 5 | export MAX_SCRIPT_LINES=999
 6 | 
 7 | # Start AudioCraft service
 8 | source services/start_audiocraft.sh
 9 | # Start AudioSep service
10 | source services/start_audiosep.sh
11 | # Start AudioSR service
12 | source services/start_audiosr.sh
13 | # Start AudioLDM service
14 | source services/start_audioldm.sh
15 | # Start WavMark service
16 | source services/start_wavmark.sh
17 | # WavCraft
18 | conda activate WavCraft


--------------------------------------------------------------------------------
/services/audiocraft_service.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import yaml
  4 | import logging
  5 | import torch
  6 | import nltk
  7 | import torchaudio
  8 | import torchaudio.transforms as T
  9 | from torchaudio.transforms import SpeedPerturbation
 10 | 
 11 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 12 | from wavcraft.apis import _WRITE_AUDIO, _LOUDNESS_NORM
 13 | from wavcraft.utils import fade, get_service_port
 14 | from flask import Flask, request, jsonify
 15 | 
 16 | 
 17 | with open('wavcraft/configs.yaml', 'r') as file:
 18 |     config = yaml.safe_load(file)
 19 | 
 20 | # Configure the logging format and level
 21 | logging.basicConfig(
 22 |     level=logging.INFO,
 23 |     format='%(asctime)s - %(levelname)s - %(message)s'
 24 | )
 25 | 
 26 | # Create a FileHandler for the log file
 27 | os.makedirs('services_logs', exist_ok=True)
 28 | log_filename = 'services_logs/Wav-API.log'
 29 | file_handler = logging.FileHandler(log_filename, mode='w')
 30 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
 31 | 
 32 | # Add the FileHandler to the root logger
 33 | logging.getLogger('').addHandler(file_handler)
 34 | 
 35 | 
 36 | """
 37 | Initialize the AudioCraft models here
 38 | """
 39 | from audiocraft.models import AudioGen, MusicGen
 40 | tta_model_size = config['AudioCraft']['tta_model_size']
 41 | tta_model = AudioGen.get_pretrained(f'facebook/audiogen-{tta_model_size}')
 42 | logging.info(f'AudioGen ({tta_model_size}) is loaded ...')
 43 | 
 44 | ttm_model_size = config['AudioCraft']['ttm_model_size']
 45 | ttm_model = MusicGen.get_pretrained(f'facebook/musicgen-{ttm_model_size}')
 46 | logging.info(f'MusicGen ({ttm_model_size}) is loaded ...')
 47 | 
 48 | 
 49 | """
 50 | Initialize the BarkModel here
 51 | """
 52 | from transformers import BarkModel, AutoProcessor
 53 | import json
 54 | 
 55 | # Load voice map
 56 | with open("wavcraft/voice_preset/voice_map.json", 'r') as f:
 57 |     voice_map = json.load(f)
 58 | 
 59 | SPEED = float(config['Text-to-Speech']['speed'])
 60 | speed_perturb = SpeedPerturbation(32000, [SPEED])
 61 | tts_model = BarkModel.from_pretrained("suno/bark")
 62 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
 63 | tts_model = tts_model.to(device)
 64 | tts_model = tts_model.to_bettertransformer()    # Flash attention
 65 | SAMPLE_RATE = tts_model.generation_config.sample_rate
 66 | SEMANTIC_TEMPERATURE = 0.9
 67 | COARSE_TEMPERATURE = 0.5
 68 | FINE_TEMPERATURE = 0.5
 69 | processor = AutoProcessor.from_pretrained("suno/bark")
 70 | logging.info('Bark model is loaded ...')
 71 | 
 72 | 
 73 | app = Flask(__name__)
 74 | 
 75 | 
 76 | @app.route('/generate_audio', methods=['POST'])
 77 | def generate_audio():
 78 |     # Receive the text from the POST request
 79 |     data = request.json
 80 |     text = data['text']
 81 |     length = float(data.get('length', 5.0))
 82 |     volume = float(data.get('volume', -35))
 83 |     output_wav = data.get('output_wav', 'out.wav')
 84 | 
 85 |     logging.info(f'TTA (AudioGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
 86 |     
 87 |     try:
 88 |         tta_model.set_generation_params(duration=length)
 89 |         wav = tta_model.generate([text])  
 90 |         wav = torchaudio.functional.resample(wav, orig_freq=16000, new_freq=32000)
 91 | 
 92 |         wav = wav.squeeze().cpu().detach().numpy()
 93 |         wav = fade(_LOUDNESS_NORM(wav, volume=volume))
 94 |         _WRITE_AUDIO(wav, name=output_wav)
 95 | 
 96 |         # Return success message and the filename of the generated audio
 97 |         return jsonify({'message': f'Text-to-Audio generated successfully | {text}', 'file': output_wav})
 98 | 
 99 |     except Exception as e:
100 |         return jsonify({'API error': str(e)}), 500
101 | 
102 | 
103 | @app.route('/generate_music', methods=['POST'])
104 | def generate_music():
105 |     # Receive the text from the POST request
106 |     data = request.json
107 |     text = data['text']
108 |     melody_path = data.get('melody', None)
109 |     length = float(data.get('length', 5.0))
110 |     volume = float(data.get('volume', -35))
111 |     sample_rate = int(data.get('sr', 32000))
112 |     output_wav = data.get('output_wav', 'out.wav')
113 | 
114 |     logging.info(f'TTM (MusicGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
115 | 
116 | 
117 |     try:
118 |         ttm_model.set_generation_params(duration=length)
119 | 
120 |         if melody_path is None:
121 |             print("Use generate")
122 |             wav = ttm_model.generate([text])
123 | 
124 |         else:
125 |             print("Use generate_with_chroma")
126 |             melody, sr = torchaudio.load(melody_path)
127 |             # Resample the audio if sr does not match sample_rate
128 |             if sr != sample_rate:
129 |                 resampler = T.Resample(sr, sample_rate, dtype=melody.dtype)
130 |                 melody = resampler(melody)
131 |             # Generates using the melody from the given audio and the provided descriptions.
132 |             wav = ttm_model.generate_with_chroma([text], melody[None].expand(1, -1, -1), sample_rate)
133 |             
134 |         wav = wav[0][0].cpu().detach().numpy()
135 |         wav = fade(_LOUDNESS_NORM(wav, volume=volume))
136 |         _WRITE_AUDIO(wav, name=output_wav)
137 | 
138 |         # Return success message and the filename of the generated audio
139 |         return jsonify({'message': f'Text-to-Music generated successfully | {text}', 'file': output_wav})
140 | 
141 |     except Exception as e:
142 |         # Return error message if something goes wrong
143 |         return jsonify({'API error': str(e)}), 500
144 | 
145 | 
146 | @app.route('/generate_speech', methods=['POST'])
147 | def generate_speech():
148 |     # Receive the text from the POST request
149 |     data = request.json
150 |     text = data['text']
151 |     speaker_id = data['speaker_id']
152 |     volume = float(data.get('volume', -35))
153 |     output_wav = data.get('output_wav', 'out.wav')
154 | 
155 |     speaker_npz = voice_map[speaker_id]["npz_path"]
156 |     
157 |     logging.info(f'TTS (Bark): Speaker: {speaker_id}, Volume: {volume} dB, Prompt: {text}')
158 | 
159 |     try:   
160 |         # Generate audio using the global pipe object
161 |         text = text.replace('\n', ' ').strip()
162 |         sentences = nltk.sent_tokenize(text)
163 |         silence = torch.zeros(int(0.1 * SAMPLE_RATE), device=device).unsqueeze(0)  # 0.1 second of silence
164 | 
165 |         pieces = []
166 |         for sentence in sentences:
167 |             inputs = processor(sentence, voice_preset=speaker_npz).to(device)
168 |             # NOTE: you must run the line below, otherwise you will see the runtime error
169 |             # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
170 |             inputs['history_prompt']['coarse_prompt'] = inputs['history_prompt']['coarse_prompt'].transpose(0, 1).contiguous().transpose(0, 1)
171 | 
172 |             with torch.inference_mode():
173 |                 # TODO: min_eos_p?
174 |                 output = tts_model.generate(
175 |                     **inputs,
176 |                     do_sample = True,
177 |                     semantic_temperature = SEMANTIC_TEMPERATURE,
178 |                     coarse_temperature = COARSE_TEMPERATURE,
179 |                     fine_temperature = FINE_TEMPERATURE
180 |                 )
181 | 
182 |             pieces += [output, silence]
183 | 
184 |         result_audio = torch.cat(pieces, dim=1)
185 |         wav_tensor = result_audio.to(dtype=torch.float32).cpu()
186 |         wav = torchaudio.functional.resample(wav_tensor, orig_freq=SAMPLE_RATE, new_freq=32000)
187 |         wav = speed_perturb(wav.float())[0].squeeze(0)
188 |         wav = wav.numpy()
189 |         wav = _LOUDNESS_NORM(wav, volume=volume)
190 |         _WRITE_AUDIO(wav, name=output_wav)
191 | 
192 |         # Return success message and the filename of the generated audio
193 |         return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})
194 | 
195 |     except Exception as e:
196 |         # Return error message if something goes wrong
197 |         return jsonify({'API error': str(e)}), 500
198 |     
199 | 
200 | if __name__ == '__main__':
201 |     service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT")
202 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
203 |     app.run(debug=False, threaded=False, port=service_port)


--------------------------------------------------------------------------------
/services/audioldm_service.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import yaml
 4 | import math
 5 | import logging
 6 | import librosa
 7 | import numpy as np
 8 | from flask import Flask, request, jsonify
 9 | from scipy.io.wavfile import write
10 | 
11 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
12 | from wavcraft.utils import get_service_port
13 | from audioldm import build_model, super_resolution_and_inpainting
14 | 
15 | 
16 | CACHE_DIR = os.getenv(
17 |     "AUDIOLDM_CACHE_DIR",
18 |     os.path.join(os.path.expanduser("~"), ".cache/audioldm"))
19 | 
20 | EPS = 1e-5
21 | 
22 | with open('wavcraft/configs.yaml', 'r') as file:
23 |     config = yaml.safe_load(file)
24 | 
25 | # Configure the logging format and level
26 | logging.basicConfig(
27 |     level=logging.INFO,
28 |     format='%(asctime)s - %(levelname)s - %(message)s'
29 | )
30 | 
31 | # Create a FileHandler for the log file
32 | os.makedirs('services_logs', exist_ok=True)
33 | log_filename = 'services_logs/Wav-API.log'
34 | file_handler = logging.FileHandler(log_filename, mode='w')
35 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
36 | 
37 | # Add the FileHandler to the root logger
38 | logging.getLogger('').addHandler(file_handler)
39 | 
40 | 
41 | audioldm = build_model(model_name=config["AudioLDM"]["model_size"])
42 | logging.info('AudioLDM is loaded ...')
43 | 
44 | 
45 | app = Flask(__name__)
46 | 
47 | 
48 | @app.route('/audio_inpaint', methods=['POST'])
49 | def audio_inpaint():
50 |     # Receive the text from the POST request
51 |     data = request.json
52 |     wav_path = data['wav_path']
53 |     text = data["text"]
54 |     duration = data["duration"] + EPS # avoid zero division
55 |     onset = data["onset"] / duration
56 |     offset = data["offset"] / duration
57 | 
58 |     sample_rate = data.get('sample_rate', 32000)
59 |     guidance_scale = data.get('guidance_scale', 2.5)
60 |     ddim_steps = data.get('ddim_steps', 200)
61 |     random_seed = data.get('seed', 42)
62 |     output_wav = data.get('output_wav', 'out.wav')
63 |     logging.info(f"Inpaint {wav_path} with the input '{text}'...")
64 | 
65 |     try:
66 |         # target_duration = math.ceil(data["duration"] / 2.5) * 2.5
67 |         target_duration = data["duration"]
68 |         waveform = super_resolution_and_inpainting(
69 |             audioldm,
70 |             text, # The text prompt for inpainting generation
71 |             wav_path, # This audio will be padded to 10.242 seconds and perform inpainting
72 |             time_mask_ratio_start_and_end=(onset, offset), # This is a ratio for inpainting at a scale of 10.242 seconds
73 |             seed=random_seed,
74 |             duration=target_duration,
75 |             guidance_scale=guidance_scale,
76 |             ddim_steps=ddim_steps,
77 |             n_candidate_gen_per_text=1,
78 |             batchsize=1,
79 |         )
80 | 
81 |         if sample_rate != 16000:
82 |             waveform = librosa.resample(waveform, orig_sr=16000, target_sr=sample_rate)
83 |         # Write audio to `output_wav` with `sample_rate`
84 |         write(output_wav, sample_rate, np.round(waveform[:int(duration*sample_rate)] * 32767).astype(np.int16))
85 |         
86 |         # Return success message and the filename of the generated audio
87 |         return jsonify({'message': f'Sucessful infill {data["onset"]}-{data["offset"]}s content in {wav_path}'})
88 | 
89 |     except Exception as e:
90 |         # Return error message if something goes wrong
91 |         return jsonify({'API error': str(e)}), 500
92 | 
93 | 
94 | if __name__ == '__main__':
95 |     service_port = get_service_port("AUDIOLDM_SERVICE_PORT")
96 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
97 |     app.run(debug=False, threaded=False, port=service_port)


--------------------------------------------------------------------------------
/services/audiosep_service.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import yaml
  4 | import logging
  5 | import torch
  6 | import librosa
  7 | import numpy as np
  8 | from flask import Flask, request, jsonify
  9 | from scipy.io.wavfile import write
 10 | 
 11 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 12 | from wavcraft.utils import get_service_port
 13 | 
 14 | 
 15 | with open('wavcraft/configs.yaml', 'r') as file:
 16 |     config = yaml.safe_load(file)
 17 | 
 18 | # Configure the logging format and level
 19 | logging.basicConfig(
 20 |     level=logging.INFO,
 21 |     format='%(asctime)s - %(levelname)s - %(message)s'
 22 | )
 23 | 
 24 | # Create a FileHandler for the log file
 25 | os.makedirs('services_logs', exist_ok=True)
 26 | log_filename = 'services_logs/Wav-API.log'
 27 | file_handler = logging.FileHandler(log_filename, mode='w')
 28 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
 29 | 
 30 | # Add the FileHandler to the root logger
 31 | logging.getLogger('').addHandler(file_handler)
 32 | 
 33 | 
 34 | """
 35 | Initalize the AudioSep model here
 36 | """
 37 | def inference(model, audio_file, text, output_file, device='cuda', use_chunk=False):
 38 |     print(f'Separate audio from [{audio_file}] with textual query [{text}]')
 39 |     mixture, fs = librosa.load(audio_file, sr=32000, mono=True)
 40 |     with torch.no_grad():
 41 |         text = [text]
 42 | 
 43 |         conditions = model.query_encoder.get_query_embed(
 44 |             modality='text',
 45 |             text=text,
 46 |             device=device
 47 |         )
 48 | 
 49 |         input_dict = {
 50 |             "mixture": torch.Tensor(mixture)[None, None, :].to(device),
 51 |             "condition": conditions,
 52 |         } 
 53 | 
 54 |         if use_chunk:
 55 |             foreground = model.ss_model.chunk_inference(input_dict)
 56 |             foreground = np.squeeze(foreground)
 57 |         else:
 58 |             foreground = model.ss_model(input_dict)["waveform"]
 59 |             foreground = foreground.squeeze(0).squeeze(0).data.cpu().numpy()
 60 | 
 61 |         background = mixture - foreground
 62 | 
 63 |         filedir, filename = os.path.split(output_file)
 64 |         fg_filepath = os.path.join(filedir, "fg_"+filename)
 65 |         bg_filepath = os.path.join(filedir, "bg_"+filename)
 66 | 
 67 |         write(fg_filepath, 32000, np.round(foreground * 32767).astype(np.int16))
 68 |         print(f'Write separated audio to [{fg_filepath}]')
 69 | 
 70 |         write(bg_filepath, 32000, np.round(background * 32767).astype(np.int16))
 71 |         print(f'Write the background audio to [{bg_filepath}]')
 72 | 
 73 | import sys
 74 | sys.path.append("ext/AudioSep")
 75 | from ss_pipeline import build_audiosep
 76 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 77 | ss = build_audiosep(
 78 |       config_yaml='ext/AudioSep/config/audiosep_base.yaml', 
 79 |       checkpoint_path='ext/AudioSep/checkpoint/audiosep_base_4M_steps.ckpt', 
 80 |       device=device)
 81 | 
 82 | logging.info('AudioSep is loaded ...')
 83 | 
 84 | 
 85 | app = Flask(__name__)
 86 | 
 87 | 
 88 | @app.route('/source_separate', methods=['POST'])
 89 | def source_separate():
 90 |     # Receive the text from the POST request
 91 |     data = request.json
 92 |     wav_path = data['wav_path']
 93 |     text = data["text"]
 94 |     output_wav = data.get('output_wav', 'out.wav')
 95 | 
 96 |     logging.info(f"Separate '{text}' from {wav_path} ...")
 97 | 
 98 |     try:
 99 |         inference(ss, wav_path, text, output_wav, device)
100 |         
101 |         # Return success message and the filename of the generated audio
102 |         return jsonify({'message': f'Sucessful separation from {wav_path}'})
103 | 
104 |     except Exception as e:
105 |         # Return error message if something goes wrong
106 |         return jsonify({'API error': str(e)}), 500
107 | 
108 | 
109 | if __name__ == '__main__':
110 |     service_port = get_service_port("AUDIOSEP_SERVICE_PORT")
111 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
112 |     app.run(debug=False, threaded=False, port=service_port)


--------------------------------------------------------------------------------
/services/audiosr_service.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import yaml
  4 | import random
  5 | import logging
  6 | import torch
  7 | import numpy as np
  8 | import soundfile as sf
  9 | from flask import Flask, request, jsonify
 10 | from cog import BasePredictor, Input, Path
 11 | from audiosr import build_model
 12 | from audiosr import super_resolution as _super_resolution
 13 | 
 14 | 
 15 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 16 | from wavcraft.apis import _WRITE_AUDIO
 17 | from wavcraft.utils import fade, get_service_port
 18 | 
 19 | 
 20 | os.environ["TOKENIZERS_PARALLELISM"] = "true"
 21 | torch.set_float32_matmul_precision("high")
 22 | 
 23 | with open('wavcraft/configs.yaml', 'r') as file:
 24 |     config = yaml.safe_load(file)
 25 | 
 26 | # Configure the logging format and level
 27 | logging.basicConfig(
 28 |     level=logging.INFO,
 29 |     format='%(asctime)s - %(levelname)s - %(message)s'
 30 | )
 31 | 
 32 | # Create a FileHandler for the log file
 33 | os.makedirs('services_logs', exist_ok=True)
 34 | log_filename = 'services_logs/Wav-API.log'
 35 | file_handler = logging.FileHandler(log_filename, mode='w')
 36 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
 37 | 
 38 | # Add the FileHandler to the root logger
 39 | logging.getLogger('').addHandler(file_handler)
 40 | 
 41 | 
 42 | """
 43 | Initialize the AudioSR models here
 44 | """
 45 | class Predictor(BasePredictor):
 46 |     def setup(self, model_name="basic", device="auto"):
 47 |         self.model_name = model_name
 48 |         self.device = device
 49 |         self.sr = 48000
 50 |         self.audiosr = build_model(model_name=self.model_name, device=self.device)
 51 | 
 52 |     def predict(self,
 53 |         input_file: Path = Input(description="Audio to upsample"),
 54 |         output_path: Path = Input(description="Path to output audio"),
 55 |         ddim_steps: int = Input(description="Number of inference steps", default=50, ge=10, le=500),
 56 |         guidance_scale: float = Input(description="Scale for classifier free guidance", default=3.5, ge=1.0, le=20.0),
 57 |         seed: int = Input(description="Random seed. Leave blank to randomize the seed", default=None),
 58 |     ) -> np.ndarray:
 59 |         """Run a single prediction on the model"""
 60 |         if seed is None:
 61 |             seed = random.randint(0, 2**32 - 1)
 62 |             print(f"Setting seed to: {seed}")
 63 | 
 64 |         waveform = _super_resolution(
 65 |             self.audiosr,
 66 |             input_file,
 67 |             seed=seed,
 68 |             guidance_scale=guidance_scale,
 69 |             ddim_steps=ddim_steps,
 70 |             latent_t_per_second=12.8
 71 |         )
 72 |         out_wav = (waveform[0] * 32767).astype(np.int16).T
 73 | 
 74 |         sf.write(output_path, data=out_wav, samplerate=48000)
 75 |         return Path(output_path)
 76 | 
 77 | sr_model = Predictor()
 78 | sr_model.setup()
 79 | logging.info('AudioSR model is loaded ...')
 80 | 
 81 | 
 82 | app = Flask(__name__)
 83 | 
 84 | 
 85 | @app.route('/super_resolution', methods=['POST'])
 86 | def super_resolution():
 87 |     # Receive the text from the POST request
 88 |     data = request.json
 89 |     wav_path = data['wav_path']
 90 |     ddim_steps = int(data.get('ddim_steps', 50))
 91 |     guidance_scale = float(data.get('guidance_scale', 3.5))
 92 |     seed = int(data.get('seed', 42))
 93 |     output_wav = data.get('output_wav', 'out.wav')
 94 | 
 95 |     logging.info(f"Super resolution: ddim_steps: {ddim_steps}, guidance_scale: {guidance_scale}.")
 96 | 
 97 |     try:
 98 |         sr_model.predict(
 99 |             wav_path,
100 |             output_path=output_wav,
101 |             ddim_steps=ddim_steps,
102 |             guidance_scale=guidance_scale,
103 |             seed=seed,
104 |         )
105 |         # Return success message and the filename of the generated audio
106 |         return jsonify({'message': 'Audio super resolution generated successfully', 'file': f"{output_wav}"})
107 | 
108 |     except Exception as e:
109 |         return jsonify({'API error': str(e)}), 500
110 | 
111 | 
112 | if __name__ == '__main__':
113 |     service_port = get_service_port("AUDIOSR_SERVICE_PORT")
114 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
115 |     app.run(debug=False, threaded=False, port=service_port)


--------------------------------------------------------------------------------
/services/start_audiocraft.sh:
--------------------------------------------------------------------------------
1 | export AUDIOCRAFT_SERVICE_PORT=$((${SERVICE_PORT}+1))
2 | 
3 | conda activate AudioCraft
4 | nohup python3 services/audiocraft_service.py > services_logs/audiocraft.out 2>&1 &
5 | echo "AudioCraft is loaded sucessfully."


--------------------------------------------------------------------------------
/services/start_audioldm.sh:
--------------------------------------------------------------------------------
1 | export AUDIOLDM_SERVICE_PORT=$((${SERVICE_PORT}+4))
2 | 
3 | conda activate AudioInpainting
4 | nohup python3 services/audioldm_service.py > services_logs/audioldm.out 2>&1 &
5 | echo "AudioLDM is loaded sucessfully."


--------------------------------------------------------------------------------
/services/start_audiosep.sh:
--------------------------------------------------------------------------------
1 | export AUDIOSEP_SERVICE_PORT=$((${SERVICE_PORT}+2))
2 | 
3 | conda activate AudioEditor
4 | nohup python3 services/audiosep_service.py > services_logs/audiosep.out 2>&1 &
5 | echo "AudioSep is loaded sucessfully."


--------------------------------------------------------------------------------
/services/start_audiosr.sh:
--------------------------------------------------------------------------------
1 | export AUDIOSR_SERVICE_PORT=$((${SERVICE_PORT}+3))
2 | 
3 | conda activate AudioSR
4 | nohup python3 services/audiosr_service.py > services_logs/audiosr.out 2>&1 &
5 | echo "AudioSR is loaded sucessfully."


--------------------------------------------------------------------------------
/services/start_wavmark.sh:
--------------------------------------------------------------------------------
1 | export WAVMARK_SERVICE_PORT=$((${SERVICE_PORT}+5))
2 | 
3 | conda activate AudioEditor
4 | nohup python3 services/wavmark_service.py > services_logs/wavmark_service.out 2>&1 &
5 | echo "WavMark is loaded sucessfully."


--------------------------------------------------------------------------------
/services/wavmark_service.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import yaml
 4 | import logging
 5 | import librosa
 6 | import soundfile
 7 | import torch
 8 | import wavmark
 9 | import numpy as np
10 | from flask import Flask, request, jsonify
11 | from scipy.io.wavfile import write
12 | 
13 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
14 | from wavcraft.utils import get_service_port
15 | 
16 | 
17 | with open('wavcraft/configs.yaml', 'r') as file:
18 |     config = yaml.safe_load(file)
19 | 
20 | # Configure the logging format and level
21 | logging.basicConfig(
22 |     level=logging.INFO,
23 |     format='%(asctime)s - %(levelname)s - %(message)s'
24 | )
25 | 
26 | # Create a FileHandler for the log file
27 | os.makedirs('services_logs', exist_ok=True)
28 | log_filename = 'services_logs/Wav-API.log'
29 | file_handler = logging.FileHandler(log_filename, mode='w')
30 | file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
31 | 
32 | # Add the FileHandler to the root logger
33 | logging.getLogger('').addHandler(file_handler)
34 | 
35 | 
36 | # Audio watermarking preserved by WavCraft
37 | payload = np.array([0,1,0,1,0,1,1,1,0,1,0,0,0,0,1,1])
38 | model = wavmark.load_model().to("cuda" if torch.cuda.is_available() else 'cpu')
39 | logging.info('WavMark is loaded ...')
40 | 
41 | 
42 | app = Flask(__name__)
43 | 
44 | 
45 | @app.route('/audio_watermark', methods=['POST'])
46 | def audio_watermark():
47 |     # Receive the text from the POST request
48 |     data = request.json
49 |     wav_path = data['wav_path']
50 |     sample_rate = data.get('sample_rate', 16000)
51 |     action = data.get('action', "encode")
52 |     output_wav = data.get('output_wav', 'out.wav')
53 |     logging.info(f"{action} watermark with {wav_path}...")
54 | 
55 |     # the audio should be a single-channel 16kHz wav, you can read it using soundfile:
56 |     signal, sr = soundfile.read(wav_path)
57 |     assert sr == sample_rate, "WavMark use 16kHz audio only!"
58 |     # Otherwise, you can use the following function to convert the host audio to single-channel 16kHz format:
59 |     # from wavmark.utils import file_reader
60 |     # signal = file_reader.read_as_single_channel(wav_path, aim_sr=sample_rate)
61 | 
62 |     try:
63 |         assert action in ("encode", "decode")
64 |         if action == "encode":
65 |             watermarked_signal, _ = wavmark.encode_watermark(model, signal, payload, show_progress=True)
66 |             # you can save it as a new wav:
67 |             soundfile.write(output_wav, watermarked_signal, sample_rate)
68 |         else:
69 |             payload_decoded, _ = wavmark.decode_watermark(model, signal, show_progress=True)
70 |             confidence_score = (payload == payload_decoded).mean() * 100
71 |             if confidence_score < 0.5:
72 |                 logging.info(f"Audio file {wav_path} is not generated by WavCraft.")
73 |             else:
74 |                 logging.info(f"Audio file {wav_path} is generated by WavCraft.")
75 | 
76 |         # Return success message and the filename of the generated audio
77 |         return jsonify({'message': f"Sucessful {action} watermark with {wav_path}..."})
78 | 
79 |     except Exception as e:
80 |         # Return error message if something goes wrong
81 |         return jsonify({'API error': str(e)}), 500
82 | 
83 | 
84 | if __name__ == '__main__':
85 |     service_port = get_service_port("WAVMARK_SERVICE_PORT")
86 |     # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
87 |     app.run(debug=False, threaded=False, port=service_port)


--------------------------------------------------------------------------------
/venvs/audiocraft.yml:
--------------------------------------------------------------------------------
  1 | name: AudioCraft
  2 | channels:
  3 |   - nvidia/label/cuda-11.8.0
  4 |   - conda-forge
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=2_gnu
  9 |   - bzip2=1.0.8=h7f98852_4
 10 |   - ca-certificates=2023.05.30=h06a4308_0
 11 |   - cuda-cccl=11.8.89=0
 12 |   - cuda-command-line-tools=11.8.0=0
 13 |   - cuda-compiler=11.8.0=0
 14 |   - cuda-cudart=11.8.89=0
 15 |   - cuda-cudart-dev=11.8.89=0
 16 |   - cuda-cuobjdump=11.8.86=0
 17 |   - cuda-cupti=11.8.87=0
 18 |   - cuda-cuxxfilt=11.8.86=0
 19 |   - cuda-documentation=11.8.86=0
 20 |   - cuda-driver-dev=11.8.89=0
 21 |   - cuda-gdb=11.8.86=0
 22 |   - cuda-libraries=11.8.0=0
 23 |   - cuda-libraries-dev=11.8.0=0
 24 |   - cuda-memcheck=11.8.86=0
 25 |   - cuda-nsight=11.8.86=0
 26 |   - cuda-nsight-compute=11.8.0=0
 27 |   - cuda-nvcc=11.8.89=0
 28 |   - cuda-nvdisasm=11.8.86=0
 29 |   - cuda-nvml-dev=11.8.86=0
 30 |   - cuda-nvprof=11.8.87=0
 31 |   - cuda-nvprune=11.8.86=0
 32 |   - cuda-nvrtc=11.8.89=0
 33 |   - cuda-nvrtc-dev=11.8.89=0
 34 |   - cuda-nvtx=11.8.86=0
 35 |   - cuda-nvvp=11.8.87=0
 36 |   - cuda-profiler-api=11.8.86=0
 37 |   - cuda-sanitizer-api=11.8.86=0
 38 |   - cuda-toolkit=11.8.0=0
 39 |   - cuda-tools=11.8.0=0
 40 |   - cuda-visual-tools=11.8.0=0
 41 |   - gds-tools=1.4.0.31=0
 42 |   - ld_impl_linux-64=2.40=h41732ed_0
 43 |   - libcublas=11.11.3.6=0
 44 |   - libcublas-dev=11.11.3.6=0
 45 |   - libcufft=10.9.0.58=0
 46 |   - libcufft-dev=10.9.0.58=0
 47 |   - libcufile=1.4.0.31=0
 48 |   - libcufile-dev=1.4.0.31=0
 49 |   - libcurand=10.3.0.86=0
 50 |   - libcurand-dev=10.3.0.86=0
 51 |   - libcusolver=11.4.1.48=0
 52 |   - libcusolver-dev=11.4.1.48=0
 53 |   - libcusparse=11.7.5.86=0
 54 |   - libcusparse-dev=11.7.5.86=0
 55 |   - libffi=3.4.2=h7f98852_5
 56 |   - libgcc-ng=13.1.0=he5830b7_0
 57 |   - libgomp=13.1.0=he5830b7_0
 58 |   - libnpp=11.8.0.86=0
 59 |   - libnpp-dev=11.8.0.86=0
 60 |   - libnsl=2.0.0=h7f98852_0
 61 |   - libnvjpeg=11.9.0.86=0
 62 |   - libnvjpeg-dev=11.9.0.86=0
 63 |   - libsqlite=3.42.0=h2797004_0
 64 |   - libuuid=2.38.1=h0b41bf4_0
 65 |   - libzlib=1.2.13=hd590300_5
 66 |   - ncurses=6.4=hcb278e6_0
 67 |   - nsight-compute=2022.3.0.22=0
 68 |   - openssl=3.1.1=hd590300_1
 69 |   - pip=23.1.2=pyhd8ed1ab_0
 70 |   - python=3.8.17=he550d4f_0_cpython
 71 |   - readline=8.2=h8228510_1
 72 |   - setuptools=68.0.0=pyhd8ed1ab_0
 73 |   - tk=8.6.12=h27826a3_0
 74 |   - wheel=0.40.0=pyhd8ed1ab_0
 75 |   - xz=5.2.6=h166bdaf_0
 76 |   - pip:
 77 |     - aiofiles==23.1.0
 78 |     - aiohttp==3.8.4
 79 |     - aiosignal==1.3.1
 80 |     - altair==5.0.1
 81 |     - antlr4-python3-runtime==4.9.3
 82 |     - anyio==3.7.1
 83 |     - async-timeout==4.0.2
 84 |     - attrs==23.1.0
 85 |     - audiocraft==1.0.0
 86 |     - audiomentations==0.34.1
 87 |     - audioread==3.0.0
 88 |     - av==10.0.0
 89 |     - blinker==1.6.2
 90 |     - blis==0.7.9
 91 |     - catalogue==2.0.8
 92 |     - certifi==2023.5.7
 93 |     - cffi==1.15.1
 94 |     - charset-normalizer==3.2.0
 95 |     - click==8.1.5
 96 |     - cloudpickle==2.2.1
 97 |     - cmake==3.26.4
 98 |     - cog==0.8.6
 99 |     - coloredlogs==15.0.1
100 |     - colorlog==6.7.0
101 |     - confection==0.1.0
102 |     - contourpy==1.1.0
103 |     - cycler==0.12.1
104 |     - cymem==2.0.7
105 |     - cython==0.29.36
106 |     - datasets==2.14.6
107 |     - demucs==4.0.0
108 |     - diffq==0.2.4
109 |     - dill==0.3.7
110 |     - distro==1.8.0
111 |     - docopt==0.6.2
112 |     - dora-search==0.1.12
113 |     - einops==0.6.1
114 |     - encodec==0.1.1
115 |     - exceptiongroup==1.1.2
116 |     - fastapi==0.98.0
117 |     - ffmpy==0.3.0
118 |     - filelock==3.12.2
119 |     - flashy==0.0.2
120 |     - flask==2.3.2
121 |     - fonttools==4.41.0
122 |     - frozenlist==1.4.0
123 |     - fsspec==2023.6.0
124 |     - future==0.18.3
125 |     - gradio==3.36.1
126 |     - gradio-client==0.2.9
127 |     - h11==0.14.0
128 |     - httpcore==0.17.3
129 |     - httptools==0.6.1
130 |     - httpx==0.24.1
131 |     - huggingface-hub==0.16.4
132 |     - humanfriendly==10.0
133 |     - hydra-colorlog==1.2.0
134 |     - hydra-core==1.3.2
135 |     - idna==3.4
136 |     - importlib-metadata==6.8.0
137 |     - importlib-resources==6.0.0
138 |     - itsdangerous==2.1.2
139 |     - joblib==1.3.1
140 |     - jsonschema==4.18.3
141 |     - jsonschema-specifications==2023.6.1
142 |     - julius==0.2.7
143 |     - kiwisolver==1.4.4
144 |     - lameenc==1.5.1
145 |     - langcodes==3.3.0
146 |     - lazy-loader==0.3
147 |     - librosa==0.10.0.post2
148 |     - lightning-utilities==0.9.0
149 |     - linkify-it-py==2.0.2
150 |     - lit==16.0.6
151 |     - llvmlite==0.40.1
152 |     - markdown-it-py==2.2.0
153 |     - markupsafe==2.1.3
154 |     - matplotlib==3.7.2
155 |     - mdit-py-plugins==0.3.3
156 |     - mdurl==0.1.2
157 |     - mpmath==1.3.0
158 |     - msgpack==1.0.5
159 |     - multidict==6.0.4
160 |     - multiprocess==0.70.15
161 |     - murmurhash==1.0.9
162 |     - mypy-extensions==1.0.0
163 |     - networkx==3.1
164 |     - nltk==3.8.1
165 |     - num2words==0.5.12
166 |     - numba==0.57.1
167 |     - numpy==1.24.4
168 |     - nvidia-cublas-cu11==11.10.3.66
169 |     - nvidia-cuda-cupti-cu11==11.7.101
170 |     - nvidia-cuda-nvrtc-cu11==11.7.99
171 |     - nvidia-cuda-runtime-cu11==11.7.99
172 |     - nvidia-cudnn-cu11==8.5.0.96
173 |     - nvidia-cufft-cu11==10.9.0.58
174 |     - nvidia-curand-cu11==10.2.10.91
175 |     - nvidia-cusolver-cu11==11.4.0.1
176 |     - nvidia-cusparse-cu11==11.7.4.91
177 |     - nvidia-nccl-cu11==2.14.3
178 |     - nvidia-nvtx-cu11==11.7.91
179 |     - omegaconf==2.3.0
180 |     - openai==1.3.6
181 |     - openunmix==1.2.1
182 |     - optimum==1.14.0
183 |     - orjson==3.9.2
184 |     - packaging==23.1
185 |     - pandas==2.0.3
186 |     - pathy==0.10.2
187 |     - pillow==10.0.0
188 |     - pkgutil-resolve-name==1.3.10
189 |     - pooch==1.6.0
190 |     - preshed==3.0.8
191 |     - protobuf==3.20.3
192 |     - pyarrow==14.0.1
193 |     - pydantic==1.10.11
194 |     - pydub==0.25.1
195 |     - pygments==2.15.1
196 |     - pyloudnorm==0.1.1
197 |     - pyparsing==3.0.9
198 |     - pyre-extensions==0.0.29
199 |     - python-dotenv==1.0.0
200 |     - python-multipart==0.0.6
201 |     - pytz==2023.3
202 |     - pyyaml==6.0.1
203 |     - referencing==0.29.1
204 |     - regex==2023.6.3
205 |     - requests==2.31.0
206 |     - retrying==1.3.4
207 |     - rpds-py==0.8.10
208 |     - safetensors==0.3.1
209 |     - scikit-learn==1.3.0
210 |     - scipy==1.10.1
211 |     - semantic-version==2.10.0
212 |     - sentencepiece==0.1.99
213 |     - smart-open==6.3.0
214 |     - sniffio==1.3.0
215 |     - soundfile==0.12.1
216 |     - soxr==0.3.5
217 |     - spacy==3.5.2
218 |     - spacy-legacy==3.0.12
219 |     - spacy-loggers==1.0.4
220 |     - srsly==2.4.6
221 |     - starlette==0.27.0
222 |     - structlog==23.2.0
223 |     - submitit==1.4.5
224 |     - sympy==1.12
225 |     - thinc==8.1.10
226 |     - threadpoolctl==3.2.0
227 |     - tokenizers==0.13.3
228 |     - toolz==0.12.0
229 |     - torch==2.0.1
230 |     - torchaudio==2.0.2
231 |     - torchmetrics==1.0.1
232 |     - tqdm==4.65.0
233 |     - transformers==4.31.0
234 |     - treetable==0.2.5
235 |     - triton==2.0.0
236 |     - typer==0.7.0
237 |     - typing-extensions==4.7.1
238 |     - typing-inspect==0.9.0
239 |     - tzdata==2023.3
240 |     - uc-micro-py==1.0.2
241 |     - urllib3==2.0.3
242 |     - uvicorn==0.22.0
243 |     - uvloop==0.19.0
244 |     - wasabi==1.1.2
245 |     - watchfiles==0.21.0
246 |     - websockets==11.0.3
247 |     - werkzeug==2.3.6
248 |     - xformers==0.0.20
249 |     - xxhash==3.4.1
250 |     - yarl==1.9.2
251 |     - zipp==3.16.2
252 | prefix: /homes/jl009/.conda/envs/AudioCraft
253 | 
254 | 


--------------------------------------------------------------------------------
/venvs/audioldm.yml:
--------------------------------------------------------------------------------
  1 | name: AudioInpainting
  2 | channels:
  3 |   - pytorch
  4 |   - nvidia
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=main
  8 |   - blas=1.0=mkl
  9 |   - brotli-python=1.0.9=py38heb0550a_2
 10 |   - bzip2=1.0.8=h7b6447c_0
 11 |   - ca-certificates=2023.08.22=h06a4308_0
 12 |   - certifi=2023.11.17=py38h06a4308_0
 13 |   - click=8.1.7=py38h06a4308_0
 14 |   - cryptography=41.0.3=py38h130f0dd_0
 15 |   - cuda-cudart=11.7.99=0
 16 |   - cuda-cupti=11.7.101=0
 17 |   - cuda-libraries=11.7.1=0
 18 |   - cuda-nvrtc=11.7.99=0
 19 |   - cuda-nvtx=11.7.91=0
 20 |   - cuda-runtime=11.7.1=0
 21 |   - ffmpeg=4.3=hf484d3e_0
 22 |   - flask=2.2.2=py38h06a4308_0
 23 |   - freetype=2.11.0=h70c0345_0
 24 |   - giflib=5.2.1=h7b6447c_0
 25 |   - gmp=6.2.1=h295c915_3
 26 |   - gnutls=3.6.15=he1e5248_0
 27 |   - intel-openmp=2021.4.0=h06a4308_3561
 28 |   - itsdangerous=2.0.1=pyhd3eb1b0_0
 29 |   - jinja2=3.1.2=py38h06a4308_0
 30 |   - jpeg=9e=h7f8727e_0
 31 |   - lame=3.100=h7b6447c_0
 32 |   - lcms2=2.12=h3be6417_0
 33 |   - ld_impl_linux-64=2.38=h1181459_1
 34 |   - libcublas=11.10.3.66=0
 35 |   - libcufft=10.7.2.124=h4fbf590_0
 36 |   - libcufile=1.8.1.2=0
 37 |   - libcurand=10.3.4.101=0
 38 |   - libcusolver=11.4.0.1=0
 39 |   - libcusparse=11.7.4.91=0
 40 |   - libffi=3.3=he6710b0_2
 41 |   - libgcc-ng=9.1.0=hdf63c60_0
 42 |   - libiconv=1.16=h7f8727e_2
 43 |   - libidn2=2.3.2=h7f8727e_0
 44 |   - libnpp=11.7.4.75=0
 45 |   - libnvjpeg=11.8.0.2=0
 46 |   - libpng=1.6.37=hbc83047_0
 47 |   - libstdcxx-ng=9.1.0=hdf63c60_0
 48 |   - libtasn1=4.16.0=h27cfd23_0
 49 |   - libtiff=4.2.0=h2818925_1
 50 |   - libunistring=0.9.10=h27cfd23_0
 51 |   - libwebp=1.2.2=h55f646e_0
 52 |   - libwebp-base=1.2.2=h7f8727e_0
 53 |   - lz4-c=1.9.3=h295c915_1
 54 |   - mkl=2021.4.0=h06a4308_640
 55 |   - mkl-service=2.4.0=py38h7f8727e_0
 56 |   - mkl_fft=1.3.1=py38hd3c417c_0
 57 |   - mkl_random=1.2.2=py38h51133e4_0
 58 |   - ncurses=6.3=h7f8727e_2
 59 |   - nettle=3.7.3=hbbd107a_1
 60 |   - numpy-base=1.22.3=py38hf524024_0
 61 |   - openh264=2.1.1=h4ff587b_0
 62 |   - openssl=1.1.1w=h7f8727e_0
 63 |   - pip=23.3.1=py38h06a4308_0
 64 |   - pycparser=2.21=pyhd3eb1b0_0
 65 |   - pyopenssl=23.2.0=py38h06a4308_0
 66 |   - pysocks=1.7.1=py38h06a4308_0
 67 |   - python=3.8.13=h12debd9_0
 68 |   - pytorch=1.13.1=py3.8_cuda11.7_cudnn8.5.0_0
 69 |   - pytorch-cuda=11.7=h778d358_5
 70 |   - pytorch-mutex=1.0=cuda
 71 |   - readline=8.1.2=h7f8727e_1
 72 |   - requests=2.31.0=py38h06a4308_0
 73 |   - setuptools=68.0.0=py38h06a4308_0
 74 |   - six=1.16.0=pyhd3eb1b0_1
 75 |   - sqlite=3.38.5=hc218d9a_0
 76 |   - tk=8.6.12=h1ccaba5_0
 77 |   - torchaudio=0.13.1=py38_cu117
 78 |   - torchvision=0.14.1=py38_cu117
 79 |   - typing_extensions=4.7.1=py38h06a4308_0
 80 |   - werkzeug=2.2.3=py38h06a4308_0
 81 |   - wheel=0.41.2=py38h06a4308_0
 82 |   - xz=5.2.5=h7f8727e_1
 83 |   - zlib=1.2.12=h7f8727e_2
 84 |   - zstd=1.5.2=ha4553b6_0
 85 |   - pip:
 86 |     - aiofiles==23.2.1
 87 |     - altair==5.1.2
 88 |     - annotated-types==0.6.0
 89 |     - anyio==3.7.1
 90 |     - attrs==23.1.0
 91 |     - audioldm==0.1.1
 92 |     - audiomentations==0.34.1
 93 |     - audioread==3.0.1
 94 |     - cffi==1.16.0
 95 |     - chardet==5.2.0
 96 |     - charset-normalizer==3.3.2
 97 |     - colorama==0.4.6
 98 |     - contourpy==1.1.1
 99 |     - cycler==0.12.1
100 |     - einops==0.7.0
101 |     - exceptiongroup==1.2.0
102 |     - fastapi==0.104.1
103 |     - ffmpy==0.3.1
104 |     - filelock==3.13.1
105 |     - fonttools==4.45.1
106 |     - fsspec==2023.10.0
107 |     - ftfy==6.1.3
108 |     - future==0.18.3
109 |     - gradio==4.7.1
110 |     - gradio-client==0.7.0
111 |     - h11==0.14.0
112 |     - httpcore==1.0.2
113 |     - httpx==0.25.2
114 |     - huggingface-hub==0.19.4
115 |     - idna==3.6
116 |     - importlib-metadata==6.8.0
117 |     - importlib-resources==6.1.1
118 |     - joblib==1.3.2
119 |     - jsonschema==4.20.0
120 |     - jsonschema-specifications==2023.11.1
121 |     - kiwisolver==1.4.5
122 |     - librosa==0.9.2
123 |     - llvmlite==0.41.1
124 |     - markdown-it-py==3.0.0
125 |     - markupsafe==2.1.3
126 |     - matplotlib==3.7.4
127 |     - mdurl==0.1.2
128 |     - mpmath==1.3.0
129 |     - networkx==3.1
130 |     - numba==0.58.1
131 |     - numpy==1.23.5
132 |     - nvidia-cublas-cu12==12.1.3.1
133 |     - nvidia-cuda-cupti-cu12==12.1.105
134 |     - nvidia-cuda-nvrtc-cu12==12.1.105
135 |     - nvidia-cuda-runtime-cu12==12.1.105
136 |     - nvidia-cudnn-cu12==8.9.2.26
137 |     - nvidia-cufft-cu12==11.0.2.54
138 |     - nvidia-curand-cu12==10.3.2.106
139 |     - nvidia-cusolver-cu12==11.4.5.107
140 |     - nvidia-cusparse-cu12==12.1.0.106
141 |     - nvidia-nccl-cu12==2.18.1
142 |     - nvidia-nvjitlink-cu12==12.3.101
143 |     - nvidia-nvtx-cu12==12.1.105
144 |     - orjson==3.9.10
145 |     - packaging==23.2
146 |     - pandas==2.0.3
147 |     - pillow==10.1.0
148 |     - pkgutil-resolve-name==1.3.10
149 |     - platformdirs==4.0.0
150 |     - pooch==1.8.0
151 |     - progressbar==2.5
152 |     - pydantic==2.5.2
153 |     - pydantic-core==2.14.5
154 |     - pydub==0.25.1
155 |     - pygments==2.17.2
156 |     - pyloudnorm==0.1.1
157 |     - pyparsing==3.1.1
158 |     - python-multipart==0.0.6
159 |     - pytz==2023.3.post1
160 |     - pyyaml==6.0.1
161 |     - referencing==0.31.0
162 |     - regex==2023.10.3
163 |     - retrying==1.3.4
164 |     - rich==13.7.0
165 |     - rpds-py==0.13.1
166 |     - safetensors==0.4.1
167 |     - scikit-learn==1.3.2
168 |     - scipy==1.10.1
169 |     - semantic-version==2.10.0
170 |     - shellingham==1.5.4
171 |     - sniffio==1.3.0
172 |     - soundfile==0.12.1
173 |     - soxr==0.3.7
174 |     - starlette==0.27.0
175 |     - sympy==1.12
176 |     - threadpoolctl==3.2.0
177 |     - tokenizers==0.13.3
178 |     - tomlkit==0.12.0
179 |     - toolz==0.12.0
180 |     - torchlibrosa==0.0.9
181 |     - tqdm==4.66.1
182 |     - transformers==4.29.0
183 |     - triton==2.1.0
184 |     - typer==0.9.0
185 |     - typing-extensions==4.8.0
186 |     - tzdata==2023.3
187 |     - urllib3==2.1.0
188 |     - uvicorn==0.24.0.post1
189 |     - wcwidth==0.2.12
190 |     - websockets==11.0.3
191 |     - zipp==3.17.0
192 | prefix: /homes/jl009/.conda/envs/AudioInpainting
193 | 
194 | 


--------------------------------------------------------------------------------
/venvs/audiosr.yml:
--------------------------------------------------------------------------------
  1 | name: AudioSR
  2 | channels:
  3 |   - pytorch
  4 |   - nvidia
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=main
  8 |   - blas=1.0=mkl
  9 |   - brotlipy=0.7.0=py39h27cfd23_1003
 10 |   - bzip2=1.0.8=h7b6447c_0
 11 |   - ca-certificates=2023.08.22=h06a4308_0
 12 |   - certifi=2023.7.22=py39h06a4308_0
 13 |   - cryptography=41.0.3=py39h130f0dd_0
 14 |   - cuda-cudart=11.8.89=0
 15 |   - cuda-cupti=11.8.87=0
 16 |   - cuda-libraries=11.8.0=0
 17 |   - cuda-nvrtc=11.8.89=0
 18 |   - cuda-nvtx=11.8.86=0
 19 |   - cuda-runtime=11.8.0=0
 20 |   - ffmpeg=4.3=hf484d3e_0
 21 |   - freetype=2.11.0=h70c0345_0
 22 |   - giflib=5.2.1=h7b6447c_0
 23 |   - gmp=6.2.1=h295c915_3
 24 |   - gmpy2=2.1.2=py39heeb90bb_0
 25 |   - gnutls=3.6.15=he1e5248_0
 26 |   - idna=3.4=py39h06a4308_0
 27 |   - intel-openmp=2021.4.0=h06a4308_3561
 28 |   - jinja2=3.1.2=py39h06a4308_0
 29 |   - jpeg=9e=h7f8727e_0
 30 |   - lame=3.100=h7b6447c_0
 31 |   - lcms2=2.12=h3be6417_0
 32 |   - ld_impl_linux-64=2.38=h1181459_1
 33 |   - libcublas=11.11.3.6=0
 34 |   - libcufft=10.9.0.58=0
 35 |   - libcufile=1.8.0.34=0
 36 |   - libcurand=10.3.4.52=0
 37 |   - libcusolver=11.4.1.48=0
 38 |   - libcusparse=11.7.5.86=0
 39 |   - libffi=3.3=he6710b0_2
 40 |   - libgcc-ng=9.1.0=hdf63c60_0
 41 |   - libiconv=1.16=h7f8727e_2
 42 |   - libidn2=2.3.2=h7f8727e_0
 43 |   - libnpp=11.8.0.86=0
 44 |   - libnvjpeg=11.9.0.86=0
 45 |   - libpng=1.6.37=hbc83047_0
 46 |   - libstdcxx-ng=9.1.0=hdf63c60_0
 47 |   - libtasn1=4.16.0=h27cfd23_0
 48 |   - libtiff=4.2.0=h2818925_1
 49 |   - libunistring=0.9.10=h27cfd23_0
 50 |   - libwebp=1.2.2=h55f646e_0
 51 |   - libwebp-base=1.2.2=h7f8727e_0
 52 |   - lz4-c=1.9.3=h295c915_1
 53 |   - mkl=2021.4.0=h06a4308_640
 54 |   - mkl-service=2.4.0=py39h7f8727e_0
 55 |   - mkl_fft=1.3.1=py39hd3c417c_0
 56 |   - mkl_random=1.2.2=py39h51133e4_0
 57 |   - mpc=1.1.0=h10f8cd9_1
 58 |   - mpfr=4.0.2=hb69a4c5_1
 59 |   - mpmath=1.3.0=py39h06a4308_0
 60 |   - ncurses=6.3=h7f8727e_2
 61 |   - nettle=3.7.3=hbbd107a_1
 62 |   - openh264=2.1.1=h4ff587b_0
 63 |   - openssl=1.1.1w=h7f8727e_0
 64 |   - pip=23.3=py39h06a4308_0
 65 |   - pycparser=2.21=pyhd3eb1b0_0
 66 |   - pyopenssl=23.2.0=py39h06a4308_0
 67 |   - pysocks=1.7.1=py39h06a4308_0
 68 |   - python=3.9.12=h12debd9_1
 69 |   - pytorch=2.0.1=py3.9_cuda11.8_cudnn8.7.0_0
 70 |   - pytorch-cuda=11.8=h7e8668a_5
 71 |   - pytorch-mutex=1.0=cuda
 72 |   - readline=8.1.2=h7f8727e_1
 73 |   - requests=2.31.0=py39h06a4308_0
 74 |   - setuptools=68.0.0=py39h06a4308_0
 75 |   - six=1.16.0=pyhd3eb1b0_1
 76 |   - sqlite=3.38.5=hc218d9a_0
 77 |   - tk=8.6.12=h1ccaba5_0
 78 |   - torchtriton=2.0.0=py39
 79 |   - wheel=0.41.2=py39h06a4308_0
 80 |   - xz=5.2.5=h7f8727e_1
 81 |   - zlib=1.2.12=h7f8727e_2
 82 |   - zstd=1.5.2=ha4553b6_0
 83 |   - pip:
 84 |     - aiofiles==23.2.1
 85 |     - altair==5.1.2
 86 |     - annotated-types==0.6.0
 87 |     - anyio==3.7.1
 88 |     - attrs==23.1.0
 89 |     - audiomentations==0.34.1
 90 |     - audioread==3.0.1
 91 |     - audiosr==0.0.6
 92 |     - babel==2.13.1
 93 |     - blinker==1.7.0
 94 |     - cffi==1.16.0
 95 |     - chardet==5.2.0
 96 |     - charset-normalizer==3.3.2
 97 |     - click==8.1.7
 98 |     - clldutils==3.20.0
 99 |     - cmake==3.27.7
100 |     - cog==0.8.6
101 |     - colorama==0.4.6
102 |     - colorlog==6.7.0
103 |     - contourpy==1.2.0
104 |     - csvw==3.1.3
105 |     - cycler==0.12.1
106 |     - decorator==5.1.1
107 |     - dlinfo==1.2.1
108 |     - einops==0.7.0
109 |     - exceptiongroup==1.1.3
110 |     - fastapi==0.104.1
111 |     - ffmpy==0.3.1
112 |     - filelock==3.13.1
113 |     - flask==3.0.0
114 |     - fonttools==4.44.0
115 |     - fsspec==2023.10.0
116 |     - ftfy==6.1.1
117 |     - future==0.18.3
118 |     - gradio==4.2.0
119 |     - gradio-client==0.7.0
120 |     - h11==0.14.0
121 |     - httpcore==1.0.2
122 |     - httptools==0.6.1
123 |     - httpx==0.25.1
124 |     - huggingface-hub==0.19.0
125 |     - importlib-metadata==6.8.0
126 |     - importlib-resources==6.1.1
127 |     - isodate==0.6.1
128 |     - itsdangerous==2.1.2
129 |     - joblib==1.3.2
130 |     - jsonschema==4.19.2
131 |     - jsonschema-specifications==2023.7.1
132 |     - kiwisolver==1.4.5
133 |     - language-tags==1.2.0
134 |     - librosa==0.9.2
135 |     - lit==17.0.4
136 |     - llvmlite==0.41.1
137 |     - lxml==4.9.3
138 |     - markdown==3.5.1
139 |     - markdown-it-py==3.0.0
140 |     - markupsafe==2.1.3
141 |     - matplotlib==3.8.1
142 |     - mdurl==0.1.2
143 |     - networkx==3.2.1
144 |     - numba==0.58.1
145 |     - numpy==1.23.5
146 |     - nvidia-cublas-cu11==11.10.3.66
147 |     - nvidia-cublas-cu12==12.1.3.1
148 |     - nvidia-cuda-cupti-cu11==11.7.101
149 |     - nvidia-cuda-cupti-cu12==12.1.105
150 |     - nvidia-cuda-nvrtc-cu11==11.7.99
151 |     - nvidia-cuda-nvrtc-cu12==12.1.105
152 |     - nvidia-cuda-runtime-cu11==11.7.99
153 |     - nvidia-cuda-runtime-cu12==12.1.105
154 |     - nvidia-cudnn-cu11==8.5.0.96
155 |     - nvidia-cudnn-cu12==8.9.2.26
156 |     - nvidia-cufft-cu11==10.9.0.58
157 |     - nvidia-cufft-cu12==11.0.2.54
158 |     - nvidia-curand-cu11==10.2.10.91
159 |     - nvidia-curand-cu12==10.3.2.106
160 |     - nvidia-cusolver-cu11==11.4.0.1
161 |     - nvidia-cusolver-cu12==11.4.5.107
162 |     - nvidia-cusparse-cu11==11.7.4.91
163 |     - nvidia-cusparse-cu12==12.1.0.106
164 |     - nvidia-nccl-cu11==2.14.3
165 |     - nvidia-nccl-cu12==2.18.1
166 |     - nvidia-nvjitlink-cu12==12.3.52
167 |     - nvidia-nvtx-cu11==11.7.91
168 |     - nvidia-nvtx-cu12==12.1.105
169 |     - orjson==3.9.10
170 |     - packaging==23.2
171 |     - pandas==2.1.3
172 |     - phonemizer==3.2.1
173 |     - pillow==10.1.0
174 |     - platformdirs==4.0.0
175 |     - pooch==1.8.0
176 |     - progressbar==2.5
177 |     - pydantic==2.5.0
178 |     - pydantic-core==2.14.1
179 |     - pydub==0.25.1
180 |     - pygments==2.16.1
181 |     - pylatexenc==2.10
182 |     - pyloudnorm==0.1.1
183 |     - pyparsing==3.1.1
184 |     - python-dateutil==2.8.2
185 |     - python-dotenv==1.0.0
186 |     - python-multipart==0.0.6
187 |     - pytz==2023.3.post1
188 |     - pyyaml==6.0.1
189 |     - rdflib==7.0.0
190 |     - referencing==0.30.2
191 |     - regex==2023.10.3
192 |     - resampy==0.4.2
193 |     - retrying==1.3.4
194 |     - rfc3986==1.5.0
195 |     - rich==13.6.0
196 |     - rpds-py==0.12.0
197 |     - safetensors==0.4.0
198 |     - scikit-learn==1.3.2
199 |     - scipy==1.11.3
200 |     - segments==2.2.1
201 |     - semantic-version==2.10.0
202 |     - shellingham==1.5.4
203 |     - sniffio==1.3.0
204 |     - soundfile==0.12.1
205 |     - soxr==0.3.7
206 |     - starlette==0.27.0
207 |     - structlog==23.2.0
208 |     - sympy==1.12
209 |     - tabulate==0.9.0
210 |     - threadpoolctl==3.2.0
211 |     - timm==0.9.10
212 |     - tokenizers==0.13.3
213 |     - tomlkit==0.12.0
214 |     - toolz==0.12.0
215 |     - torch==2.0.1
216 |     - torchaudio==2.1.0
217 |     - torchlibrosa==0.1.0
218 |     - torchvision==0.16.0
219 |     - tqdm==4.66.1
220 |     - transformers==4.30.2
221 |     - triton==2.0.0
222 |     - typer==0.9.0
223 |     - typing-extensions==4.8.0
224 |     - tzdata==2023.3
225 |     - unidecode==1.3.7
226 |     - uritemplate==4.1.1
227 |     - urllib3==2.0.7
228 |     - uvicorn==0.24.0.post1
229 |     - uvloop==0.19.0
230 |     - watchfiles==0.21.0
231 |     - wcwidth==0.2.9
232 |     - websockets==11.0.3
233 |     - werkzeug==3.0.1
234 |     - zipp==3.17.0
235 | prefix: /homes/jl009/.conda/envs/AudioSR
236 | 
237 | 


--------------------------------------------------------------------------------
/venvs/wavcraft.yml:
--------------------------------------------------------------------------------
  1 | name: WavCraft
  2 | channels:
  3 |   - nvidia/label/cuda-11.8.0
  4 |   - conda-forge
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=2_gnu
  9 |   - bzip2=1.0.8=h7f98852_4
 10 |   - ca-certificates=2023.05.30=h06a4308_0
 11 |   - cuda-cccl=11.8.89=0
 12 |   - cuda-command-line-tools=11.8.0=0
 13 |   - cuda-compiler=11.8.0=0
 14 |   - cuda-cudart=11.8.89=0
 15 |   - cuda-cudart-dev=11.8.89=0
 16 |   - cuda-cuobjdump=11.8.86=0
 17 |   - cuda-cupti=11.8.87=0
 18 |   - cuda-cuxxfilt=11.8.86=0
 19 |   - cuda-documentation=11.8.86=0
 20 |   - cuda-driver-dev=11.8.89=0
 21 |   - cuda-gdb=11.8.86=0
 22 |   - cuda-libraries=11.8.0=0
 23 |   - cuda-libraries-dev=11.8.0=0
 24 |   - cuda-memcheck=11.8.86=0
 25 |   - cuda-nsight=11.8.86=0
 26 |   - cuda-nsight-compute=11.8.0=0
 27 |   - cuda-nvcc=11.8.89=0
 28 |   - cuda-nvdisasm=11.8.86=0
 29 |   - cuda-nvml-dev=11.8.86=0
 30 |   - cuda-nvprof=11.8.87=0
 31 |   - cuda-nvprune=11.8.86=0
 32 |   - cuda-nvrtc=11.8.89=0
 33 |   - cuda-nvrtc-dev=11.8.89=0
 34 |   - cuda-nvtx=11.8.86=0
 35 |   - cuda-nvvp=11.8.87=0
 36 |   - cuda-profiler-api=11.8.86=0
 37 |   - cuda-sanitizer-api=11.8.86=0
 38 |   - cuda-toolkit=11.8.0=0
 39 |   - cuda-tools=11.8.0=0
 40 |   - cuda-visual-tools=11.8.0=0
 41 |   - gds-tools=1.4.0.31=0
 42 |   - ld_impl_linux-64=2.40=h41732ed_0
 43 |   - libcublas=11.11.3.6=0
 44 |   - libcublas-dev=11.11.3.6=0
 45 |   - libcufft=10.9.0.58=0
 46 |   - libcufft-dev=10.9.0.58=0
 47 |   - libcufile=1.4.0.31=0
 48 |   - libcufile-dev=1.4.0.31=0
 49 |   - libcurand=10.3.0.86=0
 50 |   - libcurand-dev=10.3.0.86=0
 51 |   - libcusolver=11.4.1.48=0
 52 |   - libcusolver-dev=11.4.1.48=0
 53 |   - libcusparse=11.7.5.86=0
 54 |   - libcusparse-dev=11.7.5.86=0
 55 |   - libffi=3.4.2=h7f98852_5
 56 |   - libgcc-ng=13.1.0=he5830b7_0
 57 |   - libgomp=13.1.0=he5830b7_0
 58 |   - libnpp=11.8.0.86=0
 59 |   - libnpp-dev=11.8.0.86=0
 60 |   - libnsl=2.0.0=h7f98852_0
 61 |   - libnvjpeg=11.9.0.86=0
 62 |   - libnvjpeg-dev=11.9.0.86=0
 63 |   - libsqlite=3.42.0=h2797004_0
 64 |   - libuuid=2.38.1=h0b41bf4_0
 65 |   - libzlib=1.2.13=hd590300_5
 66 |   - ncurses=6.4=hcb278e6_0
 67 |   - nsight-compute=2022.3.0.22=0
 68 |   - openssl=3.1.1=hd590300_1
 69 |   - pip=23.1.2=pyhd8ed1ab_0
 70 |   - python=3.8.17=he550d4f_0_cpython
 71 |   - readline=8.2=h8228510_1
 72 |   - setuptools=68.0.0=pyhd8ed1ab_0
 73 |   - tk=8.6.12=h27826a3_0
 74 |   - wheel=0.40.0=pyhd8ed1ab_0
 75 |   - xz=5.2.6=h166bdaf_0
 76 |   - pip:
 77 |     - aiofiles==23.1.0
 78 |     - aiohttp==3.8.4
 79 |     - aiosignal==1.3.1
 80 |     - altair==5.0.1
 81 |     - antlr4-python3-runtime==4.9.3
 82 |     - anyio==3.7.1
 83 |     - async-timeout==4.0.2
 84 |     - attrs==23.1.0
 85 |     - audiomentations==0.34.1
 86 |     - audioread==3.0.0
 87 |     - av==10.0.0
 88 |     - blinker==1.6.2
 89 |     - blis==0.7.9
 90 |     - braceexpand==0.1.7
 91 |     - catalogue==2.0.8
 92 |     - certifi==2023.5.7
 93 |     - cffi==1.15.1
 94 |     - charset-normalizer==3.2.0
 95 |     - click==8.1.5
 96 |     - cloudpickle==2.2.1
 97 |     - cmake==3.26.4
 98 |     - colorlog==6.7.0
 99 |     - confection==0.1.0
100 |     - contourpy==1.1.0
101 |     - cycler==0.12.1
102 |     - cymem==2.0.7
103 |     - cython==0.29.36
104 |     - demucs==4.0.0
105 |     - diffq==0.2.4
106 |     - distro==1.8.0
107 |     - docopt==0.6.2
108 |     - dora-search==0.1.12
109 |     - einops==0.6.1
110 |     - encodec==0.1.1
111 |     - exceptiongroup==1.1.2
112 |     - fastapi==0.100.0
113 |     - ffmpy==0.3.0
114 |     - filelock==3.12.2
115 |     - flashy==0.0.2
116 |     - flask==2.3.2
117 |     - fonttools==4.41.0
118 |     - frozenlist==1.4.0
119 |     - fsspec==2023.6.0
120 |     - ftfy==6.1.1
121 |     - future==0.18.3
122 |     - gradio==3.36.1
123 |     - gradio-client==0.7.0
124 |     - h11==0.14.0
125 |     - h5py==3.10.0
126 |     - httpcore==0.17.3
127 |     - httpx==0.24.1
128 |     - huggingface-hub==0.16.4
129 |     - hydra-colorlog==1.2.0
130 |     - hydra-core==1.3.2
131 |     - idna==3.4
132 |     - importlib-metadata==6.8.0
133 |     - importlib-resources==6.0.0
134 |     - itsdangerous==2.1.2
135 |     - joblib==1.3.1
136 |     - jsonschema==4.18.3
137 |     - jsonschema-specifications==2023.6.1
138 |     - julius==0.2.7
139 |     - kiwisolver==1.4.4
140 |     - lameenc==1.5.1
141 |     - langcodes==3.3.0
142 |     - lazy-loader==0.3
143 |     - librosa==0.10.0.post2
144 |     - lightning==2.1.1
145 |     - lightning-utilities==0.9.0
146 |     - linkify-it-py==2.0.2
147 |     - lit==16.0.6
148 |     - llvmlite==0.40.1
149 |     - markdown-it-py==2.2.0
150 |     - markupsafe==2.1.3
151 |     - matplotlib==3.7.2
152 |     - mdit-py-plugins==0.3.3
153 |     - mdurl==0.1.2
154 |     - mpmath==1.3.0
155 |     - msgpack==1.0.5
156 |     - multidict==6.0.4
157 |     - murmurhash==1.0.9
158 |     - mypy-extensions==1.0.0
159 |     - networkx==3.1
160 |     - num2words==0.5.12
161 |     - numba==0.57.1
162 |     - numpy==1.23.0
163 |     - nvidia-cublas-cu11==11.10.3.66
164 |     - nvidia-cuda-cupti-cu11==11.7.101
165 |     - nvidia-cuda-nvrtc-cu11==11.7.99
166 |     - nvidia-cuda-runtime-cu11==11.7.99
167 |     - nvidia-cudnn-cu11==8.5.0.96
168 |     - nvidia-cufft-cu11==10.9.0.58
169 |     - nvidia-curand-cu11==10.2.10.91
170 |     - nvidia-cusolver-cu11==11.4.0.1
171 |     - nvidia-cusparse-cu11==11.7.4.91
172 |     - nvidia-nccl-cu11==2.14.3
173 |     - nvidia-nvtx-cu11==11.7.91
174 |     - omegaconf==2.3.0
175 |     - openai==0.28.0
176 |     - openunmix==1.2.1
177 |     - orjson==3.9.2
178 |     - packaging==23.1
179 |     - pandas==2.0.3
180 |     - pathy==0.10.2
181 |     - pillow==10.0.0
182 |     - pkgutil-resolve-name==1.3.10
183 |     - pooch==1.6.0
184 |     - preshed==3.0.8
185 |     - protobuf==4.25.0
186 |     - pybind11==2.11.1
187 |     - pydantic==1.10.11
188 |     - pydub==0.25.1
189 |     - pygments==2.15.1
190 |     - pyloudnorm==0.1.1
191 |     - pyparsing==3.0.9
192 |     - pyre-extensions==0.0.29
193 |     - pyroomacoustics==0.6.0
194 |     - python-multipart==0.0.6
195 |     - pytorch-lightning==2.1.1
196 |     - pytz==2023.3
197 |     - pyyaml==6.0.1
198 |     - referencing==0.29.1
199 |     - regex==2023.6.3
200 |     - requests==2.31.0
201 |     - retrying==1.3.4
202 |     - rpds-py==0.8.10
203 |     - safetensors==0.3.1
204 |     - scikit-learn==1.3.0
205 |     - scipy==1.10.1
206 |     - semantic-version==2.10.0
207 |     - sentencepiece==0.1.99
208 |     - smart-open==6.3.0
209 |     - sniffio==1.3.0
210 |     - soundfile==0.12.1
211 |     - soxr==0.3.5
212 |     - spacy==3.5.2
213 |     - spacy-legacy==3.0.12
214 |     - spacy-loggers==1.0.4
215 |     - srsly==2.4.6
216 |     - starlette==0.27.0
217 |     - submitit==1.4.5
218 |     - sympy==1.12
219 |     - thinc==8.1.10
220 |     - threadpoolctl==3.2.0
221 |     - tokenizers==0.13.3
222 |     - toolz==0.12.0
223 |     - torch==2.0.1
224 |     - torchaudio==2.0.2
225 |     - torchlibrosa==0.1.0
226 |     - torchmetrics==1.0.1
227 |     - torchvision==0.15.2
228 |     - tqdm==4.65.0
229 |     - transformers==4.29.0
230 |     - treetable==0.2.5
231 |     - triton==2.0.0
232 |     - typer==0.7.0
233 |     - typing-extensions==4.7.1
234 |     - typing-inspect==0.9.0
235 |     - tzdata==2023.3
236 |     - uc-micro-py==1.0.2
237 |     - urllib3==2.0.3
238 |     - uvicorn==0.22.0
239 |     - wasabi==1.1.2
240 |     - wcwidth==0.2.9
241 |     - webdataset==0.2.75
242 |     - websockets==11.0.3
243 |     - werkzeug==2.3.6
244 |     - wget==3.2
245 |     - xformers==0.0.20
246 |     - yarl==1.9.2
247 |     - zipp==3.16.2
248 | prefix: /homes/jl009/.conda/envs/AudioEditor
249 | 
250 | 


--------------------------------------------------------------------------------
/wavcraft/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/__init__.py


--------------------------------------------------------------------------------
/wavcraft/apis.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torchaudio
  3 | import requests
  4 | import math
  5 | import numpy as np
  6 | import soundfile as sf
  7 | import pyloudnorm as pyln
  8 | from scipy.io.wavfile import write
  9 | from retrying import retry
 10 | from gradio_client import Client
 11 | from audiomentations import AddGaussianSNR, LowPassFilter, HighPassFilter, ApplyImpulseResponse, RoomSimulator
 12 | 
 13 | from wavcraft.utils import get_service_port, get_service_url, get_path_from_target_dir, generate_random_series
 14 |  
 15 | 
 16 | os.environ['OPENBLAS_NUM_THREADS'] = '1'
 17 | SAMPLE_RATE = 16000  # 32000 is NOT supported by wavmark
 18 | 
 19 | localhost_addr = get_service_url()
 20 | 
 21 | 
 22 | def _LOUDNESS_NORM(wav, volume=-25, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
 23 |     """
 24 |     Nomalize waveform and adjust the loadness as per BS.1770.
 25 |     """
 26 |     # peak normalize wav to -1 dB
 27 |     peak_normalized_wav = pyln.normalize.peak(wav, -10.0)
 28 |     # measure the loudness first 
 29 |     meter = pyln.Meter(sr) # create BS.1770 meter
 30 |     loudness = meter.integrated_loudness(peak_normalized_wav)
 31 |     # loudness normalize wav to -12 dB LUFS
 32 |     normalized_wav = pyln.normalize.loudness(peak_normalized_wav, loudness, volume)
 33 | 
 34 |     return normalized_wav
 35 | 
 36 | 
 37 | def _READ_AUDIO_NUMPY(wav, sr=SAMPLE_RATE):
 38 |     """
 39 |     Read audio numpy 
 40 |     Returns: 
 41 |         np.array [samples]
 42 |     """
 43 |     waveform, sample_rate = torchaudio.load(wav)
 44 | 
 45 |     if sample_rate != sr:
 46 |         waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=sr)
 47 |     
 48 |     wav_numpy = waveform[0].numpy()
 49 | 
 50 |     return wav_numpy
 51 | 
 52 | 
 53 | def _WRITE_AUDIO(wav, name=None, sr=SAMPLE_RATE):
 54 |     """
 55 |     Write audio numpy to .wav file
 56 |     Params:
 57 |         wav: np.array [samples]
 58 |     """   
 59 |     if name is None:
 60 |         name = 'output.wav' 
 61 |     
 62 |     if len(wav.shape) > 1:
 63 |         wav = wav[0]
 64 | 
 65 |     # declipping
 66 |     max_value = np.max(np.abs(wav)) if wav.size > 0 else 0
 67 |     if max_value > 1:
 68 |         wav *= 0.9 / (max_value + 1e-5)
 69 |     
 70 |     # write audio
 71 |     write(name, sr, np.round(wav*32767).astype(np.int16))
 72 | 
 73 | 
 74 | def LEN(wav, sr=SAMPLE_RATE):
 75 |     """
 76 |     Returns the duration of audio in seconds.
 77 |     """
 78 |     wav= _READ_AUDIO_NUMPY(wav)
 79 | 
 80 |     return len(wav) / sr
 81 | 
 82 | 
 83 | # def OUTPUT(wav, out_wav="output.wav"):
 84 | #     output_wav = get_path_from_target_dir(out_wav, wav)
 85 | #     os.rename(wav, output_wav)
 86 | #     print(f'Done all processes, result: {output_wav}')
 87 | #     return output_wav
 88 | 
 89 | 
 90 | def OUTPUT(wav, out_wav="output.wav", sr=SAMPLE_RATE):
 91 |     # Add watermark to the generated audio
 92 |     _tmp_wav = _ENCODE_WATERMARK(wav, sample_rate=sr)
 93 |     
 94 |     output_wav = get_path_from_target_dir(out_wav, _tmp_wav)
 95 |     os.rename(_tmp_wav, output_wav)
 96 |     print(f'Done all processes, result: {output_wav}')
 97 |     return output_wav
 98 | 
 99 | 
100 | """     DSP modules     """
101 | def SPLIT(wav_path, break_points=[], out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
102 |     """
103 |     Split audio into several pieces according to the breakpoints.
104 |     Params:
105 |         break_points: list[float]: a list of breakpoints (in seconds)
106 |     Returns:
107 |         Path to output wav file.
108 |     """
109 |     # Avoid `breakpoint` containing 0
110 |     break_points = [p for p in break_points if p != 0]
111 |     num_pieces = len(break_points) + 1
112 | 
113 |     prefix = out_wav.split(".")[0]
114 | 
115 |     wav = _READ_AUDIO_NUMPY(wav_path)
116 | 
117 |     results = []
118 |     for i in range(num_pieces):
119 |         onset = break_points[i - 1] * sr if i > 0 else 0
120 |         offset = break_points[i] * sr if i < len(break_points) else len(wav)
121 | 
122 |         _o_wav = get_path_from_target_dir(prefix+f"_{i}.wav", wav_path)
123 |         _WRITE_AUDIO(wav[int(onset):int(offset)], name=_o_wav)
124 |         results.append(_o_wav)
125 | 
126 |     return results
127 | 
128 | 
129 | def MIX(wavs=[['1.wav', 0.], ['2.wav', 0.]], out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
130 |     """
131 |     Mix multiple audio clips by considering their onset time.
132 |     Returns:
133 |         Path to output wav file.
134 |     """
135 |     max_length = max([int(wav[1]*sr + len(_READ_AUDIO_NUMPY(wav[0]))) for wav in wavs])
136 |     template_wav = np.zeros(max_length)
137 | 
138 |     for wav in wavs:
139 |         cur_name, cur_onset = wav
140 |         cur_wav = _READ_AUDIO_NUMPY(cur_name)
141 |         cur_len = len(cur_wav)
142 |         cur_onset = int(cur_onset * sr)
143 |         
144 |         # mix
145 |         template_wav[cur_onset:cur_onset+cur_len] += cur_wav
146 |     
147 |     out_wav = get_path_from_target_dir(out_wav, wavs[0][0])
148 |     _WRITE_AUDIO(template_wav, name=out_wav)
149 |     return out_wav
150 | 
151 | 
152 | def CAT(wavs, out_wav=generate_random_series()+'.wav'):
153 |     """
154 |     Concat multiple audio clips together.
155 |     Params:
156 |         wavs: List of wav file ['1.wav', '2.wav', ...]
157 |     """
158 |     wav_num = len(wavs)
159 | 
160 |     segment0 = _READ_AUDIO_NUMPY(wavs[0])
161 | 
162 |     cat_wav = segment0
163 | 
164 |     if wav_num > 1:
165 |         for i in range(1, wav_num):
166 |             next_wav = _READ_AUDIO_NUMPY(wavs[i])
167 |             cat_wav = np.concatenate((cat_wav, next_wav), axis=-1)
168 | 
169 |     out_wav = get_path_from_target_dir(out_wav, wavs[0])
170 |     _WRITE_AUDIO(cat_wav, name=out_wav)
171 |     return out_wav
172 | 
173 | 
174 | def ADJUST_VOL(wav_path, volume, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
175 |     """
176 |     Adjust the volume of waveform by `volume`.
177 |     """
178 |     wav, sample_rate = torchaudio.load(wav_path)
179 | 
180 |     if sample_rate != sr:
181 |         wav = torchaudio.functional.resample(wav, orig_freq=sample_rate, new_freq=sr)
182 |     
183 |     adj_vol_fn = torchaudio.transforms.Vol(gain=volume, gain_type="db")
184 |     wav = adj_vol_fn(wav)
185 | 
186 |     # write audio
187 |     wav = wav[0].numpy()  # convert to numpy
188 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
189 |     write(out_wav, sr, np.round(wav*32767).astype(np.int16))
190 |     return out_wav
191 | 
192 | 
193 | # def INC_VOL(wav_path, volume, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
194 | #     """
195 | #     Increase the volume of waveform by `volume`.
196 | #     """
197 | #     wav = _READ_AUDIO_NUMPY(wav_path)
198 | #     # measure the loudness first 
199 | #     meter = pyln.Meter(sr) # create BS.1770 meter
200 | #     loudness = meter.integrated_loudness(wav)
201 | #     # loudness normalize audio to the desired dB LUFS
202 | #     volume += loudness
203 | #     wav = pyln.normalize.loudness(wav, loudness, volume)
204 | 
205 | #     # write audio
206 | #     out_wav = get_path_from_target_dir(out_wav, wav_path)
207 | #     write(out_wav, sr, np.round(wav*32767).astype(np.int16))
208 | #     return out_wav
209 | 
210 | 
211 | # def DEC_VOL(wav_path, volume, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
212 | #     """
213 | #     Decrease the volume of waveform by `volume`.
214 | #     """
215 | #     wav = _READ_AUDIO_NUMPY(wav_path)
216 | #     # measure the loudness first 
217 | #     meter = pyln.Meter(sr) # create BS.1770 meter
218 | #     loudness = meter.integrated_loudness(wav)
219 | #     # loudness normalize audio to the desired dB LUFS
220 | #     volume -= loudness
221 | #     wav = pyln.normalize.loudness(wav, loudness, volume)
222 | 
223 | #     # write audio
224 | #     out_wav = get_path_from_target_dir(out_wav, wav_path)
225 | #     write(out_wav, sr, np.round(wav*32767).astype(np.int16))
226 | #     return out_wav
227 | 
228 | 
229 | def ADD_NOISE(wav_path, min_snr_db=5.0, max_snr_db=40.0, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
230 |     wav = _READ_AUDIO_NUMPY(wav_path)
231 |     transform = AddGaussianSNR(
232 |         min_snr_db=min_snr_db,
233 |         max_snr_db=max_snr_db,
234 |         p=1.0
235 |     )
236 | 
237 |     augmented_sound = transform(wav, sample_rate=sr)
238 | 
239 |     # write audio
240 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
241 |     _WRITE_AUDIO(augmented_sound, name=out_wav)
242 |     return out_wav
243 | 
244 | 
245 | def LOW_PASS(wav_path, min_cutoff_freq=150.0, max_cutoff_freq=7500.0, min_rolloff=12, max_rolloff=24, zero_phase=False, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
246 |     wav = _READ_AUDIO_NUMPY(wav_path)
247 |     transform = LowPassFilter(
248 |         min_cutoff_freq=min_cutoff_freq,
249 |         max_cutoff_freq=max_cutoff_freq,
250 |         min_rolloff=min_rolloff,
251 |         max_rolloff=max_rolloff,
252 |         zero_phase=zero_phase,
253 |         p=1.0
254 |     )
255 | 
256 |     augmented_sound = transform(wav, sample_rate=sr)
257 | 
258 |     # write audio
259 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
260 |     _WRITE_AUDIO(augmented_sound, name=out_wav)
261 |     return out_wav
262 | 
263 | 
264 | def HIGH_PASS(wav_path, min_cutoff_freq=20, max_cutoff_freq=2400, min_rolloff=12, max_rolloff=24, zero_phase=False, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
265 |     wav = _READ_AUDIO_NUMPY(wav_path)
266 |     transform = HighPassFilter(
267 |         min_cutoff_freq=min_cutoff_freq,
268 |         max_cutoff_freq=max_cutoff_freq,
269 |         min_rolloff=min_rolloff,
270 |         max_rolloff=max_rolloff,
271 |         zero_phase=zero_phase,
272 |         p=1.0
273 |     )
274 | 
275 |     augmented_sound = transform(wav, sample_rate=sr)
276 | 
277 |     # write audio
278 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
279 |     _WRITE_AUDIO(augmented_sound, name=out_wav)
280 |     return out_wav
281 | 
282 | 
283 | def ADD_RIR(wav_path, ir_path=None, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
284 |     wav = _READ_AUDIO_NUMPY(wav_path)
285 | 
286 |     transform = ApplyImpulseResponse(ir_path=ir_path, p=1.0)    
287 |     augmented_sound = transform(wav, sample_rate=sr)
288 | 
289 |     # write audio
290 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
291 |     _WRITE_AUDIO(augmented_sound, name=out_wav)
292 |     return out_wav
293 | 
294 | 
295 | def ROOM_SIMULATE(wav_path, min_size_x=3.6, max_size_x=5.6, 
296 |             min_size_y=3.6, max_size_y=3.9, 
297 |             min_size_z=2.4, max_size_z=3.0, 
298 |             min_absorption_value=0.075, max_absorption_value=0.4,
299 |             min_source_x=0.1, max_source_x=3.5,
300 |             min_source_y=0.1, max_source_y=2.7,
301 |             min_source_z=1.0, max_source_z=2.1,
302 |             min_mic_distance=0.15, max_mic_distance=0.35,
303 |             min_mic_azimuth=-math.pi, max_mic_azimuth=math.pi,
304 |             min_mic_elevation=-math.pi, max_mic_elevation=math.pi,
305 |             out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
306 |     wav = _READ_AUDIO_NUMPY(wav_path)
307 | 
308 |     transform = RoomSimulator(
309 |         min_size_x=min_size_x, max_size_x=max_size_x,
310 |         min_size_y=min_size_y, max_size_y=max_size_y,
311 |         min_size_z=min_size_z, max_size_z=max_size_z,
312 |         min_absorption_value=min_absorption_value, max_absorption_value=max_absorption_value,
313 |         min_source_x=min_source_x, max_source_x=max_source_x,
314 |         min_source_y=min_source_y, max_source_y=max_source_y,
315 |         min_source_z=min_source_z, max_source_z=max_source_z,
316 |         min_mic_distance=min_mic_distance, max_mic_distance=max_mic_distance,
317 |         min_mic_azimuth=min_mic_azimuth, max_mic_azimuth=max_mic_azimuth,
318 |         min_mic_elevation=min_mic_elevation, max_mic_elevation=max_mic_elevation,
319 |         p=1.0)
320 |     augmented_sound = transform(wav, sample_rate=sr)
321 | 
322 |     # write audio
323 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
324 |     _WRITE_AUDIO(augmented_sound, name=out_wav)
325 |     return out_wav
326 |     
327 | 
328 | # def CLIP(wav_path, offset, onset=0, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
329 | #     """
330 | #     Clip the audio using onset and offset time.
331 | #     Params:
332 | #         onset/offset: onset/offset time in seconds.
333 | #     Returns:
334 | #         Path to output wav file.
335 | #     """
336 | #     wav = _READ_AUDIO_NUMPY(wav_path)
337 | 
338 | #     # Get onset/offset with samples rates
339 | #     onset *= SAMPLE_RATE
340 | #     offset *= SAMPLE_RATE
341 | #     assert 0 <= onset <= offset <= len(wav)
342 | 
343 | #     out_wav = get_path_from_target_dir(out_wav, wav_path)
344 | #     _WRITE_AUDIO(wav[int(onset):int(offset)], name=out_wav)
345 | #     return out_wav
346 | 
347 | 
348 | """     Deep-learning modules     """
349 | @retry(stop_max_attempt_number=5, wait_fixed=2000)
350 | def AU(wav_path, text="write an audio caption describing the sound"):
351 |     HF_key = os.environ.get("HF_KEY")
352 |     client = Client("https://yuangongfdu-ltu.hf.space/", hf_token=HF_key)
353 |     response = client.predict(
354 |         wav_path,
355 |         "write an audio caption describing the sound",
356 |         api_name="/predict",
357 |     )
358 |     return response
359 |     
360 | 
361 | @retry(stop_max_attempt_number=5, wait_fixed=2000)
362 | def TTM(text, melody=None, length=10, volume=-28, out_wav=generate_random_series()+'.wav', sr=SAMPLE_RATE):
363 |     service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT")
364 |     url = f'http://{localhost_addr}:{service_port}/generate_music'
365 | 
366 |     # Change the name if file exist
367 |     if os.path.exists(out_wav):
368 |         out_wav = generate_random_series() + '.wav'
369 |         
370 |     data = {
371 |         'text': f'{text}',
372 |         'melody': melody,
373 |         'length': f'{length}',
374 |         'volume': f'{volume}',
375 |         'sample_rate': f'{sr}',
376 |         'output_wav': f'{out_wav}',
377 |     }
378 | 
379 |     response = requests.post(url, json=data)
380 | 
381 |     if response.status_code == 200:
382 |         print('Success:', response.json()['message'])
383 |         return out_wav
384 |     else:
385 |         print('Error:', response.json()['API error'])
386 |         raise RuntimeError(response.json()['API error'])
387 | 
388 | 
389 | @retry(stop_max_attempt_number=5, wait_fixed=2000)
390 | def TTA(text, length=5, volume=-35, out_wav=generate_random_series()+'.wav'):
391 |     service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT")
392 |     url = f'http://{localhost_addr}:{service_port}/generate_audio'
393 | 
394 |     # Change the name if file exist
395 |     if os.path.exists(out_wav):
396 |         out_wav = generate_random_series() + '.wav'
397 | 
398 |     data = {
399 |         'text': f'{text}',
400 |         'length': f'{length}',
401 |         'volume': f'{volume}',
402 |         'output_wav': f'{out_wav}',
403 |     }
404 | 
405 |     response = requests.post(url, json=data)
406 |     if response.status_code == 200:
407 |         print('Success:', response.json()['message'])
408 |         return out_wav
409 |     else:
410 |         print('Error:', response.json()['API error'])
411 |         raise RuntimeError(response.json()['API error'])
412 | 
413 | 
414 | @retry(stop_max_attempt_number=5, wait_fixed=2000)
415 | def TTS(text, speaker="Male1_En", volume=-20, out_wav=generate_random_series()+'.wav'):
416 |     service_port = get_service_port("AUDIOCRAFT_SERVICE_PORT")
417 |     url = f'http://{localhost_addr}:{service_port}/generate_speech'
418 | 
419 |     # Change the name if file exist
420 |     if os.path.exists(out_wav):
421 |         out_wav = generate_random_series() + '.wav'
422 | 
423 |     data = {
424 |     'text': f'{text}',
425 |     'speaker_id': f'{speaker}',
426 |     'volume': f'{volume}',
427 |     'output_wav': f'{out_wav}',
428 |     }
429 | 
430 |     response = requests.post(url, json=data)
431 | 
432 |     if response.status_code == 200:
433 |         print('Success:', response.json()['message'])
434 |         return out_wav
435 |     else:
436 |         print('Error:', response.json()['API error'])
437 |         raise RuntimeError(response.json()['API error'])
438 | 
439 | 
440 | @retry(stop_max_attempt_number=5, wait_fixed=2000)
441 | def SR(wav_path, out_wav=generate_random_series()+'.wav', ddim_steps=50, guidance_scale=3.5, seed=42):
442 |     service_port = get_service_port("AUDIOSR_SERVICE_PORT")
443 |     url = f'http://{localhost_addr}:{service_port}/super_resolution'
444 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
445 |     data = {
446 |         'wav_path': f'{wav_path}',
447 |         'ddim_steps': f'{ddim_steps}',
448 |         'guidance_scale': f'{guidance_scale}',
449 |         'seed': f'{seed}',
450 |         'output_wav':f'{out_wav}'
451 |     }
452 |     response = requests.post(url, json=data)
453 | 
454 |     if response.status_code == 200:
455 |         print('Success:', response.json()['message'])
456 |         return out_wav
457 |     else:
458 |         print('Error:', response.json()['API error'])
459 |         raise RuntimeError(response.json()['API error'])
460 | 
461 | 
462 | # @retry(stop_max_attempt_number=5, wait_fixed=2000)
463 | # def VP(wav_path, out_dir):
464 | #     url = f'http://{localhost_addr}:{service_port}/parse_voice'
465 | #     data = {
466 | #         'wav_path': f'{wav_path}', 
467 | #         'out_dir':f'{out_dir}'
468 | #     }
469 | 
470 | #     response = requests.post(url, json=data)
471 | 
472 | #     if response.status_code == 200:
473 | #         print('Success:', response.json()['message'])
474 | #     else:
475 | #         print('Error:', response.json()['API error'])
476 | #         raise RuntimeError(response.json()['API error'])
477 |     
478 | 
479 | # @retry(stop_max_attempt_number=5, wait_fixed=2000)
480 | # def EXTRACT(wav_path, text, out_wav=generate_random_series()+'.wav'):
481 | #     service_port = get_service_port("AUDIOSEP_SERVICE_PORT")
482 | #     url = f'http://{localhost_addr}:{service_port}/source_separate'
483 | #     out_wav = get_path_from_target_dir(out_wav, wav_path)
484 | #     data = {
485 | #         'wav_path': f'{wav_path}', 
486 | #         'text': f'{text}',
487 | #         'output_wav':f'{out_wav}'
488 | #     }
489 | 
490 | #     response = requests.post(url, json=data)
491 | 
492 | #     if response.status_code == 200:
493 | #         filedir, filename = os.path.split(out_wav)
494 | #         fg_filepath = os.path.join(filedir, "fg_"+filename)
495 | #         bg_filepath = os.path.join(filedir, "bg_"+filename)
496 | #         os.rename(fg_filepath, out_wav)
497 | #         os.remove(bg_filepath)
498 | #         print('Success:', response.json()['message'])
499 | #         return out_wav
500 | #     else:
501 | #         print('Error:', response.json()['API error'])
502 | #         raise RuntimeError(response.json()['API error'])
503 |     
504 | 
505 | # @retry(stop_max_attempt_number=5, wait_fixed=2000)
506 | # def DROP(wav_path, text, out_wav=generate_random_series()+'.wav'):
507 | #     service_port = get_service_port("AUDIOSEP_SERVICE_PORT")
508 | #     url = f'http://{localhost_addr}:{service_port}/source_separate'
509 | #     out_wav = get_path_from_target_dir(out_wav, wav_path)
510 | #     data = {
511 | #         'wav_path': f'{wav_path}', 
512 | #         'text': f'{text}',
513 | #         'output_wav':f'{out_wav}'
514 | #     }
515 | 
516 | #     response = requests.post(url, json=data)
517 | 
518 | #     if response.status_code == 200:
519 | #         filedir, filename = os.path.split(out_wav)
520 | #         fg_filepath = os.path.join(filedir, "fg_"+filename)
521 | #         bg_filepath = os.path.join(filedir, "bg_"+filename)
522 | #         os.rename(bg_filepath, out_wav)
523 | #         os.remove(fg_filepath)
524 | #         print('Success:', response.json()['message'])
525 | #         return out_wav
526 | #     else:
527 | #         print('Error:', response.json()['API error'])
528 | #         raise RuntimeError(response.json()['API error'])
529 |     
530 | 
531 | @retry(stop_max_attempt_number=5, wait_fixed=2000)
532 | def TSS(wav_path, text, out_wav=generate_random_series()+'.wav'):
533 |     service_port = get_service_port("AUDIOSEP_SERVICE_PORT")
534 |     url = f'http://{localhost_addr}:{service_port}/source_separate'
535 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
536 |     data = {
537 |         'wav_path': f'{wav_path}', 
538 |         'text': f'{text}',
539 |         'output_wav':f'{out_wav}'
540 |     }
541 | 
542 |     response = requests.post(url, json=data)
543 | 
544 |     if response.status_code == 200:
545 |         filedir, filename = os.path.split(out_wav)
546 |         fg_filepath = os.path.join(filedir, "fg_"+filename)
547 |         bg_filepath = os.path.join(filedir, "bg_"+filename)
548 |         print('Success:', response.json()['message'])
549 |         return fg_filepath, bg_filepath
550 |     else:
551 |         print('Error:', response.json()['API error'])
552 |         raise RuntimeError(response.json()['API error'])
553 |     
554 | 
555 | @retry(stop_max_attempt_number=5, wait_fixed=2000)
556 | def INPAINT(wav_path, text, onset, offset, duration, guidance_scale=2.5, ddim_steps=200, random_seed=42, sample_rate=SAMPLE_RATE, out_wav=generate_random_series()+'.wav',):
557 |     service_port = get_service_port("AUDIOLDM_SERVICE_PORT")
558 |     url = f'http://{localhost_addr}:{service_port}/audio_inpaint'
559 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
560 |     data = {
561 |         'wav_path': f'{wav_path}', 
562 |         'text': f'{text}',
563 |         'onset': onset,
564 |         'offset': offset,
565 |         'duration': duration,
566 |         'output_wav':f'{out_wav}',
567 |         # generation settings
568 |         'sample_rate': sample_rate,
569 |         'guidance_scale': guidance_scale,
570 |         'ddim_steps': ddim_steps,
571 |         'random_seed': random_seed,
572 |     }
573 | 
574 |     response = requests.post(url, json=data)
575 | 
576 |     if response.status_code == 200:
577 |         print('Success:', response.json()['message'])
578 |         return out_wav
579 |     else:
580 |         print('Error:', response.json()['API error'])
581 |         raise RuntimeError(response.json()['API error'])
582 | 
583 | 
584 | def _ENCODE_WATERMARK(wav_path, sample_rate=SAMPLE_RATE, out_wav=generate_random_series()+'.wav',):
585 |     service_port = get_service_port("WAVMARK_SERVICE_PORT")
586 |     url = f'http://{localhost_addr}:{service_port}/audio_watermark'
587 |     out_wav = get_path_from_target_dir(out_wav, wav_path)
588 |     data = {
589 |         'wav_path': f'{wav_path}', 
590 |         'action': "encode",
591 |         'output_wav':f'{out_wav}',
592 |         'sample_rate': sample_rate,
593 |     }
594 | 
595 |     response = requests.post(url, json=data)
596 | 
597 |     if response.status_code == 200:
598 |         print('Success:', response.json()['message'])
599 |         return out_wav
600 |     else:
601 |         print('Error:', response.json()['API error'])
602 |         raise RuntimeError(response.json()['API error'])
603 |     
604 | 
605 | def _DECODE_WATERMARK(wav_path, sample_rate=SAMPLE_RATE):
606 |     service_port = get_service_port("WAVMARK_SERVICE_PORT")
607 |     url = f'http://{localhost_addr}:{service_port}/audio_watermark'
608 |     data = {
609 |         'wav_path': f'{wav_path}', 
610 |         'action': "decode",
611 |         'sample_rate': sample_rate,
612 |     }
613 | 
614 |     response = requests.post(url, json=data)
615 | 
616 |     if response.status_code == 200:
617 |         print('Success:', response.json()['message'])
618 |         return wav_path
619 |     else:
620 |         print('Error:', response.json()['API error'])
621 |         raise RuntimeError(response.json()['API error'])
622 |     


--------------------------------------------------------------------------------
/wavcraft/configs.yaml:
--------------------------------------------------------------------------------
 1 | AudioCraft:
 2 |   # MusicGen 
 3 |   ttm_model_size: melody # [small, medium, large]
 4 |   # AudioGen 
 5 |   tta_model_size: medium # [medium]
 6 | 
 7 | Text-to-Speech:
 8 |   # Bark
 9 |   speed: 1.05
10 | 
11 | Speech-Restoration:
12 |   # VoiceFixer
13 |   Enable: True
14 | 
15 | AudioLDM:
16 |   model_size: audioldm-m-full


--------------------------------------------------------------------------------
/wavcraft/ffmpeg_engineer.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import time
  4 | import glob
  5 | import pickle
  6 | import openai
  7 | 
  8 | 
  9 | class FFmpegEngineer:
 10 |     OPENAI_KEY = os.environ.get('OPENAI_KEY')
 11 | 
 12 |     def __init__(self, use_openai_cache = False,):
 13 |         self.use_openai_cache = False
 14 |         self.openai_cache = []
 15 |         if self.use_openai_cache:
 16 |             os.makedirs('cache', exist_ok=True)
 17 |             for cache_file in glob.glob('cache/*.pkl'):
 18 |                 with open(cache_file, 'rb') as file:
 19 |                     self.openai_cache.append(pickle.load(file))
 20 | 
 21 |         self.history = [
 22 |             {
 23 |                 "role": "system",
 24 |                 "content": "You are a helpful assistant."
 25 |             },
 26 |             ]
 27 | 
 28 |     def complete(self, prompt, model="gpt-4", api_key=""):
 29 |         content = self.chat_with_gpt(prompt, model, api_key)
 30 |         content = self.extract_ffmpeg_command(self.try_extract_content_from_quotes(content))
 31 |         self.execute_code(content)
 32 | 
 33 | 
 34 |     def chat_with_gpt(self, prompt, model="gpt-4", api_key=""):
 35 |         api_key = self.OPENAI_KEY if not api_key else api_key
 36 | 
 37 |         if self.use_openai_cache:
 38 |             filtered_object = list(filter(lambda x: x['prompt'] == prompt, self.openai_cache))
 39 |             if len(filtered_object) > 0:
 40 |                 response = filtered_object[0]['response']
 41 |                 return response
 42 |         
 43 |         self.history.append(
 44 |             {
 45 |                 "role": "user",
 46 |                 "content": prompt
 47 |             },
 48 |             )
 49 | 
 50 |         try:
 51 |             openai.api_key = api_key
 52 |             chat = openai.ChatCompletion.create(
 53 |                 model=model,  # "gpt-3.5-turbo",
 54 |                 messages=self.history,
 55 |             )
 56 |             response = chat['choices'][0]['message']['content']
 57 | 
 58 |             self.history.append(
 59 |                 {
 60 |                     "role": "system",
 61 |                     "content": response
 62 |                 },
 63 |                 )
 64 |             
 65 |         finally:
 66 |             openai.api_key = ''
 67 | 
 68 |         if self.use_openai_cache:
 69 |             cache_obj = {
 70 |                 'prompt': prompt,
 71 |                 'response': response,
 72 |             }
 73 |             with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache:
 74 |                 pickle.dump(cache_obj, _openai_cache)
 75 |                 self.openai_cache.append(cache_obj)
 76 | 
 77 |         return response
 78 |     
 79 | 
 80 |     def reset(self,):
 81 |         self.history = []
 82 | 
 83 | 
 84 |     @classmethod
 85 |     def _extract_substring_with_quotes(cls, input_string, quotes="'''"):
 86 |         pattern = f"{quotes}(.*?){quotes}"
 87 |         matches = re.findall(pattern, input_string, re.DOTALL)
 88 |         return matches
 89 | 
 90 |     @classmethod
 91 |     def extract_ffmpeg_command(cls, input_string):
 92 |         # Split the string into lines
 93 |         lines = input_string.split('\n')
 94 | 
 95 |         # Find the index where the 'ffmpeg' command starts
 96 |         start_index = next((i for i, line in enumerate(lines) if 'ffmpeg' in line), None)
 97 | 
 98 |         # Extract lines from the start of the 'ffmpeg' command till the end or a specific end pattern
 99 |         if start_index is not None:
100 |             ffmpeg_lines = lines[start_index:]
101 |             # ffmpeg_lines = ffmpeg_lines[:end_index]
102 |             return '\n'.join(ffmpeg_lines)
103 |         else:
104 |             return ""
105 | 
106 |     def try_extract_content_from_quotes(self, content):
107 |         if "'''" in content:
108 |             return self._extract_substring_with_quotes(content)[0]
109 |         elif "```" in content:
110 |             return self._extract_substring_with_quotes(content, quotes="```")[0]
111 |         else:
112 |             return content
113 |         
114 |     def execute_code(self, content):
115 |         os.system(content)
116 | 
117 | 
118 | """     Test     """
119 | if __name__ == "__main__":
120 |     import os
121 | 
122 |     eng = FFmpegEngineer()
123 | 
124 |     prompt = "Using bash to check the version of ffmpeg in linux."
125 |     model = "gpt-4"
126 |     OPENAI_KEY = "your_key"  # can be set using `${OPENAI_KEY}` from env
127 | 
128 |     # For examin the func you can use the following lines
129 |     # response = eng.chat_with_gpt(prompt, model, api_key)
130 |     # print(response)
131 |     # code = eng.extract_ffmpeg_command(eng.try_extract_content_from_quotes(response))
132 |     # print(code)
133 |     # eng.execute_code(code)
134 |     # Ones can replace the above lines with one line:
135 |     eng.complete(prompt, model="gpt-4", api_key=OPENAI_KEY)
136 | 


--------------------------------------------------------------------------------
/wavcraft/mistral_api.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from typing import List
 3 | from transformers import AutoModelForCausalLM, AutoTokenizer
 4 | 
 5 | 
 6 | class _chatLLM:
 7 |     def __init__(self, model_id):
 8 |         self.messages = []
 9 |         self.model, self.tokenizer = self.build_model(model_id)
10 | 
11 |     def create_new_session(self):
12 |         self.messages = []
13 | 
14 | 
15 | class ChatMistral(_chatLLM):
16 |     def __init__(self, model_id="mistralai/Mistral-7B-Instruct-v0.2"):
17 |         super().__init__(model_id)
18 |         self.device = self.model.device
19 | 
20 | 
21 |     def get_response(self, prompt: str):
22 |         self.messages.append({"role": "user", "content": prompt})
23 | 
24 |         encodes = self.tokenizer.apply_chat_template(self.messages, return_tensors="pt").to(self.device)
25 | 
26 |         generated_ids = self.model.generate(encodes, max_new_tokens=1000, do_sample=True)
27 |         decoded = self.tokenizer.batch_decode(generated_ids)[-1]
28 | 
29 |         response = self.extract_response(decoded)
30 |         self.messages.append({"role": "assistant", "content": response})  # Update conversation
31 | 
32 |         return response
33 |     
34 | 
35 |     def build_model(self, model_id="mistralai/Mistral-7B-Instruct-v0.2"):
36 |         from transformers import BitsAndBytesConfig
37 |         quantization_config = BitsAndBytesConfig(
38 |             # 8-bit quantization
39 |             load_in_8bit=True,
40 |             # 4-bit quantization
41 |             # load_in_4bit=True,
42 |             # bnb_4bit_quant_type="nf4",
43 |             # bnb_4bit_compute_dtype=torch.float16,
44 |             )
45 | 
46 |         model = AutoModelForCausalLM.from_pretrained(
47 |             model_id,
48 |             torch_dtype=torch.float16, 
49 |             # attn_implementation="flash_attention_2",  # NOTE: cannot use with V100
50 |             quantization_config=quantization_config,
51 |             device_map="auto")
52 |         
53 |         # [Updated 23-04-2024] mistral built-in tokenizer has the same function as huggingface tokenizer
54 |         # Ones can stick to huggingface tokenizer UNLESS tool calling is required.
55 |         tokenizer = AutoTokenizer.from_pretrained(model_id)
56 | 
57 |         return model, tokenizer
58 | 
59 | 
60 |     def extract_response(self, responses):
61 |         # Split the interaction by "</s>" to separate each round, and take the last non-empty round if any
62 |         rounds = [r for r in responses.split("</s>") if r.strip()]
63 |         # Split response by "[/INST]"
64 |         last_reponse = rounds[-1].strip().split("[/INST]")[-1].strip()
65 | 
66 |         return last_reponse
67 | 
68 | if __name__ == "__main__":
69 |     llm = ChatMistral()
70 |     llm.messages = [
71 |         {"role": "user", "content": "What is your favourite condiment?"},
72 |         {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
73 |     ]
74 |     re = llm.get_response("Do you have mayonnaise recipes?")
75 |     print(re)
76 |     import ipdb; ipdb.set_trace()
77 | 


--------------------------------------------------------------------------------
/wavcraft/pipeline.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import os
  3 | import openai
  4 | import re
  5 | import glob
  6 | import pickle
  7 | import time
  8 | import random
  9 | import string
 10 | from retrying import retry
 11 | from glob import glob
 12 | 
 13 | from wavcraft.mistral_api import ChatMistral
 14 | import wavcraft.utils as utils
 15 | 
 16 | 
 17 | # Enable this for debugging
 18 | USE_OPENAI_CACHE = False
 19 | openai_cache = []
 20 | if USE_OPENAI_CACHE:
 21 |     os.makedirs('cache', exist_ok=True)
 22 |     for cache_file in glob.glob('cache/*.pkl'):
 23 |         with open(cache_file, 'rb') as file:
 24 |             openai_cache.append(pickle.load(file))
 25 | 
 26 | 
 27 | # Global vars
 28 | chat_history = []
 29 | local_llm = None
 30 | 
 31 | 
 32 | def chat_with_gpt(api_key, model="gpt-4"):
 33 |     #"gpt-4",  # "gpt-3.5-turbo"
 34 |     global chat_history
 35 | 
 36 |     if USE_OPENAI_CACHE:
 37 |         filtered_object = list(filter(lambda x: x['prompt'] == chat_history[-1]["content"], openai_cache))
 38 |         if len(filtered_object) > 0:
 39 |             response = filtered_object[0]['response']
 40 |             return response
 41 |     
 42 |     try:
 43 |         openai.api_key = api_key
 44 |         
 45 |         chat = openai.ChatCompletion.create(
 46 |             model=model,
 47 |             messages=chat_history,
 48 |         )
 49 |     finally:
 50 |         openai.api_key = ''
 51 | 
 52 |     if USE_OPENAI_CACHE:
 53 |         cache_obj = {
 54 |             'prompt': chat_history[-1]["content"],
 55 |             'response': chat['choices'][0]['message']['content']
 56 |         }
 57 |         with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache:
 58 |             pickle.dump(cache_obj, _openai_cache)
 59 |             openai_cache.append(cache_obj)
 60 | 
 61 |     chat_history.append({
 62 |                     "role": "system",
 63 |                     "content": chat['choices'][0]['message']['content'],
 64 |                     })
 65 | 
 66 |     return chat['choices'][0]['message']['content']
 67 | 
 68 | 
 69 | # Assuming the existence of USE_OPENAI_CACHE, chat_history, and openai_cache similar to GPT function
 70 | def chat_with_mistral():  
 71 |     global chat_history
 72 |     
 73 |     if USE_OPENAI_CACHE:
 74 |         filtered_object = list(filter(lambda x: x['prompt'] == chat_history[-1]["content"], openai_cache))
 75 |         if len(filtered_object) > 0:
 76 |             return filtered_object[0]['response']
 77 | 
 78 |     global local_llm 
 79 |     # import ipdb; ipdb.set_trace()
 80 |     local_llm.messages = chat_history[:-1]
 81 |     try:
 82 |         response = local_llm.get_response(chat_history[-1]["content"])
 83 |     finally:
 84 |         pass  
 85 |     
 86 |     if USE_OPENAI_CACHE:
 87 |         cache_obj = {
 88 |             'prompt': chat_history[-1]["content"],
 89 |             'response': response
 90 |         }
 91 |         with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache:
 92 |             pickle.dump(cache_obj, _openai_cache)
 93 |             openai_cache.append(cache_obj)
 94 | 
 95 |     chat_history.append({
 96 |         "role": "assistant",
 97 |         "content": response,
 98 |     })
 99 | 
100 |     return response
101 | 
102 | 
103 | def get_file_content(filename):
104 |     with open(filename, 'r') as file:
105 |         return file.read().strip()
106 | 
107 | 
108 | def write_to_file(filename, content):
109 |     with open(filename, 'w') as file:
110 |         file.write(content)
111 | 
112 | 
113 | def extract_substring_with_quotes(input_string, quotes="'''"):
114 |     pattern = f"{quotes}(.*?){quotes}"
115 |     matches = re.findall(pattern, input_string, re.DOTALL)
116 |     return matches
117 | 
118 | 
119 | def maybe_remove_python_as_prefix(content):
120 |     keyword = "python"
121 |     
122 |     content = content.strip()
123 |     if content.startswith(keyword):
124 |         # Remove the keyword and strip leading/trailing whitespaces
125 |         return content[len(keyword):].strip()
126 |     return content
127 | 
128 | 
129 | def try_extract_content_from_quotes(content):
130 |     if "'''" in content:
131 |         return maybe_remove_python_as_prefix(extract_substring_with_quotes(content)[0])
132 |     elif "```" in content:
133 |         return maybe_remove_python_as_prefix(extract_substring_with_quotes(content, quotes="```")[0])
134 |     else:
135 |         return maybe_remove_python_as_prefix(content)
136 | 
137 | 
138 | def maybe_get_content_from_file(content_or_filename):
139 |     if os.path.exists(content_or_filename):
140 |         with open(content_or_filename, 'r') as file:
141 |             return file.read().strip()
142 |     return content_or_filename
143 | 
144 | 
145 | # Pipeline Interface Guidelines:
146 | #
147 | # Init calls:
148 | # - Init calls must be called before running the actual steps
149 | #   - init_session() is called every time a gradio webpage is loaded
150 | #
151 | # Single Step:
152 | # - takes input (file or content) and output path as input
153 | # - most of time just returns output content
154 | # 
155 | # Compositional Step:
156 | # - takes session_id as input (you have session_id, you have all the paths)
157 | # - run a series of steps
158 | 
159 | # This is called for every new gradio webpage
160 | 
161 | def init_session(session_id=''):
162 |     def uid8():
163 |         return ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
164 | 
165 |     if session_id == '':
166 |         session_id = f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{uid8()}'
167 |     # create the paths
168 |     os.makedirs(utils.get_session_audio_path(session_id))
169 |     print(f'New session created, session_id={session_id}')
170 |     return session_id
171 | 
172 | 
173 | @retry(stop_max_attempt_number=3)
174 | def _input_text_to_code_with_retry(log_path, api_key, model="gpt-4"):
175 |     print("    trying ...")
176 |     try:
177 |         if "mistral" in model:
178 |             code_response = chat_with_mistral()
179 |         elif "gpt" in model:
180 |             code_response = try_extract_content_from_quotes(chat_with_gpt(api_key, model))
181 |         else:
182 |             raise ValueError(f"Do not support {model}")
183 | 
184 |     except Exception as err:
185 |         global chat_history
186 |         chat_log = f'\n{chat_history}\n\nINPUT ERROR: {err}'
187 |         write_to_file(log_path, chat_log)
188 |         raise err
189 | 
190 |     return code_response
191 | 
192 | 
193 | wav_description = {}
194 | # [Basic] Step 1: input to py code
195 | def input_text_to_code(input_text, output_path, api_key, model="gpt-4"):
196 |     # Claim input audio in the instruction
197 |     input_description = "Input audio:\n"
198 |     n_input_wavs = len(glob(os.path.join(output_path, 'audio', 'input_*.wav')))
199 |     if n_input_wavs > 0:
200 |         for i in range(n_input_wavs):
201 |             in_wav = f"{output_path.absolute()}/audio/input_{i}.wav"
202 | 
203 |             if in_wav in wav_description:
204 |                 continue
205 |             else:
206 |                 input_description += f"INPUT_WAV{i}\n"
207 |                 # Add placeholder to the log as the basic does not need to describe the audio
208 |                 wav_description[in_wav] = ""
209 |     else:  # no input wav
210 |         input_description = ""
211 | 
212 |     input_text = maybe_get_content_from_file(input_text)
213 | 
214 |     log_path = output_path / 'chat.log'
215 |     if not os.path.exists(log_path):
216 |         text_to_audio_script_prompt = get_file_content('wavcraft/prompts/text_to_code.prompt')
217 |         prompt = f'{text_to_audio_script_prompt}\n\n{input_description}Instruction:\n{input_text}\nCode:\n'
218 |     else:
219 |         text_to_followup = get_file_content('wavcraft/prompts/text_to_followup.prompt')
220 |         prompt = f'{input_description}{text_to_followup}\n{input_text}'
221 | 
222 |     global chat_history
223 |     chat_history.append({
224 |         "role": "user",
225 |         "content": prompt,
226 |         })
227 |        
228 |     write_to_file(log_path, f"{chat_history}")
229 |     code_response = _input_text_to_code_with_retry(log_path, api_key, model)
230 |     executable_code_filename = output_path / 'audio_executable.py'
231 | 
232 |     write_to_file(executable_code_filename, code_response)
233 |     return code_response
234 | 
235 | 
236 | # [Inspiration] Step 1: input to py code
237 | def input_text_to_code_plus(input_text, output_path, api_key, model="gpt-4"):
238 |     import sys
239 | 
240 |     sys.path.append(os.path.dirname(__file__))
241 |     from wavcraft.apis import AU
242 | 
243 |     # Add description to the prompt
244 |     input_description = "Input audio:\n"
245 |     n_input_wavs = len(glob(os.path.join(output_path, 'audio', 'input_*.wav')))
246 |     for i in range(n_input_wavs):
247 |         in_wav = f"{output_path.absolute()}/audio/input_{i}.wav"
248 | 
249 |         if in_wav in wav_description:
250 |             continue
251 |         else:
252 |             response = AU(in_wav, text="write an audio caption describing the sound")
253 |             input_description += f"INPUT_WAV{i}: {response}\n"
254 |             # Add to the log
255 |             wav_description[in_wav] = response
256 | 
257 |     input_text = maybe_get_content_from_file(input_text)
258 | 
259 |     log_path = output_path / 'chat.log'
260 |     if not os.path.exists(log_path):
261 |         text_to_audio_script_prompt = get_file_content('wavcraft/prompts/text_to_code.prompt')
262 |         prompt = f'{text_to_audio_script_prompt}\n\n{input_description}Instruction:\n{input_text}\nCode:\n'
263 |     else:
264 |         text_to_followup = get_file_content('wavcraft/prompts/text_to_followup.prompt')
265 |         prompt = f'{input_description}{text_to_followup}\n{input_text}'
266 | 
267 |     global chat_history
268 |     chat_history.append({
269 |         "role": "user",
270 |         "content": prompt,
271 |         })
272 |        
273 |     write_to_file(log_path, f"{chat_history}")
274 | 
275 |     code_response = _input_text_to_code_with_retry(log_path, api_key, model)
276 |     executable_code_filename = output_path / 'audio_executable.py'
277 | 
278 |     write_to_file(executable_code_filename, code_response)
279 |     return code_response
280 | 
281 | 
282 | # Step 2: py code to final wav
283 | def audio_exe_to_result(code_response, output_path):
284 |     executable_code_filename = output_path / 'audio_executable.py'
285 | 
286 |     # TODO: make this more easy to modify
287 |     # Executable file header
288 |     header = "from wavcraft.apis import LEN, OUTPUT, SPLIT, MIX, CAT, ADJUST_VOL, ADD_NOISE, LOW_PASS, HIGH_PASS, ADD_RIR, ROOM_SIMULATE, TTM, TTA, TTS, SR, TSS, INPAINT"
289 |     
290 |     input_claimer = ""
291 |     n_input_wavs = len(glob(os.path.join(output_path, 'audio', 'input_*.wav')))
292 |     for i in range(n_input_wavs):
293 |         in_wav = f"\"{output_path.absolute()}/audio/input_{i}.wav\""
294 |         input_claimer += f"INPUT_WAV{i} = {in_wav}\n"
295 |     
296 |     tail = "OUTPUT(OUTPUT_WAV)"
297 |     code_response = maybe_get_content_from_file(code_response)
298 |     command = f"{header}\n\n\n{input_claimer}{code_response}\n{tail}"
299 |     write_to_file(executable_code_filename, command)
300 |     
301 |     os.system(f'PYTHONPATH=. python {executable_code_filename}')
302 | 
303 | 
304 | # Function call used by Gradio: input_text to json
305 | def generate_code(session_id, input_wav, input_text, api_key, model="gpt-4", mode="basic"):
306 |     assert mode in ("basic", "inspiration")
307 | 
308 |     output_path = utils.get_session_path(session_id)
309 |     os.makedirs(output_path, exist_ok=True)
310 |     for i, in_wav in enumerate(input_wav):
311 |         os.system(f"cp {in_wav} {os.path.join(output_path, 'audio', f'input_{i}.wav')}")
312 | 
313 |     # Step 1
314 |     print(f'session_id={session_id}, Step 1: Writing executable code with LLM ...')
315 |     if mode == "basic":
316 |         return input_text_to_code(input_text, output_path, api_key, model)
317 |     else:
318 |         return input_text_to_code_plus(input_text, output_path, api_key, model)
319 | 
320 | 
321 | # Function call used by Gradio: json to result wav
322 | def generate_audio(session_id, code_response):
323 |     output_path = utils.get_session_path(session_id)
324 |     # Step 2
325 |     print(f'session_id={session_id}, Step 2: Start running Python program...')
326 |     audio_exe_to_result(code_response, output_path)
327 | 
328 | 
329 | # Convenient function call used by wavjourney_cli
330 | def full_steps(session_id, input_wav, input_text, api_key, mode, model="gpt-4"):
331 |     global local_llm, chat_history
332 | 
333 |     if "mistral" in model:
334 |         local_llm = ChatMistral(model_id=model)
335 |     elif "gpt" in model:
336 |         chat_history = [{
337 |             "role": "system",
338 |             "content": "You are a helpful assistant.",
339 |             }]
340 |     else:
341 |         raise ValueError(f"Not support {model}.")
342 |     code_script = generate_code(session_id, input_wav, input_text, api_key, model=model, mode=mode)
343 |     return generate_audio(session_id, code_script)


--------------------------------------------------------------------------------
/wavcraft/prompts/text_to_code.prompt:
--------------------------------------------------------------------------------
  1 | You are an professional audio editor. Try to follow the instruction I give using several predefined tools:
  2 | LEN(wav) # returns the duration of `wav` in seconds
  3 | MIX(wavs: list[tuple])  # returns the mixture of the input `wavs`
  4 | CAT(wavs: list)  # returns the concatenated wav using input `wavs`
  5 | SPLIT(wav, break_points=list[float]) # returns the split wavs using `break_points`
  6 | ADJUST_VOL(wav, volume: int)  # returns the adjusted wav by `volume`
  7 | TTA(text: str, length: float, volume: int)  # returns a generated audio conditioned on `text`
  8 | TTM(text: str, melody, length: float, volume: int)  # returns a generated music conditioned on `text` and (optional) `melody`
  9 | TTS(text: str, volume: int)  # returns a generated speech conditioned on `text` and `speaker`. `speaker` should be in ['Male1_En', 'Male2_En', 'Female1_En', 'Female2_En', 'News_Male_En', 'News_Female_En', 'News_Female_Out_En', 'Child_En', 'Old_Man_En', 'Male1_Zh', 'Male2_Zh', 'Female1_Zh', 'Female2_Zh', 'Male1_Fr', 'Male2_Fr', 'Female1_Fr', 'Female2_Fr', 'Male1_De', 'Male2_De', 'Female1_De', 'Female2_De', 'Male1_Hi', 'Male2_Hi', 'Female1_Hi', 'Female2_Hi', 'Male1_It', 'Male2_It', 'Female1_It', 'Female2_It', 'Male1_Ja', 'Male2_Ja', 'Female1_Ja', 'Female2_Ja', 'Male1_Ko', 'Male2_Ko', 'Female1_Ko', 'Female1_Ru', 'Female2_Ru', 'Male1_Ru', 'Male2_Ru', 'Female1_Es', 'Female2_Es', 'Male1_Es', 'Male2_Es', 'Female1_Tr', 'Female2_Tr', 'Male1_Tr', 'Male2_Tr', 'Male1_Pt', 'Male2_Pt', 'Female1_Pl', 'Female2_Pl', 'Male1_Pl', 'Male2_Pl']
 10 | SR(wav, seed: int)  # Returns a wav upsampled to 48kHz
 11 | TSS(wav, text: str)  # returns foreground and background wav conditioned on `text`
 12 | ADD_NOISE(wav, min_snr_db: float, max_snr_db: float)  # returns a generated audio mixed with gaussian noise
 13 | LOW_PASS(wav, min_cutoff_freq: float, max_cutoff_freq: float, min_rolloff: int, max_rolloff: int)  # returns a generated audio processed by low pass filter
 14 | HIGH_PASS(wav, min_cutoff_freq: float, max_cutoff_freq: float, min_rolloff: int, max_rolloff: int)  # returns a generated audio processed by high pass filter
 15 | ADD_RIR(wav, ir)  # returns a generated audio mixed with a given room impulse response
 16 | ROOM_SIMULATE(wav, min_size_x: float, max_size_x: float, min_size_y: float, max_size_y: float, min_size_z: float, max_size_z: float, min_absorption_value: float, max_absorption_value: float, min_source_x: float, max_source_x: float, min_source_y: float, max_source_y: float, min_source_z: float, max_source_z: float, min_mic_distance: float, max_mic_distance: float, min_mic_azimuth: float, max_mic_azimuth: float, min_mic_elevation: float, max_mic_elevation: float)  # returns a synthesized audio by mixing the input `wav` with a room-specific synthesized impulse response 
 17 | INPAINT(wav, text: str, onset: float, offset: float, duration: float)  # returns a fixed audio where the part between `onset` and `offset` has been inpainted
 18 | 
 19 | 
 20 | I will give you several examples:
 21 | Instruction:
 22 | Increase the volume of child speech by 5 dB, decrease the volume of drum by 3 dB, drop the sound of machine sound.
 23 | Code:
 24 | # Separate the sound of 'child speech' from the mixture and return both 'child speech' and the background sounds
 25 | WAV0, WAV1 = TSS(INPUT_WAV0, text="child speech")
 26 | # Separate the sound of 'drum' from the mixture and return both 'drum' and the background sounds
 27 | WAV2, WAV3 = TSS(WAV1, text="drum")
 28 | # Drop the sound of 'machine sound' from the mixture
 29 | _, WAV3 = TSS(WAV3, text="machine sound")
 30 | # Increace the volume of "child speech" by 5dB
 31 | WAV0 = ADJUST_VOL(WAV0, volume=5)
 32 | # Decrease the volume of 'drum' by 5dB
 33 | WAV2 = ADJUST_VOL(WAV2, volume=-3)
 34 | # Mix the resulted sounds together
 35 | OUTPUT_WAV = MIX([(WAV0, 0), (WAV2, 0), (WAV3, 0)])
 36 | 
 37 | Instruction:
 38 | Extract 1-5s of the first audio with a low-pass filter to simulate the sound coming from inside a building. Replace male speech with dog barking in the second audio. Upsample the mix.
 39 | Code:
 40 | # Truncate the sound between 1s and 5 s
 41 | _, WAV0, _ = SPLIT(INPUT_WAV0, break_points=[1, 5])
 42 | # Add a low-pass filter
 43 | WAV0 = LOW_PASS(WAV0, min_cutoff_freq=300.0, max_cutoff_freq=800.0, min_rolloff=6, max_rolloff=12)
 44 | # Extract the sound of 'male speech' from the truncated sound
 45 | WAV1, WAV2 = TSS(INPUT_WAV1, text="male speech")
 46 | # Generate the sound of 'dog barking' with the same length with the sound of 'male speech'
 47 | WAV3 = TTA(text="dog barking", length=LEN(WAV1), volume=4)
 48 | # Combine the sounds by mixing them together
 49 | MIXTURE_WAV = MIX([(WAV3, 0), (WAV2, 0), (WAV0, 0)])
 50 | # Perform super-resolution on the mixture of sounds
 51 | OUTPUT_WAV = SR(MIXTURE_WAV)
 52 | 
 53 | Instruction:
 54 | Isolate train sound in the input audio, apply a high-pass filter and increase the volume by 3 dB. Repeat it five times to simulate a longer train passing.
 55 | Code:
 56 | # Extract the sound of a train from the audio
 57 | WAV0, _ = TSS(INPUT_WAV0, text="train")
 58 | # Apply a high-pass filter to reduce low-frequency noise
 59 | FILTERED_WAV0 = HIGH_PASS(WAV0, min_cutoff_freq=500.0, max_cutoff_freq=1000.0, min_rolloff=6, max_rolloff=12)
 60 | # Increase the volume by 3 dB
 61 | FILTERED_WAV0 = ADJUST_VOL(FILTERED_WAV0, volume=3)
 62 | # Concatenate the filtered train sound three times
 63 | OUTPUT_WAV = CAT([FILTERED_WAV0] * 5)
 64 | 
 65 | Instruction:
 66 | Extract the hammer sound from the first audio, and truncate it from the start towards 2 second. Remove the sound of baby crying in the second audio, and then decrease the volume by 1 dB. Mix two audio together, and the second sound should begin from 1 second. Add a reverb effect to the mixture sound using the third audio.
 67 | Code:
 68 | # Extract the hammer sound from the first audio
 69 | WAV0, _ = TSS(INPUT_WAV0, text="hammer")
 70 | # Truncate from the start towards 2 second
 71 | WAV0, _ = SPLIT(WAV0, break_points=[2])
 72 | # Drop the sound of baby crying in the second audio
 73 | _, WAV1 = TSS(INPUT_WAV1, text="baby crying")
 74 | # Decrease the volume by 1 dB
 75 | WAV1 = ADJUST_VOL(WAV1, volume=-1)
 76 | # Mix the ouput sounds together
 77 | MIXED_WAV = MIX([(WAV0, 0), (WAV1, 1)])
 78 | # Add a reverb effect using room impulse response
 79 | OUTPUT_WAV = ADD_RIR(MIXED_WAV, ir=INPUT_WAV2)
 80 | 
 81 | Instruction:
 82 | Inpaint the first audio between 2s and 5s with the text "a car passing by with rain falling". Generate a 10s long jazz music piece with the second audio as melody, then mix it with the sound of rain from the first, starting at 3s into the jazz music. 
 83 | Code:
 84 | # Inpaint the first audio between 2s and 5s with the text "a car passing by with rain falling"
 85 | WAV0 = INPAINT(INPUT_WAV0, text="a car passing by with rain falling", onset=2, offset=5, duration=LEN(INPUT_WAV0))
 86 | # Generate a 10-second jazz music piece
 87 | WAV1 = TTM(text="jazz", melody=INPUT_WAV1, length=10.0, volume=5)
 88 | # Extract the sound of rain from the audio file
 89 | WAV0, _ = TSS(WAV0, text="rain")
 90 | # Mix the jazz music with the rain sound, starting the rain at 3 seconds
 91 | OUTPUT_WAV = MIX([(WAV0, 0), (WAV1, 3)])
 92 | 
 93 | Instruction:
 94 | Remove wind sound from an outdoor recording. Generate a 5-second saxophone music with happy mood followed by "Bravo". Mix the generated sound with the outdoor recording and simulate the mixture in a small room with high absorption.
 95 | Code:
 96 | # Drop the sound of wind from the original recording
 97 | _, WAV0 = TSS(INPUT_WAV0, text="wind")
 98 | # Generate a 5-second saxophone music with happy mood followed by a male speech "Bravo".
 99 | WAV1 = TTM(text="happy saxophone", length=5.0, volume=4)
100 | # Generate a speech "Bravo"
101 | WAV2 = TTS("Bravo", volume=5)
102 | # Concatenate the generated sound together
103 | CONCAT_WAV = CAT([WAV1, WAV2]) 
104 | # Mix the generated sound with the background sound
105 | MIXED_WAV = MIX((WAV0, 0), (CONCAT_WAV, 0))
106 | # Simulate the recording in a small room with high absorption
107 | OUTPUT_WAV = ROOM_SIMULATE(MIXED_WAV, min_size_x=3, max_size_x=4, min_size_y=3, max_size_y=4, min_size_z=2.5, max_size_z=3, min_absorption_value=0.7, max_absorption_value=0.9, min_source_x=1, max_source_x=1.5, min_source_y=1, max_source_y=1.5, min_source_z=1, max_source_z=1.5, min_mic_distance=1, max_mic_distance=1.5, min_mic_azimuth=45, max_mic_azimuth=90, min_mic_elevation=20, max_mic_elevation=30)
108 | 


--------------------------------------------------------------------------------
/wavcraft/prompts/text_to_followup.prompt:
--------------------------------------------------------------------------------
1 | Regenerate the code by appending the new instruction to the previous instructions. The code must start with the provided audio (e.g., INPUT_WAV0) and cannot take the output from previous phase (i.e., `OUTPUT_WAV`) as a known input. The new instruction is:


--------------------------------------------------------------------------------
/wavcraft/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | import string
 4 | import torch
 5 | import random
 6 | import numpy as np
 7 | from pathlib import Path
 8 | 
 9 | 
10 | #### path related code BEGIN ####
11 | def get_session_path(session_id):
12 |     return Path(f'output/sessions/{session_id}')
13 | 
14 | def get_system_voice_preset_path():
15 |     return Path('data/voice_presets')
16 |     
17 | def get_session_voice_preset_path(session_id):
18 |     return Path(f'{get_session_path(session_id)}/voice_presets')
19 |     
20 | def get_session_audio_path(session_id):
21 |     return Path(f'{get_session_path(session_id)}/audio')
22 | 
23 | def rescale_to_match_energy(segment1, segment2):
24 |     ratio = get_energy_ratio(segment1, segment2)
25 |     recaled_segment1 = segment1 / ratio
26 |     return recaled_segment1.numpy()
27 | #### path related code END ####
28 | 
29 | def text_to_abbrev_prompt(input_text):
30 |     return re.sub(r'[^a-zA-Z_]', '', '_'.join(input_text.split()[:5]))
31 | 
32 | def get_energy(x):
33 |     return np.mean(x ** 2)
34 | 
35 | 
36 | def get_energy_ratio(segment1, segment2):
37 |     energy1 = get_energy(segment1)
38 |     energy2 = max(get_energy(segment2), 1e-10)
39 |     ratio = (energy1 / energy2) ** 0.5
40 |     ratio = torch.tensor(ratio)
41 |     ratio = torch.clamp(ratio, 0.02, 50)
42 |     return ratio
43 | 
44 | def fade(audio_data, fade_duration=2, sr=32000):
45 |     audio_duration = audio_data.shape[0] / sr
46 | 
47 |     # automated choose fade duration
48 |     if audio_duration >=8:
49 |          # keep fade_duration 2
50 |         pass
51 |     else:
52 |         fade_duration = audio_duration / 5
53 | 
54 |     fade_sampels = int(sr * fade_duration)
55 |     fade_in = np.linspace(0, 1, fade_sampels)
56 |     fade_out = np.linspace(1, 0, fade_sampels)
57 | 
58 |     audio_data_fade_in = audio_data[:fade_sampels] * fade_in
59 |     audio_data_fade_out = audio_data[-fade_sampels:] * fade_out
60 | 
61 |     audio_data_faded = np.concatenate((audio_data_fade_in, audio_data[len(fade_in):-len(fade_out)], audio_data_fade_out))
62 |     return audio_data_faded
63 | 
64 | # def get_key(config='config.yaml'):
65 | #     with open('config.yaml', 'r') as file:
66 | #         config = yaml.safe_load(file)
67 | #         return config['OpenAI-Key'] if 'OpenAI-Key' in config else None
68 | 
69 | def get_service_port(port='SERVICE_PORT'):
70 |     service_port = os.environ.get(port)
71 |     return service_port
72 | 
73 | def get_service_url():
74 |     service_url = os.environ.get('SERVICE_URL')
75 |     return service_url 
76 | 
77 | def get_api_key():
78 |     api_key = os.environ.get('OPENAI_KEY')
79 |     return api_key       
80 | 
81 | def get_max_script_lines():
82 |     max_lines = int(os.environ.get('MAX_SCRIPT_LINES', 999))
83 |     return max_lines
84 | 
85 | def get_path_from_target_dir(filename, path_or_dir):
86 |     if os.path.isfile(path_or_dir):
87 |         path_or_dir = os.path.dirname(path_or_dir)
88 |     return os.path.join(path_or_dir, filename)
89 | 
90 | def generate_random_series(n=9):
91 |     return ''.join(random.choices(string.ascii_uppercase + string.digits, k=n))
92 | 


--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/child_boy.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/child_boy.npz


--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/cnn_male_speaker.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/cnn_male_speaker.npz


--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/elder_morgen.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/elder_morgen.npz


--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/news_female_speaker.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/news_female_speaker.npz


--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/news_female_speaker_outside.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/news_female_speaker_outside.npz


--------------------------------------------------------------------------------
/wavcraft/voice_preset/npz/news_male_speaker.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinhuaLiang/WavCraft/6e926a6e095c9cc916c4de171e84904ebb2fea7b/wavcraft/voice_preset/npz/news_male_speaker.npz


--------------------------------------------------------------------------------
/wavcraft/voice_preset/voice_map.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "Male1_En": {
  3 |         "id": "Male1_En",
  4 |         "desc": "A normal male adult voice, British accent; Language: English.",
  5 |         "npz_path": "v2/en_speaker_1"
  6 |     },
  7 |     "Male2_En": {
  8 |         "id": "Male2_En",
  9 |         "desc": "A normal male adult voice, American accent; Language: English.",
 10 |         "npz_path": "v2/en_speaker_6"
 11 |     },
 12 |     "Female1_En": {
 13 |         "id": "Female1_En",
 14 |         "desc": "A normal female adult voice, British accent; Language: English.",
 15 |         "npz_path": "v2/en_speaker_9"
 16 |     },
 17 |     "Female2_En": {
 18 |         "id": "Female2_En",
 19 |         "desc": "A normal female adult voice, American accent; Language: English.",
 20 |         "npz_path": "v2/de_speaker_3"
 21 |     },
 22 |     "News_Male_En": {
 23 |         "id": "News_Male_En",
 24 |         "desc": "A male voice of a news anchor, suitable for news scenarios; Language: English.",
 25 |         "npz_path": "wavcraft/voice_preset/npz/news_male_speaker.npz"
 26 |     },
 27 |     "News_Female_En": {
 28 |         "id": "News_Female_En",
 29 |         "desc": "A female voice of a news anchor, suitable for news scenarios; Language: English.",
 30 |         "npz_path": "wavcraft/voice_preset/npz/news_male_speaker.npz"
 31 |     },
 32 |     "News_Female_Out_En": {
 33 |         "id": "News_Female_Out_En",
 34 |         "desc": "A female voice of a off-site news reporter, suitable for news scenario; Language: English.",
 35 |         "npz_path": "wavcraft/voice_preset/npz/news_female_speaker_outside.npz"
 36 |     },
 37 |     "Child_En": {
 38 |         "id": "Child_En",
 39 |         "desc": "A small young boy voice; Language: English.",
 40 |         "npz_path": "wavcraft/voice_preset/npz/child_boy.npz"
 41 |     },
 42 |     "Old_Man_En": {
 43 |         "id": "Old_Man_En",
 44 |         "desc": "A voice of an old man; Language: English.",
 45 |         "npz_path": "wavcraft/voice_preset/npz/elder_morgen.npz"
 46 |     },
 47 |     "Male1_Zh": {
 48 |         "id": "Male1_Zh",
 49 |         "desc": "A normal male adult voice; Language: Chinese.",
 50 |         "npz_path": "v2/zh_speaker_0"
 51 |     },
 52 |     "Male2_Zh": {
 53 |         "id": "Male2_Zh",
 54 |         "desc": "A normal male adult voice; Language: Chinese.",
 55 |         "npz_path": "v2/zh_speaker_1"
 56 |     },
 57 |     "Female1_Zh": {
 58 |         "id": "Female1_Zh",
 59 |         "desc": "A normal female adult voice; Language: Chinese.",
 60 |         "npz_path": "v2/zh_speaker_9"
 61 |     },
 62 |     "Female2_Zh": {
 63 |         "id": "Female2_Zh",
 64 |         "desc": "A normal female adult voice; Language: Chinese.",
 65 |         "npz_path": "v2/zh_speaker_4"
 66 |     },
 67 |     "Male1_Fr": {
 68 |         "id": "Male1_Fr",
 69 |         "desc": "A normal male adult voice; Language: French.",
 70 |         "npz_path": "v2/fr_speaker_0"
 71 |     },
 72 |     "Male2_Fr": {
 73 |         "id": "Male2_Fr",
 74 |         "desc": "A normal male adult voice; Language: French.",
 75 |         "npz_path": "v2/fr_speaker_8"
 76 |     },
 77 |     "Female1_Fr": {
 78 |         "id": "Female1_Fr",
 79 |         "desc": "A normal female adult voice; Language: French.",
 80 |         "npz_path": "v2/fr_speaker_5"
 81 |     },
 82 |     "Female2_Fr": {
 83 |         "id": "Female2_Fr",
 84 |         "desc": "A normal female adult voice; Language: French.",
 85 |         "npz_path": "v2/fr_speaker_1"
 86 |     },
 87 |     "Male1_De": {
 88 |         "id": "Male1_De",
 89 |         "desc": "A normal male adult voice; Language: German.",
 90 |         "npz_path": "v2/de_speaker_0"
 91 |     },
 92 |     "Male2_De": {
 93 |         "id": "Male2_De",
 94 |         "desc": "A normal male adult voice; Language: German.",
 95 |         "npz_path": "v2/de_speaker_1"
 96 |     },
 97 |     "Female1_De": {
 98 |         "id": "Female1_De",
 99 |         "desc": "A normal female adult voice; Language: German.",
100 |         "npz_path": "v2/de_speaker_3"
101 |     },
102 |     "Female2_De": {
103 |         "id": "Female2_De",
104 |         "desc": "A normal female adult voice; Language: German.",
105 |         "npz_path": "v2/de_speaker_8"
106 |     },
107 |     "Male1_Hi": {
108 |         "id": "Male1_Hi",
109 |         "desc": "A normal male adult voice; Language: Hindi.",
110 |         "npz_path": "v2/hi_speaker_5"
111 |     },
112 |     "Male2_Hi": {
113 |         "id": "Male2_Hi",
114 |         "desc": "A normal male adult voice; Language: Hindi.",
115 |         "npz_path": "v2/hi_speaker_8"
116 |     },
117 |     "Female1_Hi": {
118 |         "id": "Female1_Hi",
119 |         "desc": "A normal female adult voice; Language: Hindi.",
120 |         "npz_path": "v2/hi_speaker_0"
121 |     },
122 |     "Female2_Hi": {
123 |         "id": "Female2_Hi",
124 |         "desc": "A normal female adult voice; Language: Hindi.",
125 |         "npz_path": "v2/hi_speaker_3"
126 |     },
127 |     "Male1_It": {
128 |         "id": "Male1_It",
129 |         "desc": "A normal male adult voice; Language: Italian.",
130 |         "npz_path": "v2/it_speaker_4"
131 |     },
132 |     "Male2_It": {
133 |         "id": "Male2_It",
134 |         "desc": "A normal male adult voice; Language: Italian.",
135 |         "npz_path": "v2/it_speaker_5"
136 |     },
137 |     "Female1_It": {
138 |         "id": "Female1_It",
139 |         "desc": "A normal female adult voice; Language: Italian.",
140 |         "npz_path": "v2/it_speaker_7"
141 |     },
142 |     "Female2_It": {
143 |         "id": "Female2_It",
144 |         "desc": "A normal female adult voice; Language: Italian.",
145 |         "npz_path": "v2/it_speaker_9"
146 |     },
147 |     "Male1_Ja": {
148 |         "id": "Male1_Ja",
149 |         "desc": "A normal male adult voice; Language: Japanese.",
150 |         "npz_path": "v2/ja_speaker_2"
151 |     },
152 |     "Male2_Ja": {
153 |         "id": "Male2_Ja",
154 |         "desc": "A normal male adult voice; Language: Japanese.",
155 |         "npz_path": "v2/ja_speaker_6"
156 |     },
157 |     "Female1_Ja": {
158 |         "id": "Female1_Ja",
159 |         "desc": "A normal female adult voice; Language: Japanese.",
160 |         "npz_path": "v2/ja_speaker_4"
161 |     },
162 |     "Female2_Ja": {
163 |         "id": "Female2_Ja",
164 |         "desc": "A normal female adult voice; Language: Japanese.",
165 |         "npz_path": "v2/ja_speaker_5"
166 |     },
167 |     "Male1_Ko": {
168 |         "id": "Male1_Ko",
169 |         "desc": "A normal male adult voice; Language: Korean.",
170 |         "npz_path": "v2/ko_speaker_1"
171 |     },
172 |     "Male2_Ko": {
173 |         "id": "Male2_Ko",
174 |         "desc": "A normal male adult voice; Language: Korean.",
175 |         "npz_path": "v2/ko_speaker_2"
176 |     },
177 |     "Female1_Ko": {
178 |         "id": "Female1_Ko",
179 |         "desc": "A normal female adult voice; Language: Korean.",
180 |         "npz_path": "v2/ko_speaker_0"
181 |     },
182 |     "Female1_Ru": {
183 |         "id": "Female1_Ru",
184 |         "desc": "A normal female adult voice; Language: Russian.",
185 |         "npz_path": "v2/ru_speaker_5"
186 |     },
187 |     "Female2_Ru": {
188 |         "id": "Female2_Ru",
189 |         "desc": "A normal female adult voice; Language: Russian.",
190 |         "npz_path": "v2/ru_speaker_6"
191 |     },
192 |     "Male1_Ru": {
193 |         "id": "Male1_Ru",
194 |         "desc": "A normal male adult voice; Language: Russian.",
195 |         "npz_path": "v2/ru_speaker_3"
196 |     },
197 |     "Male2_Ru": {
198 |         "id": "Male2_Ru",
199 |         "desc": "A normal male adult voice; Language: Russian.",
200 |         "npz_path": "v2/ru_speaker_4"
201 |     },
202 |     "Female1_Es": {
203 |         "id": "Female1_Es",
204 |         "desc": "A normal female adult voice; Language: Spanish.",
205 |         "npz_path": "v2/es_speaker_8"
206 |     },
207 |     "Female2_Es": {
208 |         "id": "Female2_Es",
209 |         "desc": "A normal female adult voice; Language: Spanish.",
210 |         "npz_path": "v2/es_speaker_9"
211 |     },
212 |     "Male1_Es": {
213 |         "id": "Male1_Es",
214 |         "desc": "A normal male adult voice; Language: Spanish.",
215 |         "npz_path": "v2/es_speaker_6"
216 |     },
217 |     "Male2_Es": {
218 |         "id": "Male2_Es",
219 |         "desc": "A normal male adult voice; Language: Spanish.",
220 |         "npz_path": "v2/es_speaker_7"
221 |     },
222 |     "Female1_Tr": {
223 |         "id": "Female1_Tr",
224 |         "desc": "A normal female adult voice; Language: Turkish.",
225 |         "npz_path": "v2/tr_speaker_4"
226 |     },
227 |     "Female2_Tr": {
228 |         "id": "Female2_Tr",
229 |         "desc": "A normal female adult voice; Language: Turkish.",
230 |         "npz_path": "v2/tr_speaker_5"
231 |     },
232 |     "Male1_Tr": {
233 |         "id": "Male1_Tr",
234 |         "desc": "A normal male adult voice; Language: Turkish.",
235 |         "npz_path": "v2/tr_speaker_2"
236 |     },
237 |     "Male2_Tr": {
238 |         "id": "Male2_Tr",
239 |         "desc": "A normal male adult voice; Language: Turkish.",
240 |         "npz_path": "v2/tr_speaker_3"
241 |     },
242 |     "Male1_Pt": {
243 |         "id": "Male1_Pt",
244 |         "desc": "A normal male adult voice; Language: Purtuguese.",
245 |         "npz_path": "v2/pt_speaker_0"
246 |     },
247 |     "Male2_Pt": {
248 |         "id": "Male2_Pt",
249 |         "desc": "A normal male adult voice; Language: Purtuguese.",
250 |         "npz_path": "v2/pt_speaker_1"
251 |     },
252 |     "Female1_Pl": {
253 |         "id": "Female1_Pl",
254 |         "desc": "A normal female adult voice; Language: Polish.",
255 |         "npz_path": "v2/pl_speaker_4"
256 |     },
257 |     "Female2_Pl": {
258 |         "id": "Female2_Pl",
259 |         "desc": "A normal female adult voice; Language: Polish.",
260 |         "npz_path": "v2/pl_speaker_6"
261 |     },
262 |     "Male1_Pl": {
263 |         "id": "Male1_Pl",
264 |         "desc": "A normal male adult voice; Language: Polish.",
265 |         "npz_path": "v2/pl_speaker_5"
266 |     },
267 |     "Male2_Pl": {
268 |         "id": "Male2_Pl",
269 |         "desc": "A normal male adult voice; Language: Polish.",
270 |         "npz_path": "v2/pl_speaker_7"
271 |     }
272 | }


--------------------------------------------------------------------------------