├── .github └── FUNDING.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── README_CN.md ├── install.py ├── requirements.txt └── scripts ├── bark ├── speakers.json └── tts.py ├── faceswap ├── model │ └── README.md └── swap.py ├── ui.py ├── wav2lip ├── audio.py ├── checkpoints │ └── README.md ├── face_detection │ ├── README.md │ ├── __init__.py │ ├── api.py │ ├── detection │ │ ├── __init__.py │ │ ├── core.py │ │ └── sfd │ │ │ ├── __init__.py │ │ │ ├── bbox.py │ │ │ ├── detect.py │ │ │ ├── net_s3fd.py │ │ │ └── sfd_detector.py │ ├── models.py │ └── utils.py ├── hparams.py ├── models │ ├── __init__.py │ ├── conv.py │ ├── syncnet.py │ └── wav2lip.py ├── output │ ├── debug │ │ └── README.md │ ├── face_enhanced │ │ └── README.md │ ├── faceswap │ │ └── README.md │ └── final │ │ └── README.md ├── predicator │ └── README.md ├── results │ └── README.md ├── temp │ └── README.md ├── w2l.py └── wav2lip_uhq.py ├── wav2lip_uhq.py └── wav2lip_uhq_extend_paths.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: Wav2LipStudio 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | demo/* 3 | **/__pycache__/ 4 | scripts/wav2lip/checkpoints/wav2lip_gan.pth 5 | scripts/wav2lip/checkpoints/wav2lip.pth 6 | scripts/wav2lip/checkpoints/visual_quality_disc_.pth 7 | scripts/wav2lip/checkpoints/lipsync_expert_.pth 8 | scripts/wav2lip/face_detection/detection/sfd/s3fd.pth 9 | scripts/wav2lip/predicator/shape_predictor_68_face_landmarks.dat 10 | scripts/wav2lip/output/debug/*.png 11 | scripts/wav2lip/output/final/*.png 12 | scripts/wav2lip/output/images/*.png 13 | scripts/wav2lip/output/masks/*.png 14 | scripts/wav2lip/output/*.mp4 15 | scripts/wav2lip/output/*.aac 16 | scripts/wav2lip/results/result_voice.mp4 17 | scripts/wav2lip/temp/*.avi 18 | scripts/wav2lip/temp/*.wav 19 | docs/* 20 | scripts/faceswap/model/inswapper_128.onnx -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to sd-wav2lip-uhq 2 | 3 | Thank you for your interest in contributing to sd-wav2lip-uhq! We appreciate your effort and to help us incorporate your contribution in the best way possible, please follow the following contribution guidelines. 4 | 5 | ## Reporting Bugs 6 | 7 | If you find a bug in the project, we encourage you to report it. Here's how: 8 | 9 | 1. First, check the [existing Issues](url_of_issues) to see if the issue has already been reported. 
If it has, please add a comment to the existing issue rather than creating a new one. 10 | 2. If you can't find an existing issue that matches your bug, create a new issue. Make sure to include as many details as possible so we can understand and reproduce the problem. 11 | 12 | ## Proposing Changes 13 | 14 | We welcome code contributions from the community. Here's how to propose changes: 15 | 16 | 1. Fork this repository to your own GitHub account. 17 | 2. Create a new branch on your fork for your changes. 18 | 3. Make your changes in this branch. 19 | 4. When you are ready, submit a pull request to the `main` branch of this repository. 20 | 21 | Please note that we use the GitHub Flow workflow, so all pull requests should be made to the `main` branch. 22 | 23 | Before submitting a pull request, please make sure your code adheres to the project's coding conventions and it has passed all tests. If you are adding features, please also add appropriate tests. 24 | 25 | ## Contact 26 | 27 | If you have any questions or need help, please ping the developer via discord NumZ#7184 to make sure your addition will fit well into such a large project and to get help if needed. 28 | 29 | Thank you again for your contribution! 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2023 NumZ 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🔉👄 Wav2Lip STUDIO extension for Stable Diffusion WebUI Automatic1111 2 | 3 | ##
English | 简体中文
4 | 5 | 6 | 7 | https://user-images.githubusercontent.com/800903/262435301-af205a91-30d7-43f2-afcc-05980d581fe0.mp4 8 | ## **STANDALONE VERSION CAN BE FOUND HERE** : [WAV2LIP STUDIO STANDALONE](https://www.patreon.com/Wav2LipStudio) 9 | In the standalone version you can : 10 | - ♻ Manage project: Add a feature to manage multiple project 11 | - 👪 Introduced multiple face swap: Can now Swap multiple face in one shot 12 | - ⛔ Visible face restriction: Can now make whole process even if no face detected on frame! 13 | - 📺 Video Size: works with high resolution video input, (test with 1980x1080, should works with 4K but slow) 14 | - 🔑 Keyframe manager: Add a keyframe manager for better control of the video generation 15 | - 🍪 coqui TTS integration: Remove bark integration, use coqui TTS instead 16 | - 💬 Conversation: Add a conversation feature with multiple person 17 | - 🔈 Record your own voice: Add a feature to record your own voice 18 | - 👬 Clone voice: Add a feature to clone voice from video 19 | - 🎏 translate video: Add a feature to translate video with voice clone (HEYGEN like) 20 | - 🔉 Volume amplifier for wav2lip: Add a feature to amplify the volume of the wav2lip output 21 | - 🕡 Add delay before sound speech start 22 | - 🚀 Speed up process: Speed up the process 23 | 24 | 25 | ## 💡 Description 26 | This repository contains a Wav2Lip Studio extension for Automatic1111. 27 | 28 | It's an all-in-one solution: just choose a video and a speech file (wav or mp3), and the extension will generate a lip-sync video. It improves the quality of the lip-sync videos generated by the [Wav2Lip tool](https://github.com/Rudrabha/Wav2Lip) by applying specific post-processing techniques with Stable diffusion tools. 29 | 30 | ![Illustration](https://user-images.githubusercontent.com/800903/267808204-ae971458-9e8d-403e-9e10-9b7b7590d999.png) 31 | 32 | ## 📖 Quick Index 33 | * [🚀 Updates](#-updates) 34 | * [🔗 Requirements](#-requirements) 35 | * [💻 Installation](#-installation) 36 | * [🐍 Usage](#-usage) 37 | * [👄 Note on the bark Fidelity](#-note-on-the-bark-fidelity) 38 | * [📺 Examples](#-examples) 39 | * [📖 Behind the scenes](#-behind-the-scenes) 40 | * [💪 Quality tips](#-quality-tips) 41 | * [⚠️Noted Constraints](#-noted-constraints) 42 | * [📝 To do](#-to-do) 43 | * [😎 Contributing](#-contributing) 44 | * [🙏 Appreciation](#-appreciation) 45 | * [📝 Citation](#-citation) 46 | * [📜 License](#-license) 47 | * [☕ Support Wav2lip Studio](#-support-wav2lip-studio) 48 | 49 | ## 🚀 Updates 50 | **2023.09.13** 51 | - 👪 Introduced face swap: facefusion integration (See Usage section) **this feature is under experimental**. 52 | 53 | **2023.08.22** 54 | - 👄 Introduced [bark](https://github.com/suno-ai/bark/) (See Usage section), **this feature is under experimental**. 55 | 56 | **2023.08.20** 57 | - 🚢 Introduced the GFPGAN model as an option. 58 | - ▶ Added the feature to resume generation. 59 | - 📏 Optimized to release memory post-generation. 60 | 61 | **2023.08.17** 62 | - 🐛 Fixed purple lips bug 63 | 64 | **2023.08.16** 65 | - ⚡ Added Wav2lip and enhanced video output, with the option to download the one that's best for you, likely the "generated video". 66 | - 🚢 Updated User Interface: Introduced control over CodeFormer Fidelity. 67 | - 👄 Removed image as input, [SadTalker](https://github.com/OpenTalker/SadTalker) is better suited for this. 68 | - 🐛 Fixed a bug regarding the discrepancy between input and output video that incorrectly positioned the mask. 69 | - 💪 Refined the quality process for greater efficiency. 
70 | - 🚫 Interruption will now generate videos if the process creates frames 71 | 72 | **2023.08.13** 73 | - ⚡ Speed-up computation 74 | - 🚢 Change User Interface : Add controls on hidden parameters 75 | - 👄 Only Track mouth if needed 76 | - 📰 Control debug 77 | - 🐛 Fix resize factor bug 78 | 79 | ## 🔗 Requirements 80 | 81 | - latest version of Stable Diffusion WebUI Automatic1111 by following the instructions on the [Stable Diffusion Webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) repository. 82 | - FFmpeg : download it from the [official FFmpeg site](https://ffmpeg.org/download.html). Follow the instructions appropriate for your operating system, note ffmpeg have to be accessible from the command line. 83 | 84 | ## 💻 Installation 85 | 86 | 1. Launch Automatic1111 87 | 2. Face Swap : On Windows, download and install [Visual Studio](https://visualstudio.microsoft.com/fr/downloads/). During the install, make sure to include the Python and C++ packages. 88 | 3. In the extensions tab, enter the following URL in the "Install from URL" field and click "Install": 89 | 90 | ![Illustration](https://user-images.githubusercontent.com/800903/258115646-22b4b363-c363-4fc8-b316-c162b61b5d15.png) 91 | 92 | 4. Go to the "Installed Tab" in the extensions tab and click "Apply and quit". 93 | 94 | ![Illustration](https://user-images.githubusercontent.com/800903/258115651-196a07bd-ee4b-4aaf-b11e-8e2d1ffaa42f.png) 95 | 96 | 5. If you don't see the "Wav2Lip UHQ tab" restart Automatic1111. 97 | 98 | 6. 🔥 Important: Get the weights. Download the model weights from the following locations and place them in the corresponding directories (take care about the filename, especially for s3fd) 99 | 100 | | Model | Description | Link to the model | install folder | 101 | |:-------------------:|:----------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------:| 102 | | Wav2Lip | Highly accurate lip-sync | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?e=TBFBVW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 103 | | Wav2Lip + GAN | Slightly inferior lip-sync, but better visual quality | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA?e=n9ljGW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 104 | | s3fd | Face Detection pre trained model | [Link](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) | extensions\sd-wav2lip-uhq\scripts\wav2lip\face_detection\detection\sfd\s3fd.pth | 105 | | landmark predicator | Dlib 68 point face landmark prediction (click on the download icon) | [Link](https://github.com/numz/wav2lip_uhq/blob/main/predicator/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 106 | | landmark predicator | Dlib 68 point face landmark prediction (alternate link) | [Link](https://huggingface.co/spaces/asdasdasdasd/Face-forgery-detection/resolve/ccfc24642e0210d4d885bc7b3dbc9a68ed948ad6/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 107 | | 
landmark predicator | Dlib 68 point face landmark prediction (alternate link click on the download icon) | [Link](https://github.com/italojs/facial-landmarks-recognition/blob/master/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 108 | | face swap model | model used by face swap |[Link](https://huggingface.co/ezioruan/inswapper_128.onnx/resolve/main/inswapper_128.onnx) | extensions\sd-wav2lip-uhq\scripts\faceswap\model\inswapper_128.onnx | 109 | 110 | 111 | ## 🐍 Usage 112 | 1. Choose a video (avi or mp4 format) with a face in it. If there is no face in only one frame of the video, process will fail. Note avi file will not appear in Video input but process will works. 113 | 2. Face Swap (take times so be patient): 114 | 1. **Face Swap**: chose the image of the face you want to swap with the face in the video. 115 | 2. **Face Index**: if there are multiple faces in the image, you can choose the face you want to swap with the face in the video. 0 is the first face from left to right. 116 | 3. Audio, 2 options: 117 | 1. Put audio file in the "Speech" input. 118 | 2. Generate Audio with the text to speech [bark](https://github.com/suno-ai/bark/) integration. 119 | 1. Choose the language : Turkish, English, Chinese, Hindi, Italian, Japanese, Korean, Portuguese, Russian, Spanish, Polish, German, French 120 | 2. Choose the Gender 121 | 3. Choose your speaker, you can ear a sample in the "Audio Example" 122 | 4. Choose Low VRAM True (default) if you have a Video Card with less than 16GB VRAM 123 | 5. Write your text in the text area "Prompt" 124 | - **Note** that bark can only generate 14 seconds of audio, so if you want to generate a longer audio, you have to use "[split]" in your text. 125 | - For example, if you want to generate a 30 seconds audio, you have to write your text like this : 126 | - "This is the first part of my text **[split]** This is the second part of my text" 127 | 6. Temperature: 0.0 is supposed to be closer to the voice, and 1.0 is more creative, but in reality, 0.0 yields strange results and 1.0 something very far from the voice. 0.7 is the default value set by 'bark', try different values to see what works best for you. 128 | 7. Silence : Time in seconds between each punctuation(。!!.??,). Default is 0.25 seconds. 129 | 8. See Bark [documentation](https://github.com/suno-ai/bark/) for more details. 130 | 9. Below is a list of some known non-speech sounds. 131 | - [laughter] 132 | - [laughs] 133 | - [sighs] 134 | - [music] 135 | - [gasps] 136 | - [clears throat] 137 | - "-" or ... for hesitations 138 | - ♪ for song lyrics 139 | - CAPITALIZATION for emphasis of a word 140 | - [MAN] and [WOMAN] to bias Bark toward male and female speakers, respectively 141 | 4. choose a checkpoint (see table above). 142 | 5. **Padding**: Wav2Lip uses this to move the mouth. This is useful if the mouth is not at the good place. Usually, default value is good, but certain video may need to be adjusted. 143 | 6. **No Smooth**: When checked, this option retains the original mouth shape without smoothing. 144 | 7. **Resize Factor**: This is a resize factor for the video. The default value is 1.0, but you can change it to suit your needs. This is useful if the video size is too large. 145 | 8. **Only Mouth**: This option tracks only the mouth, removing other facial motions like those of the cheeks and chin. 146 | 9. **Mouth Mask Dilate**: This will dilate the mouth mask to cover more area around the mouth. 
depends on the mouth size. 147 | 10. **Face Mask Erode**: This will erode the face mask to remove some area around the face. depends on the face size. 148 | 11. **Mask Blur**: This will blur the mask to make it more smooth, try to keep it under or equal to **Mouth Mask Dilate**. 149 | 12. **Code Former Fidelity**: 150 | 1. A value of 0 offers higher quality but may significantly alter the person's facial appearance and cause noticeable flickering between frames. 151 | 2. A value of 1 provides lower quality but maintains the person's face more consistently and reduces frame flickering. 152 | 3. Using a value below 0.5 is not advised. Adjust this setting to achieve optimal results. Starting with a value of 0.75 is recommended. 153 | 13. **Active debug**: This will create step-by-step images in the debug folder. 154 | 14. Click on the "Generate" button. 155 | 15. ⚠ "resume" button can be use if face swap and wav2lip step have been done, then you can adjust "mouth mask dilate", "face mask erode", "mask blur" and change "restoration model" without regenerate face swap and wav2lip. 156 | 157 | ## 👄 Note on the bark Fidelity 158 | 159 | Bark is interesting but sometimes yields strange results (or even hilarious ones). Each generation will give you something different and It may take several generations before you achieve something conclusive. 160 | Apart from English, it seems that the other languages speak as if they were being used by a foreigner. Sometimes even if you choose "Male" it will speak like a woman, and vice versa. Sometimes, even when choosing a specific speaker, it will sound like another speaker or even another language. 161 | 162 | ## 📺 Examples 163 | 164 | https://user-images.githubusercontent.com/800903/262439441-bb9d888a-d33e-4246-9f0a-1ddeac062d35.mp4 165 | 166 | https://user-images.githubusercontent.com/800903/262442794-61b1e32f-3f87-4b36-98d6-f711822bdb1e.mp4 167 | 168 | https://user-images.githubusercontent.com/800903/262449305-901086a3-22cb-42d2-b5be-a5f38db4549a.mp4 169 | 170 | https://user-images.githubusercontent.com/800903/267808494-300f8cc3-9136-4810-86e2-92f2114a5f9a.mp4 171 | 172 | ## 📖 Behind the scenes 173 | 174 | This extension operates in several stages to improve the quality of Wav2Lip-generated videos: 175 | 176 | 1. **Generate face swap video**: The script first generates the face swap video if image is in "face Swap" field, this operation take times so be patient. 177 | 2. **Generate a Wav2lip video**: Then script generates a low-quality Wav2Lip video using the input video and audio. 178 | 3. **Video Quality Enhancement**: Create a high-quality video using the low-quality video by using the enhancer define by user. 179 | 4. **Mask Creation**: The script creates a mask around the mouth and tries to keep other facial motions like those of the cheeks and chin. 180 | 5. **Video Generation**: The script then takes the high-quality mouth image and overlays it onto the original image guided by the mouth mask. 181 | 6. **Video Post Processing**: The script then uses the ffmpeg tool to generate the final video. 182 | 183 | ## 💪 Quality tips 184 | - Use a high quality video as input 185 | - Utilize a video with a consistent frame rate. Occasionally, videos may exhibit unusual playback frame rates (not the standard 24, 25, 30, 60), which can lead to issues with the face mask. 186 | - Use a high quality audio file as input, without background noise or music. Clean audio with a tool like [https://podcast.adobe.com/enhance](https://podcast.adobe.com/enhance). 
187 | - Dilate the mouth mask. This will help the model retain some facial motion and hide the original mouth. 188 | - Mask Blur maximum twice the value of Mouth Mask Dilate. If you want to increase the blur, increase the value of Mouth Mask Dilate otherwise the mouth will be blurred and the underlying mouth could be visible. 189 | - Upscaling can be good for improving result, particularly around the mouth area. However, it will extend the processing duration. Use this tutorial from Olivio Sarikas to upscale your video: [https://www.youtube.com/watch?v=3z4MKUqFEUk](https://www.youtube.com/watch?v=3z4MKUqFEUk). Ensure the denoising strength is set between 0.0 and 0.05, select the 'revAnimated' model, and use the batch mode. i'll create a tutorial for this soon. 190 | - Ensure there is a face on each frame of the video. If the face is not detected, process will stop. 191 | 192 | ## ⚠ Noted Constraints 193 | - for speed up process try to keep resolution under 1000x1000px, so use resize factor and upscaling after process. 194 | - If the initial phase is excessively lengthy, consider using the "resize factor" to decrease the video's dimensions. 195 | - While there's no strict size limit for videos, larger videos will require more processing time. It's advisable to employ the "resize factor" to minimize the video size and then upscale the video once processing is complete. 196 | 197 | ## 📖 Troubleshooting 198 | - Mac users: dlib will not install correctly. in requirements.txt, replace "dlib-bin" with "dlib" 199 | 200 | ## 📝 To do 201 | - [ ] Tutorials 202 | - [ ] Convert avi to mp4. Avi is not show in video input but process work fine 203 | - [ ] Add Possibility to use a video for audio input 204 | - [ ] Standalone version 205 | - [ ] ComfyUI intergration 206 | 207 | ## 😎 Contributing 208 | 209 | We welcome contributions to this project. When submitting pull requests, please provide a detailed description of the changes. see [CONTRIBUTING](CONTRIBUTING.md) for more information. 210 | 211 | ## 🙏 Appreciation 212 | - [Wav2Lip](https://github.com/Rudrabha/Wav2Lip) 213 | - [CodeFormer](https://github.com/sczhou/CodeFormer) 214 | - [bark](https://github.com/suno-ai/bark/) 215 | - [facefusion](https://github.com/facefusion/facefusion) 216 | 217 | ## ☕ Support Wav2lip Studio 218 | 219 | this project is open-source effort that is free to use and modify. I rely on the support of users to keep this project going and help improve it. If you'd like to support me, you can make a donation on my Patreon page. Any contribution, large or small, is greatly appreciated! 220 | 221 | Your support helps me cover the costs of development and maintenance, and allows me to allocate more time and resources to enhancing this project. Thank you for your support! 222 | 223 | [patreon page](https://www.patreon.com/Wav2LipStudio) 224 | 225 | ## 📝 Citation 226 | If you use this project in your own work, in articles, tutorials, or presentations, we encourage you to cite this project to acknowledge the efforts put into it. 227 | 228 | To cite this project, please use the following BibTeX format: 229 | 230 | ``` 231 | @misc{wav2lip_uhq, 232 | author = {numz}, 233 | title = {Wav2Lip UHQ}, 234 | year = {2023}, 235 | howpublished = {GitHub repository}, 236 | publisher = {numz}, 237 | url = {https://github.com/numz/sd-wav2lip-uhq} 238 | } 239 | ``` 240 | 241 | ## 📜 License 242 | * The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE). 
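## 🔊 Example: scripting the bark "[split]" step

For readers who want to reproduce the text-to-speech step outside the UI, below is a minimal sketch of the "[split]" behaviour described in the Usage section: the prompt is cut on "[split]", each chunk (which has to stay under bark's roughly 14-second limit) is synthesized on its own, and a short pause is inserted between chunks (a simplification — the extension's own Silence setting works at punctuation level). This is an illustration only, not the extension's `scripts/bark/tts.py`; the helper name, the default speaker id and the output path are assumptions.

```python
# Minimal sketch (not the extension's actual tts.py): long speech with bark by
# honouring the "[split]" separator described in the Usage section.
# Assumptions: bark is installed (see requirements.txt), the speaker id comes from
# scripts/bark/speakers.json, and 0.25 s of silence is placed between chunks.
import numpy as np
from scipy.io import wavfile
from bark import SAMPLE_RATE, generate_audio, preload_models


def text_to_speech(prompt: str, speaker: str = "v2/en_speaker_6",
                   silence_sec: float = 0.25, out_path: str = "speech.wav") -> str:
    preload_models()  # downloads/caches the bark models on first run
    silence = np.zeros(int(silence_sec * SAMPLE_RATE), dtype=np.float32)
    pieces = []
    # Each chunk must stay under bark's ~14 s limit; the user controls this with [split].
    for chunk in (c.strip() for c in prompt.split("[split]") if c.strip()):
        audio = generate_audio(chunk, history_prompt=speaker)
        pieces += [audio.astype(np.float32), silence]
    wavfile.write(out_path, SAMPLE_RATE, np.concatenate(pieces))
    return out_path


# Example: two chunks rendered as a single wav file.
# text_to_speech("This is the first part of my text [split] This is the second part of my text")
```

The resulting wav file can then be dropped straight into the "Speech" input instead of using the built-in bark tab.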
243 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # 🔉👄 Stable Diffusion WebUI Automatic1111 Wav2Lip Studio 扩展插件 2 | 3 | ##
English | 简体中文
4 | 5 | 6 | 7 | https://user-images.githubusercontent.com/800903/262435301-af205a91-30d7-43f2-afcc-05980d581fe0.mp4 8 | 9 | ## 💡 简介 10 | 本代码仓库是适用于Automatic1111的 Wav2Lip UHQ扩展插件。 11 | 12 | 本插件为一体化集成解决方案:只需要一段视频和一段口播音频文件(wav或者mp3),就可以生成一个嘴唇同步的视频。通过Stable Diffusion特别的后处理技术,本插件所生成视频的嘴唇同步效果相比于[Wav2Lip tool](https://github.com/Rudrabha/Wav2Lip)所生成的视频,有更好的质量。 13 | 14 | ![Illustration](https://user-images.githubusercontent.com/800903/267808204-ae971458-9e8d-403e-9e10-9b7b7590d999.png) 15 | 16 | ## 📖 快速索引 17 | * [🚀 更新](#-更新) 18 | * [🔗 必要环境](#-必要环境) 19 | * [💻 安装说明](#-安装说明) 20 | * [🐍 使用方法](#-使用方法) 21 | * [👄 关于bark的保真度说明](#-关于bark的保真度说明) 22 | * [📺 样例](#-样例) 23 | * [📖 后台原理](#-后台原理) 24 | * [💪 提高质量的小提示](#-提高质量的小提示) 25 | * [⚠️需要注意的约束](#-需要注意的约束) 26 | * [📝 即将上线](#-即将上线) 27 | * [😎 贡献](#-贡献) 28 | * [🙏 鸣谢](#-鸣谢) 29 | * [📝 引用](#-引用) 30 | * [📜 版权声明](#-版权声明) 31 | * [☕ 支持Wav2lip Studio](#-支持wav2lip-studio) 32 | 33 | ## 🚀 更新 34 | **2023.09.13** 35 | - 👪 增加了 face swap 换脸: 整合了roop (参加下方使用方法章节) **本功能为实验性功能**。 36 | 37 | **2023.08.22** 38 | - 👄 增加了 [bark](https://github.com/suno-ai/bark/) (参加下方使用方法章节), **本功能为实验性功能**。 39 | 40 | **2023.08.20** 41 | - 🚢 增加了新的面部修复模型选择:GFPGAN。 42 | - ▶ 增加了暂停/恢复功能。 43 | - 📏 优化了释放内存方式,视频生成后会释放内存。 44 | 45 | **2023.08.17** 46 | - 🐛 修复嘴唇发紫的bug 47 | 48 | **2023.08.16** 49 | - ⚡ 除了generated版本的视频,额外输出Wav2lip和enhanced版本的视频,你可以从中选择效果更好的一个版本。 50 | - 🚢 用户界面更新:增加了对CodeFormer Fidelity的控制说明。 51 | - 👄 删除了图片输入方式,因为 [SadTalker](https://github.com/OpenTalker/SadTalker) 的方法更好。 52 | - 🐛 修复了输入和输出视频之间的差异导致遮罩蒙板位置不正确的bug。 53 | - 💪 改进了处理流程,提高了效率。 54 | - 🚫 如果程序在处理过程中,中段还会继续产出视频。 55 | 56 | **2023.08.13** 57 | - ⚡ 加快计算速度。 58 | - 🚢 用户界面更新:添加了一些隐藏参数的设置。 59 | - 👄 提供了“仅追踪嘴巴”的选项。 60 | - 📰 控制debug。 61 | - 🐛 修复了resize factor的bug。 62 | 63 | 64 | ## 🔗 必要环境 65 | 66 | - 最新版本的Stable Diffusion WebUI Automatic1111 [Stable Diffusion Webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) 。 67 | - FFmpeg : 预先安装好FFmpeg,下载地址:[FFmpeg官网](https://ffmpeg.org/download.html)。根据你的操作系统,按照官网说明安装好,注意,FFmpeg要加入环境变量设置,便于在任意目录调用。 68 | 69 | ## 💻 安装说明 70 | 71 | 1. 启动Automatic1111 72 | 2. Face Swap的环境依赖需要编译安装,所以在Windows中,需要安装 [Visual Studio](https://visualstudio.microsoft.com/fr/downloads/). 确保安装时,勾选Python开发环境和C++桌面开发环境包. 73 | 3. 在扩展菜单里, 找到“从网址安装”标签,输入下方URL地址,点击“安装”: 74 | 75 | ![Illustration](https://user-images.githubusercontent.com/800903/258115646-22b4b363-c363-4fc8-b316-c162b61b5d15.png) 76 | 77 | 4. 来到“已安装”标签,点击“应用并重启用户界面”. 78 | 79 | ![Illustration](https://user-images.githubusercontent.com/800903/258115651-196a07bd-ee4b-4aaf-b11e-8e2d1ffaa42f.png) 80 | 81 | 5. 如果您仍然看不到"Wav2Lip UHQ"的菜单,尝试重启Automatic1111. 82 | 83 | 6. 
🔥 十分重要: 必须要下载模型。从下方表格下载全部所需的模型。(要注意模型文件名,确保文件名正确无误,尤其是s3fd模型)。 84 | 85 | | 模型 | 描述 | 地址 | 安装目录 | 86 | |:-------------------:|:----------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------:| 87 | | Wav2Lip | 高精度的唇同步 | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?e=TBFBVW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 88 | | Wav2Lip + GAN | 嘴唇同步稍差,但视觉质量更好 | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA?e=n9ljGW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 89 | | s3fd | 人脸检测预训练模型 | [Link](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) | extensions\sd-wav2lip-uhq\scripts\wav2lip\face_detection\detection\sfd\s3fd.pth | 90 | | landmark predicator | Dlib 68点人脸特征推测 (点击下载按钮) | [Link](https://github.com/numz/wav2lip_uhq/blob/main/predicator/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 91 | | landmark predicator | Dlib 68点人脸特征推测 (备用地址1) | [Link](https://huggingface.co/spaces/asdasdasdasd/Face-forgery-detection/resolve/ccfc24642e0210d4d885bc7b3dbc9a68ed948ad6/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 92 | | landmark predicator | Dlib 68点人脸特征推测 (备用地址2,点击下载按钮) | [Link](https://github.com/italojs/facial-landmarks-recognition/blob/master/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 93 | | face swap model | face swap换脸所用模型 |[Link](https://huggingface.co/ezioruan/inswapper_128.onnx/resolve/main/inswapper_128.onnx) | extensions\sd-wav2lip-uhq\scripts\faceswap\model\inswapper_128.onnx | 94 | 95 | 96 | ## 🐍 使用方法 97 | 1. 上传一个包含人脸的视频文件(avi格式或者mp4格式均可)。如果视频里没有人脸,哪怕只有一帧不包含人脸,会导致处理失败。请注意,如果你上传的是avi文件,在界面上你看不见它,但不用担心,插件会正常处理视频。 98 | 2. 换脸 (耗时很长,需耐心等待): 99 | 1. **Face Swap**: 选择一张照片用来替换视频里的脸。 100 | 2. **Face Index**: 如果照片里有多张脸,可以指定其中一个,0 指的是从左到右数的第一个脸。 101 | 3. 上传一个口播音频文件,现在音频的输入有两种方式: 102 | 1. 跟原来一样,在音频输入区域上传口播音频文件。 103 | 2. 用 [bark](https://github.com/suno-ai/bark/) 插件将文字转成口播语音. 104 | 1. 选择语言 : 土耳其语, 英语, 汉语, 印地语, 意大利语, 日语, 韩语, 葡萄牙语, 俄语, 西班牙语, 波兰语, 德语, 法语 105 | 2. 选择性别 106 | 3. 选择朗读者, 你可以在 "Audio Example(声音样例)" 里试听 107 | 4. 如果你的显卡内存低于16GB,勾选低显存为 True (默认选中) 108 | 5. 将你需要朗读的文本填入 "Prompt" 区域 109 | - **注意** bark生成的一句话只能在14秒以内,如果你的一句话比较长,需要用"[split]"进行分割。 110 | - 例如,假如你一句话大约有30秒,你可以将你的文本写成这样: 111 | - "这是前半段文字 **[split]** 这是后半段文字" 112 | 6. Temperature: 靠近0.0接近原声, 靠近1.0让AI发挥创意, 但现实情况是but in reality, 0.0会感觉有点奇怪,1.0更原声相差甚远。bark设置了0.7为默认值,你可以自行微调以达到效果更佳。 113 | 7. Silence(停顿) : 在遇到标点符号(。!!.??,)时的停顿时间. 默认值是0.25秒. 114 | 8. 关于更多Bard的有关细节,可查看 Bark [文档](https://github.com/suno-ai/bark/) . 115 | 9. 下列为已知一些支持的非说话的声音(但有时候没反应). 116 | - [laughter] 大笑 117 | - [laughs] 微笑 118 | - [sighs] 叹气 119 | - [music] 音乐 120 | - [gasps] 喘气 121 | - [clears throat] 清嗓 122 | - "-" or ... 犹豫停顿 123 | - ♪ 歌词 124 | - 大写时用于强调 125 | - 可以在提示词里单独写[MAN] 或者 [WOMAN]可无视朗读者的选择,将文本用指性别的朗读者 126 | 4. 选择模型 (详见上方表格). 127 | 5. **Padding(填充位移)**: Wav2Lip用它来移动嘴巴位置。如果嘴巴位置不理想,可以用它来微调,通常情况下,不必刻意调整。 128 | 6. 
**No Smooth(不要平滑)**: 当勾选该选项,将会保持原始嘴部形状不做平滑处理。 129 | 7. **Resize Factor(调整大小)**: 该选项会对视频的分辨率进行调整。默认值是1,如果你需要降低你的视频分辨率,你可以调整它。 130 | 8. **Only Mouth(仅追踪嘴巴)**: 选中该选项,将仅对嘴部进行追踪,这将会移除例如脸颊和下巴的动作。 131 | 9. **Mouth Mask Dilate(嘴部遮罩蒙板扩张)**: 该选项用于调整嘴巴覆盖区域,参数越大,覆盖面积越大,根据嘴巴的大小来作出调整。 132 | 10. **Face Mask Erode(面部遮罩蒙板侵蚀)**: 对脸部外延区域进行渗透侵蚀处理,根据脸型大小作出调整。 133 | 11. **Mask Blur(遮罩模糊)**: 通过对遮罩层进行模糊处理,使其变得更平滑,建议尽量使该参数小于等于 **Mouth Mask Dilate(嘴部遮罩蒙板扩张)** 参数. 134 | 12. **Code Former Fidelity(Code Former保真度)**: 135 | 1. 当该参数偏向0时,虽然有更高的画质,但可能会引起人物外观特征改变,以及画面闪烁。 136 | 2. 当该参数偏向1时,虽然降低了画质,但是能更大程度的保留原来人物的外观特征,以及降低画面闪烁。 137 | 3. 不建议该参数低于0.5。为了达到良好的效果,建议在0.75左右进行调整。 138 | 13. **Active debug(启用debug模式)**: 开启该选项,将会在debug目录里逐步执行来生成图片。 139 | 14. 点击“Generate”(生成)按钮。 140 | 141 | ## 演示教程 142 | 143 | | 链接 | 语言 | 144 | |:----:|:----:| 145 | |[哔哩哔哩](https://www.bilibili.com/video/BV1J94y1r7Xc/)|中文| 146 | |[抖音](https://v.douyin.com/iJtQVU51/)|中文| 147 | |[Youtube](https://youtu.be/9M-IzuxlFRU)|中文| 148 | 149 | 150 | 151 | ## 👄 关于bark的保真度说明 152 | Bark十分有趣,但它有时候输出的声音十分奇怪(甚至有点搞笑)。每次生成的结果都有些许不同,你可能需要多生成几次以达到你想要的结果。 153 | 除了英语,其他语言的采样似乎并非来自本土母语,听起来有点像外国人在说话。有时候选男,但听起来有点像女,反之亦然。甚至有时候当你选了某一个声音,但听起来像另外一个声音或者另一种语音。 154 | 155 | ## 📺 样例 156 | 157 | https://user-images.githubusercontent.com/800903/262439441-bb9d888a-d33e-4246-9f0a-1ddeac062d35.mp4 158 | 159 | https://user-images.githubusercontent.com/800903/262442794-61b1e32f-3f87-4b36-98d6-f711822bdb1e.mp4 160 | 161 | https://user-images.githubusercontent.com/800903/262449305-901086a3-22cb-42d2-b5be-a5f38db4549a.mp4 162 | 163 | https://user-images.githubusercontent.com/800903/267808494-300f8cc3-9136-4810-86e2-92f2114a5f9a.mp4 164 | 165 | ## 📖 后台原理 166 | 167 | 本扩展分几个流程运行,以此达到提高Wav2Lip生成的视频的质量的效果: 168 | 169 | 1. **Generate face swap video**: 如果face swap提供了需要替换脸部的图片,那会先将原始视频进行脸部替换,该操作耗时很长,需耐心等待。 170 | 2. **Generate a Wav2lip video(生成Wav2lip视频)**: 该脚本先使用输入的视频和音频生成低质量的Wav2Lip视频。 171 | 3. **Video Quality Enhancement(视频质量增强)**: 根据用户选择的面部修复模型来将低清视频转化成高清的视频。 172 | 4. **Mask Creation(创建遮罩蒙板)**: 该脚本在嘴巴周围制作了一个遮罩蒙板,并试图保持其他面部动作,比如脸颊和下巴的动作。 173 | 5. **Video Generation(生成视频)***: 该脚本会获取高质量的嘴巴图像,并将其覆盖在由嘴部遮罩引导的原始图像上。 174 | 6. **Video Post Processing(后期合成)**: 该脚本调用ffmpeg生成最终版本的视频。 175 | 176 | ## 💪 提高质量的小提示 177 | - 使用高质量的视频作为输入源 178 | - 使用常见FPS(譬如24fps、25fps、30fps、60fps)的视频,如果不是常见的FPS,偶尔会出现一些问题,譬如面部遮罩蒙板处理。 179 | - 使用高质量的音频源文件,不要有音乐,不要有背景白噪声。使用类似 [https://podcast.adobe.com/enhance](https://podcast.adobe.com/enhance) 的工具清除背景音乐。 180 | - 扩大嘴部遮罩蒙板范围。这将有助于模型保留一些面部动作,并盖住原来的嘴巴。 181 | - “遮罩模糊”(Mask Blur)的最大值是“嘴部遮罩蒙板扩张”(Mouth Mask Dilate)值的两倍。如果要增加模糊度,请增加“嘴部遮罩蒙板扩张”的值,否则嘴巴将变得模糊,并且可以看到下面的嘴巴。 182 | - 高清放大有利于提高质量,尤其是在嘴巴周围。但是,它将使处理时间变长。你可以参考Olivio Sarikas的教程来高清放大处理你的视频: [https://www.youtube.com/watch?v=3z4MKUqFEUk](https://www.youtube.com/watch?v=3z4MKUqFEUk). 确保去噪强度设置在0.0和0.05之间,选择“revAnimated”模型,并使用批处理模式。回头我再补个简单的教程说明。 183 | - 确保视频的每一帧上都有一张脸。如果未检测到人脸,插件将停止运行。 184 | 185 | 186 | ## ⚠ 需要注意的约束 187 | - 目前的模型对胡须并不友好. 
188 | - 如果初始化阶段过长,请考虑使用调整“Resize Factor”来减小视频分辨率的大小。 189 | - 虽然对原始视频没有严格的大小限制,但较大的视频需要更多的处理时间。建议使用“调整大小因子”来最小化视频大小,然后在处理完成后升级视频。 190 | 191 | ## 📖 故障排除 192 | - Mac用户: dlib会报无法安装,在requirements.txt文件里,找到”dlib-bin“,并将其替换成"dlib" 193 | 194 | ## 📝 即将上线 195 | - [ ] 教程指引 196 | - [ ] 将avi转mp4(目前avi文件在输入框不显示,但依然能正常工作) 197 | - [ ] 可用视频文件作为音频的输入 198 | - [ ] 独立版本 199 | - [ ] ComfyUI 插件 200 | 201 | ## 😎 贡献 202 | 203 | 我们欢迎各位对本项目的贡献提交。提交合并提取请求(Pull requests)时,请提供更改的详细说明。 详参 [CONTRIBUTING](CONTRIBUTING.md) 。 204 | 205 | ## 🙏 鸣谢 206 | - [Wav2Lip](https://github.com/Rudrabha/Wav2Lip) 207 | - [CodeFormer](https://github.com/sczhou/CodeFormer) 208 | - [bark](https://github.com/suno-ai/bark/) 209 | - [roop](https://github.com/s0md3v/sd-webui-roop) 210 | 211 | ## ☕ 支持Wav2lip Studio 212 | 213 | 该项目是开源项目,可以免费使用和修改。项目的持续改进,依靠用户的支持。如果您喜欢本项目,愿意支持我,可以在我的Patreon页面上捐款。捐献无论大小,我们都将不胜感激! 214 | 215 | 您的捐献将帮助我分担了开发和维护的成本,并让我能够分配更多的时间和资源来增强这个项目。感谢您的支持! 216 | 217 | [patreon页面](https://www.patreon.com/Wav2LipStudio) 218 | 219 | ## 📝 引用 220 | 如果您在工作、发表文章、教程或演示中使用到了项目,我们非常鼓励您引用此项目。 221 | 222 | 如需引证本项目,请考虑使用如下BibTeX格式: 223 | 224 | ``` 225 | @misc{wav2lip_uhq, 226 | author = {numz}, 227 | title = {Wav2Lip UHQ}, 228 | year = {2023}, 229 | howpublished = {GitHub repository}, 230 | publisher = {numz}, 231 | url = {https://github.com/numz/sd-wav2lip-uhq} 232 | } 233 | ``` 234 | 235 | ## 📜 版权声明 236 | * 此代码仓库中的代码是根据MIT许可协议发布的。 [LICENSE file](LICENSE). 237 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | import launch 2 | import os 3 | import platform 4 | 5 | system = platform.system() 6 | 7 | req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt") 8 | 9 | with open(req_file) as file: 10 | for lib in file: 11 | lib = lib.strip() 12 | if lib == "dlib-bin" and system == "Darwin": 13 | lib = "dlib" # replace dlib-bin as dlib 14 | if lib == "onnxruntime-gpu==1.15.0" and system == "Darwin": 15 | continue # skip onnxruntime-gpu 16 | if not launch.is_installed(lib): 17 | launch.run_pip(f"install {lib}", f"wav2lip_uhq requirement: {lib}") 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | imutils 2 | dlib-bin 3 | numpy 4 | opencv-python 5 | scipy 6 | requests 7 | pillow 8 | librosa==0.10.0.post2 9 | opencv-contrib-python 10 | tqdm 11 | numba 12 | imutils 13 | imageio_ffmpeg 14 | git+https://github.com/suno-ai/bark.git 15 | insightface==0.7.3 16 | onnx==1.14.0 17 | onnxruntime==1.15.0 18 | onnxruntime-gpu==1.15.0 19 | opencv-python>=4.8.0 20 | ifnude -------------------------------------------------------------------------------- /scripts/bark/speakers.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "Speaker 0 (EN)", 4 | "id": "v2/en_speaker_0", 5 | "language": "English", 6 | "gender": "Male", 7 | "quality": null, 8 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_0.mp3", 9 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_0.mp3" 10 | }, 11 | { 12 | "name": "Speaker 1 (EN)", 13 | "id": "v2/en_speaker_1", 14 | "language": "English", 15 | "gender": "Male", 16 | "quality": null, 17 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_1.mp3", 18 | "continuation_audio": 
"https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_1.mp3" 19 | }, 20 | { 21 | "name": "Speaker 2 (EN)", 22 | "id": "v2/en_speaker_2", 23 | "language": "English", 24 | "gender": "Male", 25 | "quality": null, 26 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_2.mp3", 27 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_2.mp3" 28 | }, 29 | { 30 | "name": "Speaker 3 (EN)", 31 | "id": "v2/en_speaker_3", 32 | "language": "English", 33 | "gender": "Male", 34 | "quality": null, 35 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_3.mp3", 36 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_3.mp3" 37 | }, 38 | { 39 | "name": "Speaker 4 (EN)", 40 | "id": "v2/en_speaker_4", 41 | "language": "English", 42 | "gender": "Male", 43 | "quality": null, 44 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_4.mp3", 45 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_4.mp3" 46 | }, 47 | { 48 | "name": "Speaker 5 (EN)", 49 | "id": "v2/en_speaker_5", 50 | "language": "English", 51 | "gender": "Male", 52 | "quality": "Grainy", 53 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_5.mp3", 54 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_5.mp3" 55 | }, 56 | { 57 | "name": "Speaker 6 (EN)", 58 | "id": "v2/en_speaker_6", 59 | "language": "English", 60 | "gender": "Male", 61 | "quality": "Suno Favorite", 62 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_6.mp3", 63 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_6.mp3" 64 | }, 65 | { 66 | "name": "Speaker 7 (EN)", 67 | "id": "v2/en_speaker_7", 68 | "language": "English", 69 | "gender": "Male", 70 | "quality": null, 71 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_7.mp3", 72 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_7.mp3" 73 | }, 74 | { 75 | "name": "Speaker 8 (EN)", 76 | "id": "v2/en_speaker_8", 77 | "language": "English", 78 | "gender": "Male", 79 | "quality": null, 80 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_8.mp3", 81 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_8.mp3" 82 | }, 83 | { 84 | "name": "Speaker 9 (EN)", 85 | "id": "v2/en_speaker_9", 86 | "language": "English", 87 | "gender": "Female", 88 | "quality": null, 89 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_9.mp3", 90 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_9.mp3" 91 | }, 92 | { 93 | "name": "Speaker 0 (ZH)", 94 | "id": "v2/zh_speaker_0", 95 | "language": "Chinese", 96 | "gender": "Male", 97 | "quality": null, 98 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_0.mp3", 99 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_0.mp3" 100 | }, 101 | { 102 | "name": "Speaker 1 (ZH)", 103 | "id": "v2/zh_speaker_1", 104 | "language": "Chinese", 105 | "gender": "Male", 106 | "quality": null, 107 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_1.mp3", 108 | "continuation_audio": 
"https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_1.mp3" 109 | }, 110 | { 111 | "name": "Speaker 2 (ZH)", 112 | "id": "v2/zh_speaker_2", 113 | "language": "Chinese", 114 | "gender": "Male", 115 | "quality": null, 116 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_2.mp3", 117 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_2.mp3" 118 | }, 119 | { 120 | "name": "Speaker 3 (ZH)", 121 | "id": "v2/zh_speaker_3", 122 | "language": "Chinese", 123 | "gender": "Male", 124 | "quality": null, 125 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_3.mp3", 126 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_3.mp3" 127 | }, 128 | { 129 | "name": "Speaker 4 (ZH)", 130 | "id": "v2/zh_speaker_4", 131 | "language": "Chinese", 132 | "gender": "Female", 133 | "quality": null, 134 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_4.mp3", 135 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_4.mp3" 136 | }, 137 | { 138 | "name": "Speaker 5 (ZH)", 139 | "id": "v2/zh_speaker_5", 140 | "language": "Chinese", 141 | "gender": "Male", 142 | "quality": null, 143 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_5.mp3", 144 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_5.mp3" 145 | }, 146 | { 147 | "name": "Speaker 6 (ZH)", 148 | "id": "v2/zh_speaker_6", 149 | "language": "Chinese", 150 | "gender": "Female", 151 | "quality": "Background Noise", 152 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_6.mp3", 153 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_6.mp3" 154 | }, 155 | { 156 | "name": "Speaker 7 (ZH)", 157 | "id": "v2/zh_speaker_7", 158 | "language": "Chinese", 159 | "gender": "Female", 160 | "quality": null, 161 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_7.mp3", 162 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_7.mp3" 163 | }, 164 | { 165 | "name": "Speaker 8 (ZH)", 166 | "id": "v2/zh_speaker_8", 167 | "language": "Chinese", 168 | "gender": "Male", 169 | "quality": null, 170 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_8.mp3", 171 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_8.mp3" 172 | }, 173 | { 174 | "name": "Speaker 9 (ZH)", 175 | "id": "v2/zh_speaker_9", 176 | "language": "Chinese", 177 | "gender": "Female", 178 | "quality": null, 179 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_9.mp3", 180 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_9.mp3" 181 | }, 182 | { 183 | "name": "Speaker 0 (FR)", 184 | "id": "v2/fr_speaker_0", 185 | "language": "French", 186 | "gender": "Male", 187 | "quality": null, 188 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_0.mp3", 189 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_0.mp3" 190 | }, 191 | { 192 | "name": "Speaker 1 (FR)", 193 | "id": "v2/fr_speaker_1", 194 | "language": "French", 195 | "gender": "Female", 196 | "quality": null, 197 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_1.mp3", 198 | 
"continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_1.mp3" 199 | }, 200 | { 201 | "name": "Speaker 2 (FR)", 202 | "id": "v2/fr_speaker_2", 203 | "language": "French", 204 | "gender": "Female", 205 | "quality": null, 206 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_2.mp3", 207 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_2.mp3" 208 | }, 209 | { 210 | "name": "Speaker 3 (FR)", 211 | "id": "v2/fr_speaker_3", 212 | "language": "French", 213 | "gender": "Male", 214 | "quality": null, 215 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_3.mp3", 216 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_3.mp3" 217 | }, 218 | { 219 | "name": "Speaker 4 (FR)", 220 | "id": "v2/fr_speaker_4", 221 | "language": "French", 222 | "gender": "Male", 223 | "quality": null, 224 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_4.mp3", 225 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_4.mp3" 226 | }, 227 | { 228 | "name": "Speaker 5 (FR)", 229 | "id": "v2/fr_speaker_5", 230 | "language": "French", 231 | "gender": "Female", 232 | "quality": null, 233 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_5.mp3", 234 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_5.mp3" 235 | }, 236 | { 237 | "name": "Speaker 6 (FR)", 238 | "id": "v2/fr_speaker_6", 239 | "language": "French", 240 | "gender": "Male", 241 | "quality": null, 242 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_6.mp3", 243 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_6.mp3" 244 | }, 245 | { 246 | "name": "Speaker 7 (FR)", 247 | "id": "v2/fr_speaker_7", 248 | "language": "French", 249 | "gender": "Male", 250 | "quality": null, 251 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_7.mp3", 252 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_7.mp3" 253 | }, 254 | { 255 | "name": "Speaker 8 (FR)", 256 | "id": "v2/fr_speaker_8", 257 | "language": "French", 258 | "gender": "Male", 259 | "quality": null, 260 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_8.mp3", 261 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_8.mp3" 262 | }, 263 | { 264 | "name": "Speaker 9 (FR)", 265 | "id": "v2/fr_speaker_9", 266 | "language": "French", 267 | "gender": "Male", 268 | "quality": "Auditorium", 269 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_9.mp3", 270 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_9.mp3" 271 | }, 272 | { 273 | "name": "Speaker 0 (DE)", 274 | "id": "v2/de_speaker_0", 275 | "language": "German", 276 | "gender": "Male", 277 | "quality": null, 278 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_0.mp3", 279 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_0.mp3" 280 | }, 281 | { 282 | "name": "Speaker 1 (DE)", 283 | "id": "v2/de_speaker_1", 284 | "language": "German", 285 | "gender": "Male", 286 | "quality": null, 287 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_1.mp3", 288 | 
"continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_1.mp3" 289 | }, 290 | { 291 | "name": "Speaker 2 (DE)", 292 | "id": "v2/de_speaker_2", 293 | "language": "German", 294 | "gender": "Male", 295 | "quality": null, 296 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_2.mp3", 297 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_2.mp3" 298 | }, 299 | { 300 | "name": "Speaker 3 (DE)", 301 | "id": "v2/de_speaker_3", 302 | "language": "German", 303 | "gender": "Female", 304 | "quality": null, 305 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_3.mp3", 306 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_3.mp3" 307 | }, 308 | { 309 | "name": "Speaker 4 (DE)", 310 | "id": "v2/de_speaker_4", 311 | "language": "German", 312 | "gender": "Male", 313 | "quality": null, 314 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_4.mp3", 315 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_4.mp3" 316 | }, 317 | { 318 | "name": "Speaker 5 (DE)", 319 | "id": "v2/de_speaker_5", 320 | "language": "German", 321 | "gender": "Male", 322 | "quality": null, 323 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_5.mp3", 324 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_5.mp3" 325 | }, 326 | { 327 | "name": "Speaker 6 (DE)", 328 | "id": "v2/de_speaker_6", 329 | "language": "German", 330 | "gender": "Male", 331 | "quality": null, 332 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_6.mp3", 333 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_6.mp3" 334 | }, 335 | { 336 | "name": "Speaker 7 (DE)", 337 | "id": "v2/de_speaker_7", 338 | "language": "German", 339 | "gender": "Male", 340 | "quality": null, 341 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_7.mp3", 342 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_7.mp3" 343 | }, 344 | { 345 | "name": "Speaker 8 (DE)", 346 | "id": "v2/de_speaker_8", 347 | "language": "German", 348 | "gender": "Female", 349 | "quality": null, 350 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_8.mp3", 351 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_8.mp3" 352 | }, 353 | { 354 | "name": "Speaker 9 (DE)", 355 | "id": "v2/de_speaker_9", 356 | "language": "German", 357 | "gender": "Male", 358 | "quality": null, 359 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_9.mp3", 360 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_9.mp3" 361 | }, 362 | { 363 | "name": "Speaker 0 (HI)", 364 | "id": "v2/hi_speaker_0", 365 | "language": "Hindi", 366 | "gender": "Female", 367 | "quality": null, 368 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_0.mp3", 369 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_0.mp3" 370 | }, 371 | { 372 | "name": "Speaker 1 (HI)", 373 | "id": "v2/hi_speaker_1", 374 | "language": "Hindi", 375 | "gender": "Female", 376 | "quality": "Background Noise", 377 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_1.mp3", 
378 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_1.mp3" 379 | }, 380 | { 381 | "name": "Speaker 2 (HI)", 382 | "id": "v2/hi_speaker_2", 383 | "language": "Hindi", 384 | "gender": "Male", 385 | "quality": null, 386 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_2.mp3", 387 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_2.mp3" 388 | }, 389 | { 390 | "name": "Speaker 3 (HI)", 391 | "id": "v2/hi_speaker_3", 392 | "language": "Hindi", 393 | "gender": "Female", 394 | "quality": null, 395 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_3.mp3", 396 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_3.mp3" 397 | }, 398 | { 399 | "name": "Speaker 4 (HI)", 400 | "id": "v2/hi_speaker_4", 401 | "language": "Hindi", 402 | "gender": "Female", 403 | "quality": "Background Noise", 404 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_4.mp3", 405 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_4.mp3" 406 | }, 407 | { 408 | "name": "Speaker 5 (HI)", 409 | "id": "v2/hi_speaker_5", 410 | "language": "Hindi", 411 | "gender": "Male", 412 | "quality": null, 413 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_5.mp3", 414 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_5.mp3" 415 | }, 416 | { 417 | "name": "Speaker 6 (HI)", 418 | "id": "v2/hi_speaker_6", 419 | "language": "Hindi", 420 | "gender": "Male", 421 | "quality": null, 422 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_6.mp3", 423 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_6.mp3" 424 | }, 425 | { 426 | "name": "Speaker 7 (HI)", 427 | "id": "v2/hi_speaker_7", 428 | "language": "Hindi", 429 | "gender": "Male", 430 | "quality": null, 431 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_7.mp3", 432 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_7.mp3" 433 | }, 434 | { 435 | "name": "Speaker 8 (HI)", 436 | "id": "v2/hi_speaker_8", 437 | "language": "Hindi", 438 | "gender": "Male", 439 | "quality": null, 440 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_8.mp3", 441 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_8.mp3" 442 | }, 443 | { 444 | "name": "Speaker 9 (HI)", 445 | "id": "v2/hi_speaker_9", 446 | "language": "Hindi", 447 | "gender": "Female", 448 | "quality": null, 449 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_9.mp3", 450 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_9.mp3" 451 | }, 452 | { 453 | "name": "Speaker 0 (IT)", 454 | "id": "v2/it_speaker_0", 455 | "language": "Italian", 456 | "gender": "Male", 457 | "quality": null, 458 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_0.mp3", 459 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_0.mp3" 460 | }, 461 | { 462 | "name": "Speaker 1 (IT)", 463 | "id": "v2/it_speaker_1", 464 | "language": "Italian", 465 | "gender": "Male", 466 | "quality": null, 467 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_1.mp3", 
468 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_1.mp3" 469 | }, 470 | { 471 | "name": "Speaker 2 (IT)", 472 | "id": "v2/it_speaker_2", 473 | "language": "Italian", 474 | "gender": "Female", 475 | "quality": null, 476 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_2.mp3", 477 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_2.mp3" 478 | }, 479 | { 480 | "name": "Speaker 3 (IT)", 481 | "id": "v2/it_speaker_3", 482 | "language": "Italian", 483 | "gender": "Male", 484 | "quality": null, 485 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_3.mp3", 486 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_3.mp3" 487 | }, 488 | { 489 | "name": "Speaker 4 (IT)", 490 | "id": "v2/it_speaker_4", 491 | "language": "Italian", 492 | "gender": "Male", 493 | "quality": "Suno Favorite", 494 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_4.mp3", 495 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_4.mp3" 496 | }, 497 | { 498 | "name": "Speaker 5 (IT)", 499 | "id": "v2/it_speaker_5", 500 | "language": "Italian", 501 | "gender": "Male", 502 | "quality": null, 503 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_5.mp3", 504 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_5.mp3" 505 | }, 506 | { 507 | "name": "Speaker 6 (IT)", 508 | "id": "v2/it_speaker_6", 509 | "language": "Italian", 510 | "gender": "Male", 511 | "quality": null, 512 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_6.mp3", 513 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_6.mp3" 514 | }, 515 | { 516 | "name": "Speaker 7 (IT)", 517 | "id": "v2/it_speaker_7", 518 | "language": "Italian", 519 | "gender": "Female", 520 | "quality": null, 521 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_7.mp3", 522 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_7.mp3" 523 | }, 524 | { 525 | "name": "Speaker 8 (IT)", 526 | "id": "v2/it_speaker_8", 527 | "language": "Italian", 528 | "gender": "Male", 529 | "quality": null, 530 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_8.mp3", 531 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_8.mp3" 532 | }, 533 | { 534 | "name": "Speaker 9 (IT)", 535 | "id": "v2/it_speaker_9", 536 | "language": "Italian", 537 | "gender": "Female", 538 | "quality": null, 539 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_9.mp3", 540 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_9.mp3" 541 | }, 542 | { 543 | "name": "Speaker 0 (JA)", 544 | "id": "v2/ja_speaker_0", 545 | "language": "Japanese", 546 | "gender": "Female", 547 | "quality": null, 548 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_0.mp3", 549 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_0.mp3" 550 | }, 551 | { 552 | "name": "Speaker 1 (JA)", 553 | "id": "v2/ja_speaker_1", 554 | "language": "Japanese", 555 | "gender": "Female", 556 | "quality": "Background Noise", 557 | "prompt_audio": 
"https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_1.mp3", 558 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_1.mp3" 559 | }, 560 | { 561 | "name": "Speaker 2 (JA)", 562 | "id": "v2/ja_speaker_2", 563 | "language": "Japanese", 564 | "gender": "Male", 565 | "quality": null, 566 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_2.mp3", 567 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_2.mp3" 568 | }, 569 | { 570 | "name": "Speaker 3 (JA)", 571 | "id": "v2/ja_speaker_3", 572 | "language": "Japanese", 573 | "gender": "Female", 574 | "quality": null, 575 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_3.mp3", 576 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_3.mp3" 577 | }, 578 | { 579 | "name": "Speaker 4 (JA)", 580 | "id": "v2/ja_speaker_4", 581 | "language": "Japanese", 582 | "gender": "Female", 583 | "quality": null, 584 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_4.mp3", 585 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_4.mp3" 586 | }, 587 | { 588 | "name": "Speaker 5 (JA)", 589 | "id": "v2/ja_speaker_5", 590 | "language": "Japanese", 591 | "gender": "Female", 592 | "quality": null, 593 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_5.mp3", 594 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_5.mp3" 595 | }, 596 | { 597 | "name": "Speaker 6 (JA)", 598 | "id": "v2/ja_speaker_6", 599 | "language": "Japanese", 600 | "gender": "Male", 601 | "quality": null, 602 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_6.mp3", 603 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_6.mp3" 604 | }, 605 | { 606 | "name": "Speaker 7 (JA)", 607 | "id": "v2/ja_speaker_7", 608 | "language": "Japanese", 609 | "gender": "Female", 610 | "quality": null, 611 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_7.mp3", 612 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_7.mp3" 613 | }, 614 | { 615 | "name": "Speaker 8 (JA)", 616 | "id": "v2/ja_speaker_8", 617 | "language": "Japanese", 618 | "gender": "Female", 619 | "quality": null, 620 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_8.mp3", 621 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_8.mp3" 622 | }, 623 | { 624 | "name": "Speaker 9 (JA)", 625 | "id": "v2/ja_speaker_9", 626 | "language": "Japanese", 627 | "gender": "Female", 628 | "quality": null, 629 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_9.mp3", 630 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_9.mp3" 631 | }, 632 | { 633 | "name": "Speaker 0 (KO)", 634 | "id": "v2/ko_speaker_0", 635 | "language": "Korean", 636 | "gender": "Female", 637 | "quality": null, 638 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_0.mp3", 639 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_0.mp3" 640 | }, 641 | { 642 | "name": "Speaker 1 (KO)", 643 | "id": "v2/ko_speaker_1", 644 | "language": "Korean", 645 | "gender": "Male", 646 | "quality": null, 647 | 
"prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_1.mp3", 648 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_1.mp3" 649 | }, 650 | { 651 | "name": "Speaker 2 (KO)", 652 | "id": "v2/ko_speaker_2", 653 | "language": "Korean", 654 | "gender": "Male", 655 | "quality": null, 656 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_2.mp3", 657 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_2.mp3" 658 | }, 659 | { 660 | "name": "Speaker 3 (KO)", 661 | "id": "v2/ko_speaker_3", 662 | "language": "Korean", 663 | "gender": "Male", 664 | "quality": null, 665 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_3.mp3", 666 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_3.mp3" 667 | }, 668 | { 669 | "name": "Speaker 4 (KO)", 670 | "id": "v2/ko_speaker_4", 671 | "language": "Korean", 672 | "gender": "Male", 673 | "quality": null, 674 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_4.mp3", 675 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_4.mp3" 676 | }, 677 | { 678 | "name": "Speaker 5 (KO)", 679 | "id": "v2/ko_speaker_5", 680 | "language": "Korean", 681 | "gender": "Male", 682 | "quality": null, 683 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_5.mp3", 684 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_5.mp3" 685 | }, 686 | { 687 | "name": "Speaker 6 (KO)", 688 | "id": "v2/ko_speaker_6", 689 | "language": "Korean", 690 | "gender": "Male", 691 | "quality": null, 692 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_6.mp3", 693 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_6.mp3" 694 | }, 695 | { 696 | "name": "Speaker 7 (KO)", 697 | "id": "v2/ko_speaker_7", 698 | "language": "Korean", 699 | "gender": "Male", 700 | "quality": null, 701 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_7.mp3", 702 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_7.mp3" 703 | }, 704 | { 705 | "name": "Speaker 8 (KO)", 706 | "id": "v2/ko_speaker_8", 707 | "language": "Korean", 708 | "gender": "Male", 709 | "quality": null, 710 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_8.mp3", 711 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_8.mp3" 712 | }, 713 | { 714 | "name": "Speaker 9 (KO)", 715 | "id": "v2/ko_speaker_9", 716 | "language": "Korean", 717 | "gender": "Male", 718 | "quality": null, 719 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_9.mp3", 720 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_9.mp3" 721 | }, 722 | { 723 | "name": "Speaker 0 (PL)", 724 | "id": "v2/pl_speaker_0", 725 | "language": "Polish", 726 | "gender": "Male", 727 | "quality": null, 728 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_0.mp3", 729 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_0.mp3" 730 | }, 731 | { 732 | "name": "Speaker 1 (PL)", 733 | "id": "v2/pl_speaker_1", 734 | "language": "Polish", 735 | "gender": "Male", 736 | "quality": null, 737 | 
"prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_1.mp3", 738 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_1.mp3" 739 | }, 740 | { 741 | "name": "Speaker 2 (PL)", 742 | "id": "v2/pl_speaker_2", 743 | "language": "Polish", 744 | "gender": "Male", 745 | "quality": null, 746 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_2.mp3", 747 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_2.mp3" 748 | }, 749 | { 750 | "name": "Speaker 3 (PL)", 751 | "id": "v2/pl_speaker_3", 752 | "language": "Polish", 753 | "gender": "Male", 754 | "quality": null, 755 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_3.mp3", 756 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_3.mp3" 757 | }, 758 | { 759 | "name": "Speaker 4 (PL)", 760 | "id": "v2/pl_speaker_4", 761 | "language": "Polish", 762 | "gender": "Female", 763 | "quality": null, 764 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_4.mp3", 765 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_4.mp3" 766 | }, 767 | { 768 | "name": "Speaker 5 (PL)", 769 | "id": "v2/pl_speaker_5", 770 | "language": "Polish", 771 | "gender": "Male", 772 | "quality": null, 773 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_5.mp3", 774 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_5.mp3" 775 | }, 776 | { 777 | "name": "Speaker 6 (PL)", 778 | "id": "v2/pl_speaker_6", 779 | "language": "Polish", 780 | "gender": "Female", 781 | "quality": null, 782 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_6.mp3", 783 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_6.mp3" 784 | }, 785 | { 786 | "name": "Speaker 7 (PL)", 787 | "id": "v2/pl_speaker_7", 788 | "language": "Polish", 789 | "gender": "Male", 790 | "quality": null, 791 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_7.mp3", 792 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_7.mp3" 793 | }, 794 | { 795 | "name": "Speaker 8 (PL)", 796 | "id": "v2/pl_speaker_8", 797 | "language": "Polish", 798 | "gender": "Male", 799 | "quality": null, 800 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_8.mp3", 801 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_8.mp3" 802 | }, 803 | { 804 | "name": "Speaker 9 (PL)", 805 | "id": "v2/pl_speaker_9", 806 | "language": "Polish", 807 | "gender": "Female", 808 | "quality": null, 809 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_9.mp3", 810 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_9.mp3" 811 | }, 812 | { 813 | "name": "Speaker 0 (PT)", 814 | "id": "v2/pt_speaker_0", 815 | "language": "Portuguese", 816 | "gender": "Male", 817 | "quality": null, 818 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_0.mp3", 819 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_0.mp3" 820 | }, 821 | { 822 | "name": "Speaker 1 (PT)", 823 | "id": "v2/pt_speaker_1", 824 | "language": "Portuguese", 825 | "gender": "Male", 826 | "quality": null, 827 | 
"prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_1.mp3", 828 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_1.mp3" 829 | }, 830 | { 831 | "name": "Speaker 2 (PT)", 832 | "id": "v2/pt_speaker_2", 833 | "language": "Portuguese", 834 | "gender": "Male", 835 | "quality": null, 836 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_2.mp3", 837 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_2.mp3" 838 | }, 839 | { 840 | "name": "Speaker 3 (PT)", 841 | "id": "v2/pt_speaker_3", 842 | "language": "Portuguese", 843 | "gender": "Male", 844 | "quality": null, 845 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_3.mp3", 846 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_3.mp3" 847 | }, 848 | { 849 | "name": "Speaker 4 (PT)", 850 | "id": "v2/pt_speaker_4", 851 | "language": "Portuguese", 852 | "gender": "Male", 853 | "quality": null, 854 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_4.mp3", 855 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_4.mp3" 856 | }, 857 | { 858 | "name": "Speaker 5 (PT)", 859 | "id": "v2/pt_speaker_5", 860 | "language": "Portuguese", 861 | "gender": "Male", 862 | "quality": null, 863 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_5.mp3", 864 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_5.mp3" 865 | }, 866 | { 867 | "name": "Speaker 6 (PT)", 868 | "id": "v2/pt_speaker_6", 869 | "language": "Portuguese", 870 | "gender": "Male", 871 | "quality": "Background Noise", 872 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_6.mp3", 873 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_6.mp3" 874 | }, 875 | { 876 | "name": "Speaker 7 (PT)", 877 | "id": "v2/pt_speaker_7", 878 | "language": "Portuguese", 879 | "gender": "Male", 880 | "quality": null, 881 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_7.mp3", 882 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_7.mp3" 883 | }, 884 | { 885 | "name": "Speaker 8 (PT)", 886 | "id": "v2/pt_speaker_8", 887 | "language": "Portuguese", 888 | "gender": "Male", 889 | "quality": null, 890 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_8.mp3", 891 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_8.mp3" 892 | }, 893 | { 894 | "name": "Speaker 9 (PT)", 895 | "id": "v2/pt_speaker_9", 896 | "language": "Portuguese", 897 | "gender": "Male", 898 | "quality": null, 899 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_9.mp3", 900 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_9.mp3" 901 | }, 902 | { 903 | "name": "Speaker 0 (RU)", 904 | "id": "v2/ru_speaker_0", 905 | "language": "Russian", 906 | "gender": "Male", 907 | "quality": null, 908 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_0.mp3", 909 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_0.mp3" 910 | }, 911 | { 912 | "name": "Speaker 1 (RU)", 913 | "id": "v2/ru_speaker_1", 914 | "language": "Russian", 915 | "gender": 
"Male", 916 | "quality": "Echoes", 917 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_1.mp3", 918 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_1.mp3" 919 | }, 920 | { 921 | "name": "Speaker 2 (RU)", 922 | "id": "v2/ru_speaker_2", 923 | "language": "Russian", 924 | "gender": "Male", 925 | "quality": "Echoes", 926 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_2.mp3", 927 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_2.mp3" 928 | }, 929 | { 930 | "name": "Speaker 3 (RU)", 931 | "id": "v2/ru_speaker_3", 932 | "language": "Russian", 933 | "gender": "Male", 934 | "quality": null, 935 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_3.mp3", 936 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_3.mp3" 937 | }, 938 | { 939 | "name": "Speaker 4 (RU)", 940 | "id": "v2/ru_speaker_4", 941 | "language": "Russian", 942 | "gender": "Male", 943 | "quality": null, 944 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_4.mp3", 945 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_4.mp3" 946 | }, 947 | { 948 | "name": "Speaker 5 (RU)", 949 | "id": "v2/ru_speaker_5", 950 | "language": "Russian", 951 | "gender": "Female", 952 | "quality": null, 953 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_5.mp3", 954 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_5.mp3" 955 | }, 956 | { 957 | "name": "Speaker 6 (RU)", 958 | "id": "v2/ru_speaker_6", 959 | "language": "Russian", 960 | "gender": "Female", 961 | "quality": "Grainy", 962 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_6.mp3", 963 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_6.mp3" 964 | }, 965 | { 966 | "name": "Speaker 7 (RU)", 967 | "id": "v2/ru_speaker_7", 968 | "language": "Russian", 969 | "gender": "Male", 970 | "quality": null, 971 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_7.mp3", 972 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_7.mp3" 973 | }, 974 | { 975 | "name": "Speaker 8 (RU)", 976 | "id": "v2/ru_speaker_8", 977 | "language": "Russian", 978 | "gender": "Male", 979 | "quality": "Grainy", 980 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_8.mp3", 981 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_8.mp3" 982 | }, 983 | { 984 | "name": "Speaker 9 (RU)", 985 | "id": "v2/ru_speaker_9", 986 | "language": "Russian", 987 | "gender": "Female", 988 | "quality": "Grainy", 989 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_9.mp3", 990 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_9.mp3" 991 | }, 992 | { 993 | "name": "Speaker 0 (ES)", 994 | "id": "v2/es_speaker_0", 995 | "language": "Spanish", 996 | "gender": "Male", 997 | "quality": null, 998 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_0.mp3", 999 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_0.mp3" 1000 | }, 1001 | { 1002 | "name": "Speaker 1 (ES)", 1003 | "id": "v2/es_speaker_1", 1004 | "language": 
"Spanish", 1005 | "gender": "Male", 1006 | "quality": null, 1007 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_1.mp3", 1008 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_1.mp3" 1009 | }, 1010 | { 1011 | "name": "Speaker 2 (ES)", 1012 | "id": "v2/es_speaker_2", 1013 | "language": "Spanish", 1014 | "gender": "Male", 1015 | "quality": "Background Noise", 1016 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_2.mp3", 1017 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_2.mp3" 1018 | }, 1019 | { 1020 | "name": "Speaker 3 (ES)", 1021 | "id": "v2/es_speaker_3", 1022 | "language": "Spanish", 1023 | "gender": "Male", 1024 | "quality": "Background Noise", 1025 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_3.mp3", 1026 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_3.mp3" 1027 | }, 1028 | { 1029 | "name": "Speaker 4 (ES)", 1030 | "id": "v2/es_speaker_4", 1031 | "language": "Spanish", 1032 | "gender": "Male", 1033 | "quality": null, 1034 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_4.mp3", 1035 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_4.mp3" 1036 | }, 1037 | { 1038 | "name": "Speaker 5 (ES)", 1039 | "id": "v2/es_speaker_5", 1040 | "language": "Spanish", 1041 | "gender": "Male", 1042 | "quality": "Background Noise", 1043 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_5.mp3", 1044 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_5.mp3" 1045 | }, 1046 | { 1047 | "name": "Speaker 6 (ES)", 1048 | "id": "v2/es_speaker_6", 1049 | "language": "Spanish", 1050 | "gender": "Male", 1051 | "quality": null, 1052 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_6.mp3", 1053 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_6.mp3" 1054 | }, 1055 | { 1056 | "name": "Speaker 7 (ES)", 1057 | "id": "v2/es_speaker_7", 1058 | "language": "Spanish", 1059 | "gender": "Male", 1060 | "quality": null, 1061 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_7.mp3", 1062 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_7.mp3" 1063 | }, 1064 | { 1065 | "name": "Speaker 8 (ES)", 1066 | "id": "v2/es_speaker_8", 1067 | "language": "Spanish", 1068 | "gender": "Female", 1069 | "quality": null, 1070 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_8.mp3", 1071 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_8.mp3" 1072 | }, 1073 | { 1074 | "name": "Speaker 9 (ES)", 1075 | "id": "v2/es_speaker_9", 1076 | "language": "Spanish", 1077 | "gender": "Female", 1078 | "quality": null, 1079 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_9.mp3", 1080 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_9.mp3" 1081 | }, 1082 | { 1083 | "name": "Speaker 0 (TR)", 1084 | "id": "v2/tr_speaker_0", 1085 | "language": "Turkish", 1086 | "gender": "Male", 1087 | "quality": null, 1088 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_0.mp3", 1089 | "continuation_audio": 
"https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_0.mp3" 1090 | }, 1091 | { 1092 | "name": "Speaker 1 (TR)", 1093 | "id": "v2/tr_speaker_1", 1094 | "language": "Turkish", 1095 | "gender": "Male", 1096 | "quality": null, 1097 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_1.mp3", 1098 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_1.mp3" 1099 | }, 1100 | { 1101 | "name": "Speaker 2 (TR)", 1102 | "id": "v2/tr_speaker_2", 1103 | "language": "Turkish", 1104 | "gender": "Male", 1105 | "quality": null, 1106 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_2.mp3", 1107 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_2.mp3" 1108 | }, 1109 | { 1110 | "name": "Speaker 3 (TR)", 1111 | "id": "v2/tr_speaker_3", 1112 | "language": "Turkish", 1113 | "gender": "Male", 1114 | "quality": null, 1115 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_3.mp3", 1116 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_3.mp3" 1117 | }, 1118 | { 1119 | "name": "Speaker 4 (TR)", 1120 | "id": "v2/tr_speaker_4", 1121 | "language": "Turkish", 1122 | "gender": "Female", 1123 | "quality": null, 1124 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_4.mp3", 1125 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_4.mp3" 1126 | }, 1127 | { 1128 | "name": "Speaker 5 (TR)", 1129 | "id": "v2/tr_speaker_5", 1130 | "language": "Turkish", 1131 | "gender": "Female", 1132 | "quality": null, 1133 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_5.mp3", 1134 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_5.mp3" 1135 | }, 1136 | { 1137 | "name": "Speaker 6 (TR)", 1138 | "id": "v2/tr_speaker_6", 1139 | "language": "Turkish", 1140 | "gender": "Male", 1141 | "quality": null, 1142 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_6.mp3", 1143 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_6.mp3" 1144 | }, 1145 | { 1146 | "name": "Speaker 7 (TR)", 1147 | "id": "v2/tr_speaker_7", 1148 | "language": "Turkish", 1149 | "gender": "Male", 1150 | "quality": "Grainy", 1151 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_7.mp3", 1152 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_7.mp3" 1153 | }, 1154 | { 1155 | "name": "Speaker 8 (TR)", 1156 | "id": "v2/tr_speaker_8", 1157 | "language": "Turkish", 1158 | "gender": "Male", 1159 | "quality": null, 1160 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_8.mp3", 1161 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_8.mp3" 1162 | }, 1163 | { 1164 | "name": "Speaker 9 (TR)", 1165 | "id": "v2/tr_speaker_9", 1166 | "language": "Turkish", 1167 | "gender": "Male", 1168 | "quality": null, 1169 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_9.mp3", 1170 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_9.mp3" 1171 | } 1172 | ] -------------------------------------------------------------------------------- /scripts/bark/tts.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | from scipy.io.wavfile import write as write_wav 5 | from bark.generation import ( 6 | preload_models, 7 | clean_models 8 | ) 9 | from bark import generate_audio, SAMPLE_RATE 10 | 11 | 12 | class TTS: 13 | def __init__(self, text_prompt, speaker, temperature, silence, voice, low_vram): 14 | self.text_prompt = text_prompt 15 | self.speaker = speaker 16 | self.temperature = temperature 17 | self.silence = silence 18 | self.voice = voice 19 | self.low_vram = low_vram 20 | 21 | def generate(self): 22 | if self.low_vram: 23 | preload_models(text_use_gpu=True, 24 | text_use_small=True, 25 | coarse_use_gpu=True, 26 | coarse_use_small=True, 27 | fine_use_gpu=True, 28 | fine_use_small=True, 29 | codec_use_gpu=True, 30 | force_reload=False) 31 | else: 32 | preload_models(text_use_gpu=True, 33 | text_use_small=False, 34 | coarse_use_gpu=True, 35 | coarse_use_small=False, 36 | fine_use_gpu=True, 37 | fine_use_small=False, 38 | codec_use_gpu=True, 39 | force_reload=False) 40 | pieces = [] 41 | # split text_prompt into sentences by punctuation 42 | sentences = re.split('\[split\]', self.text_prompt) 43 | silence = np.zeros(int(self.silence * SAMPLE_RATE)).astype(np.float32) 44 | for sentence in sentences: 45 | if sentence.strip() != "": 46 | audio_array = generate_audio(sentence, history_prompt=self.speaker, text_temp=self.temperature) 47 | pieces += [audio_array, silence.copy()] 48 | 49 | write_wav("bark_generation.wav", SAMPLE_RATE, np.concatenate(pieces)) 50 | clean_models() 51 | print("Done!") 52 | return "bark_generation.wav" 53 | 54 | -------------------------------------------------------------------------------- /scripts/faceswap/model/README.md: -------------------------------------------------------------------------------- 1 | inswapper model folder -------------------------------------------------------------------------------- /scripts/faceswap/swap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from PIL import Image 5 | import subprocess 6 | import insightface 7 | from dataclasses import dataclass 8 | from typing import List, Union, Dict, Set, Tuple 9 | from pkg_resources import resource_filename 10 | from modules.shared import state, opts 11 | import modules.face_restoration 12 | from modules.upscaler import Upscaler, UpscalerData 13 | from modules.face_restoration import FaceRestoration, restore_faces 14 | import scripts.wav2lip.audio as audio 15 | import tempfile 16 | from ifnude import detect 17 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 18 | 19 | 20 | @dataclass 21 | class ImageResult: 22 | path: Union[str, None] = None 23 | similarity: Union[Dict[int, float], None] = None # face, 0..1 24 | 25 | def image(self) -> Union[Image.Image, None]: 26 | if self.path: 27 | return Image.open(self.path) 28 | return None 29 | 30 | 31 | @dataclass 32 | class UpscaleOptions: 33 | scale: int = 1 34 | upscaler: UpscalerData = None 35 | upscale_visibility: float = 0.5 36 | face_restorer: FaceRestoration = None 37 | restorer_visibility: float = 0.5 38 | 39 | 40 | class FaceSwap: 41 | def __init__(self, face=None, audio=None, face_index=None, source=None, resize_factor=None, face_restore_model=None, code_former_weight=None): 42 | self.faceswap_folder = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-1]) 43 | self.wav2lip_folder = 
os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-2]) 44 | self.faceswap_output_folder = os.path.join(self.wav2lip_folder, 'wav2lip', 'output', 'faceswap') 45 | self.face = face 46 | self.audio = audio 47 | self.source = source 48 | self.resize_factor = resize_factor 49 | self.code_former_weight = code_former_weight 50 | self.face_restore_model = face_restore_model 51 | self.model = self.faceswap_folder + "/model/inswapper_128.onnx" 52 | self.faces_index = {face_index} 53 | self.ffmpeg_binary = self.find_ffmpeg_binary() 54 | model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), self.model) 55 | self.face_swapper = insightface.model_zoo.get_model(model_path, providers=providers) 56 | self.face_analyser = insightface.app.FaceAnalysis(name="buffalo_l", providers=providers) 57 | self.face_analyser.prepare(ctx_id=0, det_size=(640, 640)) 58 | self.mel_step_size = 16 59 | if audio is not None: 60 | self.nb_frame = self.calc_frame() 61 | 62 | def calc_frame(self): 63 | 64 | video_stream = cv2.VideoCapture(self.face) 65 | fps = video_stream.get(cv2.CAP_PROP_FPS) 66 | wav = audio.load_wav(self.audio, 16000) 67 | mel = audio.melspectrogram(wav) 68 | 69 | if np.isnan(mel.reshape(-1)).sum() > 0: 70 | raise ValueError( 71 | 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again') 72 | 73 | mel_chunks = [] 74 | mel_idx_multiplier = 80. / fps 75 | i = 0 76 | while 1: 77 | start_idx = int(i * mel_idx_multiplier) 78 | if start_idx + self.mel_step_size > len(mel[0]): 79 | mel_chunks.append(mel[:, len(mel[0]) - self.mel_step_size:]) 80 | break 81 | mel_chunks.append(mel[:, start_idx: start_idx + self.mel_step_size]) 82 | i += 1 83 | 84 | return len(mel_chunks) 85 | 86 | def convert_to_sd(self, img): 87 | shapes = [] 88 | chunks = detect(img) 89 | for chunk in chunks: 90 | shapes.append(chunk["score"] > 0.7) 91 | return [any(shapes), tempfile.NamedTemporaryFile(delete=False, suffix=".png")] 92 | 93 | def find_ffmpeg_binary(self): 94 | for package in ['imageio_ffmpeg', 'imageio-ffmpeg']: 95 | try: 96 | package_path = resource_filename(package, 'binaries') 97 | files = [os.path.join(package_path, f) for f in os.listdir(package_path) if f.startswith("ffmpeg-")] 98 | files.sort(key=lambda x: os.path.getmtime(x), reverse=True) 99 | return files[0] if files else 'ffmpeg' 100 | except: 101 | return 'ffmpeg' 102 | 103 | def get_framerate(self, video_file): 104 | video = cv2.VideoCapture(video_file) 105 | fps = video.get(cv2.CAP_PROP_FPS) 106 | video.release() 107 | return fps 108 | 109 | def create_video_from_images(self, nb_frames): 110 | fps = str(self.get_framerate(self.face)) 111 | command = [self.ffmpeg_binary, "-y", "-framerate", fps, "-start_number", "0", "-i", 112 | self.faceswap_output_folder + "/face_swap_%05d.png", "-vframes", 113 | str(nb_frames), "-c:v", "libx264", "-pix_fmt", "yuv420p", "-b:v", "8000k", 114 | self.faceswap_output_folder + "/video.mp4"] 115 | 116 | self.execute_command(command) 117 | 118 | def execute_command(self, command): 119 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 120 | stdout, stderr = process.communicate() 121 | if process.returncode != 0: 122 | raise RuntimeError(stderr) 123 | 124 | def get_face_single(self, img_data: np.ndarray, face_index=0, det_size=(640, 640)): 125 | face = self.face_analyser.get(img_data) 126 | if len(face) == 0 and det_size[0] > 320 and det_size[1] > 320: 127 | det_size_half = (det_size[0] // 2, det_size[1] // 2) 128 | 
self.face_analyser.prepare(ctx_id=0, det_size=det_size_half) 129 | face = self.face_analyser.get(img_data) 130 | self.face_analyser.prepare(ctx_id=0, det_size=det_size) 131 | try: 132 | return sorted(face, key=lambda x: x.bbox[0])[face_index] 133 | except IndexError: 134 | return None 135 | try: 136 | return sorted(face, key=lambda x: x.bbox[0])[face_index] 137 | except IndexError: 138 | return None 139 | 140 | def swap_face(self, 141 | source_img: Image.Image, 142 | target_img: Image.Image, 143 | model: Union[str, None] = None, 144 | faces_index: Set[int] = {0}, 145 | upscale_options: Union[UpscaleOptions, None] = None, 146 | ) -> ImageResult: 147 | result_image = target_img 148 | converted = self.convert_to_sd(target_img) 149 | scale, fn = converted[0], converted[1] 150 | if model is not None and not scale: 151 | source_img = cv2.cvtColor(np.array(source_img), cv2.COLOR_RGB2BGR) 152 | target_img = cv2.cvtColor(np.array(target_img), cv2.COLOR_RGB2BGR) 153 | source_face = self.get_face_single(source_img, face_index=0) 154 | if source_face is not None: 155 | result = target_img 156 | for face_num in faces_index: 157 | target_face = self.get_face_single(target_img, face_index=face_num) 158 | if target_face is not None: 159 | result = self.face_swapper.get(result, target_face, source_face) 160 | else: 161 | print(f"No target face found for {face_num}") 162 | result_image = Image.fromarray(cv2.cvtColor(result, cv2.COLOR_BGR2RGB)) 163 | else: 164 | print("No source face found") 165 | result_image.save(fn.name) 166 | return ImageResult(path=fn.name) 167 | 168 | def resume(self): 169 | return self.faceswap_output_folder + "/video.mp4" 170 | 171 | def generate(self): 172 | original_codeformer_weight = opts.code_former_weight 173 | original_face_restoration_model = opts.face_restoration_model 174 | 175 | opts.code_former_weight = self.code_former_weight 176 | opts.face_restoration_model = self.face_restore_model 177 | video_stream = cv2.VideoCapture(self.face) 178 | 179 | print('Reading video frames for face swap...') 180 | frame_number = 0 181 | 182 | while frame_number != self.nb_frame+1: 183 | f_number = str(frame_number).rjust(5, '0') 184 | print("[INFO] Processing frame: " + str(frame_number) + " of " + str(self.nb_frame) + " - ", end="\r") 185 | still_reading, frame = video_stream.read() 186 | if not still_reading: 187 | video_stream.release() 188 | break 189 | 190 | if self.resize_factor > 1: 191 | frame = cv2.resize(frame, 192 | (frame.shape[1] // self.resize_factor, frame.shape[0] // self.resize_factor)) 193 | 194 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 195 | result = self.swap_face( 196 | self.source, 197 | frame, 198 | faces_index=self.faces_index, 199 | model=self.model, 200 | upscale_options=None 201 | ) 202 | # copy image to output folder 203 | face_swapped = cv2.imread(result.path) 204 | face_swapped = cv2.cvtColor(face_swapped, cv2.COLOR_RGB2BGR) 205 | image_restored = modules.face_restoration.restore_faces(face_swapped) 206 | image_restored2 = cv2.cvtColor(image_restored, cv2.COLOR_RGB2BGR) 207 | cv2.imwrite(self.faceswap_output_folder + "/face_swap_" + f_number + ".png", image_restored2) 208 | 209 | frame_number += 1 210 | 211 | self.create_video_from_images(frame_number - 1) 212 | opts.code_former_weight = original_codeformer_weight 213 | opts.face_restoration_model = original_face_restoration_model 214 | return self.faceswap_output_folder + "/video.mp4" 215 | -------------------------------------------------------------------------------- /scripts/ui.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from scripts.wav2lip_uhq_extend_paths import wav2lip_uhq_sys_extend 3 | import gradio as gr 4 | from scripts.wav2lip.w2l import W2l 5 | from scripts.wav2lip.wav2lip_uhq import Wav2LipUHQ 6 | from modules.shared import state 7 | from scripts.bark.tts import TTS 8 | from scripts.faceswap.swap import FaceSwap 9 | 10 | speaker_id = "v2/en_speaker_0" 11 | 12 | 13 | def on_ui_tabs(): 14 | wav2lip_uhq_sys_extend() 15 | speaker_json = json.load(open("extensions/sd-wav2lip-uhq/scripts/bark/speakers.json", "r")) 16 | speaker_list = [speaker["name"] for speaker in speaker_json if 17 | speaker["language"] == "English" and speaker["gender"] == "Male"] 18 | speaker_language = list(set([speaker["language"] for speaker in speaker_json])) 19 | speaker_gender = list(set([speaker["gender"] for speaker in speaker_json])) 20 | 21 | def update_speaker_list(new_language, new_gender): 22 | # Update the speaker list based on the selected language and gender 23 | global speaker_id 24 | new_speaker_list = [speaker["name"] for speaker in speaker_json if 25 | speaker["language"] == new_language and speaker["gender"] == new_gender] 26 | audio_mp3 = [speaker["prompt_audio"] for speaker in speaker_json if speaker["name"] in new_speaker_list[0]][0] 27 | speaker_id = [speaker["id"] for speaker in speaker_json if speaker["name"] in new_speaker_list[0]][0] 28 | return [gr.Dropdown.update(choices=new_speaker_list, value=new_speaker_list[0]), 29 | gr.Audio.update(value=audio_mp3), gr.Dropdown.update(value=new_language)] 30 | 31 | def select_speaker(speaker): 32 | # Update the example audio based on the selected speaker 33 | global speaker_id 34 | audio_mp3 = [sp["prompt_audio"] for sp in speaker_json if sp["name"] == speaker][0] 35 | speaker_id = [sp["id"] for sp in speaker_json if sp["name"] == speaker][0] 36 | return gr.Audio.update(value=audio_mp3) 37 | 38 | with gr.Blocks(analytics_enabled=False) as wav2lip_uhq_interface: 39 | gr.Markdown( 40 | "
Follow installation instructions here") 41 | gr.Markdown( 42 | "STANDALONE VERSION AVAILABLE HERE
") 43 | with gr.Row(): 44 | with gr.Column(): 45 | with gr.Row(): 46 | with gr.Column(): 47 | video = gr.Video(label="Video", format="mp4", 48 | info="Filepath of video/image that contains faces to use", 49 | file_types=["mp4", "png", "jpg", "jpeg", "avi"]) 50 | face_swap_img = gr.Image(label="Face Swap", type="pil") 51 | face_index_slider = gr.Slider(minimum=0, maximum=20, step=1, value=0, label="Face index", 52 | info="index of face to swap, left face in image is 0") 53 | 54 | with gr.Column(): 55 | with gr.Row(): 56 | language = gr.Dropdown( 57 | speaker_language, label="Language", info="Select the language to use", 58 | value="English" 59 | ) 60 | gender = gr.Dropdown( 61 | speaker_gender, label="Gender", info="Select gender", value="Male" 62 | ) 63 | with gr.Row(): 64 | speaker = gr.Dropdown( 65 | speaker_list, label="Speaker", info="Select the speaker to use", 66 | value=speaker_list[0] 67 | ) 68 | low_vram = gr.Radio(["False", "True"], value="True", label="Low VRAM", 69 | info="Less than 16GB of VRAM, set True") 70 | with gr.Row(): 71 | audio_example = gr.Audio(label="Audio example", 72 | value="https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_0.mp3") 73 | with gr.Column(): 74 | suno_prompt = gr.Textbox(label="Prompt", placeholder="Prompt", lines=5, type="text",info="Don't forget that bark can only generate 14 seconds of audio at a time, so for long text, you need to use [split] to split the text into multiple prompts") 75 | temperature = gr.Slider(label="Generation temperature", minimum=0.01, maximum=1, step=0.01, value=0.7, 76 | info="1.0 more diverse, 0.0 more conservative") 77 | silence = gr.Slider(label="Silence", minimum=0, maximum=1, step=0.01, value=0.25, info="Silence after [split] in seconde") 78 | generate_audio = gr.Button("Generate") 79 | audio = gr.Audio(label="Speech", type="filepath") 80 | 81 | # if language changed, update speaker list 82 | language.change(update_speaker_list, [language, gender], [speaker, audio_example]) 83 | gender.change(update_speaker_list, [language, gender], [speaker, audio_example]) 84 | speaker.change(select_speaker, speaker, audio_example) 85 | 86 | with gr.Row(): 87 | checkpoint = gr.Radio(["wav2lip", "wav2lip_gan"], value="wav2lip_gan", label="Checkpoint", 88 | info="Wav2lip model to use") 89 | face_restore_model = gr.Radio(["CodeFormer", "GFPGAN"], value="GFPGAN", 90 | label="Face Restoration Model", 91 | info="Model to use") 92 | 93 | with gr.Row(): 94 | no_smooth = gr.Checkbox(label="No Smooth", info="Prevent smoothing face detections") 95 | only_mouth = gr.Checkbox(label="Only Mouth", info="Only track the mouth") 96 | active_debug = gr.Checkbox(label="Active Debug", info="Active Debug") 97 | with gr.Row(): 98 | with gr.Column(): 99 | resize_factor = gr.Slider(minimum=1, maximum=4, step=1, label="Resize Factor", 100 | info="Reduce the resolution by this factor.") 101 | mouth_mask_dilatation = gr.Slider(minimum=0, maximum=128, step=1, value=15, 102 | label="Mouth Mask Dilate", 103 | info="Dilatation of the mask around the mouth (in pixels)") 104 | erode_face_mask = gr.Slider(minimum=0, maximum=128, step=1, value=15, label="Face Mask Erode", 105 | info="Erode the mask around the face (in pixels)") 106 | mask_blur = gr.Slider(minimum=0, maximum=128, step=1, value=15, label="Mask Blur", 107 | info="Kernel size of Gaussian blur for masking") 108 | code_former_weight = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.75, 109 | label="Code Former Fidelity", 110 | info="0 for better quality, 1 for better identity (Effect only 
if codeformer is selected)") 111 | with gr.Column(): 112 | pad_top = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Top", 113 | info="Padding above lips") 114 | pad_bottom = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Bottom", 115 | info="Padding below lips") 116 | pad_left = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Left", 117 | info="Padding to the left of lips") 118 | pad_right = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Right", 119 | info="Padding to the right of lips") 120 | 121 | with gr.Column(): 122 | with gr.Tabs(elem_id="wav2lip_generated"): 123 | with gr.Row(): 124 | faceswap_video = gr.Video(label="faceSwap video", format="mp4") 125 | wav2lip_video = gr.Video(label="Wav2Lip video", format="mp4") 126 | restore_video = gr.Video(label="Restored face video", format="mp4") 127 | result = gr.Video(label="Generated video", format="mp4") 128 | generate_btn = gr.Button("Generate") 129 | interrupt_btn = gr.Button('Interrupt', elem_id=f"interrupt", visible=True) 130 | resume_btn = gr.Button('Resume', elem_id=f"resume", visible=True) 131 | 132 | def on_interrupt(): 133 | state.interrupt() 134 | return "Interrupted" 135 | 136 | def gen_audio(suno_prompt, temperature, silence, low_vram): 137 | global speaker_id 138 | if suno_prompt is None or speaker_id is None: 139 | return 140 | tts = TTS(suno_prompt, speaker_id, temperature, silence,None, low_vram) 141 | wav = tts.generate() 142 | # delete tts object to free memory 143 | del tts 144 | 145 | return wav 146 | 147 | def generate(video, face_swap_img, face_index, audio, checkpoint, face_restore_model, no_smooth, only_mouth, resize_factor, 148 | mouth_mask_dilatation, erode_face_mask, mask_blur, pad_top, pad_bottom, pad_left, pad_right, 149 | active_debug, code_former_weight): 150 | state.begin() 151 | 152 | if video is None or audio is None: 153 | print("[ERROR] Please select a video and an audio file") 154 | return 155 | 156 | if face_swap_img is not None: 157 | face_swap = FaceSwap(video, audio, face_index, face_swap_img, resize_factor, face_restore_model, code_former_weight) 158 | video = face_swap.generate() 159 | 160 | w2l = W2l(video, audio, checkpoint, no_smooth, resize_factor, pad_top, pad_bottom, pad_left, 161 | pad_right, face_swap_img) 162 | w2l.execute() 163 | 164 | w2luhq = Wav2LipUHQ(video, face_restore_model, mouth_mask_dilatation, erode_face_mask, mask_blur, 165 | only_mouth, face_swap_img, resize_factor, code_former_weight, active_debug) 166 | 167 | return w2luhq.execute() 168 | 169 | def resume(video,face_swap_img, face_restore_model, only_mouth, resize_factor, mouth_mask_dilatation, erode_face_mask, 170 | mask_blur, active_debug, code_former_weight): 171 | state.begin() 172 | if face_swap_img is not None: 173 | face_swap = FaceSwap() 174 | video = face_swap.resume() 175 | w2luhq = Wav2LipUHQ(video, face_restore_model, mouth_mask_dilatation, erode_face_mask, mask_blur, 176 | only_mouth, face_swap_img, resize_factor, code_former_weight, active_debug) 177 | 178 | return w2luhq.execute(True) 179 | 180 | generate_audio.click( 181 | gen_audio, 182 | [suno_prompt, temperature, silence, low_vram], 183 | audio) 184 | 185 | generate_btn.click( 186 | generate, 187 | [video, face_swap_img, face_index_slider, audio, checkpoint, face_restore_model, no_smooth, only_mouth, resize_factor, mouth_mask_dilatation, 188 | erode_face_mask, mask_blur, pad_top, pad_bottom, pad_left, pad_right, active_debug, code_former_weight], 189 | [faceswap_video, wav2lip_video, restore_video, 
result]) 190 | 191 | resume_btn.click( 192 | resume, 193 | [video,face_swap_img, face_restore_model, only_mouth, resize_factor, mouth_mask_dilatation, erode_face_mask, 194 | mask_blur, active_debug, code_former_weight], 195 | [faceswap_video, wav2lip_video, restore_video, result]) 196 | 197 | interrupt_btn.click(on_interrupt) 198 | 199 | return [(wav2lip_uhq_interface, "Wav2lip Studio", "wav2lip_uhq_interface")] 200 | -------------------------------------------------------------------------------- /scripts/wav2lip/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | from scripts.wav2lip.hparams import hparams as hp 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | 13 | def save_wav(wav, path, sr): 14 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | 18 | def save_wavenet_wav(wav, path, sr): 19 | librosa.output.write_wav(path, wav, sr=sr) 20 | 21 | 22 | def preemphasis(wav, k, preemphasize=True): 23 | if preemphasize: 24 | return signal.lfilter([1, -k], [1], wav) 25 | return wav 26 | 27 | 28 | def inv_preemphasis(wav, k, inv_preemphasize=True): 29 | if inv_preemphasize: 30 | return signal.lfilter([1], [1, -k], wav) 31 | return wav 32 | 33 | 34 | def get_hop_size(): 35 | hop_size = hp.hop_size 36 | if hop_size is None: 37 | assert hp.frame_shift_ms is not None 38 | hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate) 39 | return hop_size 40 | 41 | 42 | def linearspectrogram(wav): 43 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 44 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 45 | 46 | if hp.signal_normalization: 47 | return _normalize(S) 48 | return S 49 | 50 | 51 | def melspectrogram(wav): 52 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 53 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db 54 | 55 | if hp.signal_normalization: 56 | return _normalize(S) 57 | return S 58 | 59 | 60 | def _lws_processor(): 61 | import lws 62 | return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech") 63 | 64 | 65 | def _stft(y): 66 | if hp.use_lws: 67 | return _lws_processor(hp).stft(y).T 68 | else: 69 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size) 70 | 71 | 72 | ########################################################## 73 | # Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 
74 | def num_frames(length, fsize, fshift): 75 | """Compute number of time frames of spectrogram 76 | """ 77 | pad = (fsize - fshift) 78 | if length % fshift == 0: 79 | M = (length + pad * 2 - fsize) // fshift + 1 80 | else: 81 | M = (length + pad * 2 - fsize) // fshift + 2 82 | return M 83 | 84 | 85 | def pad_lr(x, fsize, fshift): 86 | """Compute left and right padding 87 | """ 88 | M = num_frames(len(x), fsize, fshift) 89 | pad = (fsize - fshift) 90 | T = len(x) + 2 * pad 91 | r = (M - 1) * fshift + fsize - T 92 | return pad, pad + r 93 | 94 | 95 | ########################################################## 96 | # Librosa correct padding 97 | def librosa_pad_lr(x, fsize, fshift): 98 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 99 | 100 | 101 | # Conversions 102 | _mel_basis = None 103 | 104 | 105 | def _linear_to_mel(spectogram): 106 | global _mel_basis 107 | if _mel_basis is None: 108 | _mel_basis = _build_mel_basis() 109 | return np.dot(_mel_basis, spectogram) 110 | 111 | 112 | def _build_mel_basis(): 113 | assert hp.fmax <= hp.sample_rate // 2 114 | return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, 115 | fmin=hp.fmin, fmax=hp.fmax) 116 | 117 | 118 | def _amp_to_db(x): 119 | min_level = np.exp(hp.min_level_db / 20 * np.log(10)) 120 | return 20 * np.log10(np.maximum(min_level, x)) 121 | 122 | 123 | def _db_to_amp(x): 124 | return np.power(10.0, (x) * 0.05) 125 | 126 | 127 | def _normalize(S): 128 | if hp.allow_clipping_in_normalization: 129 | if hp.symmetric_mels: 130 | return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value, 131 | -hp.max_abs_value, hp.max_abs_value) 132 | else: 133 | return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value) 134 | 135 | assert S.max() <= 0 and S.min() - hp.min_level_db >= 0 136 | if hp.symmetric_mels: 137 | return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value 138 | else: 139 | return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)) 140 | 141 | 142 | def _denormalize(D): 143 | if hp.allow_clipping_in_normalization: 144 | if hp.symmetric_mels: 145 | return (((np.clip(D, -hp.max_abs_value, 146 | hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) 147 | + hp.min_level_db) 148 | else: 149 | return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 150 | 151 | if hp.symmetric_mels: 152 | return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db) 153 | else: 154 | return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 155 | -------------------------------------------------------------------------------- /scripts/wav2lip/checkpoints/README.md: -------------------------------------------------------------------------------- 1 | Place all your checkpoints (.pth files) here. -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/README.md: -------------------------------------------------------------------------------- 1 | The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time. 
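A minimal, hedged usage sketch of the batched interface (the frame paths are hypothetical, the import path follows this extension's layout under scripts/wav2lip, and running inside the AUTOMATIC1111 webui is assumed so that modules.shared resolves):

import cv2
import numpy as np
from scripts.wav2lip import face_detection

# Build the detector once; device can be 'cuda' or 'cpu'.
detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
                                        flip_input=False, device='cuda')

# Frames are read with OpenCV (BGR) and must share one resolution to stack.
frames = [cv2.imread(p) for p in ('frame_000.png', 'frame_001.png')]  # hypothetical paths
batch = np.asarray(frames)

# One (x1, y1, x2, y2) box per frame, or None where no face was detected.
boxes = detector.get_detections_for_batch(batch)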
-------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Adrian Bulat""" 4 | __email__ = 'adrian.bulat@nottingham.ac.uk' 5 | __version__ = '1.0.1' 6 | 7 | from .api import FaceAlignment, LandmarksType, NetworkSize 8 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/api.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from enum import Enum 3 | 4 | try: 5 | import urllib.request as request_file 6 | except BaseException: 7 | import urllib as request_file 8 | 9 | from .utils import * 10 | 11 | 12 | class LandmarksType(Enum): 13 | """Enum class defining the type of landmarks to detect. 14 | 15 | ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face 16 | ``_2halfD`` - this points represent the projection of the 3D points into 3D 17 | ``_3D`` - detect the points ``(x,y,z)``` in a 3D space 18 | 19 | """ 20 | _2D = 1 21 | _2halfD = 2 22 | _3D = 3 23 | 24 | 25 | class NetworkSize(Enum): 26 | # TINY = 1 27 | # SMALL = 2 28 | # MEDIUM = 3 29 | LARGE = 4 30 | 31 | def __new__(cls, value): 32 | member = object.__new__(cls) 33 | member._value_ = value 34 | return member 35 | 36 | def __int__(self): 37 | return self.value 38 | 39 | 40 | ROOT = os.path.dirname(os.path.abspath(__file__)) 41 | 42 | 43 | class FaceAlignment: 44 | def __init__(self, landmarks_type, network_size=NetworkSize.LARGE, 45 | device='cuda', flip_input=False, face_detector='sfd', verbose=False): 46 | self.device = device 47 | self.flip_input = flip_input 48 | self.landmarks_type = landmarks_type 49 | self.verbose = verbose 50 | 51 | network_size = int(network_size) 52 | 53 | if 'cuda' in device: 54 | torch.backends.cudnn.benchmark = True 55 | 56 | # Get the face detector 57 | face_detector_module = __import__('scripts.wav2lip.face_detection.detection.' + face_detector, 58 | globals(), locals(), [face_detector], 0) 59 | self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose) 60 | 61 | def get_detections_for_batch(self, images): 62 | images = images[..., ::-1] 63 | detected_faces = self.face_detector.detect_from_batch(images.copy()) 64 | results = [] 65 | 66 | for i, d in enumerate(detected_faces): 67 | if len(d) == 0: 68 | results.append(None) 69 | continue 70 | d = d[0] 71 | d = np.clip(d, 0, None) 72 | 73 | x1, y1, x2, y2 = map(int, d[:-1]) 74 | results.append((x1, y1, x2, y2)) 75 | 76 | return results 77 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import FaceDetector -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/core.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import glob 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import cv2 7 | 8 | 9 | class FaceDetector(object): 10 | """An abstract class representing a face detector. 11 | 12 | Any other face detection implementation must subclass it. 
All subclasses 13 | must implement ``detect_from_image``, that return a list of detected 14 | bounding boxes. Optionally, for speed considerations detect from path is 15 | recommended. 16 | """ 17 | 18 | def __init__(self, device, verbose): 19 | self.device = device 20 | self.verbose = verbose 21 | 22 | if verbose: 23 | if 'cpu' in device: 24 | logger = logging.getLogger(__name__) 25 | logger.warning("Detection running on CPU, this may be potentially slow.") 26 | 27 | if 'cpu' not in device and 'cuda' not in device: 28 | if verbose: 29 | logger.error("Expected values for device are: {cpu, cuda} but got: %s", device) 30 | raise ValueError 31 | 32 | def detect_from_image(self, tensor_or_path): 33 | """Detects faces in a given image. 34 | 35 | This function detects the faces present in a provided BGR(usually) 36 | image. The input can be either the image itself or the path to it. 37 | 38 | Arguments: 39 | tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path 40 | to an image or the image itself. 41 | 42 | Example:: 43 | 44 | >>> path_to_image = 'data/image_01.jpg' 45 | ... detected_faces = detect_from_image(path_to_image) 46 | [A list of bounding boxes (x1, y1, x2, y2)] 47 | >>> image = cv2.imread(path_to_image) 48 | ... detected_faces = detect_from_image(image) 49 | [A list of bounding boxes (x1, y1, x2, y2)] 50 | 51 | """ 52 | raise NotImplementedError 53 | 54 | def detect_from_directory(self, path, extensions=['.jpg', '.png'], recursive=False, show_progress_bar=True): 55 | """Detects faces from all the images present in a given directory. 56 | 57 | Arguments: 58 | path {string} -- a string containing a path that points to the folder containing the images 59 | 60 | Keyword Arguments: 61 | extensions {list} -- list of string containing the extensions to be 62 | consider in the following format: ``.extension_name`` (default: 63 | {['.jpg', '.png']}) recursive {bool} -- option wherever to scan the 64 | folder recursively (default: {False}) show_progress_bar {bool} -- 65 | display a progressbar (default: {True}) 66 | 67 | Example: 68 | >>> directory = 'data' 69 | ... detected_faces = detect_from_directory(directory) 70 | {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]} 71 | 72 | """ 73 | if self.verbose: 74 | logger = logging.getLogger(__name__) 75 | 76 | if len(extensions) == 0: 77 | if self.verbose: 78 | logger.error("Expected at list one extension, but none was received.") 79 | raise ValueError 80 | 81 | if self.verbose: 82 | logger.info("Constructing the list of images.") 83 | additional_pattern = '/**/*' if recursive else '/*' 84 | files = [] 85 | for extension in extensions: 86 | files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive)) 87 | 88 | if self.verbose: 89 | logger.info("Finished searching for images. 
%s images found", len(files)) 90 | logger.info("Preparing to run the detection.") 91 | 92 | predictions = {} 93 | for image_path in tqdm(files, disable=not show_progress_bar): 94 | if self.verbose: 95 | logger.info("Running the face detector on image: %s", image_path) 96 | predictions[image_path] = self.detect_from_image(image_path) 97 | 98 | if self.verbose: 99 | logger.info("The detector was successfully run on all %s images", len(files)) 100 | 101 | return predictions 102 | 103 | @property 104 | def reference_scale(self): 105 | raise NotImplementedError 106 | 107 | @property 108 | def reference_x_shift(self): 109 | raise NotImplementedError 110 | 111 | @property 112 | def reference_y_shift(self): 113 | raise NotImplementedError 114 | 115 | @staticmethod 116 | def tensor_or_path_to_ndarray(tensor_or_path, rgb=True): 117 | """Convert path (represented as a string) or torch.tensor to a numpy.ndarray 118 | 119 | Arguments: 120 | tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself 121 | """ 122 | if isinstance(tensor_or_path, str): 123 | return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1] 124 | elif torch.is_tensor(tensor_or_path): 125 | # Call cpu in case its coming from cuda 126 | return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy() 127 | elif isinstance(tensor_or_path, np.ndarray): 128 | return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path 129 | else: 130 | raise TypeError 131 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/__init__.py: -------------------------------------------------------------------------------- 1 | from .sfd_detector import SFDDetector as FaceDetector -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import cv2 5 | import random 6 | import datetime 7 | import time 8 | import math 9 | import argparse 10 | import numpy as np 11 | import torch 12 | 13 | try: 14 | from iou import IOU 15 | except BaseException: 16 | # IOU cython speedup 10x 17 | def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2): 18 | sa = abs((ax2 - ax1) * (ay2 - ay1)) 19 | sb = abs((bx2 - bx1) * (by2 - by1)) 20 | x1, y1 = max(ax1, bx1), max(ay1, by1) 21 | x2, y2 = min(ax2, bx2), min(ay2, by2) 22 | w = x2 - x1 23 | h = y2 - y1 24 | if w < 0 or h < 0: 25 | return 0.0 26 | else: 27 | return 1.0 * w * h / (sa + sb - w * h) 28 | 29 | 30 | def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh): 31 | xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1 32 | dx, dy = (xc - axc) / aww, (yc - ayc) / ahh 33 | dw, dh = math.log(ww / aww), math.log(hh / ahh) 34 | return dx, dy, dw, dh 35 | 36 | 37 | def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh): 38 | xc, yc = dx * aww + axc, dy * ahh + ayc 39 | ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh 40 | x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2 41 | return x1, y1, x2, y2 42 | 43 | 44 | def nms(dets, thresh): 45 | if 0 == len(dets): 46 | return [] 47 | x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4] 48 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 49 | order = scores.argsort()[::-1] 50 | 51 | keep = [] 52 | while order.size > 0: 53 | i = order[0] 54 | 
keep.append(i) 55 | xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]]) 56 | xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]]) 57 | 58 | w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1) 59 | ovr = w * h / (areas[i] + areas[order[1:]] - w * h) 60 | 61 | inds = np.where(ovr <= thresh)[0] 62 | order = order[inds + 1] 63 | 64 | return keep 65 | 66 | 67 | def encode(matched, priors, variances): 68 | """Encode the variances from the priorbox layers into the ground truth boxes 69 | we have matched (based on jaccard overlap) with the prior boxes. 70 | Args: 71 | matched: (tensor) Coords of ground truth for each prior in point-form 72 | Shape: [num_priors, 4]. 73 | priors: (tensor) Prior boxes in center-offset form 74 | Shape: [num_priors,4]. 75 | variances: (list[float]) Variances of priorboxes 76 | Return: 77 | encoded boxes (tensor), Shape: [num_priors, 4] 78 | """ 79 | 80 | # dist b/t match center and prior's center 81 | g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2] 82 | # encode variance 83 | g_cxcy /= (variances[0] * priors[:, 2:]) 84 | # match wh / prior wh 85 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 86 | g_wh = torch.log(g_wh) / variances[1] 87 | # return target for smooth_l1_loss 88 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 89 | 90 | 91 | def decode(loc, priors, variances): 92 | """Decode locations from predictions using priors to undo 93 | the encoding we did for offset regression at train time. 94 | Args: 95 | loc (tensor): location predictions for loc layers, 96 | Shape: [num_priors,4] 97 | priors (tensor): Prior boxes in center-offset form. 98 | Shape: [num_priors,4]. 99 | variances: (list[float]) Variances of priorboxes 100 | Return: 101 | decoded bounding box predictions 102 | """ 103 | 104 | boxes = torch.cat(( 105 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 106 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 107 | boxes[:, :2] -= boxes[:, 2:] / 2 108 | boxes[:, 2:] += boxes[:, :2] 109 | return boxes 110 | 111 | def batch_decode(loc, priors, variances): 112 | """Decode locations from predictions using priors to undo 113 | the encoding we did for offset regression at train time. 114 | Args: 115 | loc (tensor): location predictions for loc layers, 116 | Shape: [num_priors,4] 117 | priors (tensor): Prior boxes in center-offset form. 118 | Shape: [num_priors,4]. 
119 | variances: (list[float]) Variances of priorboxes 120 | Return: 121 | decoded bounding box predictions 122 | """ 123 | 124 | boxes = torch.cat(( 125 | priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:], 126 | priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2) 127 | boxes[:, :, :2] -= boxes[:, :, 2:] / 2 128 | boxes[:, :, 2:] += boxes[:, :, :2] 129 | return boxes 130 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/detect.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | import os 5 | import sys 6 | import cv2 7 | import random 8 | import datetime 9 | import math 10 | import argparse 11 | import numpy as np 12 | 13 | import scipy.io as sio 14 | import zipfile 15 | from .net_s3fd import s3fd 16 | from .bbox import * 17 | 18 | 19 | def detect(net, img, device): 20 | img = img - np.array([104, 117, 123]) 21 | img = img.transpose(2, 0, 1) 22 | img = img.reshape((1,) + img.shape) 23 | 24 | if 'cuda' in device: 25 | torch.backends.cudnn.benchmark = True 26 | 27 | img = torch.from_numpy(img).float().to(device) 28 | BB, CC, HH, WW = img.size() 29 | with torch.no_grad(): 30 | olist = net(img) 31 | 32 | bboxlist = [] 33 | for i in range(len(olist) // 2): 34 | olist[i * 2] = F.softmax(olist[i * 2], dim=1) 35 | olist = [oelem.data.cpu() for oelem in olist] 36 | for i in range(len(olist) // 2): 37 | ocls, oreg = olist[i * 2], olist[i * 2 + 1] 38 | FB, FC, FH, FW = ocls.size() # feature map size 39 | stride = 2**(i + 2) # 4,8,16,32,64,128 40 | anchor = stride * 4 41 | poss = zip(*np.where(ocls[:, 1, :, :] > 0.05)) 42 | for Iindex, hindex, windex in poss: 43 | axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride 44 | score = ocls[0, 1, hindex, windex] 45 | loc = oreg[0, :, hindex, windex].contiguous().view(1, 4) 46 | priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]) 47 | variances = [0.1, 0.2] 48 | box = decode(loc, priors, variances) 49 | x1, y1, x2, y2 = box[0] * 1.0 50 | # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1) 51 | bboxlist.append([x1, y1, x2, y2, score]) 52 | bboxlist = np.array(bboxlist) 53 | if 0 == len(bboxlist): 54 | bboxlist = np.zeros((1, 5)) 55 | 56 | return bboxlist 57 | 58 | def batch_detect(net, imgs, device): 59 | imgs = imgs - np.array([104, 117, 123]) 60 | imgs = imgs.transpose(0, 3, 1, 2) 61 | 62 | if 'cuda' in device: 63 | torch.backends.cudnn.benchmark = True 64 | 65 | imgs = torch.from_numpy(imgs).float().to(device) 66 | BB, CC, HH, WW = imgs.size() 67 | with torch.no_grad(): 68 | olist = net(imgs) 69 | 70 | bboxlist = [] 71 | for i in range(len(olist) // 2): 72 | olist[i * 2] = F.softmax(olist[i * 2], dim=1) 73 | olist = [oelem.data.cpu() for oelem in olist] 74 | for i in range(len(olist) // 2): 75 | ocls, oreg = olist[i * 2], olist[i * 2 + 1] 76 | FB, FC, FH, FW = ocls.size() # feature map size 77 | stride = 2**(i + 2) # 4,8,16,32,64,128 78 | anchor = stride * 4 79 | poss = zip(*np.where(ocls[:, 1, :, :] > 0.05)) 80 | for Iindex, hindex, windex in poss: 81 | axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride 82 | score = ocls[:, 1, hindex, windex] 83 | loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4) 84 | priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4) 85 | variances = [0.1, 0.2] 86 | box = batch_decode(loc, 
priors, variances) 87 | box = box[:, 0] * 1.0 88 | # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1) 89 | bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy()) 90 | bboxlist = np.array(bboxlist) 91 | if 0 == len(bboxlist): 92 | bboxlist = np.zeros((1, BB, 5)) 93 | 94 | return bboxlist 95 | 96 | def flip_detect(net, img, device): 97 | img = cv2.flip(img, 1) 98 | b = detect(net, img, device) 99 | 100 | bboxlist = np.zeros(b.shape) 101 | bboxlist[:, 0] = img.shape[1] - b[:, 2] 102 | bboxlist[:, 1] = b[:, 1] 103 | bboxlist[:, 2] = img.shape[1] - b[:, 0] 104 | bboxlist[:, 3] = b[:, 3] 105 | bboxlist[:, 4] = b[:, 4] 106 | return bboxlist 107 | 108 | 109 | def pts_to_bb(pts): 110 | min_x, min_y = np.min(pts, axis=0) 111 | max_x, max_y = np.max(pts, axis=0) 112 | return np.array([min_x, min_y, max_x, max_y]) 113 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/net_s3fd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class L2Norm(nn.Module): 7 | def __init__(self, n_channels, scale=1.0): 8 | super(L2Norm, self).__init__() 9 | self.n_channels = n_channels 10 | self.scale = scale 11 | self.eps = 1e-10 12 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 13 | self.weight.data *= 0.0 14 | self.weight.data += self.scale 15 | 16 | def forward(self, x): 17 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 18 | x = x / norm * self.weight.view(1, -1, 1, 1) 19 | return x 20 | 21 | 22 | class s3fd(nn.Module): 23 | def __init__(self): 24 | super(s3fd, self).__init__() 25 | self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1) 26 | self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) 27 | 28 | self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1) 29 | self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1) 30 | 31 | self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1) 32 | self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 33 | self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 34 | 35 | self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1) 36 | self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 37 | self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 38 | 39 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 40 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 41 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 42 | 43 | self.fc6 = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=3) 44 | self.fc7 = nn.Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0) 45 | 46 | self.conv6_1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0) 47 | self.conv6_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1) 48 | 49 | self.conv7_1 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0) 50 | self.conv7_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1) 51 | 52 | self.conv3_3_norm = L2Norm(256, scale=10) 53 | self.conv4_3_norm = L2Norm(512, scale=8) 54 | self.conv5_3_norm = L2Norm(512, scale=5) 55 | 56 | self.conv3_3_norm_mbox_conf = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1) 57 | self.conv3_3_norm_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1) 58 | 
self.conv4_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1) 59 | self.conv4_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1) 60 | self.conv5_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1) 61 | self.conv5_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1) 62 | 63 | self.fc7_mbox_conf = nn.Conv2d(1024, 2, kernel_size=3, stride=1, padding=1) 64 | self.fc7_mbox_loc = nn.Conv2d(1024, 4, kernel_size=3, stride=1, padding=1) 65 | self.conv6_2_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1) 66 | self.conv6_2_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1) 67 | self.conv7_2_mbox_conf = nn.Conv2d(256, 2, kernel_size=3, stride=1, padding=1) 68 | self.conv7_2_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1) 69 | 70 | def forward(self, x): 71 | h = F.relu(self.conv1_1(x)) 72 | h = F.relu(self.conv1_2(h)) 73 | h = F.max_pool2d(h, 2, 2) 74 | 75 | h = F.relu(self.conv2_1(h)) 76 | h = F.relu(self.conv2_2(h)) 77 | h = F.max_pool2d(h, 2, 2) 78 | 79 | h = F.relu(self.conv3_1(h)) 80 | h = F.relu(self.conv3_2(h)) 81 | h = F.relu(self.conv3_3(h)) 82 | f3_3 = h 83 | h = F.max_pool2d(h, 2, 2) 84 | 85 | h = F.relu(self.conv4_1(h)) 86 | h = F.relu(self.conv4_2(h)) 87 | h = F.relu(self.conv4_3(h)) 88 | f4_3 = h 89 | h = F.max_pool2d(h, 2, 2) 90 | 91 | h = F.relu(self.conv5_1(h)) 92 | h = F.relu(self.conv5_2(h)) 93 | h = F.relu(self.conv5_3(h)) 94 | f5_3 = h 95 | h = F.max_pool2d(h, 2, 2) 96 | 97 | h = F.relu(self.fc6(h)) 98 | h = F.relu(self.fc7(h)) 99 | ffc7 = h 100 | h = F.relu(self.conv6_1(h)) 101 | h = F.relu(self.conv6_2(h)) 102 | f6_2 = h 103 | h = F.relu(self.conv7_1(h)) 104 | h = F.relu(self.conv7_2(h)) 105 | f7_2 = h 106 | 107 | f3_3 = self.conv3_3_norm(f3_3) 108 | f4_3 = self.conv4_3_norm(f4_3) 109 | f5_3 = self.conv5_3_norm(f5_3) 110 | 111 | cls1 = self.conv3_3_norm_mbox_conf(f3_3) 112 | reg1 = self.conv3_3_norm_mbox_loc(f3_3) 113 | cls2 = self.conv4_3_norm_mbox_conf(f4_3) 114 | reg2 = self.conv4_3_norm_mbox_loc(f4_3) 115 | cls3 = self.conv5_3_norm_mbox_conf(f5_3) 116 | reg3 = self.conv5_3_norm_mbox_loc(f5_3) 117 | cls4 = self.fc7_mbox_conf(ffc7) 118 | reg4 = self.fc7_mbox_loc(ffc7) 119 | cls5 = self.conv6_2_mbox_conf(f6_2) 120 | reg5 = self.conv6_2_mbox_loc(f6_2) 121 | cls6 = self.conv7_2_mbox_conf(f7_2) 122 | reg6 = self.conv7_2_mbox_loc(f7_2) 123 | 124 | # max-out background label 125 | chunk = torch.chunk(cls1, 4, 1) 126 | bmax = torch.max(torch.max(chunk[0], chunk[1]), chunk[2]) 127 | cls1 = torch.cat([bmax, chunk[3]], dim=1) 128 | 129 | return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5, cls6, reg6] 130 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/sfd_detector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from torch.utils.model_zoo import load_url 4 | import modules.shared as shared 5 | from ..core import FaceDetector 6 | 7 | from .net_s3fd import s3fd 8 | from .bbox import * 9 | from .detect import * 10 | 11 | models_urls = { 12 | 's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth', 13 | } 14 | 15 | 16 | class SFDDetector(FaceDetector): 17 | def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False): 18 | super(SFDDetector, self).__init__(device, verbose) 19 | 
shared.cmd_opts.disable_safe_unpickle = True 20 | # Initialise the face detector 21 | if not os.path.isfile(path_to_detector): 22 | model_weights = load_url(models_urls['s3fd']) 23 | else: 24 | model_weights = torch.load(path_to_detector) 25 | 26 | self.face_detector = s3fd() 27 | self.face_detector.load_state_dict(model_weights) 28 | self.face_detector.to(device) 29 | self.face_detector.eval() 30 | shared.cmd_opts.disable_safe_unpickle = False 31 | 32 | def detect_from_image(self, tensor_or_path): 33 | image = self.tensor_or_path_to_ndarray(tensor_or_path) 34 | 35 | bboxlist = detect(self.face_detector, image, device=self.device) 36 | keep = nms(bboxlist, 0.3) 37 | bboxlist = bboxlist[keep, :] 38 | bboxlist = [x for x in bboxlist if x[-1] > 0.5] 39 | 40 | return bboxlist 41 | 42 | def detect_from_batch(self, images): 43 | bboxlists = batch_detect(self.face_detector, images, device=self.device) 44 | keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])] 45 | bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)] 46 | bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists] 47 | 48 | return bboxlists 49 | 50 | @property 51 | def reference_scale(self): 52 | return 195 53 | 54 | @property 55 | def reference_x_shift(self): 56 | return 0 57 | 58 | @property 59 | def reference_y_shift(self): 60 | return 0 61 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | 7 | def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False): 8 | "3x3 convolution with padding" 9 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, 10 | stride=strd, padding=padding, bias=bias) 11 | 12 | 13 | class ConvBlock(nn.Module): 14 | def __init__(self, in_planes, out_planes): 15 | super(ConvBlock, self).__init__() 16 | self.bn1 = nn.BatchNorm2d(in_planes) 17 | self.conv1 = conv3x3(in_planes, int(out_planes / 2)) 18 | self.bn2 = nn.BatchNorm2d(int(out_planes / 2)) 19 | self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4)) 20 | self.bn3 = nn.BatchNorm2d(int(out_planes / 4)) 21 | self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4)) 22 | 23 | if in_planes != out_planes: 24 | self.downsample = nn.Sequential( 25 | nn.BatchNorm2d(in_planes), 26 | nn.ReLU(True), 27 | nn.Conv2d(in_planes, out_planes, 28 | kernel_size=1, stride=1, bias=False), 29 | ) 30 | else: 31 | self.downsample = None 32 | 33 | def forward(self, x): 34 | residual = x 35 | 36 | out1 = self.bn1(x) 37 | out1 = F.relu(out1, True) 38 | out1 = self.conv1(out1) 39 | 40 | out2 = self.bn2(out1) 41 | out2 = F.relu(out2, True) 42 | out2 = self.conv2(out2) 43 | 44 | out3 = self.bn3(out2) 45 | out3 = F.relu(out3, True) 46 | out3 = self.conv3(out3) 47 | 48 | out3 = torch.cat((out1, out2, out3), 1) 49 | 50 | if self.downsample is not None: 51 | residual = self.downsample(residual) 52 | 53 | out3 += residual 54 | 55 | return out3 56 | 57 | 58 | class Bottleneck(nn.Module): 59 | 60 | expansion = 4 61 | 62 | def __init__(self, inplanes, planes, stride=1, downsample=None): 63 | super(Bottleneck, self).__init__() 64 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 65 | self.bn1 = nn.BatchNorm2d(planes) 66 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 67 | padding=1, bias=False) 68 | self.bn2 = 
nn.BatchNorm2d(planes) 69 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 70 | self.bn3 = nn.BatchNorm2d(planes * 4) 71 | self.relu = nn.ReLU(inplace=True) 72 | self.downsample = downsample 73 | self.stride = stride 74 | 75 | def forward(self, x): 76 | residual = x 77 | 78 | out = self.conv1(x) 79 | out = self.bn1(out) 80 | out = self.relu(out) 81 | 82 | out = self.conv2(out) 83 | out = self.bn2(out) 84 | out = self.relu(out) 85 | 86 | out = self.conv3(out) 87 | out = self.bn3(out) 88 | 89 | if self.downsample is not None: 90 | residual = self.downsample(x) 91 | 92 | out += residual 93 | out = self.relu(out) 94 | 95 | return out 96 | 97 | 98 | class HourGlass(nn.Module): 99 | def __init__(self, num_modules, depth, num_features): 100 | super(HourGlass, self).__init__() 101 | self.num_modules = num_modules 102 | self.depth = depth 103 | self.features = num_features 104 | 105 | self._generate_network(self.depth) 106 | 107 | def _generate_network(self, level): 108 | self.add_module('b1_' + str(level), ConvBlock(self.features, self.features)) 109 | 110 | self.add_module('b2_' + str(level), ConvBlock(self.features, self.features)) 111 | 112 | if level > 1: 113 | self._generate_network(level - 1) 114 | else: 115 | self.add_module('b2_plus_' + str(level), ConvBlock(self.features, self.features)) 116 | 117 | self.add_module('b3_' + str(level), ConvBlock(self.features, self.features)) 118 | 119 | def _forward(self, level, inp): 120 | # Upper branch 121 | up1 = inp 122 | up1 = self._modules['b1_' + str(level)](up1) 123 | 124 | # Lower branch 125 | low1 = F.avg_pool2d(inp, 2, stride=2) 126 | low1 = self._modules['b2_' + str(level)](low1) 127 | 128 | if level > 1: 129 | low2 = self._forward(level - 1, low1) 130 | else: 131 | low2 = low1 132 | low2 = self._modules['b2_plus_' + str(level)](low2) 133 | 134 | low3 = low2 135 | low3 = self._modules['b3_' + str(level)](low3) 136 | 137 | up2 = F.interpolate(low3, scale_factor=2, mode='nearest') 138 | 139 | return up1 + up2 140 | 141 | def forward(self, x): 142 | return self._forward(self.depth, x) 143 | 144 | 145 | class FAN(nn.Module): 146 | 147 | def __init__(self, num_modules=1): 148 | super(FAN, self).__init__() 149 | self.num_modules = num_modules 150 | 151 | # Base part 152 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) 153 | self.bn1 = nn.BatchNorm2d(64) 154 | self.conv2 = ConvBlock(64, 128) 155 | self.conv3 = ConvBlock(128, 128) 156 | self.conv4 = ConvBlock(128, 256) 157 | 158 | # Stacking part 159 | for hg_module in range(self.num_modules): 160 | self.add_module('m' + str(hg_module), HourGlass(1, 4, 256)) 161 | self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256)) 162 | self.add_module('conv_last' + str(hg_module), 163 | nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) 164 | self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256)) 165 | self.add_module('l' + str(hg_module), nn.Conv2d(256, 166 | 68, kernel_size=1, stride=1, padding=0)) 167 | 168 | if hg_module < self.num_modules - 1: 169 | self.add_module( 170 | 'bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) 171 | self.add_module('al' + str(hg_module), nn.Conv2d(68, 172 | 256, kernel_size=1, stride=1, padding=0)) 173 | 174 | def forward(self, x): 175 | x = F.relu(self.bn1(self.conv1(x)), True) 176 | x = F.avg_pool2d(self.conv2(x), 2, stride=2) 177 | x = self.conv3(x) 178 | x = self.conv4(x) 179 | 180 | previous = x 181 | 182 | outputs = [] 183 | for i in range(self.num_modules): 184 | hg = 
self._modules['m' + str(i)](previous) 185 | 186 | ll = hg 187 | ll = self._modules['top_m_' + str(i)](ll) 188 | 189 | ll = F.relu(self._modules['bn_end' + str(i)] 190 | (self._modules['conv_last' + str(i)](ll)), True) 191 | 192 | # Predict heatmaps 193 | tmp_out = self._modules['l' + str(i)](ll) 194 | outputs.append(tmp_out) 195 | 196 | if i < self.num_modules - 1: 197 | ll = self._modules['bl' + str(i)](ll) 198 | tmp_out_ = self._modules['al' + str(i)](tmp_out) 199 | previous = previous + ll + tmp_out_ 200 | 201 | return outputs 202 | 203 | 204 | class ResNetDepth(nn.Module): 205 | 206 | def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes=68): 207 | self.inplanes = 64 208 | super(ResNetDepth, self).__init__() 209 | self.conv1 = nn.Conv2d(3 + 68, 64, kernel_size=7, stride=2, padding=3, 210 | bias=False) 211 | self.bn1 = nn.BatchNorm2d(64) 212 | self.relu = nn.ReLU(inplace=True) 213 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 214 | self.layer1 = self._make_layer(block, 64, layers[0]) 215 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 216 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 217 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 218 | self.avgpool = nn.AvgPool2d(7) 219 | self.fc = nn.Linear(512 * block.expansion, num_classes) 220 | 221 | for m in self.modules(): 222 | if isinstance(m, nn.Conv2d): 223 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 224 | m.weight.data.normal_(0, math.sqrt(2. / n)) 225 | elif isinstance(m, nn.BatchNorm2d): 226 | m.weight.data.fill_(1) 227 | m.bias.data.zero_() 228 | 229 | def _make_layer(self, block, planes, blocks, stride=1): 230 | downsample = None 231 | if stride != 1 or self.inplanes != planes * block.expansion: 232 | downsample = nn.Sequential( 233 | nn.Conv2d(self.inplanes, planes * block.expansion, 234 | kernel_size=1, stride=stride, bias=False), 235 | nn.BatchNorm2d(planes * block.expansion), 236 | ) 237 | 238 | layers = [] 239 | layers.append(block(self.inplanes, planes, stride, downsample)) 240 | self.inplanes = planes * block.expansion 241 | for i in range(1, blocks): 242 | layers.append(block(self.inplanes, planes)) 243 | 244 | return nn.Sequential(*layers) 245 | 246 | def forward(self, x): 247 | x = self.conv1(x) 248 | x = self.bn1(x) 249 | x = self.relu(x) 250 | x = self.maxpool(x) 251 | 252 | x = self.layer1(x) 253 | x = self.layer2(x) 254 | x = self.layer3(x) 255 | x = self.layer4(x) 256 | 257 | x = self.avgpool(x) 258 | x = x.view(x.size(0), -1) 259 | x = self.fc(x) 260 | 261 | return x 262 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import time 5 | import torch 6 | import math 7 | import numpy as np 8 | import cv2 9 | 10 | 11 | def _gaussian( 12 | size=3, sigma=0.25, amplitude=1, normalize=False, width=None, 13 | height=None, sigma_horz=None, sigma_vert=None, mean_horz=0.5, 14 | mean_vert=0.5): 15 | # handle some defaults 16 | if width is None: 17 | width = size 18 | if height is None: 19 | height = size 20 | if sigma_horz is None: 21 | sigma_horz = sigma 22 | if sigma_vert is None: 23 | sigma_vert = sigma 24 | center_x = mean_horz * width + 0.5 25 | center_y = mean_vert * height + 0.5 26 | gauss = np.empty((height, width), dtype=np.float32) 27 | # generate kernel 28 | for i in range(height): 29 | 
for j in range(width): 30 | gauss[i][j] = amplitude * math.exp(-(math.pow((j + 1 - center_x) / ( 31 | sigma_horz * width), 2) / 2.0 + math.pow((i + 1 - center_y) / (sigma_vert * height), 2) / 2.0)) 32 | if normalize: 33 | gauss = gauss / np.sum(gauss) 34 | return gauss 35 | 36 | 37 | def draw_gaussian(image, point, sigma): 38 | # Check if the gaussian is inside 39 | ul = [math.floor(point[0] - 3 * sigma), math.floor(point[1] - 3 * sigma)] 40 | br = [math.floor(point[0] + 3 * sigma), math.floor(point[1] + 3 * sigma)] 41 | if (ul[0] > image.shape[1] or ul[1] > image.shape[0] or br[0] < 1 or br[1] < 1): 42 | return image 43 | size = 6 * sigma + 1 44 | g = _gaussian(size) 45 | g_x = [int(max(1, -ul[0])), int(min(br[0], image.shape[1])) - int(max(1, ul[0])) + int(max(1, -ul[0]))] 46 | g_y = [int(max(1, -ul[1])), int(min(br[1], image.shape[0])) - int(max(1, ul[1])) + int(max(1, -ul[1]))] 47 | img_x = [int(max(1, ul[0])), int(min(br[0], image.shape[1]))] 48 | img_y = [int(max(1, ul[1])), int(min(br[1], image.shape[0]))] 49 | assert (g_x[0] > 0 and g_y[1] > 0) 50 | image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1] 51 | ] = image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]] + g[g_y[0] - 1:g_y[1], g_x[0] - 1:g_x[1]] 52 | image[image > 1] = 1 53 | return image 54 | 55 | 56 | def transform(point, center, scale, resolution, invert=False): 57 | """Generate and affine transformation matrix. 58 | 59 | Given a set of points, a center, a scale and a targer resolution, the 60 | function generates and affine transformation matrix. If invert is ``True`` 61 | it will produce the inverse transformation. 62 | 63 | Arguments: 64 | point {torch.tensor} -- the input 2D point 65 | center {torch.tensor or numpy.array} -- the center around which to perform the transformations 66 | scale {float} -- the scale of the face/object 67 | resolution {float} -- the output resolution 68 | 69 | Keyword Arguments: 70 | invert {bool} -- define wherever the function should produce the direct or the 71 | inverse transformation matrix (default: {False}) 72 | """ 73 | _pt = torch.ones(3) 74 | _pt[0] = point[0] 75 | _pt[1] = point[1] 76 | 77 | h = 200.0 * scale 78 | t = torch.eye(3) 79 | t[0, 0] = resolution / h 80 | t[1, 1] = resolution / h 81 | t[0, 2] = resolution * (-center[0] / h + 0.5) 82 | t[1, 2] = resolution * (-center[1] / h + 0.5) 83 | 84 | if invert: 85 | t = torch.inverse(t) 86 | 87 | new_point = (torch.matmul(t, _pt))[0:2] 88 | 89 | return new_point.int() 90 | 91 | 92 | def crop(image, center, scale, resolution=256.0): 93 | """Center crops an image or set of heatmaps 94 | 95 | Arguments: 96 | image {numpy.array} -- an rgb image 97 | center {numpy.array} -- the center of the object, usually the same as of the bounding box 98 | scale {float} -- scale of the face 99 | 100 | Keyword Arguments: 101 | resolution {float} -- the size of the output cropped image (default: {256.0}) 102 | 103 | Returns: 104 | [type] -- [description] 105 | """ # Crop around the center point 106 | """ Crops the image around the center. 
Input is expected to be an np.ndarray """ 107 | ul = transform([1, 1], center, scale, resolution, True) 108 | br = transform([resolution, resolution], center, scale, resolution, True) 109 | # pad = math.ceil(torch.norm((ul - br).float()) / 2.0 - (br[0] - ul[0]) / 2.0) 110 | if image.ndim > 2: 111 | newDim = np.array([br[1] - ul[1], br[0] - ul[0], 112 | image.shape[2]], dtype=np.int32) 113 | newImg = np.zeros(newDim, dtype=np.uint8) 114 | else: 115 | newDim = np.array([br[1] - ul[1], br[0] - ul[0]], dtype=np.int) 116 | newImg = np.zeros(newDim, dtype=np.uint8) 117 | ht = image.shape[0] 118 | wd = image.shape[1] 119 | newX = np.array( 120 | [max(1, -ul[0] + 1), min(br[0], wd) - ul[0]], dtype=np.int32) 121 | newY = np.array( 122 | [max(1, -ul[1] + 1), min(br[1], ht) - ul[1]], dtype=np.int32) 123 | oldX = np.array([max(1, ul[0] + 1), min(br[0], wd)], dtype=np.int32) 124 | oldY = np.array([max(1, ul[1] + 1), min(br[1], ht)], dtype=np.int32) 125 | newImg[newY[0] - 1:newY[1], newX[0] - 1:newX[1] 126 | ] = image[oldY[0] - 1:oldY[1], oldX[0] - 1:oldX[1], :] 127 | newImg = cv2.resize(newImg, dsize=(int(resolution), int(resolution)), 128 | interpolation=cv2.INTER_LINEAR) 129 | return newImg 130 | 131 | 132 | def get_preds_fromhm(hm, center=None, scale=None): 133 | """Obtain (x,y) coordinates given a set of N heatmaps. If the center 134 | and the scale is provided the function will return the points also in 135 | the original coordinate frame. 136 | 137 | Arguments: 138 | hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H] 139 | 140 | Keyword Arguments: 141 | center {torch.tensor} -- the center of the bounding box (default: {None}) 142 | scale {float} -- face scale (default: {None}) 143 | """ 144 | max, idx = torch.max( 145 | hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2) 146 | idx += 1 147 | preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float() 148 | preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1) 149 | preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1) 150 | 151 | for i in range(preds.size(0)): 152 | for j in range(preds.size(1)): 153 | hm_ = hm[i, j, :] 154 | pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1 155 | if pX > 0 and pX < 63 and pY > 0 and pY < 63: 156 | diff = torch.FloatTensor( 157 | [hm_[pY, pX + 1] - hm_[pY, pX - 1], 158 | hm_[pY + 1, pX] - hm_[pY - 1, pX]]) 159 | preds[i, j].add_(diff.sign_().mul_(.25)) 160 | 161 | preds.add_(-.5) 162 | 163 | preds_orig = torch.zeros(preds.size()) 164 | if center is not None and scale is not None: 165 | for i in range(hm.size(0)): 166 | for j in range(hm.size(1)): 167 | preds_orig[i, j] = transform( 168 | preds[i, j], center, scale, hm.size(2), True) 169 | 170 | return preds, preds_orig 171 | 172 | def get_preds_fromhm_batch(hm, centers=None, scales=None): 173 | """Obtain (x,y) coordinates given a set of N heatmaps. If the centers 174 | and the scales is provided the function will return the points also in 175 | the original coordinate frame. 
176 | 177 | Arguments: 178 | hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H] 179 | 180 | Keyword Arguments: 181 | centers {torch.tensor} -- the centers of the bounding box (default: {None}) 182 | scales {float} -- face scales (default: {None}) 183 | """ 184 | max, idx = torch.max( 185 | hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2) 186 | idx += 1 187 | preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float() 188 | preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1) 189 | preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1) 190 | 191 | for i in range(preds.size(0)): 192 | for j in range(preds.size(1)): 193 | hm_ = hm[i, j, :] 194 | pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1 195 | if pX > 0 and pX < 63 and pY > 0 and pY < 63: 196 | diff = torch.FloatTensor( 197 | [hm_[pY, pX + 1] - hm_[pY, pX - 1], 198 | hm_[pY + 1, pX] - hm_[pY - 1, pX]]) 199 | preds[i, j].add_(diff.sign_().mul_(.25)) 200 | 201 | preds.add_(-.5) 202 | 203 | preds_orig = torch.zeros(preds.size()) 204 | if centers is not None and scales is not None: 205 | for i in range(hm.size(0)): 206 | for j in range(hm.size(1)): 207 | preds_orig[i, j] = transform( 208 | preds[i, j], centers[i], scales[i], hm.size(2), True) 209 | 210 | return preds, preds_orig 211 | 212 | def shuffle_lr(parts, pairs=None): 213 | """Shuffle the points left-right according to the axis of symmetry 214 | of the object. 215 | 216 | Arguments: 217 | parts {torch.tensor} -- a 3D or 4D object containing the 218 | heatmaps. 219 | 220 | Keyword Arguments: 221 | pairs {list of integers} -- [order of the flipped points] (default: {None}) 222 | """ 223 | if pairs is None: 224 | pairs = [16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 225 | 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 27, 28, 29, 30, 35, 226 | 34, 33, 32, 31, 45, 44, 43, 42, 47, 46, 39, 38, 37, 36, 41, 227 | 40, 54, 53, 52, 51, 50, 49, 48, 59, 58, 57, 56, 55, 64, 63, 228 | 62, 61, 60, 67, 66, 65] 229 | if parts.ndimension() == 3: 230 | parts = parts[pairs, ...] 231 | else: 232 | parts = parts[:, pairs, ...] 233 | 234 | return parts 235 | 236 | 237 | def flip(tensor, is_label=False): 238 | """Flip an image or a set of heatmaps left-right 239 | 240 | Arguments: 241 | tensor {numpy.array or torch.tensor} -- [the input image or heatmaps] 242 | 243 | Keyword Arguments: 244 | is_label {bool} -- [denote wherever the input is an image or a set of heatmaps ] (default: {False}) 245 | """ 246 | if not torch.is_tensor(tensor): 247 | tensor = torch.from_numpy(tensor) 248 | 249 | if is_label: 250 | tensor = shuffle_lr(tensor).flip(tensor.ndimension() - 1) 251 | else: 252 | tensor = tensor.flip(tensor.ndimension() - 1) 253 | 254 | return tensor 255 | 256 | # From pyzolib/paths.py (https://bitbucket.org/pyzo/pyzolib/src/tip/paths.py) 257 | 258 | 259 | def appdata_dir(appname=None, roaming=False): 260 | """ appdata_dir(appname=None, roaming=False) 261 | 262 | Get the path to the application directory, where applications are allowed 263 | to write user specific files (e.g. configurations). For non-user specific 264 | data, consider using common_appdata_dir(). 265 | If appname is given, a subdir is appended (and created if necessary). 266 | If roaming is True, will prefer a roaming directory (Windows Vista/7). 
267 | """ 268 | 269 | # Define default user directory 270 | userDir = os.getenv('FACEALIGNMENT_USERDIR', None) 271 | if userDir is None: 272 | userDir = os.path.expanduser('~') 273 | if not os.path.isdir(userDir): # pragma: no cover 274 | userDir = '/var/tmp' # issue #54 275 | 276 | # Get system app data dir 277 | path = None 278 | if sys.platform.startswith('win'): 279 | path1, path2 = os.getenv('LOCALAPPDATA'), os.getenv('APPDATA') 280 | path = (path2 or path1) if roaming else (path1 or path2) 281 | elif sys.platform.startswith('darwin'): 282 | path = os.path.join(userDir, 'Library', 'Application Support') 283 | # On Linux and as fallback 284 | if not (path and os.path.isdir(path)): 285 | path = userDir 286 | 287 | # Maybe we should store things local to the executable (in case of a 288 | # portable distro or a frozen application that wants to be portable) 289 | prefix = sys.prefix 290 | if getattr(sys, 'frozen', None): 291 | prefix = os.path.abspath(os.path.dirname(sys.executable)) 292 | for reldir in ('settings', '../settings'): 293 | localpath = os.path.abspath(os.path.join(prefix, reldir)) 294 | if os.path.isdir(localpath): # pragma: no cover 295 | try: 296 | open(os.path.join(localpath, 'test.write'), 'wb').close() 297 | os.remove(os.path.join(localpath, 'test.write')) 298 | except IOError: 299 | pass # We cannot write in this directory 300 | else: 301 | path = localpath 302 | break 303 | 304 | # Get path specific for this app 305 | if appname: 306 | if path == userDir: 307 | appname = '.' + appname.lstrip('.') # Make it a hidden directory 308 | path = os.path.join(path, appname) 309 | if not os.path.isdir(path): # pragma: no cover 310 | os.mkdir(path) 311 | 312 | # Done 313 | return path 314 | -------------------------------------------------------------------------------- /scripts/wav2lip/hparams.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_image_list(data_root, split): 5 | filelist = [] 6 | 7 | with open('filelists/{}.txt'.format(split)) as f: 8 | for line in f: 9 | line = line.strip() 10 | if ' ' in line: line = line.split()[0] 11 | filelist.append(os.path.join(data_root, line)) 12 | 13 | return filelist 14 | 15 | 16 | class HParams: 17 | def __init__(self, **kwargs): 18 | self.data = {} 19 | 20 | for key, value in kwargs.items(): 21 | self.data[key] = value 22 | 23 | def __getattr__(self, key): 24 | if key not in self.data: 25 | raise AttributeError("'HParams' object has no attribute %s" % key) 26 | return self.data[key] 27 | 28 | def set_hparam(self, key, value): 29 | self.data[key] = value 30 | 31 | 32 | # Default hyperparameters 33 | hparams = HParams( 34 | num_mels=80, # Number of mel-spectrogram channels and local conditioning dimensionality 35 | # network 36 | rescale=True, # Whether to rescale audio prior to preprocessing 37 | rescaling_max=0.9, # Rescaling value 38 | 39 | # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction 40 | # It"s preferred to set True to use with https://github.com/r9y9/wavenet_vocoder 41 | # Does not work if n_ffit is not multiple of hop_size!! 
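    # i.e. n_fft must be an integer multiple of hop_size when use_lws=True;
    # with the defaults below, 800 / 200 = 4, so the constraint is satisfied.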
42 | use_lws=False, 43 | 44 | n_fft=800, # Extra window size is filled with 0 paddings to match this parameter 45 | hop_size=200, # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate) 46 | win_size=800, # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate) 47 | sample_rate=16000, # 16000Hz (corresponding to librispeech) (sox --i ) 48 | 49 | frame_shift_ms=None, # Can replace hop_size parameter. (Recommended: 12.5) 50 | 51 | # Mel and Linear spectrograms normalization/scaling and clipping 52 | signal_normalization=True, 53 | # Whether to normalize mel spectrograms to some predefined range (following below parameters) 54 | allow_clipping_in_normalization=True, # Only relevant if mel_normalization = True 55 | symmetric_mels=True, 56 | # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2, 57 | # faster and cleaner convergence) 58 | max_abs_value=4., 59 | # max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not 60 | # be too big to avoid gradient explosion, 61 | # not too small for fast convergence) 62 | # Contribution by @begeekmyfriend 63 | # Spectrogram Pre-Emphasis (Lfilter: Reduce spectrogram noise and helps model certitude 64 | # levels. Also allows for better G&L phase reconstruction) 65 | preemphasize=True, # whether to apply filter 66 | preemphasis=0.97, # filter coefficient. 67 | 68 | # Limits 69 | min_level_db=-100, 70 | ref_level_db=20, 71 | fmin=55, 72 | # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To 73 | # test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 74 | fmax=7600, # To be increased/reduced depending on data. 75 | 76 | ###################### Our training parameters ################################# 77 | img_size=96, 78 | fps=25, 79 | 80 | batch_size=16, 81 | initial_learning_rate=1e-4, 82 | nepochs=200000000000000000, 83 | ### ctrl + c, stop whenever eval loss is consistently greater than train loss for ~10 epochs 84 | num_workers=16, 85 | checkpoint_interval=3000, 86 | eval_interval=3000, 87 | save_optimizer_state=True, 88 | 89 | syncnet_wt=0.0, # is initially zero, will be set automatically to 0.03 later. Leads to faster convergence. 
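    # Side note on the audio/video alignment implied by the settings above:
    # sample_rate=16000 with hop_size=200 gives 16000 / 200 = 80 mel frames per
    # second, i.e. 80 / fps = 80 / 25 = 3.2 mel steps per video frame.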
90 | syncnet_batch_size=64, 91 | syncnet_lr=1e-4, 92 | syncnet_eval_interval=10000, 93 | syncnet_checkpoint_interval=10000, 94 | 95 | disc_wt=0.07, 96 | disc_initial_learning_rate=1e-4, 97 | ) 98 | 99 | 100 | def hparams_debug_string(): 101 | values = hparams.values() 102 | hp = [" %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"] 103 | return "Hyperparameters:\n" + "\n".join(hp) 104 | -------------------------------------------------------------------------------- /scripts/wav2lip/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wav2lip import Wav2Lip, Wav2Lip_disc_qual 2 | from .syncnet import SyncNet_color -------------------------------------------------------------------------------- /scripts/wav2lip/models/conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | class Conv2d(nn.Module): 6 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.conv_block = nn.Sequential( 9 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 10 | nn.BatchNorm2d(cout) 11 | ) 12 | self.act = nn.ReLU() 13 | self.residual = residual 14 | 15 | def forward(self, x): 16 | out = self.conv_block(x) 17 | if self.residual: 18 | out += x 19 | return self.act(out) 20 | 21 | class nonorm_Conv2d(nn.Module): 22 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs): 23 | super().__init__(*args, **kwargs) 24 | self.conv_block = nn.Sequential( 25 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 26 | ) 27 | self.act = nn.LeakyReLU(0.01, inplace=True) 28 | 29 | def forward(self, x): 30 | out = self.conv_block(x) 31 | return self.act(out) 32 | 33 | class Conv2dTranspose(nn.Module): 34 | def __init__(self, cin, cout, kernel_size, stride, padding, output_padding=0, *args, **kwargs): 35 | super().__init__(*args, **kwargs) 36 | self.conv_block = nn.Sequential( 37 | nn.ConvTranspose2d(cin, cout, kernel_size, stride, padding, output_padding), 38 | nn.BatchNorm2d(cout) 39 | ) 40 | self.act = nn.ReLU() 41 | 42 | def forward(self, x): 43 | out = self.conv_block(x) 44 | return self.act(out) 45 | -------------------------------------------------------------------------------- /scripts/wav2lip/models/syncnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from .conv import Conv2d 6 | 7 | class SyncNet_color(nn.Module): 8 | def __init__(self): 9 | super(SyncNet_color, self).__init__() 10 | 11 | self.face_encoder = nn.Sequential( 12 | Conv2d(15, 32, kernel_size=(7, 7), stride=1, padding=3), 13 | 14 | Conv2d(32, 64, kernel_size=5, stride=(1, 2), padding=1), 15 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 16 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 17 | 18 | Conv2d(64, 128, kernel_size=3, stride=2, padding=1), 19 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 20 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 21 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 22 | 23 | Conv2d(128, 256, kernel_size=3, stride=2, padding=1), 24 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 25 | Conv2d(256, 256, kernel_size=3, stride=1, 
padding=1, residual=True), 26 | 27 | Conv2d(256, 512, kernel_size=3, stride=2, padding=1), 28 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True), 29 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True), 30 | 31 | Conv2d(512, 512, kernel_size=3, stride=2, padding=1), 32 | Conv2d(512, 512, kernel_size=3, stride=1, padding=0), 33 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 34 | 35 | self.audio_encoder = nn.Sequential( 36 | Conv2d(1, 32, kernel_size=3, stride=1, padding=1), 37 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 38 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 39 | 40 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 41 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 42 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 43 | 44 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 45 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 46 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 47 | 48 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 49 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 50 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 51 | 52 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 53 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 54 | 55 | def forward(self, audio_sequences, face_sequences): # audio_sequences := (B, dim, T) 56 | face_embedding = self.face_encoder(face_sequences) 57 | audio_embedding = self.audio_encoder(audio_sequences) 58 | 59 | audio_embedding = audio_embedding.view(audio_embedding.size(0), -1) 60 | face_embedding = face_embedding.view(face_embedding.size(0), -1) 61 | 62 | audio_embedding = F.normalize(audio_embedding, p=2, dim=1) 63 | face_embedding = F.normalize(face_embedding, p=2, dim=1) 64 | 65 | 66 | return audio_embedding, face_embedding 67 | -------------------------------------------------------------------------------- /scripts/wav2lip/models/wav2lip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import math 5 | 6 | from .conv import Conv2dTranspose, Conv2d, nonorm_Conv2d 7 | 8 | class Wav2Lip(nn.Module): 9 | def __init__(self): 10 | super(Wav2Lip, self).__init__() 11 | 12 | self.face_encoder_blocks = nn.ModuleList([ 13 | nn.Sequential(Conv2d(6, 16, kernel_size=7, stride=1, padding=3)), # 96,96 14 | 15 | nn.Sequential(Conv2d(16, 32, kernel_size=3, stride=2, padding=1), # 48,48 16 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 17 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True)), 18 | 19 | nn.Sequential(Conv2d(32, 64, kernel_size=3, stride=2, padding=1), # 24,24 20 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 21 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 22 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True)), 23 | 24 | nn.Sequential(Conv2d(64, 128, kernel_size=3, stride=2, padding=1), # 12,12 25 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 26 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True)), 27 | 28 | nn.Sequential(Conv2d(128, 256, kernel_size=3, stride=2, padding=1), # 6,6 29 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 30 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, 
residual=True)), 31 | 32 | nn.Sequential(Conv2d(256, 512, kernel_size=3, stride=2, padding=1), # 3,3 33 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),), 34 | 35 | nn.Sequential(Conv2d(512, 512, kernel_size=3, stride=1, padding=0), # 1, 1 36 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0)),]) 37 | 38 | self.audio_encoder = nn.Sequential( 39 | Conv2d(1, 32, kernel_size=3, stride=1, padding=1), 40 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 41 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 42 | 43 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 44 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 45 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 46 | 47 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 48 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 49 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 50 | 51 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 52 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 53 | 54 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 55 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 56 | 57 | self.face_decoder_blocks = nn.ModuleList([ 58 | nn.Sequential(Conv2d(512, 512, kernel_size=1, stride=1, padding=0),), 59 | 60 | nn.Sequential(Conv2dTranspose(1024, 512, kernel_size=3, stride=1, padding=0), # 3,3 61 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),), 62 | 63 | nn.Sequential(Conv2dTranspose(1024, 512, kernel_size=3, stride=2, padding=1, output_padding=1), 64 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True), 65 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),), # 6, 6 66 | 67 | nn.Sequential(Conv2dTranspose(768, 384, kernel_size=3, stride=2, padding=1, output_padding=1), 68 | Conv2d(384, 384, kernel_size=3, stride=1, padding=1, residual=True), 69 | Conv2d(384, 384, kernel_size=3, stride=1, padding=1, residual=True),), # 12, 12 70 | 71 | nn.Sequential(Conv2dTranspose(512, 256, kernel_size=3, stride=2, padding=1, output_padding=1), 72 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 73 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),), # 24, 24 74 | 75 | nn.Sequential(Conv2dTranspose(320, 128, kernel_size=3, stride=2, padding=1, output_padding=1), 76 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 77 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),), # 48, 48 78 | 79 | nn.Sequential(Conv2dTranspose(160, 64, kernel_size=3, stride=2, padding=1, output_padding=1), 80 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 81 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),),]) # 96,96 82 | 83 | self.output_block = nn.Sequential(Conv2d(80, 32, kernel_size=3, stride=1, padding=1), 84 | nn.Conv2d(32, 3, kernel_size=1, stride=1, padding=0), 85 | nn.Sigmoid()) 86 | 87 | def forward(self, audio_sequences, face_sequences): 88 | # audio_sequences = (B, T, 1, 80, 16) 89 | B = audio_sequences.size(0) 90 | 91 | input_dim_size = len(face_sequences.size()) 92 | if input_dim_size > 4: 93 | audio_sequences = torch.cat([audio_sequences[:, i] for i in range(audio_sequences.size(1))], dim=0) 94 | face_sequences = torch.cat([face_sequences[:, :, i] for i in range(face_sequences.size(2))], dim=0) 95 | 96 | audio_embedding = self.audio_encoder(audio_sequences) # B, 512, 1, 1 
97 | 98 | feats = [] 99 | x = face_sequences 100 | for f in self.face_encoder_blocks: 101 | x = f(x) 102 | feats.append(x) 103 | 104 | x = audio_embedding 105 | for f in self.face_decoder_blocks: 106 | x = f(x) 107 | try: 108 | x = torch.cat((x, feats[-1]), dim=1) 109 | except Exception as e: 110 | print(x.size()) 111 | print(feats[-1].size()) 112 | raise e 113 | 114 | feats.pop() 115 | 116 | x = self.output_block(x) 117 | 118 | if input_dim_size > 4: 119 | x = torch.split(x, B, dim=0) # [(B, C, H, W)] 120 | outputs = torch.stack(x, dim=2) # (B, C, T, H, W) 121 | 122 | else: 123 | outputs = x 124 | 125 | return outputs 126 | 127 | class Wav2Lip_disc_qual(nn.Module): 128 | def __init__(self): 129 | super(Wav2Lip_disc_qual, self).__init__() 130 | 131 | self.face_encoder_blocks = nn.ModuleList([ 132 | nn.Sequential(nonorm_Conv2d(3, 32, kernel_size=7, stride=1, padding=3)), # 48,96 133 | 134 | nn.Sequential(nonorm_Conv2d(32, 64, kernel_size=5, stride=(1, 2), padding=2), # 48,48 135 | nonorm_Conv2d(64, 64, kernel_size=5, stride=1, padding=2)), 136 | 137 | nn.Sequential(nonorm_Conv2d(64, 128, kernel_size=5, stride=2, padding=2), # 24,24 138 | nonorm_Conv2d(128, 128, kernel_size=5, stride=1, padding=2)), 139 | 140 | nn.Sequential(nonorm_Conv2d(128, 256, kernel_size=5, stride=2, padding=2), # 12,12 141 | nonorm_Conv2d(256, 256, kernel_size=5, stride=1, padding=2)), 142 | 143 | nn.Sequential(nonorm_Conv2d(256, 512, kernel_size=3, stride=2, padding=1), # 6,6 144 | nonorm_Conv2d(512, 512, kernel_size=3, stride=1, padding=1)), 145 | 146 | nn.Sequential(nonorm_Conv2d(512, 512, kernel_size=3, stride=2, padding=1), # 3,3 147 | nonorm_Conv2d(512, 512, kernel_size=3, stride=1, padding=1),), 148 | 149 | nn.Sequential(nonorm_Conv2d(512, 512, kernel_size=3, stride=1, padding=0), # 1, 1 150 | nonorm_Conv2d(512, 512, kernel_size=1, stride=1, padding=0)),]) 151 | 152 | self.binary_pred = nn.Sequential(nn.Conv2d(512, 1, kernel_size=1, stride=1, padding=0), nn.Sigmoid()) 153 | self.label_noise = .0 154 | 155 | def get_lower_half(self, face_sequences): 156 | return face_sequences[:, :, face_sequences.size(2)//2:] 157 | 158 | def to_2d(self, face_sequences): 159 | B = face_sequences.size(0) 160 | face_sequences = torch.cat([face_sequences[:, :, i] for i in range(face_sequences.size(2))], dim=0) 161 | return face_sequences 162 | 163 | def perceptual_forward(self, false_face_sequences): 164 | false_face_sequences = self.to_2d(false_face_sequences) 165 | false_face_sequences = self.get_lower_half(false_face_sequences) 166 | 167 | false_feats = false_face_sequences 168 | for f in self.face_encoder_blocks: 169 | false_feats = f(false_feats) 170 | 171 | false_pred_loss = F.binary_cross_entropy(self.binary_pred(false_feats).view(len(false_feats), -1), 172 | torch.ones((len(false_feats), 1)).cuda()) 173 | 174 | return false_pred_loss 175 | 176 | def forward(self, face_sequences): 177 | face_sequences = self.to_2d(face_sequences) 178 | face_sequences = self.get_lower_half(face_sequences) 179 | 180 | x = face_sequences 181 | for f in self.face_encoder_blocks: 182 | x = f(x) 183 | 184 | return self.binary_pred(x).view(len(x), -1) 185 | -------------------------------------------------------------------------------- /scripts/wav2lip/output/debug/README.md: -------------------------------------------------------------------------------- 1 | debug folder -------------------------------------------------------------------------------- /scripts/wav2lip/output/face_enhanced/README.md: 
-------------------------------------------------------------------------------- 1 | Enhanced file folder -------------------------------------------------------------------------------- /scripts/wav2lip/output/faceswap/README.md: -------------------------------------------------------------------------------- 1 | faceswap file folder -------------------------------------------------------------------------------- /scripts/wav2lip/output/final/README.md: -------------------------------------------------------------------------------- 1 | image generated by stable diffusion -------------------------------------------------------------------------------- /scripts/wav2lip/predicator/README.md: -------------------------------------------------------------------------------- 1 | Place shape_predictor_68_face_landmarks.dat here -------------------------------------------------------------------------------- /scripts/wav2lip/results/README.md: -------------------------------------------------------------------------------- 1 | Generated results will be placed in this folder by default. -------------------------------------------------------------------------------- /scripts/wav2lip/temp/README.md: -------------------------------------------------------------------------------- 1 | Temporary files at the time of inference/testing will be saved here. You can ignore them. -------------------------------------------------------------------------------- /scripts/wav2lip/w2l.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gc 3 | import cv2, os, scripts.wav2lip.audio as audio 4 | import subprocess 5 | from tqdm import tqdm 6 | import torch, scripts.wav2lip.face_detection as face_detection 7 | from scripts.wav2lip.models import Wav2Lip 8 | import modules.shared as shared 9 | from pkg_resources import resource_filename 10 | 11 | 12 | class W2l: 13 | def __init__(self, face, audio, checkpoint, nosmooth, resize_factor, pad_top, pad_bottom, pad_left, pad_right, face_swap_img): 14 | self.wav2lip_folder = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-1]) 15 | self.static = False 16 | if os.path.isfile(face) and face.split('.')[1] in ['jpg', 'png', 'jpeg']: 17 | self.static = True 18 | 19 | self.img_size = 96 20 | self.face = face 21 | self.audio = audio 22 | self.checkpoint = checkpoint 23 | self.mel_step_size = 16 24 | self.face_det_batch_size = 16 25 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 26 | self.pads = [pad_top, pad_bottom, pad_left, pad_right] 27 | self.face_swap_img = face_swap_img 28 | self.nosmooth = nosmooth 29 | self.box = [-1, -1, -1, -1] 30 | self.wav2lip_batch_size = 128 31 | self.fps = 25 32 | self.resize_factor = resize_factor 33 | self.rotate = False 34 | self.crop = [0, -1, 0, -1] 35 | self.checkpoint_path = self.wav2lip_folder + '/checkpoints/' + self.checkpoint + '.pth' 36 | self.outfile = self.wav2lip_folder + '/results/result_voice.mp4' 37 | print('Using {} for inference.'.format(self.device)) 38 | self.ffmpeg_binary = self.find_ffmpeg_binary() 39 | 40 | def find_ffmpeg_binary(self): 41 | for package in ['imageio_ffmpeg', 'imageio-ffmpeg']: 42 | try: 43 | package_path = resource_filename(package, 'binaries') 44 | files = [os.path.join(package_path, f) for f in os.listdir(package_path) if f.startswith("ffmpeg-")] 45 | files.sort(key=lambda x: os.path.getmtime(x), reverse=True) 46 | return files[0] if files else 'ffmpeg' 47 | except: 48 | return 'ffmpeg' 49 | 50 | def 
execute_command(self, command): 51 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 52 | stdout, stderr = process.communicate() 53 | if process.returncode != 0: 54 | raise RuntimeError(stderr) 55 | 56 | def get_smoothened_boxes(self, boxes, T): 57 | for i in range(len(boxes)): 58 | if i + T > len(boxes): 59 | window = boxes[len(boxes) - T:] 60 | else: 61 | window = boxes[i: i + T] 62 | boxes[i] = np.mean(window, axis=0) 63 | return boxes 64 | 65 | def face_detect(self, images): 66 | detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, 67 | flip_input=False, device=self.device) 68 | 69 | batch_size = self.face_det_batch_size 70 | 71 | while 1: 72 | predictions = [] 73 | try: 74 | for i in tqdm(range(0, len(images), batch_size)): 75 | predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size]))) 76 | except RuntimeError: 77 | if batch_size == 1: 78 | raise RuntimeError( 79 | 'Image too big to run face detection on GPU. Please use the --resize_factor argument') 80 | batch_size //= 2 81 | print('Recovering from OOM error; New batch size: {}'.format(batch_size)) 82 | continue 83 | break 84 | 85 | results = [] 86 | pady1, pady2, padx1, padx2 = self.pads 87 | n = 0 88 | for rect, image in zip(predictions, images): 89 | if rect is None: 90 | print("Hum : " + str(n)) 91 | cv2.imwrite(self.wav2lip_folder + '/temp/faulty_frame.jpg', 92 | image) # check this frame where the face was not detected. 93 | raise ValueError('Face not detected! Ensure the video contains a face in all the frames.') 94 | 95 | y1 = max(0, rect[1] - pady1) 96 | y2 = min(image.shape[0], rect[3] + pady2) 97 | x1 = max(0, rect[0] - padx1) 98 | x2 = min(image.shape[1], rect[2] + padx2) 99 | 100 | results.append([x1, y1, x2, y2]) 101 | n += 1 102 | 103 | boxes = np.array(results) 104 | if not self.nosmooth: boxes = self.get_smoothened_boxes(boxes, T=5) 105 | results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)] 106 | 107 | del detector 108 | return results 109 | 110 | def datagen(self, frames, mels): 111 | img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] 112 | 113 | if self.box[0] == -1: 114 | if not self.static: 115 | face_det_results = self.face_detect(frames) # BGR2RGB for CNN face detection 116 | else: 117 | face_det_results = self.face_detect([frames[0]]) 118 | else: 119 | print('Using the specified bounding box instead of face detection...') 120 | y1, y2, x1, x2 = self.box 121 | face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames] 122 | 123 | for i, m in enumerate(mels): 124 | idx = 0 if self.static else i % len(frames) 125 | frame_to_save = frames[idx].copy() 126 | face, coords = face_det_results[idx].copy() 127 | 128 | face = cv2.resize(face, (self.img_size, self.img_size)) 129 | 130 | img_batch.append(face) 131 | mel_batch.append(m) 132 | frame_batch.append(frame_to_save) 133 | coords_batch.append(coords) 134 | 135 | if len(img_batch) >= self.wav2lip_batch_size: 136 | img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) 137 | 138 | img_masked = img_batch.copy() 139 | img_masked[:, self.img_size // 2:] = 0 140 | 141 | img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. 
142 | mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) 143 | 144 | yield img_batch, mel_batch, frame_batch, coords_batch 145 | img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] 146 | 147 | if len(img_batch) > 0: 148 | img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) 149 | 150 | img_masked = img_batch.copy() 151 | img_masked[:, self.img_size // 2:] = 0 152 | 153 | img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. 154 | mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) 155 | 156 | yield img_batch, mel_batch, frame_batch, coords_batch 157 | 158 | def _load(self, checkpoint_path): 159 | shared.cmd_opts.disable_safe_unpickle = True 160 | if self.device == 'cuda': 161 | checkpoint = torch.load(checkpoint_path) 162 | else: 163 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 164 | shared.cmd_opts.disable_safe_unpickle = False 165 | return checkpoint 166 | 167 | def load_model(self, path): 168 | model = Wav2Lip() 169 | print("Load checkpoint from: {}".format(path)) 170 | checkpoint = self._load(path) 171 | s = checkpoint["state_dict"] 172 | new_s = {} 173 | for k, v in s.items(): 174 | new_s[k.replace('module.', '')] = v 175 | model.load_state_dict(new_s) 176 | 177 | model = model.to(self.device) 178 | return model.eval() 179 | 180 | def execute(self): 181 | if not os.path.isfile(self.face): 182 | raise ValueError('--face argument must be a valid path to video/image file') 183 | 184 | elif self.face.split('.')[1] in ['jpg', 'png', 'jpeg']: 185 | full_frames = [cv2.imread(self.face)] 186 | fps = self.fps 187 | 188 | else: 189 | video_stream = cv2.VideoCapture(self.face) 190 | fps = video_stream.get(cv2.CAP_PROP_FPS) 191 | 192 | print('Reading video frames...') 193 | 194 | full_frames = [] 195 | while 1: 196 | still_reading, frame = video_stream.read() 197 | if not still_reading: 198 | video_stream.release() 199 | break 200 | if self.resize_factor > 1 and self.face_swap_img is None: 201 | frame = cv2.resize(frame, 202 | (frame.shape[1] // self.resize_factor, frame.shape[0] // self.resize_factor)) 203 | 204 | if self.rotate: 205 | frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE) 206 | 207 | y1, y2, x1, x2 = self.crop 208 | if x2 == -1: x2 = frame.shape[1] 209 | if y2 == -1: y2 = frame.shape[0] 210 | 211 | frame = frame[y1:y2, x1:x2] 212 | 213 | full_frames.append(frame) 214 | 215 | print("Number of frames available for inference: " + str(len(full_frames))) 216 | 217 | if not self.audio.endswith('.wav'): 218 | print('Extracting raw audio...') 219 | command = [self.ffmpeg_binary, "-y", "-i", self.audio, "-strict", "-2", 220 | self.wav2lip_folder + "/temp/temp.wav"] 221 | 222 | self.execute_command(command) 223 | self.audio = self.wav2lip_folder + '/temp/temp.wav' 224 | 225 | wav = audio.load_wav(self.audio, 16000) 226 | mel = audio.melspectrogram(wav) 227 | print(mel.shape) 228 | 229 | if np.isnan(mel.reshape(-1)).sum() > 0: 230 | raise ValueError( 231 | 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again') 232 | 233 | mel_chunks = [] 234 | mel_idx_multiplier = 80. 
/ fps 235 | i = 0 236 | while 1: 237 | start_idx = int(i * mel_idx_multiplier) 238 | if start_idx + self.mel_step_size > len(mel[0]): 239 | mel_chunks.append(mel[:, len(mel[0]) - self.mel_step_size:]) 240 | break 241 | mel_chunks.append(mel[:, start_idx: start_idx + self.mel_step_size]) 242 | i += 1 243 | 244 | print("Length of mel chunks: {}".format(len(mel_chunks))) 245 | 246 | full_frames = full_frames[:len(mel_chunks)] 247 | 248 | batch_size = self.wav2lip_batch_size 249 | gen = self.datagen(full_frames.copy(), mel_chunks) 250 | 251 | for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, 252 | total=int( 253 | np.ceil( 254 | float(len(mel_chunks)) / batch_size)))): 255 | if i == 0: 256 | model = self.load_model(self.checkpoint_path) 257 | print("Model loaded") 258 | 259 | frame_h, frame_w = full_frames[0].shape[:-1] 260 | out = cv2.VideoWriter(self.wav2lip_folder + '/temp/result.avi', 261 | cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h)) 262 | 263 | img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(self.device) 264 | mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(self.device) 265 | 266 | with torch.no_grad(): 267 | pred = model(mel_batch, img_batch) 268 | 269 | pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255. 270 | 271 | for p, f, c in zip(pred, frames, coords): 272 | y1, y2, x1, x2 = c 273 | p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1)) 274 | 275 | f[y1:y2, x1:x2] = p 276 | out.write(f) 277 | 278 | out.release() 279 | # release memory 280 | model.cpu() 281 | del model 282 | torch.cuda.empty_cache() 283 | gc.collect() 284 | 285 | command = [self.ffmpeg_binary, "-y", "-i", self.audio, "-i", self.wav2lip_folder + '/temp/result.avi', 286 | "-strict", "-2", "-q:v", "1", self.outfile] 287 | self.execute_command(command) 288 | -------------------------------------------------------------------------------- /scripts/wav2lip/wav2lip_uhq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import cv2 4 | import dlib 5 | import json 6 | import torch 7 | import scripts.wav2lip.face_detection as face_detection 8 | from imutils import face_utils 9 | import subprocess 10 | from modules.shared import state, opts 11 | from pkg_resources import resource_filename 12 | import modules.face_restoration 13 | from modules import devices 14 | 15 | 16 | class Wav2LipUHQ: 17 | def __init__(self, face, face_restore_model, mouth_mask_dilatation, erode_face_mask, mask_blur, only_mouth, 18 | face_swap_img, resize_factor, code_former_weight, debug=False): 19 | self.wav2lip_folder = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-1]) 20 | self.original_video = face 21 | self.face_restore_model = face_restore_model 22 | self.mouth_mask_dilatation = mouth_mask_dilatation 23 | self.erode_face_mask = erode_face_mask 24 | self.mask_blur = mask_blur 25 | self.only_mouth = only_mouth 26 | self.face_swap_img = face_swap_img 27 | self.w2l_video = self.wav2lip_folder + '/results/result_voice.mp4' 28 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 29 | self.ffmpeg_binary = self.find_ffmpeg_binary() 30 | self.resize_factor = resize_factor 31 | self.code_former_weight = code_former_weight 32 | self.debug = debug 33 | 34 | def find_ffmpeg_binary(self): 35 | for package in ['imageio_ffmpeg', 'imageio-ffmpeg']: 36 | try: 37 | package_path = resource_filename(package, 'binaries') 38 | files = [os.path.join(package_path, f) for f in 
os.listdir(package_path) if f.startswith("ffmpeg-")] 39 | files.sort(key=lambda x: os.path.getmtime(x), reverse=True) 40 | return files[0] if files else 'ffmpeg' 41 | except: 42 | return 'ffmpeg' 43 | 44 | def assure_path_exists(self, path): 45 | dir = os.path.dirname(path) 46 | if not os.path.exists(dir): 47 | os.makedirs(dir) 48 | 49 | def get_framerate(self, video_file): 50 | video = cv2.VideoCapture(video_file) 51 | fps = video.get(cv2.CAP_PROP_FPS) 52 | video.release() 53 | return fps 54 | 55 | def execute_command(self, command): 56 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 57 | stdout, stderr = process.communicate() 58 | if process.returncode != 0: 59 | raise RuntimeError(stderr) 60 | 61 | def create_video_from_images(self, nb_frames): 62 | fps = str(self.get_framerate(self.w2l_video)) 63 | command = [self.ffmpeg_binary, "-y", "-framerate", fps, "-start_number", "0", "-i", 64 | self.wav2lip_folder + "/output/final/output_%05d.png", "-vframes", 65 | str(nb_frames), "-c:v", "libx264", "-pix_fmt", "yuv420p", "-b:v", "8000k", 66 | self.wav2lip_folder + "/output/video.mp4"] 67 | 68 | self.execute_command(command) 69 | 70 | command = [self.ffmpeg_binary, "-y", "-framerate", fps, "-start_number", "0", "-i", 71 | self.wav2lip_folder + "/output/face_enhanced/face_restore_%05d.png", "-vframes", 72 | str(nb_frames), "-c:v", "libx264", "-pix_fmt", "yuv420p", "-b:v", "8000k", 73 | self.wav2lip_folder + "/output/video_enhanced.mp4"] 74 | 75 | self.execute_command(command) 76 | 77 | def extract_audio_from_video(self): 78 | command = [self.ffmpeg_binary, "-y", "-i", self.w2l_video, "-vn", "-acodec", "copy", 79 | self.wav2lip_folder + "/output/output_audio.aac"] 80 | self.execute_command(command) 81 | 82 | def add_audio_to_video(self): 83 | command = [self.ffmpeg_binary, "-y", "-i", self.wav2lip_folder + "/output/video.mp4", "-i", 84 | self.wav2lip_folder + "/output/output_audio.aac", "-c:v", "copy", "-c:a", "aac", "-strict", 85 | "experimental", self.wav2lip_folder + "/output/output_video.mp4"] 86 | self.execute_command(command) 87 | 88 | command = [self.ffmpeg_binary, "-y", "-i", self.wav2lip_folder + "/output/video_enhanced.mp4", "-i", 89 | self.wav2lip_folder + "/output/output_audio.aac", "-c:v", "copy", "-c:a", "aac", "-strict", 90 | "experimental", self.wav2lip_folder + "/output/output_video_enhanced.mp4"] 91 | self.execute_command(command) 92 | 93 | def initialize_dlib_predictor(self): 94 | print("[INFO] Loading the predictor...") 95 | detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, 96 | flip_input=False, device=self.device) 97 | predictor = dlib.shape_predictor(self.wav2lip_folder + "/predicator/shape_predictor_68_face_landmarks.dat") 98 | return detector, predictor 99 | 100 | def initialize_video_streams(self): 101 | print("[INFO] Loading File...") 102 | vs = cv2.VideoCapture(self.w2l_video) 103 | vi = cv2.VideoCapture(self.original_video) 104 | return vs, vi 105 | 106 | def dilate_mouth(self, mouth, w, h): 107 | mask = np.zeros((w, h), dtype=np.uint8) 108 | cv2.fillPoly(mask, [mouth], 255) 109 | kernel = np.ones((self.mouth_mask_dilatation, self.mouth_mask_dilatation), np.uint8) 110 | dilated_mask = cv2.dilate(mask, kernel, iterations=1) 111 | contours, _ = cv2.findContours(dilated_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 112 | dilated_points = contours[0].squeeze() 113 | return dilated_points 114 | 115 | def execute(self, resume=False): 116 | output_dir = self.wav2lip_folder + '/output/' 
117 | debug_path = output_dir + "debug/" 118 | face_enhanced_path = output_dir + "face_enhanced/" 119 | final_path = output_dir + 'final/' 120 | detector, predictor = self.initialize_dlib_predictor() 121 | vs, vi = self.initialize_video_streams() 122 | (mstart, mend) = face_utils.FACIAL_LANDMARKS_IDXS["mouth"] 123 | (jstart, jend) = face_utils.FACIAL_LANDMARKS_IDXS["jaw"] 124 | (nstart, nend) = face_utils.FACIAL_LANDMARKS_IDXS["nose"] 125 | 126 | max_frame = str(int(vs.get(cv2.CAP_PROP_FRAME_COUNT))) 127 | original_codeformer_weight = opts.code_former_weight 128 | original_face_restoration_model = opts.face_restoration_model 129 | 130 | opts.code_former_weight = self.code_former_weight 131 | opts.face_restoration_model = self.face_restore_model 132 | 133 | frame_number = 0 134 | if resume: 135 | if os.path.exists(self.wav2lip_folder + "/resume.json"): 136 | with open(self.wav2lip_folder + "/resume.json", "r") as f: 137 | parameters = json.load(f) 138 | # Read frame 139 | for f in range(parameters["frame"]): 140 | _, _ = vs.read() 141 | ret, _ = vi.read() 142 | if not ret: 143 | vi.release() 144 | vi = cv2.VideoCapture(self.original_video) 145 | _, _ = vi.read() 146 | frame_number = parameters["frame"] 147 | print("Face Restoration model: " + str(opts.face_restoration_model)) 148 | 149 | while True: 150 | print("[INFO] Processing frame: " + str(frame_number) + " of " + max_frame + " - ", end="\r") 151 | f_number = str(frame_number).rjust(5, '0') 152 | if state.interrupted: 153 | break 154 | 155 | # Read frame 156 | ret, w2l_frame = vs.read() 157 | if not ret: 158 | break 159 | 160 | ret, original_frame = vi.read() 161 | if not ret: 162 | vi.release() 163 | vi = cv2.VideoCapture(self.original_video) 164 | ret, original_frame = vi.read() 165 | 166 | if w2l_frame.shape != original_frame.shape: 167 | if self.resize_factor > 1 and self.face_swap_img is None: 168 | original_frame = cv2.resize(original_frame, (w2l_frame.shape[1], w2l_frame.shape[0])) 169 | else: 170 | w2l_frame = cv2.resize(w2l_frame, (original_frame.shape[1], original_frame.shape[0])) 171 | 172 | # Convert to gray 173 | original_gray = cv2.cvtColor(original_frame, cv2.COLOR_RGB2GRAY) 174 | 175 | # Restore face 176 | w2l_frame_to_restore = cv2.cvtColor(w2l_frame, cv2.COLOR_BGR2RGB) 177 | image_restored = modules.face_restoration.restore_faces(w2l_frame_to_restore) 178 | 179 | image_restored2 = cv2.cvtColor(image_restored, cv2.COLOR_RGB2BGR) 180 | cv2.imwrite(face_enhanced_path + "face_restore_" + f_number + ".png", image_restored2) 181 | image_restored_gray = cv2.cvtColor(image_restored2, cv2.COLOR_RGB2GRAY) 182 | 183 | # Detect faces 184 | rects = detector.get_detections_for_batch(np.array([np.array(image_restored2)])) 185 | 186 | # Initialize mask 187 | mask = np.zeros_like(original_gray) 188 | 189 | # Process each detected face 190 | for (i, rect) in enumerate(rects): 191 | # Get face coordinates 192 | if not self.only_mouth: 193 | shape = predictor(original_gray, dlib.rectangle(*rect)) 194 | shape = face_utils.shape_to_np(shape) 195 | jaw = shape[jstart:jend][1:-1] 196 | nose = shape[nstart:nend][2] 197 | 198 | # Get mouth coordinates 199 | shape = predictor(image_restored_gray, dlib.rectangle(*rect)) 200 | shape = face_utils.shape_to_np(shape) 201 | 202 | mouth = shape[mstart:mend][:-8] 203 | mouth = np.delete(mouth, [3], axis=0) 204 | if self.mouth_mask_dilatation > 0: 205 | mouth = self.dilate_mouth(mouth, original_gray.shape[0], original_gray.shape[1]) 206 | 207 | # Create mask for face 208 | if not self.only_mouth: 209 | 
external_shape = np.append(jaw, [nose], axis=0) 210 | external_shape_pts = external_shape.reshape((-1, 1, 2)) 211 | mask = cv2.fillPoly(mask, [external_shape_pts], 255) 212 | if self.erode_face_mask > 0: 213 | kernel = np.ones((self.erode_face_mask, self.erode_face_mask), np.uint8) 214 | mask = cv2.erode(mask, kernel, iterations=1) 215 | # Calculate diff between frames and apply threshold 216 | diff = np.abs(original_gray - image_restored_gray) 217 | diff[diff > 10] = 255 218 | diff[diff <= 10] = 0 219 | masked_diff = cv2.bitwise_and(diff, diff, mask=mask) 220 | else: 221 | masked_diff = mask 222 | 223 | # Create mask for mouth 224 | cv2.fillConvexPoly(masked_diff, mouth, 255) 225 | 226 | # Save mask 227 | if self.mask_blur > 0: 228 | blur = self.mask_blur if self.mask_blur % 2 == 1 else self.mask_blur - 1 229 | masked_save = cv2.GaussianBlur(masked_diff, (blur, blur), 0) 230 | else: 231 | masked_save = masked_diff 232 | 233 | original = original_frame.copy() 234 | 235 | # Apply restored face to original image with mask attention 236 | extended_mask = np.stack([masked_save] * 3, axis=-1) 237 | normalized_mask = extended_mask / 255.0 238 | dst = image_restored2 * normalized_mask 239 | original = original * (1 - normalized_mask) + dst 240 | original = original.astype(np.uint8) 241 | 242 | # Save final image 243 | cv2.imwrite(final_path + "output_" + f_number + ".png", original) 244 | 245 | if self.debug: 246 | clone = w2l_frame.copy() 247 | if not self.only_mouth: 248 | for (x, y) in np.concatenate((jaw, mouth, [nose])): 249 | cv2.circle(clone, (x, y), 1, (0, 0, 255), -1) 250 | else: 251 | for (x, y) in mouth: 252 | cv2.circle(clone, (x, y), 1, (0, 0, 255), -1) 253 | if not self.only_mouth: 254 | cv2.imwrite(debug_path + "diff_" + f_number + ".png", diff) 255 | cv2.imwrite(debug_path + "points_" + f_number + ".png", clone) 256 | cv2.imwrite(debug_path + 'mask_' + f_number + '.png', masked_save) 257 | cv2.imwrite(debug_path + 'original_' + f_number + '.png', original_frame) 258 | cv2.imwrite(debug_path + "face_restore_" + f_number + ".png", image_restored2) 259 | cv2.imwrite(debug_path + "dst_" + f_number + ".png", dst) 260 | 261 | frame_number += 1 262 | opts.code_former_weight = original_codeformer_weight 263 | opts.face_restoration_model = original_face_restoration_model 264 | devices.torch_gc() 265 | if frame_number > 1: 266 | vs.release() 267 | vi.release() 268 | 269 | print("[INFO] Create Videos output!") 270 | self.create_video_from_images(frame_number - 1) 271 | print("[INFO] Extract Audio from input!") 272 | self.extract_audio_from_video() 273 | print("[INFO] Add Audio to Videos!") 274 | self.add_audio_to_video() 275 | print("[INFO] Done! 
output videos saved in output/output_video.mp4 and output/output_video_enhanced.mp4") 276 | 277 | if str(frame_number) != max_frame: 278 | parameters = {"frame": frame_number} 279 | with open(self.wav2lip_folder + "/resume.json", 'w') as f: 280 | json.dump(parameters, f) 281 | else: 282 | if os.path.exists(self.wav2lip_folder + "/resume.json"): 283 | os.remove(self.wav2lip_folder + "/resume.json") 284 | if self.face_swap_img is None: 285 | face_swap_output = None 286 | else: 287 | face_swap_output = self.wav2lip_folder + "/output/faceswap/video.mp4" 288 | return [face_swap_output, 289 | self.wav2lip_folder + "/results/result_voice.mp4", 290 | self.wav2lip_folder + "/output/output_video_enhanced.mp4", 291 | self.wav2lip_folder + "/output/output_video.mp4"] 292 | else: 293 | print("[INFO] Interrupted!") 294 | return None 295 | -------------------------------------------------------------------------------- /scripts/wav2lip_uhq.py: -------------------------------------------------------------------------------- 1 | from modules import script_callbacks 2 | from scripts.wav2lip_uhq_extend_paths import wav2lip_uhq_sys_extend 3 | 4 | 5 | def init_wav2lip_uhq(): 6 | wav2lip_uhq_sys_extend() 7 | from ui import on_ui_tabs 8 | script_callbacks.on_ui_tabs(on_ui_tabs) 9 | 10 | 11 | init_wav2lip_uhq() 12 | -------------------------------------------------------------------------------- /scripts/wav2lip_uhq_extend_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def wav2lip_uhq_sys_extend(): 6 | wav2lip_uhq_folder_name = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-2]) 7 | 8 | basedirs = [os.getcwd()] 9 | for _ in basedirs: 10 | wav2lip_uhq_paths_to_ensure = [os.path.join(wav2lip_uhq_folder_name, 'scripts')] 11 | for wav2lip_uhq_scripts_path_fix in wav2lip_uhq_paths_to_ensure: 12 | if wav2lip_uhq_scripts_path_fix not in sys.path: 13 | sys.path.extend([wav2lip_uhq_scripts_path_fix]) 14 | --------------------------------------------------------------------------------
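The two main classes listed above form a two-stage pipeline: W2l (scripts/wav2lip/w2l.py) renders the raw lip-synced video to scripts/wav2lip/results/result_voice.mp4, and Wav2LipUHQ (scripts/wav2lip/wav2lip_uhq.py) reads that file back, restores and masks the mouth region frame by frame, and writes output/output_video.mp4 and output/output_video_enhanced.mp4. The following is a minimal, hypothetical sketch of that chaining, not the extension's own entry point: it assumes the code runs inside the AUTOMATIC1111 web UI process (so that modules.shared and modules.face_restoration resolve), that wav2lip_gan.pth and shape_predictor_68_face_landmarks.dat are already downloaded into their checkpoint/predicator folders, and that the input paths and tuning values shown are illustrative placeholders.

from scripts.wav2lip.w2l import W2l
from scripts.wav2lip.wav2lip_uhq import Wav2LipUHQ

# Hypothetical inputs -- replace with real files.
face_video = "demo/speaker.mp4"
voice_audio = "demo/speech.wav"

# Stage 1: Wav2Lip inference; writes scripts/wav2lip/results/result_voice.mp4.
w2l = W2l(face_video, voice_audio, "wav2lip_gan", nosmooth=False,
          resize_factor=1, pad_top=0, pad_bottom=10, pad_left=0, pad_right=0,
          face_swap_img=None)
w2l.execute()

# Stage 2: per-frame mouth masking + face restoration; writes
# output/output_video.mp4 and output/output_video_enhanced.mp4
# under scripts/wav2lip/output/.
uhq = Wav2LipUHQ(face_video, "CodeFormer", mouth_mask_dilatation=15,
                 erode_face_mask=15, mask_blur=25, only_mouth=False,
                 face_swap_img=None, resize_factor=1, code_former_weight=0.75,
                 debug=False)
videos = uhq.execute(resume=False)  # list of output paths, or None if interrupted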