├── .github └── FUNDING.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── README_CN.md ├── install.py ├── requirements.txt └── scripts ├── bark ├── speakers.json └── tts.py ├── faceswap ├── model │ └── README.md └── swap.py ├── ui.py ├── wav2lip ├── audio.py ├── checkpoints │ └── README.md ├── face_detection │ ├── README.md │ ├── __init__.py │ ├── api.py │ ├── detection │ │ ├── __init__.py │ │ ├── core.py │ │ └── sfd │ │ │ ├── __init__.py │ │ │ ├── bbox.py │ │ │ ├── detect.py │ │ │ ├── net_s3fd.py │ │ │ └── sfd_detector.py │ ├── models.py │ └── utils.py ├── hparams.py ├── models │ ├── __init__.py │ ├── conv.py │ ├── syncnet.py │ └── wav2lip.py ├── output │ ├── debug │ │ └── README.md │ ├── face_enhanced │ │ └── README.md │ ├── faceswap │ │ └── README.md │ └── final │ │ └── README.md ├── predicator │ └── README.md ├── results │ └── README.md ├── temp │ └── README.md ├── w2l.py └── wav2lip_uhq.py ├── wav2lip_uhq.py └── wav2lip_uhq_extend_paths.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: Wav2LipStudio 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | demo/* 3 | **/__pycache__/ 4 | scripts/wav2lip/checkpoints/wav2lip_gan.pth 5 | scripts/wav2lip/checkpoints/wav2lip.pth 6 | scripts/wav2lip/checkpoints/visual_quality_disc_.pth 7 | scripts/wav2lip/checkpoints/lipsync_expert_.pth 8 | scripts/wav2lip/face_detection/detection/sfd/s3fd.pth 9 | scripts/wav2lip/predicator/shape_predictor_68_face_landmarks.dat 10 | scripts/wav2lip/output/debug/*.png 11 | scripts/wav2lip/output/final/*.png 12 | scripts/wav2lip/output/images/*.png 13 | scripts/wav2lip/output/masks/*.png 14 | scripts/wav2lip/output/*.mp4 15 | scripts/wav2lip/output/*.aac 16 | scripts/wav2lip/results/result_voice.mp4 17 | scripts/wav2lip/temp/*.avi 18 | scripts/wav2lip/temp/*.wav 19 | docs/* 20 | scripts/faceswap/model/inswapper_128.onnx -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to sd-wav2lip-uhq 2 | 3 | Thank you for your interest in contributing to sd-wav2lip-uhq! We appreciate your effort and to help us incorporate your contribution in the best way possible, please follow the following contribution guidelines. 4 | 5 | ## Reporting Bugs 6 | 7 | If you find a bug in the project, we encourage you to report it. Here's how: 8 | 9 | 1. First, check the [existing Issues](url_of_issues) to see if the issue has already been reported. 
If it has, please add a comment to the existing issue rather than creating a new one. 10 | 2. If you can't find an existing issue that matches your bug, create a new issue. Make sure to include as many details as possible so we can understand and reproduce the problem. 11 | 12 | ## Proposing Changes 13 | 14 | We welcome code contributions from the community. Here's how to propose changes: 15 | 16 | 1. Fork this repository to your own GitHub account. 17 | 2. Create a new branch on your fork for your changes. 18 | 3. Make your changes in this branch. 19 | 4. When you are ready, submit a pull request to the `main` branch of this repository. 20 | 21 | Please note that we use the GitHub Flow workflow, so all pull requests should be made to the `main` branch. 22 | 23 | Before submitting a pull request, please make sure your code adheres to the project's coding conventions and it has passed all tests. If you are adding features, please also add appropriate tests. 24 | 25 | ## Contact 26 | 27 | If you have any questions or need help, please ping the developer via discord NumZ#7184 to make sure your addition will fit well into such a large project and to get help if needed. 28 | 29 | Thank you again for your contribution! 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2023 NumZ 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🔉👄 Wav2Lip STUDIO extension for Stable Diffusion WebUI Automatic1111 2 | 3 | ##
English | 简体中文
4 | 5 | 6 | 7 | https://user-images.githubusercontent.com/800903/262435301-af205a91-30d7-43f2-afcc-05980d581fe0.mp4 8 | ## **STANDALONE VERSION CAN BE FOUND HERE** : [WAV2LIP STUDIO STANDALONE](https://www.patreon.com/Wav2LipStudio) 9 | In the standalone version you can : 10 | - ♻ Manage project: Add a feature to manage multiple project 11 | - 👪 Introduced multiple face swap: Can now Swap multiple face in one shot 12 | - ⛔ Visible face restriction: Can now make whole process even if no face detected on frame! 13 | - 📺 Video Size: works with high resolution video input, (test with 1980x1080, should works with 4K but slow) 14 | - 🔑 Keyframe manager: Add a keyframe manager for better control of the video generation 15 | - 🍪 coqui TTS integration: Remove bark integration, use coqui TTS instead 16 | - 💬 Conversation: Add a conversation feature with multiple person 17 | - 🔈 Record your own voice: Add a feature to record your own voice 18 | - 👬 Clone voice: Add a feature to clone voice from video 19 | - 🎏 translate video: Add a feature to translate video with voice clone (HEYGEN like) 20 | - 🔉 Volume amplifier for wav2lip: Add a feature to amplify the volume of the wav2lip output 21 | - 🕡 Add delay before sound speech start 22 | - 🚀 Speed up process: Speed up the process 23 | 24 | 25 | ## 💡 Description 26 | This repository contains a Wav2Lip Studio extension for Automatic1111. 27 | 28 | It's an all-in-one solution: just choose a video and a speech file (wav or mp3), and the extension will generate a lip-sync video. It improves the quality of the lip-sync videos generated by the [Wav2Lip tool](https://github.com/Rudrabha/Wav2Lip) by applying specific post-processing techniques with Stable diffusion tools. 29 | 30 | ![Illustration](https://user-images.githubusercontent.com/800903/267808204-ae971458-9e8d-403e-9e10-9b7b7590d999.png) 31 | 32 | ## 📖 Quick Index 33 | * [🚀 Updates](#-updates) 34 | * [🔗 Requirements](#-requirements) 35 | * [💻 Installation](#-installation) 36 | * [🐍 Usage](#-usage) 37 | * [👄 Note on the bark Fidelity](#-note-on-the-bark-fidelity) 38 | * [📺 Examples](#-examples) 39 | * [📖 Behind the scenes](#-behind-the-scenes) 40 | * [💪 Quality tips](#-quality-tips) 41 | * [⚠️Noted Constraints](#-noted-constraints) 42 | * [📝 To do](#-to-do) 43 | * [😎 Contributing](#-contributing) 44 | * [🙏 Appreciation](#-appreciation) 45 | * [📝 Citation](#-citation) 46 | * [📜 License](#-license) 47 | * [☕ Support Wav2lip Studio](#-support-wav2lip-studio) 48 | 49 | ## 🚀 Updates 50 | **2023.09.13** 51 | - 👪 Introduced face swap: facefusion integration (See Usage section) **this feature is under experimental**. 52 | 53 | **2023.08.22** 54 | - 👄 Introduced [bark](https://github.com/suno-ai/bark/) (See Usage section), **this feature is under experimental**. 55 | 56 | **2023.08.20** 57 | - 🚢 Introduced the GFPGAN model as an option. 58 | - ▶ Added the feature to resume generation. 59 | - 📏 Optimized to release memory post-generation. 60 | 61 | **2023.08.17** 62 | - 🐛 Fixed purple lips bug 63 | 64 | **2023.08.16** 65 | - ⚡ Added Wav2lip and enhanced video output, with the option to download the one that's best for you, likely the "generated video". 66 | - 🚢 Updated User Interface: Introduced control over CodeFormer Fidelity. 67 | - 👄 Removed image as input, [SadTalker](https://github.com/OpenTalker/SadTalker) is better suited for this. 68 | - 🐛 Fixed a bug regarding the discrepancy between input and output video that incorrectly positioned the mask. 69 | - 💪 Refined the quality process for greater efficiency. 
70 | - 🚫 Interruption will now generate videos if the process creates frames 71 | 72 | **2023.08.13** 73 | - ⚡ Speed-up computation 74 | - 🚢 Change User Interface : Add controls on hidden parameters 75 | - 👄 Only Track mouth if needed 76 | - 📰 Control debug 77 | - 🐛 Fix resize factor bug 78 | 79 | ## 🔗 Requirements 80 | 81 | - latest version of Stable Diffusion WebUI Automatic1111 by following the instructions on the [Stable Diffusion Webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) repository. 82 | - FFmpeg : download it from the [official FFmpeg site](https://ffmpeg.org/download.html). Follow the instructions appropriate for your operating system, note ffmpeg have to be accessible from the command line. 83 | 84 | ## 💻 Installation 85 | 86 | 1. Launch Automatic1111 87 | 2. Face Swap : On Windows, download and install [Visual Studio](https://visualstudio.microsoft.com/fr/downloads/). During the install, make sure to include the Python and C++ packages. 88 | 3. In the extensions tab, enter the following URL in the "Install from URL" field and click "Install": 89 | 90 | ![Illustration](https://user-images.githubusercontent.com/800903/258115646-22b4b363-c363-4fc8-b316-c162b61b5d15.png) 91 | 92 | 4. Go to the "Installed Tab" in the extensions tab and click "Apply and quit". 93 | 94 | ![Illustration](https://user-images.githubusercontent.com/800903/258115651-196a07bd-ee4b-4aaf-b11e-8e2d1ffaa42f.png) 95 | 96 | 5. If you don't see the "Wav2Lip UHQ tab" restart Automatic1111. 97 | 98 | 6. 🔥 Important: Get the weights. Download the model weights from the following locations and place them in the corresponding directories (take care about the filename, especially for s3fd) 99 | 100 | | Model | Description | Link to the model | install folder | 101 | |:-------------------:|:----------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------:| 102 | | Wav2Lip | Highly accurate lip-sync | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?e=TBFBVW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 103 | | Wav2Lip + GAN | Slightly inferior lip-sync, but better visual quality | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA?e=n9ljGW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 104 | | s3fd | Face Detection pre trained model | [Link](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) | extensions\sd-wav2lip-uhq\scripts\wav2lip\face_detection\detection\sfd\s3fd.pth | 105 | | landmark predicator | Dlib 68 point face landmark prediction (click on the download icon) | [Link](https://github.com/numz/wav2lip_uhq/blob/main/predicator/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 106 | | landmark predicator | Dlib 68 point face landmark prediction (alternate link) | [Link](https://huggingface.co/spaces/asdasdasdasd/Face-forgery-detection/resolve/ccfc24642e0210d4d885bc7b3dbc9a68ed948ad6/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 107 | | 
landmark predicator | Dlib 68 point face landmark prediction (alternate link click on the download icon) | [Link](https://github.com/italojs/facial-landmarks-recognition/blob/master/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 108 | | face swap model | model used by face swap |[Link](https://huggingface.co/ezioruan/inswapper_128.onnx/resolve/main/inswapper_128.onnx) | extensions\sd-wav2lip-uhq\scripts\faceswap\model\inswapper_128.onnx | 109 | 110 | 111 | ## 🐍 Usage 112 | 1. Choose a video (avi or mp4 format) with a face in it. If there is no face in only one frame of the video, process will fail. Note avi file will not appear in Video input but process will works. 113 | 2. Face Swap (take times so be patient): 114 | 1. **Face Swap**: chose the image of the face you want to swap with the face in the video. 115 | 2. **Face Index**: if there are multiple faces in the image, you can choose the face you want to swap with the face in the video. 0 is the first face from left to right. 116 | 3. Audio, 2 options: 117 | 1. Put audio file in the "Speech" input. 118 | 2. Generate Audio with the text to speech [bark](https://github.com/suno-ai/bark/) integration. 119 | 1. Choose the language : Turkish, English, Chinese, Hindi, Italian, Japanese, Korean, Portuguese, Russian, Spanish, Polish, German, French 120 | 2. Choose the Gender 121 | 3. Choose your speaker, you can ear a sample in the "Audio Example" 122 | 4. Choose Low VRAM True (default) if you have a Video Card with less than 16GB VRAM 123 | 5. Write your text in the text area "Prompt" 124 | - **Note** that bark can only generate 14 seconds of audio, so if you want to generate a longer audio, you have to use "[split]" in your text. 125 | - For example, if you want to generate a 30 seconds audio, you have to write your text like this : 126 | - "This is the first part of my text **[split]** This is the second part of my text" 127 | 6. Temperature: 0.0 is supposed to be closer to the voice, and 1.0 is more creative, but in reality, 0.0 yields strange results and 1.0 something very far from the voice. 0.7 is the default value set by 'bark', try different values to see what works best for you. 128 | 7. Silence : Time in seconds between each punctuation(。!!.??,). Default is 0.25 seconds. 129 | 8. See Bark [documentation](https://github.com/suno-ai/bark/) for more details. 130 | 9. Below is a list of some known non-speech sounds. 131 | - [laughter] 132 | - [laughs] 133 | - [sighs] 134 | - [music] 135 | - [gasps] 136 | - [clears throat] 137 | - "-" or ... for hesitations 138 | - ♪ for song lyrics 139 | - CAPITALIZATION for emphasis of a word 140 | - [MAN] and [WOMAN] to bias Bark toward male and female speakers, respectively 141 | 4. choose a checkpoint (see table above). 142 | 5. **Padding**: Wav2Lip uses this to move the mouth. This is useful if the mouth is not at the good place. Usually, default value is good, but certain video may need to be adjusted. 143 | 6. **No Smooth**: When checked, this option retains the original mouth shape without smoothing. 144 | 7. **Resize Factor**: This is a resize factor for the video. The default value is 1.0, but you can change it to suit your needs. This is useful if the video size is too large. 145 | 8. **Only Mouth**: This option tracks only the mouth, removing other facial motions like those of the cheeks and chin. 146 | 9. **Mouth Mask Dilate**: This will dilate the mouth mask to cover more area around the mouth. 
depends on the mouth size. 147 | 10. **Face Mask Erode**: This will erode the face mask to remove some area around the face. depends on the face size. 148 | 11. **Mask Blur**: This will blur the mask to make it more smooth, try to keep it under or equal to **Mouth Mask Dilate**. 149 | 12. **Code Former Fidelity**: 150 | 1. A value of 0 offers higher quality but may significantly alter the person's facial appearance and cause noticeable flickering between frames. 151 | 2. A value of 1 provides lower quality but maintains the person's face more consistently and reduces frame flickering. 152 | 3. Using a value below 0.5 is not advised. Adjust this setting to achieve optimal results. Starting with a value of 0.75 is recommended. 153 | 13. **Active debug**: This will create step-by-step images in the debug folder. 154 | 14. Click on the "Generate" button. 155 | 15. ⚠ "resume" button can be use if face swap and wav2lip step have been done, then you can adjust "mouth mask dilate", "face mask erode", "mask blur" and change "restoration model" without regenerate face swap and wav2lip. 156 | 157 | ## 👄 Note on the bark Fidelity 158 | 159 | Bark is interesting but sometimes yields strange results (or even hilarious ones). Each generation will give you something different and It may take several generations before you achieve something conclusive. 160 | Apart from English, it seems that the other languages speak as if they were being used by a foreigner. Sometimes even if you choose "Male" it will speak like a woman, and vice versa. Sometimes, even when choosing a specific speaker, it will sound like another speaker or even another language. 161 | 162 | ## 📺 Examples 163 | 164 | https://user-images.githubusercontent.com/800903/262439441-bb9d888a-d33e-4246-9f0a-1ddeac062d35.mp4 165 | 166 | https://user-images.githubusercontent.com/800903/262442794-61b1e32f-3f87-4b36-98d6-f711822bdb1e.mp4 167 | 168 | https://user-images.githubusercontent.com/800903/262449305-901086a3-22cb-42d2-b5be-a5f38db4549a.mp4 169 | 170 | https://user-images.githubusercontent.com/800903/267808494-300f8cc3-9136-4810-86e2-92f2114a5f9a.mp4 171 | 172 | ## 📖 Behind the scenes 173 | 174 | This extension operates in several stages to improve the quality of Wav2Lip-generated videos: 175 | 176 | 1. **Generate face swap video**: The script first generates the face swap video if image is in "face Swap" field, this operation take times so be patient. 177 | 2. **Generate a Wav2lip video**: Then script generates a low-quality Wav2Lip video using the input video and audio. 178 | 3. **Video Quality Enhancement**: Create a high-quality video using the low-quality video by using the enhancer define by user. 179 | 4. **Mask Creation**: The script creates a mask around the mouth and tries to keep other facial motions like those of the cheeks and chin. 180 | 5. **Video Generation**: The script then takes the high-quality mouth image and overlays it onto the original image guided by the mouth mask. 181 | 6. **Video Post Processing**: The script then uses the ffmpeg tool to generate the final video. 182 | 183 | ## 💪 Quality tips 184 | - Use a high quality video as input 185 | - Utilize a video with a consistent frame rate. Occasionally, videos may exhibit unusual playback frame rates (not the standard 24, 25, 30, 60), which can lead to issues with the face mask. 186 | - Use a high quality audio file as input, without background noise or music. Clean audio with a tool like [https://podcast.adobe.com/enhance](https://podcast.adobe.com/enhance). 
187 | - Dilate the mouth mask. This will help the model retain some facial motion and hide the original mouth. 188 | - Mask Blur maximum twice the value of Mouth Mask Dilate. If you want to increase the blur, increase the value of Mouth Mask Dilate otherwise the mouth will be blurred and the underlying mouth could be visible. 189 | - Upscaling can be good for improving result, particularly around the mouth area. However, it will extend the processing duration. Use this tutorial from Olivio Sarikas to upscale your video: [https://www.youtube.com/watch?v=3z4MKUqFEUk](https://www.youtube.com/watch?v=3z4MKUqFEUk). Ensure the denoising strength is set between 0.0 and 0.05, select the 'revAnimated' model, and use the batch mode. i'll create a tutorial for this soon. 190 | - Ensure there is a face on each frame of the video. If the face is not detected, process will stop. 191 | 192 | ## ⚠ Noted Constraints 193 | - for speed up process try to keep resolution under 1000x1000px, so use resize factor and upscaling after process. 194 | - If the initial phase is excessively lengthy, consider using the "resize factor" to decrease the video's dimensions. 195 | - While there's no strict size limit for videos, larger videos will require more processing time. It's advisable to employ the "resize factor" to minimize the video size and then upscale the video once processing is complete. 196 | 197 | ## 📖 Troubleshooting 198 | - Mac users: dlib will not install correctly. in requirements.txt, replace "dlib-bin" with "dlib" 199 | 200 | ## 📝 To do 201 | - [ ] Tutorials 202 | - [ ] Convert avi to mp4. Avi is not show in video input but process work fine 203 | - [ ] Add Possibility to use a video for audio input 204 | - [ ] Standalone version 205 | - [ ] ComfyUI intergration 206 | 207 | ## 😎 Contributing 208 | 209 | We welcome contributions to this project. When submitting pull requests, please provide a detailed description of the changes. see [CONTRIBUTING](CONTRIBUTING.md) for more information. 210 | 211 | ## 🙏 Appreciation 212 | - [Wav2Lip](https://github.com/Rudrabha/Wav2Lip) 213 | - [CodeFormer](https://github.com/sczhou/CodeFormer) 214 | - [bark](https://github.com/suno-ai/bark/) 215 | - [facefusion](https://github.com/facefusion/facefusion) 216 | 217 | ## ☕ Support Wav2lip Studio 218 | 219 | this project is open-source effort that is free to use and modify. I rely on the support of users to keep this project going and help improve it. If you'd like to support me, you can make a donation on my Patreon page. Any contribution, large or small, is greatly appreciated! 220 | 221 | Your support helps me cover the costs of development and maintenance, and allows me to allocate more time and resources to enhancing this project. Thank you for your support! 222 | 223 | [patreon page](https://www.patreon.com/Wav2LipStudio) 224 | 225 | ## 📝 Citation 226 | If you use this project in your own work, in articles, tutorials, or presentations, we encourage you to cite this project to acknowledge the efforts put into it. 227 | 228 | To cite this project, please use the following BibTeX format: 229 | 230 | ``` 231 | @misc{wav2lip_uhq, 232 | author = {numz}, 233 | title = {Wav2Lip UHQ}, 234 | year = {2023}, 235 | howpublished = {GitHub repository}, 236 | publisher = {numz}, 237 | url = {https://github.com/numz/sd-wav2lip-uhq} 238 | } 239 | ``` 240 | 241 | ## 📜 License 242 | * The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE). 
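## 🔊 Example: scripting the bark "[split]" step

For readers who want to reproduce the text-to-speech step outside the UI, below is a minimal sketch of the "[split]" behaviour described in the Usage section: the prompt is cut on "[split]", each chunk (which has to stay under bark's roughly 14-second limit) is synthesized on its own, and a short pause is inserted between chunks (a simplification — the extension's own Silence setting works at punctuation level). This is an illustration only, not the extension's `scripts/bark/tts.py`; the helper name, the default speaker id and the output path are assumptions.

```python
# Minimal sketch (not the extension's actual tts.py): long speech with bark by
# honouring the "[split]" separator described in the Usage section.
# Assumptions: bark is installed (see requirements.txt), the speaker id comes from
# scripts/bark/speakers.json, and 0.25 s of silence is placed between chunks.
import numpy as np
from scipy.io import wavfile
from bark import SAMPLE_RATE, generate_audio, preload_models


def text_to_speech(prompt: str, speaker: str = "v2/en_speaker_6",
                   silence_sec: float = 0.25, out_path: str = "speech.wav") -> str:
    preload_models()  # downloads/caches the bark models on first run
    silence = np.zeros(int(silence_sec * SAMPLE_RATE), dtype=np.float32)
    pieces = []
    # Each chunk must stay under bark's ~14 s limit; the user controls this with [split].
    for chunk in (c.strip() for c in prompt.split("[split]") if c.strip()):
        audio = generate_audio(chunk, history_prompt=speaker)
        pieces += [audio.astype(np.float32), silence]
    wavfile.write(out_path, SAMPLE_RATE, np.concatenate(pieces))
    return out_path


# Example: two chunks rendered as a single wav file.
# text_to_speech("This is the first part of my text [split] This is the second part of my text")
```

The resulting wav file can then be dropped straight into the "Speech" input instead of using the built-in bark tab.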
243 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # 🔉👄 Stable Diffusion WebUI Automatic1111 Wav2Lip Studio 扩展插件 2 | 3 | ##
English | 简体中文
4 | 5 | 6 | 7 | https://user-images.githubusercontent.com/800903/262435301-af205a91-30d7-43f2-afcc-05980d581fe0.mp4 8 | 9 | ## 💡 简介 10 | 本代码仓库是适用于Automatic1111的 Wav2Lip UHQ扩展插件。 11 | 12 | 本插件为一体化集成解决方案:只需要一段视频和一段口播音频文件(wav或者mp3),就可以生成一个嘴唇同步的视频。通过Stable Diffusion特别的后处理技术,本插件所生成视频的嘴唇同步效果相比于[Wav2Lip tool](https://github.com/Rudrabha/Wav2Lip)所生成的视频,有更好的质量。 13 | 14 | ![Illustration](https://user-images.githubusercontent.com/800903/267808204-ae971458-9e8d-403e-9e10-9b7b7590d999.png) 15 | 16 | ## 📖 快速索引 17 | * [🚀 更新](#-更新) 18 | * [🔗 必要环境](#-必要环境) 19 | * [💻 安装说明](#-安装说明) 20 | * [🐍 使用方法](#-使用方法) 21 | * [👄 关于bark的保真度说明](#-关于bark的保真度说明) 22 | * [📺 样例](#-样例) 23 | * [📖 后台原理](#-后台原理) 24 | * [💪 提高质量的小提示](#-提高质量的小提示) 25 | * [⚠️需要注意的约束](#-需要注意的约束) 26 | * [📝 即将上线](#-即将上线) 27 | * [😎 贡献](#-贡献) 28 | * [🙏 鸣谢](#-鸣谢) 29 | * [📝 引用](#-引用) 30 | * [📜 版权声明](#-版权声明) 31 | * [☕ 支持Wav2lip Studio](#-支持wav2lip-studio) 32 | 33 | ## 🚀 更新 34 | **2023.09.13** 35 | - 👪 增加了 face swap 换脸: 整合了roop (参加下方使用方法章节) **本功能为实验性功能**。 36 | 37 | **2023.08.22** 38 | - 👄 增加了 [bark](https://github.com/suno-ai/bark/) (参加下方使用方法章节), **本功能为实验性功能**。 39 | 40 | **2023.08.20** 41 | - 🚢 增加了新的面部修复模型选择:GFPGAN。 42 | - ▶ 增加了暂停/恢复功能。 43 | - 📏 优化了释放内存方式,视频生成后会释放内存。 44 | 45 | **2023.08.17** 46 | - 🐛 修复嘴唇发紫的bug 47 | 48 | **2023.08.16** 49 | - ⚡ 除了generated版本的视频,额外输出Wav2lip和enhanced版本的视频,你可以从中选择效果更好的一个版本。 50 | - 🚢 用户界面更新:增加了对CodeFormer Fidelity的控制说明。 51 | - 👄 删除了图片输入方式,因为 [SadTalker](https://github.com/OpenTalker/SadTalker) 的方法更好。 52 | - 🐛 修复了输入和输出视频之间的差异导致遮罩蒙板位置不正确的bug。 53 | - 💪 改进了处理流程,提高了效率。 54 | - 🚫 如果程序在处理过程中,中段还会继续产出视频。 55 | 56 | **2023.08.13** 57 | - ⚡ 加快计算速度。 58 | - 🚢 用户界面更新:添加了一些隐藏参数的设置。 59 | - 👄 提供了“仅追踪嘴巴”的选项。 60 | - 📰 控制debug。 61 | - 🐛 修复了resize factor的bug。 62 | 63 | 64 | ## 🔗 必要环境 65 | 66 | - 最新版本的Stable Diffusion WebUI Automatic1111 [Stable Diffusion Webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) 。 67 | - FFmpeg : 预先安装好FFmpeg,下载地址:[FFmpeg官网](https://ffmpeg.org/download.html)。根据你的操作系统,按照官网说明安装好,注意,FFmpeg要加入环境变量设置,便于在任意目录调用。 68 | 69 | ## 💻 安装说明 70 | 71 | 1. 启动Automatic1111 72 | 2. Face Swap的环境依赖需要编译安装,所以在Windows中,需要安装 [Visual Studio](https://visualstudio.microsoft.com/fr/downloads/). 确保安装时,勾选Python开发环境和C++桌面开发环境包. 73 | 3. 在扩展菜单里, 找到“从网址安装”标签,输入下方URL地址,点击“安装”: 74 | 75 | ![Illustration](https://user-images.githubusercontent.com/800903/258115646-22b4b363-c363-4fc8-b316-c162b61b5d15.png) 76 | 77 | 4. 来到“已安装”标签,点击“应用并重启用户界面”. 78 | 79 | ![Illustration](https://user-images.githubusercontent.com/800903/258115651-196a07bd-ee4b-4aaf-b11e-8e2d1ffaa42f.png) 80 | 81 | 5. 如果您仍然看不到"Wav2Lip UHQ"的菜单,尝试重启Automatic1111. 82 | 83 | 6. 
🔥 十分重要: 必须要下载模型。从下方表格下载全部所需的模型。(要注意模型文件名,确保文件名正确无误,尤其是s3fd模型)。 84 | 85 | | 模型 | 描述 | 地址 | 安装目录 | 86 | |:-------------------:|:----------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------:| 87 | | Wav2Lip | 高精度的唇同步 | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?e=TBFBVW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 88 | | Wav2Lip + GAN | 嘴唇同步稍差,但视觉质量更好 | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA?e=n9ljGW) | extensions\sd-wav2lip-uhq\scripts\wav2lip\checkpoints\ | 89 | | s3fd | 人脸检测预训练模型 | [Link](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) | extensions\sd-wav2lip-uhq\scripts\wav2lip\face_detection\detection\sfd\s3fd.pth | 90 | | landmark predicator | Dlib 68点人脸特征推测 (点击下载按钮) | [Link](https://github.com/numz/wav2lip_uhq/blob/main/predicator/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 91 | | landmark predicator | Dlib 68点人脸特征推测 (备用地址1) | [Link](https://huggingface.co/spaces/asdasdasdasd/Face-forgery-detection/resolve/ccfc24642e0210d4d885bc7b3dbc9a68ed948ad6/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 92 | | landmark predicator | Dlib 68点人脸特征推测 (备用地址2,点击下载按钮) | [Link](https://github.com/italojs/facial-landmarks-recognition/blob/master/shape_predictor_68_face_landmarks.dat) | extensions\sd-wav2lip-uhq\scripts\wav2lip\predicator\shape_predictor_68_face_landmarks.dat | 93 | | face swap model | face swap换脸所用模型 |[Link](https://huggingface.co/ezioruan/inswapper_128.onnx/resolve/main/inswapper_128.onnx) | extensions\sd-wav2lip-uhq\scripts\faceswap\model\inswapper_128.onnx | 94 | 95 | 96 | ## 🐍 使用方法 97 | 1. 上传一个包含人脸的视频文件(avi格式或者mp4格式均可)。如果视频里没有人脸,哪怕只有一帧不包含人脸,会导致处理失败。请注意,如果你上传的是avi文件,在界面上你看不见它,但不用担心,插件会正常处理视频。 98 | 2. 换脸 (耗时很长,需耐心等待): 99 | 1. **Face Swap**: 选择一张照片用来替换视频里的脸。 100 | 2. **Face Index**: 如果照片里有多张脸,可以指定其中一个,0 指的是从左到右数的第一个脸。 101 | 3. 上传一个口播音频文件,现在音频的输入有两种方式: 102 | 1. 跟原来一样,在音频输入区域上传口播音频文件。 103 | 2. 用 [bark](https://github.com/suno-ai/bark/) 插件将文字转成口播语音. 104 | 1. 选择语言 : 土耳其语, 英语, 汉语, 印地语, 意大利语, 日语, 韩语, 葡萄牙语, 俄语, 西班牙语, 波兰语, 德语, 法语 105 | 2. 选择性别 106 | 3. 选择朗读者, 你可以在 "Audio Example(声音样例)" 里试听 107 | 4. 如果你的显卡内存低于16GB,勾选低显存为 True (默认选中) 108 | 5. 将你需要朗读的文本填入 "Prompt" 区域 109 | - **注意** bark生成的一句话只能在14秒以内,如果你的一句话比较长,需要用"[split]"进行分割。 110 | - 例如,假如你一句话大约有30秒,你可以将你的文本写成这样: 111 | - "这是前半段文字 **[split]** 这是后半段文字" 112 | 6. Temperature: 靠近0.0接近原声, 靠近1.0让AI发挥创意, 但现实情况是but in reality, 0.0会感觉有点奇怪,1.0更原声相差甚远。bark设置了0.7为默认值,你可以自行微调以达到效果更佳。 113 | 7. Silence(停顿) : 在遇到标点符号(。!!.??,)时的停顿时间. 默认值是0.25秒. 114 | 8. 关于更多Bard的有关细节,可查看 Bark [文档](https://github.com/suno-ai/bark/) . 115 | 9. 下列为已知一些支持的非说话的声音(但有时候没反应). 116 | - [laughter] 大笑 117 | - [laughs] 微笑 118 | - [sighs] 叹气 119 | - [music] 音乐 120 | - [gasps] 喘气 121 | - [clears throat] 清嗓 122 | - "-" or ... 犹豫停顿 123 | - ♪ 歌词 124 | - 大写时用于强调 125 | - 可以在提示词里单独写[MAN] 或者 [WOMAN]可无视朗读者的选择,将文本用指性别的朗读者 126 | 4. 选择模型 (详见上方表格). 127 | 5. **Padding(填充位移)**: Wav2Lip用它来移动嘴巴位置。如果嘴巴位置不理想,可以用它来微调,通常情况下,不必刻意调整。 128 | 6. 
**No Smooth(不要平滑)**: 当勾选该选项,将会保持原始嘴部形状不做平滑处理。 129 | 7. **Resize Factor(调整大小)**: 该选项会对视频的分辨率进行调整。默认值是1,如果你需要降低你的视频分辨率,你可以调整它。 130 | 8. **Only Mouth(仅追踪嘴巴)**: 选中该选项,将仅对嘴部进行追踪,这将会移除例如脸颊和下巴的动作。 131 | 9. **Mouth Mask Dilate(嘴部遮罩蒙板扩张)**: 该选项用于调整嘴巴覆盖区域,参数越大,覆盖面积越大,根据嘴巴的大小来作出调整。 132 | 10. **Face Mask Erode(面部遮罩蒙板侵蚀)**: 对脸部外延区域进行渗透侵蚀处理,根据脸型大小作出调整。 133 | 11. **Mask Blur(遮罩模糊)**: 通过对遮罩层进行模糊处理,使其变得更平滑,建议尽量使该参数小于等于 **Mouth Mask Dilate(嘴部遮罩蒙板扩张)** 参数. 134 | 12. **Code Former Fidelity(Code Former保真度)**: 135 | 1. 当该参数偏向0时,虽然有更高的画质,但可能会引起人物外观特征改变,以及画面闪烁。 136 | 2. 当该参数偏向1时,虽然降低了画质,但是能更大程度的保留原来人物的外观特征,以及降低画面闪烁。 137 | 3. 不建议该参数低于0.5。为了达到良好的效果,建议在0.75左右进行调整。 138 | 13. **Active debug(启用debug模式)**: 开启该选项,将会在debug目录里逐步执行来生成图片。 139 | 14. 点击“Generate”(生成)按钮。 140 | 141 | ## 演示教程 142 | 143 | | 链接 | 语言 | 144 | |:----:|:----:| 145 | |[哔哩哔哩](https://www.bilibili.com/video/BV1J94y1r7Xc/)|中文| 146 | |[抖音](https://v.douyin.com/iJtQVU51/)|中文| 147 | |[Youtube](https://youtu.be/9M-IzuxlFRU)|中文| 148 | 149 | 150 | 151 | ## 👄 关于bark的保真度说明 152 | Bark十分有趣,但它有时候输出的声音十分奇怪(甚至有点搞笑)。每次生成的结果都有些许不同,你可能需要多生成几次以达到你想要的结果。 153 | 除了英语,其他语言的采样似乎并非来自本土母语,听起来有点像外国人在说话。有时候选男,但听起来有点像女,反之亦然。甚至有时候当你选了某一个声音,但听起来像另外一个声音或者另一种语音。 154 | 155 | ## 📺 样例 156 | 157 | https://user-images.githubusercontent.com/800903/262439441-bb9d888a-d33e-4246-9f0a-1ddeac062d35.mp4 158 | 159 | https://user-images.githubusercontent.com/800903/262442794-61b1e32f-3f87-4b36-98d6-f711822bdb1e.mp4 160 | 161 | https://user-images.githubusercontent.com/800903/262449305-901086a3-22cb-42d2-b5be-a5f38db4549a.mp4 162 | 163 | https://user-images.githubusercontent.com/800903/267808494-300f8cc3-9136-4810-86e2-92f2114a5f9a.mp4 164 | 165 | ## 📖 后台原理 166 | 167 | 本扩展分几个流程运行,以此达到提高Wav2Lip生成的视频的质量的效果: 168 | 169 | 1. **Generate face swap video**: 如果face swap提供了需要替换脸部的图片,那会先将原始视频进行脸部替换,该操作耗时很长,需耐心等待。 170 | 2. **Generate a Wav2lip video(生成Wav2lip视频)**: 该脚本先使用输入的视频和音频生成低质量的Wav2Lip视频。 171 | 3. **Video Quality Enhancement(视频质量增强)**: 根据用户选择的面部修复模型来将低清视频转化成高清的视频。 172 | 4. **Mask Creation(创建遮罩蒙板)**: 该脚本在嘴巴周围制作了一个遮罩蒙板,并试图保持其他面部动作,比如脸颊和下巴的动作。 173 | 5. **Video Generation(生成视频)***: 该脚本会获取高质量的嘴巴图像,并将其覆盖在由嘴部遮罩引导的原始图像上。 174 | 6. **Video Post Processing(后期合成)**: 该脚本调用ffmpeg生成最终版本的视频。 175 | 176 | ## 💪 提高质量的小提示 177 | - 使用高质量的视频作为输入源 178 | - 使用常见FPS(譬如24fps、25fps、30fps、60fps)的视频,如果不是常见的FPS,偶尔会出现一些问题,譬如面部遮罩蒙板处理。 179 | - 使用高质量的音频源文件,不要有音乐,不要有背景白噪声。使用类似 [https://podcast.adobe.com/enhance](https://podcast.adobe.com/enhance) 的工具清除背景音乐。 180 | - 扩大嘴部遮罩蒙板范围。这将有助于模型保留一些面部动作,并盖住原来的嘴巴。 181 | - “遮罩模糊”(Mask Blur)的最大值是“嘴部遮罩蒙板扩张”(Mouth Mask Dilate)值的两倍。如果要增加模糊度,请增加“嘴部遮罩蒙板扩张”的值,否则嘴巴将变得模糊,并且可以看到下面的嘴巴。 182 | - 高清放大有利于提高质量,尤其是在嘴巴周围。但是,它将使处理时间变长。你可以参考Olivio Sarikas的教程来高清放大处理你的视频: [https://www.youtube.com/watch?v=3z4MKUqFEUk](https://www.youtube.com/watch?v=3z4MKUqFEUk). 确保去噪强度设置在0.0和0.05之间,选择“revAnimated”模型,并使用批处理模式。回头我再补个简单的教程说明。 183 | - 确保视频的每一帧上都有一张脸。如果未检测到人脸,插件将停止运行。 184 | 185 | 186 | ## ⚠ 需要注意的约束 187 | - 目前的模型对胡须并不友好. 
188 | - 如果初始化阶段过长,请考虑使用调整“Resize Factor”来减小视频分辨率的大小。 189 | - 虽然对原始视频没有严格的大小限制,但较大的视频需要更多的处理时间。建议使用“调整大小因子”来最小化视频大小,然后在处理完成后升级视频。 190 | 191 | ## 📖 故障排除 192 | - Mac用户: dlib会报无法安装,在requirements.txt文件里,找到”dlib-bin“,并将其替换成"dlib" 193 | 194 | ## 📝 即将上线 195 | - [ ] 教程指引 196 | - [ ] 将avi转mp4(目前avi文件在输入框不显示,但依然能正常工作) 197 | - [ ] 可用视频文件作为音频的输入 198 | - [ ] 独立版本 199 | - [ ] ComfyUI 插件 200 | 201 | ## 😎 贡献 202 | 203 | 我们欢迎各位对本项目的贡献提交。提交合并提取请求(Pull requests)时,请提供更改的详细说明。 详参 [CONTRIBUTING](CONTRIBUTING.md) 。 204 | 205 | ## 🙏 鸣谢 206 | - [Wav2Lip](https://github.com/Rudrabha/Wav2Lip) 207 | - [CodeFormer](https://github.com/sczhou/CodeFormer) 208 | - [bark](https://github.com/suno-ai/bark/) 209 | - [roop](https://github.com/s0md3v/sd-webui-roop) 210 | 211 | ## ☕ 支持Wav2lip Studio 212 | 213 | 该项目是开源项目,可以免费使用和修改。项目的持续改进,依靠用户的支持。如果您喜欢本项目,愿意支持我,可以在我的Patreon页面上捐款。捐献无论大小,我们都将不胜感激! 214 | 215 | 您的捐献将帮助我分担了开发和维护的成本,并让我能够分配更多的时间和资源来增强这个项目。感谢您的支持! 216 | 217 | [patreon页面](https://www.patreon.com/Wav2LipStudio) 218 | 219 | ## 📝 引用 220 | 如果您在工作、发表文章、教程或演示中使用到了项目,我们非常鼓励您引用此项目。 221 | 222 | 如需引证本项目,请考虑使用如下BibTeX格式: 223 | 224 | ``` 225 | @misc{wav2lip_uhq, 226 | author = {numz}, 227 | title = {Wav2Lip UHQ}, 228 | year = {2023}, 229 | howpublished = {GitHub repository}, 230 | publisher = {numz}, 231 | url = {https://github.com/numz/sd-wav2lip-uhq} 232 | } 233 | ``` 234 | 235 | ## 📜 版权声明 236 | * 此代码仓库中的代码是根据MIT许可协议发布的。 [LICENSE file](LICENSE). 237 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | import launch 2 | import os 3 | import platform 4 | 5 | system = platform.system() 6 | 7 | req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt") 8 | 9 | with open(req_file) as file: 10 | for lib in file: 11 | lib = lib.strip() 12 | if lib == "dlib-bin" and system == "Darwin": 13 | lib = "dlib" # replace dlib-bin as dlib 14 | if lib == "onnxruntime-gpu==1.15.0" and system == "Darwin": 15 | continue # skip onnxruntime-gpu 16 | if not launch.is_installed(lib): 17 | launch.run_pip(f"install {lib}", f"wav2lip_uhq requirement: {lib}") 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | imutils 2 | dlib-bin 3 | numpy 4 | opencv-python 5 | scipy 6 | requests 7 | pillow 8 | librosa==0.10.0.post2 9 | opencv-contrib-python 10 | tqdm 11 | numba 12 | imutils 13 | imageio_ffmpeg 14 | git+https://github.com/suno-ai/bark.git 15 | insightface==0.7.3 16 | onnx==1.14.0 17 | onnxruntime==1.15.0 18 | onnxruntime-gpu==1.15.0 19 | opencv-python>=4.8.0 20 | ifnude -------------------------------------------------------------------------------- /scripts/bark/speakers.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "Speaker 0 (EN)", 4 | "id": "v2/en_speaker_0", 5 | "language": "English", 6 | "gender": "Male", 7 | "quality": null, 8 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_0.mp3", 9 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_0.mp3" 10 | }, 11 | { 12 | "name": "Speaker 1 (EN)", 13 | "id": "v2/en_speaker_1", 14 | "language": "English", 15 | "gender": "Male", 16 | "quality": null, 17 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_1.mp3", 18 | "continuation_audio": 
"https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_1.mp3" 19 | }, 20 | { 21 | "name": "Speaker 2 (EN)", 22 | "id": "v2/en_speaker_2", 23 | "language": "English", 24 | "gender": "Male", 25 | "quality": null, 26 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_2.mp3", 27 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_2.mp3" 28 | }, 29 | { 30 | "name": "Speaker 3 (EN)", 31 | "id": "v2/en_speaker_3", 32 | "language": "English", 33 | "gender": "Male", 34 | "quality": null, 35 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_3.mp3", 36 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_3.mp3" 37 | }, 38 | { 39 | "name": "Speaker 4 (EN)", 40 | "id": "v2/en_speaker_4", 41 | "language": "English", 42 | "gender": "Male", 43 | "quality": null, 44 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_4.mp3", 45 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_4.mp3" 46 | }, 47 | { 48 | "name": "Speaker 5 (EN)", 49 | "id": "v2/en_speaker_5", 50 | "language": "English", 51 | "gender": "Male", 52 | "quality": "Grainy", 53 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_5.mp3", 54 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_5.mp3" 55 | }, 56 | { 57 | "name": "Speaker 6 (EN)", 58 | "id": "v2/en_speaker_6", 59 | "language": "English", 60 | "gender": "Male", 61 | "quality": "Suno Favorite", 62 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_6.mp3", 63 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_6.mp3" 64 | }, 65 | { 66 | "name": "Speaker 7 (EN)", 67 | "id": "v2/en_speaker_7", 68 | "language": "English", 69 | "gender": "Male", 70 | "quality": null, 71 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_7.mp3", 72 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_7.mp3" 73 | }, 74 | { 75 | "name": "Speaker 8 (EN)", 76 | "id": "v2/en_speaker_8", 77 | "language": "English", 78 | "gender": "Male", 79 | "quality": null, 80 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_8.mp3", 81 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_8.mp3" 82 | }, 83 | { 84 | "name": "Speaker 9 (EN)", 85 | "id": "v2/en_speaker_9", 86 | "language": "English", 87 | "gender": "Female", 88 | "quality": null, 89 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_9.mp3", 90 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/en_speaker_9.mp3" 91 | }, 92 | { 93 | "name": "Speaker 0 (ZH)", 94 | "id": "v2/zh_speaker_0", 95 | "language": "Chinese", 96 | "gender": "Male", 97 | "quality": null, 98 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_0.mp3", 99 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_0.mp3" 100 | }, 101 | { 102 | "name": "Speaker 1 (ZH)", 103 | "id": "v2/zh_speaker_1", 104 | "language": "Chinese", 105 | "gender": "Male", 106 | "quality": null, 107 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_1.mp3", 108 | "continuation_audio": 
"https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_1.mp3" 109 | }, 110 | { 111 | "name": "Speaker 2 (ZH)", 112 | "id": "v2/zh_speaker_2", 113 | "language": "Chinese", 114 | "gender": "Male", 115 | "quality": null, 116 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_2.mp3", 117 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_2.mp3" 118 | }, 119 | { 120 | "name": "Speaker 3 (ZH)", 121 | "id": "v2/zh_speaker_3", 122 | "language": "Chinese", 123 | "gender": "Male", 124 | "quality": null, 125 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_3.mp3", 126 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_3.mp3" 127 | }, 128 | { 129 | "name": "Speaker 4 (ZH)", 130 | "id": "v2/zh_speaker_4", 131 | "language": "Chinese", 132 | "gender": "Female", 133 | "quality": null, 134 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_4.mp3", 135 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_4.mp3" 136 | }, 137 | { 138 | "name": "Speaker 5 (ZH)", 139 | "id": "v2/zh_speaker_5", 140 | "language": "Chinese", 141 | "gender": "Male", 142 | "quality": null, 143 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_5.mp3", 144 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_5.mp3" 145 | }, 146 | { 147 | "name": "Speaker 6 (ZH)", 148 | "id": "v2/zh_speaker_6", 149 | "language": "Chinese", 150 | "gender": "Female", 151 | "quality": "Background Noise", 152 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_6.mp3", 153 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_6.mp3" 154 | }, 155 | { 156 | "name": "Speaker 7 (ZH)", 157 | "id": "v2/zh_speaker_7", 158 | "language": "Chinese", 159 | "gender": "Female", 160 | "quality": null, 161 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_7.mp3", 162 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_7.mp3" 163 | }, 164 | { 165 | "name": "Speaker 8 (ZH)", 166 | "id": "v2/zh_speaker_8", 167 | "language": "Chinese", 168 | "gender": "Male", 169 | "quality": null, 170 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_8.mp3", 171 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_8.mp3" 172 | }, 173 | { 174 | "name": "Speaker 9 (ZH)", 175 | "id": "v2/zh_speaker_9", 176 | "language": "Chinese", 177 | "gender": "Female", 178 | "quality": null, 179 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/zh_speaker_9.mp3", 180 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/zh_speaker_9.mp3" 181 | }, 182 | { 183 | "name": "Speaker 0 (FR)", 184 | "id": "v2/fr_speaker_0", 185 | "language": "French", 186 | "gender": "Male", 187 | "quality": null, 188 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_0.mp3", 189 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_0.mp3" 190 | }, 191 | { 192 | "name": "Speaker 1 (FR)", 193 | "id": "v2/fr_speaker_1", 194 | "language": "French", 195 | "gender": "Female", 196 | "quality": null, 197 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_1.mp3", 198 | 
"continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_1.mp3" 199 | }, 200 | { 201 | "name": "Speaker 2 (FR)", 202 | "id": "v2/fr_speaker_2", 203 | "language": "French", 204 | "gender": "Female", 205 | "quality": null, 206 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_2.mp3", 207 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_2.mp3" 208 | }, 209 | { 210 | "name": "Speaker 3 (FR)", 211 | "id": "v2/fr_speaker_3", 212 | "language": "French", 213 | "gender": "Male", 214 | "quality": null, 215 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_3.mp3", 216 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_3.mp3" 217 | }, 218 | { 219 | "name": "Speaker 4 (FR)", 220 | "id": "v2/fr_speaker_4", 221 | "language": "French", 222 | "gender": "Male", 223 | "quality": null, 224 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_4.mp3", 225 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_4.mp3" 226 | }, 227 | { 228 | "name": "Speaker 5 (FR)", 229 | "id": "v2/fr_speaker_5", 230 | "language": "French", 231 | "gender": "Female", 232 | "quality": null, 233 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_5.mp3", 234 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_5.mp3" 235 | }, 236 | { 237 | "name": "Speaker 6 (FR)", 238 | "id": "v2/fr_speaker_6", 239 | "language": "French", 240 | "gender": "Male", 241 | "quality": null, 242 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_6.mp3", 243 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_6.mp3" 244 | }, 245 | { 246 | "name": "Speaker 7 (FR)", 247 | "id": "v2/fr_speaker_7", 248 | "language": "French", 249 | "gender": "Male", 250 | "quality": null, 251 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_7.mp3", 252 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_7.mp3" 253 | }, 254 | { 255 | "name": "Speaker 8 (FR)", 256 | "id": "v2/fr_speaker_8", 257 | "language": "French", 258 | "gender": "Male", 259 | "quality": null, 260 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_8.mp3", 261 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_8.mp3" 262 | }, 263 | { 264 | "name": "Speaker 9 (FR)", 265 | "id": "v2/fr_speaker_9", 266 | "language": "French", 267 | "gender": "Male", 268 | "quality": "Auditorium", 269 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/fr_speaker_9.mp3", 270 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/fr_speaker_9.mp3" 271 | }, 272 | { 273 | "name": "Speaker 0 (DE)", 274 | "id": "v2/de_speaker_0", 275 | "language": "German", 276 | "gender": "Male", 277 | "quality": null, 278 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_0.mp3", 279 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_0.mp3" 280 | }, 281 | { 282 | "name": "Speaker 1 (DE)", 283 | "id": "v2/de_speaker_1", 284 | "language": "German", 285 | "gender": "Male", 286 | "quality": null, 287 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_1.mp3", 288 | 
"continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_1.mp3" 289 | }, 290 | { 291 | "name": "Speaker 2 (DE)", 292 | "id": "v2/de_speaker_2", 293 | "language": "German", 294 | "gender": "Male", 295 | "quality": null, 296 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_2.mp3", 297 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_2.mp3" 298 | }, 299 | { 300 | "name": "Speaker 3 (DE)", 301 | "id": "v2/de_speaker_3", 302 | "language": "German", 303 | "gender": "Female", 304 | "quality": null, 305 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_3.mp3", 306 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_3.mp3" 307 | }, 308 | { 309 | "name": "Speaker 4 (DE)", 310 | "id": "v2/de_speaker_4", 311 | "language": "German", 312 | "gender": "Male", 313 | "quality": null, 314 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_4.mp3", 315 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_4.mp3" 316 | }, 317 | { 318 | "name": "Speaker 5 (DE)", 319 | "id": "v2/de_speaker_5", 320 | "language": "German", 321 | "gender": "Male", 322 | "quality": null, 323 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_5.mp3", 324 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_5.mp3" 325 | }, 326 | { 327 | "name": "Speaker 6 (DE)", 328 | "id": "v2/de_speaker_6", 329 | "language": "German", 330 | "gender": "Male", 331 | "quality": null, 332 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_6.mp3", 333 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_6.mp3" 334 | }, 335 | { 336 | "name": "Speaker 7 (DE)", 337 | "id": "v2/de_speaker_7", 338 | "language": "German", 339 | "gender": "Male", 340 | "quality": null, 341 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_7.mp3", 342 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_7.mp3" 343 | }, 344 | { 345 | "name": "Speaker 8 (DE)", 346 | "id": "v2/de_speaker_8", 347 | "language": "German", 348 | "gender": "Female", 349 | "quality": null, 350 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_8.mp3", 351 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_8.mp3" 352 | }, 353 | { 354 | "name": "Speaker 9 (DE)", 355 | "id": "v2/de_speaker_9", 356 | "language": "German", 357 | "gender": "Male", 358 | "quality": null, 359 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/de_speaker_9.mp3", 360 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/de_speaker_9.mp3" 361 | }, 362 | { 363 | "name": "Speaker 0 (HI)", 364 | "id": "v2/hi_speaker_0", 365 | "language": "Hindi", 366 | "gender": "Female", 367 | "quality": null, 368 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_0.mp3", 369 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_0.mp3" 370 | }, 371 | { 372 | "name": "Speaker 1 (HI)", 373 | "id": "v2/hi_speaker_1", 374 | "language": "Hindi", 375 | "gender": "Female", 376 | "quality": "Background Noise", 377 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_1.mp3", 
378 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_1.mp3" 379 | }, 380 | { 381 | "name": "Speaker 2 (HI)", 382 | "id": "v2/hi_speaker_2", 383 | "language": "Hindi", 384 | "gender": "Male", 385 | "quality": null, 386 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_2.mp3", 387 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_2.mp3" 388 | }, 389 | { 390 | "name": "Speaker 3 (HI)", 391 | "id": "v2/hi_speaker_3", 392 | "language": "Hindi", 393 | "gender": "Female", 394 | "quality": null, 395 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_3.mp3", 396 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_3.mp3" 397 | }, 398 | { 399 | "name": "Speaker 4 (HI)", 400 | "id": "v2/hi_speaker_4", 401 | "language": "Hindi", 402 | "gender": "Female", 403 | "quality": "Background Noise", 404 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_4.mp3", 405 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_4.mp3" 406 | }, 407 | { 408 | "name": "Speaker 5 (HI)", 409 | "id": "v2/hi_speaker_5", 410 | "language": "Hindi", 411 | "gender": "Male", 412 | "quality": null, 413 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_5.mp3", 414 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_5.mp3" 415 | }, 416 | { 417 | "name": "Speaker 6 (HI)", 418 | "id": "v2/hi_speaker_6", 419 | "language": "Hindi", 420 | "gender": "Male", 421 | "quality": null, 422 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_6.mp3", 423 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_6.mp3" 424 | }, 425 | { 426 | "name": "Speaker 7 (HI)", 427 | "id": "v2/hi_speaker_7", 428 | "language": "Hindi", 429 | "gender": "Male", 430 | "quality": null, 431 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_7.mp3", 432 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_7.mp3" 433 | }, 434 | { 435 | "name": "Speaker 8 (HI)", 436 | "id": "v2/hi_speaker_8", 437 | "language": "Hindi", 438 | "gender": "Male", 439 | "quality": null, 440 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_8.mp3", 441 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_8.mp3" 442 | }, 443 | { 444 | "name": "Speaker 9 (HI)", 445 | "id": "v2/hi_speaker_9", 446 | "language": "Hindi", 447 | "gender": "Female", 448 | "quality": null, 449 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/hi_speaker_9.mp3", 450 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/hi_speaker_9.mp3" 451 | }, 452 | { 453 | "name": "Speaker 0 (IT)", 454 | "id": "v2/it_speaker_0", 455 | "language": "Italian", 456 | "gender": "Male", 457 | "quality": null, 458 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_0.mp3", 459 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_0.mp3" 460 | }, 461 | { 462 | "name": "Speaker 1 (IT)", 463 | "id": "v2/it_speaker_1", 464 | "language": "Italian", 465 | "gender": "Male", 466 | "quality": null, 467 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_1.mp3", 
468 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_1.mp3" 469 | }, 470 | { 471 | "name": "Speaker 2 (IT)", 472 | "id": "v2/it_speaker_2", 473 | "language": "Italian", 474 | "gender": "Female", 475 | "quality": null, 476 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_2.mp3", 477 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_2.mp3" 478 | }, 479 | { 480 | "name": "Speaker 3 (IT)", 481 | "id": "v2/it_speaker_3", 482 | "language": "Italian", 483 | "gender": "Male", 484 | "quality": null, 485 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_3.mp3", 486 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_3.mp3" 487 | }, 488 | { 489 | "name": "Speaker 4 (IT)", 490 | "id": "v2/it_speaker_4", 491 | "language": "Italian", 492 | "gender": "Male", 493 | "quality": "Suno Favorite", 494 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_4.mp3", 495 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_4.mp3" 496 | }, 497 | { 498 | "name": "Speaker 5 (IT)", 499 | "id": "v2/it_speaker_5", 500 | "language": "Italian", 501 | "gender": "Male", 502 | "quality": null, 503 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_5.mp3", 504 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_5.mp3" 505 | }, 506 | { 507 | "name": "Speaker 6 (IT)", 508 | "id": "v2/it_speaker_6", 509 | "language": "Italian", 510 | "gender": "Male", 511 | "quality": null, 512 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_6.mp3", 513 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_6.mp3" 514 | }, 515 | { 516 | "name": "Speaker 7 (IT)", 517 | "id": "v2/it_speaker_7", 518 | "language": "Italian", 519 | "gender": "Female", 520 | "quality": null, 521 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_7.mp3", 522 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_7.mp3" 523 | }, 524 | { 525 | "name": "Speaker 8 (IT)", 526 | "id": "v2/it_speaker_8", 527 | "language": "Italian", 528 | "gender": "Male", 529 | "quality": null, 530 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_8.mp3", 531 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_8.mp3" 532 | }, 533 | { 534 | "name": "Speaker 9 (IT)", 535 | "id": "v2/it_speaker_9", 536 | "language": "Italian", 537 | "gender": "Female", 538 | "quality": null, 539 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/it_speaker_9.mp3", 540 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/it_speaker_9.mp3" 541 | }, 542 | { 543 | "name": "Speaker 0 (JA)", 544 | "id": "v2/ja_speaker_0", 545 | "language": "Japanese", 546 | "gender": "Female", 547 | "quality": null, 548 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_0.mp3", 549 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_0.mp3" 550 | }, 551 | { 552 | "name": "Speaker 1 (JA)", 553 | "id": "v2/ja_speaker_1", 554 | "language": "Japanese", 555 | "gender": "Female", 556 | "quality": "Background Noise", 557 | "prompt_audio": 
"https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_1.mp3", 558 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_1.mp3" 559 | }, 560 | { 561 | "name": "Speaker 2 (JA)", 562 | "id": "v2/ja_speaker_2", 563 | "language": "Japanese", 564 | "gender": "Male", 565 | "quality": null, 566 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_2.mp3", 567 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_2.mp3" 568 | }, 569 | { 570 | "name": "Speaker 3 (JA)", 571 | "id": "v2/ja_speaker_3", 572 | "language": "Japanese", 573 | "gender": "Female", 574 | "quality": null, 575 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_3.mp3", 576 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_3.mp3" 577 | }, 578 | { 579 | "name": "Speaker 4 (JA)", 580 | "id": "v2/ja_speaker_4", 581 | "language": "Japanese", 582 | "gender": "Female", 583 | "quality": null, 584 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_4.mp3", 585 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_4.mp3" 586 | }, 587 | { 588 | "name": "Speaker 5 (JA)", 589 | "id": "v2/ja_speaker_5", 590 | "language": "Japanese", 591 | "gender": "Female", 592 | "quality": null, 593 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_5.mp3", 594 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_5.mp3" 595 | }, 596 | { 597 | "name": "Speaker 6 (JA)", 598 | "id": "v2/ja_speaker_6", 599 | "language": "Japanese", 600 | "gender": "Male", 601 | "quality": null, 602 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_6.mp3", 603 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_6.mp3" 604 | }, 605 | { 606 | "name": "Speaker 7 (JA)", 607 | "id": "v2/ja_speaker_7", 608 | "language": "Japanese", 609 | "gender": "Female", 610 | "quality": null, 611 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_7.mp3", 612 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_7.mp3" 613 | }, 614 | { 615 | "name": "Speaker 8 (JA)", 616 | "id": "v2/ja_speaker_8", 617 | "language": "Japanese", 618 | "gender": "Female", 619 | "quality": null, 620 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_8.mp3", 621 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_8.mp3" 622 | }, 623 | { 624 | "name": "Speaker 9 (JA)", 625 | "id": "v2/ja_speaker_9", 626 | "language": "Japanese", 627 | "gender": "Female", 628 | "quality": null, 629 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ja_speaker_9.mp3", 630 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ja_speaker_9.mp3" 631 | }, 632 | { 633 | "name": "Speaker 0 (KO)", 634 | "id": "v2/ko_speaker_0", 635 | "language": "Korean", 636 | "gender": "Female", 637 | "quality": null, 638 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_0.mp3", 639 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_0.mp3" 640 | }, 641 | { 642 | "name": "Speaker 1 (KO)", 643 | "id": "v2/ko_speaker_1", 644 | "language": "Korean", 645 | "gender": "Male", 646 | "quality": null, 647 | 
"prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_1.mp3", 648 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_1.mp3" 649 | }, 650 | { 651 | "name": "Speaker 2 (KO)", 652 | "id": "v2/ko_speaker_2", 653 | "language": "Korean", 654 | "gender": "Male", 655 | "quality": null, 656 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_2.mp3", 657 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_2.mp3" 658 | }, 659 | { 660 | "name": "Speaker 3 (KO)", 661 | "id": "v2/ko_speaker_3", 662 | "language": "Korean", 663 | "gender": "Male", 664 | "quality": null, 665 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_3.mp3", 666 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_3.mp3" 667 | }, 668 | { 669 | "name": "Speaker 4 (KO)", 670 | "id": "v2/ko_speaker_4", 671 | "language": "Korean", 672 | "gender": "Male", 673 | "quality": null, 674 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_4.mp3", 675 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_4.mp3" 676 | }, 677 | { 678 | "name": "Speaker 5 (KO)", 679 | "id": "v2/ko_speaker_5", 680 | "language": "Korean", 681 | "gender": "Male", 682 | "quality": null, 683 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_5.mp3", 684 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_5.mp3" 685 | }, 686 | { 687 | "name": "Speaker 6 (KO)", 688 | "id": "v2/ko_speaker_6", 689 | "language": "Korean", 690 | "gender": "Male", 691 | "quality": null, 692 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_6.mp3", 693 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_6.mp3" 694 | }, 695 | { 696 | "name": "Speaker 7 (KO)", 697 | "id": "v2/ko_speaker_7", 698 | "language": "Korean", 699 | "gender": "Male", 700 | "quality": null, 701 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_7.mp3", 702 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_7.mp3" 703 | }, 704 | { 705 | "name": "Speaker 8 (KO)", 706 | "id": "v2/ko_speaker_8", 707 | "language": "Korean", 708 | "gender": "Male", 709 | "quality": null, 710 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_8.mp3", 711 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_8.mp3" 712 | }, 713 | { 714 | "name": "Speaker 9 (KO)", 715 | "id": "v2/ko_speaker_9", 716 | "language": "Korean", 717 | "gender": "Male", 718 | "quality": null, 719 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ko_speaker_9.mp3", 720 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ko_speaker_9.mp3" 721 | }, 722 | { 723 | "name": "Speaker 0 (PL)", 724 | "id": "v2/pl_speaker_0", 725 | "language": "Polish", 726 | "gender": "Male", 727 | "quality": null, 728 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_0.mp3", 729 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_0.mp3" 730 | }, 731 | { 732 | "name": "Speaker 1 (PL)", 733 | "id": "v2/pl_speaker_1", 734 | "language": "Polish", 735 | "gender": "Male", 736 | "quality": null, 737 | 
"prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_1.mp3", 738 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_1.mp3" 739 | }, 740 | { 741 | "name": "Speaker 2 (PL)", 742 | "id": "v2/pl_speaker_2", 743 | "language": "Polish", 744 | "gender": "Male", 745 | "quality": null, 746 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_2.mp3", 747 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_2.mp3" 748 | }, 749 | { 750 | "name": "Speaker 3 (PL)", 751 | "id": "v2/pl_speaker_3", 752 | "language": "Polish", 753 | "gender": "Male", 754 | "quality": null, 755 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_3.mp3", 756 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_3.mp3" 757 | }, 758 | { 759 | "name": "Speaker 4 (PL)", 760 | "id": "v2/pl_speaker_4", 761 | "language": "Polish", 762 | "gender": "Female", 763 | "quality": null, 764 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_4.mp3", 765 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_4.mp3" 766 | }, 767 | { 768 | "name": "Speaker 5 (PL)", 769 | "id": "v2/pl_speaker_5", 770 | "language": "Polish", 771 | "gender": "Male", 772 | "quality": null, 773 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_5.mp3", 774 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_5.mp3" 775 | }, 776 | { 777 | "name": "Speaker 6 (PL)", 778 | "id": "v2/pl_speaker_6", 779 | "language": "Polish", 780 | "gender": "Female", 781 | "quality": null, 782 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_6.mp3", 783 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_6.mp3" 784 | }, 785 | { 786 | "name": "Speaker 7 (PL)", 787 | "id": "v2/pl_speaker_7", 788 | "language": "Polish", 789 | "gender": "Male", 790 | "quality": null, 791 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_7.mp3", 792 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_7.mp3" 793 | }, 794 | { 795 | "name": "Speaker 8 (PL)", 796 | "id": "v2/pl_speaker_8", 797 | "language": "Polish", 798 | "gender": "Male", 799 | "quality": null, 800 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_8.mp3", 801 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_8.mp3" 802 | }, 803 | { 804 | "name": "Speaker 9 (PL)", 805 | "id": "v2/pl_speaker_9", 806 | "language": "Polish", 807 | "gender": "Female", 808 | "quality": null, 809 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pl_speaker_9.mp3", 810 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pl_speaker_9.mp3" 811 | }, 812 | { 813 | "name": "Speaker 0 (PT)", 814 | "id": "v2/pt_speaker_0", 815 | "language": "Portuguese", 816 | "gender": "Male", 817 | "quality": null, 818 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_0.mp3", 819 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_0.mp3" 820 | }, 821 | { 822 | "name": "Speaker 1 (PT)", 823 | "id": "v2/pt_speaker_1", 824 | "language": "Portuguese", 825 | "gender": "Male", 826 | "quality": null, 827 | 
"prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_1.mp3", 828 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_1.mp3" 829 | }, 830 | { 831 | "name": "Speaker 2 (PT)", 832 | "id": "v2/pt_speaker_2", 833 | "language": "Portuguese", 834 | "gender": "Male", 835 | "quality": null, 836 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_2.mp3", 837 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_2.mp3" 838 | }, 839 | { 840 | "name": "Speaker 3 (PT)", 841 | "id": "v2/pt_speaker_3", 842 | "language": "Portuguese", 843 | "gender": "Male", 844 | "quality": null, 845 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_3.mp3", 846 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_3.mp3" 847 | }, 848 | { 849 | "name": "Speaker 4 (PT)", 850 | "id": "v2/pt_speaker_4", 851 | "language": "Portuguese", 852 | "gender": "Male", 853 | "quality": null, 854 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_4.mp3", 855 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_4.mp3" 856 | }, 857 | { 858 | "name": "Speaker 5 (PT)", 859 | "id": "v2/pt_speaker_5", 860 | "language": "Portuguese", 861 | "gender": "Male", 862 | "quality": null, 863 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_5.mp3", 864 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_5.mp3" 865 | }, 866 | { 867 | "name": "Speaker 6 (PT)", 868 | "id": "v2/pt_speaker_6", 869 | "language": "Portuguese", 870 | "gender": "Male", 871 | "quality": "Background Noise", 872 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_6.mp3", 873 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_6.mp3" 874 | }, 875 | { 876 | "name": "Speaker 7 (PT)", 877 | "id": "v2/pt_speaker_7", 878 | "language": "Portuguese", 879 | "gender": "Male", 880 | "quality": null, 881 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_7.mp3", 882 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_7.mp3" 883 | }, 884 | { 885 | "name": "Speaker 8 (PT)", 886 | "id": "v2/pt_speaker_8", 887 | "language": "Portuguese", 888 | "gender": "Male", 889 | "quality": null, 890 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_8.mp3", 891 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_8.mp3" 892 | }, 893 | { 894 | "name": "Speaker 9 (PT)", 895 | "id": "v2/pt_speaker_9", 896 | "language": "Portuguese", 897 | "gender": "Male", 898 | "quality": null, 899 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/pt_speaker_9.mp3", 900 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/pt_speaker_9.mp3" 901 | }, 902 | { 903 | "name": "Speaker 0 (RU)", 904 | "id": "v2/ru_speaker_0", 905 | "language": "Russian", 906 | "gender": "Male", 907 | "quality": null, 908 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_0.mp3", 909 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_0.mp3" 910 | }, 911 | { 912 | "name": "Speaker 1 (RU)", 913 | "id": "v2/ru_speaker_1", 914 | "language": "Russian", 915 | "gender": 
"Male", 916 | "quality": "Echoes", 917 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_1.mp3", 918 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_1.mp3" 919 | }, 920 | { 921 | "name": "Speaker 2 (RU)", 922 | "id": "v2/ru_speaker_2", 923 | "language": "Russian", 924 | "gender": "Male", 925 | "quality": "Echoes", 926 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_2.mp3", 927 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_2.mp3" 928 | }, 929 | { 930 | "name": "Speaker 3 (RU)", 931 | "id": "v2/ru_speaker_3", 932 | "language": "Russian", 933 | "gender": "Male", 934 | "quality": null, 935 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_3.mp3", 936 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_3.mp3" 937 | }, 938 | { 939 | "name": "Speaker 4 (RU)", 940 | "id": "v2/ru_speaker_4", 941 | "language": "Russian", 942 | "gender": "Male", 943 | "quality": null, 944 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_4.mp3", 945 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_4.mp3" 946 | }, 947 | { 948 | "name": "Speaker 5 (RU)", 949 | "id": "v2/ru_speaker_5", 950 | "language": "Russian", 951 | "gender": "Female", 952 | "quality": null, 953 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_5.mp3", 954 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_5.mp3" 955 | }, 956 | { 957 | "name": "Speaker 6 (RU)", 958 | "id": "v2/ru_speaker_6", 959 | "language": "Russian", 960 | "gender": "Female", 961 | "quality": "Grainy", 962 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_6.mp3", 963 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_6.mp3" 964 | }, 965 | { 966 | "name": "Speaker 7 (RU)", 967 | "id": "v2/ru_speaker_7", 968 | "language": "Russian", 969 | "gender": "Male", 970 | "quality": null, 971 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_7.mp3", 972 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_7.mp3" 973 | }, 974 | { 975 | "name": "Speaker 8 (RU)", 976 | "id": "v2/ru_speaker_8", 977 | "language": "Russian", 978 | "gender": "Male", 979 | "quality": "Grainy", 980 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_8.mp3", 981 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_8.mp3" 982 | }, 983 | { 984 | "name": "Speaker 9 (RU)", 985 | "id": "v2/ru_speaker_9", 986 | "language": "Russian", 987 | "gender": "Female", 988 | "quality": "Grainy", 989 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/ru_speaker_9.mp3", 990 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/ru_speaker_9.mp3" 991 | }, 992 | { 993 | "name": "Speaker 0 (ES)", 994 | "id": "v2/es_speaker_0", 995 | "language": "Spanish", 996 | "gender": "Male", 997 | "quality": null, 998 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_0.mp3", 999 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_0.mp3" 1000 | }, 1001 | { 1002 | "name": "Speaker 1 (ES)", 1003 | "id": "v2/es_speaker_1", 1004 | "language": 
"Spanish", 1005 | "gender": "Male", 1006 | "quality": null, 1007 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_1.mp3", 1008 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_1.mp3" 1009 | }, 1010 | { 1011 | "name": "Speaker 2 (ES)", 1012 | "id": "v2/es_speaker_2", 1013 | "language": "Spanish", 1014 | "gender": "Male", 1015 | "quality": "Background Noise", 1016 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_2.mp3", 1017 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_2.mp3" 1018 | }, 1019 | { 1020 | "name": "Speaker 3 (ES)", 1021 | "id": "v2/es_speaker_3", 1022 | "language": "Spanish", 1023 | "gender": "Male", 1024 | "quality": "Background Noise", 1025 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_3.mp3", 1026 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_3.mp3" 1027 | }, 1028 | { 1029 | "name": "Speaker 4 (ES)", 1030 | "id": "v2/es_speaker_4", 1031 | "language": "Spanish", 1032 | "gender": "Male", 1033 | "quality": null, 1034 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_4.mp3", 1035 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_4.mp3" 1036 | }, 1037 | { 1038 | "name": "Speaker 5 (ES)", 1039 | "id": "v2/es_speaker_5", 1040 | "language": "Spanish", 1041 | "gender": "Male", 1042 | "quality": "Background Noise", 1043 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_5.mp3", 1044 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_5.mp3" 1045 | }, 1046 | { 1047 | "name": "Speaker 6 (ES)", 1048 | "id": "v2/es_speaker_6", 1049 | "language": "Spanish", 1050 | "gender": "Male", 1051 | "quality": null, 1052 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_6.mp3", 1053 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_6.mp3" 1054 | }, 1055 | { 1056 | "name": "Speaker 7 (ES)", 1057 | "id": "v2/es_speaker_7", 1058 | "language": "Spanish", 1059 | "gender": "Male", 1060 | "quality": null, 1061 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_7.mp3", 1062 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_7.mp3" 1063 | }, 1064 | { 1065 | "name": "Speaker 8 (ES)", 1066 | "id": "v2/es_speaker_8", 1067 | "language": "Spanish", 1068 | "gender": "Female", 1069 | "quality": null, 1070 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_8.mp3", 1071 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_8.mp3" 1072 | }, 1073 | { 1074 | "name": "Speaker 9 (ES)", 1075 | "id": "v2/es_speaker_9", 1076 | "language": "Spanish", 1077 | "gender": "Female", 1078 | "quality": null, 1079 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/es_speaker_9.mp3", 1080 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/es_speaker_9.mp3" 1081 | }, 1082 | { 1083 | "name": "Speaker 0 (TR)", 1084 | "id": "v2/tr_speaker_0", 1085 | "language": "Turkish", 1086 | "gender": "Male", 1087 | "quality": null, 1088 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_0.mp3", 1089 | "continuation_audio": 
"https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_0.mp3" 1090 | }, 1091 | { 1092 | "name": "Speaker 1 (TR)", 1093 | "id": "v2/tr_speaker_1", 1094 | "language": "Turkish", 1095 | "gender": "Male", 1096 | "quality": null, 1097 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_1.mp3", 1098 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_1.mp3" 1099 | }, 1100 | { 1101 | "name": "Speaker 2 (TR)", 1102 | "id": "v2/tr_speaker_2", 1103 | "language": "Turkish", 1104 | "gender": "Male", 1105 | "quality": null, 1106 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_2.mp3", 1107 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_2.mp3" 1108 | }, 1109 | { 1110 | "name": "Speaker 3 (TR)", 1111 | "id": "v2/tr_speaker_3", 1112 | "language": "Turkish", 1113 | "gender": "Male", 1114 | "quality": null, 1115 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_3.mp3", 1116 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_3.mp3" 1117 | }, 1118 | { 1119 | "name": "Speaker 4 (TR)", 1120 | "id": "v2/tr_speaker_4", 1121 | "language": "Turkish", 1122 | "gender": "Female", 1123 | "quality": null, 1124 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_4.mp3", 1125 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_4.mp3" 1126 | }, 1127 | { 1128 | "name": "Speaker 5 (TR)", 1129 | "id": "v2/tr_speaker_5", 1130 | "language": "Turkish", 1131 | "gender": "Female", 1132 | "quality": null, 1133 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_5.mp3", 1134 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_5.mp3" 1135 | }, 1136 | { 1137 | "name": "Speaker 6 (TR)", 1138 | "id": "v2/tr_speaker_6", 1139 | "language": "Turkish", 1140 | "gender": "Male", 1141 | "quality": null, 1142 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_6.mp3", 1143 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_6.mp3" 1144 | }, 1145 | { 1146 | "name": "Speaker 7 (TR)", 1147 | "id": "v2/tr_speaker_7", 1148 | "language": "Turkish", 1149 | "gender": "Male", 1150 | "quality": "Grainy", 1151 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_7.mp3", 1152 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_7.mp3" 1153 | }, 1154 | { 1155 | "name": "Speaker 8 (TR)", 1156 | "id": "v2/tr_speaker_8", 1157 | "language": "Turkish", 1158 | "gender": "Male", 1159 | "quality": null, 1160 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_8.mp3", 1161 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_8.mp3" 1162 | }, 1163 | { 1164 | "name": "Speaker 9 (TR)", 1165 | "id": "v2/tr_speaker_9", 1166 | "language": "Turkish", 1167 | "gender": "Male", 1168 | "quality": null, 1169 | "prompt_audio": "https://dl.suno-models.io/bark/prompts/prompt_audio/tr_speaker_9.mp3", 1170 | "continuation_audio": "https://dl.suno-models.io/bark/prompts/continuation_audio/tr_speaker_9.mp3" 1171 | } 1172 | ] -------------------------------------------------------------------------------- /scripts/bark/tts.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | from scipy.io.wavfile import write as write_wav 5 | from bark.generation import ( 6 | preload_models, 7 | clean_models 8 | ) 9 | from bark import generate_audio, SAMPLE_RATE 10 | 11 | 12 | class TTS: 13 | def __init__(self, text_prompt, speaker, temperature, silence, voice, low_vram): 14 | self.text_prompt = text_prompt 15 | self.speaker = speaker 16 | self.temperature = temperature 17 | self.silence = silence 18 | self.voice = voice 19 | self.low_vram = low_vram 20 | 21 | def generate(self): 22 | if self.low_vram: 23 | preload_models(text_use_gpu=True, 24 | text_use_small=True, 25 | coarse_use_gpu=True, 26 | coarse_use_small=True, 27 | fine_use_gpu=True, 28 | fine_use_small=True, 29 | codec_use_gpu=True, 30 | force_reload=False) 31 | else: 32 | preload_models(text_use_gpu=True, 33 | text_use_small=False, 34 | coarse_use_gpu=True, 35 | coarse_use_small=False, 36 | fine_use_gpu=True, 37 | fine_use_small=False, 38 | codec_use_gpu=True, 39 | force_reload=False) 40 | pieces = [] 41 | # split text_prompt into sentences by punctuation 42 | sentences = re.split('\[split\]', self.text_prompt) 43 | silence = np.zeros(int(self.silence * SAMPLE_RATE)).astype(np.float32) 44 | for sentence in sentences: 45 | if sentence.strip() != "": 46 | audio_array = generate_audio(sentence, history_prompt=self.speaker, text_temp=self.temperature) 47 | pieces += [audio_array, silence.copy()] 48 | 49 | write_wav("bark_generation.wav", SAMPLE_RATE, np.concatenate(pieces)) 50 | clean_models() 51 | print("Done!") 52 | return "bark_generation.wav" 53 | 54 | -------------------------------------------------------------------------------- /scripts/faceswap/model/README.md: -------------------------------------------------------------------------------- 1 | inswapper model folder -------------------------------------------------------------------------------- /scripts/faceswap/swap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from PIL import Image 5 | import subprocess 6 | import insightface 7 | from dataclasses import dataclass 8 | from typing import List, Union, Dict, Set, Tuple 9 | from pkg_resources import resource_filename 10 | from modules.shared import state, opts 11 | import modules.face_restoration 12 | from modules.upscaler import Upscaler, UpscalerData 13 | from modules.face_restoration import FaceRestoration, restore_faces 14 | import scripts.wav2lip.audio as audio 15 | import tempfile 16 | from ifnude import detect 17 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 18 | 19 | 20 | @dataclass 21 | class ImageResult: 22 | path: Union[str, None] = None 23 | similarity: Union[Dict[int, float], None] = None # face, 0..1 24 | 25 | def image(self) -> Union[Image.Image, None]: 26 | if self.path: 27 | return Image.open(self.path) 28 | return None 29 | 30 | 31 | @dataclass 32 | class UpscaleOptions: 33 | scale: int = 1 34 | upscaler: UpscalerData = None 35 | upscale_visibility: float = 0.5 36 | face_restorer: FaceRestoration = None 37 | restorer_visibility: float = 0.5 38 | 39 | 40 | class FaceSwap: 41 | def __init__(self, face=None, audio=None, face_index=None, source=None, resize_factor=None, face_restore_model=None, code_former_weight=None): 42 | self.faceswap_folder = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-1]) 43 | self.wav2lip_folder = 
os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-2]) 44 | self.faceswap_output_folder = os.path.join(self.wav2lip_folder, 'wav2lip', 'output', 'faceswap') 45 | self.face = face 46 | self.audio = audio 47 | self.source = source 48 | self.resize_factor = resize_factor 49 | self.code_former_weight = code_former_weight 50 | self.face_restore_model = face_restore_model 51 | self.model = self.faceswap_folder + "/model/inswapper_128.onnx" 52 | self.faces_index = {face_index} 53 | self.ffmpeg_binary = self.find_ffmpeg_binary() 54 | model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), self.model) 55 | self.face_swapper = insightface.model_zoo.get_model(model_path, providers=providers) 56 | self.face_analyser = insightface.app.FaceAnalysis(name="buffalo_l", providers=providers) 57 | self.face_analyser.prepare(ctx_id=0, det_size=(640, 640)) 58 | self.mel_step_size = 16 59 | if audio is not None: 60 | self.nb_frame = self.calc_frame() 61 | 62 | def calc_frame(self): 63 | 64 | video_stream = cv2.VideoCapture(self.face) 65 | fps = video_stream.get(cv2.CAP_PROP_FPS) 66 | wav = audio.load_wav(self.audio, 16000) 67 | mel = audio.melspectrogram(wav) 68 | 69 | if np.isnan(mel.reshape(-1)).sum() > 0: 70 | raise ValueError( 71 | 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again') 72 | 73 | mel_chunks = [] 74 | mel_idx_multiplier = 80. / fps 75 | i = 0 76 | while 1: 77 | start_idx = int(i * mel_idx_multiplier) 78 | if start_idx + self.mel_step_size > len(mel[0]): 79 | mel_chunks.append(mel[:, len(mel[0]) - self.mel_step_size:]) 80 | break 81 | mel_chunks.append(mel[:, start_idx: start_idx + self.mel_step_size]) 82 | i += 1 83 | 84 | return len(mel_chunks) 85 | 86 | def convert_to_sd(self, img): 87 | shapes = [] 88 | chunks = detect(img) 89 | for chunk in chunks: 90 | shapes.append(chunk["score"] > 0.7) 91 | return [any(shapes), tempfile.NamedTemporaryFile(delete=False, suffix=".png")] 92 | 93 | def find_ffmpeg_binary(self): 94 | for package in ['imageio_ffmpeg', 'imageio-ffmpeg']: 95 | try: 96 | package_path = resource_filename(package, 'binaries') 97 | files = [os.path.join(package_path, f) for f in os.listdir(package_path) if f.startswith("ffmpeg-")] 98 | files.sort(key=lambda x: os.path.getmtime(x), reverse=True) 99 | return files[0] if files else 'ffmpeg' 100 | except: 101 | return 'ffmpeg' 102 | 103 | def get_framerate(self, video_file): 104 | video = cv2.VideoCapture(video_file) 105 | fps = video.get(cv2.CAP_PROP_FPS) 106 | video.release() 107 | return fps 108 | 109 | def create_video_from_images(self, nb_frames): 110 | fps = str(self.get_framerate(self.face)) 111 | command = [self.ffmpeg_binary, "-y", "-framerate", fps, "-start_number", "0", "-i", 112 | self.faceswap_output_folder + "/face_swap_%05d.png", "-vframes", 113 | str(nb_frames), "-c:v", "libx264", "-pix_fmt", "yuv420p", "-b:v", "8000k", 114 | self.faceswap_output_folder + "/video.mp4"] 115 | 116 | self.execute_command(command) 117 | 118 | def execute_command(self, command): 119 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 120 | stdout, stderr = process.communicate() 121 | if process.returncode != 0: 122 | raise RuntimeError(stderr) 123 | 124 | def get_face_single(self, img_data: np.ndarray, face_index=0, det_size=(640, 640)): 125 | face = self.face_analyser.get(img_data) 126 | if len(face) == 0 and det_size[0] > 320 and det_size[1] > 320: 127 | det_size_half = (det_size[0] // 2, det_size[1] // 2) 128 | 
self.face_analyser.prepare(ctx_id=0, det_size=det_size_half) 129 | face = self.face_analyser.get(img_data) 130 | self.face_analyser.prepare(ctx_id=0, det_size=det_size) 131 | try: 132 | return sorted(face, key=lambda x: x.bbox[0])[face_index] 133 | except IndexError: 134 | return None 135 | try: 136 | return sorted(face, key=lambda x: x.bbox[0])[face_index] 137 | except IndexError: 138 | return None 139 | 140 | def swap_face(self, 141 | source_img: Image.Image, 142 | target_img: Image.Image, 143 | model: Union[str, None] = None, 144 | faces_index: Set[int] = {0}, 145 | upscale_options: Union[UpscaleOptions, None] = None, 146 | ) -> ImageResult: 147 | result_image = target_img 148 | converted = self.convert_to_sd(target_img) 149 | scale, fn = converted[0], converted[1] 150 | if model is not None and not scale: 151 | source_img = cv2.cvtColor(np.array(source_img), cv2.COLOR_RGB2BGR) 152 | target_img = cv2.cvtColor(np.array(target_img), cv2.COLOR_RGB2BGR) 153 | source_face = self.get_face_single(source_img, face_index=0) 154 | if source_face is not None: 155 | result = target_img 156 | for face_num in faces_index: 157 | target_face = self.get_face_single(target_img, face_index=face_num) 158 | if target_face is not None: 159 | result = self.face_swapper.get(result, target_face, source_face) 160 | else: 161 | print(f"No target face found for {face_num}") 162 | result_image = Image.fromarray(cv2.cvtColor(result, cv2.COLOR_BGR2RGB)) 163 | else: 164 | print("No source face found") 165 | result_image.save(fn.name) 166 | return ImageResult(path=fn.name) 167 | 168 | def resume(self): 169 | return self.faceswap_output_folder + "/video.mp4" 170 | 171 | def generate(self): 172 | original_codeformer_weight = opts.code_former_weight 173 | original_face_restoration_model = opts.face_restoration_model 174 | 175 | opts.code_former_weight = self.code_former_weight 176 | opts.face_restoration_model = self.face_restore_model 177 | video_stream = cv2.VideoCapture(self.face) 178 | 179 | print('Reading video frames for face swap...') 180 | frame_number = 0 181 | 182 | while frame_number != self.nb_frame+1: 183 | f_number = str(frame_number).rjust(5, '0') 184 | print("[INFO] Processing frame: " + str(frame_number) + " of " + str(self.nb_frame) + " - ", end="\r") 185 | still_reading, frame = video_stream.read() 186 | if not still_reading: 187 | video_stream.release() 188 | break 189 | 190 | if self.resize_factor > 1: 191 | frame = cv2.resize(frame, 192 | (frame.shape[1] // self.resize_factor, frame.shape[0] // self.resize_factor)) 193 | 194 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 195 | result = self.swap_face( 196 | self.source, 197 | frame, 198 | faces_index=self.faces_index, 199 | model=self.model, 200 | upscale_options=None 201 | ) 202 | # copy image to output folder 203 | face_swapped = cv2.imread(result.path) 204 | face_swapped = cv2.cvtColor(face_swapped, cv2.COLOR_RGB2BGR) 205 | image_restored = modules.face_restoration.restore_faces(face_swapped) 206 | image_restored2 = cv2.cvtColor(image_restored, cv2.COLOR_RGB2BGR) 207 | cv2.imwrite(self.faceswap_output_folder + "/face_swap_" + f_number + ".png", image_restored2) 208 | 209 | frame_number += 1 210 | 211 | self.create_video_from_images(frame_number - 1) 212 | opts.code_former_weight = original_codeformer_weight 213 | opts.face_restoration_model = original_face_restoration_model 214 | return self.faceswap_output_folder + "/video.mp4" 215 | -------------------------------------------------------------------------------- /scripts/ui.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from scripts.wav2lip_uhq_extend_paths import wav2lip_uhq_sys_extend 3 | import gradio as gr 4 | from scripts.wav2lip.w2l import W2l 5 | from scripts.wav2lip.wav2lip_uhq import Wav2LipUHQ 6 | from modules.shared import state 7 | from scripts.bark.tts import TTS 8 | from scripts.faceswap.swap import FaceSwap 9 | 10 | speaker_id = "v2/en_speaker_0" 11 | 12 | 13 | def on_ui_tabs(): 14 | wav2lip_uhq_sys_extend() 15 | speaker_json = json.load(open("extensions/sd-wav2lip-uhq/scripts/bark/speakers.json", "r")) 16 | speaker_list = [speaker["name"] for speaker in speaker_json if 17 | speaker["language"] == "English" and speaker["gender"] == "Male"] 18 | speaker_language = list(set([speaker["language"] for speaker in speaker_json])) 19 | speaker_gender = list(set([speaker["gender"] for speaker in speaker_json])) 20 | 21 | def update_speaker_list(new_language, new_gender): 22 | # Update the speaker list based on the selected language and gender 23 | global speaker_id 24 | new_speaker_list = [speaker["name"] for speaker in speaker_json if 25 | speaker["language"] == new_language and speaker["gender"] == new_gender] 26 | audio_mp3 = [speaker["prompt_audio"] for speaker in speaker_json if speaker["name"] in new_speaker_list[0]][0] 27 | speaker_id = [speaker["id"] for speaker in speaker_json if speaker["name"] in new_speaker_list[0]][0] 28 | return [gr.Dropdown.update(choices=new_speaker_list, value=new_speaker_list[0]), 29 | gr.Audio.update(value=audio_mp3), gr.Dropdown.update(value=new_language)] 30 | 31 | def select_speaker(speaker): 32 | # Update the example audio based on the selected speaker 33 | global speaker_id 34 | audio_mp3 = [sp["prompt_audio"] for sp in speaker_json if sp["name"] == speaker][0] 35 | speaker_id = [sp["id"] for sp in speaker_json if sp["name"] == speaker][0] 36 | return gr.Audio.update(value=audio_mp3) 37 | 38 | with gr.Blocks(analytics_enabled=False) as wav2lip_uhq_interface: 39 | gr.Markdown( 40 | "
Follow installation instructions here") 41 | gr.Markdown( 42 | "STANDALONE VERSION AVAILABLE HERE
") 43 | with gr.Row(): 44 | with gr.Column(): 45 | with gr.Row(): 46 | with gr.Column(): 47 | video = gr.Video(label="Video", format="mp4", 48 | info="Filepath of video/image that contains faces to use", 49 | file_types=["mp4", "png", "jpg", "jpeg", "avi"]) 50 | face_swap_img = gr.Image(label="Face Swap", type="pil") 51 | face_index_slider = gr.Slider(minimum=0, maximum=20, step=1, value=0, label="Face index", 52 | info="index of face to swap, left face in image is 0") 53 | 54 | with gr.Column(): 55 | with gr.Row(): 56 | language = gr.Dropdown( 57 | speaker_language, label="Language", info="Select the language to use", 58 | value="English" 59 | ) 60 | gender = gr.Dropdown( 61 | speaker_gender, label="Gender", info="Select gender", value="Male" 62 | ) 63 | with gr.Row(): 64 | speaker = gr.Dropdown( 65 | speaker_list, label="Speaker", info="Select the speaker to use", 66 | value=speaker_list[0] 67 | ) 68 | low_vram = gr.Radio(["False", "True"], value="True", label="Low VRAM", 69 | info="Less than 16GB of VRAM, set True") 70 | with gr.Row(): 71 | audio_example = gr.Audio(label="Audio example", 72 | value="https://dl.suno-models.io/bark/prompts/prompt_audio/en_speaker_0.mp3") 73 | with gr.Column(): 74 | suno_prompt = gr.Textbox(label="Prompt", placeholder="Prompt", lines=5, type="text",info="Don't forget that bark can only generate 14 seconds of audio at a time, so for long text, you need to use [split] to split the text into multiple prompts") 75 | temperature = gr.Slider(label="Generation temperature", minimum=0.01, maximum=1, step=0.01, value=0.7, 76 | info="1.0 more diverse, 0.0 more conservative") 77 | silence = gr.Slider(label="Silence", minimum=0, maximum=1, step=0.01, value=0.25, info="Silence after [split] in seconde") 78 | generate_audio = gr.Button("Generate") 79 | audio = gr.Audio(label="Speech", type="filepath") 80 | 81 | # if language changed, update speaker list 82 | language.change(update_speaker_list, [language, gender], [speaker, audio_example]) 83 | gender.change(update_speaker_list, [language, gender], [speaker, audio_example]) 84 | speaker.change(select_speaker, speaker, audio_example) 85 | 86 | with gr.Row(): 87 | checkpoint = gr.Radio(["wav2lip", "wav2lip_gan"], value="wav2lip_gan", label="Checkpoint", 88 | info="Wav2lip model to use") 89 | face_restore_model = gr.Radio(["CodeFormer", "GFPGAN"], value="GFPGAN", 90 | label="Face Restoration Model", 91 | info="Model to use") 92 | 93 | with gr.Row(): 94 | no_smooth = gr.Checkbox(label="No Smooth", info="Prevent smoothing face detections") 95 | only_mouth = gr.Checkbox(label="Only Mouth", info="Only track the mouth") 96 | active_debug = gr.Checkbox(label="Active Debug", info="Active Debug") 97 | with gr.Row(): 98 | with gr.Column(): 99 | resize_factor = gr.Slider(minimum=1, maximum=4, step=1, label="Resize Factor", 100 | info="Reduce the resolution by this factor.") 101 | mouth_mask_dilatation = gr.Slider(minimum=0, maximum=128, step=1, value=15, 102 | label="Mouth Mask Dilate", 103 | info="Dilatation of the mask around the mouth (in pixels)") 104 | erode_face_mask = gr.Slider(minimum=0, maximum=128, step=1, value=15, label="Face Mask Erode", 105 | info="Erode the mask around the face (in pixels)") 106 | mask_blur = gr.Slider(minimum=0, maximum=128, step=1, value=15, label="Mask Blur", 107 | info="Kernel size of Gaussian blur for masking") 108 | code_former_weight = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.75, 109 | label="Code Former Fidelity", 110 | info="0 for better quality, 1 for better identity (Effect only 
if codeformer is selected)") 111 | with gr.Column(): 112 | pad_top = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Top", 113 | info="Padding above lips") 114 | pad_bottom = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Bottom", 115 | info="Padding below lips") 116 | pad_left = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Left", 117 | info="Padding to the left of lips") 118 | pad_right = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Pad Right", 119 | info="Padding to the right of lips") 120 | 121 | with gr.Column(): 122 | with gr.Tabs(elem_id="wav2lip_generated"): 123 | with gr.Row(): 124 | faceswap_video = gr.Video(label="faceSwap video", format="mp4") 125 | wav2lip_video = gr.Video(label="Wav2Lip video", format="mp4") 126 | restore_video = gr.Video(label="Restored face video", format="mp4") 127 | result = gr.Video(label="Generated video", format="mp4") 128 | generate_btn = gr.Button("Generate") 129 | interrupt_btn = gr.Button('Interrupt', elem_id=f"interrupt", visible=True) 130 | resume_btn = gr.Button('Resume', elem_id=f"resume", visible=True) 131 | 132 | def on_interrupt(): 133 | state.interrupt() 134 | return "Interrupted" 135 | 136 | def gen_audio(suno_prompt, temperature, silence, low_vram): 137 | global speaker_id 138 | if suno_prompt is None or speaker_id is None: 139 | return 140 | tts = TTS(suno_prompt, speaker_id, temperature, silence,None, low_vram) 141 | wav = tts.generate() 142 | # delete tts object to free memory 143 | del tts 144 | 145 | return wav 146 | 147 | def generate(video, face_swap_img, face_index, audio, checkpoint, face_restore_model, no_smooth, only_mouth, resize_factor, 148 | mouth_mask_dilatation, erode_face_mask, mask_blur, pad_top, pad_bottom, pad_left, pad_right, 149 | active_debug, code_former_weight): 150 | state.begin() 151 | 152 | if video is None or audio is None: 153 | print("[ERROR] Please select a video and an audio file") 154 | return 155 | 156 | if face_swap_img is not None: 157 | face_swap = FaceSwap(video, audio, face_index, face_swap_img, resize_factor, face_restore_model, code_former_weight) 158 | video = face_swap.generate() 159 | 160 | w2l = W2l(video, audio, checkpoint, no_smooth, resize_factor, pad_top, pad_bottom, pad_left, 161 | pad_right, face_swap_img) 162 | w2l.execute() 163 | 164 | w2luhq = Wav2LipUHQ(video, face_restore_model, mouth_mask_dilatation, erode_face_mask, mask_blur, 165 | only_mouth, face_swap_img, resize_factor, code_former_weight, active_debug) 166 | 167 | return w2luhq.execute() 168 | 169 | def resume(video,face_swap_img, face_restore_model, only_mouth, resize_factor, mouth_mask_dilatation, erode_face_mask, 170 | mask_blur, active_debug, code_former_weight): 171 | state.begin() 172 | if face_swap_img is not None: 173 | face_swap = FaceSwap() 174 | video = face_swap.resume() 175 | w2luhq = Wav2LipUHQ(video, face_restore_model, mouth_mask_dilatation, erode_face_mask, mask_blur, 176 | only_mouth, face_swap_img, resize_factor, code_former_weight, active_debug) 177 | 178 | return w2luhq.execute(True) 179 | 180 | generate_audio.click( 181 | gen_audio, 182 | [suno_prompt, temperature, silence, low_vram], 183 | audio) 184 | 185 | generate_btn.click( 186 | generate, 187 | [video, face_swap_img, face_index_slider, audio, checkpoint, face_restore_model, no_smooth, only_mouth, resize_factor, mouth_mask_dilatation, 188 | erode_face_mask, mask_blur, pad_top, pad_bottom, pad_left, pad_right, active_debug, code_former_weight], 189 | [faceswap_video, wav2lip_video, restore_video, 
result]) 190 | 191 | resume_btn.click( 192 | resume, 193 | [video,face_swap_img, face_restore_model, only_mouth, resize_factor, mouth_mask_dilatation, erode_face_mask, 194 | mask_blur, active_debug, code_former_weight], 195 | [faceswap_video, wav2lip_video, restore_video, result]) 196 | 197 | interrupt_btn.click(on_interrupt) 198 | 199 | return [(wav2lip_uhq_interface, "Wav2lip Studio", "wav2lip_uhq_interface")] 200 | -------------------------------------------------------------------------------- /scripts/wav2lip/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | from scripts.wav2lip.hparams import hparams as hp 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | 13 | def save_wav(wav, path, sr): 14 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | 18 | def save_wavenet_wav(wav, path, sr): 19 | librosa.output.write_wav(path, wav, sr=sr) 20 | 21 | 22 | def preemphasis(wav, k, preemphasize=True): 23 | if preemphasize: 24 | return signal.lfilter([1, -k], [1], wav) 25 | return wav 26 | 27 | 28 | def inv_preemphasis(wav, k, inv_preemphasize=True): 29 | if inv_preemphasize: 30 | return signal.lfilter([1], [1, -k], wav) 31 | return wav 32 | 33 | 34 | def get_hop_size(): 35 | hop_size = hp.hop_size 36 | if hop_size is None: 37 | assert hp.frame_shift_ms is not None 38 | hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate) 39 | return hop_size 40 | 41 | 42 | def linearspectrogram(wav): 43 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 44 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 45 | 46 | if hp.signal_normalization: 47 | return _normalize(S) 48 | return S 49 | 50 | 51 | def melspectrogram(wav): 52 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 53 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db 54 | 55 | if hp.signal_normalization: 56 | return _normalize(S) 57 | return S 58 | 59 | 60 | def _lws_processor(): 61 | import lws 62 | return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech") 63 | 64 | 65 | def _stft(y): 66 | if hp.use_lws: 67 | return _lws_processor(hp).stft(y).T 68 | else: 69 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size) 70 | 71 | 72 | ########################################################## 73 | # Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 
74 | def num_frames(length, fsize, fshift): 75 | """Compute number of time frames of spectrogram 76 | """ 77 | pad = (fsize - fshift) 78 | if length % fshift == 0: 79 | M = (length + pad * 2 - fsize) // fshift + 1 80 | else: 81 | M = (length + pad * 2 - fsize) // fshift + 2 82 | return M 83 | 84 | 85 | def pad_lr(x, fsize, fshift): 86 | """Compute left and right padding 87 | """ 88 | M = num_frames(len(x), fsize, fshift) 89 | pad = (fsize - fshift) 90 | T = len(x) + 2 * pad 91 | r = (M - 1) * fshift + fsize - T 92 | return pad, pad + r 93 | 94 | 95 | ########################################################## 96 | # Librosa correct padding 97 | def librosa_pad_lr(x, fsize, fshift): 98 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 99 | 100 | 101 | # Conversions 102 | _mel_basis = None 103 | 104 | 105 | def _linear_to_mel(spectogram): 106 | global _mel_basis 107 | if _mel_basis is None: 108 | _mel_basis = _build_mel_basis() 109 | return np.dot(_mel_basis, spectogram) 110 | 111 | 112 | def _build_mel_basis(): 113 | assert hp.fmax <= hp.sample_rate // 2 114 | return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, 115 | fmin=hp.fmin, fmax=hp.fmax) 116 | 117 | 118 | def _amp_to_db(x): 119 | min_level = np.exp(hp.min_level_db / 20 * np.log(10)) 120 | return 20 * np.log10(np.maximum(min_level, x)) 121 | 122 | 123 | def _db_to_amp(x): 124 | return np.power(10.0, (x) * 0.05) 125 | 126 | 127 | def _normalize(S): 128 | if hp.allow_clipping_in_normalization: 129 | if hp.symmetric_mels: 130 | return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value, 131 | -hp.max_abs_value, hp.max_abs_value) 132 | else: 133 | return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value) 134 | 135 | assert S.max() <= 0 and S.min() - hp.min_level_db >= 0 136 | if hp.symmetric_mels: 137 | return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value 138 | else: 139 | return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)) 140 | 141 | 142 | def _denormalize(D): 143 | if hp.allow_clipping_in_normalization: 144 | if hp.symmetric_mels: 145 | return (((np.clip(D, -hp.max_abs_value, 146 | hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) 147 | + hp.min_level_db) 148 | else: 149 | return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 150 | 151 | if hp.symmetric_mels: 152 | return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db) 153 | else: 154 | return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 155 | -------------------------------------------------------------------------------- /scripts/wav2lip/checkpoints/README.md: -------------------------------------------------------------------------------- 1 | Place all your checkpoints (.pth files) here. -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/README.md: -------------------------------------------------------------------------------- 1 | The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time. 
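A minimal, hedged usage sketch of the batched interface (the frame paths are hypothetical, the import path follows this extension's layout under scripts/wav2lip, and running inside the AUTOMATIC1111 webui is assumed so that modules.shared resolves):

import cv2
import numpy as np
from scripts.wav2lip import face_detection

# Build the detector once; device can be 'cuda' or 'cpu'.
detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
                                        flip_input=False, device='cuda')

# Frames are read with OpenCV (BGR) and must share one resolution to stack.
frames = [cv2.imread(p) for p in ('frame_000.png', 'frame_001.png')]  # hypothetical paths
batch = np.asarray(frames)

# One (x1, y1, x2, y2) box per frame, or None where no face was detected.
boxes = detector.get_detections_for_batch(batch)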
-------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Adrian Bulat""" 4 | __email__ = 'adrian.bulat@nottingham.ac.uk' 5 | __version__ = '1.0.1' 6 | 7 | from .api import FaceAlignment, LandmarksType, NetworkSize 8 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/api.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from enum import Enum 3 | 4 | try: 5 | import urllib.request as request_file 6 | except BaseException: 7 | import urllib as request_file 8 | 9 | from .utils import * 10 | 11 | 12 | class LandmarksType(Enum): 13 | """Enum class defining the type of landmarks to detect. 14 | 15 | ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face 16 | ``_2halfD`` - this points represent the projection of the 3D points into 3D 17 | ``_3D`` - detect the points ``(x,y,z)``` in a 3D space 18 | 19 | """ 20 | _2D = 1 21 | _2halfD = 2 22 | _3D = 3 23 | 24 | 25 | class NetworkSize(Enum): 26 | # TINY = 1 27 | # SMALL = 2 28 | # MEDIUM = 3 29 | LARGE = 4 30 | 31 | def __new__(cls, value): 32 | member = object.__new__(cls) 33 | member._value_ = value 34 | return member 35 | 36 | def __int__(self): 37 | return self.value 38 | 39 | 40 | ROOT = os.path.dirname(os.path.abspath(__file__)) 41 | 42 | 43 | class FaceAlignment: 44 | def __init__(self, landmarks_type, network_size=NetworkSize.LARGE, 45 | device='cuda', flip_input=False, face_detector='sfd', verbose=False): 46 | self.device = device 47 | self.flip_input = flip_input 48 | self.landmarks_type = landmarks_type 49 | self.verbose = verbose 50 | 51 | network_size = int(network_size) 52 | 53 | if 'cuda' in device: 54 | torch.backends.cudnn.benchmark = True 55 | 56 | # Get the face detector 57 | face_detector_module = __import__('scripts.wav2lip.face_detection.detection.' + face_detector, 58 | globals(), locals(), [face_detector], 0) 59 | self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose) 60 | 61 | def get_detections_for_batch(self, images): 62 | images = images[..., ::-1] 63 | detected_faces = self.face_detector.detect_from_batch(images.copy()) 64 | results = [] 65 | 66 | for i, d in enumerate(detected_faces): 67 | if len(d) == 0: 68 | results.append(None) 69 | continue 70 | d = d[0] 71 | d = np.clip(d, 0, None) 72 | 73 | x1, y1, x2, y2 = map(int, d[:-1]) 74 | results.append((x1, y1, x2, y2)) 75 | 76 | return results 77 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import FaceDetector -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/core.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import glob 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import cv2 7 | 8 | 9 | class FaceDetector(object): 10 | """An abstract class representing a face detector. 11 | 12 | Any other face detection implementation must subclass it. 
All subclasses 13 | must implement ``detect_from_image``, that return a list of detected 14 | bounding boxes. Optionally, for speed considerations detect from path is 15 | recommended. 16 | """ 17 | 18 | def __init__(self, device, verbose): 19 | self.device = device 20 | self.verbose = verbose 21 | 22 | if verbose: 23 | if 'cpu' in device: 24 | logger = logging.getLogger(__name__) 25 | logger.warning("Detection running on CPU, this may be potentially slow.") 26 | 27 | if 'cpu' not in device and 'cuda' not in device: 28 | if verbose: 29 | logger.error("Expected values for device are: {cpu, cuda} but got: %s", device) 30 | raise ValueError 31 | 32 | def detect_from_image(self, tensor_or_path): 33 | """Detects faces in a given image. 34 | 35 | This function detects the faces present in a provided BGR(usually) 36 | image. The input can be either the image itself or the path to it. 37 | 38 | Arguments: 39 | tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path 40 | to an image or the image itself. 41 | 42 | Example:: 43 | 44 | >>> path_to_image = 'data/image_01.jpg' 45 | ... detected_faces = detect_from_image(path_to_image) 46 | [A list of bounding boxes (x1, y1, x2, y2)] 47 | >>> image = cv2.imread(path_to_image) 48 | ... detected_faces = detect_from_image(image) 49 | [A list of bounding boxes (x1, y1, x2, y2)] 50 | 51 | """ 52 | raise NotImplementedError 53 | 54 | def detect_from_directory(self, path, extensions=['.jpg', '.png'], recursive=False, show_progress_bar=True): 55 | """Detects faces from all the images present in a given directory. 56 | 57 | Arguments: 58 | path {string} -- a string containing a path that points to the folder containing the images 59 | 60 | Keyword Arguments: 61 | extensions {list} -- list of string containing the extensions to be 62 | consider in the following format: ``.extension_name`` (default: 63 | {['.jpg', '.png']}) recursive {bool} -- option wherever to scan the 64 | folder recursively (default: {False}) show_progress_bar {bool} -- 65 | display a progressbar (default: {True}) 66 | 67 | Example: 68 | >>> directory = 'data' 69 | ... detected_faces = detect_from_directory(directory) 70 | {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]} 71 | 72 | """ 73 | if self.verbose: 74 | logger = logging.getLogger(__name__) 75 | 76 | if len(extensions) == 0: 77 | if self.verbose: 78 | logger.error("Expected at list one extension, but none was received.") 79 | raise ValueError 80 | 81 | if self.verbose: 82 | logger.info("Constructing the list of images.") 83 | additional_pattern = '/**/*' if recursive else '/*' 84 | files = [] 85 | for extension in extensions: 86 | files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive)) 87 | 88 | if self.verbose: 89 | logger.info("Finished searching for images. 
%s images found", len(files)) 90 | logger.info("Preparing to run the detection.") 91 | 92 | predictions = {} 93 | for image_path in tqdm(files, disable=not show_progress_bar): 94 | if self.verbose: 95 | logger.info("Running the face detector on image: %s", image_path) 96 | predictions[image_path] = self.detect_from_image(image_path) 97 | 98 | if self.verbose: 99 | logger.info("The detector was successfully run on all %s images", len(files)) 100 | 101 | return predictions 102 | 103 | @property 104 | def reference_scale(self): 105 | raise NotImplementedError 106 | 107 | @property 108 | def reference_x_shift(self): 109 | raise NotImplementedError 110 | 111 | @property 112 | def reference_y_shift(self): 113 | raise NotImplementedError 114 | 115 | @staticmethod 116 | def tensor_or_path_to_ndarray(tensor_or_path, rgb=True): 117 | """Convert path (represented as a string) or torch.tensor to a numpy.ndarray 118 | 119 | Arguments: 120 | tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself 121 | """ 122 | if isinstance(tensor_or_path, str): 123 | return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1] 124 | elif torch.is_tensor(tensor_or_path): 125 | # Call cpu in case its coming from cuda 126 | return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy() 127 | elif isinstance(tensor_or_path, np.ndarray): 128 | return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path 129 | else: 130 | raise TypeError 131 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/__init__.py: -------------------------------------------------------------------------------- 1 | from .sfd_detector import SFDDetector as FaceDetector -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import cv2 5 | import random 6 | import datetime 7 | import time 8 | import math 9 | import argparse 10 | import numpy as np 11 | import torch 12 | 13 | try: 14 | from iou import IOU 15 | except BaseException: 16 | # IOU cython speedup 10x 17 | def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2): 18 | sa = abs((ax2 - ax1) * (ay2 - ay1)) 19 | sb = abs((bx2 - bx1) * (by2 - by1)) 20 | x1, y1 = max(ax1, bx1), max(ay1, by1) 21 | x2, y2 = min(ax2, bx2), min(ay2, by2) 22 | w = x2 - x1 23 | h = y2 - y1 24 | if w < 0 or h < 0: 25 | return 0.0 26 | else: 27 | return 1.0 * w * h / (sa + sb - w * h) 28 | 29 | 30 | def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh): 31 | xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1 32 | dx, dy = (xc - axc) / aww, (yc - ayc) / ahh 33 | dw, dh = math.log(ww / aww), math.log(hh / ahh) 34 | return dx, dy, dw, dh 35 | 36 | 37 | def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh): 38 | xc, yc = dx * aww + axc, dy * ahh + ayc 39 | ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh 40 | x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2 41 | return x1, y1, x2, y2 42 | 43 | 44 | def nms(dets, thresh): 45 | if 0 == len(dets): 46 | return [] 47 | x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4] 48 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 49 | order = scores.argsort()[::-1] 50 | 51 | keep = [] 52 | while order.size > 0: 53 | i = order[0] 54 | 
keep.append(i) 55 | xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]]) 56 | xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]]) 57 | 58 | w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1) 59 | ovr = w * h / (areas[i] + areas[order[1:]] - w * h) 60 | 61 | inds = np.where(ovr <= thresh)[0] 62 | order = order[inds + 1] 63 | 64 | return keep 65 | 66 | 67 | def encode(matched, priors, variances): 68 | """Encode the variances from the priorbox layers into the ground truth boxes 69 | we have matched (based on jaccard overlap) with the prior boxes. 70 | Args: 71 | matched: (tensor) Coords of ground truth for each prior in point-form 72 | Shape: [num_priors, 4]. 73 | priors: (tensor) Prior boxes in center-offset form 74 | Shape: [num_priors,4]. 75 | variances: (list[float]) Variances of priorboxes 76 | Return: 77 | encoded boxes (tensor), Shape: [num_priors, 4] 78 | """ 79 | 80 | # dist b/t match center and prior's center 81 | g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2] 82 | # encode variance 83 | g_cxcy /= (variances[0] * priors[:, 2:]) 84 | # match wh / prior wh 85 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 86 | g_wh = torch.log(g_wh) / variances[1] 87 | # return target for smooth_l1_loss 88 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 89 | 90 | 91 | def decode(loc, priors, variances): 92 | """Decode locations from predictions using priors to undo 93 | the encoding we did for offset regression at train time. 94 | Args: 95 | loc (tensor): location predictions for loc layers, 96 | Shape: [num_priors,4] 97 | priors (tensor): Prior boxes in center-offset form. 98 | Shape: [num_priors,4]. 99 | variances: (list[float]) Variances of priorboxes 100 | Return: 101 | decoded bounding box predictions 102 | """ 103 | 104 | boxes = torch.cat(( 105 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 106 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 107 | boxes[:, :2] -= boxes[:, 2:] / 2 108 | boxes[:, 2:] += boxes[:, :2] 109 | return boxes 110 | 111 | def batch_decode(loc, priors, variances): 112 | """Decode locations from predictions using priors to undo 113 | the encoding we did for offset regression at train time. 114 | Args: 115 | loc (tensor): location predictions for loc layers, 116 | Shape: [num_priors,4] 117 | priors (tensor): Prior boxes in center-offset form. 118 | Shape: [num_priors,4]. 
119 | variances: (list[float]) Variances of priorboxes 120 | Return: 121 | decoded bounding box predictions 122 | """ 123 | 124 | boxes = torch.cat(( 125 | priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:], 126 | priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2) 127 | boxes[:, :, :2] -= boxes[:, :, 2:] / 2 128 | boxes[:, :, 2:] += boxes[:, :, :2] 129 | return boxes 130 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/detect.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | import os 5 | import sys 6 | import cv2 7 | import random 8 | import datetime 9 | import math 10 | import argparse 11 | import numpy as np 12 | 13 | import scipy.io as sio 14 | import zipfile 15 | from .net_s3fd import s3fd 16 | from .bbox import * 17 | 18 | 19 | def detect(net, img, device): 20 | img = img - np.array([104, 117, 123]) 21 | img = img.transpose(2, 0, 1) 22 | img = img.reshape((1,) + img.shape) 23 | 24 | if 'cuda' in device: 25 | torch.backends.cudnn.benchmark = True 26 | 27 | img = torch.from_numpy(img).float().to(device) 28 | BB, CC, HH, WW = img.size() 29 | with torch.no_grad(): 30 | olist = net(img) 31 | 32 | bboxlist = [] 33 | for i in range(len(olist) // 2): 34 | olist[i * 2] = F.softmax(olist[i * 2], dim=1) 35 | olist = [oelem.data.cpu() for oelem in olist] 36 | for i in range(len(olist) // 2): 37 | ocls, oreg = olist[i * 2], olist[i * 2 + 1] 38 | FB, FC, FH, FW = ocls.size() # feature map size 39 | stride = 2**(i + 2) # 4,8,16,32,64,128 40 | anchor = stride * 4 41 | poss = zip(*np.where(ocls[:, 1, :, :] > 0.05)) 42 | for Iindex, hindex, windex in poss: 43 | axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride 44 | score = ocls[0, 1, hindex, windex] 45 | loc = oreg[0, :, hindex, windex].contiguous().view(1, 4) 46 | priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]) 47 | variances = [0.1, 0.2] 48 | box = decode(loc, priors, variances) 49 | x1, y1, x2, y2 = box[0] * 1.0 50 | # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1) 51 | bboxlist.append([x1, y1, x2, y2, score]) 52 | bboxlist = np.array(bboxlist) 53 | if 0 == len(bboxlist): 54 | bboxlist = np.zeros((1, 5)) 55 | 56 | return bboxlist 57 | 58 | def batch_detect(net, imgs, device): 59 | imgs = imgs - np.array([104, 117, 123]) 60 | imgs = imgs.transpose(0, 3, 1, 2) 61 | 62 | if 'cuda' in device: 63 | torch.backends.cudnn.benchmark = True 64 | 65 | imgs = torch.from_numpy(imgs).float().to(device) 66 | BB, CC, HH, WW = imgs.size() 67 | with torch.no_grad(): 68 | olist = net(imgs) 69 | 70 | bboxlist = [] 71 | for i in range(len(olist) // 2): 72 | olist[i * 2] = F.softmax(olist[i * 2], dim=1) 73 | olist = [oelem.data.cpu() for oelem in olist] 74 | for i in range(len(olist) // 2): 75 | ocls, oreg = olist[i * 2], olist[i * 2 + 1] 76 | FB, FC, FH, FW = ocls.size() # feature map size 77 | stride = 2**(i + 2) # 4,8,16,32,64,128 78 | anchor = stride * 4 79 | poss = zip(*np.where(ocls[:, 1, :, :] > 0.05)) 80 | for Iindex, hindex, windex in poss: 81 | axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride 82 | score = ocls[:, 1, hindex, windex] 83 | loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4) 84 | priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4) 85 | variances = [0.1, 0.2] 86 | box = batch_decode(loc, 
priors, variances) 87 | box = box[:, 0] * 1.0 88 | # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1) 89 | bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy()) 90 | bboxlist = np.array(bboxlist) 91 | if 0 == len(bboxlist): 92 | bboxlist = np.zeros((1, BB, 5)) 93 | 94 | return bboxlist 95 | 96 | def flip_detect(net, img, device): 97 | img = cv2.flip(img, 1) 98 | b = detect(net, img, device) 99 | 100 | bboxlist = np.zeros(b.shape) 101 | bboxlist[:, 0] = img.shape[1] - b[:, 2] 102 | bboxlist[:, 1] = b[:, 1] 103 | bboxlist[:, 2] = img.shape[1] - b[:, 0] 104 | bboxlist[:, 3] = b[:, 3] 105 | bboxlist[:, 4] = b[:, 4] 106 | return bboxlist 107 | 108 | 109 | def pts_to_bb(pts): 110 | min_x, min_y = np.min(pts, axis=0) 111 | max_x, max_y = np.max(pts, axis=0) 112 | return np.array([min_x, min_y, max_x, max_y]) 113 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/net_s3fd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class L2Norm(nn.Module): 7 | def __init__(self, n_channels, scale=1.0): 8 | super(L2Norm, self).__init__() 9 | self.n_channels = n_channels 10 | self.scale = scale 11 | self.eps = 1e-10 12 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 13 | self.weight.data *= 0.0 14 | self.weight.data += self.scale 15 | 16 | def forward(self, x): 17 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 18 | x = x / norm * self.weight.view(1, -1, 1, 1) 19 | return x 20 | 21 | 22 | class s3fd(nn.Module): 23 | def __init__(self): 24 | super(s3fd, self).__init__() 25 | self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1) 26 | self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) 27 | 28 | self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1) 29 | self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1) 30 | 31 | self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1) 32 | self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 33 | self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 34 | 35 | self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1) 36 | self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 37 | self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 38 | 39 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 40 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 41 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 42 | 43 | self.fc6 = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=3) 44 | self.fc7 = nn.Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0) 45 | 46 | self.conv6_1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0) 47 | self.conv6_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1) 48 | 49 | self.conv7_1 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0) 50 | self.conv7_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1) 51 | 52 | self.conv3_3_norm = L2Norm(256, scale=10) 53 | self.conv4_3_norm = L2Norm(512, scale=8) 54 | self.conv5_3_norm = L2Norm(512, scale=5) 55 | 56 | self.conv3_3_norm_mbox_conf = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1) 57 | self.conv3_3_norm_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1) 58 | 
self.conv4_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1) 59 | self.conv4_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1) 60 | self.conv5_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1) 61 | self.conv5_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1) 62 | 63 | self.fc7_mbox_conf = nn.Conv2d(1024, 2, kernel_size=3, stride=1, padding=1) 64 | self.fc7_mbox_loc = nn.Conv2d(1024, 4, kernel_size=3, stride=1, padding=1) 65 | self.conv6_2_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1) 66 | self.conv6_2_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1) 67 | self.conv7_2_mbox_conf = nn.Conv2d(256, 2, kernel_size=3, stride=1, padding=1) 68 | self.conv7_2_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1) 69 | 70 | def forward(self, x): 71 | h = F.relu(self.conv1_1(x)) 72 | h = F.relu(self.conv1_2(h)) 73 | h = F.max_pool2d(h, 2, 2) 74 | 75 | h = F.relu(self.conv2_1(h)) 76 | h = F.relu(self.conv2_2(h)) 77 | h = F.max_pool2d(h, 2, 2) 78 | 79 | h = F.relu(self.conv3_1(h)) 80 | h = F.relu(self.conv3_2(h)) 81 | h = F.relu(self.conv3_3(h)) 82 | f3_3 = h 83 | h = F.max_pool2d(h, 2, 2) 84 | 85 | h = F.relu(self.conv4_1(h)) 86 | h = F.relu(self.conv4_2(h)) 87 | h = F.relu(self.conv4_3(h)) 88 | f4_3 = h 89 | h = F.max_pool2d(h, 2, 2) 90 | 91 | h = F.relu(self.conv5_1(h)) 92 | h = F.relu(self.conv5_2(h)) 93 | h = F.relu(self.conv5_3(h)) 94 | f5_3 = h 95 | h = F.max_pool2d(h, 2, 2) 96 | 97 | h = F.relu(self.fc6(h)) 98 | h = F.relu(self.fc7(h)) 99 | ffc7 = h 100 | h = F.relu(self.conv6_1(h)) 101 | h = F.relu(self.conv6_2(h)) 102 | f6_2 = h 103 | h = F.relu(self.conv7_1(h)) 104 | h = F.relu(self.conv7_2(h)) 105 | f7_2 = h 106 | 107 | f3_3 = self.conv3_3_norm(f3_3) 108 | f4_3 = self.conv4_3_norm(f4_3) 109 | f5_3 = self.conv5_3_norm(f5_3) 110 | 111 | cls1 = self.conv3_3_norm_mbox_conf(f3_3) 112 | reg1 = self.conv3_3_norm_mbox_loc(f3_3) 113 | cls2 = self.conv4_3_norm_mbox_conf(f4_3) 114 | reg2 = self.conv4_3_norm_mbox_loc(f4_3) 115 | cls3 = self.conv5_3_norm_mbox_conf(f5_3) 116 | reg3 = self.conv5_3_norm_mbox_loc(f5_3) 117 | cls4 = self.fc7_mbox_conf(ffc7) 118 | reg4 = self.fc7_mbox_loc(ffc7) 119 | cls5 = self.conv6_2_mbox_conf(f6_2) 120 | reg5 = self.conv6_2_mbox_loc(f6_2) 121 | cls6 = self.conv7_2_mbox_conf(f7_2) 122 | reg6 = self.conv7_2_mbox_loc(f7_2) 123 | 124 | # max-out background label 125 | chunk = torch.chunk(cls1, 4, 1) 126 | bmax = torch.max(torch.max(chunk[0], chunk[1]), chunk[2]) 127 | cls1 = torch.cat([bmax, chunk[3]], dim=1) 128 | 129 | return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5, cls6, reg6] 130 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/detection/sfd/sfd_detector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from torch.utils.model_zoo import load_url 4 | import modules.shared as shared 5 | from ..core import FaceDetector 6 | 7 | from .net_s3fd import s3fd 8 | from .bbox import * 9 | from .detect import * 10 | 11 | models_urls = { 12 | 's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth', 13 | } 14 | 15 | 16 | class SFDDetector(FaceDetector): 17 | def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False): 18 | super(SFDDetector, self).__init__(device, verbose) 19 | 
shared.cmd_opts.disable_safe_unpickle = True 20 | # Initialise the face detector 21 | if not os.path.isfile(path_to_detector): 22 | model_weights = load_url(models_urls['s3fd']) 23 | else: 24 | model_weights = torch.load(path_to_detector) 25 | 26 | self.face_detector = s3fd() 27 | self.face_detector.load_state_dict(model_weights) 28 | self.face_detector.to(device) 29 | self.face_detector.eval() 30 | shared.cmd_opts.disable_safe_unpickle = False 31 | 32 | def detect_from_image(self, tensor_or_path): 33 | image = self.tensor_or_path_to_ndarray(tensor_or_path) 34 | 35 | bboxlist = detect(self.face_detector, image, device=self.device) 36 | keep = nms(bboxlist, 0.3) 37 | bboxlist = bboxlist[keep, :] 38 | bboxlist = [x for x in bboxlist if x[-1] > 0.5] 39 | 40 | return bboxlist 41 | 42 | def detect_from_batch(self, images): 43 | bboxlists = batch_detect(self.face_detector, images, device=self.device) 44 | keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])] 45 | bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)] 46 | bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists] 47 | 48 | return bboxlists 49 | 50 | @property 51 | def reference_scale(self): 52 | return 195 53 | 54 | @property 55 | def reference_x_shift(self): 56 | return 0 57 | 58 | @property 59 | def reference_y_shift(self): 60 | return 0 61 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | 7 | def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False): 8 | "3x3 convolution with padding" 9 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, 10 | stride=strd, padding=padding, bias=bias) 11 | 12 | 13 | class ConvBlock(nn.Module): 14 | def __init__(self, in_planes, out_planes): 15 | super(ConvBlock, self).__init__() 16 | self.bn1 = nn.BatchNorm2d(in_planes) 17 | self.conv1 = conv3x3(in_planes, int(out_planes / 2)) 18 | self.bn2 = nn.BatchNorm2d(int(out_planes / 2)) 19 | self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4)) 20 | self.bn3 = nn.BatchNorm2d(int(out_planes / 4)) 21 | self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4)) 22 | 23 | if in_planes != out_planes: 24 | self.downsample = nn.Sequential( 25 | nn.BatchNorm2d(in_planes), 26 | nn.ReLU(True), 27 | nn.Conv2d(in_planes, out_planes, 28 | kernel_size=1, stride=1, bias=False), 29 | ) 30 | else: 31 | self.downsample = None 32 | 33 | def forward(self, x): 34 | residual = x 35 | 36 | out1 = self.bn1(x) 37 | out1 = F.relu(out1, True) 38 | out1 = self.conv1(out1) 39 | 40 | out2 = self.bn2(out1) 41 | out2 = F.relu(out2, True) 42 | out2 = self.conv2(out2) 43 | 44 | out3 = self.bn3(out2) 45 | out3 = F.relu(out3, True) 46 | out3 = self.conv3(out3) 47 | 48 | out3 = torch.cat((out1, out2, out3), 1) 49 | 50 | if self.downsample is not None: 51 | residual = self.downsample(residual) 52 | 53 | out3 += residual 54 | 55 | return out3 56 | 57 | 58 | class Bottleneck(nn.Module): 59 | 60 | expansion = 4 61 | 62 | def __init__(self, inplanes, planes, stride=1, downsample=None): 63 | super(Bottleneck, self).__init__() 64 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 65 | self.bn1 = nn.BatchNorm2d(planes) 66 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 67 | padding=1, bias=False) 68 | self.bn2 = 
nn.BatchNorm2d(planes) 69 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 70 | self.bn3 = nn.BatchNorm2d(planes * 4) 71 | self.relu = nn.ReLU(inplace=True) 72 | self.downsample = downsample 73 | self.stride = stride 74 | 75 | def forward(self, x): 76 | residual = x 77 | 78 | out = self.conv1(x) 79 | out = self.bn1(out) 80 | out = self.relu(out) 81 | 82 | out = self.conv2(out) 83 | out = self.bn2(out) 84 | out = self.relu(out) 85 | 86 | out = self.conv3(out) 87 | out = self.bn3(out) 88 | 89 | if self.downsample is not None: 90 | residual = self.downsample(x) 91 | 92 | out += residual 93 | out = self.relu(out) 94 | 95 | return out 96 | 97 | 98 | class HourGlass(nn.Module): 99 | def __init__(self, num_modules, depth, num_features): 100 | super(HourGlass, self).__init__() 101 | self.num_modules = num_modules 102 | self.depth = depth 103 | self.features = num_features 104 | 105 | self._generate_network(self.depth) 106 | 107 | def _generate_network(self, level): 108 | self.add_module('b1_' + str(level), ConvBlock(self.features, self.features)) 109 | 110 | self.add_module('b2_' + str(level), ConvBlock(self.features, self.features)) 111 | 112 | if level > 1: 113 | self._generate_network(level - 1) 114 | else: 115 | self.add_module('b2_plus_' + str(level), ConvBlock(self.features, self.features)) 116 | 117 | self.add_module('b3_' + str(level), ConvBlock(self.features, self.features)) 118 | 119 | def _forward(self, level, inp): 120 | # Upper branch 121 | up1 = inp 122 | up1 = self._modules['b1_' + str(level)](up1) 123 | 124 | # Lower branch 125 | low1 = F.avg_pool2d(inp, 2, stride=2) 126 | low1 = self._modules['b2_' + str(level)](low1) 127 | 128 | if level > 1: 129 | low2 = self._forward(level - 1, low1) 130 | else: 131 | low2 = low1 132 | low2 = self._modules['b2_plus_' + str(level)](low2) 133 | 134 | low3 = low2 135 | low3 = self._modules['b3_' + str(level)](low3) 136 | 137 | up2 = F.interpolate(low3, scale_factor=2, mode='nearest') 138 | 139 | return up1 + up2 140 | 141 | def forward(self, x): 142 | return self._forward(self.depth, x) 143 | 144 | 145 | class FAN(nn.Module): 146 | 147 | def __init__(self, num_modules=1): 148 | super(FAN, self).__init__() 149 | self.num_modules = num_modules 150 | 151 | # Base part 152 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) 153 | self.bn1 = nn.BatchNorm2d(64) 154 | self.conv2 = ConvBlock(64, 128) 155 | self.conv3 = ConvBlock(128, 128) 156 | self.conv4 = ConvBlock(128, 256) 157 | 158 | # Stacking part 159 | for hg_module in range(self.num_modules): 160 | self.add_module('m' + str(hg_module), HourGlass(1, 4, 256)) 161 | self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256)) 162 | self.add_module('conv_last' + str(hg_module), 163 | nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) 164 | self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256)) 165 | self.add_module('l' + str(hg_module), nn.Conv2d(256, 166 | 68, kernel_size=1, stride=1, padding=0)) 167 | 168 | if hg_module < self.num_modules - 1: 169 | self.add_module( 170 | 'bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) 171 | self.add_module('al' + str(hg_module), nn.Conv2d(68, 172 | 256, kernel_size=1, stride=1, padding=0)) 173 | 174 | def forward(self, x): 175 | x = F.relu(self.bn1(self.conv1(x)), True) 176 | x = F.avg_pool2d(self.conv2(x), 2, stride=2) 177 | x = self.conv3(x) 178 | x = self.conv4(x) 179 | 180 | previous = x 181 | 182 | outputs = [] 183 | for i in range(self.num_modules): 184 | hg = 
self._modules['m' + str(i)](previous) 185 | 186 | ll = hg 187 | ll = self._modules['top_m_' + str(i)](ll) 188 | 189 | ll = F.relu(self._modules['bn_end' + str(i)] 190 | (self._modules['conv_last' + str(i)](ll)), True) 191 | 192 | # Predict heatmaps 193 | tmp_out = self._modules['l' + str(i)](ll) 194 | outputs.append(tmp_out) 195 | 196 | if i < self.num_modules - 1: 197 | ll = self._modules['bl' + str(i)](ll) 198 | tmp_out_ = self._modules['al' + str(i)](tmp_out) 199 | previous = previous + ll + tmp_out_ 200 | 201 | return outputs 202 | 203 | 204 | class ResNetDepth(nn.Module): 205 | 206 | def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes=68): 207 | self.inplanes = 64 208 | super(ResNetDepth, self).__init__() 209 | self.conv1 = nn.Conv2d(3 + 68, 64, kernel_size=7, stride=2, padding=3, 210 | bias=False) 211 | self.bn1 = nn.BatchNorm2d(64) 212 | self.relu = nn.ReLU(inplace=True) 213 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 214 | self.layer1 = self._make_layer(block, 64, layers[0]) 215 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 216 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 217 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 218 | self.avgpool = nn.AvgPool2d(7) 219 | self.fc = nn.Linear(512 * block.expansion, num_classes) 220 | 221 | for m in self.modules(): 222 | if isinstance(m, nn.Conv2d): 223 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 224 | m.weight.data.normal_(0, math.sqrt(2. / n)) 225 | elif isinstance(m, nn.BatchNorm2d): 226 | m.weight.data.fill_(1) 227 | m.bias.data.zero_() 228 | 229 | def _make_layer(self, block, planes, blocks, stride=1): 230 | downsample = None 231 | if stride != 1 or self.inplanes != planes * block.expansion: 232 | downsample = nn.Sequential( 233 | nn.Conv2d(self.inplanes, planes * block.expansion, 234 | kernel_size=1, stride=stride, bias=False), 235 | nn.BatchNorm2d(planes * block.expansion), 236 | ) 237 | 238 | layers = [] 239 | layers.append(block(self.inplanes, planes, stride, downsample)) 240 | self.inplanes = planes * block.expansion 241 | for i in range(1, blocks): 242 | layers.append(block(self.inplanes, planes)) 243 | 244 | return nn.Sequential(*layers) 245 | 246 | def forward(self, x): 247 | x = self.conv1(x) 248 | x = self.bn1(x) 249 | x = self.relu(x) 250 | x = self.maxpool(x) 251 | 252 | x = self.layer1(x) 253 | x = self.layer2(x) 254 | x = self.layer3(x) 255 | x = self.layer4(x) 256 | 257 | x = self.avgpool(x) 258 | x = x.view(x.size(0), -1) 259 | x = self.fc(x) 260 | 261 | return x 262 | -------------------------------------------------------------------------------- /scripts/wav2lip/face_detection/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import time 5 | import torch 6 | import math 7 | import numpy as np 8 | import cv2 9 | 10 | 11 | def _gaussian( 12 | size=3, sigma=0.25, amplitude=1, normalize=False, width=None, 13 | height=None, sigma_horz=None, sigma_vert=None, mean_horz=0.5, 14 | mean_vert=0.5): 15 | # handle some defaults 16 | if width is None: 17 | width = size 18 | if height is None: 19 | height = size 20 | if sigma_horz is None: 21 | sigma_horz = sigma 22 | if sigma_vert is None: 23 | sigma_vert = sigma 24 | center_x = mean_horz * width + 0.5 25 | center_y = mean_vert * height + 0.5 26 | gauss = np.empty((height, width), dtype=np.float32) 27 | # generate kernel 28 | for i in range(height): 29 | 
for j in range(width): 30 | gauss[i][j] = amplitude * math.exp(-(math.pow((j + 1 - center_x) / ( 31 | sigma_horz * width), 2) / 2.0 + math.pow((i + 1 - center_y) / (sigma_vert * height), 2) / 2.0)) 32 | if normalize: 33 | gauss = gauss / np.sum(gauss) 34 | return gauss 35 | 36 | 37 | def draw_gaussian(image, point, sigma): 38 | # Check if the gaussian is inside 39 | ul = [math.floor(point[0] - 3 * sigma), math.floor(point[1] - 3 * sigma)] 40 | br = [math.floor(point[0] + 3 * sigma), math.floor(point[1] + 3 * sigma)] 41 | if (ul[0] > image.shape[1] or ul[1] > image.shape[0] or br[0] < 1 or br[1] < 1): 42 | return image 43 | size = 6 * sigma + 1 44 | g = _gaussian(size) 45 | g_x = [int(max(1, -ul[0])), int(min(br[0], image.shape[1])) - int(max(1, ul[0])) + int(max(1, -ul[0]))] 46 | g_y = [int(max(1, -ul[1])), int(min(br[1], image.shape[0])) - int(max(1, ul[1])) + int(max(1, -ul[1]))] 47 | img_x = [int(max(1, ul[0])), int(min(br[0], image.shape[1]))] 48 | img_y = [int(max(1, ul[1])), int(min(br[1], image.shape[0]))] 49 | assert (g_x[0] > 0 and g_y[1] > 0) 50 | image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1] 51 | ] = image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]] + g[g_y[0] - 1:g_y[1], g_x[0] - 1:g_x[1]] 52 | image[image > 1] = 1 53 | return image 54 | 55 | 56 | def transform(point, center, scale, resolution, invert=False): 57 | """Generate and affine transformation matrix. 58 | 59 | Given a set of points, a center, a scale and a targer resolution, the 60 | function generates and affine transformation matrix. If invert is ``True`` 61 | it will produce the inverse transformation. 62 | 63 | Arguments: 64 | point {torch.tensor} -- the input 2D point 65 | center {torch.tensor or numpy.array} -- the center around which to perform the transformations 66 | scale {float} -- the scale of the face/object 67 | resolution {float} -- the output resolution 68 | 69 | Keyword Arguments: 70 | invert {bool} -- define wherever the function should produce the direct or the 71 | inverse transformation matrix (default: {False}) 72 | """ 73 | _pt = torch.ones(3) 74 | _pt[0] = point[0] 75 | _pt[1] = point[1] 76 | 77 | h = 200.0 * scale 78 | t = torch.eye(3) 79 | t[0, 0] = resolution / h 80 | t[1, 1] = resolution / h 81 | t[0, 2] = resolution * (-center[0] / h + 0.5) 82 | t[1, 2] = resolution * (-center[1] / h + 0.5) 83 | 84 | if invert: 85 | t = torch.inverse(t) 86 | 87 | new_point = (torch.matmul(t, _pt))[0:2] 88 | 89 | return new_point.int() 90 | 91 | 92 | def crop(image, center, scale, resolution=256.0): 93 | """Center crops an image or set of heatmaps 94 | 95 | Arguments: 96 | image {numpy.array} -- an rgb image 97 | center {numpy.array} -- the center of the object, usually the same as of the bounding box 98 | scale {float} -- scale of the face 99 | 100 | Keyword Arguments: 101 | resolution {float} -- the size of the output cropped image (default: {256.0}) 102 | 103 | Returns: 104 | [type] -- [description] 105 | """ # Crop around the center point 106 | """ Crops the image around the center. 
Input is expected to be an np.ndarray """ 107 | ul = transform([1, 1], center, scale, resolution, True) 108 | br = transform([resolution, resolution], center, scale, resolution, True) 109 | # pad = math.ceil(torch.norm((ul - br).float()) / 2.0 - (br[0] - ul[0]) / 2.0) 110 | if image.ndim > 2: 111 | newDim = np.array([br[1] - ul[1], br[0] - ul[0], 112 | image.shape[2]], dtype=np.int32) 113 | newImg = np.zeros(newDim, dtype=np.uint8) 114 | else: 115 | newDim = np.array([br[1] - ul[1], br[0] - ul[0]], dtype=np.int) 116 | newImg = np.zeros(newDim, dtype=np.uint8) 117 | ht = image.shape[0] 118 | wd = image.shape[1] 119 | newX = np.array( 120 | [max(1, -ul[0] + 1), min(br[0], wd) - ul[0]], dtype=np.int32) 121 | newY = np.array( 122 | [max(1, -ul[1] + 1), min(br[1], ht) - ul[1]], dtype=np.int32) 123 | oldX = np.array([max(1, ul[0] + 1), min(br[0], wd)], dtype=np.int32) 124 | oldY = np.array([max(1, ul[1] + 1), min(br[1], ht)], dtype=np.int32) 125 | newImg[newY[0] - 1:newY[1], newX[0] - 1:newX[1] 126 | ] = image[oldY[0] - 1:oldY[1], oldX[0] - 1:oldX[1], :] 127 | newImg = cv2.resize(newImg, dsize=(int(resolution), int(resolution)), 128 | interpolation=cv2.INTER_LINEAR) 129 | return newImg 130 | 131 | 132 | def get_preds_fromhm(hm, center=None, scale=None): 133 | """Obtain (x,y) coordinates given a set of N heatmaps. If the center 134 | and the scale is provided the function will return the points also in 135 | the original coordinate frame. 136 | 137 | Arguments: 138 | hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H] 139 | 140 | Keyword Arguments: 141 | center {torch.tensor} -- the center of the bounding box (default: {None}) 142 | scale {float} -- face scale (default: {None}) 143 | """ 144 | max, idx = torch.max( 145 | hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2) 146 | idx += 1 147 | preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float() 148 | preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1) 149 | preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1) 150 | 151 | for i in range(preds.size(0)): 152 | for j in range(preds.size(1)): 153 | hm_ = hm[i, j, :] 154 | pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1 155 | if pX > 0 and pX < 63 and pY > 0 and pY < 63: 156 | diff = torch.FloatTensor( 157 | [hm_[pY, pX + 1] - hm_[pY, pX - 1], 158 | hm_[pY + 1, pX] - hm_[pY - 1, pX]]) 159 | preds[i, j].add_(diff.sign_().mul_(.25)) 160 | 161 | preds.add_(-.5) 162 | 163 | preds_orig = torch.zeros(preds.size()) 164 | if center is not None and scale is not None: 165 | for i in range(hm.size(0)): 166 | for j in range(hm.size(1)): 167 | preds_orig[i, j] = transform( 168 | preds[i, j], center, scale, hm.size(2), True) 169 | 170 | return preds, preds_orig 171 | 172 | def get_preds_fromhm_batch(hm, centers=None, scales=None): 173 | """Obtain (x,y) coordinates given a set of N heatmaps. If the centers 174 | and the scales is provided the function will return the points also in 175 | the original coordinate frame. 
176 | 177 | Arguments: 178 | hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H] 179 | 180 | Keyword Arguments: 181 | centers {torch.tensor} -- the centers of the bounding box (default: {None}) 182 | scales {float} -- face scales (default: {None}) 183 | """ 184 | max, idx = torch.max( 185 | hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2) 186 | idx += 1 187 | preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float() 188 | preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1) 189 | preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1) 190 | 191 | for i in range(preds.size(0)): 192 | for j in range(preds.size(1)): 193 | hm_ = hm[i, j, :] 194 | pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1 195 | if pX > 0 and pX < 63 and pY > 0 and pY < 63: 196 | diff = torch.FloatTensor( 197 | [hm_[pY, pX + 1] - hm_[pY, pX - 1], 198 | hm_[pY + 1, pX] - hm_[pY - 1, pX]]) 199 | preds[i, j].add_(diff.sign_().mul_(.25)) 200 | 201 | preds.add_(-.5) 202 | 203 | preds_orig = torch.zeros(preds.size()) 204 | if centers is not None and scales is not None: 205 | for i in range(hm.size(0)): 206 | for j in range(hm.size(1)): 207 | preds_orig[i, j] = transform( 208 | preds[i, j], centers[i], scales[i], hm.size(2), True) 209 | 210 | return preds, preds_orig 211 | 212 | def shuffle_lr(parts, pairs=None): 213 | """Shuffle the points left-right according to the axis of symmetry 214 | of the object. 215 | 216 | Arguments: 217 | parts {torch.tensor} -- a 3D or 4D object containing the 218 | heatmaps. 219 | 220 | Keyword Arguments: 221 | pairs {list of integers} -- [order of the flipped points] (default: {None}) 222 | """ 223 | if pairs is None: 224 | pairs = [16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 225 | 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 27, 28, 29, 30, 35, 226 | 34, 33, 32, 31, 45, 44, 43, 42, 47, 46, 39, 38, 37, 36, 41, 227 | 40, 54, 53, 52, 51, 50, 49, 48, 59, 58, 57, 56, 55, 64, 63, 228 | 62, 61, 60, 67, 66, 65] 229 | if parts.ndimension() == 3: 230 | parts = parts[pairs, ...] 231 | else: 232 | parts = parts[:, pairs, ...] 233 | 234 | return parts 235 | 236 | 237 | def flip(tensor, is_label=False): 238 | """Flip an image or a set of heatmaps left-right 239 | 240 | Arguments: 241 | tensor {numpy.array or torch.tensor} -- [the input image or heatmaps] 242 | 243 | Keyword Arguments: 244 | is_label {bool} -- [denote wherever the input is an image or a set of heatmaps ] (default: {False}) 245 | """ 246 | if not torch.is_tensor(tensor): 247 | tensor = torch.from_numpy(tensor) 248 | 249 | if is_label: 250 | tensor = shuffle_lr(tensor).flip(tensor.ndimension() - 1) 251 | else: 252 | tensor = tensor.flip(tensor.ndimension() - 1) 253 | 254 | return tensor 255 | 256 | # From pyzolib/paths.py (https://bitbucket.org/pyzo/pyzolib/src/tip/paths.py) 257 | 258 | 259 | def appdata_dir(appname=None, roaming=False): 260 | """ appdata_dir(appname=None, roaming=False) 261 | 262 | Get the path to the application directory, where applications are allowed 263 | to write user specific files (e.g. configurations). For non-user specific 264 | data, consider using common_appdata_dir(). 265 | If appname is given, a subdir is appended (and created if necessary). 266 | If roaming is True, will prefer a roaming directory (Windows Vista/7). 
267 | """ 268 | 269 | # Define default user directory 270 | userDir = os.getenv('FACEALIGNMENT_USERDIR', None) 271 | if userDir is None: 272 | userDir = os.path.expanduser('~') 273 | if not os.path.isdir(userDir): # pragma: no cover 274 | userDir = '/var/tmp' # issue #54 275 | 276 | # Get system app data dir 277 | path = None 278 | if sys.platform.startswith('win'): 279 | path1, path2 = os.getenv('LOCALAPPDATA'), os.getenv('APPDATA') 280 | path = (path2 or path1) if roaming else (path1 or path2) 281 | elif sys.platform.startswith('darwin'): 282 | path = os.path.join(userDir, 'Library', 'Application Support') 283 | # On Linux and as fallback 284 | if not (path and os.path.isdir(path)): 285 | path = userDir 286 | 287 | # Maybe we should store things local to the executable (in case of a 288 | # portable distro or a frozen application that wants to be portable) 289 | prefix = sys.prefix 290 | if getattr(sys, 'frozen', None): 291 | prefix = os.path.abspath(os.path.dirname(sys.executable)) 292 | for reldir in ('settings', '../settings'): 293 | localpath = os.path.abspath(os.path.join(prefix, reldir)) 294 | if os.path.isdir(localpath): # pragma: no cover 295 | try: 296 | open(os.path.join(localpath, 'test.write'), 'wb').close() 297 | os.remove(os.path.join(localpath, 'test.write')) 298 | except IOError: 299 | pass # We cannot write in this directory 300 | else: 301 | path = localpath 302 | break 303 | 304 | # Get path specific for this app 305 | if appname: 306 | if path == userDir: 307 | appname = '.' + appname.lstrip('.') # Make it a hidden directory 308 | path = os.path.join(path, appname) 309 | if not os.path.isdir(path): # pragma: no cover 310 | os.mkdir(path) 311 | 312 | # Done 313 | return path 314 | -------------------------------------------------------------------------------- /scripts/wav2lip/hparams.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_image_list(data_root, split): 5 | filelist = [] 6 | 7 | with open('filelists/{}.txt'.format(split)) as f: 8 | for line in f: 9 | line = line.strip() 10 | if ' ' in line: line = line.split()[0] 11 | filelist.append(os.path.join(data_root, line)) 12 | 13 | return filelist 14 | 15 | 16 | class HParams: 17 | def __init__(self, **kwargs): 18 | self.data = {} 19 | 20 | for key, value in kwargs.items(): 21 | self.data[key] = value 22 | 23 | def __getattr__(self, key): 24 | if key not in self.data: 25 | raise AttributeError("'HParams' object has no attribute %s" % key) 26 | return self.data[key] 27 | 28 | def set_hparam(self, key, value): 29 | self.data[key] = value 30 | 31 | 32 | # Default hyperparameters 33 | hparams = HParams( 34 | num_mels=80, # Number of mel-spectrogram channels and local conditioning dimensionality 35 | # network 36 | rescale=True, # Whether to rescale audio prior to preprocessing 37 | rescaling_max=0.9, # Rescaling value 38 | 39 | # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction 40 | # It"s preferred to set True to use with https://github.com/r9y9/wavenet_vocoder 41 | # Does not work if n_ffit is not multiple of hop_size!! 
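    # i.e. n_fft must be an integer multiple of hop_size when use_lws=True;
    # with the defaults below, 800 / 200 = 4, so the constraint is satisfied.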
42 | use_lws=False, 43 | 44 | n_fft=800, # Extra window size is filled with 0 paddings to match this parameter 45 | hop_size=200, # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate) 46 | win_size=800, # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate) 47 | sample_rate=16000, # 16000Hz (corresponding to librispeech) (sox --i ) 48 | 49 | frame_shift_ms=None, # Can replace hop_size parameter. (Recommended: 12.5) 50 | 51 | # Mel and Linear spectrograms normalization/scaling and clipping 52 | signal_normalization=True, 53 | # Whether to normalize mel spectrograms to some predefined range (following below parameters) 54 | allow_clipping_in_normalization=True, # Only relevant if mel_normalization = True 55 | symmetric_mels=True, 56 | # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2, 57 | # faster and cleaner convergence) 58 | max_abs_value=4., 59 | # max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not 60 | # be too big to avoid gradient explosion, 61 | # not too small for fast convergence) 62 | # Contribution by @begeekmyfriend 63 | # Spectrogram Pre-Emphasis (Lfilter: Reduce spectrogram noise and helps model certitude 64 | # levels. Also allows for better G&L phase reconstruction) 65 | preemphasize=True, # whether to apply filter 66 | preemphasis=0.97, # filter coefficient. 67 | 68 | # Limits 69 | min_level_db=-100, 70 | ref_level_db=20, 71 | fmin=55, 72 | # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To 73 | # test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 74 | fmax=7600, # To be increased/reduced depending on data. 75 | 76 | ###################### Our training parameters ################################# 77 | img_size=96, 78 | fps=25, 79 | 80 | batch_size=16, 81 | initial_learning_rate=1e-4, 82 | nepochs=200000000000000000, 83 | ### ctrl + c, stop whenever eval loss is consistently greater than train loss for ~10 epochs 84 | num_workers=16, 85 | checkpoint_interval=3000, 86 | eval_interval=3000, 87 | save_optimizer_state=True, 88 | 89 | syncnet_wt=0.0, # is initially zero, will be set automatically to 0.03 later. Leads to faster convergence. 
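    # Side note on the audio/video alignment implied by the settings above:
    # sample_rate=16000 with hop_size=200 gives 16000 / 200 = 80 mel frames per
    # second, i.e. 80 / fps = 80 / 25 = 3.2 mel steps per video frame.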
90 | syncnet_batch_size=64, 91 | syncnet_lr=1e-4, 92 | syncnet_eval_interval=10000, 93 | syncnet_checkpoint_interval=10000, 94 | 95 | disc_wt=0.07, 96 | disc_initial_learning_rate=1e-4, 97 | ) 98 | 99 | 100 | def hparams_debug_string(): 101 | values = hparams.values() 102 | hp = [" %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"] 103 | return "Hyperparameters:\n" + "\n".join(hp) 104 | -------------------------------------------------------------------------------- /scripts/wav2lip/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wav2lip import Wav2Lip, Wav2Lip_disc_qual 2 | from .syncnet import SyncNet_color -------------------------------------------------------------------------------- /scripts/wav2lip/models/conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | class Conv2d(nn.Module): 6 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.conv_block = nn.Sequential( 9 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 10 | nn.BatchNorm2d(cout) 11 | ) 12 | self.act = nn.ReLU() 13 | self.residual = residual 14 | 15 | def forward(self, x): 16 | out = self.conv_block(x) 17 | if self.residual: 18 | out += x 19 | return self.act(out) 20 | 21 | class nonorm_Conv2d(nn.Module): 22 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs): 23 | super().__init__(*args, **kwargs) 24 | self.conv_block = nn.Sequential( 25 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 26 | ) 27 | self.act = nn.LeakyReLU(0.01, inplace=True) 28 | 29 | def forward(self, x): 30 | out = self.conv_block(x) 31 | return self.act(out) 32 | 33 | class Conv2dTranspose(nn.Module): 34 | def __init__(self, cin, cout, kernel_size, stride, padding, output_padding=0, *args, **kwargs): 35 | super().__init__(*args, **kwargs) 36 | self.conv_block = nn.Sequential( 37 | nn.ConvTranspose2d(cin, cout, kernel_size, stride, padding, output_padding), 38 | nn.BatchNorm2d(cout) 39 | ) 40 | self.act = nn.ReLU() 41 | 42 | def forward(self, x): 43 | out = self.conv_block(x) 44 | return self.act(out) 45 | -------------------------------------------------------------------------------- /scripts/wav2lip/models/syncnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from .conv import Conv2d 6 | 7 | class SyncNet_color(nn.Module): 8 | def __init__(self): 9 | super(SyncNet_color, self).__init__() 10 | 11 | self.face_encoder = nn.Sequential( 12 | Conv2d(15, 32, kernel_size=(7, 7), stride=1, padding=3), 13 | 14 | Conv2d(32, 64, kernel_size=5, stride=(1, 2), padding=1), 15 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 16 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 17 | 18 | Conv2d(64, 128, kernel_size=3, stride=2, padding=1), 19 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 20 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 21 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 22 | 23 | Conv2d(128, 256, kernel_size=3, stride=2, padding=1), 24 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 25 | Conv2d(256, 256, kernel_size=3, stride=1, 
padding=1, residual=True), 26 | 27 | Conv2d(256, 512, kernel_size=3, stride=2, padding=1), 28 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True), 29 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True), 30 | 31 | Conv2d(512, 512, kernel_size=3, stride=2, padding=1), 32 | Conv2d(512, 512, kernel_size=3, stride=1, padding=0), 33 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 34 | 35 | self.audio_encoder = nn.Sequential( 36 | Conv2d(1, 32, kernel_size=3, stride=1, padding=1), 37 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 38 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 39 | 40 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 41 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 42 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 43 | 44 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 45 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 46 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 47 | 48 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 49 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 50 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 51 | 52 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 53 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 54 | 55 | def forward(self, audio_sequences, face_sequences): # audio_sequences := (B, dim, T) 56 | face_embedding = self.face_encoder(face_sequences) 57 | audio_embedding = self.audio_encoder(audio_sequences) 58 | 59 | audio_embedding = audio_embedding.view(audio_embedding.size(0), -1) 60 | face_embedding = face_embedding.view(face_embedding.size(0), -1) 61 | 62 | audio_embedding = F.normalize(audio_embedding, p=2, dim=1) 63 | face_embedding = F.normalize(face_embedding, p=2, dim=1) 64 | 65 | 66 | return audio_embedding, face_embedding 67 | -------------------------------------------------------------------------------- /scripts/wav2lip/models/wav2lip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import math 5 | 6 | from .conv import Conv2dTranspose, Conv2d, nonorm_Conv2d 7 | 8 | class Wav2Lip(nn.Module): 9 | def __init__(self): 10 | super(Wav2Lip, self).__init__() 11 | 12 | self.face_encoder_blocks = nn.ModuleList([ 13 | nn.Sequential(Conv2d(6, 16, kernel_size=7, stride=1, padding=3)), # 96,96 14 | 15 | nn.Sequential(Conv2d(16, 32, kernel_size=3, stride=2, padding=1), # 48,48 16 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 17 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True)), 18 | 19 | nn.Sequential(Conv2d(32, 64, kernel_size=3, stride=2, padding=1), # 24,24 20 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 21 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 22 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True)), 23 | 24 | nn.Sequential(Conv2d(64, 128, kernel_size=3, stride=2, padding=1), # 12,12 25 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 26 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True)), 27 | 28 | nn.Sequential(Conv2d(128, 256, kernel_size=3, stride=2, padding=1), # 6,6 29 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 30 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, 
residual=True)), 31 | 32 | nn.Sequential(Conv2d(256, 512, kernel_size=3, stride=2, padding=1), # 3,3 33 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),), 34 | 35 | nn.Sequential(Conv2d(512, 512, kernel_size=3, stride=1, padding=0), # 1, 1 36 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0)),]) 37 | 38 | self.audio_encoder = nn.Sequential( 39 | Conv2d(1, 32, kernel_size=3, stride=1, padding=1), 40 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 41 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 42 | 43 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 44 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 45 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 46 | 47 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 48 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 49 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 50 | 51 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 52 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 53 | 54 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 55 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 56 | 57 | self.face_decoder_blocks = nn.ModuleList([ 58 | nn.Sequential(Conv2d(512, 512, kernel_size=1, stride=1, padding=0),), 59 | 60 | nn.Sequential(Conv2dTranspose(1024, 512, kernel_size=3, stride=1, padding=0), # 3,3 61 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),), 62 | 63 | nn.Sequential(Conv2dTranspose(1024, 512, kernel_size=3, stride=2, padding=1, output_padding=1), 64 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True), 65 | Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),), # 6, 6 66 | 67 | nn.Sequential(Conv2dTranspose(768, 384, kernel_size=3, stride=2, padding=1, output_padding=1), 68 | Conv2d(384, 384, kernel_size=3, stride=1, padding=1, residual=True), 69 | Conv2d(384, 384, kernel_size=3, stride=1, padding=1, residual=True),), # 12, 12 70 | 71 | nn.Sequential(Conv2dTranspose(512, 256, kernel_size=3, stride=2, padding=1, output_padding=1), 72 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 73 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),), # 24, 24 74 | 75 | nn.Sequential(Conv2dTranspose(320, 128, kernel_size=3, stride=2, padding=1, output_padding=1), 76 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 77 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),), # 48, 48 78 | 79 | nn.Sequential(Conv2dTranspose(160, 64, kernel_size=3, stride=2, padding=1, output_padding=1), 80 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 81 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),),]) # 96,96 82 | 83 | self.output_block = nn.Sequential(Conv2d(80, 32, kernel_size=3, stride=1, padding=1), 84 | nn.Conv2d(32, 3, kernel_size=1, stride=1, padding=0), 85 | nn.Sigmoid()) 86 | 87 | def forward(self, audio_sequences, face_sequences): 88 | # audio_sequences = (B, T, 1, 80, 16) 89 | B = audio_sequences.size(0) 90 | 91 | input_dim_size = len(face_sequences.size()) 92 | if input_dim_size > 4: 93 | audio_sequences = torch.cat([audio_sequences[:, i] for i in range(audio_sequences.size(1))], dim=0) 94 | face_sequences = torch.cat([face_sequences[:, :, i] for i in range(face_sequences.size(2))], dim=0) 95 | 96 | audio_embedding = self.audio_encoder(audio_sequences) # B, 512, 1, 1 
97 | 98 | feats = [] 99 | x = face_sequences 100 | for f in self.face_encoder_blocks: 101 | x = f(x) 102 | feats.append(x) 103 | 104 | x = audio_embedding 105 | for f in self.face_decoder_blocks: 106 | x = f(x) 107 | try: 108 | x = torch.cat((x, feats[-1]), dim=1) 109 | except Exception as e: 110 | print(x.size()) 111 | print(feats[-1].size()) 112 | raise e 113 | 114 | feats.pop() 115 | 116 | x = self.output_block(x) 117 | 118 | if input_dim_size > 4: 119 | x = torch.split(x, B, dim=0) # [(B, C, H, W)] 120 | outputs = torch.stack(x, dim=2) # (B, C, T, H, W) 121 | 122 | else: 123 | outputs = x 124 | 125 | return outputs 126 | 127 | class Wav2Lip_disc_qual(nn.Module): 128 | def __init__(self): 129 | super(Wav2Lip_disc_qual, self).__init__() 130 | 131 | self.face_encoder_blocks = nn.ModuleList([ 132 | nn.Sequential(nonorm_Conv2d(3, 32, kernel_size=7, stride=1, padding=3)), # 48,96 133 | 134 | nn.Sequential(nonorm_Conv2d(32, 64, kernel_size=5, stride=(1, 2), padding=2), # 48,48 135 | nonorm_Conv2d(64, 64, kernel_size=5, stride=1, padding=2)), 136 | 137 | nn.Sequential(nonorm_Conv2d(64, 128, kernel_size=5, stride=2, padding=2), # 24,24 138 | nonorm_Conv2d(128, 128, kernel_size=5, stride=1, padding=2)), 139 | 140 | nn.Sequential(nonorm_Conv2d(128, 256, kernel_size=5, stride=2, padding=2), # 12,12 141 | nonorm_Conv2d(256, 256, kernel_size=5, stride=1, padding=2)), 142 | 143 | nn.Sequential(nonorm_Conv2d(256, 512, kernel_size=3, stride=2, padding=1), # 6,6 144 | nonorm_Conv2d(512, 512, kernel_size=3, stride=1, padding=1)), 145 | 146 | nn.Sequential(nonorm_Conv2d(512, 512, kernel_size=3, stride=2, padding=1), # 3,3 147 | nonorm_Conv2d(512, 512, kernel_size=3, stride=1, padding=1),), 148 | 149 | nn.Sequential(nonorm_Conv2d(512, 512, kernel_size=3, stride=1, padding=0), # 1, 1 150 | nonorm_Conv2d(512, 512, kernel_size=1, stride=1, padding=0)),]) 151 | 152 | self.binary_pred = nn.Sequential(nn.Conv2d(512, 1, kernel_size=1, stride=1, padding=0), nn.Sigmoid()) 153 | self.label_noise = .0 154 | 155 | def get_lower_half(self, face_sequences): 156 | return face_sequences[:, :, face_sequences.size(2)//2:] 157 | 158 | def to_2d(self, face_sequences): 159 | B = face_sequences.size(0) 160 | face_sequences = torch.cat([face_sequences[:, :, i] for i in range(face_sequences.size(2))], dim=0) 161 | return face_sequences 162 | 163 | def perceptual_forward(self, false_face_sequences): 164 | false_face_sequences = self.to_2d(false_face_sequences) 165 | false_face_sequences = self.get_lower_half(false_face_sequences) 166 | 167 | false_feats = false_face_sequences 168 | for f in self.face_encoder_blocks: 169 | false_feats = f(false_feats) 170 | 171 | false_pred_loss = F.binary_cross_entropy(self.binary_pred(false_feats).view(len(false_feats), -1), 172 | torch.ones((len(false_feats), 1)).cuda()) 173 | 174 | return false_pred_loss 175 | 176 | def forward(self, face_sequences): 177 | face_sequences = self.to_2d(face_sequences) 178 | face_sequences = self.get_lower_half(face_sequences) 179 | 180 | x = face_sequences 181 | for f in self.face_encoder_blocks: 182 | x = f(x) 183 | 184 | return self.binary_pred(x).view(len(x), -1) 185 | -------------------------------------------------------------------------------- /scripts/wav2lip/output/debug/README.md: -------------------------------------------------------------------------------- 1 | debug folder -------------------------------------------------------------------------------- /scripts/wav2lip/output/face_enhanced/README.md: 
-------------------------------------------------------------------------------- 1 | Enhanced file folder -------------------------------------------------------------------------------- /scripts/wav2lip/output/faceswap/README.md: -------------------------------------------------------------------------------- 1 | faceswap file folder -------------------------------------------------------------------------------- /scripts/wav2lip/output/final/README.md: -------------------------------------------------------------------------------- 1 | image generated by stable diffusion -------------------------------------------------------------------------------- /scripts/wav2lip/predicator/README.md: -------------------------------------------------------------------------------- 1 | Place shape_predictor_68_face_landmarks.dat here -------------------------------------------------------------------------------- /scripts/wav2lip/results/README.md: -------------------------------------------------------------------------------- 1 | Generated results will be placed in this folder by default. -------------------------------------------------------------------------------- /scripts/wav2lip/temp/README.md: -------------------------------------------------------------------------------- 1 | Temporary files at the time of inference/testing will be saved here. You can ignore them. -------------------------------------------------------------------------------- /scripts/wav2lip/w2l.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gc 3 | import cv2, os, scripts.wav2lip.audio as audio 4 | import subprocess 5 | from tqdm import tqdm 6 | import torch, scripts.wav2lip.face_detection as face_detection 7 | from scripts.wav2lip.models import Wav2Lip 8 | import modules.shared as shared 9 | from pkg_resources import resource_filename 10 | 11 | 12 | class W2l: 13 | def __init__(self, face, audio, checkpoint, nosmooth, resize_factor, pad_top, pad_bottom, pad_left, pad_right, face_swap_img): 14 | self.wav2lip_folder = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-1]) 15 | self.static = False 16 | if os.path.isfile(face) and face.split('.')[1] in ['jpg', 'png', 'jpeg']: 17 | self.static = True 18 | 19 | self.img_size = 96 20 | self.face = face 21 | self.audio = audio 22 | self.checkpoint = checkpoint 23 | self.mel_step_size = 16 24 | self.face_det_batch_size = 16 25 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 26 | self.pads = [pad_top, pad_bottom, pad_left, pad_right] 27 | self.face_swap_img = face_swap_img 28 | self.nosmooth = nosmooth 29 | self.box = [-1, -1, -1, -1] 30 | self.wav2lip_batch_size = 128 31 | self.fps = 25 32 | self.resize_factor = resize_factor 33 | self.rotate = False 34 | self.crop = [0, -1, 0, -1] 35 | self.checkpoint_path = self.wav2lip_folder + '/checkpoints/' + self.checkpoint + '.pth' 36 | self.outfile = self.wav2lip_folder + '/results/result_voice.mp4' 37 | print('Using {} for inference.'.format(self.device)) 38 | self.ffmpeg_binary = self.find_ffmpeg_binary() 39 | 40 | def find_ffmpeg_binary(self): 41 | for package in ['imageio_ffmpeg', 'imageio-ffmpeg']: 42 | try: 43 | package_path = resource_filename(package, 'binaries') 44 | files = [os.path.join(package_path, f) for f in os.listdir(package_path) if f.startswith("ffmpeg-")] 45 | files.sort(key=lambda x: os.path.getmtime(x), reverse=True) 46 | return files[0] if files else 'ffmpeg' 47 | except: 48 | return 'ffmpeg' 49 | 50 | def 
execute_command(self, command): 51 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 52 | stdout, stderr = process.communicate() 53 | if process.returncode != 0: 54 | raise RuntimeError(stderr) 55 | 56 | def get_smoothened_boxes(self, boxes, T): 57 | for i in range(len(boxes)): 58 | if i + T > len(boxes): 59 | window = boxes[len(boxes) - T:] 60 | else: 61 | window = boxes[i: i + T] 62 | boxes[i] = np.mean(window, axis=0) 63 | return boxes 64 | 65 | def face_detect(self, images): 66 | detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, 67 | flip_input=False, device=self.device) 68 | 69 | batch_size = self.face_det_batch_size 70 | 71 | while 1: 72 | predictions = [] 73 | try: 74 | for i in tqdm(range(0, len(images), batch_size)): 75 | predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size]))) 76 | except RuntimeError: 77 | if batch_size == 1: 78 | raise RuntimeError( 79 | 'Image too big to run face detection on GPU. Please use the --resize_factor argument') 80 | batch_size //= 2 81 | print('Recovering from OOM error; New batch size: {}'.format(batch_size)) 82 | continue 83 | break 84 | 85 | results = [] 86 | pady1, pady2, padx1, padx2 = self.pads 87 | n = 0 88 | for rect, image in zip(predictions, images): 89 | if rect is None: 90 | print("Hum : " + str(n)) 91 | cv2.imwrite(self.wav2lip_folder + '/temp/faulty_frame.jpg', 92 | image) # check this frame where the face was not detected. 93 | raise ValueError('Face not detected! Ensure the video contains a face in all the frames.') 94 | 95 | y1 = max(0, rect[1] - pady1) 96 | y2 = min(image.shape[0], rect[3] + pady2) 97 | x1 = max(0, rect[0] - padx1) 98 | x2 = min(image.shape[1], rect[2] + padx2) 99 | 100 | results.append([x1, y1, x2, y2]) 101 | n += 1 102 | 103 | boxes = np.array(results) 104 | if not self.nosmooth: boxes = self.get_smoothened_boxes(boxes, T=5) 105 | results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)] 106 | 107 | del detector 108 | return results 109 | 110 | def datagen(self, frames, mels): 111 | img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] 112 | 113 | if self.box[0] == -1: 114 | if not self.static: 115 | face_det_results = self.face_detect(frames) # BGR2RGB for CNN face detection 116 | else: 117 | face_det_results = self.face_detect([frames[0]]) 118 | else: 119 | print('Using the specified bounding box instead of face detection...') 120 | y1, y2, x1, x2 = self.box 121 | face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames] 122 | 123 | for i, m in enumerate(mels): 124 | idx = 0 if self.static else i % len(frames) 125 | frame_to_save = frames[idx].copy() 126 | face, coords = face_det_results[idx].copy() 127 | 128 | face = cv2.resize(face, (self.img_size, self.img_size)) 129 | 130 | img_batch.append(face) 131 | mel_batch.append(m) 132 | frame_batch.append(frame_to_save) 133 | coords_batch.append(coords) 134 | 135 | if len(img_batch) >= self.wav2lip_batch_size: 136 | img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) 137 | 138 | img_masked = img_batch.copy() 139 | img_masked[:, self.img_size // 2:] = 0 140 | 141 | img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. 
142 | mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) 143 | 144 | yield img_batch, mel_batch, frame_batch, coords_batch 145 | img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] 146 | 147 | if len(img_batch) > 0: 148 | img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) 149 | 150 | img_masked = img_batch.copy() 151 | img_masked[:, self.img_size // 2:] = 0 152 | 153 | img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. 154 | mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) 155 | 156 | yield img_batch, mel_batch, frame_batch, coords_batch 157 | 158 | def _load(self, checkpoint_path): 159 | shared.cmd_opts.disable_safe_unpickle = True 160 | if self.device == 'cuda': 161 | checkpoint = torch.load(checkpoint_path) 162 | else: 163 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 164 | shared.cmd_opts.disable_safe_unpickle = False 165 | return checkpoint 166 | 167 | def load_model(self, path): 168 | model = Wav2Lip() 169 | print("Load checkpoint from: {}".format(path)) 170 | checkpoint = self._load(path) 171 | s = checkpoint["state_dict"] 172 | new_s = {} 173 | for k, v in s.items(): 174 | new_s[k.replace('module.', '')] = v 175 | model.load_state_dict(new_s) 176 | 177 | model = model.to(self.device) 178 | return model.eval() 179 | 180 | def execute(self): 181 | if not os.path.isfile(self.face): 182 | raise ValueError('--face argument must be a valid path to video/image file') 183 | 184 | elif self.face.split('.')[1] in ['jpg', 'png', 'jpeg']: 185 | full_frames = [cv2.imread(self.face)] 186 | fps = self.fps 187 | 188 | else: 189 | video_stream = cv2.VideoCapture(self.face) 190 | fps = video_stream.get(cv2.CAP_PROP_FPS) 191 | 192 | print('Reading video frames...') 193 | 194 | full_frames = [] 195 | while 1: 196 | still_reading, frame = video_stream.read() 197 | if not still_reading: 198 | video_stream.release() 199 | break 200 | if self.resize_factor > 1 and self.face_swap_img is None: 201 | frame = cv2.resize(frame, 202 | (frame.shape[1] // self.resize_factor, frame.shape[0] // self.resize_factor)) 203 | 204 | if self.rotate: 205 | frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE) 206 | 207 | y1, y2, x1, x2 = self.crop 208 | if x2 == -1: x2 = frame.shape[1] 209 | if y2 == -1: y2 = frame.shape[0] 210 | 211 | frame = frame[y1:y2, x1:x2] 212 | 213 | full_frames.append(frame) 214 | 215 | print("Number of frames available for inference: " + str(len(full_frames))) 216 | 217 | if not self.audio.endswith('.wav'): 218 | print('Extracting raw audio...') 219 | command = [self.ffmpeg_binary, "-y", "-i", self.audio, "-strict", "-2", 220 | self.wav2lip_folder + "/temp/temp.wav"] 221 | 222 | self.execute_command(command) 223 | self.audio = self.wav2lip_folder + '/temp/temp.wav' 224 | 225 | wav = audio.load_wav(self.audio, 16000) 226 | mel = audio.melspectrogram(wav) 227 | print(mel.shape) 228 | 229 | if np.isnan(mel.reshape(-1)).sum() > 0: 230 | raise ValueError( 231 | 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again') 232 | 233 | mel_chunks = [] 234 | mel_idx_multiplier = 80. 
/ fps 235 | i = 0 236 | while 1: 237 | start_idx = int(i * mel_idx_multiplier) 238 | if start_idx + self.mel_step_size > len(mel[0]): 239 | mel_chunks.append(mel[:, len(mel[0]) - self.mel_step_size:]) 240 | break 241 | mel_chunks.append(mel[:, start_idx: start_idx + self.mel_step_size]) 242 | i += 1 243 | 244 | print("Length of mel chunks: {}".format(len(mel_chunks))) 245 | 246 | full_frames = full_frames[:len(mel_chunks)] 247 | 248 | batch_size = self.wav2lip_batch_size 249 | gen = self.datagen(full_frames.copy(), mel_chunks) 250 | 251 | for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, 252 | total=int( 253 | np.ceil( 254 | float(len(mel_chunks)) / batch_size)))): 255 | if i == 0: 256 | model = self.load_model(self.checkpoint_path) 257 | print("Model loaded") 258 | 259 | frame_h, frame_w = full_frames[0].shape[:-1] 260 | out = cv2.VideoWriter(self.wav2lip_folder + '/temp/result.avi', 261 | cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h)) 262 | 263 | img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(self.device) 264 | mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(self.device) 265 | 266 | with torch.no_grad(): 267 | pred = model(mel_batch, img_batch) 268 | 269 | pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255. 270 | 271 | for p, f, c in zip(pred, frames, coords): 272 | y1, y2, x1, x2 = c 273 | p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1)) 274 | 275 | f[y1:y2, x1:x2] = p 276 | out.write(f) 277 | 278 | out.release() 279 | # release memory 280 | model.cpu() 281 | del model 282 | torch.cuda.empty_cache() 283 | gc.collect() 284 | 285 | command = [self.ffmpeg_binary, "-y", "-i", self.audio, "-i", self.wav2lip_folder + '/temp/result.avi', 286 | "-strict", "-2", "-q:v", "1", self.outfile] 287 | self.execute_command(command) 288 | -------------------------------------------------------------------------------- /scripts/wav2lip/wav2lip_uhq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import cv2 4 | import dlib 5 | import json 6 | import torch 7 | import scripts.wav2lip.face_detection as face_detection 8 | from imutils import face_utils 9 | import subprocess 10 | from modules.shared import state, opts 11 | from pkg_resources import resource_filename 12 | import modules.face_restoration 13 | from modules import devices 14 | 15 | 16 | class Wav2LipUHQ: 17 | def __init__(self, face, face_restore_model, mouth_mask_dilatation, erode_face_mask, mask_blur, only_mouth, 18 | face_swap_img, resize_factor, code_former_weight, debug=False): 19 | self.wav2lip_folder = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-1]) 20 | self.original_video = face 21 | self.face_restore_model = face_restore_model 22 | self.mouth_mask_dilatation = mouth_mask_dilatation 23 | self.erode_face_mask = erode_face_mask 24 | self.mask_blur = mask_blur 25 | self.only_mouth = only_mouth 26 | self.face_swap_img = face_swap_img 27 | self.w2l_video = self.wav2lip_folder + '/results/result_voice.mp4' 28 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 29 | self.ffmpeg_binary = self.find_ffmpeg_binary() 30 | self.resize_factor = resize_factor 31 | self.code_former_weight = code_former_weight 32 | self.debug = debug 33 | 34 | def find_ffmpeg_binary(self): 35 | for package in ['imageio_ffmpeg', 'imageio-ffmpeg']: 36 | try: 37 | package_path = resource_filename(package, 'binaries') 38 | files = [os.path.join(package_path, f) for f in 
os.listdir(package_path) if f.startswith("ffmpeg-")] 39 | files.sort(key=lambda x: os.path.getmtime(x), reverse=True) 40 | return files[0] if files else 'ffmpeg' 41 | except: 42 | return 'ffmpeg' 43 | 44 | def assure_path_exists(self, path): 45 | dir = os.path.dirname(path) 46 | if not os.path.exists(dir): 47 | os.makedirs(dir) 48 | 49 | def get_framerate(self, video_file): 50 | video = cv2.VideoCapture(video_file) 51 | fps = video.get(cv2.CAP_PROP_FPS) 52 | video.release() 53 | return fps 54 | 55 | def execute_command(self, command): 56 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 57 | stdout, stderr = process.communicate() 58 | if process.returncode != 0: 59 | raise RuntimeError(stderr) 60 | 61 | def create_video_from_images(self, nb_frames): 62 | fps = str(self.get_framerate(self.w2l_video)) 63 | command = [self.ffmpeg_binary, "-y", "-framerate", fps, "-start_number", "0", "-i", 64 | self.wav2lip_folder + "/output/final/output_%05d.png", "-vframes", 65 | str(nb_frames), "-c:v", "libx264", "-pix_fmt", "yuv420p", "-b:v", "8000k", 66 | self.wav2lip_folder + "/output/video.mp4"] 67 | 68 | self.execute_command(command) 69 | 70 | command = [self.ffmpeg_binary, "-y", "-framerate", fps, "-start_number", "0", "-i", 71 | self.wav2lip_folder + "/output/face_enhanced/face_restore_%05d.png", "-vframes", 72 | str(nb_frames), "-c:v", "libx264", "-pix_fmt", "yuv420p", "-b:v", "8000k", 73 | self.wav2lip_folder + "/output/video_enhanced.mp4"] 74 | 75 | self.execute_command(command) 76 | 77 | def extract_audio_from_video(self): 78 | command = [self.ffmpeg_binary, "-y", "-i", self.w2l_video, "-vn", "-acodec", "copy", 79 | self.wav2lip_folder + "/output/output_audio.aac"] 80 | self.execute_command(command) 81 | 82 | def add_audio_to_video(self): 83 | command = [self.ffmpeg_binary, "-y", "-i", self.wav2lip_folder + "/output/video.mp4", "-i", 84 | self.wav2lip_folder + "/output/output_audio.aac", "-c:v", "copy", "-c:a", "aac", "-strict", 85 | "experimental", self.wav2lip_folder + "/output/output_video.mp4"] 86 | self.execute_command(command) 87 | 88 | command = [self.ffmpeg_binary, "-y", "-i", self.wav2lip_folder + "/output/video_enhanced.mp4", "-i", 89 | self.wav2lip_folder + "/output/output_audio.aac", "-c:v", "copy", "-c:a", "aac", "-strict", 90 | "experimental", self.wav2lip_folder + "/output/output_video_enhanced.mp4"] 91 | self.execute_command(command) 92 | 93 | def initialize_dlib_predictor(self): 94 | print("[INFO] Loading the predictor...") 95 | detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, 96 | flip_input=False, device=self.device) 97 | predictor = dlib.shape_predictor(self.wav2lip_folder + "/predicator/shape_predictor_68_face_landmarks.dat") 98 | return detector, predictor 99 | 100 | def initialize_video_streams(self): 101 | print("[INFO] Loading File...") 102 | vs = cv2.VideoCapture(self.w2l_video) 103 | vi = cv2.VideoCapture(self.original_video) 104 | return vs, vi 105 | 106 | def dilate_mouth(self, mouth, w, h): 107 | mask = np.zeros((w, h), dtype=np.uint8) 108 | cv2.fillPoly(mask, [mouth], 255) 109 | kernel = np.ones((self.mouth_mask_dilatation, self.mouth_mask_dilatation), np.uint8) 110 | dilated_mask = cv2.dilate(mask, kernel, iterations=1) 111 | contours, _ = cv2.findContours(dilated_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 112 | dilated_points = contours[0].squeeze() 113 | return dilated_points 114 | 115 | def execute(self, resume=False): 116 | output_dir = self.wav2lip_folder + '/output/' 
117 | debug_path = output_dir + "debug/" 118 | face_enhanced_path = output_dir + "face_enhanced/" 119 | final_path = output_dir + 'final/' 120 | detector, predictor = self.initialize_dlib_predictor() 121 | vs, vi = self.initialize_video_streams() 122 | (mstart, mend) = face_utils.FACIAL_LANDMARKS_IDXS["mouth"] 123 | (jstart, jend) = face_utils.FACIAL_LANDMARKS_IDXS["jaw"] 124 | (nstart, nend) = face_utils.FACIAL_LANDMARKS_IDXS["nose"] 125 | 126 | max_frame = str(int(vs.get(cv2.CAP_PROP_FRAME_COUNT))) 127 | original_codeformer_weight = opts.code_former_weight 128 | original_face_restoration_model = opts.face_restoration_model 129 | 130 | opts.code_former_weight = self.code_former_weight 131 | opts.face_restoration_model = self.face_restore_model 132 | 133 | frame_number = 0 134 | if resume: 135 | if os.path.exists(self.wav2lip_folder + "/resume.json"): 136 | with open(self.wav2lip_folder + "/resume.json", "r") as f: 137 | parameters = json.load(f) 138 | # Read frame 139 | for f in range(parameters["frame"]): 140 | _, _ = vs.read() 141 | ret, _ = vi.read() 142 | if not ret: 143 | vi.release() 144 | vi = cv2.VideoCapture(self.original_video) 145 | _, _ = vi.read() 146 | frame_number = parameters["frame"] 147 | print("Face Restoration model: " + str(opts.face_restoration_model)) 148 | 149 | while True: 150 | print("[INFO] Processing frame: " + str(frame_number) + " of " + max_frame + " - ", end="\r") 151 | f_number = str(frame_number).rjust(5, '0') 152 | if state.interrupted: 153 | break 154 | 155 | # Read frame 156 | ret, w2l_frame = vs.read() 157 | if not ret: 158 | break 159 | 160 | ret, original_frame = vi.read() 161 | if not ret: 162 | vi.release() 163 | vi = cv2.VideoCapture(self.original_video) 164 | ret, original_frame = vi.read() 165 | 166 | if w2l_frame.shape != original_frame.shape: 167 | if self.resize_factor > 1 and self.face_swap_img is None: 168 | original_frame = cv2.resize(original_frame, (w2l_frame.shape[1], w2l_frame.shape[0])) 169 | else: 170 | w2l_frame = cv2.resize(w2l_frame, (original_frame.shape[1], original_frame.shape[0])) 171 | 172 | # Convert to gray 173 | original_gray = cv2.cvtColor(original_frame, cv2.COLOR_RGB2GRAY) 174 | 175 | # Restore face 176 | w2l_frame_to_restore = cv2.cvtColor(w2l_frame, cv2.COLOR_BGR2RGB) 177 | image_restored = modules.face_restoration.restore_faces(w2l_frame_to_restore) 178 | 179 | image_restored2 = cv2.cvtColor(image_restored, cv2.COLOR_RGB2BGR) 180 | cv2.imwrite(face_enhanced_path + "face_restore_" + f_number + ".png", image_restored2) 181 | image_restored_gray = cv2.cvtColor(image_restored2, cv2.COLOR_RGB2GRAY) 182 | 183 | # Detect faces 184 | rects = detector.get_detections_for_batch(np.array([np.array(image_restored2)])) 185 | 186 | # Initialize mask 187 | mask = np.zeros_like(original_gray) 188 | 189 | # Process each detected face 190 | for (i, rect) in enumerate(rects): 191 | # Get face coordinates 192 | if not self.only_mouth: 193 | shape = predictor(original_gray, dlib.rectangle(*rect)) 194 | shape = face_utils.shape_to_np(shape) 195 | jaw = shape[jstart:jend][1:-1] 196 | nose = shape[nstart:nend][2] 197 | 198 | # Get mouth coordinates 199 | shape = predictor(image_restored_gray, dlib.rectangle(*rect)) 200 | shape = face_utils.shape_to_np(shape) 201 | 202 | mouth = shape[mstart:mend][:-8] 203 | mouth = np.delete(mouth, [3], axis=0) 204 | if self.mouth_mask_dilatation > 0: 205 | mouth = self.dilate_mouth(mouth, original_gray.shape[0], original_gray.shape[1]) 206 | 207 | # Create mask for face 208 | if not self.only_mouth: 209 | 
external_shape = np.append(jaw, [nose], axis=0) 210 | external_shape_pts = external_shape.reshape((-1, 1, 2)) 211 | mask = cv2.fillPoly(mask, [external_shape_pts], 255) 212 | if self.erode_face_mask > 0: 213 | kernel = np.ones((self.erode_face_mask, self.erode_face_mask), np.uint8) 214 | mask = cv2.erode(mask, kernel, iterations=1) 215 | # Calculate diff between frames and apply threshold 216 | diff = np.abs(original_gray - image_restored_gray) 217 | diff[diff > 10] = 255 218 | diff[diff <= 10] = 0 219 | masked_diff = cv2.bitwise_and(diff, diff, mask=mask) 220 | else: 221 | masked_diff = mask 222 | 223 | # Create mask for mouth 224 | cv2.fillConvexPoly(masked_diff, mouth, 255) 225 | 226 | # Save mask 227 | if self.mask_blur > 0: 228 | blur = self.mask_blur if self.mask_blur % 2 == 1 else self.mask_blur - 1 229 | masked_save = cv2.GaussianBlur(masked_diff, (blur, blur), 0) 230 | else: 231 | masked_save = masked_diff 232 | 233 | original = original_frame.copy() 234 | 235 | # Apply restored face to original image with mask attention 236 | extended_mask = np.stack([masked_save] * 3, axis=-1) 237 | normalized_mask = extended_mask / 255.0 238 | dst = image_restored2 * normalized_mask 239 | original = original * (1 - normalized_mask) + dst 240 | original = original.astype(np.uint8) 241 | 242 | # Save final image 243 | cv2.imwrite(final_path + "output_" + f_number + ".png", original) 244 | 245 | if self.debug: 246 | clone = w2l_frame.copy() 247 | if not self.only_mouth: 248 | for (x, y) in np.concatenate((jaw, mouth, [nose])): 249 | cv2.circle(clone, (x, y), 1, (0, 0, 255), -1) 250 | else: 251 | for (x, y) in mouth: 252 | cv2.circle(clone, (x, y), 1, (0, 0, 255), -1) 253 | if not self.only_mouth: 254 | cv2.imwrite(debug_path + "diff_" + f_number + ".png", diff) 255 | cv2.imwrite(debug_path + "points_" + f_number + ".png", clone) 256 | cv2.imwrite(debug_path + 'mask_' + f_number + '.png', masked_save) 257 | cv2.imwrite(debug_path + 'original_' + f_number + '.png', original_frame) 258 | cv2.imwrite(debug_path + "face_restore_" + f_number + ".png", image_restored2) 259 | cv2.imwrite(debug_path + "dst_" + f_number + ".png", dst) 260 | 261 | frame_number += 1 262 | opts.code_former_weight = original_codeformer_weight 263 | opts.face_restoration_model = original_face_restoration_model 264 | devices.torch_gc() 265 | if frame_number > 1: 266 | vs.release() 267 | vi.release() 268 | 269 | print("[INFO] Create Videos output!") 270 | self.create_video_from_images(frame_number - 1) 271 | print("[INFO] Extract Audio from input!") 272 | self.extract_audio_from_video() 273 | print("[INFO] Add Audio to Videos!") 274 | self.add_audio_to_video() 275 | print("[INFO] Done! 
output videos saved in output/output_video.mp4 and output/output_video_enhanced.mp4") 276 | 277 | if str(frame_number) != max_frame: 278 | parameters = {"frame": frame_number} 279 | with open(self.wav2lip_folder + "/resume.json", 'w') as f: 280 | json.dump(parameters, f) 281 | else: 282 | if os.path.exists(self.wav2lip_folder + "/resume.json"): 283 | os.remove(self.wav2lip_folder + "/resume.json") 284 | if self.face_swap_img is None: 285 | face_swap_output = None 286 | else: 287 | face_swap_output = self.wav2lip_folder + "/output/faceswap/video.mp4" 288 | return [face_swap_output, 289 | self.wav2lip_folder + "/results/result_voice.mp4", 290 | self.wav2lip_folder + "/output/output_video_enhanced.mp4", 291 | self.wav2lip_folder + "/output/output_video.mp4"] 292 | else: 293 | print("[INFO] Interrupted!") 294 | return None 295 | -------------------------------------------------------------------------------- /scripts/wav2lip_uhq.py: -------------------------------------------------------------------------------- 1 | from modules import script_callbacks 2 | from scripts.wav2lip_uhq_extend_paths import wav2lip_uhq_sys_extend 3 | 4 | 5 | def init_wav2lip_uhq(): 6 | wav2lip_uhq_sys_extend() 7 | from ui import on_ui_tabs 8 | script_callbacks.on_ui_tabs(on_ui_tabs) 9 | 10 | 11 | init_wav2lip_uhq() 12 | -------------------------------------------------------------------------------- /scripts/wav2lip_uhq_extend_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def wav2lip_uhq_sys_extend(): 6 | wav2lip_uhq_folder_name = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-2]) 7 | 8 | basedirs = [os.getcwd()] 9 | for _ in basedirs: 10 | wav2lip_uhq_paths_to_ensure = [os.path.join(wav2lip_uhq_folder_name, 'scripts')] 11 | for wav2lip_uhq_scripts_path_fix in wav2lip_uhq_paths_to_ensure: 12 | if wav2lip_uhq_scripts_path_fix not in sys.path: 13 | sys.path.extend([wav2lip_uhq_scripts_path_fix]) 14 | --------------------------------------------------------------------------------
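The two main classes listed above form a two-stage pipeline: W2l (scripts/wav2lip/w2l.py) renders the raw lip-synced video to scripts/wav2lip/results/result_voice.mp4, and Wav2LipUHQ (scripts/wav2lip/wav2lip_uhq.py) reads that file back, restores and masks the mouth region frame by frame, and writes output/output_video.mp4 and output/output_video_enhanced.mp4. The following is a minimal, hypothetical sketch of that chaining, not the extension's own entry point: it assumes the code runs inside the AUTOMATIC1111 web UI process (so that modules.shared and modules.face_restoration resolve), that wav2lip_gan.pth and shape_predictor_68_face_landmarks.dat are already downloaded into their checkpoint/predicator folders, and that the input paths and tuning values shown are illustrative placeholders.

from scripts.wav2lip.w2l import W2l
from scripts.wav2lip.wav2lip_uhq import Wav2LipUHQ

# Hypothetical inputs -- replace with real files.
face_video = "demo/speaker.mp4"
voice_audio = "demo/speech.wav"

# Stage 1: Wav2Lip inference; writes scripts/wav2lip/results/result_voice.mp4.
w2l = W2l(face_video, voice_audio, "wav2lip_gan", nosmooth=False,
          resize_factor=1, pad_top=0, pad_bottom=10, pad_left=0, pad_right=0,
          face_swap_img=None)
w2l.execute()

# Stage 2: per-frame mouth masking + face restoration; writes
# output/output_video.mp4 and output/output_video_enhanced.mp4
# under scripts/wav2lip/output/.
uhq = Wav2LipUHQ(face_video, "CodeFormer", mouth_mask_dilatation=15,
                 erode_face_mask=15, mask_blur=25, only_mouth=False,
                 face_swap_img=None, resize_factor=1, code_former_weight=0.75,
                 debug=False)
videos = uhq.execute(resume=False)  # list of output paths, or None if interrupted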