├── LICENSE
├── README.md
├── __pycache__
│   ├── audio.cpython-37.pyc
│   └── hparams.cpython-37.pyc
├── audio.py
├── basicsr
│   ├── README.md
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── apply_sr.cpython-37.pyc
│   │   ├── test.cpython-37.pyc
│   │   └── train.cpython-37.pyc
│   ├── apply_sr.py
│   ├── archs
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── arch_util.cpython-37.pyc
│   │   │   ├── dfdnet_arch.cpython-37.pyc
│   │   │   ├── dfdnet_util.cpython-37.pyc
│   │   │   ├── discriminator_arch.cpython-37.pyc
│   │   │   ├── duf_arch.cpython-37.pyc
│   │   │   ├── edsr_arch.cpython-37.pyc
│   │   │   ├── edvr_arch.cpython-37.pyc
│   │   │   ├── rcan_arch.cpython-37.pyc
│   │   │   ├── ridnet_arch.cpython-37.pyc
│   │   │   ├── rrdbnet_arch.cpython-37.pyc
│   │   │   ├── spynet_arch.cpython-37.pyc
│   │   │   ├── srresnet_arch.cpython-37.pyc
│   │   │   ├── stylegan2_arch.cpython-37.pyc
│   │   │   ├── tof_arch.cpython-37.pyc
│   │   │   └── vgg_arch.cpython-37.pyc
│   │   ├── arch_util.py
│   │   ├── dfdnet_arch.py
│   │   ├── dfdnet_util.py
│   │   ├── discriminator_arch.py
│   │   ├── duf_arch.py
│   │   ├── edsr_arch.py
│   │   ├── edvr_arch.py
│   │   ├── inception.py
│   │   ├── rcan_arch.py
│   │   ├── ridnet_arch.py
│   │   ├── rrdbnet_arch.py
│   │   ├── spynet_arch.py
│   │   ├── srresnet_arch.py
│   │   ├── stylegan2_arch.py
│   │   ├── tof_arch.py
│   │   └── vgg_arch.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── data_sampler.cpython-37.pyc
│   │   │   ├── data_util.cpython-37.pyc
│   │   │   ├── ffhq_dataset.cpython-37.pyc
│   │   │   ├── paired_image_dataset.cpython-37.pyc
│   │   │   ├── prefetch_dataloader.cpython-37.pyc
│   │   │   ├── reds_dataset.cpython-37.pyc
│   │   │   ├── single_image_dataset.cpython-37.pyc
│   │   │   ├── transforms.cpython-37.pyc
│   │   │   ├── video_test_dataset.cpython-37.pyc
│   │   │   └── vimeo90k_dataset.cpython-37.pyc
│   │   ├── data_sampler.py
│   │   ├── data_util.py
│   │   ├── degradations.py
│   │   ├── ffhq_dataset.py
│   │   ├── meta_info
│   │   │   ├── meta_info_DIV2K800sub_GT.txt
│   │   │   ├── meta_info_REDS4_test_GT.txt
│   │   │   ├── meta_info_REDS_GT.txt
│   │   │   ├── meta_info_REDSofficial4_test_GT.txt
│   │   │   ├── meta_info_REDSval_official_test_GT.txt
│   │   │   ├── meta_info_Vimeo90K_test_GT.txt
│   │   │   ├── meta_info_Vimeo90K_test_fast_GT.txt
│   │   │   ├── meta_info_Vimeo90K_test_medium_GT.txt
│   │   │   ├── meta_info_Vimeo90K_test_slow_GT.txt
│   │   │   └── meta_info_Vimeo90K_train_GT.txt
│   │   ├── paired_image_dataset.py
│   │   ├── prefetch_dataloader.py
│   │   ├── reds_dataset.py
│   │   ├── single_image_dataset.py
│   │   ├── transforms.py
│   │   ├── video_test_dataset.py
│   │   └── vimeo90k_dataset.py
│   ├── losses
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── loss_util.cpython-37.pyc
│   │   │   └── losses.cpython-37.pyc
│   │   ├── loss_util.py
│   │   └── losses.py
│   ├── metrics
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── metric_util.cpython-37.pyc
│   │   │   ├── niqe.cpython-37.pyc
│   │   │   └── psnr_ssim.cpython-37.pyc
│   │   ├── fid.py
│   │   ├── metric_util.py
│   │   ├── niqe.py
│   │   ├── niqe_pris_params.npz
│   │   └── psnr_ssim.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── base_model.cpython-37.pyc
│   │   │   ├── edvr_model.cpython-37.pyc
│   │   │   ├── esrgan_model.cpython-37.pyc
│   │   │   ├── lr_scheduler.cpython-37.pyc
│   │   │   ├── sr_model.cpython-37.pyc
│   │   │   ├── srgan_model.cpython-37.pyc
│   │   │   ├── stylegan2_model.cpython-37.pyc
│   │   │   ├── video_base_model.cpython-37.pyc
│   │   │   └── video_gan_model.cpython-37.pyc
│   │   ├── base_model.py
│   │   ├── edvr_model.py
│   │   ├── esrgan_model.py
│   │   ├── lr_scheduler.py
│   │   ├── sr_model.py
│   │   ├── srgan_model.py
│   │   ├── stylegan2_model.py
│   │   ├── video_base_model.py
│   │   └── video_gan_model.py
│   ├── ops
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   └── __init__.cpython-37.pyc
│   │   ├── dcn
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── deform_conv.cpython-37.pyc
│   │   │   ├── deform_conv.py
│   │   │   └── src
│   │   │       ├── deform_conv_cuda.cpp
│   │   │       ├── deform_conv_cuda_kernel.cu
│   │   │       └── deform_conv_ext.cpp
│   │   ├── fused_act
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── fused_act.cpython-37.pyc
│   │   │   ├── fused_act.py
│   │   │   └── src
│   │   │       ├── fused_bias_act.cpp
│   │   │       └── fused_bias_act_kernel.cu
│   │   └── upfirdn2d
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-37.pyc
│   │       │   └── upfirdn2d.cpython-37.pyc
│   │       ├── src
│   │       │   ├── upfirdn2d.cpp
│   │       │   └── upfirdn2d_kernel.cu
│   │       └── upfirdn2d.py
│   ├── test.py
│   ├── train.py
│   └── utils
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-37.pyc
│       │   ├── dist_util.cpython-37.pyc
│       │   ├── file_client.cpython-37.pyc
│       │   ├── flow_util.cpython-37.pyc
│       │   ├── img_util.cpython-37.pyc
│       │   ├── logger.cpython-37.pyc
│       │   ├── matlab_functions.cpython-37.pyc
│       │   ├── misc.cpython-37.pyc
│       │   ├── options.cpython-37.pyc
│       │   └── registry.cpython-37.pyc
│       ├── dist_util.py
│       ├── download_util.py
│       ├── face_util.py
│       ├── file_client.py
│       ├── flow_util.py
│       ├── img_util.py
│       ├── lmdb_util.py
│       ├── logger.py
│       ├── matlab_functions.py
│       ├── misc.py
│       ├── options.py
│       └── registry.py
├── checkpoints
│   └── readme.md
├── download_models.py
├── examples
│   ├── 1_hd.jpg
│   ├── 1_low.jpg
│   ├── kennedy_hd.jpg
│   ├── kennedy_hd.mkv
│   ├── kennedy_low.jpg
│   ├── kennedy_low.mp4
│   ├── mona_hd.jpg
│   ├── mona_hd.mkv
│   ├── mona_low.jpg
│   └── mona_low.mp4
├── experiments
│   └── 001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb
│       └── models
│           └── readme.md
├── face_detection
│   ├── README.md
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── api.cpython-37.pyc
│   │   ├── models.cpython-37.pyc
│   │   └── utils.cpython-37.pyc
│   ├── api.py
│   ├── detection
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   └── core.cpython-37.pyc
│   │   ├── core.py
│   │   └── sfd
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-37.pyc
│   │       │   ├── bbox.cpython-37.pyc
│   │       │   ├── detect.cpython-37.pyc
│   │       │   ├── net_s3fd.cpython-37.pyc
│   │       │   └── sfd_detector.cpython-37.pyc
│   │       ├── bbox.py
│   │       ├── detect.py
│   │       ├── net_s3fd.py
│   │       ├── readme.md
│   │       └── sfd_detector.py
│   ├── models.py
│   └── utils.py
├── face_parsing
│   ├── README.md
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── model.cpython-37.pyc
│   │   ├── resnet.cpython-37.pyc
│   │   └── swap.cpython-37.pyc
│   ├── model.py
│   ├── resnet.py
│   └── swap.py
├── hparams.py
├── inference.py
├── input_audios
│   └── ai.wav
├── input_videos
│   ├── README.md
│   ├── kennedy.mp4
│   └── mona.mp4
├── output_videos_hd
│   ├── kennedy.mkv
│   └── mona.mkv
├── output_videos_wav2lip
│   ├── kennedy.mp4
│   └── mona.mp4
├── requirements.txt
├── resizeframes.py
├── results
│   └── README.md
├── run_final.sh
├── tb_logger
│   ├── 001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb
│   │   ├── events.out.tfevents.1680350184.user1-Alienware-Aurora-R9.591624.0
│   │   ├── events.out.tfevents.1680351829.user1-Alienware-Aurora-R9.593185.0
│   │   └── events.out.tfevents.1680351874.user1-Alienware-Aurora-R9.593246.0
│   ├── 001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb_archived_20230401_052824
│   │   ├── events.out.tfevents.1680319497.user1-Alienware-Aurora-R9.580638.0
│   │   └── events.out.tfevents.1680319611.user1-Alienware-Aurora-R9.580758.0
│   ├── 001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb_archived_20230401_135250
│   │   ├── events.out.tfevents.1680319705.user1-Alienware-Aurora-R9.580840.0
│   │   ├── events.out.tfevents.1680319744.user1-Alienware-Aurora-R9.580886.0
│   │   ├── events.out.tfevents.1680319774.user1-Alienware-Aurora-R9.580937.0
│   │   ├── events.out.tfevents.1680319845.user1-Alienware-Aurora-R9.581036.0
│   │   ├── events.out.tfevents.1680319869.user1-Alienware-Aurora-R9.581085.0
│   │   ├── events.out.tfevents.1680319883.user1-Alienware-Aurora-R9.581137.0
│   │   ├── events.out.tfevents.1680349247.user1-Alienware-Aurora-R9.590276.0
│   │   ├── events.out.tfevents.1680349479.user1-Alienware-Aurora-R9.590615.0
│   │   ├── events.out.tfevents.1680349575.user1-Alienware-Aurora-R9.590713.0
│   │   ├── events.out.tfevents.1680349630.user1-Alienware-Aurora-R9.590788.0
│   │   ├── events.out.tfevents.1680349693.user1-Alienware-Aurora-R9.590885.0
│   │   └── events.out.tfevents.1680349735.user1-Alienware-Aurora-R9.590947.0
│   ├── 001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb_archived_20230401_135406
│   │   └── events.out.tfevents.1680349971.user1-Alienware-Aurora-R9.591190.0
│   ├── 001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb_archived_20230401_135512
│   │   └── events.out.tfevents.1680350047.user1-Alienware-Aurora-R9.591301.0
│   └── 001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb_archived_20230401_135623
│       └── events.out.tfevents.1680350113.user1-Alienware-Aurora-R9.591460.0
├── temp
│   ├── README.md
│   ├── result.avi
│   └── temp.wav
├── train.py
├── train_basicsr.yml
├── video2frames.py
└── wav2lip_models
    ├── README.md
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-37.pyc
    │   ├── conv.cpython-37.pyc
    │   ├── syncnet.cpython-37.pyc
    │   └── wav2lip.cpython-37.pyc
    ├── conv.py
    ├── syncnet.py
    └── wav2lip.py
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Saif Hassan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

NOTE: PLEASE READ LICENSE REQUIREMENTS (COPYRIGHTS INFORMATION) FROM `WAV2LIP OFFICIAL REPO`, MENTIONED ON MAIN PAGE OF THIS REPOSITORY.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Wav2Lip-HD: Improving Wav2Lip to achieve High-Fidelity Videos

This repository contains code for achieving high-fidelity lip-syncing in videos, using the [Wav2Lip algorithm](https://github.com/Rudrabha/Wav2Lip) for lip-syncing and the [Real-ESRGAN algorithm](https://github.com/xinntao/Real-ESRGAN) for super-resolution. The combination of these two algorithms allows for the creation of lip-synced videos that are both highly accurate and visually sharp.

## Algorithm

The algorithm for achieving high-fidelity lip-syncing with Wav2Lip and Real-ESRGAN can be summarized as follows:

1. The input video and audio are given to the `Wav2Lip` algorithm.
2. A Python script extracts the individual frames from the video generated by Wav2Lip.
3. The frames are passed to the Real-ESRGAN algorithm to improve their quality (see the sketch after this list).
4. The high-quality frames are then converted back into a video using ffmpeg, together with the original audio.
5. The result is a high-quality lip-synced video.
6. The specific steps for running this algorithm are described in the [Testing Model](https://github.com/saifhassan/Wav2Lip-HD#testing-model) section of this README.

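To make steps 2–4 concrete, here is a minimal sketch of the frame-extraction and re-encoding stages, assuming OpenCV (`opencv-python`) and the `ffmpeg` CLI are installed. The function names, frame-file pattern, and paths are illustrative only; the repository's actual drivers are `video2frames.py`, `inference.py`, and `run_final.sh`, and the Real-ESRGAN stage is left as a placeholder comment.

```
import subprocess
from pathlib import Path

import cv2  # pip install opencv-python


def extract_frames(video_path: str, out_dir: str) -> None:
    """Step 2: dump every frame of the Wav2Lip output video to disk."""
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        cv2.imwrite(f"{out_dir}/frame_{idx:05d}.jpg", frame)
        idx += 1
    cap.release()


def frames_to_video(frames_dir: str, audio_path: str, out_path: str, fps: float = 25.0) -> None:
    """Step 4: re-encode the super-resolved frames together with the original audio."""
    subprocess.run(
        ["ffmpeg", "-y",
         "-framerate", str(fps),
         "-i", f"{frames_dir}/frame_%05d.jpg",
         "-i", audio_path,
         "-c:v", "libx264", "-pix_fmt", "yuv420p",
         "-c:a", "aac", "-shortest",
         out_path],
        check=True,
    )


extract_frames("output_videos_wav2lip/kennedy.mp4", "frames_wav2lip")
# Step 3 runs here: apply Real-ESRGAN to frames_wav2lip/ and write the
# upscaled images to frames_hd/ (see the Real-ESRGAN repo for usage).
frames_to_video("frames_hd", "input_audios/ai.wav", "output_videos_hd/kennedy.mkv")
```

`run_final.sh` chains these stages end to end, so in normal use you only run that script.
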
## Testing Model

To test the "Wav2Lip-HD" model, follow these steps:

1. Clone this repository and install the requirements using the following commands (make sure Python and CUDA are already installed):

```
git clone https://github.com/saifhassan/Wav2Lip-HD.git
cd Wav2Lip-HD
pip install -r requirements.txt
```

2. Download the pretrained weights and place each file in the directory listed below (a scripted alternative is sketched after the table):

| Model | Directory | Download Link |
| :------------- |:-------------| :-----:|
| Wav2Lip | [checkpoints/](https://github.com/saifhassan/Wav2Lip-HD/tree/main/checkpoints) | [Link](https://drive.google.com/drive/folders/1tB_uz-TYMePRMZzrDMdShWUZZ0JK3SIZ?usp=sharing) |
| ESRGAN | [experiments/001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb/models/](https://github.com/saifhassan/Wav2Lip-HD/tree/main/experiments/001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb/models) | [Link](https://drive.google.com/file/d/1Al8lEpnx2K-kDX7zL2DBcAuDnSKXACPb/view?usp=sharing) |
| Face_Detection | [face_detection/detection/sfd/](https://github.com/saifhassan/Wav2Lip-HD/tree/main/face_detection/detection/sfd) | [Link](https://drive.google.com/file/d/1uNLYCPFFmO-og3WSHyFytJQLLYOwH5uY/view?usp=sharing) |
| Real-ESRGAN | Real-ESRGAN/gfpgan/weights/ | [Link](https://drive.google.com/drive/folders/1BLx6aMpHgFt41fJ27_cRmT8bt53kVAYG?usp=sharing) |
| Real-ESRGAN | Real-ESRGAN/weights/ | [Link](https://drive.google.com/file/d/1qNIf8cJl_dQo3ivelPJVWFkApyEAGnLi/view?usp=sharing) |

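Since the links above are ordinary Google Drive shares, they can also be fetched from a script. The sketch below is a hypothetical convenience using the third-party `gdown` package (not listed in `requirements.txt`; the repo also ships a `download_models.py` helper), shown for the first two table rows:

```
# pip install gdown  (an assumption, not a repo requirement)
import gdown

# Wav2Lip checkpoint folder -> checkpoints/
gdown.download_folder(
    "https://drive.google.com/drive/folders/1tB_uz-TYMePRMZzrDMdShWUZZ0JK3SIZ",
    output="checkpoints",
    quiet=False,
)

# ESRGAN generator weights -> experiments/.../models/
gdown.download(
    "https://drive.google.com/file/d/1Al8lEpnx2K-kDX7zL2DBcAuDnSKXACPb/view",
    output="experiments/001_ESRGAN_x4_f64b23_custom16k_500k_B16G1_wandb/models/",
    quiet=False,
    fuzzy=True,  # parse the file id out of the share URL
)
```
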
3. Put the input video in the `input_videos` directory and the input audio in the `input_audios` directory.
4. Open the `run_final.sh` file and modify the following parameters:

`filename=kennedy` (the video file name, without extension)

`input_audio=input_audios/ai.wav` (the audio file path, with extension)

5. Execute `run_final.sh` using the following command:

```
bash run_final.sh
```

6. Outputs

- `output_videos_wav2lip` directory contains the video output generated by the Wav2Lip algorithm.
- `frames_wav2lip` directory contains the frames extracted from the Wav2Lip output video.
- `frames_hd` directory contains those frames after super-resolution with the Real-ESRGAN algorithm.
- `output_videos_hd` directory contains the final high-quality video output generated by Wav2Lip-HD.

## Results
Wav2Lip-HD results are shared in two forms: individual frames and full videos. Both are shown below:

### Example output frames

| Frame by Wav2Lip | Optimized Frame |
| ------------- | ------------- |
| *(image)* | *(image)* |
| *(image)* | *(image)* |
| *(image)* | *(image)* |

### Example output videos

| Video by Wav2Lip | Optimized Video |
| ------------- | ------------- |
|