├── README.md ├── Temp └── temp.txt ├── audio.py ├── blendmasker ├── blendmask.py └── place blendmasker onnx here.txt ├── checkpoints └── place wav2lip onnx models here.txt ├── commandline_options.txt ├── enhancers ├── Codeformer │ ├── Codeformer.py │ ├── LICENSE │ └── place codeformer onnx model here.txt ├── GFPGAN │ ├── GFPGAN.py │ ├── LICENSE │ └── place gfpgan onnx model here.txt ├── GPEN │ ├── GPEN.py │ ├── LICENSE │ └── place gpen onnx model here.txt ├── RealEsrgan │ ├── clear_reality_x4.onnx │ └── esrganONNX.py └── restoreformer │ ├── place restoreformer onnx model here.txt │ ├── restoreformer16.py │ └── restoreformer32.py ├── faceID ├── faceID.py └── place recognition onnx model here.txt ├── hparams.py ├── inference_onnxModel.py ├── requirements.txt ├── resemble_denoiser ├── place models here.txt └── resemble_denoiser.py ├── setup_new.txt ├── utils ├── face_alignment.py ├── retinaface.py └── scrfd_2.5g_bnkps.onnx └── xseg ├── place xseg onnx model here.txt └── xseg.py /README.md: -------------------------------------------------------------------------------- 1 | # wav2lip-onnx-HQ 2 | 3 | Update 28.05.2025 4 | 5 | - removed the 'final audio' option 6 | - added Resemble audio denoiser to avoid unwanted lip movements 7 | (not as good as vocal separation (e.g. KimVocal_v2), but works similarly in most cases) 8 | - minor code optimizations 9 | 10 | 11 | Update 29.04.2025 (inference_onnxModel_V2.py) 12 | 13 | - replaced occlusion mask with xseg occlusion 14 | - added optional RealESRGAN frame enhancer (clear_reality_x4 model included) 15 | - added optional short fade-in/fade-out 16 | - added face_mode option (0 or 1) for better results on different face shapes 17 | (0 = portrait like the original wav2lip, 1 = square for less mouth opening) 18 | - fixed crash when using xseg and the specific face is not detected 19 | 20 | Update 08.02.2025 21 | 22 | - optimized occlusion mask 23 | - Replaced insightface with retinaface detection/alignment for easier installation 24 | - Replaced seg-mask with faster blendmasker 25 | - Added free cropping of the final result video 26 | - Added specific target face selection from the first frame 27 | 28 | . 29 | 30 | Just another Wav2Lip HQ local installation, fully running on Torch-to-ONNX converted models for: 31 | - face-detection 32 | - face-recognition 33 | - face-alignment 34 | - face-parsing 35 | - face-enhancement 36 | - wav2lip inference. 37 | 38 | . 39 | 40 | Can be run on CPU or Nvidia GPU. 41 | 42 | I've made some modifications, such as: 43 | * New face-detection and face-alignment code (works for roughly ±60° head tilt). 44 | * Four different face enhancers available, with adjustable enhancement level. 45 | * Choose a pingpong loop instead of the original loop function. 46 | * Set cut-in/cut-out positions to create the loop or to cut a longer video. 47 | * Cut-in position = the frame used if static is selected. 48 | * Select the target face. 49 | * Use two audio files, e.g. vocals for driving and the full music mix for the final output. 50 | * This version does not crash if no face is detected; it just continues. 51 | 52 | Type --help for all command-line parameters. 53 | 54 | . 55 | 56 | Model download - https://drive.google.com/drive/folders/1BGl9bmMtlGEMx_wwKufJrZChFyqjnlsQ?usp=sharing 57 | 58 | .
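Example run (illustrative only; the checkpoint and input file names below are placeholders for your own downloaded model and media files, and every flag shown is described in commandline_options.txt):

python inference_onnxModel.py --checkpoint_path checkpoints/wav2lip_gan.onnx --face source_video.mp4 --audio speech.wav --outfile results/result_voice.mp4 --enhancer gpen --blending 7 --face_mask

.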
59 | 60 | 61 | Original wav2lip - https://github.com/Rudrabha/Wav2Lip 62 | 63 | Face enhancers taken from - https://github.com/harisreedhar/Face-Upscalers-ONNX 64 | 65 | Face detection taken from - https://github.com/neuralchen/SimSwap 66 | 67 | Face occluder taken from - https://github.com/facefusion/facefusion-assets/releases 68 | 69 | Blendmasker extracted from - https://github.com/mapooon/BlendFace during ONNX conversion 70 | 71 | Face recognition for specific face taken from - https://github.com/jahongir7174/FaceID 72 | 73 | Resemble-denoiser-ONNX adapted from - https://github.com/skeskinen/resemble-denoise-onnx-inference 74 | 75 | . 76 | 77 | . 78 | 79 | 80 | -------------------------------------------------------------------------------- /Temp/temp.txt: -------------------------------------------------------------------------------- 1 | required for temp files -------------------------------------------------------------------------------- /audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | # import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | from hparams import hparams as hp 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr): 18 | librosa.output.write_wav(path, wav, sr=sr) 19 | 20 | def preemphasis(wav, k, preemphasize=True): 21 | if preemphasize: 22 | return signal.lfilter([1, -k], [1], wav) 23 | return wav 24 | 25 | def inv_preemphasis(wav, k, inv_preemphasize=True): 26 | if inv_preemphasize: 27 | return signal.lfilter([1], [1, -k], wav) 28 | return wav 29 | 30 | def get_hop_size(): 31 | hop_size = hp.hop_size 32 | if hop_size is None: 33 | assert hp.frame_shift_ms is not None 34 | hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate) 35 | return hop_size 36 | 37 | def linearspectrogram(wav): 38 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 39 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 40 | 41 | if hp.signal_normalization: 42 | return _normalize(S) 43 | return S 44 | 45 | def melspectrogram(wav): 46 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 47 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db 48 | 49 | if hp.signal_normalization: 50 | return _normalize(S) 51 | return S 52 | 53 | def _lws_processor(): 54 | import lws 55 | return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech") 56 | 57 | def _stft(y): 58 | if hp.use_lws: 59 | return _lws_processor().stft(y).T 60 | else: 61 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size) 62 | 63 | ########################################################## 64 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
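# Example use of this module (sketch): load the driving audio at hp.sample_rate (16 kHz) and
# compute the 80-band mel spectrogram that the Wav2Lip model consumes:
#   wav = load_wav("speech.wav", hp.sample_rate)   # "speech.wav" is a placeholder path
#   mel = melspectrogram(wav)                      # numpy array of shape (hp.num_mels, T), i.e. (80, T)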
65 | def num_frames(length, fsize, fshift): 66 | """Compute number of time frames of spectrogram 67 | """ 68 | pad = (fsize - fshift) 69 | if length % fshift == 0: 70 | M = (length + pad * 2 - fsize) // fshift + 1 71 | else: 72 | M = (length + pad * 2 - fsize) // fshift + 2 73 | return M 74 | 75 | 76 | def pad_lr(x, fsize, fshift): 77 | """Compute left and right padding 78 | """ 79 | M = num_frames(len(x), fsize, fshift) 80 | pad = (fsize - fshift) 81 | T = len(x) + 2 * pad 82 | r = (M - 1) * fshift + fsize - T 83 | return pad, pad + r 84 | ########################################################## 85 | #Librosa correct padding 86 | def librosa_pad_lr(x, fsize, fshift): 87 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 88 | 89 | # Conversions 90 | _mel_basis = None 91 | 92 | def _linear_to_mel(spectogram): 93 | global _mel_basis 94 | if _mel_basis is None: 95 | _mel_basis = _build_mel_basis() 96 | return np.dot(_mel_basis, spectogram) 97 | 98 | def _build_mel_basis(): 99 | assert hp.fmax <= hp.sample_rate // 2 100 | return librosa.filters.mel(sr=hp.sample_rate, n_fft= hp.n_fft, n_mels=hp.num_mels, 101 | fmin=hp.fmin, fmax=hp.fmax) 102 | 103 | def _amp_to_db(x): 104 | min_level = np.exp(hp.min_level_db / 20 * np.log(10)) 105 | return 20 * np.log10(np.maximum(min_level, x)) 106 | 107 | def _db_to_amp(x): 108 | return np.power(10.0, (x) * 0.05) 109 | 110 | def _normalize(S): 111 | if hp.allow_clipping_in_normalization: 112 | if hp.symmetric_mels: 113 | return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value, 114 | -hp.max_abs_value, hp.max_abs_value) 115 | else: 116 | return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value) 117 | 118 | assert S.max() <= 0 and S.min() - hp.min_level_db >= 0 119 | if hp.symmetric_mels: 120 | return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value 121 | else: 122 | return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)) 123 | 124 | def _denormalize(D): 125 | if hp.allow_clipping_in_normalization: 126 | if hp.symmetric_mels: 127 | return (((np.clip(D, -hp.max_abs_value, 128 | hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) 129 | + hp.min_level_db) 130 | else: 131 | return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 132 | 133 | if hp.symmetric_mels: 134 | return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db) 135 | else: 136 | return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 137 | -------------------------------------------------------------------------------- /blendmasker/blendmask.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import onnxruntime 3 | 4 | class BLENDMASK: 5 | def __init__(self, model_path="blendswap_256.onnx", device='cpu'): 6 | session_options = onnxruntime.SessionOptions() 7 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 8 | providers = ["CPUExecutionProvider"] 9 | if device == 'cuda': 10 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "EXHAUSTIVE"}),"CPUExecutionProvider"] 11 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 12 | 13 | def mask(self, target_face): 14 | 15 | target_face = target_face.astype(np.float32) 16 | target_face = target_face[..., 
::-1] 17 | target_face = target_face.transpose((2, 0, 1)) 18 | target_face = target_face /255.0 19 | target_face = np.expand_dims(target_face, axis=0).astype(np.float32) 20 | 21 | res = self.session.run(None, {(self.session.get_inputs()[0].name):target_face})[0] 22 | 23 | res = res.squeeze() 24 | res = res * 255.0 25 | res = res.astype(np.uint8) 26 | res = np.stack([res] * 3, axis=-1) 27 | 28 | return res -------------------------------------------------------------------------------- /blendmasker/place blendmasker onnx here.txt: -------------------------------------------------------------------------------- 1 | place blendmasker.onnx here -------------------------------------------------------------------------------- /checkpoints/place wav2lip onnx models here.txt: -------------------------------------------------------------------------------- 1 | place wav2lip onnx models here -------------------------------------------------------------------------------- /commandline_options.txt: -------------------------------------------------------------------------------- 1 | ('--checkpoint_path', type=str, help='Name of saved checkpoint to load weights from', required=True) 2 | ('--face', type=str, help='Filepath of video/image that contains faces to use', required=True) 3 | ('--audio', type=str, help='Filepath of video/audio file to use as raw audio source', required=True) 4 | ('--denoise', default=False, action="store_true", help="Denoise input audio to avoid unwanted lipmovement") 5 | ('--outfile', type=str, help='Video path to save result. See default for an e.g.', default='results/result_voice.mp4') 6 | ('--hq_output', default=False, action='store_true',help='HQ output') 7 | 8 | ('--static', default=False, action='store_true', help='If True, then use only first video frame for inference') 9 | ('--pingpong', default=False, action='store_true',help='pingpong loop if audio is longer than video') 10 | 11 | ('--cut_in', type=int, default=0, help="Frame to start inference") 12 | ('--cut_out', type=int, default=0, help="Frame to end inference") 13 | ('--fade', action="store_true", help="Fade in/out") 14 | 15 | ('--fps', type=float, help='Can be specified only if input is a static image (default: 25)', default=25., required=False) 16 | ('--resize_factor', default=1, type=int, help='Reduce the resolution by this factor. 
Sometimes, best results are obtained at 480p or 720p') 17 | 18 | ('--enhancer', default='none', choices=['none', 'gpen', 'gfpgan', 'codeformer', 'restoreformer']) 19 | ('--blending', default=10, type=float, help='Amount of face enhancement blending 1 - 10') 20 | ('--sharpen', default=False, action="store_true", help="Slightly sharpen swapped face") 21 | ('--frame_enhancer', action="store_true", help="Use frame enhancer") 22 | 23 | ('--face_mask', action="store_true", help="Use face mask") 24 | ('--face_occluder', action="store_true", help="Use x-seg occluder face mask") 25 | 26 | ('--pads', type=int, default=4, help='Padding top, bottom to adjust best mouth position, move crop up/down, between -15 to 15') # pos value mov synced mouth up 27 | ('--face_mode', type=int, default=0, help='Face crop mode, 0 or 1, rect or square, affects mouth opening' ) 28 | 29 | ('--preview', default=False, action='store_true', help='Preview during inference') -------------------------------------------------------------------------------- /enhancers/Codeformer/Codeformer.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | # import torch 3 | import onnxruntime 4 | import numpy as np 5 | 6 | class CodeFormer: 7 | def __init__(self, model_path="codeformer.onnx", device='cpu'): 8 | session_options = onnxruntime.SessionOptions() 9 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 10 | providers = ["CPUExecutionProvider"] 11 | if device == 'cuda': 12 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 13 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 14 | self.resolution = self.session.get_inputs()[0].shape[-2:] 15 | 16 | def preprocess(self, img, w): 17 | img = cv2.resize(img, self.resolution, interpolation=cv2.INTER_LINEAR) 18 | img = img.astype(np.float32)[:,:,::-1] / 255.0 19 | img = img.transpose((2, 0, 1)) 20 | img = (img - 0.5) / 0.5 21 | img = np.expand_dims(img, axis=0).astype(np.float32) 22 | w = np.array([w], dtype=np.double) 23 | return img, w 24 | 25 | def postprocess(self, img): 26 | img = (img.transpose(1,2,0).clip(-1,1) + 1) * 0.5 27 | img = (img * 255)[:,:,::-1] 28 | img = img.clip(0, 255).astype('uint8') 29 | return img 30 | 31 | def enhance(self, img, w=0.9): 32 | img, w = self.preprocess(img, w) 33 | output = self.session.run(None, {'x':img, 'w':w})[0][0] 34 | output = self.postprocess(output) 35 | return output 36 | -------------------------------------------------------------------------------- /enhancers/Codeformer/LICENSE: -------------------------------------------------------------------------------- 1 | S-Lab License 1.0 2 | 3 | Copyright 2022 S-Lab 4 | 5 | Redistribution and use for non-commercial purpose in source and 6 | binary forms, with or without modification, are permitted provided 7 | that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in 14 | the documentation and/or other materials provided with the 15 | distribution. 16 | 17 | 3. 
Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | 33 | In the event that redistribution and/or use for commercial purpose in 34 | source or binary forms, with or without modification is required, 35 | please contact the contributor(s) of the work. 36 | -------------------------------------------------------------------------------- /enhancers/Codeformer/place codeformer onnx model here.txt: -------------------------------------------------------------------------------- 1 | place codeformer onnx model here -------------------------------------------------------------------------------- /enhancers/GFPGAN/GFPGAN.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | # import torch 3 | import onnxruntime 4 | import numpy as np 5 | 6 | class GFPGAN: 7 | def __init__(self, model_path="GFPGANv1.4.onnx", device='cpu'): 8 | session_options = onnxruntime.SessionOptions() 9 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 10 | providers = ["CPUExecutionProvider"] 11 | if device == 'cuda': 12 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 13 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 14 | self.resolution = self.session.get_inputs()[0].shape[-2:] 15 | 16 | def preprocess(self, img): 17 | img = cv2.resize(img, self.resolution, interpolation=cv2.INTER_LINEAR) 18 | img = img.astype(np.float32)[:,:,::-1] / 255.0 19 | img = img.transpose((2, 0, 1)) 20 | img = (img - 0.5) / 0.5 21 | img = np.expand_dims(img, axis=0).astype(np.float32) 22 | return img 23 | 24 | def postprocess(self, img): 25 | img = (img.transpose(1,2,0).clip(-1,1) + 1) * 0.5 26 | img = (img * 255)[:,:,::-1] 27 | img = img.clip(0, 255).astype('uint8') 28 | return img 29 | 30 | def enhance(self, img): 31 | img = self.preprocess(img) 32 | output = self.session.run(None, {'input':img})[0][0] 33 | output = self.postprocess(output) 34 | return output 35 | -------------------------------------------------------------------------------- /enhancers/GFPGAN/LICENSE: -------------------------------------------------------------------------------- 1 | Tencent is pleased to support the open source community by making GFPGAN available. 2 | 3 | Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | GFPGAN is licensed under the Apache License Version 2.0 except for the third-party components listed below. 
6 | 7 | 8 | Terms of the Apache License Version 2.0: 9 | --------------------------------------------- 10 | Apache License 11 | 12 | Version 2.0, January 2004 13 | 14 | http://www.apache.org/licenses/ 15 | 16 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 17 | 1. Definitions. 18 | 19 | “License” shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 20 | 21 | “Licensor” shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 22 | 23 | “Legal Entity” shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, “control” means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | “You” (or “Your”) shall mean an individual or Legal Entity exercising permissions granted by this License. 26 | 27 | “Source” form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 28 | 29 | “Object” form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 30 | 31 | “Work” shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 32 | 33 | “Derivative Works” shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 34 | 35 | “Contribution” shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, “submitted” means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as “Not a Contribution.” 36 | 37 | “Contributor” shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 38 | 39 | 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 40 | 41 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 42 | 43 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 44 | 45 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 46 | 47 | You must cause any modified files to carry prominent notices stating that You changed the files; and 48 | 49 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 50 | 51 | If the Work includes a “NOTICE” text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 52 | 53 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 54 | 55 | 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 56 | 57 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 58 | 59 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 60 | 61 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 62 | 63 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 64 | 65 | END OF TERMS AND CONDITIONS 66 | 67 | 68 | 69 | Other dependencies and licenses: 70 | 71 | 72 | Open Source Software licensed under the Apache 2.0 license and Other Licenses of the Third-Party Components therein: 73 | --------------------------------------------- 74 | 1. basicsr 75 | Copyright 2018-2020 BasicSR Authors 76 | 77 | 78 | This BasicSR project is released under the Apache 2.0 license. 79 | 80 | A copy of Apache 2.0 is included in this file. 81 | 82 | StyleGAN2 83 | The codes are modified from the repository stylegan2-pytorch. Many thanks to the author - Kim Seonghyeon 😊 for translating from the official TensorFlow codes to PyTorch ones. Here is the license of stylegan2-pytorch. 84 | The official repository is https://github.com/NVlabs/stylegan2, and here is the NVIDIA license. 85 | DFDNet 86 | The codes are largely modified from the repository DFDNet. 
Their license is Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 87 | 88 | Terms of the Nvidia License: 89 | --------------------------------------------- 90 | 91 | 1. Definitions 92 | 93 | "Licensor" means any person or entity that distributes its Work. 94 | 95 | "Software" means the original work of authorship made available under 96 | this License. 97 | 98 | "Work" means the Software and any additions to or derivative works of 99 | the Software that are made available under this License. 100 | 101 | "Nvidia Processors" means any central processing unit (CPU), graphics 102 | processing unit (GPU), field-programmable gate array (FPGA), 103 | application-specific integrated circuit (ASIC) or any combination 104 | thereof designed, made, sold, or provided by Nvidia or its affiliates. 105 | 106 | The terms "reproduce," "reproduction," "derivative works," and 107 | "distribution" have the meaning as provided under U.S. copyright law; 108 | provided, however, that for the purposes of this License, derivative 109 | works shall not include works that remain separable from, or merely 110 | link (or bind by name) to the interfaces of, the Work. 111 | 112 | Works, including the Software, are "made available" under this License 113 | by including in or with the Work either (a) a copyright notice 114 | referencing the applicability of this License to the Work, or (b) a 115 | copy of this License. 116 | 117 | 2. License Grants 118 | 119 | 2.1 Copyright Grant. Subject to the terms and conditions of this 120 | License, each Licensor grants to you a perpetual, worldwide, 121 | non-exclusive, royalty-free, copyright license to reproduce, 122 | prepare derivative works of, publicly display, publicly perform, 123 | sublicense and distribute its Work and any resulting derivative 124 | works in any form. 125 | 126 | 3. Limitations 127 | 128 | 3.1 Redistribution. You may reproduce or distribute the Work only 129 | if (a) you do so under this License, (b) you include a complete 130 | copy of this License with your distribution, and (c) you retain 131 | without modification any copyright, patent, trademark, or 132 | attribution notices that are present in the Work. 133 | 134 | 3.2 Derivative Works. You may specify that additional or different 135 | terms apply to the use, reproduction, and distribution of your 136 | derivative works of the Work ("Your Terms") only if (a) Your Terms 137 | provide that the use limitation in Section 3.3 applies to your 138 | derivative works, and (b) you identify the specific derivative 139 | works that are subject to Your Terms. Notwithstanding Your Terms, 140 | this License (including the redistribution requirements in Section 141 | 3.1) will continue to apply to the Work itself. 142 | 143 | 3.3 Use Limitation. The Work and any derivative works thereof only 144 | may be used or intended for use non-commercially. The Work or 145 | derivative works thereof may be used or intended for use by Nvidia 146 | or its affiliates commercially or non-commercially. As used herein, 147 | "non-commercially" means for research or evaluation purposes only. 148 | 149 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim 150 | against any Licensor (including any claim, cross-claim or 151 | counterclaim in a lawsuit) to enforce any patents that you allege 152 | are infringed by any Work, then your rights under this License from 153 | such Licensor (including the grants in Sections 2.1 and 2.2) will 154 | terminate immediately. 
155 | 156 | 3.5 Trademarks. This License does not grant any rights to use any 157 | Licensor's or its affiliates' names, logos, or trademarks, except 158 | as necessary to reproduce the notices described in this License. 159 | 160 | 3.6 Termination. If you violate any term of this License, then your 161 | rights under this License (including the grants in Sections 2.1 and 162 | 2.2) will terminate immediately. 163 | 164 | 4. Disclaimer of Warranty. 165 | 166 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY 167 | KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF 168 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR 169 | NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER 170 | THIS LICENSE. 171 | 172 | 5. Limitation of Liability. 173 | 174 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL 175 | THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE 176 | SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, 177 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF 178 | OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK 179 | (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, 180 | LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER 181 | COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF 182 | THE POSSIBILITY OF SUCH DAMAGES. 183 | 184 | MIT License 185 | 186 | Copyright (c) 2019 Kim Seonghyeon 187 | 188 | Permission is hereby granted, free of charge, to any person obtaining a copy 189 | of this software and associated documentation files (the "Software"), to deal 190 | in the Software without restriction, including without limitation the rights 191 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 192 | copies of the Software, and to permit persons to whom the Software is 193 | furnished to do so, subject to the following conditions: 194 | 195 | The above copyright notice and this permission notice shall be included in all 196 | copies or substantial portions of the Software. 197 | 198 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 199 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 200 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 201 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 202 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 203 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 204 | SOFTWARE. 205 | 206 | 207 | 208 | Open Source Software licensed under the BSD 3-Clause license: 209 | --------------------------------------------- 210 | 1. torchvision 211 | Copyright (c) Soumith Chintala 2016, 212 | All rights reserved. 213 | 214 | 2. 
torch 215 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 216 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 217 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 218 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 219 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 220 | Copyright (c) 2011-2013 NYU (Clement Farabet) 221 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 222 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 223 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 224 | 225 | 226 | Terms of the BSD 3-Clause License: 227 | --------------------------------------------- 228 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 229 | 230 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 231 | 232 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 233 | 234 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 235 | 236 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 237 | 238 | 239 | 240 | Open Source Software licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein: 241 | --------------------------------------------- 242 | 1. numpy 243 | Copyright (c) 2005-2020, NumPy Developers. 244 | All rights reserved. 245 | 246 | A copy of BSD 3-Clause License is included in this file. 247 | 248 | The NumPy repository and source distributions bundle several libraries that are 249 | compatibly licensed. We list these here. 
250 | 251 | Name: Numpydoc 252 | Files: doc/sphinxext/numpydoc/* 253 | License: BSD-2-Clause 254 | For details, see doc/sphinxext/LICENSE.txt 255 | 256 | Name: scipy-sphinx-theme 257 | Files: doc/scipy-sphinx-theme/* 258 | License: BSD-3-Clause AND PSF-2.0 AND Apache-2.0 259 | For details, see doc/scipy-sphinx-theme/LICENSE.txt 260 | 261 | Name: lapack-lite 262 | Files: numpy/linalg/lapack_lite/* 263 | License: BSD-3-Clause 264 | For details, see numpy/linalg/lapack_lite/LICENSE.txt 265 | 266 | Name: tempita 267 | Files: tools/npy_tempita/* 268 | License: MIT 269 | For details, see tools/npy_tempita/license.txt 270 | 271 | Name: dragon4 272 | Files: numpy/core/src/multiarray/dragon4.c 273 | License: MIT 274 | For license text, see numpy/core/src/multiarray/dragon4.c 275 | 276 | 277 | 278 | Open Source Software licensed under the MIT license: 279 | --------------------------------------------- 280 | 1. facexlib 281 | Copyright (c) 2020 Xintao Wang 282 | 283 | 2. opencv-python 284 | Copyright (c) Olli-Pekka Heinisuo 285 | Please note that only files in cv2 package are used. 286 | 287 | 288 | Terms of the MIT License: 289 | --------------------------------------------- 290 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 291 | 292 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 293 | 294 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 295 | 296 | 297 | 298 | Open Source Software licensed under the MIT license and Other Licenses of the Third-Party Components therein: 299 | --------------------------------------------- 300 | 1. tqdm 301 | Copyright (c) 2013 noamraph 302 | 303 | `tqdm` is a product of collaborative work. 304 | Unless otherwise stated, all authors (see commit logs) retain copyright 305 | for their respective work, and release the work under the MIT licence 306 | (text below). 307 | 308 | Exceptions or notable authors are listed below 309 | in reverse chronological order: 310 | 311 | * files: * 312 | MPLv2.0 2015-2020 (c) Casper da Costa-Luis 313 | [casperdcl](https://github.com/casperdcl). 314 | * files: tqdm/_tqdm.py 315 | MIT 2016 (c) [PR #96] on behalf of Google Inc. 316 | * files: tqdm/_tqdm.py setup.py README.rst MANIFEST.in .gitignore 317 | MIT 2013 (c) Noam Yorav-Raphael, original author. 318 | 319 | [PR #96]: https://github.com/tqdm/tqdm/pull/96 320 | 321 | 322 | Mozilla Public Licence (MPL) v. 2.0 - Exhibit A 323 | ----------------------------------------------- 324 | 325 | This Source Code Form is subject to the terms of the 326 | Mozilla Public License, v. 2.0. 327 | If a copy of the MPL was not distributed with this file, 328 | You can obtain one at https://mozilla.org/MPL/2.0/. 
329 | 330 | 331 | MIT License (MIT) 332 | ----------------- 333 | 334 | Copyright (c) 2013 noamraph 335 | 336 | Permission is hereby granted, free of charge, to any person obtaining a copy of 337 | this software and associated documentation files (the "Software"), to deal in 338 | the Software without restriction, including without limitation the rights to 339 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 340 | the Software, and to permit persons to whom the Software is furnished to do so, 341 | subject to the following conditions: 342 | 343 | The above copyright notice and this permission notice shall be included in all 344 | copies or substantial portions of the Software. 345 | 346 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 347 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 348 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 349 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 350 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 351 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /enhancers/GFPGAN/place gfpgan onnx model here.txt: -------------------------------------------------------------------------------- 1 | place gfpgan onnx model here -------------------------------------------------------------------------------- /enhancers/GPEN/GPEN.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | # import torch 3 | import onnxruntime 4 | import numpy as np 5 | 6 | class GPEN: 7 | def __init__(self, model_path="GPEN-BFR-512.onnx", device='cpu'): 8 | session_options = onnxruntime.SessionOptions() 9 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 10 | providers = ["CPUExecutionProvider"] 11 | if device == 'cuda': 12 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 13 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 14 | self.resolution = self.session.get_inputs()[0].shape[-2:] 15 | 16 | def preprocess(self, img): 17 | img = cv2.resize(img, self.resolution, interpolation=cv2.INTER_LINEAR) 18 | img = img.astype(np.float32)[:,:,::-1] / 255.0 19 | img = img.transpose((2, 0, 1)) 20 | img = (img - 0.5) / 0.5 21 | img = np.expand_dims(img, axis=0).astype(np.float32) 22 | return img 23 | 24 | def postprocess(self, img): 25 | img = (img.transpose(1,2,0).clip(-1,1) + 1) * 0.5 26 | img = (img * 255)[:,:,::-1] 27 | img = img.clip(0, 255).astype('uint8') 28 | return img 29 | 30 | def enhance(self, img): 31 | img = self.preprocess(img) 32 | output = self.session.run(None, {'input':img})[0][0] 33 | output = self.postprocess(output) 34 | return output 35 | -------------------------------------------------------------------------------- /enhancers/GPEN/LICENSE: -------------------------------------------------------------------------------- 1 | © Alibaba, 2021. For academic and non-commercial use only. 
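All face enhancers in this repo (GPEN, GFPGAN, CodeFormer, RestoreFormer) expose the same small interface: a constructor taking an ONNX model path and a device string, plus an enhance() method that takes a BGR uint8 face crop and returns the restored crop (CodeFormer additionally accepts a fidelity weight w). A minimal sketch using the GPEN wrapper above, assuming the repo root is on the Python path; the enhancers/GPEN/ model location and face_crop.jpg are placeholders, while GPEN-BFR-512.onnx is simply the wrapper's default model name:

import cv2
from enhancers.GPEN.GPEN import GPEN

enhancer = GPEN(model_path="enhancers/GPEN/GPEN-BFR-512.onnx", device="cuda")  # model location is an assumption
face = cv2.imread("face_crop.jpg")   # placeholder: an aligned BGR face crop
restored = enhancer.enhance(face)    # uint8 BGR image at the model's native resolution
cv2.imwrite("face_restored.jpg", restored)

In the inference script, the restored crop is then mixed back with the original face according to the --blending amount.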
2 | -------------------------------------------------------------------------------- /enhancers/GPEN/place gpen onnx model here.txt: -------------------------------------------------------------------------------- 1 | place gpen onnx model here -------------------------------------------------------------------------------- /enhancers/RealEsrgan/clear_reality_x4.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instant-high/wav2lip-onnx-HQ/79fc6261a1c36ca9abf946cc5260a8aaeeb3746f/enhancers/RealEsrgan/clear_reality_x4.onnx -------------------------------------------------------------------------------- /enhancers/RealEsrgan/esrganONNX.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | # import torch 3 | import onnxruntime 4 | import numpy as np 5 | 6 | 7 | class RealESRGAN_ONNX: 8 | def __init__(self, model_path="RealESRGAN_x2.onnx", device='cuda'): 9 | session_options = onnxruntime.SessionOptions() 10 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 11 | providers = ["CPUExecutionProvider"] 12 | if device == 'cuda': 13 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 14 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 15 | 16 | def enhance(self, img): 17 | h, w = img.shape[:2] 18 | #img = cv2.resize(img,(w//2, h//2), interpolation=cv2.INTER_AREA) 19 | img = img.astype(np.float32) 20 | img = img.transpose((2, 0, 1)) 21 | img = img /255 22 | img = np.expand_dims(img, axis=0).astype(np.float32) 23 | # 24 | result = self.session.run(None, {(self.session.get_inputs()[0].name):img})[0][0] 25 | # 26 | result = (result.squeeze().transpose((1,2,0)) * 255).clip(0, 255).astype(np.uint8) 27 | return result 28 | 29 | 30 | -------------------------------------------------------------------------------- /enhancers/restoreformer/place restoreformer onnx model here.txt: -------------------------------------------------------------------------------- 1 | place restoreformer onnx model here -------------------------------------------------------------------------------- /enhancers/restoreformer/restoreformer16.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import onnxruntime 3 | import numpy as np 4 | 5 | class RestoreFormer: 6 | def __init__(self, model_path="restoreformer.onnx", device='cpu'): 7 | session_options = onnxruntime.SessionOptions() 8 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 9 | providers = ["CPUExecutionProvider"] 10 | if device == 'cuda': 11 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 12 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 13 | self.resolution = self.session.get_inputs()[0].shape[-2:] 14 | 15 | def preprocess(self, img): 16 | img = cv2.resize(img, self.resolution, interpolation=cv2.INTER_LINEAR) 17 | img = img.astype(np.float32)[:,:,::-1] / 255.0 18 | img = img.transpose((2, 0, 1)) 19 | img = (img - 0.5) / 0.5 20 | img = np.expand_dims(img, axis=0).astype(np.float16) 21 | return img 22 | 23 | def postprocess(self, img): 24 | img = (img.transpose(1,2,0).clip(-1,1) + 1) * 0.5 25 | img = (img * 255)[:,:,::-1] 26 | img = img.clip(0, 255).astype('uint8') 27 | return img 28 | 29 | 
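    # Note: preprocess() above casts the input to float16, so this wrapper expects a half-precision
    # restoreformer ONNX export; restoreformer32.py is identical apart from feeding float32 input.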
def enhance(self, img): 30 | img = self.preprocess(img) 31 | output = self.session.run(None, {'input':img,})[0][0] 32 | output = self.postprocess(output) 33 | return output 34 | -------------------------------------------------------------------------------- /enhancers/restoreformer/restoreformer32.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import onnxruntime 3 | import numpy as np 4 | 5 | class RestoreFormer: 6 | def __init__(self, model_path="restoreformer.onnx", device='cpu'): 7 | session_options = onnxruntime.SessionOptions() 8 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 9 | providers = ["CPUExecutionProvider"] 10 | if device == 'cuda': 11 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 12 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 13 | self.resolution = self.session.get_inputs()[0].shape[-2:] 14 | 15 | def preprocess(self, img): 16 | img = cv2.resize(img, self.resolution, interpolation=cv2.INTER_LINEAR) 17 | img = img.astype(np.float32)[:,:,::-1] / 255.0 18 | img = img.transpose((2, 0, 1)) 19 | img = (img - 0.5) / 0.5 20 | img = np.expand_dims(img, axis=0).astype(np.float32) 21 | return img 22 | 23 | def postprocess(self, img): 24 | img = (img.transpose(1,2,0).clip(-1,1) + 1) * 0.5 25 | img = (img * 255)[:,:,::-1] 26 | img = img.clip(0, 255).astype('uint8') 27 | return img 28 | 29 | def enhance(self, img): 30 | img = self.preprocess(img) 31 | output = self.session.run(None, {'input':img,})[0][0] 32 | output = self.postprocess(output) 33 | return output 34 | -------------------------------------------------------------------------------- /faceID/faceID.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import numpy 5 | from onnxruntime import InferenceSession 6 | 7 | 8 | def distance2box(points, distance, max_shape=None): 9 | x1 = points[:, 0] - distance[:, 0] 10 | y1 = points[:, 1] - distance[:, 1] 11 | x2 = points[:, 0] + distance[:, 2] 12 | y2 = points[:, 1] + distance[:, 3] 13 | if max_shape is not None: 14 | x1 = x1.clamp(min=0, max=max_shape[1]) 15 | y1 = y1.clamp(min=0, max=max_shape[0]) 16 | x2 = x2.clamp(min=0, max=max_shape[1]) 17 | y2 = y2.clamp(min=0, max=max_shape[0]) 18 | return numpy.stack([x1, y1, x2, y2], axis=-1) 19 | 20 | 21 | def distance2kps(points, distance, max_shape=None): 22 | outputs = [] 23 | for i in range(0, distance.shape[1], 2): 24 | px = points[:, i % 2] + distance[:, i] 25 | py = points[:, i % 2 + 1] + distance[:, i + 1] 26 | if max_shape is not None: 27 | px = px.clamp(min=0, max=max_shape[1]) 28 | py = py.clamp(min=0, max=max_shape[0]) 29 | outputs.append(px) 30 | outputs.append(py) 31 | return numpy.stack(outputs, axis=-1) 32 | 33 | 34 | class FaceDetection: 35 | def __init__(self, onnx_path=None, session=None): 36 | self.batched = False 37 | self.session = session 38 | 39 | if self.session is None: 40 | assert onnx_path is not None 41 | assert os.path.exists(onnx_path) 42 | self.session = InferenceSession(onnx_path, 43 | providers=['CUDAExecutionProvider']) 44 | self.nms_thresh = 0.4 45 | self.center_cache = {} 46 | input_cfg = self.session.get_inputs()[0] 47 | input_shape = input_cfg.shape 48 | if isinstance(input_shape[2], str): 49 | self.input_size = None 50 | else: 51 | self.input_size = tuple(input_shape[2:4][::-1]) 52 | input_name = 
input_cfg.name 53 | outputs = self.session.get_outputs() 54 | if len(outputs[0].shape) == 3: 55 | self.batched = True 56 | output_names = [] 57 | for output in outputs: 58 | output_names.append(output.name) 59 | self.input_name = input_name 60 | self.output_names = output_names 61 | self.use_kps = False 62 | self._num_anchors = 1 63 | if len(outputs) == 6: 64 | self.fmc = 3 65 | self._feat_stride_fpn = [8, 16, 32] 66 | self._num_anchors = 2 67 | elif len(outputs) == 9: 68 | self.fmc = 3 69 | self._feat_stride_fpn = [8, 16, 32] 70 | self._num_anchors = 2 71 | self.use_kps = True 72 | elif len(outputs) == 10: 73 | self.fmc = 5 74 | self._feat_stride_fpn = [8, 16, 32, 64, 128] 75 | self._num_anchors = 1 76 | elif len(outputs) == 15: 77 | self.fmc = 5 78 | self._feat_stride_fpn = [8, 16, 32, 64, 128] 79 | self._num_anchors = 1 80 | self.use_kps = True 81 | 82 | def forward(self, x, score_thresh): 83 | scores_list = [] 84 | bboxes_list = [] 85 | points_list = [] 86 | input_size = tuple(x.shape[0:2][::-1]) 87 | blob = cv2.dnn.blobFromImage(x, 88 | 1.0 / 128, 89 | input_size, 90 | (127.5, 127.5, 127.5), swapRB=True) 91 | net_outs = self.session.run(self.output_names, {self.input_name: blob}) 92 | 93 | input_height = blob.shape[2] 94 | input_width = blob.shape[3] 95 | fmc = self.fmc 96 | for idx, stride in enumerate(self._feat_stride_fpn): 97 | if self.batched: 98 | scores = net_outs[idx][0] 99 | boxes = net_outs[idx + fmc][0] 100 | boxes = boxes * stride 101 | if self.use_kps: 102 | points = net_outs[idx + fmc * 2][0] * stride 103 | else: 104 | scores = net_outs[idx] 105 | boxes = net_outs[idx + fmc] 106 | boxes = boxes * stride 107 | if self.use_kps: 108 | points = net_outs[idx + fmc * 2] * stride 109 | 110 | height = input_height // stride 111 | width = input_width // stride 112 | key = (height, width, stride) 113 | if key in self.center_cache: 114 | anchor_centers = self.center_cache[key] 115 | else: 116 | anchor_centers = numpy.stack(numpy.mgrid[:height, :width][::-1], axis=-1) 117 | anchor_centers = anchor_centers.astype(numpy.float32) 118 | 119 | anchor_centers = (anchor_centers * stride).reshape((-1, 2)) 120 | if self._num_anchors > 1: 121 | anchor_centers = numpy.stack([anchor_centers] * self._num_anchors, axis=1) 122 | anchor_centers = anchor_centers.reshape((-1, 2)) 123 | if len(self.center_cache) < 100: 124 | self.center_cache[key] = anchor_centers 125 | 126 | pos_indices = numpy.where(scores >= score_thresh)[0] 127 | bboxes = distance2box(anchor_centers, boxes) 128 | pos_scores = scores[pos_indices] 129 | pos_bboxes = bboxes[pos_indices] 130 | scores_list.append(pos_scores) 131 | bboxes_list.append(pos_bboxes) 132 | if self.use_kps: 133 | points = distance2kps(anchor_centers, points) 134 | points = points.reshape((points.shape[0], -1, 2)) 135 | points_list.append(points[pos_indices]) 136 | return scores_list, bboxes_list, points_list 137 | 138 | def __call__(self, img, score_thresh=0.5, input_size=None, max_num=0, metric='default'): 139 | assert input_size is not None or self.input_size is not None 140 | input_size = self.input_size if input_size is None else input_size 141 | 142 | im_ratio = float(img.shape[0]) / img.shape[1] 143 | model_ratio = float(input_size[1]) / input_size[0] 144 | if im_ratio > model_ratio: 145 | new_height = input_size[1] 146 | new_width = int(new_height / im_ratio) 147 | else: 148 | new_width = input_size[0] 149 | new_height = int(new_width * im_ratio) 150 | det_scale = float(new_height) / img.shape[0] 151 | resized_img = cv2.resize(img, (new_width, 
new_height)) 152 | det_img = numpy.zeros((input_size[1], input_size[0], 3), dtype=numpy.uint8) 153 | det_img[:new_height, :new_width, :] = resized_img 154 | 155 | scores_list, bboxes_list, points_list = self.forward(det_img, score_thresh) 156 | 157 | scores = numpy.vstack(scores_list) 158 | scores_ravel = scores.ravel() 159 | order = scores_ravel.argsort()[::-1] 160 | bboxes = numpy.vstack(bboxes_list) / det_scale 161 | if self.use_kps: 162 | points = numpy.vstack(points_list) / det_scale 163 | pre_det = numpy.hstack((bboxes, scores)).astype(numpy.float32, copy=False) 164 | pre_det = pre_det[order, :] 165 | keep = self.nms(pre_det) 166 | det = pre_det[keep, :] 167 | if self.use_kps: 168 | points = points[order, :, :] 169 | points = points[keep, :, :] 170 | else: 171 | points = None 172 | if 0 < max_num < det.shape[0]: 173 | area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1]) 174 | img_center = img.shape[0] // 2, img.shape[1] // 2 175 | offsets = numpy.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1], 176 | (det[:, 1] + det[:, 3]) / 2 - img_center[0]]) 177 | offset_dist_squared = numpy.sum(numpy.power(offsets, 2.0), 0) 178 | if metric == 'max': 179 | values = area 180 | else: 181 | values = area - offset_dist_squared * 2.0 # some extra weight on the centering 182 | index = numpy.argsort(values)[::-1] # some extra weight on the centering 183 | index = index[0:max_num] 184 | det = det[index, :] 185 | if points is not None: 186 | points = points[index, :] 187 | return det, points 188 | 189 | def nms(self, outputs): 190 | thresh = self.nms_thresh 191 | x1 = outputs[:, 0] 192 | y1 = outputs[:, 1] 193 | x2 = outputs[:, 2] 194 | y2 = outputs[:, 3] 195 | scores = outputs[:, 4] 196 | 197 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 198 | order = scores.argsort()[::-1] 199 | 200 | keep = [] 201 | while order.size > 0: 202 | i = order[0] 203 | keep.append(i) 204 | xx1 = numpy.maximum(x1[i], x1[order[1:]]) 205 | yy1 = numpy.maximum(y1[i], y1[order[1:]]) 206 | xx2 = numpy.minimum(x2[i], x2[order[1:]]) 207 | yy2 = numpy.minimum(y2[i], y2[order[1:]]) 208 | 209 | w = numpy.maximum(0.0, xx2 - xx1 + 1) 210 | h = numpy.maximum(0.0, yy2 - yy1 + 1) 211 | inter = w * h 212 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 213 | 214 | indices = numpy.where(ovr <= thresh)[0] 215 | order = order[indices + 1] 216 | 217 | return keep 218 | 219 | 220 | class FaceRecognition: 221 | def __init__(self, onnx_path=None, session=None): 222 | self.session = session 223 | 224 | if self.session is None: 225 | assert onnx_path is not None 226 | assert os.path.exists(onnx_path) 227 | self.session = InferenceSession(onnx_path, 228 | providers=['CUDAExecutionProvider']) 229 | 230 | def __call__(self, x): 231 | x = x.astype('float32') 232 | x = (x / 255 - 0.5) / 0.5 233 | x = x.transpose((2, 0, 1)) 234 | x = numpy.expand_dims(x, 0) 235 | return self.session.run(None, {'data': x}) 236 | #return self.session.run(None, {'input.1': x}) 237 | -------------------------------------------------------------------------------- /faceID/place recognition onnx model here.txt: -------------------------------------------------------------------------------- 1 | place recognition.onnx model here -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import os 3 | 4 | def get_image_list(data_root, split): 5 | filelist = [] 6 | 7 | with open('filelists/{}.txt'.format(split)) as f: 8 | for line 
in f: 9 | line = line.strip() 10 | if ' ' in line: line = line.split()[0] 11 | filelist.append(os.path.join(data_root, line)) 12 | 13 | return filelist 14 | 15 | class HParams: 16 | def __init__(self, **kwargs): 17 | self.data = {} 18 | 19 | for key, value in kwargs.items(): 20 | self.data[key] = value 21 | 22 | def __getattr__(self, key): 23 | if key not in self.data: 24 | raise AttributeError("'HParams' object has no attribute %s" % key) 25 | return self.data[key] 26 | 27 | def set_hparam(self, key, value): 28 | self.data[key] = value 29 | 30 | 31 | # Default hyperparameters 32 | hparams = HParams( 33 | num_mels=80, # Number of mel-spectrogram channels and local conditioning dimensionality 34 | # network 35 | rescale=True, # Whether to rescale audio prior to preprocessing 36 | rescaling_max=0.9, # Rescaling value 37 | 38 | # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction 39 | # It"s preferred to set True to use with https://github.com/r9y9/wavenet_vocoder 40 | # Does not work if n_ffit is not multiple of hop_size!! 41 | use_lws=False, 42 | 43 | n_fft=800, # Extra window size is filled with 0 paddings to match this parameter 44 | hop_size=200, # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate) 45 | win_size=800, # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate) 46 | sample_rate=16000, # 16000Hz (corresponding to librispeech) (sox --i ) 47 | 48 | frame_shift_ms=None, # Can replace hop_size parameter. (Recommended: 12.5) 49 | 50 | # Mel and Linear spectrograms normalization/scaling and clipping 51 | signal_normalization=True, 52 | # Whether to normalize mel spectrograms to some predefined range (following below parameters) 53 | allow_clipping_in_normalization=True, # Only relevant if mel_normalization = True 54 | symmetric_mels=True, 55 | # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2, 56 | # faster and cleaner convergence) 57 | max_abs_value=4., 58 | # max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not 59 | # be too big to avoid gradient explosion, 60 | # not too small for fast convergence) 61 | # Contribution by @begeekmyfriend 62 | # Spectrogram Pre-Emphasis (Lfilter: Reduce spectrogram noise and helps model certitude 63 | # levels. Also allows for better G&L phase reconstruction) 64 | preemphasize=True, # whether to apply filter 65 | preemphasis=0.97, # filter coefficient. 66 | 67 | # Limits 68 | min_level_db=-100, 69 | ref_level_db=20, 70 | fmin=55, 71 | # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To 72 | # test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 73 | fmax=7600, # To be increased/reduced depending on data. 74 | 75 | ###################### Our training parameters ################################# 76 | img_size=96, 77 | fps=25, 78 | 79 | batch_size=16, 80 | initial_learning_rate=1e-4, 81 | nepochs=200000000000000000, ### ctrl + c, stop whenever eval loss is consistently greater than train loss for ~10 epochs 82 | num_workers=16, 83 | checkpoint_interval=3000, 84 | eval_interval=3000, 85 | save_optimizer_state=True, 86 | 87 | syncnet_wt=0.0, # is initially zero, will be set automatically to 0.03 later. Leads to faster convergence. 
88 | syncnet_batch_size=64, 89 | syncnet_lr=1e-4, 90 | syncnet_eval_interval=10000, 91 | syncnet_checkpoint_interval=10000, 92 | 93 | disc_wt=0.07, 94 | disc_initial_learning_rate=1e-4, 95 | ) 96 | 97 | 98 | def hparams_debug_string(): 99 | values = hparams.data # HParams keeps all settings in this dict; it defines no values() method 100 | hp = [" %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"] 101 | return "Hyperparameters:\n" + "\n".join(hp) 102 | -------------------------------------------------------------------------------- /inference_onnxModel.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import subprocess 3 | import platform 4 | import numpy as np 5 | import cv2 6 | import argparse 7 | import audio 8 | import shutil 9 | import librosa 10 | from os import listdir, path 11 | from tqdm import tqdm 12 | from PIL import Image 13 | from scipy.io.wavfile import write 14 | 15 | import onnxruntime 16 | onnxruntime.set_default_logger_severity(3) 17 | 18 | # face detection and alignment 19 | from utils.retinaface import RetinaFace 20 | from utils.face_alignment import get_cropped_head_256 21 | detector = RetinaFace("utils/scrfd_2.5g_bnkps.onnx", provider=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}), "CPUExecutionProvider"], session_options=None) 22 | 23 | # specific face selector 24 | from faceID.faceID import FaceRecognition 25 | recognition = FaceRecognition('faceID/recognition.onnx') 26 | 27 | 28 | # arguments 29 | parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models') 30 | 31 | parser.add_argument('--checkpoint_path', type=str, help='Name of saved checkpoint to load weights from', required=True) 32 | parser.add_argument('--face', type=str, help='Filepath of video/image that contains faces to use', required=True) 33 | parser.add_argument('--audio', type=str, help='Filepath of video/audio file to use as raw audio source', required=True) 34 | parser.add_argument('--denoise', default=False, action="store_true", help="Denoise input audio to avoid unwanted lip movement") 35 | parser.add_argument('--outfile', type=str, help='Video path to save result. See default for an e.g.', default='results/result_voice.mp4') 36 | parser.add_argument('--hq_output', default=False, action='store_true',help='HQ output') 37 | 38 | parser.add_argument('--static', default=False, action='store_true', help='If True, then use only first video frame for inference') 39 | parser.add_argument('--pingpong', default=False, action='store_true',help='pingpong loop if audio is longer than video') 40 | 41 | parser.add_argument('--cut_in', type=int, default=0, help="Frame to start inference") 42 | parser.add_argument('--cut_out', type=int, default=0, help="Frame to end inference") 43 | parser.add_argument('--fade', action="store_true", help="Fade in/out") 44 | 45 | parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)', default=25., required=False) 46 | parser.add_argument('--resize_factor', default=1, type=int, help='Reduce the resolution by this factor.
Sometimes, best results are obtained at 480p or 720p') 47 | 48 | parser.add_argument('--enhancer', default='none', choices=['none', 'gpen', 'gfpgan', 'codeformer', 'restoreformer']) 49 | parser.add_argument('--blending', default=10, type=float, help='Amount of face enhancement blending 1 - 10') 50 | parser.add_argument('--sharpen', default=False, action="store_true", help="Slightly sharpen swapped face") 51 | parser.add_argument('--frame_enhancer', action="store_true", help="Use frame enhancer") 52 | 53 | parser.add_argument('--face_mask', action="store_true", help="Use face mask") 54 | parser.add_argument('--face_occluder', action="store_true", help="Use x-seg occluder face mask") 55 | 56 | parser.add_argument('--pads', type=int, default=4, help='Padding top, bottom to adjust best mouth position, move crop up/down, between -15 to 15') # pos value mov synced mouth up 57 | parser.add_argument('--face_mode', type=int, default=0, help='Face crop mode, 0 or 1, rect or square, affects mouth opening' ) 58 | 59 | parser.add_argument('--preview', default=False, action='store_true', help='Preview during inference') 60 | 61 | # removed arguments 62 | #parser.add_argument('--face_det_batch_size', type=int, help='Batch size for face detection', default=16) 63 | #parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=1) 64 | #parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1], help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. ' 'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width') 65 | #parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1], help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.''Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).') 66 | #parser.add_argument('--rotate', default=False, action='store_true',help='Sometimes videos taken from a phone can be flipped 90deg. 
If true, will flip video right by 90deg.''Use if you get a flipped result, despite feeding a normal looking video') 67 | #parser.add_argument('--nosmooth', default=False, action='store_true',help='Prevent smoothing face detections over a short temporal window') 68 | 69 | args = parser.parse_args() 70 | 71 | if args.checkpoint_path == r'checkpoints\wav2lip_384.onnx' or args.checkpoint_path == r'checkpoints\wav2lip_384_fp16.onnx': 72 | args.img_size = 384 73 | else: 74 | args.img_size = 96 75 | 76 | mel_step_size = 16 77 | padY = max(-15, min(args.pads, 15)) 78 | 79 | device = 'cpu' 80 | if onnxruntime.get_device() == 'GPU': 81 | device = 'cuda' 82 | print("Running on " + device) 83 | 84 | 85 | if args.enhancer == 'gpen': 86 | from enhancers.GPEN.GPEN import GPEN 87 | enhancer = GPEN(model_path="enhancers/GPEN/GPEN-BFR-256-sim.onnx", device=device) #GPEN-BFR-256-sim 88 | 89 | if args.enhancer == 'codeformer': 90 | from enhancers.Codeformer.Codeformer import CodeFormer 91 | enhancer = CodeFormer(model_path="enhancers/Codeformer/codeformerfixed.onnx", device=device) 92 | 93 | if args.enhancer == 'restoreformer': 94 | from enhancers.restoreformer.restoreformer16 import RestoreFormer 95 | enhancer = RestoreFormer(model_path="enhancers/restoreformer/restoreformer16.onnx", device=device) 96 | 97 | if args.enhancer == 'gfpgan': 98 | from enhancers.GFPGAN.GFPGAN import GFPGAN 99 | enhancer = GFPGAN(model_path="enhancers/GFPGAN/GFPGANv1.4.onnx", device=device) 100 | 101 | if args.frame_enhancer: 102 | from enhancers.RealEsrgan.esrganONNX import RealESRGAN_ONNX 103 | frame_enhancer = RealESRGAN_ONNX(model_path="enhancers/RealEsrgan/clear_reality_x4.onnx", device=device) 104 | 105 | if args.face_mask: 106 | from blendmasker.blendmask import BLENDMASK 107 | masker = BLENDMASK(model_path="blendmasker/blendmasker.onnx", device=device) 108 | 109 | if args.face_occluder: 110 | from xseg.xseg import MASK 111 | occluder = MASK(model_path="xseg/xseg.onnx", device=device) 112 | 113 | if args.denoise: 114 | from resemble_denoiser.resemble_denoiser import ResembleDenoiser 115 | denoiser = ResembleDenoiser(model_path='resemble_denoiser/denoiser.onnx', device=device) 116 | 117 | if os.path.isfile(args.face) and args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: 118 | args.static = True 119 | 120 | 121 | 122 | def load_model(device): 123 | model_path = args.checkpoint_path 124 | session_options = onnxruntime.SessionOptions() 125 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 126 | providers = ["CPUExecutionProvider"] 127 | if device == 'cuda': 128 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 129 | 130 | session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 131 | 132 | return session 133 | 134 | 135 | def select_specific_face(model, spec_img, size, crop_scale=1.0): 136 | 137 | # select face: 138 | h, w = spec_img.shape[:-1] 139 | roi = cv2.selectROI("Select speaker face", spec_img, showCrosshair=False) 140 | if roi == (0,0,0,0):roi = (0,0,w,h) 141 | cropped_roi = spec_img[int(roi[1]):int(roi[1]+roi[3]), int(roi[0]):int(roi[0]+roi[2])] 142 | cv2.destroyAllWindows() 143 | 144 | bboxes, kpss = model.detect(cropped_roi, input_size = (320,320), det_thresh=0.3) 145 | assert len(kpss) != 0, "No face detected" 146 | 147 | target_face, mat = get_cropped_head_256(cropped_roi, kpss[0], size=size, scale=crop_scale) 148 | target_face =
cv2.resize(target_face,(112,112)) 149 | target_id = recognition(target_face)[0].flatten() 150 | 151 | return target_id 152 | 153 | def process_video_specific(model, img, size, target_id, crop_scale=1.0): 154 | ori_img = img 155 | bboxes, kpss = model.detect(ori_img, input_size=(320, 320), det_thresh=0.3) 156 | 157 | assert len(kpss) != 0, "No face detected" 158 | 159 | best_score = -float('inf') 160 | best_aimg = None 161 | best_mat = None 162 | 163 | for kps in kpss: 164 | aimg, mat = get_cropped_head_256(ori_img, kps, size=size, scale=crop_scale) 165 | 166 | face = aimg.copy() 167 | face = cv2.resize(face, (112, 112)) 168 | face_id = recognition(face)[0].flatten() 169 | 170 | # Calculate similarity score with the target ID 171 | score = target_id @ face_id # Dot product or cosine similarity 172 | 173 | if score > best_score: 174 | best_score = score 175 | best_aimg = aimg 176 | best_mat = mat 177 | if best_score < 0.4: 178 | best_aimg = np.zeros((256,256), dtype=np.uint8) 179 | best_aimg = cv2.cvtColor(best_aimg, cv2.COLOR_GRAY2RGB)/255 180 | best_mat = np.float32([[1,2,3],[1,2,3]]) 181 | 182 | return best_aimg, best_mat 183 | 184 | def face_detect(images, target_id): 185 | 186 | os.system('cls') 187 | print ("Detecting face and generating data...") 188 | 189 | crop_size = 256 190 | 191 | sub_faces = [] 192 | crop_faces = [] 193 | matrix = [] 194 | face_error = [] 195 | 196 | for i in tqdm(range(0, len(images))): 197 | 198 | try: 199 | 200 | crop_face, M = process_video_specific(detector, images[i], 256, target_id, crop_scale=1.0) 201 | 202 | # crop modes 203 | if args.face_mode == 0: 204 | sub_face = crop_face[65-(padY):241-(padY),62:194] 205 | #cv2.imwrite("sub_0.jpg",sub_face) 206 | else: 207 | sub_face = crop_face[65-(padY):241-(padY),42:214] 208 | #cv2.imwrite("sub_1.jpg",sub_face) 209 | 210 | sub_face = cv2.resize(sub_face, (args.img_size,args.img_size)) 211 | 212 | sub_faces.append(sub_face) 213 | crop_faces.append(crop_face) 214 | matrix.append(M) 215 | 216 | no_face = 0 217 | 218 | except: 219 | if i == 0: 220 | crop_face = np.zeros((256,256), dtype=np.uint8) 221 | crop_face = cv2.cvtColor(crop_face, cv2.COLOR_GRAY2RGB)/255 222 | sub_face = crop_face[65-(padY):241-(padY),62:194] 223 | sub_face = cv2.resize(sub_face, (args.img_size,args.img_size)) 224 | M = np.float32([[1,2,3],[1,2,3]]) 225 | 226 | sub_faces.append(sub_face) 227 | crop_faces.append(crop_face) 228 | matrix.append(M) 229 | 230 | no_face = -1 231 | 232 | face_error.append(no_face) 233 | 234 | return crop_faces, sub_faces, matrix, face_error 235 | 236 | def datagen(frames, mels): 237 | 238 | img_batch, mel_batch, frame_batch = [], [], [] 239 | 240 | for i, m in enumerate(mels): 241 | 242 | idx = 0 if args.static else i%len(frames) 243 | 244 | frame_to_save = frames[idx].copy() 245 | frame_batch.append(frame_to_save) 246 | 247 | img_batch.append(frames[idx]) 248 | mel_batch.append(m) 249 | 250 | img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) 251 | 252 | img_masked = img_batch.copy() 253 | img_masked[:, args.img_size//2:] = 0 254 | 255 | img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. 
256 | mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) 257 | 258 | yield img_batch, mel_batch, frame_batch 259 | img_batch, mel_batch, frame_batch = [], [], [] 260 | 261 | def main(): 262 | if args.hq_output: 263 | if not os.path.exists('hq_temp'): 264 | os.mkdir('hq_temp') 265 | 266 | # ffmpeg preset for HQ processing 267 | preset='medium' 268 | 269 | blend = args.blending/10 270 | 271 | static_face_mask = np.zeros((224,224), dtype=np.uint8) 272 | static_face_mask = cv2.ellipse(static_face_mask, (112,162), (62,54),0,0,360,(255,255,255), -1) 273 | static_face_mask = cv2.ellipse(static_face_mask, (112,122), (46,23),0,0,360,(0,0,0), -1) 274 | static_face_mask = cv2.resize(static_face_mask,(256,256)) 275 | 276 | static_face_mask = cv2.rectangle(static_face_mask, (0,246), (246,246),(0,0,0), -1) 277 | static_face_mask = cv2.cvtColor(static_face_mask, cv2.COLOR_GRAY2RGB)/255 278 | static_face_mask = cv2.GaussianBlur(static_face_mask,(19,19),cv2.BORDER_DEFAULT) 279 | 280 | sub_face_mask = np.zeros((256,256), dtype=np.uint8) 281 | 282 | #if args.face_mode == 0: 283 | # sub_face_mask = cv2.rectangle(sub_face_mask, (62, 65 - padY), (194, 241 - padY), (255, 255, 255), -1) #0 284 | #else: 285 | # sub_face_mask = cv2.rectangle(sub_face_mask, (42, 65 - padY), (214, 241 - padY), (255, 255, 255), -1) #1 286 | 287 | sub_face_mask = cv2.rectangle(sub_face_mask, (42, 65 - padY), (214, 249), (255, 255, 255), -1) #1 288 | sub_face_mask = cv2.GaussianBlur(sub_face_mask.astype(np.uint8),(29,29),cv2.BORDER_DEFAULT) 289 | sub_face_mask = cv2.cvtColor(sub_face_mask, cv2.COLOR_GRAY2RGB) 290 | sub_face_mask = sub_face_mask/255 291 | 292 | im = cv2.imread(args.face) 293 | 294 | if not os.path.isfile(args.face): 295 | raise ValueError('--face argument must be a valid path to video/image file') 296 | 297 | elif args.face.split('.')[1] in ['jpg', 'png', 'jpeg', 'bmp']: 298 | orig_frame = cv2.imread(args.face) 299 | orig_frame = cv2.resize(orig_frame, (orig_frame.shape[1]//args.resize_factor, orig_frame.shape[0]//args.resize_factor)) 300 | orig_frames = [orig_frame] 301 | fps = args.fps 302 | 303 | # crop final: 304 | h, w = orig_frame.shape[:-1] 305 | roi = cv2.selectROI("Crop final video", orig_frame, showCrosshair=False) 306 | if roi == (0,0,0,0):roi = (0,0,w,h) 307 | cropped_roi = orig_frame[int(roi[1]):int(roi[1]+roi[3]), int(roi[0]):int(roi[0]+roi[2])] 308 | cv2.destroyAllWindows() 309 | full_frames = [cropped_roi] 310 | orig_h, orig_w = cropped_roi.shape[:-1] 311 | 312 | # select specific face: 313 | target_id = select_specific_face(detector, cropped_roi, 256, crop_scale=1) 314 | 315 | else: 316 | video_stream = cv2.VideoCapture(args.face) 317 | fps = video_stream.get(cv2.CAP_PROP_FPS) 318 | video_stream.set(1,args.cut_in) 319 | 320 | print('Reading video frames...') 321 | 322 | # cut to input/putput position: 323 | if args.cut_out == 0: 324 | args.cut_out = int(video_stream.get(cv2.CAP_PROP_FRAME_COUNT)) 325 | 326 | duration = int(video_stream.get(cv2.CAP_PROP_FRAME_COUNT)) - args.cut_in 327 | new_duration = args.cut_out - args.cut_in 328 | 329 | if args.static: 330 | new_duration = 1 331 | 332 | video_stream.set(1,args.cut_in) 333 | 334 | # read frames and crop roi: 335 | full_frames = [] 336 | orig_frames = [] 337 | 338 | for l in range(new_duration): 339 | still_reading, frame = video_stream.read() 340 | 341 | if not still_reading: 342 | video_stream.release() 343 | break 344 | 345 | if args.resize_factor > 1: 346 | frame = cv2.resize(frame, 
(frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor)) 347 | 348 | # crop first frame: 349 | if l == 0: 350 | h, w = frame.shape[:-1] 351 | roi = cv2.selectROI("Crop final video", frame, showCrosshair=False) 352 | if roi == (0,0,0,0):roi = (0,0,w,h) 353 | 354 | cropped_roi = frame[int(roi[1]):int(roi[1]+roi[3]), int(roi[0]):int(roi[0]+roi[2])] 355 | cv2.destroyAllWindows() 356 | os.system('cls') 357 | 358 | # select_specific_face: 359 | target_id = select_specific_face(detector, cropped_roi, 256, crop_scale=1) 360 | orig_h, orig_w = cropped_roi.shape[:-1] 361 | print("Reading frames....") 362 | print(f'\r{l}', end=' ', flush=True) 363 | 364 | # crop all frames: 365 | cropped_roi = frame[int(roi[1]):int(roi[1]+roi[3]), int(roi[0]):int(roi[0]+roi[2])] 366 | full_frames.append(cropped_roi) 367 | orig_frames.append(cropped_roi) 368 | 369 | # memory usage raw video: 370 | memory_usage_bytes = sum(frame.nbytes for frame in full_frames) 371 | memory_usage_mb = memory_usage_bytes / (1024**2) 372 | 373 | print ("Number of frames used for inference: " + str(len(full_frames)) + " / ~ " + str(int(memory_usage_mb)) + " mb memory usage") 374 | 375 | 376 | # convert input audio to wav anyway: 377 | print('Extracting raw audio...') 378 | subprocess.run(['ffmpeg', '-y', '-i', args.audio, '-ac', '1', '-strict', '-2', 'temp/temp.wav']) 379 | 380 | os.system('cls') 381 | print('Raw audio extracted') 382 | 383 | # denoise extracted audio: 384 | if args.denoise: 385 | print('Denoising audio...') 386 | wav, sr = librosa.load('temp/temp.wav', sr=44100, mono=True) 387 | wav_denoised, new_sr = denoiser.denoise(wav, sr, batch_process_chunks=False) 388 | write('temp/temp.wav', new_sr, (wav_denoised * 32767).astype(np.int16)) 389 | 390 | 391 | wav = audio.load_wav('temp/temp.wav', 16000) 392 | mel = audio.melspectrogram(wav) 393 | 394 | if np.isnan(mel.reshape(-1)).sum() > 0: 395 | raise ValueError('Mel contains nan! Using a TTS voice? 
Add a small epsilon noise to the wav file and try again') 396 | 397 | mel_chunks = [] 398 | mel_idx_multiplier = 80./fps 399 | i = 0 400 | while 1: 401 | start_idx = int(i * mel_idx_multiplier) 402 | if start_idx + mel_step_size > len(mel[0]): 403 | mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:]) 404 | break 405 | mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size]) 406 | i += 1 407 | 408 | print("Length of mel chunks: {}".format(len(mel_chunks))) 409 | 410 | full_frames = full_frames[:len(mel_chunks)] 411 | 412 | # face detection: 413 | aligned_faces, sub_faces, matrix, no_face = face_detect(full_frames, target_id) 414 | 415 | if args.pingpong: 416 | orig_frames = orig_frames + orig_frames[::-1] 417 | full_frames = full_frames + full_frames[::-1] 418 | aligned_faces = aligned_faces + aligned_faces[::-1] 419 | sub_faces = sub_faces + sub_faces[::-1] 420 | matrix = matrix + matrix[::-1] 421 | no_face = no_face + no_face[::-1] 422 | 423 | # datagen: 424 | gen = datagen(sub_faces.copy(), mel_chunks) 425 | 426 | fc = 0 427 | 428 | model = load_model(device) 429 | 430 | frame_h, frame_w = full_frames[0].shape[:-1] 431 | 432 | out = cv2.VideoWriter('temp/temp.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (orig_w, orig_h)) 433 | 434 | os.system('cls') 435 | print('Running on ' + onnxruntime.get_device()) 436 | print ('Checkpoint: ' + args.checkpoint_path) 437 | print ('Resize factor: ' + str(args.resize_factor)) 438 | if args.pingpong: print ('Use pingpong') 439 | if args.enhancer != 'none': print ('Use ' + args.enhancer) 440 | if args.face_mask: print ('Use face mask') 441 | if args.face_occluder: print ('Use occlusion mask') 442 | print ('') 443 | 444 | # fade in/out 445 | fade_in = 11 446 | total_length = int(np.ceil(float(len(mel_chunks)))) 447 | fade_out = total_length - 11 448 | bright_in = 0 449 | bright_out = 0 450 | 451 | for i, (img_batch, mel_batch, frames) in enumerate(tqdm(gen, total=int(np.ceil(float(len(mel_chunks)))))): 452 | 453 | if fc == (len(full_frames)): 454 | fc = 0 455 | 456 | face_err = no_face[fc] 457 | 458 | img_batch = img_batch.transpose((0, 3, 1, 2)).astype(np.float32) 459 | mel_batch = mel_batch.transpose((0, 3, 1, 2)).astype(np.float32) 460 | 461 | # wav2lip onnx inference: 462 | pred = model.run(None,{'mel_spectrogram':mel_batch, 'video_frames':img_batch})[0][0] 463 | 464 | pred = pred.transpose(1, 2, 0)*255 465 | pred = pred.astype(np.uint8) 466 | pred = pred.reshape((1, args.img_size, args.img_size, 3)) 467 | 468 | mat = matrix[fc] 469 | mat_rev = cv2.invertAffineTransform(mat) 470 | 471 | aligned_face = aligned_faces[fc] 472 | aligned_face_orig = aligned_face.copy() 473 | p_aligned = aligned_face.copy() 474 | 475 | full_frame = full_frames[fc] 476 | 477 | final = orig_frames[fc] 478 | 479 | for p, f in zip(pred, frames): 480 | 481 | if not args.static: fc = fc + 1 482 | 483 | # crop mode: 484 | if args.face_mode == 0: 485 | p = cv2.resize(p,(132,176)) 486 | else: 487 | p = cv2.resize(p,(172,176)) 488 | 489 | if args.face_mode == 0: 490 | p_aligned[65-(padY):241-(padY),62:194] = p 491 | else: 492 | p_aligned[65-(padY):241-(padY),42:214] = p 493 | 494 | aligned_face = (sub_face_mask * p_aligned + (1 - sub_face_mask) * aligned_face_orig).astype(np.uint8) 495 | 496 | if face_err != 0: 497 | res = full_frame 498 | face_err = 0 499 | 500 | else: 501 | 502 | # face enhancers: 503 | if args.enhancer != 'none': 504 | aligned_face_enhanced = enhancer.enhance(aligned_face) 505 | aligned_face_enhanced = cv2.resize(aligned_face_enhanced,(256,256)) 506 | 
aligned_face = cv2.addWeighted(aligned_face_enhanced.astype(np.float32),blend, aligned_face.astype(np.float32), 1.-blend, 0.0) 507 | 508 | # mask options: 509 | if args.face_mask: 510 | seg_mask = masker.mask(aligned_face) 511 | #seg_mask[seg_mask > 32] = 255 512 | seg_mask = cv2.blur(seg_mask,(5,5)) 513 | seg_mask = seg_mask /255 514 | mask = cv2.warpAffine(seg_mask, mat_rev,(frame_w, frame_h)) 515 | 516 | if args.face_occluder: 517 | # handle specific face not detected: 518 | try: 519 | seg_mask = occluder.mask(aligned_face_orig) 520 | seg_mask = cv2.cvtColor(seg_mask, cv2.COLOR_GRAY2RGB) 521 | mask = cv2.warpAffine(seg_mask, mat_rev,(frame_w, frame_h)) 522 | except: 523 | seg_mask = occluder.mask(aligned_face) #xseg 524 | seg_mask = cv2.cvtColor(seg_mask, cv2.COLOR_GRAY2RGB) 525 | mask = cv2.warpAffine(seg_mask, mat_rev,(frame_w, frame_h)) 526 | 527 | if not args.face_mask and not args.face_occluder: 528 | mask = cv2.warpAffine(static_face_mask, mat_rev,(frame_w, frame_h)) 529 | 530 | if args.sharpen: 531 | #smoothed = cv2.GaussianBlur(aligned_face, (9, 9), 10) 532 | #aligned_face = cv2.addWeighted(aligned_face, 1.5, smoothed, -0.5, 0) 533 | #aligned_face = np.clip(aligned_face, 0, 255).astype(np.uint8) 534 | aligned_face = cv2.detailEnhance(aligned_face, sigma_s=1.3, sigma_r=0.15) 535 | 536 | #cv2.imshow("D",aligned_face) 537 | 538 | dealigned_face = cv2.warpAffine(aligned_face, mat_rev, (frame_w, frame_h)) 539 | #cv2.imshow("mask",mask) 540 | #cv2.waitKey(1) 541 | #mask = cv2.warpAffine(static_face_mask, mat_rev,(frame_w, frame_h)) 542 | 543 | res = (mask * dealigned_face + (1 - mask) * full_frame).astype(np.uint8) 544 | 545 | final = res 546 | 547 | if args.frame_enhancer: 548 | final = frame_enhancer.enhance(final) 549 | final = cv2.resize(final,(orig_w, orig_h), interpolation=cv2.INTER_AREA) 550 | 551 | # fade in/out: 552 | if i < 11 and args.fade: 553 | final = cv2.convertScaleAbs(final, alpha=0 + (0.1 * bright_in), beta=0) 554 | bright_in = bright_in + 1 555 | if i > fade_out and args.fade: 556 | final = cv2.convertScaleAbs(final, alpha=1 - (0.1 * bright_out), beta=0) 557 | bright_out = bright_out + 1 558 | 559 | if args.hq_output: 560 | cv2.imwrite(os.path.join('./hq_temp', '{:0>7d}.png'.format(i)), final) 561 | else: 562 | out.write(final) 563 | 564 | if args.preview: 565 | cv2.imshow("Result - press ESC to stop and save",final) 566 | k = cv2.waitKey(1) 567 | if k == 27: 568 | cv2.destroyAllWindows() 569 | out.release() 570 | break 571 | 572 | if k == ord('s'): 573 | if args.sharpen == False: 574 | args.sharpen = True 575 | else: 576 | args.sharpen = False 577 | print ('') 578 | print ("Sharpen = " + str(args.sharpen)) 579 | 580 | out.release() 581 | 582 | if args.hq_output: 583 | command = 'ffmpeg.exe -y -i ' + '"' + args.audio + '"' + ' -r ' + str(fps) + ' -f image2 -i ' + '"' + './hq_temp/' + '%07d.png' + '"' + ' -shortest -vcodec libx264 -pix_fmt yuv420p -crf 5 -preset slow -acodec libmp3lame -ac 2 -ar 44100 -ab 128000 -strict -2 ' + '"' + args.outfile + '"' 584 | else: 585 | command = 'ffmpeg.exe -y -i ' + '"' + args.audio + '"' + ' -i ' + 'temp/temp.mp4' + ' -shortest -vcodec copy -acodec libmp3lame -ac 2 -ar 44100 -ab 128000 -strict -2 ' + '"' + args.outfile + '"' 586 | 587 | subprocess.call(command, shell=platform.system() != 'Windows') 588 | 589 | if os.path.exists('temp/temp.mp4'): 590 | os.remove('temp/temp.mp4') 591 | if os.path.exists('temp/temp.wav'): 592 | os.remove('temp/temp.wav') 593 | if os.path.exists('hq_temp'): 594 | shutil.rmtree('hq_temp') 595 | 596 | if 
__name__ == '__main__': 597 | main() 598 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.8.0.76 2 | numpy 3 | tqdm 4 | librosa 5 | numba 6 | imutils 7 | # insightface==0.7.3 8 | onnxruntime==1.14.1 9 | -------------------------------------------------------------------------------- /resemble_denoiser/place models here.txt: -------------------------------------------------------------------------------- 1 | place denoiser_fp16.onnx models here -------------------------------------------------------------------------------- /resemble_denoiser/resemble_denoiser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import onnxruntime 3 | from librosa import stft, istft 4 | 5 | class ResembleDenoiser: 6 | def __init__(self, model_path='denoiser_fp16.onnx', device='cpu'): 7 | self.stft_hop_length = 420 8 | self.win_length = self.n_fft = 4 * self.stft_hop_length 9 | 10 | session_options = onnxruntime.SessionOptions() 11 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 12 | session_options.inter_op_num_threads = 4 13 | session_options.intra_op_num_threads = 4 14 | session_options.log_severity_level = 4 15 | 16 | providers = ["CPUExecutionProvider"] 17 | if device == 'cuda': 18 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}), "CPUExecutionProvider"] 19 | 20 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 21 | 22 | 23 | def _stft(self, x): 24 | s = stft( 25 | x, window='hann', win_length=self.win_length, n_fft=self.n_fft, 26 | hop_length=self.stft_hop_length, center=True, pad_mode='reflect' 27 | ) 28 | s = s[..., :-1] 29 | mag = np.abs(s) 30 | phi = np.angle(s) 31 | return mag, np.cos(phi), np.sin(phi) 32 | 33 | def _istft(self, mag, cos, sin): 34 | real = mag * cos 35 | imag = mag * sin 36 | s = real + imag * 1.0j 37 | s = np.pad(s, ((0, 0), (0, 0), (0, 1)), mode='edge') 38 | x = istft( 39 | s, window='hann', win_length=self.win_length, 40 | hop_length=self.stft_hop_length, n_fft=self.n_fft 41 | ) 42 | return x 43 | 44 | def _model_infer(self, wav): 45 | padded_wav = np.pad(wav, ((0, 0), (0, 441))) 46 | mag, cos, sin = self._stft(padded_wav) 47 | 48 | ort_inputs = { 49 | "mag": mag, 50 | "cos": cos, 51 | "sin": sin, 52 | } 53 | sep_mag, sep_cos, sep_sin = self.session.run(None, ort_inputs) 54 | out = self._istft(sep_mag, sep_cos, sep_sin) 55 | return out[:wav.shape[-1]] 56 | 57 | def denoise(self, wav: np.ndarray, sample_rate: int, batch_process_chunks=False): 58 | assert wav.ndim == 1, 'Input should be 1D (mono) wav' 59 | 60 | chunk_length = int(44100 * 30) 61 | hop_length = chunk_length 62 | num_chunks = 1 + (wav.shape[-1] - 1) // hop_length 63 | n_pad = (num_chunks - wav.shape[-1] % num_chunks) % num_chunks 64 | wav = np.pad(wav, (0, n_pad)) 65 | 66 | chunks = np.reshape(wav, (num_chunks, -1)) 67 | abs_max = np.clip(np.max(np.abs(chunks), axis=-1, keepdims=True), 1e-7, None) 68 | chunks /= abs_max 69 | 70 | if batch_process_chunks: 71 | res_chunks = self._model_infer(chunks) 72 | else: 73 | res_chunks = np.array([ 74 | self._model_infer(c[None]) for c in chunks 75 | ]).squeeze(axis=1) 76 | 77 | res_chunks *= abs_max 78 | res = np.reshape(res_chunks, (-1)) 79 | return res[:wav.shape[-1]], 44100 80 | 
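A minimal standalone sketch of using the denoiser above, mirroring how inference_onnxModel.py calls it (44.1 kHz mono input, denoised float audio plus sample rate returned); the input/output wav names are placeholders and the model path is the one the inference script expects:

import numpy as np
import librosa
from scipy.io.wavfile import write
from resemble_denoiser.resemble_denoiser import ResembleDenoiser

# load mono audio at 44100 Hz; denoise() asserts a 1D wav
wav, sr = librosa.load('input.wav', sr=44100, mono=True)

denoiser = ResembleDenoiser(model_path='resemble_denoiser/denoiser.onnx', device='cpu')
wav_denoised, out_sr = denoiser.denoise(wav, sr, batch_process_chunks=False)

# write back as 16-bit PCM, the same conversion the inference script uses
write('denoised.wav', out_sr, (wav_denoised * 32767).astype(np.int16))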
-------------------------------------------------------------------------------- /setup_new.txt: -------------------------------------------------------------------------------- 1 | wav2lip_onnx 2 | 3 | clone repo 4 | 5 | conda create -n ENV_NAME python==3.11 6 | conda activate ENV_NAME 7 | cd c:\env\env_path 8 | 9 | pip install -r requirements.txt 10 | 11 | rem not needed anymore: 12 | rem pip install insightface>=0.7.3 never worked for me, but the downloaded wheel did: 13 | rem pip install insightface-0.7.3-cp311-cp311-win_amd64.whl 14 | 15 | pip install onnx 16 | pip install onnxruntime==1.14.1 17 | 18 | pip install opencv-contrib-python 19 | 20 | ############# GPU ############### 21 | 22 | conda install -c conda-forge cudatoolkit=11.2 cudnn=8.1.0 23 | pip uninstall onnxruntime 24 | pip install onnxruntime-gpu==1.14.1 -------------------------------------------------------------------------------- /utils/face_alignment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def align_crop(img, landmark, size): 6 | template_ffhq = np.array( 7 | [ 8 | [192.98138, 239.94708], 9 | [318.90277, 240.19366], 10 | [256.63416, 314.01935], 11 | [201.26117, 371.41043], 12 | [313.08905, 371.15118] 13 | ]) 14 | 15 | template_ffhq *= (512 / size) 16 | matrix = cv2.estimateAffinePartial2D(landmark, template_ffhq, method=cv2.RANSAC, ransacReprojThreshold=100)[0] 17 | warped = cv2.warpAffine(img, matrix, (size, size), borderMode=cv2.BORDER_REPLICATE) 18 | return warped, matrix 19 | 20 | def get_cropped_head(img, landmark, scale=1.4, size=512): 21 | center = np.mean(landmark, axis=0) 22 | landmark = center + (landmark - center) * scale 23 | return align_crop(img, landmark, size) 24 | 25 | 26 | # -------------------------------------------------- 27 | 28 | def align_crop_256(img, landmark, size): 29 | template_ffhq = np.array( 30 | [ 31 | [192.98138, 239.94708], 32 | [318.90277, 240.19366], 33 | [256.63416, 314.01935], 34 | [201.26117, 371.41043], 35 | [313.08905, 371.15118] 36 | ]) 37 | 38 | template_ffhq = template_ffhq /2 39 | template_ffhq *= (256 / size) 40 | matrix = cv2.estimateAffinePartial2D(landmark, template_ffhq, method=cv2.RANSAC, ransacReprojThreshold=100)[0] 41 | warped = cv2.warpAffine(img, matrix, (size, size), borderMode=cv2.BORDER_REPLICATE) 42 | return warped, matrix 43 | 44 | 45 | def get_cropped_head_256(img, landmark, scale=1.4, size=512): 46 | center = np.mean(landmark, axis=0) 47 | landmark = center + (landmark - center) * scale 48 | return align_crop_256(img, landmark, size) 49 | 50 | def get_cropped(img, landmark, scale=1.4, size=512, bbox_expansion_factor=3): 51 | # Scale landmarks around the center 52 | center = np.mean(landmark, axis=0) 53 | scaled_landmark = center + (landmark - center) * scale 54 | 55 | # Calculate the bounding box 56 | min_coords = np.min(scaled_landmark, axis=0) 57 | max_coords = np.max(scaled_landmark, axis=0) 58 | 59 | width, height = max_coords - min_coords 60 | max_dim = max(width, height) 61 | 62 | # Expand the bounding box by the specified factor 63 | expanded_dim = max_dim * bbox_expansion_factor 64 | 65 | # Calculate the expanded bounding box coordinates 66 | center_x, center_y = (min_coords + max_coords) / 2 67 | half_expanded_dim = expanded_dim / 2 68 | min_x = max(int(center_x - half_expanded_dim), 0) 69 | min_y = max(int(center_y - half_expanded_dim), 0) 70 | max_x = min(int(center_x + half_expanded_dim), img.shape[1]) 71 | max_y = min(int(center_y + half_expanded_dim),
img.shape[0]) 72 | 73 | # Crop and resize the image 74 | cropped_img = img[min_y:max_y, min_x:max_x] 75 | cropped_img_resized = cv2.resize(cropped_img, (size, size)) 76 | 77 | return cropped_img_resized 78 | 79 | -------------------------------------------------------------------------------- /utils/retinaface.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Organization : insightface.ai 3 | # @Author : Jia Guo 4 | # @Time : 2021-09-18 5 | # @Function : 6 | 7 | from __future__ import division 8 | import datetime 9 | import numpy as np 10 | import onnxruntime 11 | import os 12 | import cv2 13 | import sys 14 | 15 | def softmax(z): 16 | assert len(z.shape) == 2 17 | s = np.max(z, axis=1) 18 | s = s[:, np.newaxis] # necessary step to do broadcasting 19 | e_x = np.exp(z - s) 20 | div = np.sum(e_x, axis=1) 21 | div = div[:, np.newaxis] # dito 22 | return e_x / div 23 | 24 | def distance2bbox(points, distance, max_shape=None): 25 | """Decode distance prediction to bounding box. 26 | 27 | Args: 28 | points (Tensor): Shape (n, 2), [x, y]. 29 | distance (Tensor): Distance from the given point to 4 30 | boundaries (left, top, right, bottom). 31 | max_shape (tuple): Shape of the image. 32 | 33 | Returns: 34 | Tensor: Decoded bboxes. 35 | """ 36 | x1 = points[:, 0] - distance[:, 0] 37 | y1 = points[:, 1] - distance[:, 1] 38 | x2 = points[:, 0] + distance[:, 2] 39 | y2 = points[:, 1] + distance[:, 3] 40 | if max_shape is not None: 41 | x1 = x1.clamp(min=0, max=max_shape[1]) 42 | y1 = y1.clamp(min=0, max=max_shape[0]) 43 | x2 = x2.clamp(min=0, max=max_shape[1]) 44 | y2 = y2.clamp(min=0, max=max_shape[0]) 45 | return np.stack([x1, y1, x2, y2], axis=-1) 46 | 47 | def distance2kps(points, distance, max_shape=None): 48 | """Decode distance prediction to bounding box. 49 | 50 | Args: 51 | points (Tensor): Shape (n, 2), [x, y]. 52 | distance (Tensor): Distance from the given point to 4 53 | boundaries (left, top, right, bottom). 54 | max_shape (tuple): Shape of the image. 55 | 56 | Returns: 57 | Tensor: Decoded bboxes. 
58 | """ 59 | preds = [] 60 | for i in range(0, distance.shape[1], 2): 61 | px = points[:, i%2] + distance[:, i] 62 | py = points[:, i%2+1] + distance[:, i+1] 63 | if max_shape is not None: 64 | px = px.clamp(min=0, max=max_shape[1]) 65 | py = py.clamp(min=0, max=max_shape[0]) 66 | preds.append(px) 67 | preds.append(py) 68 | return np.stack(preds, axis=-1) 69 | 70 | class RetinaFace: 71 | def __init__(self, model_file=None, provider=["CPUExecutionProvider"], session_options=None): 72 | self.model_file = model_file 73 | self.session_options = session_options 74 | if self.session_options is None: 75 | self.session_options = onnxruntime.SessionOptions() 76 | self.session = onnxruntime.InferenceSession(self.model_file, providers=provider, sess_options=self.session_options) 77 | self.center_cache = {} 78 | self.nms_thresh = 0.4 79 | self.det_thresh = 0.5 80 | self._init_vars() 81 | 82 | def _init_vars(self): 83 | input_cfg = self.session.get_inputs()[0] 84 | input_shape = input_cfg.shape 85 | #print(input_shape) 86 | if isinstance(input_shape[2], str): 87 | self.input_size = None 88 | else: 89 | self.input_size = tuple(input_shape[2:4][::-1]) 90 | #print('image_size:', self.image_size) 91 | input_name = input_cfg.name 92 | self.input_shape = input_shape 93 | outputs = self.session.get_outputs() 94 | output_names = [] 95 | for o in outputs: 96 | output_names.append(o.name) 97 | self.input_name = input_name 98 | self.output_names = output_names 99 | self.input_mean = 127.5 100 | self.input_std = 128.0 101 | #print(self.output_names) 102 | #assert len(outputs)==10 or len(outputs)==15 103 | self.use_kps = False 104 | self._anchor_ratio = 1.0 105 | self._num_anchors = 1 106 | if len(outputs)==6: 107 | self.fmc = 3 108 | self._feat_stride_fpn = [8, 16, 32] 109 | self._num_anchors = 2 110 | elif len(outputs)==9: 111 | self.fmc = 3 112 | self._feat_stride_fpn = [8, 16, 32] 113 | self._num_anchors = 2 114 | self.use_kps = True 115 | elif len(outputs)==10: 116 | self.fmc = 5 117 | self._feat_stride_fpn = [8, 16, 32, 64, 128] 118 | self._num_anchors = 1 119 | elif len(outputs)==15: 120 | self.fmc = 5 121 | self._feat_stride_fpn = [8, 16, 32, 64, 128] 122 | self._num_anchors = 1 123 | self.use_kps = True 124 | 125 | def prepare(self, **kwargs): 126 | nms_thresh = kwargs.get('nms_thresh', None) 127 | if nms_thresh is not None: 128 | self.nms_thresh = nms_thresh 129 | det_thresh = kwargs.get('det_thresh', None) 130 | if det_thresh is not None: 131 | self.det_thresh = det_thresh 132 | input_size = kwargs.get('input_size', None) 133 | if input_size is not None: 134 | if self.input_size is not None: 135 | print('warning: det_size is already set in detection model, ignore') 136 | else: 137 | self.input_size = input_size 138 | 139 | def forward(self, img, threshold): 140 | scores_list = [] 141 | bboxes_list = [] 142 | kpss_list = [] 143 | input_size = tuple(img.shape[0:2][::-1]) 144 | blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) 145 | net_outs = self.session.run(self.output_names, {self.input_name : blob}) 146 | 147 | input_height = blob.shape[2] 148 | input_width = blob.shape[3] 149 | fmc = self.fmc 150 | for idx, stride in enumerate(self._feat_stride_fpn): 151 | scores = net_outs[idx] 152 | bbox_preds = net_outs[idx+fmc] 153 | bbox_preds = bbox_preds * stride 154 | if self.use_kps: 155 | kps_preds = net_outs[idx+fmc*2] * stride 156 | height = input_height // stride 157 | width = input_width // stride 158 | K = height * width 
159 | key = (height, width, stride) 160 | if key in self.center_cache: 161 | anchor_centers = self.center_cache[key] 162 | else: 163 | anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) 164 | anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) 165 | if self._num_anchors>1: 166 | anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) 167 | if len(self.center_cache)<100: 168 | self.center_cache[key] = anchor_centers 169 | 170 | pos_inds = np.where(scores>=threshold)[0] 171 | bboxes = distance2bbox(anchor_centers, bbox_preds) 172 | pos_scores = scores[pos_inds] 173 | pos_bboxes = bboxes[pos_inds] 174 | scores_list.append(pos_scores) 175 | bboxes_list.append(pos_bboxes) 176 | if self.use_kps: 177 | kpss = distance2kps(anchor_centers, kps_preds) 178 | kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) 179 | pos_kpss = kpss[pos_inds] 180 | kpss_list.append(pos_kpss) 181 | return scores_list, bboxes_list, kpss_list 182 | 183 | def detect(self, img, input_size = (640,640), max_num=0, metric='default', det_thresh=0.5): 184 | assert input_size is not None or self.input_size is not None 185 | input_size = self.input_size if input_size is None else input_size 186 | 187 | im_ratio = float(img.shape[0]) / img.shape[1] 188 | model_ratio = float(input_size[1]) / input_size[0] 189 | if im_ratio>model_ratio: 190 | new_height = input_size[1] 191 | new_width = int(new_height / im_ratio) 192 | else: 193 | new_width = input_size[0] 194 | new_height = int(new_width * im_ratio) 195 | det_scale = float(new_height) / img.shape[0] 196 | resized_img = cv2.resize(img, (new_width, new_height)) 197 | det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 ) 198 | det_img[:new_height, :new_width, :] = resized_img 199 | 200 | scores_list, bboxes_list, kpss_list = self.forward(det_img, det_thresh) 201 | 202 | scores = np.vstack(scores_list) 203 | scores_ravel = scores.ravel() 204 | order = scores_ravel.argsort()[::-1] 205 | bboxes = np.vstack(bboxes_list) / det_scale 206 | if self.use_kps: 207 | kpss = np.vstack(kpss_list) / det_scale 208 | pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) 209 | pre_det = pre_det[order, :] 210 | keep = self.nms(pre_det) 211 | det = pre_det[keep, :] 212 | if self.use_kps: 213 | kpss = kpss[order,:,:] 214 | kpss = kpss[keep,:,:] 215 | else: 216 | kpss = None 217 | if max_num > 0 and det.shape[0] > max_num: 218 | area = (det[:, 2] - det[:, 0]) * (det[:, 3] - 219 | det[:, 1]) 220 | img_center = img.shape[0] // 2, img.shape[1] // 2 221 | offsets = np.vstack([ 222 | (det[:, 0] + det[:, 2]) / 2 - img_center[1], 223 | (det[:, 1] + det[:, 3]) / 2 - img_center[0] 224 | ]) 225 | offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) 226 | if metric=='max': 227 | values = area 228 | else: 229 | values = area - offset_dist_squared * 2.0 # some extra weight on the centering 230 | bindex = np.argsort( 231 | values)[::-1] # some extra weight on the centering 232 | bindex = bindex[0:max_num] 233 | det = det[bindex, :] 234 | if kpss is not None: 235 | kpss = kpss[bindex, :] 236 | return det, kpss 237 | 238 | def nms(self, dets): 239 | thresh = self.nms_thresh 240 | x1 = dets[:, 0] 241 | y1 = dets[:, 1] 242 | x2 = dets[:, 2] 243 | y2 = dets[:, 3] 244 | scores = dets[:, 4] 245 | 246 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 247 | order = scores.argsort()[::-1] 248 | 249 | keep = [] 250 | while order.size > 0: 251 | i = order[0] 252 | keep.append(i) 253 | xx1 = np.maximum(x1[i], x1[order[1:]]) 254 | yy1 = 
np.maximum(y1[i], y1[order[1:]]) 255 | xx2 = np.minimum(x2[i], x2[order[1:]]) 256 | yy2 = np.minimum(y2[i], y2[order[1:]]) 257 | 258 | w = np.maximum(0.0, xx2 - xx1 + 1) 259 | h = np.maximum(0.0, yy2 - yy1 + 1) 260 | inter = w * h 261 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 262 | 263 | inds = np.where(ovr <= thresh)[0] 264 | order = order[inds + 1] 265 | 266 | return keep 267 | -------------------------------------------------------------------------------- /utils/scrfd_2.5g_bnkps.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instant-high/wav2lip-onnx-HQ/79fc6261a1c36ca9abf946cc5260a8aaeeb3746f/utils/scrfd_2.5g_bnkps.onnx -------------------------------------------------------------------------------- /xseg/place xseg onnx model here.txt: -------------------------------------------------------------------------------- 1 | place xseg onnx model here -------------------------------------------------------------------------------- /xseg/xseg.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import onnxruntime 4 | 5 | class MASK: 6 | def __init__(self, model_path="xseg.onnx", device='cpu'): 7 | session_options = onnxruntime.SessionOptions() 8 | session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 9 | providers = ["CPUExecutionProvider"] 10 | if device == 'cuda': 11 | providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}),"CPUExecutionProvider"] 12 | self.session = onnxruntime.InferenceSession(model_path, sess_options=session_options, providers=providers) 13 | 14 | 15 | def mask(self, img): 16 | 17 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 18 | img = cv2.resize(img, (256, 256)) 19 | img = img.astype(np.float32) 20 | img = img / 255 21 | img = np.expand_dims(img, axis=0).astype(np.float32) 22 | 23 | result = self.session.run(None, {(self.session.get_inputs()[0].name):img})[0][0] 24 | 25 | return result 26 | --------------------------------------------------------------------------------
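A minimal sketch of using the occlusion mask on its own, following the same steps inference_onnxModel.py performs (mask the 256x256 aligned face, expand to 3 channels, warp back with the inverted alignment matrix, then blend); the frame, aligned crop and alignment matrix below are placeholders rather than a real detection result:

import cv2
import numpy as np
from xseg.xseg import MASK

occluder = MASK(model_path="xseg/xseg.onnx", device='cpu')

frame = cv2.imread('frame.jpg')                       # placeholder full frame
aligned_face = cv2.resize(frame, (256, 256))          # placeholder for a real aligned 256x256 crop
mat = np.float32([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])  # placeholder 2x3 alignment matrix
mat_rev = cv2.invertAffineTransform(mat)

seg_mask = occluder.mask(aligned_face)                # float mask, used directly as a blend weight
seg_mask = cv2.cvtColor(seg_mask, cv2.COLOR_GRAY2RGB)

h, w = frame.shape[:2]
mask = cv2.warpAffine(seg_mask, mat_rev, (w, h))
dealigned_face = cv2.warpAffine(aligned_face, mat_rev, (w, h))
result = (mask * dealigned_face + (1 - mask) * frame).astype(np.uint8)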