├── 1_convert_25fps.py
├── 5_create_filelist.py
├── 7_to_mel.py
├── 2_crop_video.py
├── 3_segment.py
├── 6_au_sync.py
├── 4_detection.py
├── hparams.py
├── audio.py
└── README.md
--------------------------------------------------------------------------------
/1_convert_25fps.py:
--------------------------------------------------------------------------------
import os
import concurrent.futures
import subprocess
import sys

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'full_voice')
output_video_path = os.path.join(dataset_path, presenter_name, 'full_voice_25fps')
os.makedirs(output_video_path, exist_ok=True)
source_dir = os.listdir(input_video_path)

def convert_25fps(name_video):
    video = os.path.join(input_video_path, name_video)
    new_video = os.path.join(output_video_path, name_video)
    # Resample to a constant 25 fps; quote the paths so filenames with spaces survive the shell.
    subprocess.call(f'ffmpeg -y -i "{video}" -filter:v fps=25 -b:v 50M "{new_video}"', shell=True)

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    # Consume the iterator so worker exceptions are raised instead of silently dropped.
    list(executor.map(convert_25fps, source_dir))
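To confirm the conversion actually produced 25 fps output, a quick probe with ffprobe works. A minimal sketch (the path below is hypothetical), not part of the pipeline:

```python
import subprocess

def get_fps(path):
    # ffprobe prints the video stream's frame rate as a fraction such as "25/1".
    out = subprocess.check_output([
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_entries", "stream=r_frame_rate",
        "-of", "default=noprint_wrappers=1:nokey=1", path,
    ]).decode().strip()
    num, den = out.split("/")
    return float(num) / float(den)

print(get_fps("your-folder-dataset/presenter/full_voice_25fps/video1.mp4"))  # expect 25.0
```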
--------------------------------------------------------------------------------
/5_create_filelist.py:
--------------------------------------------------------------------------------
import os
import sys

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
token = sys.argv[3]

path = os.path.join(dataset_path, presenter_name)
output_path = os.path.join(path, f'filelist_{token}')
os.makedirs(output_path, exist_ok=True)
source_path = os.path.join(path, f'output_{token}')
data_list = os.listdir(source_path)

results = []
errors = []
for d in data_list:
    d_path = os.path.join(source_path, d)
    # Keep only the per-segment directories produced by 4_detection.py.
    train_list = [t for t in os.listdir(d_path) if os.path.isdir(os.path.join(d_path, t))]
    for t in train_list:
        t_path = os.path.join(d_path, t)
        # A segment is usable only if its audio track was copied successfully.
        if os.path.isfile(os.path.join(t_path, "audio.wav")):
            results.append(t_path)
        else:
            errors.append(t_path)

with open(f"{output_path}/raw_filelist.txt", "w") as f:
    for line in results:
        f.write(line + "\n")

with open(f"{output_path}/raw_filelist_errors.txt", "w") as f:
    for line in errors:
        f.write(line + "\n")
--------------------------------------------------------------------------------
/7_to_mel.py:
--------------------------------------------------------------------------------
import os
import sys
import audio
import numpy as np

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
token = sys.argv[3]

obj_path = os.path.join(dataset_path, presenter_name)
ROOT = os.path.join(obj_path, f"output_{token}")

with open(os.path.join(obj_path, f"filelist_{token}/raw_filelist.txt"), "r") as f:
    data = [line.strip() for line in f.readlines()]
data.sort()

# start/end allow sharding the list across several runs of this script.
start = 0
end = len(data)
data = data[start:end]
print("Data", start, end, len(data))
sample_rate = 16000
errors = []

for d in data:
    try:
        mel_out_path = os.path.join(d, "mel.npy")
        # Use the length-corrected audio written by 6_au_sync.py.
        wavpath = os.path.join(d, "synced_audio.wav")
        wav = audio.load_wav(wavpath, sample_rate)
        orig_mel = audio.melspectrogram(wav).T
        with open(mel_out_path, "wb") as f:
            np.save(f, orig_mel)
    except Exception:
        print("Error", d)
        errors.append(d)

# 6_au_sync.py normally creates this temp folder; create it here too so the
# script can also run standalone.
os.makedirs(os.path.join(obj_path, f"filelist_{token}/temp"), exist_ok=True)
with open(os.path.join(obj_path, f"filelist_{token}/temp/output_data_mel_errors_{start}_{end}.txt"), "w") as f:
    for line in errors:
        f.write(line + "\n")
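As a sanity check on the result: `audio.melspectrogram` returns `(num_mels, T)` and the `.T` stores `mel.npy` time-major as `(T, num_mels)`. For a full 10-second segment at 16 kHz with `hop_size=200`, `T` should be about `160000 // 200 + 1 = 801`. A minimal sketch (the path is hypothetical):

```python
import numpy as np

mel = np.load("your-folder-dataset/presenter/output_token/video1/0_10/mel.npy")
T, n_mels = mel.shape
print(T, n_mels)          # roughly (801, 80) for a full 10 s segment
assert n_mels == 80       # hparams.num_mels
assert abs(T - 801) <= 2  # allow a frame or two of padding slack
```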
--------------------------------------------------------------------------------
/2_crop_video.py:
--------------------------------------------------------------------------------
import os
import concurrent.futures
import cv2
import dlib
import subprocess
import sys

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'full_voice_25fps')
output_video_path = os.path.join(dataset_path, presenter_name, 'videos_crop')
os.makedirs(output_video_path, exist_ok=True)
source_dir = os.listdir(input_video_path)
print(source_dir)

def crop_video(name_video):
    vid_path = os.path.join(input_video_path, name_video)
    out_path = os.path.join(output_video_path, name_video)

    detector = dlib.get_frontal_face_detector()

    # Load the video and grab its first frame; the crop box is computed once
    # from that frame and then applied to the whole clip.
    cap = cv2.VideoCapture(vid_path)
    ret, frame = cap.read()
    if not ret:
        # Return instead of exit() so one unreadable video does not kill the worker pool.
        print(f"Can't read the first frame of {vid_path}. Skipping ...")
        cap.release()
        return

    # Detect a face in the first frame.
    faces = detector(frame)
    if len(faces) > 0:
        # Bounding box of the first face detected.
        x, y, w, h = faces[0].left(), faces[0].top(), faces[0].width(), faces[0].height()
        # Grow the box into a square of side 3*h, extended upward and re-centered
        # horizontally on the face, then clamp to the frame.
        y = max(0, y - int(0.8 * w))
        h = 3 * h
        x = max(0, (2 * x + w - h) // 2)
        w = h
        # Use ffmpeg to crop the whole video with that box.
        command = f'ffmpeg -y -i "{vid_path}" -filter:v "crop={w}:{h}:{x}:{y}" -b:v 4M "{out_path}"'
        subprocess.call(command, shell=True)
    else:
        print(f"No face detected in the first frame of {vid_path}. Skipping ...")

    cap.release()

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    list(executor.map(crop_video, source_dir))
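To see what the box arithmetic above actually does, here is the same math run on toy numbers (values purely illustrative):

```python
# Hypothetical dlib detection: a 100x100 face at (300, 200) in a 1920x1080 frame.
x, y, w, h = 300, 200, 100, 100

y = max(0, y - int(0.8 * w))      # 120: extend the box upward by 0.8*w
h = 3 * h                         # 300: final crop height (and width)
x = max(0, (2 * x + w - h) // 2)  # 200: shift x so the square stays centered on the face
w = h                             # 300: force a square crop

# The face center was x + 50 = 350; the crop center is 200 + 150 = 350, so the
# face stays horizontally centered inside a 300x300 crop anchored at (200, 120).
print(x, y, w, h)  # 200 120 300 300
```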
--------------------------------------------------------------------------------
/3_segment.py:
--------------------------------------------------------------------------------
import os
import sys
import subprocess
import concurrent.futures

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'videos_crop')
output_video_path = os.path.join(dataset_path, presenter_name, 'videos_segment')
os.makedirs(output_video_path, exist_ok=True)
source_dir = os.listdir(input_video_path)

def segment(name_video):
    video_path = os.path.join(input_video_path, name_video)
    split_video_path = video_path.replace('videos_crop', 'videos_segment').replace('.mp4', '').replace('.MP4', '')
    os.makedirs(split_video_path, exist_ok=True)
    split_audio_path = video_path.replace('videos_crop', 'audios_segment').replace('.mp4', '').replace('.MP4', '')
    os.makedirs(split_audio_path, exist_ok=True)

    # Scrape the "Duration: HH:MM:SS.xx" line from ffmpeg's banner output.
    command = f"ffmpeg -nostdin -y -i {video_path} 2>&1 | grep Duration | sed 's/Duration: \\(.*\\), start/\\1/g'"
    output_terminal = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE).communicate()[0].decode('utf-8')
    hms = output_terminal.split(":")
    # Include the hours field so videos longer than an hour are measured correctly.
    duration_video = int(float(hms[0]) * 3600 + float(hms[1]) * 60 + float(hms[2]))

    # Cut into 10 s pieces; the trailing partial segment is intentionally dropped.
    segment_time = list(range(0, duration_video, 10))
    for i in range(0, len(segment_time) - 1):
        small_video_path = os.path.join(split_video_path, f'{segment_time[i]}_{segment_time[i+1]}.mp4')
        small_audio_path = os.path.join(split_audio_path, f'{segment_time[i]}_{segment_time[i+1]}.wav')
        vid_command = f"ffmpeg -nostdin -y -ss {segment_time[i]} -i {video_path} -t 10 -filter:v fps=25 -b:v 4M {small_video_path}"
        vid_status = os.system(vid_command)
        # Extract the audio from the segment itself so audio and video cuts always match.
        aud_command = f"ffmpeg -nostdin -y -i {small_video_path} -ar 16000 {small_audio_path}"
        aud_status = os.system(aud_command)
        print(small_video_path, vid_status, aud_status)

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    list(executor.map(segment, source_dir))
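Parsing ffmpeg's banner with grep/sed works but is brittle; ffprobe can report the duration directly in seconds. A sketch of that alternative (not what the script above uses):

```python
import subprocess

def probe_duration(path):
    # ffprobe prints the container duration in seconds, e.g. "754.560000".
    out = subprocess.check_output([
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", path,
    ]).decode().strip()
    return float(out)

print(int(probe_duration("videos_crop/video1.mp4")))  # whole seconds, ready for range(0, d, 10)
```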
--------------------------------------------------------------------------------
/6_au_sync.py:
--------------------------------------------------------------------------------
import os
import sys
import numpy as np
import soundfile as sf
import librosa

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
token = sys.argv[3]

obj_path = os.path.join(dataset_path, presenter_name)
ROOT = os.path.join(obj_path, f"output_{token}")
print("ROOT:", ROOT)
# start allows sharding the list across several runs of this script.
start = 0

with open(os.path.join(obj_path, f"filelist_{token}/raw_filelist.txt"), "r") as f:
    data = [line.strip() for line in f.readlines()]
data.sort()
data = data[start:]
print("Data", start, len(data))

errors = []
results = []
for p in data:
    try:
        d = os.path.join(ROOT, p)
        frames = [file for file in os.listdir(d) if ".jpg" in file]
        # The video was resampled to 25 fps, so the frame count fixes the true duration.
        vid_duration = len(frames) / 25
        vid_name = d.split("/")[-1]

        org_path = os.path.join(d, f"{vid_name}.wav")
        au_path = os.path.join(d, "audio.wav")
        synced_path = os.path.join(d, "synced_audio.wav")

        if not os.path.isfile(au_path):
            status = os.system(f"ffmpeg -nostdin -y -i {org_path} -ar 16000 {au_path}")
            if status != 0:
                errors.append(p)
                continue
        if os.path.isfile(synced_path):
            continue

        au, sr = librosa.load(au_path, sr=16000)

        # Positive extra: the audio is shorter than the video, pad with silence.
        # Negative extra: the audio is longer, trim the excess.
        extra = int(vid_duration * sr - au.shape[0])
        is_append = extra >= 0
        extra = abs(extra)
        new_au = au
        if extra > 0:
            front = False  # operate at the end of the clip; set True to pad/trim the front instead
            if is_append:
                # append silence
                if front:
                    new_au = np.concatenate([np.zeros(extra), au])
                else:
                    new_au = np.concatenate([au, np.zeros(extra)])
            else:
                # cut audio
                if front:
                    new_au = au[extra:]
                else:
                    new_au = au[:-extra]
        sf.write(synced_path, new_au, sr)
        results.append(p)
    except Exception:
        print(p)
        errors.append(p)

os.makedirs(os.path.join(obj_path, f"filelist_{token}/temp"), exist_ok=True)

with open(os.path.join(obj_path, f"filelist_{token}/temp/output_synced_{start}_{len(data)}.txt"), "w") as f:
    for line in results:
        f.write(line + "\n")

with open(os.path.join(obj_path, f"filelist_{token}/temp/output_synced_errors_{start}_{len(data)}.txt"), "w") as f:
    for line in errors:
        f.write(line + "\n")
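The pad/trim rule above reduces to "force the waveform to exactly `vid_duration * sr` samples". A toy illustration with made-up numbers:

```python
import numpy as np

sr = 16000
vid_duration = 250 / 25          # 250 frames at 25 fps -> 10.0 s
target = int(vid_duration * sr)  # 160000 samples expected

au = np.random.randn(159500)     # hypothetical decoded audio, 500 samples short

if au.shape[0] < target:
    synced = np.concatenate([au, np.zeros(target - au.shape[0])])  # pad silence at the end
else:
    synced = au[:target]                                           # trim the excess tail

assert synced.shape[0] == target
```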
--------------------------------------------------------------------------------
/4_detection.py:
--------------------------------------------------------------------------------
import cv2
import mediapipe as mp
import os
import sys
import subprocess
import concurrent.futures

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'videos_segment')
output_video_path = os.path.join(dataset_path, presenter_name, 'output')
os.makedirs(output_video_path, exist_ok=True)

# Collect every 10 s segment produced by 3_segment.py.
id_vids = os.listdir(input_video_path)
arr_path_vid = []
for id_vid in id_vids:
    path_id_vid = os.path.join(input_video_path, id_vid)
    os.makedirs(path_id_vid.replace("videos_segment", "output"), exist_ok=True)
    for split_vid in os.listdir(path_id_vid):
        if "mp4" in split_vid or "MP4" in split_vid:
            arr_path_vid.append(os.path.join(path_id_vid, split_vid))

mp_face_mesh = mp.solutions.face_mesh

def detection(path_split_vid):
    path_output = path_split_vid.replace("videos_segment", "output").replace(".mp4", "").replace(".MP4", "")
    os.makedirs(path_output, exist_ok=True)
    cap = cv2.VideoCapture(path_split_vid)
    flag_person = True
    t = 0
    # Create the FaceMesh model once per video instead of once per frame.
    with mp_face_mesh.FaceMesh(static_image_mode=True, refine_landmarks=True,
                               min_detection_confidence=0.5) as face_mesh:
        while cap.isOpened():
            path_output_image = f'{path_output}/{str(t).zfill(5)}.jpg'
            print(path_output_image)
            ret, img = cap.read()
            if not ret:
                break
            h, w, _ = img.shape

            results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            if not results.multi_face_landmarks:
                flag_person = False
                break
            for face_landmarks in results.multi_face_landmarks:
                # Drop the 10 iris landmarks appended by refine_landmarks.
                face_indices = face_landmarks.landmark[:-10]
                # 234/454 are the face outline extremes in x; 10/152 are forehead top and chin.
                x1, x2 = int(face_indices[234].x * w), int(face_indices[454].x * w)
                y1, y2 = int(face_indices[10].y * h), int(face_indices[152].y * h)

                # Pad below the chin and on both sides, clamping so the slice stays in-frame.
                y2 = y2 + int((y2 - y1) * 0.14)
                www = x2 - x1
                x1 = max(0, x1 - int(0.07 * www))
                x2 = x2 + int(0.07 * www)
                img_final = img[max(0, y1):y2, x1:x2]
                cv2.imwrite(path_output_image, img_final, [cv2.IMWRITE_JPEG_QUALITY, 100])
            t += 1
    cap.release()

    # Delete the output if any frame is missing the presenter's face.
    if not flag_person:
        subprocess.call(f'rm -r "{path_output}"', shell=True)
        return

    # Copy the matching audio segment next to the frames.
    old_audio = path_split_vid.replace("videos_segment", 'audios_segment').replace(".mp4", ".wav").replace(".MP4", ".wav")
    subprocess.call(f'cp "{old_audio}" "{path_output}/audio.wav"', shell=True)

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    list(executor.map(detection, arr_path_vid))
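For debugging a single frame outside the pipeline, the same landmark-to-box logic can be wrapped in a small helper. A sketch against the same MediaPipe API (the function name and filenames are hypothetical; the margins are the script's values restated):

```python
import cv2
import mediapipe as mp

def crop_face(image_path, out_path):
    img = cv2.imread(image_path)
    h, w, _ = img.shape
    with mp.solutions.face_mesh.FaceMesh(static_image_mode=True, refine_landmarks=True,
                                         min_detection_confidence=0.5) as fm:
        res = fm.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    if not res.multi_face_landmarks:
        return False
    lm = res.multi_face_landmarks[0].landmark
    x1, x2 = int(lm[234].x * w), int(lm[454].x * w)  # face outline extremes in x
    y1, y2 = int(lm[10].y * h), int(lm[152].y * h)   # forehead top, chin
    y2 += int((y2 - y1) * 0.14)                      # extra room below the chin
    pad = int(0.07 * (x2 - x1))                      # 7% side margins
    crop = img[max(0, y1):y2, max(0, x1 - pad):x2 + pad]
    cv2.imwrite(out_path, crop, [cv2.IMWRITE_JPEG_QUALITY, 100])
    return True

crop_face("frame.jpg", "face.jpg")
```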
--------------------------------------------------------------------------------
/hparams.py:
--------------------------------------------------------------------------------
import os

def get_image_list(data_root, split):
    filelist = []

    with open('filelists/{}.txt'.format(split)) as f:
        for line in f:
            line = line.strip()
            if ' ' in line: line = line.split()[0]
            filelist.append(os.path.join(data_root, line))

    return filelist

class HParams:
    def __init__(self, **kwargs):
        self.data = {}

        for key, value in kwargs.items():
            self.data[key] = value

    def __getattr__(self, key):
        if key not in self.data:
            raise AttributeError("'HParams' object has no attribute %s" % key)
        return self.data[key]

    def set_hparam(self, key, value):
        self.data[key] = value


# Default hyperparameters
hparams = HParams(
    num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality
    # network
    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.9,  # Rescaling value

    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    use_lws=False,

    n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
    hop_size=200,  # For 16000 Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    win_size=800,  # For 16000 Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    sample_rate=16000,  # 16000 Hz (corresponding to librispeech) (sox --i <filename>)

    frame_shift_ms=None,  # Can replace the hop_size parameter. (Recommended: 12.5)

    # Mel and Linear spectrograms normalization/scaling and clipping
    signal_normalization=True,
    # Whether to normalize mel spectrograms to some predefined range (following below parameters)
    allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
    symmetric_mels=True,
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2,
    # for faster and cleaner convergence)
    max_abs_value=4.,
    # Max absolute value of data. If symmetric, data will be [-max, max], else [0, max] (must not
    # be too big to avoid gradient explosion, nor too small for fast convergence)
    # Contribution by @begeekmyfriend
    # Spectrogram Pre-Emphasis (Lfilter: reduces spectrogram noise and helps model certitude
    # levels. Also allows for better G&L phase reconstruction)
    preemphasize=True,  # whether to apply the filter
    preemphasis=0.97,  # filter coefficient

    # Limits
    min_level_db=-100,
    ref_level_db=20,
    fmin=55,
    # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To
    # test depending on dataset. Pitch info: male ~[65, 260], female ~[100, 525])
    fmax=7600,  # To be increased/reduced depending on data.

    ###################### Our training parameters #################################
    img_size=192,  # image size raised to 192 (vs. 96 in the original Wav2Lip)
    fps=25,

    # TODO: restore the commented-out values for full training runs
    batch_size=64,
    # batch_size=2,  # for local testing
    initial_learning_rate=1e-4,
    nepochs=200000000000000000,  # effectively "run forever": ctrl+c and stop whenever eval loss is consistently greater than train loss for ~10 epochs
    num_workers=16,
    # checkpoint_interval=500,
    checkpoint_interval=200,
    # log_interval=100,
    log_interval=200,
    eval_interval=500,
    save_optimizer_state=True,

    syncnet_wt=0.0,  # initially zero; set automatically to 0.03 later. Leads to faster convergence.
    syncnet_batch_size=128,
    syncnet_lr=1e-5,
    syncnet_eval_interval=500,
    syncnet_checkpoint_interval=500,

    disc_wt=0.07,
    # disc_wt=0.04,
    disc_initial_learning_rate=1e-4,
    num_checkpoints=25
)


def hparams_debug_string():
    # HParams stores everything in .data; it has no values() method.
    values = hparams.data
    hp = ["  %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"]
    return "Hyperparameters:\n" + "\n".join(hp)
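These numbers pin down the audio/video alignment the rest of the pipeline relies on, and the arithmetic is worth spelling out once:

```python
sr = 16000  # hparams.sample_rate
hop = 200   # hparams.hop_size -> 12.5 ms per mel frame
fps = 25    # hparams.fps

mel_frames_per_second = sr / hop               # 80.0
mel_frames_per_video_frame = (sr / hop) / fps  # 3.2: each 40 ms video frame spans 3.2 mel steps
print(mel_frames_per_second, mel_frames_per_video_frame)
```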
--------------------------------------------------------------------------------
/audio.py:
--------------------------------------------------------------------------------
import librosa
import librosa.filters
import numpy as np
import soundfile as sf
from scipy import signal
from scipy.io import wavfile
from hparams import hparams as hp

def load_wav(path, sr):
    return librosa.load(path, sr=sr)[0]

def save_wav(wav, path, sr):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    # proposed by @dsmiller
    wavfile.write(path, sr, wav.astype(np.int16))

def save_wavenet_wav(wav, path, sr):
    # librosa.output.write_wav was removed in librosa 0.8; write with soundfile instead.
    sf.write(path, wav, sr)

def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav

def get_hop_size():
    hop_size = hp.hop_size
    if hop_size is None:
        assert hp.frame_shift_ms is not None
        hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate)
    return hop_size

def linearspectrogram(wav):
    D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
    S = _amp_to_db(np.abs(D)) - hp.ref_level_db

    if hp.signal_normalization:
        return _normalize(S)
    return S

def melspectrogram(wav):
    D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
    S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db

    if hp.signal_normalization:
        return _normalize(S)
    return S

def _lws_processor():
    import lws
    return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech")

def _stft(y):
    if hp.use_lws:
        return _lws_processor().stft(y).T
    else:
        return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)

##########################################################
# These are only correct when using lws!!! (This was messing with WaveNet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute the number of time frames of a spectrogram."""
    pad = (fsize - fshift)
    if length % fshift == 0:
        M = (length + pad * 2 - fsize) // fshift + 1
    else:
        M = (length + pad * 2 - fsize) // fshift + 2
    return M


def pad_lr(x, fsize, fshift):
    """Compute left and right padding."""
    M = num_frames(len(x), fsize, fshift)
    pad = (fsize - fshift)
    T = len(x) + 2 * pad
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r
##########################################################
# Librosa-correct padding
def librosa_pad_lr(x, fsize, fshift):
    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]

# Conversions
_mel_basis = None

def _linear_to_mel(spectrogram):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis()
    return np.dot(_mel_basis, spectrogram)

def _build_mel_basis():
    assert hp.fmax <= hp.sample_rate // 2
    # Keyword arguments are required by librosa >= 0.10.
    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels,
                               fmin=hp.fmin, fmax=hp.fmax)

def _amp_to_db(x):
    min_level = np.exp(hp.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)

def _normalize(S):
    if hp.allow_clipping_in_normalization:
        if hp.symmetric_mels:
            return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value,
                           -hp.max_abs_value, hp.max_abs_value)
        else:
            return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value)

    assert S.max() <= 0 and S.min() - hp.min_level_db >= 0
    if hp.symmetric_mels:
        return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value
    else:
        return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db))

def _denormalize(D):
    if hp.allow_clipping_in_normalization:
        if hp.symmetric_mels:
            return (((np.clip(D, -hp.max_abs_value,
                              hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value))
                    + hp.min_level_db)
        else:
            return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)

    if hp.symmetric_mels:
        return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
    else:
        return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
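Typical use of this module in the pipeline (7_to_mel.py does exactly this); the path below is illustrative:

```python
import audio

wav = audio.load_wav("output_token/video1/0_10/synced_audio.wav", 16000)
mel = audio.melspectrogram(wav)         # np.ndarray of shape (num_mels, T) = (80, T)
print(mel.shape, mel.min(), mel.max())  # with default hparams, values lie in [-4, 4]
```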
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Parallel Wav2Lip Data Preprocessing

### 1. Convert videos to a standard 25 FPS

Assume the dataset is located at ***your-folder-dataset*** (in the scripts this corresponds to `<dataset_path>/<presenter_name>`).
```bash
python3 1_convert_25fps.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create a new folder named ***your-folder-dataset/full_voice_25fps***.

The folder structure:
```
your-folder-dataset
|---full_voice
|       video1.mp4
|       video2.mp4
|       ..........
|---full_voice_25fps
|       video1.mp4
|       video2.mp4
|       ..........
```


### 2. Crop the videos

```bash
python3 2_crop_video.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create a new folder named ***your-folder-dataset/videos_crop***.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|       video1.mp4
|       video2.mp4
|       ..........
```


### 3. Split each video into 10 s segments

```bash
python3 3_segment.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create new folders named ***your-folder-dataset/videos_segment*** and ***your-folder-dataset/audios_segment***.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---------video1
|             0_10.mp4
|             10_20.mp4
|             .........
|---------video2
|             0_10.mp4
|             10_20.mp4
|             .........
|---------.............
|---audios_segment
|---------video1
|             0_10.wav
|             10_20.wav
|             .........
|---------video2
|             0_10.wav
|             10_20.wav
|             .........
|---------.............
```


### 4. Run face detection

```bash
python3 4_detection.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create a new folder named ***your-folder-dataset/output***.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---------video1
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------...........
|---------video2
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------...........
|---------.......
```


### 5. Create the filelist structure for Wav2Lip training

```bash
python3 5_create_filelist.py <dataset_path> <presenter_name> <token>
```
It will automatically create a new folder named ***your-folder-dataset/filelist_&lt;token&gt;***. Note that steps 5–7 read the detection output from ***output_&lt;token&gt;***, while step 4 writes to ***output***, so name that folder (or choose `<token>`) accordingly.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---filelist_<token>
|       raw_filelist.txt
|       raw_filelist_errors.txt
```

### 6. Align the audio with the video (the audio length may differ from the video length after converting to 25 fps)

```bash
python3 6_au_sync.py <dataset_path> <presenter_name> <token>
```
It will automatically create a new folder named ***your-folder-dataset/filelist_&lt;token&gt;/temp*** and write a length-corrected `synced_audio.wav` next to each segment's frames.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---------video1
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------...........
|---------video2
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------...........
|---------.......
|---filelist_<token>
|       raw_filelist.txt
|       raw_filelist_errors.txt
|-------temp
|           output_synced_<start>_<end>.txt
|           output_synced_errors_<start>_<end>.txt
```

### 7. Convert audio to mel spectrograms

```bash
python3 7_to_mel.py <dataset_path> <presenter_name> <token>
```

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---------video1
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------...........
|---------video2
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------...........
|---------.......
|---filelist_<token>
|       raw_filelist.txt
|       raw_filelist_errors.txt
|-------temp
|           output_synced_<start>_<end>.txt
|           output_synced_errors_<start>_<end>.txt
|           output_data_mel_errors_<start>_<end>.txt
```
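Taken together, steps 1–7 are seven commands with a shared argument convention. A minimal end-to-end driver sketch (assuming step 4's `output` folder has been named to match the `<token>` that steps 5–7 expect):

```python
import subprocess
import sys

dataset_path, presenter_name, n_processes, token = sys.argv[1:5]

steps = [
    ["python3", "1_convert_25fps.py", dataset_path, presenter_name, n_processes],
    ["python3", "2_crop_video.py", dataset_path, presenter_name, n_processes],
    ["python3", "3_segment.py", dataset_path, presenter_name, n_processes],
    ["python3", "4_detection.py", dataset_path, presenter_name, n_processes],
    ["python3", "5_create_filelist.py", dataset_path, presenter_name, token],
    ["python3", "6_au_sync.py", dataset_path, presenter_name, token],
    ["python3", "7_to_mel.py", dataset_path, presenter_name, token],
]
for cmd in steps:
    subprocess.run(cmd, check=True)  # stop at the first failing step
```
--------------------------------------------------------------------------------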