├── 1_convert_25fps.py
├── 5_create_filelist.py
├── 7_to_mel.py
├── 2_crop_video.py
├── 3_segment.py
├── 6_au_sync.py
├── 4_detection.py
├── hparams.py
├── audio.py
└── README.md
--------------------------------------------------------------------------------
/1_convert_25fps.py:
--------------------------------------------------------------------------------
import os
import concurrent.futures
import subprocess
import sys

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'full_voice')
output_video_path = os.path.join(dataset_path, presenter_name, 'full_voice_25fps')
os.makedirs(output_video_path, exist_ok=True)
source_dir = os.listdir(input_video_path)

def convert_25fps(name_video):
    video = os.path.join(input_video_path, name_video)
    new_video = os.path.join(output_video_path, name_video)
    # Resample to a constant 25 fps; quote the paths so filenames with spaces survive the shell.
    subprocess.call(f'ffmpeg -y -i "{video}" -filter:v fps=25 -b:v 50M "{new_video}"', shell=True)

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    # Consume the iterator so worker exceptions are raised instead of silently dropped.
    list(executor.map(convert_25fps, source_dir))
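To confirm the conversion actually produced 25 fps output, a quick probe with ffprobe works. A minimal sketch (the path below is hypothetical), not part of the pipeline:

```python
import subprocess

def get_fps(path):
    # ffprobe prints the video stream's frame rate as a fraction such as "25/1".
    out = subprocess.check_output([
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_entries", "stream=r_frame_rate",
        "-of", "default=noprint_wrappers=1:nokey=1", path,
    ]).decode().strip()
    num, den = out.split("/")
    return float(num) / float(den)

print(get_fps("your-folder-dataset/presenter/full_voice_25fps/video1.mp4"))  # expect 25.0
```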
--------------------------------------------------------------------------------
/5_create_filelist.py:
--------------------------------------------------------------------------------
import os
import sys

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
token = sys.argv[3]

path = os.path.join(dataset_path, presenter_name)
output_path = os.path.join(path, f'filelist_{token}')
os.makedirs(output_path, exist_ok=True)
source_path = os.path.join(path, f'output_{token}')
data_list = os.listdir(source_path)

results = []
errors = []
for d in data_list:
    d_path = os.path.join(source_path, d)
    # Keep only the per-segment directories produced by 4_detection.py.
    train_list = [t for t in os.listdir(d_path) if os.path.isdir(os.path.join(d_path, t))]
    for t in train_list:
        t_path = os.path.join(d_path, t)
        # A segment is usable only if its audio track was copied successfully.
        if os.path.isfile(os.path.join(t_path, "audio.wav")):
            results.append(t_path)
        else:
            errors.append(t_path)

with open(f"{output_path}/raw_filelist.txt", "w") as f:
    for line in results:
        f.write(line + "\n")

with open(f"{output_path}/raw_filelist_errors.txt", "w") as f:
    for line in errors:
        f.write(line + "\n")
--------------------------------------------------------------------------------
/7_to_mel.py:
--------------------------------------------------------------------------------
import os
import sys
import audio
import numpy as np

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
token = sys.argv[3]

obj_path = os.path.join(dataset_path, presenter_name)
ROOT = os.path.join(obj_path, f"output_{token}")

with open(os.path.join(obj_path, f"filelist_{token}/raw_filelist.txt"), "r") as f:
    data = [line.strip() for line in f.readlines()]
data.sort()

# start/end allow sharding the list across several runs of this script.
start = 0
end = len(data)
data = data[start:end]
print("Data", start, end, len(data))
sample_rate = 16000
errors = []

for d in data:
    try:
        mel_out_path = os.path.join(d, "mel.npy")
        # Use the length-corrected audio written by 6_au_sync.py.
        wavpath = os.path.join(d, "synced_audio.wav")
        wav = audio.load_wav(wavpath, sample_rate)
        orig_mel = audio.melspectrogram(wav).T
        with open(mel_out_path, "wb") as f:
            np.save(f, orig_mel)
    except Exception:
        print("Error", d)
        errors.append(d)

# 6_au_sync.py normally creates this temp folder; create it here too so the
# script can also run standalone.
os.makedirs(os.path.join(obj_path, f"filelist_{token}/temp"), exist_ok=True)
with open(os.path.join(obj_path, f"filelist_{token}/temp/output_data_mel_errors_{start}_{end}.txt"), "w") as f:
    for line in errors:
        f.write(line + "\n")
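As a sanity check on the result: `audio.melspectrogram` returns `(num_mels, T)` and the `.T` stores `mel.npy` time-major as `(T, num_mels)`. For a full 10-second segment at 16 kHz with `hop_size=200`, `T` should be about `160000 // 200 + 1 = 801`. A minimal sketch (the path is hypothetical):

```python
import numpy as np

mel = np.load("your-folder-dataset/presenter/output_token/video1/0_10/mel.npy")
T, n_mels = mel.shape
print(T, n_mels)          # roughly (801, 80) for a full 10 s segment
assert n_mels == 80       # hparams.num_mels
assert abs(T - 801) <= 2  # allow a frame or two of padding slack
```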
--------------------------------------------------------------------------------
/2_crop_video.py:
--------------------------------------------------------------------------------
import os
import concurrent.futures
import cv2
import dlib
import subprocess
import sys

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'full_voice_25fps')
output_video_path = os.path.join(dataset_path, presenter_name, 'videos_crop')
os.makedirs(output_video_path, exist_ok=True)
source_dir = os.listdir(input_video_path)
print(source_dir)

def crop_video(name_video):
    vid_path = os.path.join(input_video_path, name_video)
    out_path = os.path.join(output_video_path, name_video)

    detector = dlib.get_frontal_face_detector()

    # Load the video and grab its first frame; the crop box is computed once
    # from that frame and then applied to the whole clip.
    cap = cv2.VideoCapture(vid_path)
    ret, frame = cap.read()
    if not ret:
        # Return instead of exit() so one unreadable video does not kill the worker pool.
        print(f"Can't read the first frame of {vid_path}. Skipping ...")
        cap.release()
        return

    # Detect a face in the first frame.
    faces = detector(frame)
    if len(faces) > 0:
        # Bounding box of the first face detected.
        x, y, w, h = faces[0].left(), faces[0].top(), faces[0].width(), faces[0].height()
        # Grow the box into a square of side 3*h, extended upward and re-centered
        # horizontally on the face, then clamp to the frame.
        y = max(0, y - int(0.8 * w))
        h = 3 * h
        x = max(0, (2 * x + w - h) // 2)
        w = h
        # Use ffmpeg to crop the whole video with that box.
        command = f'ffmpeg -y -i "{vid_path}" -filter:v "crop={w}:{h}:{x}:{y}" -b:v 4M "{out_path}"'
        subprocess.call(command, shell=True)
    else:
        print(f"No face detected in the first frame of {vid_path}. Skipping ...")

    cap.release()

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    list(executor.map(crop_video, source_dir))
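To see what the box arithmetic above actually does, here is the same math run on toy numbers (values purely illustrative):

```python
# Hypothetical dlib detection: a 100x100 face at (300, 200) in a 1920x1080 frame.
x, y, w, h = 300, 200, 100, 100

y = max(0, y - int(0.8 * w))      # 120: extend the box upward by 0.8*w
h = 3 * h                         # 300: final crop height (and width)
x = max(0, (2 * x + w - h) // 2)  # 200: shift x so the square stays centered on the face
w = h                             # 300: force a square crop

# The face center was x + 50 = 350; the crop center is 200 + 150 = 350, so the
# face stays horizontally centered inside a 300x300 crop anchored at (200, 120).
print(x, y, w, h)  # 200 120 300 300
```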
--------------------------------------------------------------------------------
/3_segment.py:
--------------------------------------------------------------------------------
import os
import sys
import subprocess
import concurrent.futures

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'videos_crop')
output_video_path = os.path.join(dataset_path, presenter_name, 'videos_segment')
os.makedirs(output_video_path, exist_ok=True)
source_dir = os.listdir(input_video_path)

def segment(name_video):
    video_path = os.path.join(input_video_path, name_video)
    split_video_path = video_path.replace('videos_crop', 'videos_segment').replace('.mp4', '').replace('.MP4', '')
    os.makedirs(split_video_path, exist_ok=True)
    split_audio_path = video_path.replace('videos_crop', 'audios_segment').replace('.mp4', '').replace('.MP4', '')
    os.makedirs(split_audio_path, exist_ok=True)

    # Scrape the "Duration: HH:MM:SS.xx" line from ffmpeg's banner output.
    command = f"ffmpeg -nostdin -y -i {video_path} 2>&1 | grep Duration | sed 's/Duration: \\(.*\\), start/\\1/g'"
    output_terminal = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE).communicate()[0].decode('utf-8')
    hms = output_terminal.split(":")
    # Include the hours field so videos longer than an hour are measured correctly.
    duration_video = int(float(hms[0]) * 3600 + float(hms[1]) * 60 + float(hms[2]))

    # Cut into 10 s pieces; the trailing partial segment is intentionally dropped.
    segment_time = list(range(0, duration_video, 10))
    for i in range(0, len(segment_time) - 1):
        small_video_path = os.path.join(split_video_path, f'{segment_time[i]}_{segment_time[i+1]}.mp4')
        small_audio_path = os.path.join(split_audio_path, f'{segment_time[i]}_{segment_time[i+1]}.wav')
        vid_command = f"ffmpeg -nostdin -y -ss {segment_time[i]} -i {video_path} -t 10 -filter:v fps=25 -b:v 4M {small_video_path}"
        vid_status = os.system(vid_command)
        # Extract the audio from the segment itself so audio and video cuts always match.
        aud_command = f"ffmpeg -nostdin -y -i {small_video_path} -ar 16000 {small_audio_path}"
        aud_status = os.system(aud_command)
        print(small_video_path, vid_status, aud_status)

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    list(executor.map(segment, source_dir))
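Parsing ffmpeg's banner with grep/sed works but is brittle; ffprobe can report the duration directly in seconds. A sketch of that alternative (not what the script above uses):

```python
import subprocess

def probe_duration(path):
    # ffprobe prints the container duration in seconds, e.g. "754.560000".
    out = subprocess.check_output([
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", path,
    ]).decode().strip()
    return float(out)

print(int(probe_duration("videos_crop/video1.mp4")))  # whole seconds, ready for range(0, d, 10)
```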
--------------------------------------------------------------------------------
/6_au_sync.py:
--------------------------------------------------------------------------------
import os
import sys
import numpy as np
import soundfile as sf
import librosa

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
token = sys.argv[3]

obj_path = os.path.join(dataset_path, presenter_name)
ROOT = os.path.join(obj_path, f"output_{token}")
print("ROOT:", ROOT)
# start allows sharding the list across several runs of this script.
start = 0

with open(os.path.join(obj_path, f"filelist_{token}/raw_filelist.txt"), "r") as f:
    data = [line.strip() for line in f.readlines()]
data.sort()
data = data[start:]
print("Data", start, len(data))

errors = []
results = []
for p in data:
    try:
        d = os.path.join(ROOT, p)
        frames = [file for file in os.listdir(d) if ".jpg" in file]
        # The video was resampled to 25 fps, so the frame count fixes the true duration.
        vid_duration = len(frames) / 25
        vid_name = d.split("/")[-1]

        org_path = os.path.join(d, f"{vid_name}.wav")
        au_path = os.path.join(d, "audio.wav")
        synced_path = os.path.join(d, "synced_audio.wav")

        if not os.path.isfile(au_path):
            status = os.system(f"ffmpeg -nostdin -y -i {org_path} -ar 16000 {au_path}")
            if status != 0:
                errors.append(p)
                continue
        if os.path.isfile(synced_path):
            continue

        au, sr = librosa.load(au_path, sr=16000)

        # Positive extra: the audio is shorter than the video, pad with silence.
        # Negative extra: the audio is longer, trim the excess.
        extra = int(vid_duration * sr - au.shape[0])
        is_append = extra >= 0
        extra = abs(extra)
        new_au = au
        if extra > 0:
            front = False  # operate at the end of the clip; set True to pad/trim the front instead
            if is_append:
                # append silence
                if front:
                    new_au = np.concatenate([np.zeros(extra), au])
                else:
                    new_au = np.concatenate([au, np.zeros(extra)])
            else:
                # cut audio
                if front:
                    new_au = au[extra:]
                else:
                    new_au = au[:-extra]
        sf.write(synced_path, new_au, sr)
        results.append(p)
    except Exception:
        print(p)
        errors.append(p)

os.makedirs(os.path.join(obj_path, f"filelist_{token}/temp"), exist_ok=True)

with open(os.path.join(obj_path, f"filelist_{token}/temp/output_synced_{start}_{len(data)}.txt"), "w") as f:
    for line in results:
        f.write(line + "\n")

with open(os.path.join(obj_path, f"filelist_{token}/temp/output_synced_errors_{start}_{len(data)}.txt"), "w") as f:
    for line in errors:
        f.write(line + "\n")
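The pad/trim rule above reduces to "force the waveform to exactly `vid_duration * sr` samples". A toy illustration with made-up numbers:

```python
import numpy as np

sr = 16000
vid_duration = 250 / 25          # 250 frames at 25 fps -> 10.0 s
target = int(vid_duration * sr)  # 160000 samples expected

au = np.random.randn(159500)     # hypothetical decoded audio, 500 samples short

if au.shape[0] < target:
    synced = np.concatenate([au, np.zeros(target - au.shape[0])])  # pad silence at the end
else:
    synced = au[:target]                                           # trim the excess tail

assert synced.shape[0] == target
```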
--------------------------------------------------------------------------------
/4_detection.py:
--------------------------------------------------------------------------------
import cv2
import mediapipe as mp
import os
import sys
import subprocess
import concurrent.futures

dataset_path = sys.argv[1]
presenter_name = sys.argv[2]
n_processes = int(sys.argv[3])

input_video_path = os.path.join(dataset_path, presenter_name, 'videos_segment')
output_video_path = os.path.join(dataset_path, presenter_name, 'output')
os.makedirs(output_video_path, exist_ok=True)

# Collect every 10 s segment produced by 3_segment.py.
id_vids = os.listdir(input_video_path)
arr_path_vid = []
for id_vid in id_vids:
    path_id_vid = os.path.join(input_video_path, id_vid)
    os.makedirs(path_id_vid.replace("videos_segment", "output"), exist_ok=True)
    for split_vid in os.listdir(path_id_vid):
        if "mp4" in split_vid or "MP4" in split_vid:
            arr_path_vid.append(os.path.join(path_id_vid, split_vid))

mp_face_mesh = mp.solutions.face_mesh

def detection(path_split_vid):
    path_output = path_split_vid.replace("videos_segment", "output").replace(".mp4", "").replace(".MP4", "")
    os.makedirs(path_output, exist_ok=True)
    cap = cv2.VideoCapture(path_split_vid)
    flag_person = True
    t = 0
    # Create the FaceMesh model once per video instead of once per frame.
    with mp_face_mesh.FaceMesh(static_image_mode=True, refine_landmarks=True,
                               min_detection_confidence=0.5) as face_mesh:
        while cap.isOpened():
            path_output_image = f'{path_output}/{str(t).zfill(5)}.jpg'
            print(path_output_image)
            ret, img = cap.read()
            if not ret:
                break
            h, w, _ = img.shape

            results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            if not results.multi_face_landmarks:
                flag_person = False
                break
            for face_landmarks in results.multi_face_landmarks:
                # Drop the 10 iris landmarks appended by refine_landmarks.
                face_indices = face_landmarks.landmark[:-10]
                # 234/454 are the face outline extremes in x; 10/152 are forehead top and chin.
                x1, x2 = int(face_indices[234].x * w), int(face_indices[454].x * w)
                y1, y2 = int(face_indices[10].y * h), int(face_indices[152].y * h)

                # Pad below the chin and on both sides, clamping so the slice stays in-frame.
                y2 = y2 + int((y2 - y1) * 0.14)
                www = x2 - x1
                x1 = max(0, x1 - int(0.07 * www))
                x2 = x2 + int(0.07 * www)
                img_final = img[max(0, y1):y2, x1:x2]
                cv2.imwrite(path_output_image, img_final, [cv2.IMWRITE_JPEG_QUALITY, 100])
            t += 1
    cap.release()

    # Delete the output if any frame is missing the presenter's face.
    if not flag_person:
        subprocess.call(f'rm -r "{path_output}"', shell=True)
        return

    # Copy the matching audio segment next to the frames.
    old_audio = path_split_vid.replace("videos_segment", 'audios_segment').replace(".mp4", ".wav").replace(".MP4", ".wav")
    subprocess.call(f'cp "{old_audio}" "{path_output}/audio.wav"', shell=True)

with concurrent.futures.ProcessPoolExecutor(n_processes) as executor:
    list(executor.map(detection, arr_path_vid))
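For debugging a single frame outside the pipeline, the same landmark-to-box logic can be wrapped in a small helper. A sketch against the same MediaPipe API (the function name and filenames are hypothetical; the margins are the script's values restated):

```python
import cv2
import mediapipe as mp

def crop_face(image_path, out_path):
    img = cv2.imread(image_path)
    h, w, _ = img.shape
    with mp.solutions.face_mesh.FaceMesh(static_image_mode=True, refine_landmarks=True,
                                         min_detection_confidence=0.5) as fm:
        res = fm.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    if not res.multi_face_landmarks:
        return False
    lm = res.multi_face_landmarks[0].landmark
    x1, x2 = int(lm[234].x * w), int(lm[454].x * w)  # face outline extremes in x
    y1, y2 = int(lm[10].y * h), int(lm[152].y * h)   # forehead top, chin
    y2 += int((y2 - y1) * 0.14)                      # extra room below the chin
    pad = int(0.07 * (x2 - x1))                      # 7% side margins
    crop = img[max(0, y1):y2, max(0, x1 - pad):x2 + pad]
    cv2.imwrite(out_path, crop, [cv2.IMWRITE_JPEG_QUALITY, 100])
    return True

crop_face("frame.jpg", "face.jpg")
```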
--------------------------------------------------------------------------------
/hparams.py:
--------------------------------------------------------------------------------
import os

def get_image_list(data_root, split):
    filelist = []

    with open('filelists/{}.txt'.format(split)) as f:
        for line in f:
            line = line.strip()
            if ' ' in line: line = line.split()[0]
            filelist.append(os.path.join(data_root, line))

    return filelist

class HParams:
    def __init__(self, **kwargs):
        self.data = {}

        for key, value in kwargs.items():
            self.data[key] = value

    def __getattr__(self, key):
        if key not in self.data:
            raise AttributeError("'HParams' object has no attribute %s" % key)
        return self.data[key]

    def set_hparam(self, key, value):
        self.data[key] = value


# Default hyperparameters
hparams = HParams(
    num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality
    # network
    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.9,  # Rescaling value

    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    use_lws=False,

    n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
    hop_size=200,  # For 16000 Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    win_size=800,  # For 16000 Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    sample_rate=16000,  # 16000 Hz (corresponding to librispeech) (sox --i <filename>)

    frame_shift_ms=None,  # Can replace the hop_size parameter. (Recommended: 12.5)

    # Mel and Linear spectrograms normalization/scaling and clipping
    signal_normalization=True,
    # Whether to normalize mel spectrograms to some predefined range (following below parameters)
    allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
    symmetric_mels=True,
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2,
    # for faster and cleaner convergence)
    max_abs_value=4.,
    # Max absolute value of data. If symmetric, data will be [-max, max], else [0, max] (must not
    # be too big to avoid gradient explosion, nor too small for fast convergence)
    # Contribution by @begeekmyfriend
    # Spectrogram Pre-Emphasis (Lfilter: reduces spectrogram noise and helps model certitude
    # levels. Also allows for better G&L phase reconstruction)
    preemphasize=True,  # whether to apply the filter
    preemphasis=0.97,  # filter coefficient

    # Limits
    min_level_db=-100,
    ref_level_db=20,
    fmin=55,
    # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To
    # test depending on dataset. Pitch info: male ~[65, 260], female ~[100, 525])
    fmax=7600,  # To be increased/reduced depending on data.

    ###################### Our training parameters #################################
    img_size=192,  # image size raised to 192 (vs. 96 in the original Wav2Lip)
    fps=25,

    # TODO: restore the commented-out values for full training runs
    batch_size=64,
    # batch_size=2,  # for local testing
    initial_learning_rate=1e-4,
    nepochs=200000000000000000,  # effectively "run forever": ctrl+c and stop whenever eval loss is consistently greater than train loss for ~10 epochs
    num_workers=16,
    # checkpoint_interval=500,
    checkpoint_interval=200,
    # log_interval=100,
    log_interval=200,
    eval_interval=500,
    save_optimizer_state=True,

    syncnet_wt=0.0,  # initially zero; set automatically to 0.03 later. Leads to faster convergence.
    syncnet_batch_size=128,
    syncnet_lr=1e-5,
    syncnet_eval_interval=500,
    syncnet_checkpoint_interval=500,

    disc_wt=0.07,
    # disc_wt=0.04,
    disc_initial_learning_rate=1e-4,
    num_checkpoints=25
)


def hparams_debug_string():
    # HParams stores everything in .data; it has no values() method.
    values = hparams.data
    hp = ["  %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"]
    return "Hyperparameters:\n" + "\n".join(hp)
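These numbers pin down the audio/video alignment the rest of the pipeline relies on, and the arithmetic is worth spelling out once:

```python
sr = 16000  # hparams.sample_rate
hop = 200   # hparams.hop_size -> 12.5 ms per mel frame
fps = 25    # hparams.fps

mel_frames_per_second = sr / hop               # 80.0
mel_frames_per_video_frame = (sr / hop) / fps  # 3.2: each 40 ms video frame spans 3.2 mel steps
print(mel_frames_per_second, mel_frames_per_video_frame)
```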
--------------------------------------------------------------------------------
/audio.py:
--------------------------------------------------------------------------------
import librosa
import librosa.filters
import numpy as np
import soundfile as sf
from scipy import signal
from scipy.io import wavfile
from hparams import hparams as hp

def load_wav(path, sr):
    return librosa.load(path, sr=sr)[0]

def save_wav(wav, path, sr):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    # proposed by @dsmiller
    wavfile.write(path, sr, wav.astype(np.int16))

def save_wavenet_wav(wav, path, sr):
    # librosa.output.write_wav was removed in librosa 0.8; write with soundfile instead.
    sf.write(path, wav, sr)

def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav

def get_hop_size():
    hop_size = hp.hop_size
    if hop_size is None:
        assert hp.frame_shift_ms is not None
        hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate)
    return hop_size

def linearspectrogram(wav):
    D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
    S = _amp_to_db(np.abs(D)) - hp.ref_level_db

    if hp.signal_normalization:
        return _normalize(S)
    return S

def melspectrogram(wav):
    D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
    S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db

    if hp.signal_normalization:
        return _normalize(S)
    return S

def _lws_processor():
    import lws
    return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech")

def _stft(y):
    if hp.use_lws:
        return _lws_processor().stft(y).T
    else:
        return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)

##########################################################
# These are only correct when using lws!!! (This was messing with WaveNet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute the number of time frames of a spectrogram."""
    pad = (fsize - fshift)
    if length % fshift == 0:
        M = (length + pad * 2 - fsize) // fshift + 1
    else:
        M = (length + pad * 2 - fsize) // fshift + 2
    return M


def pad_lr(x, fsize, fshift):
    """Compute left and right padding."""
    M = num_frames(len(x), fsize, fshift)
    pad = (fsize - fshift)
    T = len(x) + 2 * pad
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r
##########################################################
# Librosa-correct padding
def librosa_pad_lr(x, fsize, fshift):
    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]

# Conversions
_mel_basis = None

def _linear_to_mel(spectrogram):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis()
    return np.dot(_mel_basis, spectrogram)

def _build_mel_basis():
    assert hp.fmax <= hp.sample_rate // 2
    # Keyword arguments are required by librosa >= 0.10.
    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels,
                               fmin=hp.fmin, fmax=hp.fmax)

def _amp_to_db(x):
    min_level = np.exp(hp.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)

def _normalize(S):
    if hp.allow_clipping_in_normalization:
        if hp.symmetric_mels:
            return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value,
                           -hp.max_abs_value, hp.max_abs_value)
        else:
            return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value)

    assert S.max() <= 0 and S.min() - hp.min_level_db >= 0
    if hp.symmetric_mels:
        return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value
    else:
        return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db))

def _denormalize(D):
    if hp.allow_clipping_in_normalization:
        if hp.symmetric_mels:
            return (((np.clip(D, -hp.max_abs_value,
                              hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value))
                    + hp.min_level_db)
        else:
            return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)

    if hp.symmetric_mels:
        return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
    else:
        return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
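Typical use of this module in the pipeline (7_to_mel.py does exactly this); the path below is illustrative:

```python
import audio

wav = audio.load_wav("output_token/video1/0_10/synced_audio.wav", 16000)
mel = audio.melspectrogram(wav)         # np.ndarray of shape (num_mels, T) = (80, T)
print(mel.shape, mel.min(), mel.max())  # with default hparams, values lie in [-4, 4]
```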
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Parallel Wav2Lip Data Preprocessing

### 1. Convert videos to a standard 25 FPS

Assume the dataset is located at ***your-folder-dataset*** (in the scripts this corresponds to `<dataset_path>/<presenter_name>`).
```bash
python3 1_convert_25fps.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create a new folder named ***your-folder-dataset/full_voice_25fps***.

The folder structure:
```
your-folder-dataset
|---full_voice
|       video1.mp4
|       video2.mp4
|       ..........
|---full_voice_25fps
|       video1.mp4
|       video2.mp4
|       ..........
```


### 2. Crop the videos

```bash
python3 2_crop_video.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create a new folder named ***your-folder-dataset/videos_crop***.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|       video1.mp4
|       video2.mp4
|       ..........
```


### 3. Split each video into 10 s segments

```bash
python3 3_segment.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create new folders named ***your-folder-dataset/videos_segment*** and ***your-folder-dataset/audios_segment***.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---------video1
|             0_10.mp4
|             10_20.mp4
|             .........
|---------video2
|             0_10.mp4
|             10_20.mp4
|             .........
|---------.............
|---audios_segment
|---------video1
|             0_10.wav
|             10_20.wav
|             .........
|---------video2
|             0_10.wav
|             10_20.wav
|             .........
|---------.............
```


### 4. Run face detection

```bash
python3 4_detection.py <dataset_path> <presenter_name> <n_processes>
```
It will automatically create a new folder named ***your-folder-dataset/output***.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---------video1
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------...........
|---------video2
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|-------------...........
|---------.......
```


### 5. Create the filelist structure for Wav2Lip training

```bash
python3 5_create_filelist.py <dataset_path> <presenter_name> <token>
```
It will automatically create a new folder named ***your-folder-dataset/filelist_&lt;token&gt;***. Note that steps 5–7 read the detection output from ***output_&lt;token&gt;***, while step 4 writes to ***output***, so name that folder (or choose `<token>`) accordingly.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---filelist_<token>
|       raw_filelist.txt
|       raw_filelist_errors.txt
```

### 6. Align the audio with the video (the audio length may differ from the video length after converting to 25 fps)

```bash
python3 6_au_sync.py <dataset_path> <presenter_name> <token>
```
It will automatically create a new folder named ***your-folder-dataset/filelist_&lt;token&gt;/temp*** and write a length-corrected `synced_audio.wav` next to each segment's frames.

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---------video1
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------...........
|---------video2
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|-------------...........
|---------.......
|---filelist_<token>
|       raw_filelist.txt
|       raw_filelist_errors.txt
|-------temp
|           output_synced_<start>_<end>.txt
|           output_synced_errors_<start>_<end>.txt
```

### 7. Convert audio to mel spectrograms

```bash
python3 7_to_mel.py <dataset_path> <presenter_name> <token>
```

The folder structure:
```
your-folder-dataset
|---full_voice
|---full_voice_25fps
|---videos_crop
|---videos_segment
|---audios_segment
|---output
|---------video1
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------...........
|---------video2
|-------------0_10
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------10_20
|                 00000.jpg
|                 00001.jpg
|                 00002.jpg
|                 .........
|                 audio.wav
|                 synced_audio.wav
|                 mel.npy
|-------------...........
|---------.......
|---filelist_<token>
|       raw_filelist.txt
|       raw_filelist_errors.txt
|-------temp
|           output_synced_<start>_<end>.txt
|           output_synced_errors_<start>_<end>.txt
|           output_data_mel_errors_<start>_<end>.txt
```
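Taken together, steps 1–7 are seven commands with a shared argument convention. A minimal end-to-end driver sketch (assuming step 4's `output` folder has been named to match the `<token>` that steps 5–7 expect):

```python
import subprocess
import sys

dataset_path, presenter_name, n_processes, token = sys.argv[1:5]

steps = [
    ["python3", "1_convert_25fps.py", dataset_path, presenter_name, n_processes],
    ["python3", "2_crop_video.py", dataset_path, presenter_name, n_processes],
    ["python3", "3_segment.py", dataset_path, presenter_name, n_processes],
    ["python3", "4_detection.py", dataset_path, presenter_name, n_processes],
    ["python3", "5_create_filelist.py", dataset_path, presenter_name, token],
    ["python3", "6_au_sync.py", dataset_path, presenter_name, token],
    ["python3", "7_to_mel.py", dataset_path, presenter_name, token],
]
for cmd in steps:
    subprocess.run(cmd, check=True)  # stop at the first failing step
```
--------------------------------------------------------------------------------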