├── .gitignore ├── README.md ├── configs ├── __pycache__ │ └── config_v1.cpython-39.pyc └── config_v1.py ├── data_generate ├── generate_datasets_v1.py └── generate_datasets_v2 │ ├── audio_preprocess.py │ ├── chinese_public_dataset_preprocess.py │ ├── data_vad.py │ ├── features │ ├── LPC.dll │ ├── __pycache__ │ │ ├── features.cpython-39.pyc │ │ ├── util.cpython-39.pyc │ │ └── vad.cpython-39.pyc │ ├── doc │ │ └── bsname.txt │ ├── features.py │ ├── util.py │ └── vad.py │ └── mocap4face │ ├── 2001161359.json │ ├── 2001161359.tflite │ ├── __pycache__ │ └── mocap4face.cpython-39.pyc │ └── mocap4face.py ├── datasets ├── __pycache__ │ └── dataset.cpython-39.pyc └── dataset.py ├── model_weights └── 2001161359.tflite ├── models ├── __pycache__ │ └── mouth_net.cpython-39.pyc └── mouth_net.py ├── third_part ├── LPC.dll ├── __pycache__ │ └── moCapFace.cpython-39.pyc └── moCapFace.py └── train └── coach_v1.py /.gitignore: -------------------------------------------------------------------------------- 1 | experiment/checkpoints/* 2 | experiment/logs/* 3 | Av629249051-P1.mp4_audio.npy 4 | Av629249051-P1.mp4_bs_targets.npy 5 | assets/* 6 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 数据制作 2 | ``` 3 | cd data_generate/generate_datasets_v2 4 | ``` 5 | #### step1 6 | ``` 7 | python chinese_public_dataset_preprocess.py 8 | ``` 9 | #### step2 10 | ``` 11 | python data_vad.py 12 | ``` 13 | #### step3 14 | ``` 15 | python audio_preprocess.py 16 | ``` 17 | 18 | 19 | #### 注意 20 | ``` 21 | 1. 更换脚本中的文件路径 22 | 2. 最后的gt被存放在clean_gt_base中 23 | 3. 处理后的音频数据放在processed_datasets中 24 | ``` 25 | 26 | 27 | 28 | ## 训练 29 | ``` 30 | 先在configs/config_v1.py中进行训练的配置 31 | python train/coach_v1.py 32 | ``` -------------------------------------------------------------------------------- /configs/__pycache__/config_v1.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/configs/__pycache__/config_v1.cpython-39.pyc -------------------------------------------------------------------------------- /configs/config_v1.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'gpu_ids': "0", # 使用的GPU序号 3 | 'lr': 0.00001, # 0.0005, 0.01 4 | 'class_num': 16, 5 | 'ckpt': "experiment/checkpoints/best_model_loss_2.51.pth", 6 | 'lr_update_interval': 30000, # 学习率更新频率 每次*0.99 7 | 'warmup_steps': 0, # 前200个iteration使用warmup, 之后使用正常的学习率 8 | 'watch_interval': 1000, # log打印 9 | 'print_loss': 1000, # loss打印的频率 10 | 'val_interval': 100000, # 验证轮次 11 | 'save_interval': 100000, # 保存模型轮次 12 | 'epoch': 10000, # 13 | 'exp_dir': 'experiment/exp_1', 14 | 'train_batch_size': 64, # 64 15 | 'num_workers': 8, 16 | 'train_target_root': "E:/datasets/audio2face/train_gt", 17 | 'train_data_root': "E:/datasets/audio2face/train_data", 18 | 'val_target_root': "E:/datasets/audio2face/val_gt", 19 | 'val_data_root': "E:/datasets/audio2face/val_data", 20 | } -------------------------------------------------------------------------------- /data_generate/generate_datasets_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | File : build_dataset 3 | Time : 2022/8/2 11:09 4 | Author : Lu Zeng 5 | 6 | 7 | 这个脚本用来对齐音频和视频(图像) 8 | 如果检测到了人脸就将主体人脸crop下来,并使用mocapface进行标注(只要40个bs的系数, 不需要头部转向的参数) 9 | 并保存该帧对应的音频 10 | 
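In short: for each readable video frame the script crops the largest detected face,
labels it with MoCapFace (mouth-related blendshape coefficients only, no head-pose values),
and pairs it with the audio window covering that frame. As written below it saves two
aligned arrays per video, roughly:
    {video_name}_audio.npy       -> (N, 32, 64, 1) LPC feature images
    {video_name}_bs_targets.npy  -> (N, 11) coefficients for the blendshapes in need_ids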
""" 11 | import numpy as np 12 | import cv2 13 | from moviepy.editor import * 14 | import matplotlib.pyplot as plt 15 | import scipy.io.wavfile as wavfile 16 | import ffmpeg 17 | from ctypes import * 18 | import mediapipe as mp 19 | import sys 20 | sys.path.append("..") 21 | sys.path.append(".") 22 | from third_part.moCapFace import MoCapFace 23 | 24 | dll = cdll.LoadLibrary(os.path.join('third_part', 'LPC.dll')) 25 | 26 | 27 | def get_source_info_ffmpeg(source_name): 28 | return_value = 0 29 | try: 30 | info = ffmpeg.probe(source_name) 31 | format_name = info['format']['format_name'] 32 | 33 | video_info = next(c for c in info['streams'] if c['codec_type'] == 'video') 34 | audio_info = next(c for c in info['streams'] if c['codec_type'] == 'audio') 35 | codec_name = audio_info['codec_name'] 36 | duration_ts = float(audio_info['duration_ts']) 37 | fps = audio_info['r_frame_rate'] 38 | 39 | print("format_name:{} \ncodec_name:{} \nduration_ts:{} \nfps:{}".format(format_name, codec_name, duration_ts, fps)) 40 | 41 | codec_name = video_info['codec_name'] 42 | duration_ts = float(video_info['duration_ts']) 43 | fps = video_info['r_frame_rate'] 44 | width = video_info['width'] 45 | height = video_info['height'] 46 | num_frames = video_info['nb_frames'] 47 | print("format_name:{} \ncodec_name:{} \nduration_ts:{} \nwidth:{} \nheight:{} \nfps:{} \nnum_frames:{}".format(format_name, 48 | codec_name, 49 | duration_ts, 50 | width, height, 51 | fps, num_frames)) 52 | except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e: 53 | print("init_source:{} error. {}\n".format(source_name, str(e))) 54 | 55 | return return_value, 0, 0 56 | return return_value, fps, num_frames 57 | 58 | 59 | def vis_audio(rate, signal): 60 | print(signal.shape) 61 | print(f"number of channels = {signal.shape[1]}") 62 | length = signal.shape[0] / rate 63 | print(f"length = {length}s") 64 | time = np.linspace(0., length, signal.shape[0]) 65 | plt.plot(time, signal[:, 0], label="Left channel") 66 | plt.plot(time, signal[:, 1], label="Right channel") 67 | plt.legend() 68 | plt.xlabel("Time [s]") 69 | plt.ylabel("Amplitude") 70 | plt.show() 71 | 72 | 73 | # 画出人脸框和关键点 74 | def draw_face(img, bbox, expand_ratio=0.5): 75 | w = bbox[2] - bbox[0] 76 | h = bbox[3] - bbox[1] 77 | corpbbox = [max(0, int(bbox[0] - expand_ratio * w)), 78 | max(0, int(bbox[1] - expand_ratio * h)), 79 | min(img.shape[1] - 1, int(bbox[2] + expand_ratio * w)), 80 | min(img.shape[0] - 1, int(bbox[3] + 0.1 * expand_ratio * h)) 81 | ] 82 | crop = img[corpbbox[1]: corpbbox[3], corpbbox[0]:corpbbox[2], :] 83 | return crop 84 | 85 | 86 | def get_square_image(face, type="center"): 87 | """ 88 | face不是bbox, 而是基于bbox在img上crop出来的人脸图像 89 | 基于短边, 缩短长边 90 | """ 91 | face_h, face_w = face.shape[:2] 92 | if type == "center": 93 | if face_h > face_w: 94 | pad = (face_h - face_w) // 2 95 | if pad != 0: 96 | face = face[pad:-pad, :, :] 97 | elif face_h < face_w: 98 | pad = (face_w - face_h) // 2 99 | if pad != 0: 100 | face = face[:, pad:-pad, :] 101 | 102 | elif type == "upper": 103 | if face_h > face_w: 104 | pad = (face_h - face_w) // 2 105 | if pad != 0: 106 | face = face[:-2 * pad, :, :] 107 | elif face_h < face_w: 108 | # 在水平方向的crop方式照常 109 | pad = (face_w - face_h) // 2 110 | if pad != 0: 111 | face = face[:, pad:-pad, :] 112 | return face 113 | 114 | 115 | def read_frame_as_jpeg(ffmpeg_video, frame_num): 116 | """ 117 | ffmpeg_video: 是已经加载完成的视频数据 118 | 指定帧数读取任意帧 119 | """ 120 | out, err = ( 121 | ffmpeg_video.filter('select', 
'gte(n,{})'.format(frame_num)).output('pipe:', vframes=1, format='image2', vcodec='mjpeg').run(capture_stdout=True) 122 | ) 123 | # 将bytes转成nunpy的格式 124 | try: 125 | image_np = bytes_to_numpy(out) 126 | return image_np 127 | except: 128 | return int(-1) 129 | 130 | 131 | def bytes_to_numpy(image_bytes): 132 | image_np = np.frombuffer(image_bytes, dtype=np.uint8) 133 | image_np = cv2.imdecode(image_np, cv2.IMREAD_COLOR) 134 | return image_np 135 | 136 | if __name__ == "__main__": 137 | bs2id = { 138 | 21: 'jawopen', 139 | 23: 'mouthpucker', 140 | 19: 'mouthfunnel', 141 | 12: 'mouthsmileleft', 142 | 29: 'mouthsmileright', 143 | 14: 'mouthfrownleft', 144 | 27: 'mouthfrownright', 145 | 20: 'mouthrolllower', 146 | 24: 'mouthrollupper', 147 | 11: 'mouthupperupleft', 148 | 30: 'mouthupperupright', 149 | } 150 | need_ids = [21, 23, 19, 12, 29, 14, 27, 20, 24, 11, 30] 151 | 152 | 153 | 154 | # video_path = "assets/baijiajiangtan.mp4" 155 | 156 | video_path = r"E:/datasets/audio2face/cctv_short_video_bilibili/Av629249051-P1.mp4" 157 | 158 | flag_name = os.path.basename(video_path) 159 | 160 | # image_save_root = "crop_face_images" 161 | # bs_targets_root = "bs_targets" 162 | 163 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<进行视频的处理<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 164 | mp_face_detection = mp.solutions.face_detection 165 | face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5) 166 | 167 | mocapfacenet = MoCapFace() 168 | 169 | flag, fps, num_frames = get_source_info_ffmpeg(video_path) 170 | fps = int(fps.split("/")[0]) # 视频的fps 171 | num_frames = int(num_frames) # 视频的总帧数 172 | 173 | frames = [] 174 | bs_targets = [] 175 | ffmpeg_video = ffmpeg.input(video_path) 176 | # bs_target_txt = open(os.path.join(bs_targets_root, os.path.basename(video_path).split(".")[0] + ".txt"), "w") 177 | for i in range(num_frames): 178 | frame = read_frame_as_jpeg(ffmpeg_video, i) 179 | 180 | if isinstance(frame, int): 181 | # 当前视频帧损坏的情况 182 | # frames.append(frame) 183 | bs_targets.append(frame) 184 | # bs_target_txt.write(str(frame) + "\n") 185 | else: 186 | results = face_detection.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 187 | if results.detections: 188 | # 只有当有检测到东西的时候才进行下面的操作 189 | h, w = frame.shape[:2] 190 | area = 0 191 | x1y1x2y2 = [0, 0, 0, 0] 192 | for detection in results.detections: 193 | bbox = detection.location_data.relative_bounding_box 194 | x1 = bbox.xmin 195 | y1 = bbox.ymin 196 | width = bbox.width 197 | height = bbox.height 198 | this_area = width * height 199 | if this_area > area: 200 | area = this_area 201 | x1y1x2y2 = [int(x1 * w), int(y1 * h), int((x1 + width) * w), int((y1 + height) * h)] 202 | 203 | crop_face = draw_face(frame, x1y1x2y2) 204 | crop_face = get_square_image(crop_face) 205 | bs_target = mocapfacenet.forword(crop_face) 206 | bs_targets.append(bs_target) 207 | bs_target = list(map(str, bs_target)) 208 | # bs_target_txt.write(" ".join(bs_target) + "\n") 209 | 210 | # 没有检测出人脸的情况 211 | # frames.append(frame) 212 | bs_targets.append(int(0)) 213 | # bs_target_txt.write(str(0) + "\n") 214 | 215 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<进行音频的处理<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 216 | video = VideoFileClip(video_path) 217 | audio = video.audio 218 | audio.write_audiofile("assets/tmp.wav") 219 | rate, signal = wavfile.read("assets/tmp.wav") # rate:采样率 220 | # rate: 是采样率 221 | # signal: 是音频信号 222 | 223 | # vis_audio(rate, signal) 224 | if signal.shape[-1] == 2: 225 | signal = np.mean(signal, axis=-1) 226 | 227 | 
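    # Descriptive note on the audio/frame alignment below:
    #   - frames_step = 1000 / fps ms is the duration of one video frame;
    #   - the signal is zero-padded by chunks_length ms on both sides, and frame i gets the
    #     2 * chunks_length ms window starting at i * frames_step ms of the padded signal;
    #   - each window is then split into 64 overlapping sub-frames (8 ms hop, 16 ms length),
    #     Hanning-windowed, and reduced to 32 LPC coefficients per sub-frame, giving one
    #     (32, 64) feature "image" per video frame.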
frames_per_second = fps # 视频fps(自己的数据集要设置好视频的fps才能和音频一一对应) 228 | chunks_length = 48 # 260音频分割,520ms 前260ms 后260ms 这个是可以自己设置的 229 | 230 | # 每signal个采样一个信号, 这个信号会对应着30帧视频 231 | audio_frameNum = int(len(signal) / rate * frames_per_second) # 计算音频对应的视频帧数(一般这个就等于视频帧数) 232 | 233 | # 前后各添加260ms音频 234 | a = np.zeros(chunks_length * rate // 1000, dtype=np.int16) 235 | 236 | signal = np.hstack((a, signal, a)) 237 | 238 | # signal = signal / (2.**15) 239 | frames_step = 1000.0 / frames_per_second # 视频每帧的时长间隔33.3333ms 240 | rate_kHz = int(rate / 1000) # 采样率:48kHz 241 | 242 | # 开始进行音频的分割 243 | # audio_frames = [signal[int(i * frames_step * rate_kHz): int((i * frames_step + chunks_length * 2) * rate_kHz)] for i 244 | # in range(audio_frameNum)] 245 | audio_frames = [signal[int(i * frames_step * rate / 1000): int((i * frames_step + chunks_length * 2) * rate / 1000)] for i 246 | in range(audio_frameNum)] 247 | 248 | inputData_array = np.zeros(shape=(1, 32, 64)) # 创建一个空3D数组,该数组(1*32*64)最后需要删除 249 | 250 | for i in range(len(audio_frames)): 251 | audio_frame = audio_frames[i] # 每段音频,8320个采样点 252 | 253 | overlap_frames_apart = 0.008 254 | overlap = int(rate * overlap_frames_apart) # 128 samples 255 | frameSize = int(rate * overlap_frames_apart * 2) # 256 samples 256 | numberOfFrames = 64 257 | 258 | frames = np.ndarray( 259 | (numberOfFrames, frameSize)) # initiate a 2D array with numberOfFrames rows and frame size columns 260 | for k in range(0, numberOfFrames): 261 | for i in range(0, frameSize): 262 | if ((k * overlap + i) < len(audio_frame)): 263 | frames[k][i] = audio_frame[k * overlap + i] 264 | else: 265 | frames[k][i] = 0 266 | 267 | frames *= np.hanning(frameSize) 268 | frames_lpc_features = [] 269 | b = (c_double * 32)() 270 | 271 | for k in range(0, numberOfFrames): 272 | a = (c_double * frameSize)(*frames[k]) 273 | dll.LPC(pointer(a), frameSize, 32, pointer(b)) 274 | frames_lpc_features.append(list(b)) 275 | 276 | image_temp1 = np.array(frames_lpc_features) # list2array 277 | image_temp2 = image_temp1.transpose() # array转置 278 | image_temp3 = np.expand_dims(image_temp2, axis=0) # 升维 279 | inputData_array = np.concatenate((inputData_array, image_temp3), axis=0) # array拼接 280 | 281 | # 删除第一行 282 | inputData_array = inputData_array[1:] 283 | 284 | # #扩展为4维:(-1, 32, 64, 1) 285 | inputData_array = np.expand_dims(inputData_array, axis=3) 286 | # print(inputData_array.shape) 287 | # 视频的长度是13831, 基本一致 288 | # (13832, 32, 64, 1) 289 | 290 | # 这里是为了使得视频帧和处理之后的音频长度对齐 291 | max_l = min(inputData_array.shape[0], num_frames) 292 | selected_audio = [] 293 | selected_bs_targets = [] 294 | for index, this_audio in enumerate(inputData_array[:max_l]): 295 | if not isinstance(bs_targets[index], int): 296 | selected_audio.append(this_audio[np.newaxis, :, :, :]) 297 | selected_bs_targets.append(np.array(bs_targets[index])[np.newaxis, :]) 298 | selected_audio = np.concatenate(selected_audio, axis=0) 299 | selected_bs_targets = np.concatenate(selected_bs_targets, axis=0) 300 | 301 | selected_bs_targets = selected_bs_targets[:, need_ids] 302 | 303 | print(selected_audio.shape) 304 | print(selected_bs_targets.shape) 305 | 306 | # 去除共有的前min_len个元素 307 | min_len = min(selected_audio.shape[0], selected_bs_targets.shape[0]) 308 | 309 | np.save("{}_audio.npy".format(flag_name), selected_audio[:min_len]) 310 | np.save("{}_bs_targets.npy".format(flag_name), selected_bs_targets[:min_len]) -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/audio_preprocess.py: 
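The two arrays saved at the end of generate_datasets_v1.py above are index-aligned: row i of the audio features matches row i of the blendshape targets. Below is a minimal loading sketch, assuming PyTorch; the file names follow the `{video}_audio.npy` / `{video}_bs_targets.npy` pattern produced by the script (the same names listed in .gitignore), and this is only an illustration, not the repository's own datasets/dataset.py.
```
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset


class AudioBsPairs(Dataset):
    """Pairs one (32, 64, 1) LPC feature image with one 11-dim mouth blendshape target."""

    def __init__(self, audio_npy, target_npy):
        self.audio = np.load(audio_npy).astype(np.float32)      # (N, 32, 64, 1)
        self.targets = np.load(target_npy).astype(np.float32)   # (N, 11)
        assert len(self.audio) == len(self.targets)

    def __len__(self):
        return len(self.audio)

    def __getitem__(self, i):
        x = torch.from_numpy(self.audio[i]).permute(2, 0, 1)    # channel-first: (1, 32, 64)
        y = torch.from_numpy(self.targets[i])
        return x, y


# file names follow the pattern saved above (also listed in .gitignore)
loader = DataLoader(
    AudioBsPairs("Av629249051-P1.mp4_audio.npy", "Av629249051-P1.mp4_bs_targets.npy"),
    batch_size=64, shuffle=True,
)
for audio_batch, bs_batch in loader:
    print(audio_batch.shape, bs_batch.shape)   # e.g. torch.Size([64, 1, 32, 64]) torch.Size([64, 11])
    break
```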
-------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import math 6 | import librosa 7 | import numpy as np 8 | import pandas as pd 9 | sys.path.append(".") 10 | sys.path.append("..") 11 | import features.util as util 12 | from features import features 13 | 14 | 15 | def wav_signal_to_feat(signal, 16 | ind_path, 17 | num_frames, 18 | fps, 19 | sample_rate, 20 | win_length, 21 | hop_length, 22 | half_chunks_length, 23 | feat_func, 24 | ): 25 | """ 26 | 音频转成输入特征,特征转换函数作为参数传入 27 | 28 | 音频切分成每个样本 -> 每个样本转特征 -> concat -> 返回结果 29 | ↘ 每个样本的过零率 ↗ 30 | 31 | :param signal: 音频时域信号,直接从wav中读取出来的 32 | :param ind_path: 图像帧的时域位置索引文件 33 | :param num_frames: 图像帧总数量 34 | :param fps: 35 | :param sample_rate: 音频采样率 36 | :param win_length: 音频分帧窗口大小 37 | :param hop_length: 音频分帧的帧移 38 | :param half_chunks_length: 单个训练样本的时长的一半,单位:ms 39 | :param feat_func: 特征提取函数,主要是lambda表达式 40 | 41 | :return: numpy.Array 42 | """ 43 | 44 | ### =================== 这个和我的数据处理的唯一区别是这里的rate是设置死的 160000 ==================== 45 | 46 | signal_length = len(signal) 47 | chunks_signal_samples = half_chunks_length * sample_rate // 1000 48 | # 前后各添加 空白 音频 49 | a = np.zeros(chunks_signal_samples, dtype=np.int16) 50 | signal = np.hstack((a, signal, a)) 51 | 52 | if ind_path is not None: 53 | img_frame_ind = np.load(ind_path) 54 | audio_frames = [signal[ind: ind + chunks_signal_samples * 2] for ind in img_frame_ind] 55 | else: 56 | frames_step = 1000.0 / fps # 视频每帧的时长间隔 57 | rate_kHz = sample_rate // 1000 # 1ms的采样数 58 | 59 | # 帧数不一致,无法对齐,丢弃数据,返回None 60 | if math.fabs(int(signal_length / (frames_step * rate_kHz)) - num_frames) > 2: 61 | print("calculate num frames: {}, actual num frames{}".format( 62 | signal_length / (frames_step * rate_kHz), num_frames)) 63 | print("different frames count, skip data.") 64 | return None 65 | 66 | # 按图像帧的位置,切分每个图像对应的输入音频样本 67 | audio_frames = [ 68 | signal[round(i * frames_step * rate_kHz): round((i * frames_step * rate_kHz) + 2 * chunks_signal_samples)] 69 | if round((i * frames_step * rate_kHz) + 2 * chunks_signal_samples) < len(signal) 70 | else signal[-int(2 * chunks_signal_samples):] 71 | for i in range(num_frames) 72 | ] 73 | 74 | audio_frames = np.array(audio_frames, dtype=np.float32) 75 | 76 | # 音频特征 77 | feat = feat_func(audio_frames) 78 | 79 | # 过零率特征 80 | zc_feat = features.zero_crossing_feat(audio_frames, win_length, hop_length) 81 | 82 | # 这里是包括过零率的 83 | # 这里过零率和特征的长度是一样的, cat在feat的前面 84 | feat = np.concatenate([zc_feat[:, np.newaxis, :], feat], axis=1) 85 | return feat 86 | 87 | 88 | def preprocess(wav_path, 89 | ind_path=None, 90 | num_frames: int = None, 91 | fps: float = 30, 92 | sample_rate=16000, 93 | is_add_noise=True, 94 | add_thick_noise=True, 95 | add_env_noise=True, 96 | env_noise=None, 97 | win_length=256, 98 | hop_length=128, 99 | half_chunks_length=48, 100 | feat_func=features.fbank, 101 | ): 102 | """ 103 | 预处理 104 | 105 | :param wav_path: 音频文件路径 106 | :param ind_path: 图像帧的时域位置索引文件 107 | :param num_frames: 108 | :param fps: 109 | :param sample_rate: 110 | :param is_add_noise: 是否添加轻量级噪声 111 | :param add_thick_noise: 是否添加重量级噪声 112 | :param add_env_noise: 是否添加环境音噪声 113 | :param env_noise: 环境音噪声,时域信号 114 | :param win_length: 115 | :param hop_length: 116 | :param half_chunks_length: 117 | :param feat_func: 118 | :return: 119 | """ 120 | 121 | feat, feat_wgn_light, feat_wgn_thick, feat_env_noise = None, None, None, None 122 | 123 | # 读取文件 124 | signal, rate = librosa.load(wav_path, sr=sample_rate) 125 | 
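    # Rough shape bookkeeping for the wav_signal_to_feat call below, given the defaults
    # used in this file (sample_rate=16000, win_length=256, hop_length=128,
    # half_chunks_length=48, n_mels=32): each video frame gets a 2 * 48 ms = 96 ms window
    # (1536 samples); fbank turns that into roughly a (32, 13) mel-band x frame block,
    # zero_crossing_feat adds one more row, so the returned feature is about (num_frames, 33, 13).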
signal_length = len(signal) 126 | print("length: ", signal_length, "rate: ", rate) 127 | 128 | # 空文件,返回None 129 | if signal_length == 0: 130 | return None, None, None, None 131 | 132 | # 特征提取 133 | feat = wav_signal_to_feat( 134 | signal, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 135 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 136 | ) 137 | 138 | # 帧数不一致,返回None 139 | if feat is None: 140 | return None, None, None, None 141 | 142 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<下面这些都不用看了先, 本任务不会有噪声<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 143 | 144 | # 添加高斯白噪声,信噪比分别为12和6 145 | if is_add_noise: 146 | signal_wgn_light = util.add_noise(signal, 12.) 147 | feat_wgn_light = wav_signal_to_feat( 148 | signal_wgn_light, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 149 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 150 | ) 151 | if add_thick_noise: 152 | signal_wgn_thick = util.add_noise(signal, 6.) 153 | feat_wgn_thick = wav_signal_to_feat( 154 | signal_wgn_thick, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 155 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 156 | ) 157 | # 添加真实环境音噪声 158 | if add_env_noise: 159 | signal_env_noise = util.add_other_noise(signal, noise=env_noise) 160 | feat_env_noise = wav_signal_to_feat( 161 | signal_env_noise, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 162 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 163 | ) 164 | 165 | return feat, feat_wgn_light, feat_wgn_thick, feat_env_noise 166 | 167 | 168 | def selfmade_arkit_preprocess(wav_dir, profile_dir, gt_dir, save_dir, env_noise): 169 | # 文件映射字典 170 | data_files = {os.path.splitext(file)[0]: os.path.join(wav_dir, file) 171 | for file in os.listdir(wav_dir) if file.split(".")[-1] == "wav"} 172 | profiles_dict = {os.path.splitext(file)[0]: os.path.join(profile_dir, file) for file in os.listdir(profile_dir)} 173 | gt_dict = {os.path.splitext(file)[0]: os.path.join(gt_dir, file) for file in os.listdir(gt_dir)} 174 | 175 | for file_id, path in data_files.items(): 176 | if os.path.exists(os.path.join(save_dir, file_id + ".npy")): 177 | print("Exist file {}".format(os.path.join(save_dir, file_id + ".npy"))) 178 | continue 179 | 180 | if file_id not in profiles_dict or file_id not in gt_dict: 181 | continue 182 | 183 | profile = util.load_json_file(profiles_dict[file_id]) 184 | gt_label = np.load(gt_dict[file_id]) 185 | 186 | num_frames = gt_label.shape[0] 187 | # num_frames = int(profile["num_frames"]) 188 | fps = profile["fps"] 189 | 190 | print("Processing {:<15s}, path: {}...".format(file_id, path)) 191 | lpc_feat, lpc_feat_wgn_s, _, feat_env_noise = preprocess( 192 | path, ind_path=None, num_frames=num_frames, fps=fps, 193 | sample_rate=SAMPLE_RATE, add_thick_noise=False, env_noise=env_noise, 194 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 195 | ) 196 | if lpc_feat is not None: 197 | np.save(save_dir + "/" + file_id + ".npy", lpc_feat) 198 | np.save(save_dir + "/" + file_id + ".wgn_s" + ".npy", lpc_feat_wgn_s) 199 | np.save(save_dir + "/" + file_id + ".env_n" + ".npy", feat_env_noise) 200 | 201 | 202 | def facegood_preprocess(wav_dir, label_dir, save_dir, env_noise): 203 | for file in os.listdir(wav_dir): 204 | file_id = file.split(".")[0] 205 | 
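        # Like selfmade_arkit_preprocess above, this writes up to three aligned feature files
        # per clip: "<id>.npy" (clean), "<id>.wgn_s.npy" (white noise at 12 dB SNR) and
        # "<id>.env_n.npy" (real environmental noise), so clean and noise-augmented versions
        # of the same audio can be mixed at training time.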
abs_path = os.path.join(wav_dir, file) 206 | print("Processing {:<15s}, path: {}...".format(file, abs_path)) 207 | 208 | # label数量 == 视频总帧数 209 | label_file = os.path.join(label_dir, "bs_value_{}.npy".format(os.path.splitext(file)[0])) 210 | num_frames = np.load(label_file).shape[0] 211 | 212 | feat, feat_wgn_small, feat_wgn_large, feat_env_noise = preprocess( 213 | abs_path, num_frames=num_frames, fps=30, sample_rate=SAMPLE_RATE, add_thick_noise=False, 214 | env_noise=env_noise, 215 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 216 | ) 217 | if feat is not None: 218 | np.save(os.path.join(save_dir, file_id + ".npy"), feat) 219 | np.save(os.path.join(save_dir, file_id + ".wgn_s" + ".npy"), feat_wgn_small) 220 | # np.save(os.path.join(save_dir, file_id + ".wgn_l" + ".npy"), feat_wgn_large) 221 | np.save(os.path.join(save_dir, file_id + ".env_n" + ".npy"), feat_env_noise) 222 | 223 | 224 | def aiwin_preprocess(aiwin_dir, save_dir, env_noise, is_eval=False): 225 | # 文件映射字典 226 | data_files = {file.split(".")[0].lower(): os.path.join(aiwin_dir, file) 227 | for file in os.listdir(aiwin_dir) if file.split(".")[-1] == "wav"} 228 | gt_files = {file.split(".")[0].lower().replace("_anim", ""): os.path.join(aiwin_dir, file) 229 | for file in os.listdir(aiwin_dir) if file.split(".")[-1] == "csv"} 230 | 231 | for file_id, path in data_files.items(): 232 | print("Processing {:<15s}, path: {}...".format(file_id, path)) 233 | 234 | if file_id not in gt_files: 235 | continue 236 | 237 | # label数量 == 视频总帧数 238 | label_file = gt_files[file_id] 239 | num_frames = pd.read_csv(label_file).shape[0] 240 | 241 | feat, feat_wgn_small, feat_wgn_large, feat_env_noise = preprocess( 242 | path, num_frames=num_frames, fps=25, sample_rate=SAMPLE_RATE, add_thick_noise=False, env_noise=env_noise, 243 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 244 | ) 245 | 246 | if feat is not None: 247 | if not is_eval: 248 | np.save(os.path.join(save_dir, "aiwin_" + file_id + ".npy"), feat) 249 | np.save(os.path.join(save_dir, "aiwin_" + file_id + ".wgn_s" + ".npy"), feat_wgn_small) 250 | # np.save(os.path.join(save_dir, "aiwin_" + file_id + ".wgn_l" + ".npy"), feat_wgn_large) 251 | np.save(os.path.join(save_dir, "aiwin_" + file_id + ".env_n" + ".npy"), feat_env_noise) 252 | else: 253 | np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".npy"), feat) 254 | np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".wgn_s" + ".npy"), feat_wgn_small) 255 | # np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".wgn_l" + ".npy"), feat_wgn_large) 256 | np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".env_n" + ".npy"), feat_env_noise) 257 | 258 | 259 | def public_dataset_preprocess(wav_dir, profile_dir, save_dir, env_noise): 260 | profiles_dict = {os.path.splitext(file)[0]: os.path.join(profile_dir, file) for file in os.listdir(profile_dir)} 261 | 262 | # 文件映射字典 263 | data_files = {file.split(".")[0]: os.path.join(wav_dir, file) 264 | for file in os.listdir(wav_dir) if file.split(".")[-1] == "wav"} 265 | 266 | for file_id, path in data_files.items(): 267 | if os.path.exists(os.path.join(save_dir, file_id + ".npy")): 268 | print("Exist file {}".format(os.path.join(save_dir, file_id + ".npy"))) 269 | continue 270 | 271 | if file_id in profiles_dict: 272 | profile = util.load_json_file(profiles_dict[file_id]) 273 | else: 274 | continue 275 | 276 | print("Processing {:<15s}, path: {}...".format(file_id, 
path)) 277 | feat, feat_wgn_small, _, feat_env_noise = preprocess( 278 | path, ind_path=None, num_frames=int(profile["num_frames"]), fps=profile["fps"], 279 | sample_rate=SAMPLE_RATE, add_thick_noise=False, env_noise=env_noise, 280 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 281 | ) 282 | 283 | if feat is not None: 284 | np.save(save_dir + "/" + file_id + ".npy", feat) 285 | np.save(save_dir + "/" + file_id + ".wgn_s" + ".npy", feat_wgn_small) 286 | np.save(save_dir + "/" + file_id + ".env_n" + ".npy", feat_env_noise) 287 | 288 | 289 | if __name__ == '__main__': 290 | # 参数配置 291 | use_self_made_arkit = False 292 | use_self_made_mocap = False 293 | 294 | use_facegood = False 295 | use_aiwin = False 296 | use_public = True 297 | 298 | SAMPLE_RATE = 16000 299 | WIN_LENGTH = 256 300 | HOP_LENGTH = 128 301 | N_FEAT = 32 302 | HALF_CHUNKS_LENGTH = 48 303 | FEAT_FUNC = lambda x: features.fbank( 304 | x, sample_rate=SAMPLE_RATE, win_length=WIN_LENGTH, hop_length=HOP_LENGTH, n_mels=N_FEAT) 305 | 306 | # 数据路径 307 | save_dir = "E:/datasets/audio2face/processed_datasets" 308 | # 这里还加上了噪声数据来模仿环境噪声 309 | env_noise_file = "E:/3D_face_reconstruct/audio2face/data/train/environmental_noise_2.wav" 310 | 311 | if not os.path.exists(save_dir): 312 | os.makedirs(save_dir) 313 | 314 | # 对之前从视频中分离出来的音频进解析 315 | # sample rate要用之前生成wav文件的时候使用的sr 316 | env_noise, _ = librosa.load(env_noise_file, sr=16000) 317 | 318 | # 自制数据,ARKIT录制 319 | if use_self_made_arkit: 320 | # clean_data_dir = "E:/数据集/人脸视频口型数据/clean" 321 | # selfmade_dataset_preprocess(clean_data_dir, clean_data_dir, save_dir) 322 | 323 | clean_data_dir = "E:/数据集/人脸视频口型数据/raw_3" 324 | profile_dir = "E:/数据集/人脸视频口型数据/profile_3" 325 | gt_dir = "E:/数据集/chinese_video_process/clean_gt_arkit" 326 | selfmade_arkit_preprocess(clean_data_dir, profile_dir, gt_dir, save_dir, env_noise=env_noise) 327 | 328 | # 自制数据,MocapFace识别 329 | if use_self_made_mocap: 330 | pass 331 | 332 | # FACEGOOD样例数据 333 | if use_facegood: 334 | facegood_wav_dir_path = "D:/projects/AI/research/pose/audio2face/data/train/raw_wav" 335 | facegood_label_dir_path = "D:/projects/AI/research/pose/audio2face/data/train/bs_value" 336 | facegood_preprocess(facegood_wav_dir_path, facegood_label_dir_path, save_dir, env_noise=env_noise) 337 | 338 | # AIWIN训练数据 339 | elif use_aiwin: 340 | aiwin_dir_path = "E:/数据集/chinese_video_process/audio2face_data_for_train" 341 | aiwin_eval_dir_path = "E:/数据集/chinese_video_process/audio2face_data_for_evaluation" 342 | # aiwin_preprocess(aiwin_dir_path, save_dir, env_noise=env_noise, is_eval=False) 343 | aiwin_preprocess(aiwin_eval_dir_path, save_dir, env_noise=env_noise, is_eval=True) 344 | 345 | # 公开数据集 346 | elif use_public: 347 | public_dataset_clean_wav_dir = "E:/datasets/audio2face/wav_base" 348 | public_dataset_profile_path = "E:/datasets/audio2face/profile_base" 349 | public_dataset_preprocess(public_dataset_clean_wav_dir, public_dataset_profile_path, save_dir, 350 | env_noise=env_noise) 351 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/chinese_public_dataset_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | """ 4 | # File : features.py 5 | # Time : 2022/8/6 14:33 6 | # Author : Lu Zeng 7 | # version: python 3.7 8 | """ 9 | 10 | import os 11 | import re 12 | import tqdm 13 | import json 14 | 15 | import cv2 16 | import skvideo.io 17 | import numpy as 
np 18 | import sys 19 | sys.path.append("..") 20 | sys.path.append(".") 21 | from mocap4face.mocap4face import MediapipeFaceDetection 22 | # 23 | MOUTH_RELATED_BLENDSHAPE_LIST = [ 24 | "jawOpen", 25 | "mouthFunnel", 26 | "mouthPucker", 27 | "mouthSmileLeft", 28 | "mouthSmileRight", 29 | "mouthStretchLeft", 30 | "mouthStretchRight", 31 | "mouthRollLower", 32 | "mouthRollUpper", 33 | "mouthShrugUpper", 34 | "mouthPressLeft", 35 | "mouthPressRight", 36 | "mouthLowerDownLeft", 37 | "mouthLowerDownRight", 38 | "mouthUpperUpLeft", 39 | "mouthUpperUpRight", 40 | ] 41 | 42 | 43 | def call_mocapface(model, img_src): 44 | """ 45 | 调用mocapface模型,输出blendshape结果 46 | 只选取嘴型有效的blendshape 47 | 48 | :param model: mocapface模型 49 | :param img_src: 输入图像,BGR通道 50 | :return: 51 | """ 52 | try: 53 | # 先进行人脸的检测, 检测不到人脸 54 | result, _ = model.MediapipeRun(img_src) 55 | except Exception as e: 56 | result = None 57 | 58 | if result is None: 59 | return [0.] * len(MOUTH_RELATED_BLENDSHAPE_LIST) 60 | 61 | face_json = model.jsonFormat(result) 62 | # 从mocapface中将得到的结果从里面拿出来 63 | # gt_label的顺序和MOUTH_RELATED_BLENDSHAPE_LIST中的顺序是一样的 64 | gt_label = [face_json.get(name, 0.) for name in MOUTH_RELATED_BLENDSHAPE_LIST] 65 | return gt_label 66 | 67 | 68 | @DeprecationWarning 69 | def makevideo2(video_file, _model: MediapipeFaceDetection): 70 | """ 71 | 已弃用 72 | cv2读取视频会跳过部分重复帧或者失败帧,导致最后帧数与音频时长对应不上,无法对齐数据 73 | :param video_file: 74 | :param _model: 75 | :return: 76 | """ 77 | capture = cv2.VideoCapture(video_file) 78 | 79 | video_profile_info = { 80 | "width": capture.get(cv2.CAP_PROP_FRAME_WIDTH), 81 | "height": capture.get(cv2.CAP_PROP_FRAME_HEIGHT), 82 | "channel": capture.get(cv2.CAP_PROP_CHANNEL), 83 | "fps": capture.get(cv2.CAP_PROP_FPS), 84 | "num_frames": capture.get(cv2.CAP_PROP_FRAME_COUNT), 85 | } 86 | print(video_profile_info) 87 | 88 | def image_iterator(cap): 89 | while True: 90 | _ret, _img = cap.read() 91 | if not _ret: 92 | break 93 | yield _img 94 | 95 | frame_id = 0 96 | ground_truth_list = [] 97 | if capture.isOpened(): 98 | for img_src in tqdm.tqdm(image_iterator(capture)): 99 | frame_id += 1 100 | gt_label = call_mocapface(_model, img_src) 101 | ground_truth_list.append(gt_label) 102 | else: 103 | print('视频打开失败!') 104 | 105 | gt_matrix = np.array(ground_truth_list) / 100.0 106 | if gt_matrix.shape[0] != video_profile_info["num_frames"]: 107 | video_profile_info["num_frames"] = gt_matrix.shape[0] 108 | 109 | return gt_matrix, video_profile_info 110 | 111 | 112 | def makevideo3(video_file, _model: MediapipeFaceDetection): 113 | """ 114 | cv2 获取视频基本信息 115 | skvideo 读取视频每一帧(注意: 不会漏帧) 116 | 117 | :param video_file: 118 | :param _model: 119 | :return: 120 | """ 121 | capture = cv2.VideoCapture(video_file) 122 | video_profile_info = { 123 | "width": capture.get(cv2.CAP_PROP_FRAME_WIDTH), 124 | "height": capture.get(cv2.CAP_PROP_FRAME_HEIGHT), 125 | "channel": capture.get(cv2.CAP_PROP_CHANNEL), 126 | "fps": capture.get(cv2.CAP_PROP_FPS), 127 | "num_frames": capture.get(cv2.CAP_PROP_FRAME_COUNT), 128 | } 129 | print(video_file, video_profile_info) 130 | capture.release() 131 | 132 | videogen = skvideo.io.vreader(video_file) 133 | 134 | frame_id = 0 135 | ground_truth_list = [] 136 | for img_src in tqdm.tqdm(videogen): 137 | frame_id += 1 138 | # skvideo读取数据为RGB 139 | img_src = cv2.cvtColor(img_src, cv2.COLOR_RGB2BGR) 140 | # 没有人脸的话这里直接返回0, 但是不会丢弃帧 141 | gt_label = call_mocapface(_model, img_src) 142 | ground_truth_list.append(gt_label) 143 | 144 | gt_matrix = np.array(ground_truth_list) / 100.0 145 | if 
gt_matrix.shape[0] != video_profile_info["num_frames"]: 146 | video_profile_info["num_frames"] = gt_matrix.shape[0] 147 | 148 | return gt_matrix, video_profile_info 149 | 150 | 151 | def get_duplicated_name(root_dir, output_name, output_suffix): 152 | """ 153 | 重名文件 加数字编号后缀 154 | 155 | :param root_dir: 156 | :param output_name: 157 | :param output_suffix: 158 | :return: 159 | """ 160 | # 防止重名 161 | output_file = os.path.join(root_dir, output_name + output_suffix) 162 | dup_ind = 1 163 | while os.path.exists(output_file): 164 | output_file = os.path.join(root_dir, output_name + "." + str(dup_ind) + output_suffix) 165 | dup_ind += 1 166 | return output_file 167 | 168 | 169 | def make_dir(dir_path): 170 | if not os.path.exists(dir_path): 171 | os.makedirs(dir_path) 172 | 173 | 174 | if __name__ == '__main__': 175 | video_data_dir_list = [ 176 | "E:/datasets/audio2face/wanghong_short_video", 177 | ] 178 | # 创建生成数据的时候要保存的目录 179 | wav_data_dir = "E:/datasets/audio2face/wav_base" 180 | profile_data_dir = "E:/datasets/audio2face/profile_base" 181 | gt_data_dir = "E:/datasets/audio2face/ground_truth_base" 182 | 183 | make_dir(wav_data_dir) 184 | make_dir(profile_data_dir) 185 | make_dir(gt_data_dir) 186 | 187 | def get_sub_files(path): 188 | """ 189 | 这是一个通过递归来收集一个文件夹中的文件的函数(遇到文件夹就继续递归调用, 遇到文件就将其加入到list中) 190 | """ 191 | sub_files = os.listdir(path) 192 | all_files = [] 193 | for file in sub_files: 194 | abs_file = os.path.join(path, file) 195 | if os.path.isfile(abs_file): 196 | all_files.append(abs_file) 197 | if os.path.isdir(abs_file): 198 | _files = get_sub_files(abs_file) 199 | all_files.extend(_files) 200 | return all_files 201 | 202 | # 建立mocapface的model用于打标注 203 | model = MediapipeFaceDetection( 204 | tflite_path="./mocap4face/2001161359.tflite", 205 | json_path="./mocap4face/2001161359.json") 206 | 207 | # 这里说话的人和他的语音是一一对应的 208 | speakers_mapping = {} 209 | for video_dir in video_data_dir_list: 210 | for i, file in enumerate(get_sub_files(video_dir)): 211 | 212 | # 1. 
得到各种文件的后缀 213 | abs_prefix, suffix = os.path.splitext(file) 214 | prefix, _ = os.path.splitext(os.path.split(file)[-1]) 215 | if suffix != ".avi" and suffix != ".mpg" and suffix != ".mp4": 216 | continue 217 | output_prefix = "_".join(file.removeprefix(video_dir).split(os.path.sep)[1:-1]) + "_" + prefix 218 | 219 | # 音频分离保存,采样率降采样为16kHz,单通道 220 | # 防止重名并更改后缀 221 | output_wav_file = get_duplicated_name(wav_data_dir, output_prefix, ".wav") 222 | # 这里统一将音频的采样率设置为了16k 223 | os.system("ffmpeg -i {} -f wav -ar 16000 -ac 1 {} -y".format(file, output_wav_file)) 224 | 225 | # mocapface 识别结果 226 | # gt_matrix, video_profile_info = makevideo2(file, model) 227 | gt_matrix, video_profile_info = makevideo3(file, model) 228 | 229 | # 保存GT数据 230 | gt_output_file = get_duplicated_name(gt_data_dir, output_prefix, ".npy") 231 | np.save(gt_output_file, gt_matrix) 232 | 233 | # 保存json 234 | profile_output_file = get_duplicated_name(profile_data_dir, output_prefix, ".json") 235 | with open(profile_output_file, "w", encoding="utf-8") as f: 236 | json.dump(video_profile_info, f) 237 | 238 | # 说话人ID 239 | speaker_id = prefix.split("_")[0].lower() 240 | speaker_id = re.sub(r"\d+", "", speaker_id) \ 241 | if re.match(r"^[a-z\u4e00-\u9fa5]+\d+$", speaker_id) \ 242 | else speaker_id 243 | speakers_mapping[output_prefix] = speaker_id 244 | 245 | with open(os.path.join(profile_data_dir, "speakers.json"), "w", encoding="utf-8") as f: 246 | json.dump(speakers_mapping, f, ensure_ascii=False, indent=4) 247 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/data_vad.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | import os 4 | import math 5 | import json 6 | 7 | import librosa 8 | import webrtcvad 9 | import numpy as np 10 | import scipy.io.wavfile as wf 11 | import sys 12 | sys.path.append(".") 13 | sys.path.append("..") 14 | from features.vad import VoiceActivityDetector 15 | 16 | def voice_activate_indices_detect_2(audio_file): 17 | """ 18 | 使用 py-webrtcvad 进行音频VAD处理 19 | 20 | 计算每个子带的对数能量,如果大于阈值,则对当前帧进行处理;否则直接将vad_flag置为0。 21 | 计算每个子带对应的高斯概率,并与子带的权重相乘作为语音/噪声最终的概率。 22 | 计算每个子带的对数似然比, 23 | 每个子带的似然比会和阈值进行比较作为一个局部结果 24 | 所有子带的对数加权似然比之和与阈值比较作一个全局的结果。当全局或局部其中一个为TRUE则认定当前帧是语音帧。 25 | 使用hangover对结果进行平滑 26 | 27 | 超过6个连续音频窗都不是语音,才视为静音片段 28 | 29 | :param audio_file: 30 | :return: 31 | """ 32 | sample_window = 0.03 33 | sample_overlap = 0.03 34 | 35 | v = webrtcvad.Vad(3) 36 | rate, data = wf.read(audio_file) 37 | 38 | sample_start = 0 39 | detected_windows = np.array([]) 40 | sample_window = int(rate * sample_window) 41 | sample_overlap = int(rate * sample_overlap) 42 | # 识别每个音频窗是否为语音 43 | while (sample_start < (len(data) - sample_window)): 44 | sample_end = sample_start + sample_window 45 | if sample_end >= len(data): 46 | sample_end = len(data) - 1 47 | sample_start = sample_end - sample_window - 1 48 | data_window = data[sample_start:sample_end] 49 | detected_windows = np.append(detected_windows, [sample_start, v.is_speech(data_window.tobytes(), rate)]) 50 | sample_start += sample_overlap 51 | detected_windows = detected_windows.reshape(int(len(detected_windows) / 2), 2) 52 | 53 | indices = [] 54 | viol_start, viol_end = -1, -1 55 | act_start, act_end = -1, -1 56 | interval_frames_threshold = 6 57 | for i, (_, flag) in enumerate(detected_windows): 58 | if flag == 0: 59 | viol_start = i if viol_start == -1 else viol_start 60 | viol_end = i 61 | elif viol_end - viol_start >= 
interval_frames_threshold: 62 | if act_start != -1: 63 | act_end = viol_start + 1 64 | indices.append((int(detected_windows[act_start, 0]), int(detected_windows[act_end, 0]))) 65 | act_start = i - 2 66 | viol_start = -1 67 | viol_end = -1 68 | else: 69 | if act_start == -1: 70 | act_start = i 71 | act_end = i 72 | viol_start = -1 73 | viol_end = -1 74 | 75 | indices.append((int(detected_windows[act_start, 0]), int(detected_windows[act_end, 0]))) 76 | 77 | return indices 78 | 79 | 80 | def vad4(): 81 | """ 82 | 音频文件VAD处理,静音片段识别, 83 | 静音片段的ground truth置零,切分训练集时会将全零的样本去除 84 | 置零前先进行标签平滑处理 85 | 86 | :return: 87 | """ 88 | 89 | # 公开数据集 90 | raw_dir = "E:/datasets/audio2face/wav_base" 91 | gt_dir = "E:/datasets/audio2face/ground_truth_base" 92 | profile_dir = "E:/datasets/audio2face/profile_base" 93 | 94 | output_gt_dir = "E:/datasets/audio2face/clean_gt_base" 95 | 96 | rate = 16000 97 | 98 | if not os.path.exists(output_gt_dir): 99 | os.makedirs(output_gt_dir) 100 | 101 | total_duration = 0 102 | skip_video_count = 0 103 | process_video_count = 0 104 | for file in os.listdir(raw_dir): 105 | file_id, suffix = os.path.splitext(file) 106 | 107 | # 跳过非wav格式的文件 108 | if suffix != ".wav": 109 | continue 110 | 111 | # 文件路径 112 | _gt_file = os.path.join(gt_dir, file_id + ".npy") 113 | _profile_file = os.path.join(profile_dir, file_id + ".json") 114 | if not os.path.exists(_gt_file) or not os.path.exists(_profile_file): 115 | continue 116 | 117 | # 加载数据,gt标签,视频基本信息 118 | with open(_profile_file, "r", encoding="utf-8") as f: 119 | profile = json.load(f) 120 | gt_label = np.load(_gt_file) 121 | audio, sr = librosa.load(os.path.join(raw_dir, file), sr=rate) 122 | 123 | fps = profile["fps"] 124 | frames_step = rate / fps 125 | 126 | # 总帧数无法和音频长度对齐,丢弃 127 | if int(math.fabs(len(audio) / frames_step - profile["num_frames"])) > 2: 128 | skip_video_count += 1 129 | print("Skip {:<15s}, num_frames: {:>8d} , calculate frames: {:>8.2f}".format( 130 | file_id, int(profile["num_frames"]), len(audio) / frames_step)) 131 | continue 132 | 133 | print("Processing {:<15s}, fps: {:<4.2f}, num_frames: {:<8d}".format(file_id, fps, int(profile["num_frames"]))) 134 | process_video_count += 1 135 | 136 | # 去除空白音频信号 137 | # internal_clean_ind = voice_activate_indices_detect(os.path.join(raw_dir, file)) 138 | internal_clean_ind = voice_activate_indices_detect_2(os.path.join(raw_dir, file)) 139 | 140 | # label加hamming窗,平滑 141 | win_size = 5 142 | if gt_label.shape[0] >= win_size: 143 | win = np.hamming(win_size) / np.sum(np.hamming(win_size)) 144 | for i in range(gt_label.shape[1]): 145 | gt_label[:, i] = np.convolve(gt_label[:, i], win, mode="same") 146 | 147 | # 重新构造gt标签,静音片段的gt置零 148 | new_gt_label = np.zeros_like(gt_label) 149 | for start, end in internal_clean_ind: 150 | frame_start_ind = round(start / frames_step) 151 | frame_end_ind = int(end // frames_step) 152 | frame_end_ind = min(frame_end_ind, len(gt_label) - 1) 153 | 154 | new_gt_label[frame_start_ind: frame_end_ind + 1] = gt_label[frame_start_ind: frame_end_ind + 1] 155 | 156 | duration = (frame_end_ind - frame_start_ind) / fps 157 | total_duration += duration 158 | 159 | # 输出保存数据 160 | np.save(os.path.join(output_gt_dir, file_id + ".npy"), new_gt_label) 161 | 162 | # 时长统计 163 | hours = int(total_duration // 3600) 164 | minutes = int((total_duration - hours * 3600) // 60) 165 | seconds = int(total_duration - hours * 3600 - minutes * 60) 166 | 167 | print("Total count: {}, Skip count: {}, Process count: {}, Duration: {:>3d}h {:>2d}m {:>2d}s".format( 168 | 
skip_video_count + process_video_count, skip_video_count, process_video_count, hours, minutes, seconds)) 169 | 170 | 171 | if __name__ == '__main__': 172 | vad4() 173 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/LPC.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/LPC.dll -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/__pycache__/features.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/__pycache__/features.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/__pycache__/util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/__pycache__/util.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/__pycache__/vad.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/__pycache__/vad.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/doc/bsname.txt: -------------------------------------------------------------------------------- 1 | 116 2 | brow_lower_l 3 | tongue_Scale__X 4 | tongue_Scale_Y 5 | tongue_Scale__Y 6 | tongue_Scale_Z 7 | tongue_Scale__Z 8 | nose_out_l 9 | nose_out_r 10 | tongue_u 11 | tongue_u_u 12 | brow_raise_d 13 | cheek_suck_r 14 | mouth_stretch_u 15 | tongue_u_d 16 | tooth_d_d 17 | tongue_d 18 | tooth_r 19 | tooth_d_u 20 | cheek_UP 21 | eye_blink1_l 22 | eye_blink1_r 23 | eye_blink2_l 24 | eye_blink2_r 25 | eye_lidTight_l 26 | eye_lidTight_r 27 | eye_shutTight_l 28 | eye_shutTight_r 29 | brow_lower_r 30 | eye_upperLidRaise_l 31 | eye_upperLidRaise_r 32 | eye_downLidRaise_l 33 | eye_downLidRaise_r 34 | jaw_sideways_l 35 | jaw_sideways_r 36 | jaw_thrust_c 37 | mouth_chew_c 38 | mouth_chinRaise_d 39 | mouth_chinRaise_u 40 | brow_raise_c 41 | mouth_dimple_l 42 | mouth_dimple_r 43 | mouth_funnel_dl 44 | mouth_funnel_dr 45 | mouth_funnel_ul 46 | mouth_funnel_ur 47 | mouth_lipCornerDepressFix_l 48 | mouth_lipCornerDepressFix_r 49 | mouth_lipCornerDepress_l 50 | mouth_lipCornerDepress_r 51 | brow_raise_l 52 | mouth_lipCornerPullOpen_l 53 | mouth_lipCornerPullOpen_r 54 | mouth_lipCornerPull_l 55 | mouth_lipCornerPull_r 56 | mouth_lipStretchOpen_l 57 | mouth_lipStretchOpen_r 58 | mouth_lipStretch_l 59 | mouth_lipStretch_r 60 | mouth_lowerLipDepress_l 61 | mouth_lowerLipDepress_r 62 | brow_raise_r 63 | mouth_lowerLipProtrude_c 64 | mouth_oh_c 65 | mouth_oo_c 66 | mouth_pressFix_c 67 | mouth_press_l 68 | mouth_press_r 69 | mouth_pucker_l 70 | mouth_pucker_r 71 | mouth_screamFix_c 72 | mouth_sideways_l 73 | cheek_puff_l 74 
| mouth_sideways_r 75 | mouth_stretch_c 76 | mouth_suck_dl 77 | mouth_suck_dr 78 | mouth_suck_ul 79 | mouth_suck_ur 80 | mouth_upperLipRaise_l 81 | mouth_upperLipRaise_r 82 | nose_wrinkle_l 83 | nose_wrinkle_r 84 | cheek_puff_r 85 | tooth_l 86 | eye_lookDown1_l 87 | eye_lookDown2_l 88 | eye_lookLeft_l 89 | eye_lookRight_l 90 | eye_lookUp_l 91 | eye_lookDown1_r 92 | eye_lookDown2_r 93 | eye_lookLeft_r 94 | eye_lookRight_r 95 | cheek_raise_l 96 | eye_lookUp_r 97 | tongue_Rot_1X 98 | tongue_Rot__1X 99 | tongue_Rot_2X 100 | tongue_Rot__2X 101 | tongue_Rot_3X 102 | tongue_Rot__3X 103 | tongue_Rot_1Y 104 | tongue_Rot__1Y 105 | tongue_Rot_2Y 106 | cheek_raise_r 107 | tongue_Rot__2Y 108 | tongue_Rot_3Y 109 | tongue_Rot__3Y 110 | tongue_Rot_1Z 111 | tongue_Rot__1Z 112 | tongue_Rot_2Z 113 | tongue_Rot__2Z 114 | tongue_Rot_3Z 115 | tongue_Rot__3Z 116 | tongue_Scale_X 117 | cheek_suck_l -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/features.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | """ 4 | 音频处理的可以看看这个博客: https://www.cnblogs.com/LXP-Never/p/11561355.html 5 | 天池零基础入门音频的教程: https://pythontechworld.com/article/detail/BCcNjuLDVYa6 6 | 分帧函数: https://blog.csdn.net/qq_37653144/article/details/89045363 7 | """ 8 | 9 | import os 10 | from ctypes import * 11 | 12 | import tqdm 13 | import librosa 14 | import numpy as np 15 | 16 | current_dir = os.path.split(os.path.abspath(__file__))[0] 17 | lpc_dll_file = os.path.join(current_dir, "LPC.dll") 18 | lpc_dll = cdll.LoadLibrary(lpc_dll_file) 19 | 20 | 21 | def lpc(audio_frames, sample_rate=16000): 22 | input_data_list = [] 23 | for audio_frame in tqdm.tqdm(audio_frames): 24 | # 8ms帧移, 16ms帧长 25 | overlap_frames_apart = 0.008 26 | overlap = int(sample_rate * overlap_frames_apart) 27 | frameSize = int(sample_rate * overlap_frames_apart * 2) 28 | numberOfFrames = (len(audio_frame) - frameSize) // overlap + 1 29 | 30 | # 构造音频帧 31 | # print(numberOfFrames, frameSize) 32 | frames = np.ndarray((numberOfFrames, frameSize)) 33 | for j in range(0, numberOfFrames): 34 | frames[j] = audio_frame[j * overlap: j * overlap + frameSize] 35 | 36 | # 加窗 37 | frames *= np.hanning(frameSize) 38 | 39 | # LPC 40 | frames_lpc_features = [] 41 | b = (c_double * 32)() 42 | for fr in frames: 43 | a = (c_double * frameSize)(*fr) 44 | # LPC(float *in, int size, int order, float *out) 45 | lpc_dll.LPC(pointer(a), frameSize, 32, pointer(b)); 46 | frames_lpc_features.append(list(b)) 47 | del a 48 | 49 | del b 50 | 51 | image_temp1 = np.array(frames_lpc_features) 52 | image_temp2 = np.expand_dims(image_temp1, axis=0) # 升维 53 | input_data_list.append(image_temp2) 54 | 55 | if not input_data_list: 56 | return None 57 | 58 | inputData_array = np.concatenate(input_data_list, axis=0) 59 | inputData_array = inputData_array.transpose((0, 2, 1)) 60 | 61 | # 扩展为4维:(,32,64,1) 62 | inputData_array = np.expand_dims(inputData_array, axis=3) 63 | 64 | return inputData_array 65 | 66 | 67 | def zero_crossing_feat(_wav, win_length, hop_length): 68 | """ 69 | 过零率 帧变负数负数变正数的时候要通过0这条线, 70 | :param _wav: [-1, 1536] 71 | :param win_length: 256 72 | :param hop_length: 128(这个应该也同时做为位移) 73 | :return: 74 | """ 75 | padding = [(0, 0) for _ in range(_wav.ndim)] # 不需要padding的维度 76 | padding[-1] = (hop_length, hop_length) # 只有最后一个维度才需要padding 77 | y = np.pad(_wav, padding, mode="constant") 78 | 79 | # sum --> / win_lenght 就是求个平均 80 | zc = np.sum( 81 | 
librosa.zero_crossings( 82 | np.transpose( 83 | # 分帧函数: 将时间序列分割成重叠的帧(所以应该是256帧叠在一起?, 13是特征的维度) 84 | librosa.util.frame(y, frame_length=win_length, hop_length=hop_length), # shape=(-1, 256, 13) 85 | [0, 2, 1]), 86 | pad=False 87 | ), 88 | axis=-1 89 | ) / win_length 90 | 91 | return zc 92 | 93 | 94 | def fbank(_wav, sample_rate, win_length, hop_length, n_mels, window="hann"): 95 | """ 96 | sample_rate: 16000 97 | win_lenght: 256 98 | hop_length: 128 99 | n_mels: 32 100 | window: hann 101 | 这里连续调用了两个音频处理库的包librosa(制作训练的数据集的时候会使用, inference的时候也会使用) 102 | """ 103 | # 如果提供了时间序列输入y,sr,则首先计算其幅值频谱S,然后通过mel_f.dot(S ** power)将其映射到mel scale上 。 104 | # 默认情况下,power= 2在功率谱上运行。 105 | # 这个东西就类似于LPC.dll的功能 106 | mel_spec_feat = librosa.feature.melspectrogram( 107 | y=_wav, 108 | sr=sample_rate, # 160000 109 | win_length=win_length, 110 | hop_length=hop_length, 111 | n_fft=win_length, 112 | window=window, 113 | n_mels=n_mels, 114 | ) 115 | 116 | # 再转换到对数刻度 117 | db_feat = librosa.core.power_to_db( 118 | mel_spec_feat, ref=1.0, amin=1e-10, top_db=None, 119 | ) 120 | feat = (db_feat + 100) / 130. 121 | return feat 122 | 123 | 124 | def mfcc(_wav, sample_rate, win_length, hop_length, n_mels, n_mfcc, window="hann"): 125 | feat = librosa.feature.mfcc( 126 | y=_wav, 127 | sr=sample_rate, 128 | win_length=win_length, 129 | hop_length=hop_length, 130 | n_fft=win_length, 131 | window=window, 132 | n_mels=n_mels, 133 | n_mfcc=n_mfcc, 134 | ) 135 | return feat 136 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/util.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | 4 | import os 5 | import math 6 | import json 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | current_dir = os.path.split(os.path.abspath(__file__))[0] 12 | 13 | FACEGOOD_BS_CONUNT = 116 14 | # the sort of bs name correspond to UE input sort 15 | bs_name_index = [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 16 | 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 17 | 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 18 | 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 105, 19 | 104, 106, 107, 108, 109, 110, 111, 112, 113, 114, 1, 115] 20 | label_name_list = pd.read_csv( 21 | os.path.join(current_dir, "doc", "bsname.txt"), encoding="utf-8").values.transpose()[0].tolist() 22 | 23 | STANDARD_ARKIT_BS_NAME = ["BlendShapeCount", "EyeBlinkLeft", "EyeLookDownLeft", "EyeLookInLeft", "EyeLookOutLeft", 24 | "EyeLookUpLeft", "EyeSquintLeft", "EyeWideLeft", "EyeBlinkRight", "EyeLookDownRight", 25 | "EyeLookInRight", "EyeLookOutRight", "EyeLookUpRight", "EyeSquintRight", "EyeWideRight", 26 | "JawForward", "JawRight", "JawLeft", "JawOpen", "MouthClose", "MouthFunnel", "MouthPucker", 27 | "MouthRight", "MouthLeft", "MouthSmileLeft", "MouthSmileRight", "MouthFrownLeft", 28 | "MouthFrownRight", "MouthDimpleLeft", "MouthDimpleRight", "MouthStretchLeft", 29 | "MouthStretchRight", "MouthRollLower", "MouthRollUpper", "MouthShrugLower", "MouthShrugUpper", 30 | "MouthPressLeft", "MouthPressRight", "MouthLowerDownLeft", "MouthLowerDownRight", 31 | "MouthUpperUpLeft", "MouthUpperUpRight", "BrowDownLeft", "BrowDownRight", "BrowInnerUp", 32 | "BrowOuterUpLeft", "BrowOuterUpRight", "CheekPuff", "CheekSquintLeft", 
"CheekSquintRight", 33 | "NoseSneerLeft", "NoseSneerRight", "TongueOut", "HeadYaw", "HeadPitch", "HeadRoll", 34 | "LeftEyeYaw", "LeftEyePitch", "LeftEyeRoll", "RightEyeYaw", "RightEyePitch", "RightEyeRoll", ] 35 | 36 | VALID_BS_NAME = [ 37 | "JawForward", 38 | "JawLeft", 39 | "JawRight", 40 | "JawOpen", 41 | "MouthFunnel", 42 | "MouthPucker", 43 | "MouthLeft", 44 | "MouthRight", 45 | "MouthSmileLeft", 46 | "MouthSmileRight", 47 | "MouthFrownLeft", 48 | "MouthFrownRight", 49 | "MouthDimpleLeft", 50 | "MouthDimpleRight", 51 | "MouthStretchLeft", 52 | "MouthStretchRight", 53 | "MouthRollLower", 54 | "MouthRollUpper", 55 | "MouthShrugLower", 56 | "MouthShrugUpper", 57 | "MouthPressLeft", 58 | "MouthPressRight", 59 | "MouthLowerDownLeft", 60 | "MouthLowerDownRight", 61 | "MouthUpperUpLeft", 62 | "MouthUpperUpRight" 63 | ] 64 | 65 | SELECT_VALID_BS_NAME = [ 66 | "JawOpen", 67 | "MouthFunnel", 68 | "MouthPucker", 69 | "MouthSmileLeft", 70 | "MouthSmileRight", 71 | "MouthStretchLeft", 72 | "MouthStretchRight", 73 | "MouthRollLower", 74 | "MouthRollUpper", 75 | "MouthShrugUpper", 76 | "MouthPressLeft", 77 | "MouthPressRight", 78 | "MouthLowerDownLeft", 79 | "MouthLowerDownRight", 80 | "MouthUpperUpLeft", 81 | "MouthUpperUpRight", 82 | ] 83 | 84 | 85 | def add_noise(origin_signal, snr): 86 | """ 87 | 添加高斯白噪声,固定信噪比 88 | """ 89 | noise = np.random.normal(0, 1, len(origin_signal)) 90 | 91 | # 计算语音信号功率Ps和噪声功率Pn1 92 | Ps = np.sum(origin_signal ** 2) / len(origin_signal) 93 | Pn1 = np.sum(noise ** 2) / len(noise) 94 | 95 | # 计算k值 96 | k = math.sqrt(Ps / (10 ** (snr / 10) * Pn1)) 97 | 98 | # 将噪声数据乘以k, 99 | random_values_we_need = noise * k 100 | 101 | new_signal = origin_signal.astype(np.float64) + random_values_we_need 102 | 103 | return new_signal 104 | 105 | 106 | def add_other_noise(origin_signal, noise): 107 | """ 108 | 添加指定噪声 109 | """ 110 | if len(origin_signal) / len(noise) > 1: 111 | new_noise = np.concatenate([noise] * int(len(origin_signal) / len(noise) + 1)) 112 | new_noise = new_noise[:len(origin_signal)] 113 | else: 114 | upper = len(noise) - len(origin_signal) 115 | start = np.random.randint(0, upper - 1) 116 | new_noise = noise[start:start + len(origin_signal)] 117 | new_signal = origin_signal + new_noise 118 | return new_signal 119 | 120 | 121 | def load_json_file(file_path): 122 | with open(file_path, "r", encoding="utf-8") as f: 123 | profile = json.load(f) 124 | return profile 125 | 126 | 127 | def rectangle_wav(wav): 128 | """ 129 | 将波形信号变为矩形波信号, 130 | 主要用于将时序BlendShape数值进行增强 131 | """ 132 | rect_wav = np.zeros_like(wav) 133 | extremum_indices = [] 134 | for t in range(1, len(wav) - 2): 135 | # 趋势是否改变 136 | is_change_slope = (wav[t + 1] - wav[t] + 1e-16) / (wav[t] - wav[t - 1] + 1e-16) 137 | if is_change_slope < 0: 138 | extremum_indices.append(t) 139 | 140 | # 常量信号,无波形 141 | if not extremum_indices: 142 | rect_wav[:] = wav[:] 143 | return rect_wav 144 | 145 | # 每个极值区间进行赋值 146 | for i, ind in enumerate(extremum_indices): 147 | if i == 0: 148 | start = 0 149 | else: 150 | start = int((ind + extremum_indices[i - 1]) / 2) 151 | 152 | if i == len(extremum_indices) - 1: 153 | end = wav.shape[0] 154 | else: 155 | end = int((ind + extremum_indices[i + 1]) / 2) 156 | rect_wav[start:end] = wav[ind] 157 | 158 | return rect_wav 159 | 160 | 161 | def facegood_bs_label_to_valid_arkit(label_temp): 162 | """ 163 | FACEGOOD样例数据转换成标准ARKITS表情 164 | 165 | :param label_temp: 166 | :return: 167 | """ 168 | _label = np.zeros((label_temp.shape[0], FACEGOOD_BS_CONUNT)) 169 | for i in 
range(len(bs_name_index)): 170 | _label[:, i] = label_temp[:, bs_name_index[i]] 171 | 172 | num_valid_bs = 26 173 | new_label = np.zeros((_label.shape[0], num_valid_bs), dtype=np.float32) 174 | new_label[:, 0] = _label[:, label_name_list.index("jaw_thrust_c")] 175 | new_label[:, 1] = _label[:, label_name_list.index("jaw_sideways_l")] 176 | new_label[:, 2] = _label[:, label_name_list.index("jaw_sideways_r")] 177 | new_label[:, 3] = _label[:, label_name_list.index("mouth_stretch_c")] 178 | # new_label[:, 4] = _label[:, label_name_list.index("mouth_chew_c")] 179 | new_label[:, 4] = np.max(_label[:, [label_name_list.index(n) 180 | for n in ["mouth_funnel_dl", "mouth_funnel_dr", "mouth_funnel_ul", 181 | "mouth_funnel_ur"]]], axis=1) 182 | new_label[:, 5] = np.max( 183 | _label[:, [label_name_list.index(n) for n in ["mouth_pucker_l", "mouth_pucker_r"]]], axis=1) 184 | new_label[:, 6] = _label[:, label_name_list.index("mouth_sideways_l")] 185 | new_label[:, 7] = _label[:, label_name_list.index("mouth_sideways_r")] 186 | new_label[:, 8] = _label[:, label_name_list.index("mouth_lipCornerPull_l")] 187 | new_label[:, 9] = _label[:, label_name_list.index("mouth_lipCornerPull_r")] 188 | new_label[:, 10] = np.max(_label[:, [label_name_list.index(n) for n in ["mouth_lipCornerDepress_l", 189 | "mouth_lipCornerDepressFix_l"]]], 190 | axis=1) 191 | new_label[:, 11] = np.max(_label[:, [label_name_list.index(n) for n in ["mouth_lipCornerDepress_r", 192 | "mouth_lipCornerDepressFix_r"]]], 193 | axis=1) 194 | new_label[:, 12] = _label[:, label_name_list.index("mouth_dimple_l")] 195 | new_label[:, 13] = _label[:, label_name_list.index("mouth_dimple_r")] 196 | new_label[:, 14] = _label[:, label_name_list.index("mouth_lipStretch_l")] 197 | new_label[:, 15] = _label[:, label_name_list.index("mouth_lipStretch_r")] 198 | new_label[:, 16] = np.max( 199 | _label[:, [label_name_list.index(n) for n in ["mouth_suck_dl", "mouth_suck_dr"]]], axis=1) 200 | new_label[:, 17] = np.max( 201 | _label[:, [label_name_list.index(n) for n in ["mouth_suck_ul", "mouth_suck_ur"]]], axis=1) 202 | new_label[:, 18] = _label[:, label_name_list.index("mouth_chinRaise_d")] 203 | new_label[:, 19] = _label[:, label_name_list.index("mouth_chinRaise_u")] 204 | new_label[:, 20] = _label[:, label_name_list.index("mouth_press_l")] 205 | new_label[:, 21] = _label[:, label_name_list.index("mouth_press_r")] 206 | new_label[:, 22] = _label[:, label_name_list.index("mouth_lowerLipDepress_l")] 207 | new_label[:, 23] = _label[:, label_name_list.index("mouth_lowerLipDepress_r")] 208 | new_label[:, 24] = _label[:, label_name_list.index("mouth_upperLipRaise_l")] 209 | new_label[:, 25] = _label[:, label_name_list.index("mouth_upperLipRaise_r")] 210 | 211 | return new_label 212 | 213 | 214 | def standard_arkit_bs_to_valid(label_temp): 215 | """ 216 | 标准ARKITS表情,抽取有效的嘴部动作 217 | 218 | :param label_temp: 219 | :return: 220 | """ 221 | num_valid_bs = len(VALID_BS_NAME) 222 | indices = [STANDARD_ARKIT_BS_NAME.index(bs) for bs in VALID_BS_NAME] 223 | # indices = [STANDARD_ARKIT_BS_NAME.index(bs) for bs in SELECT_VALID_BS_NAME] 224 | new_label = np.zeros((label_temp.shape[0], num_valid_bs), dtype=np.float32) 225 | new_label[:] = label_temp[:, indices] 226 | return new_label 227 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/vad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.io.wavfile as wf 3 | import 
matplotlib.pyplot as plt 4 | 5 | 6 | class VoiceActivityDetector: 7 | """ Use signal energy to detect voice activity in wav file """ 8 | 9 | def __init__(self, 10 | wave_input_filename, 11 | sample_window=0.02, 12 | sample_overlap=0.01, 13 | speech_window=0.5, 14 | speech_energy_threshold=0.6, 15 | speech_start_band=300, 16 | speech_end_band=3000, 17 | ): 18 | self._read_wav(wave_input_filename)._convert_to_mono() 19 | self.sample_window = sample_window # 20 ms 20 | self.sample_overlap = sample_overlap # 10ms 21 | self.speech_window = speech_window # half a second 22 | self.speech_energy_threshold = speech_energy_threshold # 60% of energy in voice band 23 | self.speech_start_band = speech_start_band 24 | self.speech_end_band = speech_end_band 25 | 26 | def _read_wav(self, wave_file): 27 | self.rate, self.data = wf.read(wave_file) 28 | self.channels = len(self.data.shape) 29 | self.filename = wave_file 30 | return self 31 | 32 | def _convert_to_mono(self): 33 | if self.channels == 2: 34 | self.data = np.mean(self.data, axis=1, dtype=self.data.dtype) 35 | self.channels = 1 36 | return self 37 | 38 | def _calculate_frequencies(self, audio_data): 39 | data_freq = np.fft.fftfreq(len(audio_data), 1.0 / self.rate) 40 | data_freq = data_freq[1:] 41 | return data_freq 42 | 43 | def _calculate_amplitude(self, audio_data): 44 | data_ampl = np.abs(np.fft.fft(audio_data)) 45 | data_ampl = data_ampl[1:] 46 | return data_ampl 47 | 48 | def _calculate_energy(self, data): 49 | data_amplitude = self._calculate_amplitude(data) 50 | data_energy = data_amplitude ** 2 51 | return data_energy 52 | 53 | def _znormalize_energy(self, data_energy): 54 | energy_mean = np.mean(data_energy) 55 | energy_std = np.std(data_energy) 56 | energy_znorm = (data_energy - energy_mean) / energy_std 57 | return energy_znorm 58 | 59 | def _connect_energy_with_frequencies(self, data_freq, data_energy): 60 | energy_freq = {} 61 | for (i, freq) in enumerate(data_freq): 62 | if abs(freq) not in energy_freq: 63 | energy_freq[abs(freq)] = data_energy[i] * 2 64 | return energy_freq 65 | 66 | def _calculate_normalized_energy(self, data): 67 | data_freq = self._calculate_frequencies(data) 68 | data_energy = self._calculate_energy(data) 69 | # data_energy = self._znormalize_energy(data_energy) #znorm brings worse results 70 | energy_freq = self._connect_energy_with_frequencies(data_freq, data_energy) 71 | return energy_freq 72 | 73 | def _sum_energy_in_band(self, energy_frequencies, start_band, end_band): 74 | sum_energy = 0 75 | for f in energy_frequencies.keys(): 76 | if start_band < f < end_band: 77 | sum_energy += energy_frequencies[f] 78 | return sum_energy 79 | 80 | def _median_filter(self, x, k): 81 | assert k % 2 == 1, "Median filter length must be odd." 82 | assert x.ndim == 1, "Input must be one-dimensional." 
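        # Sliding-window median filter: each column of y holds a copy of x shifted
        # by one offset in [-k2, k2], with the edges padded by repeating x[0] / x[-1];
        # the per-row median is then the filtered signal.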
83 | k2 = (k - 1) // 2 84 | y = np.zeros((len(x), k), dtype=x.dtype) 85 | y[:, k2] = x 86 | for i in range(k2): 87 | j = k2 - i 88 | y[j:, i] = x[:-j] 89 | y[:j, i] = x[0] 90 | y[:-j, -(i + 1)] = x[j:] 91 | y[-j:, -(i + 1)] = x[-1] 92 | return np.median(y, axis=1) 93 | 94 | def _smooth_speech_detection(self, detected_windows): 95 | median_window = int(self.speech_window / self.sample_window) 96 | if median_window % 2 == 0: median_window = median_window - 1 97 | median_energy = self._median_filter(detected_windows[:, 1], median_window) 98 | return median_energy 99 | 100 | def convert_windows_to_readible_labels(self, detected_windows): 101 | """ Takes as input array of window numbers and speech flags from speech 102 | detection and convert speech flags to time intervals of speech. 103 | Output is array of dictionaries with speech intervals. 104 | """ 105 | speech_time = [] 106 | is_speech = 0 107 | for window in detected_windows: 108 | if (window[1] == 1.0 and is_speech == 0): 109 | is_speech = 1 110 | speech_label = {} 111 | speech_time_start = window[0] / self.rate 112 | speech_label['speech_begin'] = speech_time_start 113 | print(window[0], speech_time_start) 114 | # speech_time.append(speech_label) 115 | if (window[1] == 0.0 and is_speech == 1): 116 | is_speech = 0 117 | speech_time_end = window[0] / self.rate 118 | speech_label['speech_end'] = speech_time_end 119 | speech_time.append(speech_label) 120 | print(window[0], speech_time_end) 121 | return speech_time 122 | 123 | def plot_detected_speech_regions(self): 124 | """ Performs speech detection and plot original signal and speech regions. 125 | """ 126 | data = self.data 127 | detected_windows = self.detect_speech() 128 | data_speech = np.zeros(len(data)) 129 | it = np.nditer(detected_windows[:, 0], flags=['f_index']) 130 | while not it.finished: 131 | data_speech[int(it[0])] = data[int(it[0])] * detected_windows[it.index, 1] 132 | it.iternext() 133 | plt.figure(figsize=(200, 10)) 134 | plt.plot(data_speech) 135 | plt.show() 136 | plt.figure(figsize=(200, 10)) 137 | plt.plot(data) 138 | plt.show() 139 | return self 140 | 141 | def detect_speech(self): 142 | """ Detects speech regions based on ratio between speech band energy 143 | and total energy. 144 | Output is array of window numbers and speech flags (1 - speech, 0 - nonspeech). 
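        Each analysis window spans sample_window seconds and the window start
        advances by sample_overlap seconds. A window counts as speech when the
        energy inside [speech_start_band, speech_end_band] Hz exceeds
        speech_energy_threshold of its total energy; the per-window flags are
        then median-filtered over speech_window / sample_window windows to
        suppress isolated flips.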
145 | """ 146 | detected_windows = np.array([]) 147 | sample_window = int(self.rate * self.sample_window) 148 | sample_overlap = int(self.rate * self.sample_overlap) 149 | data = self.data 150 | sample_start = 0 151 | start_band = self.speech_start_band 152 | end_band = self.speech_end_band 153 | while (sample_start < (len(data) - sample_window)): 154 | sample_end = sample_start + sample_window 155 | if sample_end >= len(data): sample_end = len(data) - 1 156 | data_window = data[sample_start:sample_end] 157 | energy_freq = self._calculate_normalized_energy(data_window) 158 | sum_voice_energy = self._sum_energy_in_band(energy_freq, start_band, end_band) 159 | sum_full_energy = sum(energy_freq.values()) 160 | speech_ratio = sum_voice_energy / sum_full_energy 161 | # Hipothesis is that when there is a speech sequence we have ratio of energies more than Threshold 162 | speech_ratio = speech_ratio > self.speech_energy_threshold 163 | detected_windows = np.append(detected_windows, [sample_start, speech_ratio]) 164 | sample_start += sample_overlap 165 | detected_windows = detected_windows.reshape(int(len(detected_windows) / 2), 2) 166 | detected_windows[:, 1] = self._smooth_speech_detection(detected_windows) 167 | return detected_windows 168 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/2001161359.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "is_face.base_model_path": "2001161359_bs_trans__ti_a8u2m2mh2f2__lrd150_augN01A05_brd03__par1222pyr5_ks753_dsS_dbCS1x1_sz16mul11236__split800_4106000_weights.130-0.39-model.h5" 4 | }, 5 | "model_metadata": { 6 | "input": { 7 | "resolution": [ 8 | 256, 9 | 256 10 | ] 11 | }, 12 | "outputs": { 13 | "position": { 14 | "resolution": [ 15 | 256, 16 | 256 17 | ] 18 | }, 19 | "blendshapes": { 20 | "names": [ 21 | "browOutterUpLeft", 22 | "browInnerUp", 23 | "browDownLeft", 24 | "eyeBlinkLeft", 25 | "eyeSquintLeft", 26 | "eyeWideLeft", 27 | "eyeLookUpLeft", 28 | "eyeLookOutLeft", 29 | "eyeLookInLeft", 30 | "eyeLookDownLeft", 31 | "noseSneerLeft", 32 | "mouthUpperUpLeft", 33 | "mouthSmileLeft", 34 | "mouthLeft", 35 | "mouthFrownLeft", 36 | "mouthLowerDownLeft", 37 | "jawLeft", 38 | "cheekPuff", 39 | "mouthShrugUpper", 40 | "mouthFunnel", 41 | "mouthRollLower", 42 | "jawOpen", 43 | "tongueOut", 44 | "mouthPucker", 45 | "mouthRollUpper", 46 | "jawRight", 47 | "mouthLowerDownRight", 48 | "mouthFrownRight", 49 | "mouthRight", 50 | "mouthSmileRight", 51 | "mouthUpperUpRight", 52 | "noseSneerRight", 53 | "eyeLookDownRight", 54 | "eyeLookInRight", 55 | "eyeLookOutRight", 56 | "eyeLookUpRight", 57 | "eyeWideRight", 58 | "eyeSquintRight", 59 | "eyeBlinkRight", 60 | "browDownRight", 61 | "browInnerUp", 62 | "browOutterUpRight" 63 | ], 64 | "count": 42 65 | }, 66 | "transforms": { 67 | "elements": [ 68 | "quat", 69 | "headCenter3", 70 | "scale1", 71 | "nose3", 72 | "rightEar3", 73 | "leftEar3", 74 | "euler3" 75 | ], 76 | "field_count": 20, 77 | "scale_normalization": { 78 | "scale": 2, 79 | "offset": 0.600000023841858 80 | }, 81 | "euler_rotation_ranges": [ 82 | 45, 83 | 88, 84 | 40 85 | ] 86 | } 87 | } 88 | } 89 | } -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/2001161359.tflite: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/mocap4face/2001161359.tflite -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/__pycache__/mocap4face.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/mocap4face/__pycache__/mocap4face.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/mocap4face.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import tensorflow as tf 4 | import numpy as np 5 | import cv2 6 | import mediapipe as mp 7 | import time 8 | 9 | addr = '192.168.38.6' 10 | 11 | blendershapes_index_map = [ 12 | "None", 13 | "browInnerUp", 14 | "browOutterUpLeft", 15 | "browOutterUpRight", 16 | "browDownLeft", 17 | "browDownRight", 18 | "eyeWideLeft", 19 | "eyeWideRight", 20 | "cheekSquintLeft", 21 | "cheekSquintRight", 22 | "eyeSquintLeft", 23 | "eyeSquintRight", 24 | "noseSneerLeft", 25 | "noseSneerRight", 26 | "mouthUpperUpLeft", 27 | "mouthUpperUpRight", 28 | "mouthLeft", 29 | "mouthRight", 30 | "mouthSmileLeft", 31 | "mouthSmileRight", 32 | "mouthDimpleLeft", 33 | "mouthDimpleRight", 34 | "mouthFrownLeft", 35 | "mouthFrownRight", 36 | "mouthLowerDownLeft", 37 | "mouthLowerDownRight", 38 | "mouthShrugLower", 39 | "mouthShrugUpper", 40 | "mouthPucker", 41 | "mouthStretchLeft", 42 | "mouthStretchRight", 43 | "mouthFunnel", 44 | "mouthPress", 45 | "jawOpen", 46 | "mouthRollLower", 47 | "mouthRollUpper", 48 | "jawForward", 49 | "jawLeft", 50 | "jawRight", 51 | "cheekPuff", 52 | "eyeBlinkLeft", 53 | "eyeBlinkRight", 54 | "eyeLookDownLeft", 55 | "eyeLookDownRight", 56 | "eyeLookInLeft", 57 | "eyeLookInRight", 58 | "eyeLookOutLeft", 59 | "eyeLookOutRight", 60 | "eyeLookUpLeft", 61 | "eyeLookUpRight", 62 | "mouthPressLeft", 63 | "mouthPressRight", 64 | "headDown", 65 | "headLeft", 66 | "headRight", 67 | "headRollLeft", 68 | "headRollRight", 69 | "headUp", 70 | "tongueOut", 71 | ] 72 | body1 = """{"frame":81,"timestamp":1653020274303}""" 73 | body2 = 
"""#{"cmdList":[{"k":0,"v":{"x":-0.16915,"y":0.44524,"z":-0.14412},"visibility":0.99242},{"k":1,"v":{"x":-0.23624,"y":0.28103,"z":-0.13774},"visibility":0.80215},{"k":2,"v":{"x":-0.25922,"y":0.1164,"z":-0.16851},"visibility":0.3386},{"k":3,"v":{"x":-0.24703,"y":0.10263,"z":-0.18967},"visibility":0.35001},{"k":4,"v":{"x":-0.25588,"y":0.06091,"z":-0.18511},"visibility":0.27888},{"k":5,"v":{"x":0.11057,"y":0.48848,"z":-0.03623},"visibility":0.99176},{"k":6,"v":{"x":0.3029,"y":0.40632,"z":-0.03385},"visibility":0.66099},{"k":7,"v":{"x":0.43823,"y":0.3376,"z":-0.17424},"visibility":0.56821},{"k":8,"v":{"x":0.42734,"y":0.3283,"z":-0.19942},"visibility":0.57001},{"k":9,"v":{"x":0.47437,"y":0.31352,"z":-0.20123},"visibility":0.47294},{"k":10,"v":{"x":0.04314,"y":0.64238,"z":-0.10405},"visibility":0.99465},{"k":11,"v":{"x":0.02436,"y":0.66783,"z":-0.2133},"visibility":0.99679},{"k":12,"v":{"x":-0.08512,"y":0.63474,"z":-0.14779},"visibility":0.9982},{"k":13,"v":{"x":-0.00396,"y":0.66814,"z":-0.22631},"visibility":0.99743},{"k":14,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":15,"v":{"x":-0.10621,"y":0.00281,"z":0.00741},"visibility":1.0},{"k":16,"v":{"x":-0.09716,"y":-0.37997,"z":0.00613},"visibility":1.0},{"k":17,"v":{"x":-0.08832,"y":-0.73587,"z":0.19865},"visibility":1.0},{"k":18,"v":{"x":-0.13203,"y":-0.85234,"z":0.09052},"visibility":1.0},{"k":19,"v":{"x":0.10582,"y":-0.00233,"z":-0.00702},"visibility":1.0},{"k":20,"v":{"x":0.12449,"y":-0.38901,"z":0.0043},"visibility":1.0},{"k":21,"v":{"x":0.15222,"y":-0.72213,"z":0.18485},"visibility":1.0},{"k":22,"v":{"x":0.18626,"y":-0.83152,"z":0.06556},"visibility":1.0},{"k":23,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":24,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":25,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":26,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":27,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781}],"status":0,"valid":1}""" 74 | 75 | 76 | def resize_img_keep_ratio(img, target_size=(800, 800)): 77 | old_size = img.shape[0:2] 78 | ratio = min(float(target_size[i]) / (old_size[i]) for i in range(len(old_size))) 79 | new_size = tuple([int(i * ratio) for i in old_size]) 80 | img = cv2.resize(img, (new_size[1], new_size[0])) 81 | pad_w = target_size[1] - new_size[1] 82 | pad_h = target_size[0] - new_size[0] 83 | top, bottom = pad_h // 2, pad_h - (pad_h // 2) 84 | left, right = pad_w // 2, pad_w - (pad_w // 2) 85 | img_new = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, None, (255, 255, 255)) 86 | return img_new 87 | 88 | 89 | class MediapipeFaceDetection: 90 | def __init__(self, tflite_path="./2001161359.tflite", json_path="./2001161359.json"): 91 | self.face_det = self.MediapipeInit() 92 | self.tfliteInit(tflite_path) 93 | self.getMocapDict(json_path) 94 | 95 | def tfliteInit(self, tflite_file): 96 | # Initialize the interpreter 97 | self.interpreter = tf.lite.Interpreter(model_path=str(tflite_file)) 98 | self.interpreter.allocate_tensors() 99 | 100 | self.input_details = self.interpreter.get_input_details()[0] 101 | self.blendershapes_output_details = self.interpreter.get_output_details()[0] 102 | self.transforms_output_details = self.interpreter.get_output_details()[1] 103 | 104 | def MediapipeInit(self): 105 | face_det = mp.solutions.face_detection.FaceDetection( 106 | min_detection_confidence=0.5, 107 | model_selection=0 108 | ) 109 | return face_det 110 | 111 | def 
MediapipeRun(self, image, return_face=False): 112 | # Convert the BGR image to RGB before processing. 113 | results = self.face_det.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) 114 | if results.detections is None: 115 | return None, None 116 | h, w, _ = image.shape 117 | face = results.detections[0] 118 | cx = int(face.location_data.relative_bounding_box.xmin * w) 119 | cy = int(face.location_data.relative_bounding_box.ymin * h) 120 | height = int(face.location_data.relative_bounding_box.height * h) 121 | width = int(face.location_data.relative_bounding_box.width * w) 122 | 123 | side_length = max(height, width) + 60 124 | 125 | y_start = int(max((cy + cy + height) / 2 - side_length / 2, 0.)) 126 | y_end = int(min(y_start + side_length, h)) 127 | x_start = int(max((cx + cx + width) / 2 - side_length / 2, 0.)) 128 | x_end = int(min(x_start + side_length, w)) 129 | 130 | # face_image = image[cy:cy+height, cx:cx+width] 131 | face_image = image[y_start:y_end, x_start:x_end] 132 | 133 | # test_image = face_image 134 | test_image = cv2.resize(face_image, (256, 256)) 135 | # test_image = resize_img_keep_ratio(face_image, (256, 256)) 136 | s1 = time.time() 137 | test_image = np.expand_dims(test_image, axis=0).astype(self.input_details["dtype"]) / 255. 138 | self.interpreter.set_tensor(self.input_details["index"], test_image) 139 | self.interpreter.invoke() 140 | blendershapes = self.interpreter.get_tensor(self.blendershapes_output_details["index"])[0] * 100 141 | transforms = self.interpreter.get_tensor(self.transforms_output_details["index"]) 142 | # print(time.time() - s1) 143 | if return_face: 144 | return blendershapes, transforms, test_image 145 | return blendershapes, transforms 146 | 147 | def MediapipeRunWithoutFaceDetect(self, image): 148 | # Convert the BGR image to RGB before processing. 149 | face_image = image 150 | test_image = cv2.resize(face_image, (256, 256)) 151 | # test_image = resize_img_keep_ratio(face_image, (256, 256)) 152 | s1 = time.time() 153 | test_image = np.expand_dims(test_image, axis=0).astype(self.input_details["dtype"]) / 255. 
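        # The TFLite model takes a single 256x256 face crop with a batch dimension,
        # cast to the interpreter's input dtype and scaled to [0, 1]; the blendshape
        # tensor read back below is multiplied by 100 before being returned.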
154 | self.interpreter.set_tensor(self.input_details["index"], test_image) 155 | self.interpreter.invoke() 156 | blendershapes = self.interpreter.get_tensor(self.blendershapes_output_details["index"])[0] * 100 157 | transforms = self.interpreter.get_tensor(self.transforms_output_details["index"]) 158 | # print(time.time() - s1) 159 | return blendershapes, transforms 160 | 161 | def jsonFormat(self, prediction): 162 | json_kv = {} 163 | for idx, emoji_val in enumerate(prediction): 164 | emoji_name = self.mocap[idx] 165 | json_kv[emoji_name] = emoji_val 166 | return json_kv 167 | 168 | def getMocapDict(self, path): 169 | with open(path, 'r') as f: 170 | j = json.load(f) 171 | self.mocap = j['model_metadata']['outputs']['blendshapes']['names'] 172 | self.blendershapes_map = {} 173 | for index, bs in enumerate(blendershapes_index_map): 174 | self.blendershapes_map[bs] = index 175 | 176 | 177 | if __name__ == "__main__": 178 | fmp = MediapipeFaceDetection() 179 | capture = cv2.VideoCapture(0) 180 | if not capture.isOpened(): 181 | print("打开视频失败!") 182 | 183 | fps = capture.get(cv2.CAP_PROP_FPS) 184 | size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), 185 | int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))) 186 | fNUMS = capture.get(cv2.CAP_PROP_FRAME_COUNT) 187 | print("fps:", fps) 188 | print("size:", size) 189 | print("fNUMS:", fNUMS) 190 | 191 | f_cnt = 0 192 | time_cnt = 0 193 | while True: 194 | _, frame = capture.read() 195 | if frame is None: 196 | break 197 | f_cnt += 1 198 | res, _ = fmp.MediapipeRun(frame) 199 | face_json = fmp.jsonFormat(res) 200 | 201 | capture.release() 202 | -------------------------------------------------------------------------------- /datasets/__pycache__/dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/datasets/__pycache__/dataset.cpython-39.pyc -------------------------------------------------------------------------------- /datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from torch.utils.data import Dataset 4 | import numpy as np 5 | import torch 6 | import cv2 7 | import sys 8 | sys.path.append(".") 9 | sys.path.append("..") 10 | import random 11 | import torchvision.transforms as transforms 12 | import json 13 | import matplotlib.pyplot as plt 14 | from PIL import Image 15 | import math 16 | 17 | class AudioDataset(Dataset): 18 | def __init__(self, target_root, data_root): 19 | """ 20 | :param window: 音频序列的长度为3 21 | """ 22 | self.target_root = target_root 23 | self.data_root = data_root 24 | 25 | self.all_data = [] 26 | self.all_gt = [] 27 | 28 | self.pre_process() 29 | 30 | 31 | def vector_transforms(self, data): 32 | # option(1) 这个是全局的mean和std 33 | # data_mean = np.mean(data) 34 | # data_std = np.std(data) 35 | 36 | # option(2) 这个是针对每个特征的mean和std 37 | num_length = data.shape[-1] 38 | data_mean = np.mean(data.reshape(-1, num_length), axis=0, keepdims=True)[np.newaxis, ...] 39 | data_std = np.std(data.reshape(-1, num_length), axis=0, keepdims=True)[np.newaxis, ...] 
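        # Per-feature z-score: mean and std are computed over all frames for each of
        # the num_length feature columns and broadcast back over the batch below.
        # A feature that never changes gives std == 0, so adding a small epsilon to
        # data_std would be a safe extra guard.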
40 | 41 | # 数据标准化 42 | data = (data - data_mean) / data_std 43 | 44 | return data 45 | 46 | def pre_process(self): 47 | """ 48 | 对数据进行预处理,收集数据 49 | :return: 50 | """ 51 | data_list = os.listdir(self.data_root) 52 | # target_list = os.listdir(self.target_root) 53 | 54 | # for index, item in enumerate(data_list): 55 | # assert item == target_list[index] 56 | 57 | for index, data_name in enumerate(data_list): 58 | data_path = os.path.join(self.data_root, data_name) 59 | target_path = os.path.join(self.target_root, data_name) 60 | data = np.load(data_path) 61 | gt = np.load(target_path) 62 | 63 | # 无口型的片段全部去除,可能是没有人脸,或者噪声数据 64 | # 静音片段的gt也置零 65 | gt_sum = gt.sum(axis=1) 66 | zero_index = np.where(gt_sum == 0)[0] 67 | # 按概率将0标签的输入也置零 68 | # option(2) 69 | if len(zero_index) > 0: 70 | data[zero_index] = 0 71 | 72 | # # option(1) 73 | # select_data = [] 74 | # select_gt = [] 75 | # for i in range(data.shape[0]): 76 | # if i not in zero_index: 77 | # select_data.append(data[i][np.newaxis, ...]) 78 | # select_gt.append(gt[i][np.newaxis, ...]) 79 | # data = np.concatenate(select_data, axis=0) 80 | # gt = np.concatenate(select_gt, axis=0) 81 | 82 | 83 | data = self.vector_transforms(data) 84 | 85 | padding_data = np.zeros(data[0].shape)[np.newaxis, ...] 86 | padding_gt = np.zeros(gt[0].shape)[np.newaxis, ...] 87 | self.all_data.append(data) 88 | self.all_data.append(padding_data) 89 | self.all_gt.append(gt) 90 | self.all_gt.append(padding_gt) 91 | 92 | # 第一个vector是过零率 93 | self.all_data = np.concatenate(self.all_data, axis=0)[:, np.newaxis, :, :] 94 | self.all_gt = np.concatenate(self.all_gt, axis=0) 95 | 96 | def __len__(self): 97 | return len(self.all_data) 98 | 99 | def __getitem__(self, index): 100 | return torch.FloatTensor(np.array(self.all_data[index], dtype=np.float32)), torch.FloatTensor(np.array(self.all_gt[index], dtype=np.float32)) 101 | 102 | if __name__ == "__main__": 103 | target_root = "E:/datasets/audio2face/train_gt" 104 | data_root = "E:/datasets/audio2face/train_data" 105 | trainsets = AudioDataset(target_root, data_root) 106 | trainloader = torch.utils.data.DataLoader(trainsets, batch_size=8, shuffle=True, num_workers=0) 107 | 108 | for batch_idx, (datas, targets) in enumerate(trainloader): 109 | pass 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /model_weights/2001161359.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/model_weights/2001161359.tflite -------------------------------------------------------------------------------- /models/__pycache__/mouth_net.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/models/__pycache__/mouth_net.cpython-39.pyc -------------------------------------------------------------------------------- /models/mouth_net.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | 5 | class Swish(nn.Module): 6 | def __init__(self): 7 | super(Swish, self).__init__() 8 | 9 | def forward(self, x): 10 | x = x * F.sigmoid(x) 11 | return x 12 | 13 | class MouthNet(nn.Module): 14 | def __init__(self, class_num=16): 15 | super(MouthNet, self).__init__() 16 | 17 | # 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 18 | encoder1 = [] 19 | layer1 = [] 20 | layer1.append(nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 1), stride=(2, 1), padding=0)) 21 | layer1.append(nn.BatchNorm2d(32)) 22 | layer1.append(Swish()) 23 | layer1 = nn.Sequential(*layer1) 24 | encoder1.append(layer1) 25 | 26 | layer2 = [] 27 | layer2.append(nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 1), stride=(4, 1), padding=0)) 28 | layer2.append(nn.BatchNorm2d(64)) 29 | layer2.append(Swish()) 30 | layer2 = nn.Sequential(*layer2) 31 | encoder1.append(layer2) 32 | 33 | layer3 = [] 34 | layer3.append(nn.Conv2d(in_channels=64, out_channels=96, kernel_size=(3, 1), stride=(4, 1), padding=0)) 35 | layer3.append(nn.BatchNorm2d(96)) 36 | layer3.append(Swish()) 37 | layer3 = nn.Sequential(*layer3) 38 | encoder1.append(layer3) 39 | 40 | self.encoder1 = nn.Sequential(*encoder1) 41 | 42 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 43 | encoder2 = [] 44 | layer1 = [] 45 | layer1.append(nn.Conv2d(in_channels=97, out_channels=128, kernel_size=(1, 3), stride=(1, 3), padding=0)) 46 | layer1.append(nn.BatchNorm2d(128)) 47 | layer1.append(Swish()) 48 | layer1 = nn.Sequential(*layer1) 49 | encoder2.append(layer1) 50 | 51 | layer2 = [] 52 | layer2.append(nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(1, 2), stride=(1, 2), padding=0)) 53 | layer2.append(nn.BatchNorm2d(128)) 54 | layer2.append(Swish()) 55 | layer2 = nn.Sequential(*layer2) 56 | encoder2.append(layer2) 57 | 58 | layer3 = [] 59 | layer3.append(nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(1, 2), stride=(1, 2), padding=0)) 60 | layer3.append(nn.BatchNorm2d(128)) 61 | layer3.append(Swish()) 62 | layer3 = nn.Sequential(*layer3) 63 | encoder2.append(layer3) 64 | 65 | self.encoder2 = nn.Sequential(*encoder2) 66 | 67 | regression = [] 68 | regression.append(nn.Linear(128, 64)) 69 | regression.append(nn.Dropout(0.5)) 70 | regression.append(nn.BatchNorm1d(64)) 71 | regression.append(Swish()) 72 | regression.append(nn.Linear(64, class_num)) 73 | self.regression = nn.Sequential(*regression) 74 | 75 | 76 | def forward(self, x): 77 | # 前向传播 78 | feat_zc = x[:, :, 0, :].unsqueeze(-2) 79 | feat = x[:, :, 1:, :] 80 | encoder1 = self.encoder1(feat) 81 | x2 = torch.cat([encoder1, feat_zc], 1) 82 | encoder2 = self.encoder2(x2) 83 | encoder2 = torch.flatten(encoder2, 1) 84 | 85 | bs = self.regression(encoder2) 86 | 87 | return bs 88 | 89 | 90 | if __name__ == "__main__": 91 | print('##############PyTorch################') 92 | net = MouthNet(class_num=16) 93 | x = torch.randn((8, 1, 33, 13)) 94 | y = net(x) -------------------------------------------------------------------------------- /third_part/LPC.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/third_part/LPC.dll -------------------------------------------------------------------------------- /third_part/__pycache__/moCapFace.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/third_part/__pycache__/moCapFace.cpython-39.pyc -------------------------------------------------------------------------------- /third_part/moCapFace.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | 这是mocapface中抠出的tflite模型 3 | """ 4 | """ 5 | 头部姿态估计模型 6 | """ 7 | import numpy as np 8 | import cv2 9 | import json 10 | import tensorflow as tf 11 | class MoCapFace(object): 12 | def __init__(self): 13 | """ 14 | 使用.tflite来初始化tflite的模型 15 | """ 16 | self.tfliteInit('model_weights/2001161359.tflite') 17 | 18 | def tfliteInit(self, tflite_file): 19 | # Initialize the interpreter 20 | # 初始化解释器 21 | self.interpreter = tf.lite.Interpreter(model_path=tflite_file) 22 | # 为tensor分配显存 23 | self.interpreter.allocate_tensors() 24 | 25 | # 得到输入的place_hoder 26 | self.input_details = self.interpreter.get_input_details()[0] 27 | # 获得输出的hooker 28 | self.blendershapes_output_details = self.interpreter.get_output_details()[0] # bs系数 29 | # self.transforms_output_details = self.interpreter.get_output_details()[1] # 头部朝向 30 | 31 | def forword(self, img): 32 | """ 33 | img: numpy bgr 34 | :param img: 35 | :return: 36 | """ 37 | test_image = cv2.resize(img, [256, 256]) 38 | test_image = np.expand_dims(test_image, axis=0).astype(self.input_details["dtype"]) / 255. 39 | self.interpreter.set_tensor(self.input_details["index"], test_image) 40 | self.interpreter.invoke() 41 | res = self.interpreter.get_tensor(self.blendershapes_output_details["index"])[0].tolist() 42 | return res 43 | 44 | if __name__ == "__main__": 45 | mocapface = MoCapFace() 46 | with open("all_image_path.json", 'r') as im_f: 47 | images_f = json.load(im_f) 48 | 49 | for img_path in images_f.values(): 50 | img = cv2.imread(img_path) 51 | res = mocapface.forword(img) 52 | print(len(res)) 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /train/coach_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 这个脚本是用来训练面部轮廓的 3 | """ 4 | import sys 5 | sys.path.append(".") 6 | sys.path.append("..") 7 | from models.mouth_net import MouthNet 8 | from torch.utils.tensorboard import SummaryWriter 9 | from datasets.dataset import * 10 | from configs.config_v1 import config as cfg 11 | from torch.utils.data import DataLoader 12 | import os 13 | import torch.optim as optim 14 | import torch.nn as nn 15 | os.environ['CUDA_VISIBLE_DEVICES'] = cfg['gpu_ids'] 16 | 17 | class Coach: 18 | def __init__(self): 19 | self.global_test_loss = float('Inf') 20 | 21 | # 得到配置文件 22 | self.cfg = cfg 23 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 24 | 25 | # 创建主要的网络 26 | self.net = MouthNet(class_num=cfg['class_num']).to(self.device) 27 | self.net.train() 28 | 29 | if cfg['ckpt'] != "": 30 | ckpt = torch.load(cfg['ckpt']) 31 | # 使用不严格的weight加载方式, 并且舍弃shape mismatch的 32 | pretrain_state_dict = ckpt 33 | net_state_dict = self.net.state_dict() 34 | for key in net_state_dict: 35 | if key in pretrain_state_dict.keys(): 36 | if net_state_dict[key].shape != pretrain_state_dict[key].shape: 37 | pretrain_state_dict.pop(key) 38 | self.net.load_state_dict(pretrain_state_dict, strict=False) 39 | 40 | # 使用多卡训练 41 | if torch.cuda.device_count() > 1: 42 | print("Let's use ", torch.cuda.device_count(), "GPUs.") 43 | self.net = nn.DataParallel(self.net) 44 | 45 | # 创建训练日志 46 | if not os.path.exists("experiment/logs"): 47 | os.makedirs("experiment/logs") 48 | self.logger = SummaryWriter(log_dir='./experiment/logs') 49 | 50 | # 创建数据集 51 | trainsets = AudioDataset(cfg['train_target_root'], cfg['train_data_root']) 52 | 
self.trainloader = torch.utils.data.DataLoader(trainsets, batch_size=self.cfg['train_batch_size'], shuffle=True, 53 | num_workers=cfg['num_workers']) 54 | 55 | testsets = AudioDataset(cfg['val_target_root'], cfg['val_data_root']) 56 | self.testloader = torch.utils.data.DataLoader(testsets, batch_size=self.cfg['train_batch_size'], shuffle=True, 57 | num_workers=cfg['num_workers']) 58 | 59 | # 创建优化器(记得加上正则化的参数) 60 | self.optimizer = optim.Adam(self.net.parameters(), lr=self.cfg['lr'], betas=(0.9, 0.999), eps=1e-08, 61 | weight_decay=0.0001) 62 | 63 | # 创建损失函数 64 | self.MSELoss = torch.nn.MSELoss(reduction='sum') 65 | 66 | def MAELoss(self, pred, target): 67 | """ 68 | 2022.05.17增加了一个MAELoss, 这个loss对异常值比较敏感 69 | """ 70 | loss = torch.sum(torch.abs(pred - target), dim=-1) 71 | loss = torch.mean(loss) 72 | return loss 73 | 74 | def criterion(self, pred, target): 75 | loss = self.MSELoss(pred, target) 76 | return loss 77 | 78 | def update_optimizer_lr(self, optimizer, lr): 79 | """ 80 | 为了动态更新learning rate, 加快训练速度 81 | :param optimizer: torch.optim type 82 | :param lr: learning rate 83 | :return: 84 | """ 85 | for group in optimizer.param_groups: 86 | group['lr'] = lr 87 | 88 | def train(self): 89 | iter_num = 0 90 | mean_loss = 0 91 | for i in range(self.cfg['epoch']): 92 | for idx, (datas, targets) in enumerate(self.trainloader): 93 | iter_num += 1 94 | 95 | datas, targets = datas.to(self.device), targets.to(self.device) 96 | self.optimizer.zero_grad() 97 | outputs = self.net(datas) 98 | 99 | # 计算损失 100 | loss = self.criterion(outputs, targets) 101 | mean_loss += loss.item() 102 | loss.backward() 103 | self.optimizer.step() 104 | # 打印loss 105 | if iter_num % self.cfg['print_loss'] == 0: 106 | mean_loss = mean_loss / self.cfg['print_loss'] 107 | # mean_loss = np.array(mean_loss.detach().cpu()) 108 | print("lr = {} total iteration {} epoch {}, iteration {}, loss = {}".format(str(round(self.optimizer.param_groups[0]['lr'], 6)), 109 | str(iter_num), str(i), str(idx), 110 | str(round(mean_loss, 6)))) 111 | self.logger.add_scalar('{}/{}'.format('train', 'loss'), mean_loss, int(iter_num)) 112 | mean_loss = 0 113 | # test 114 | if iter_num % self.cfg['val_interval'] == 0: 115 | self.net.eval() 116 | self.eval(i, idx) 117 | self.net.train() 118 | 119 | # lr decay 120 | # 2022.05.17调整lr下降的幅度, 之前是0.01, 现在是0.9 or 0.5 121 | # (可能是因为lr太大导致后期的训练波动, 使得eval loss比train loss大) 122 | if (iter_num - self.cfg['warmup_steps']) % self.cfg['lr_update_interval'] == 0: 123 | lr = self.optimizer.param_groups[0]['lr'] * 0.9 124 | self.update_optimizer_lr(self.optimizer, lr) 125 | 126 | elif iter_num < self.cfg['warmup_steps']: 127 | lr = self.optimizer.param_groups[0]['lr'] * (iter_num / self.cfg['warmup_steps']) 128 | self.update_optimizer_lr(self.optimizer, lr) 129 | 130 | def eval(self, epoch, iteration): 131 | 132 | test_loss = 0 133 | test_num = 0 134 | for idx, (datas, targets) in enumerate(self.testloader): 135 | test_num += 1 136 | 137 | datas, targets = datas.to(self.device), targets.to(self.device) 138 | 139 | with torch.no_grad(): 140 | outputs = self.net(datas) 141 | 142 | # 计算损失 143 | loss = self.criterion(outputs, targets) 144 | test_loss += loss.item() 145 | 146 | if test_num > 20: 147 | break 148 | 149 | test_loss = test_loss / test_num 150 | if test_loss < self.global_test_loss: 151 | self.global_test_loss = test_loss 152 | 153 | if not os.path.exists("experiment/checkpoints"): 154 | os.makedirs("experiment/checkpoints") 155 | 156 | torch.save(self.net.state_dict(), 157 | 
os.path.join("experiment/checkpoints", 'best_model_loss_{}.pth'.format(str(round(test_loss, 2))))) 158 | 159 | self.logger.add_scalar('{}/{}'.format('test', 'loss'), test_loss, epoch) 160 | print("lr = {} epoch {}, iteration {}, eval loss = {}".format(str(round(self.optimizer.param_groups[0]['lr'], 6)), str(epoch), str(iteration), str(round(test_loss, 6)))) 161 | 162 | if __name__ == '__main__': 163 | coach = Coach() 164 | coach.train() --------------------------------------------------------------------------------
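
Usage note (not part of the repository): as a rough sketch of querying a trained model, the snippet below builds `MouthNet` with `class_num=16`, loads a checkpoint produced by `train/coach_v1.py`, and runs one dummy feature frame through it. The checkpoint path is only an example, PyTorch must be installed, and the script assumes it is run from the repository root; the 16 outputs presumably line up with `SELECT_VALID_BS_NAME` in `data_generate/generate_datasets_v1.py`.

```python
# inference_sketch.py -- illustrative only; the checkpoint path is a placeholder.
import torch

from models.mouth_net import MouthNet

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# class_num=16 follows configs/config_v1.py.
net = MouthNet(class_num=16).to(device)

state = torch.load("experiment/checkpoints/best_model_loss_2.51.pth", map_location=device)
# Strip the "module." prefix in case the weights came from a DataParallel run.
state = {k.replace("module.", "", 1): v for k, v in state.items()}
net.load_state_dict(state)
net.eval()

# One frame of audio features, shaped like the dummy input in models/mouth_net.py:
# (batch, channel, 33, 13); row 0 carries the zero-crossing feature, rows 1-32 the rest.
features = torch.randn(1, 1, 33, 13, device=device)
with torch.no_grad():
    bs = net(features)  # (1, 16) mouth blendshape coefficients
print(bs.shape)
```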
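Similarly, a minimal sketch of running the energy-based VAD on a single clip; `some_clip.wav` is a placeholder, and the `sys.path` line assumes the repository root is the working directory.

```python
# vad_demo.py -- illustrative only; "some_clip.wav" is a placeholder path.
import sys
sys.path.append("data_generate/generate_datasets_v2/features")

from vad import VoiceActivityDetector

detector = VoiceActivityDetector("some_clip.wav")   # 20 ms windows, 10 ms hop by default
windows = detector.detect_speech()                  # (N, 2): [window start sample, speech flag]
segments = detector.convert_windows_to_readible_labels(windows)
for seg in segments:
    print(seg["speech_begin"], seg["speech_end"])
```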