├── .gitignore ├── README.md ├── configs ├── __pycache__ │ └── config_v1.cpython-39.pyc └── config_v1.py ├── data_generate ├── generate_datasets_v1.py └── generate_datasets_v2 │ ├── audio_preprocess.py │ ├── chinese_public_dataset_preprocess.py │ ├── data_vad.py │ ├── features │ ├── LPC.dll │ ├── __pycache__ │ │ ├── features.cpython-39.pyc │ │ ├── util.cpython-39.pyc │ │ └── vad.cpython-39.pyc │ ├── doc │ │ └── bsname.txt │ ├── features.py │ ├── util.py │ └── vad.py │ └── mocap4face │ ├── 2001161359.json │ ├── 2001161359.tflite │ ├── __pycache__ │ └── mocap4face.cpython-39.pyc │ └── mocap4face.py ├── datasets ├── __pycache__ │ └── dataset.cpython-39.pyc └── dataset.py ├── model_weights └── 2001161359.tflite ├── models ├── __pycache__ │ └── mouth_net.cpython-39.pyc └── mouth_net.py ├── third_part ├── LPC.dll ├── __pycache__ │ └── moCapFace.cpython-39.pyc └── moCapFace.py └── train └── coach_v1.py /.gitignore: -------------------------------------------------------------------------------- 1 | experiment/checkpoints/* 2 | experiment/logs/* 3 | Av629249051-P1.mp4_audio.npy 4 | Av629249051-P1.mp4_bs_targets.npy 5 | assets/* 6 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 数据制作 2 | ``` 3 | cd data_generate/generate_datasets_v2 4 | ``` 5 | #### step1 6 | ``` 7 | python chinese_public_dataset_preprocess.py 8 | ``` 9 | #### step2 10 | ``` 11 | python data_vad.py 12 | ``` 13 | #### step3 14 | ``` 15 | python audio_preprocess.py 16 | ``` 17 | 18 | 19 | #### 注意 20 | ``` 21 | 1. 更换脚本中的文件路径 22 | 2. 最后的gt被存放在clean_gt_base中 23 | 3. 处理后的音频数据放在processed_datasets中 24 | ``` 25 | 26 | 27 | 28 | ## 训练 29 | ``` 30 | 先在configs/config_v1.py中进行训练的配置 31 | python train/coach_v1.py 32 | ``` -------------------------------------------------------------------------------- /configs/__pycache__/config_v1.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/configs/__pycache__/config_v1.cpython-39.pyc -------------------------------------------------------------------------------- /configs/config_v1.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'gpu_ids': "0", # 使用的GPU序号 3 | 'lr': 0.00001, # 0.0005, 0.01 4 | 'class_num': 16, 5 | 'ckpt': "experiment/checkpoints/best_model_loss_2.51.pth", 6 | 'lr_update_interval': 30000, # 学习率更新频率 每次*0.99 7 | 'warmup_steps': 0, # 前200个iteration使用warmup, 之后使用正常的学习率 8 | 'watch_interval': 1000, # log打印 9 | 'print_loss': 1000, # loss打印的频率 10 | 'val_interval': 100000, # 验证轮次 11 | 'save_interval': 100000, # 保存模型轮次 12 | 'epoch': 10000, # 13 | 'exp_dir': 'experiment/exp_1', 14 | 'train_batch_size': 64, # 64 15 | 'num_workers': 8, 16 | 'train_target_root': "E:/datasets/audio2face/train_gt", 17 | 'train_data_root': "E:/datasets/audio2face/train_data", 18 | 'val_target_root': "E:/datasets/audio2face/val_gt", 19 | 'val_data_root': "E:/datasets/audio2face/val_data", 20 | } -------------------------------------------------------------------------------- /data_generate/generate_datasets_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | File : build_dataset 3 | Time : 2022/8/2 11:09 4 | Author : Lu Zeng 5 | 6 | 7 | 这个脚本用来对齐音频和视频(图像) 8 | 如果检测到了人脸就将主体人脸crop下来,并使用mocapface进行标注(只要40个bs的系数, 不需要头部转向的参数) 9 | 并保存该帧对应的音频 10 | 
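In short: for each readable video frame the script crops the largest detected face,
labels it with MoCapFace (mouth-related blendshape coefficients only, no head-pose values),
and pairs it with the audio window covering that frame. As written below it saves two
aligned arrays per video, roughly:
    {video_name}_audio.npy       -> (N, 32, 64, 1) LPC feature images
    {video_name}_bs_targets.npy  -> (N, 11) coefficients for the blendshapes in need_ids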
""" 11 | import numpy as np 12 | import cv2 13 | from moviepy.editor import * 14 | import matplotlib.pyplot as plt 15 | import scipy.io.wavfile as wavfile 16 | import ffmpeg 17 | from ctypes import * 18 | import mediapipe as mp 19 | import sys 20 | sys.path.append("..") 21 | sys.path.append(".") 22 | from third_part.moCapFace import MoCapFace 23 | 24 | dll = cdll.LoadLibrary(os.path.join('third_part', 'LPC.dll')) 25 | 26 | 27 | def get_source_info_ffmpeg(source_name): 28 | return_value = 0 29 | try: 30 | info = ffmpeg.probe(source_name) 31 | format_name = info['format']['format_name'] 32 | 33 | video_info = next(c for c in info['streams'] if c['codec_type'] == 'video') 34 | audio_info = next(c for c in info['streams'] if c['codec_type'] == 'audio') 35 | codec_name = audio_info['codec_name'] 36 | duration_ts = float(audio_info['duration_ts']) 37 | fps = audio_info['r_frame_rate'] 38 | 39 | print("format_name:{} \ncodec_name:{} \nduration_ts:{} \nfps:{}".format(format_name, codec_name, duration_ts, fps)) 40 | 41 | codec_name = video_info['codec_name'] 42 | duration_ts = float(video_info['duration_ts']) 43 | fps = video_info['r_frame_rate'] 44 | width = video_info['width'] 45 | height = video_info['height'] 46 | num_frames = video_info['nb_frames'] 47 | print("format_name:{} \ncodec_name:{} \nduration_ts:{} \nwidth:{} \nheight:{} \nfps:{} \nnum_frames:{}".format(format_name, 48 | codec_name, 49 | duration_ts, 50 | width, height, 51 | fps, num_frames)) 52 | except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e: 53 | print("init_source:{} error. {}\n".format(source_name, str(e))) 54 | 55 | return return_value, 0, 0 56 | return return_value, fps, num_frames 57 | 58 | 59 | def vis_audio(rate, signal): 60 | print(signal.shape) 61 | print(f"number of channels = {signal.shape[1]}") 62 | length = signal.shape[0] / rate 63 | print(f"length = {length}s") 64 | time = np.linspace(0., length, signal.shape[0]) 65 | plt.plot(time, signal[:, 0], label="Left channel") 66 | plt.plot(time, signal[:, 1], label="Right channel") 67 | plt.legend() 68 | plt.xlabel("Time [s]") 69 | plt.ylabel("Amplitude") 70 | plt.show() 71 | 72 | 73 | # 画出人脸框和关键点 74 | def draw_face(img, bbox, expand_ratio=0.5): 75 | w = bbox[2] - bbox[0] 76 | h = bbox[3] - bbox[1] 77 | corpbbox = [max(0, int(bbox[0] - expand_ratio * w)), 78 | max(0, int(bbox[1] - expand_ratio * h)), 79 | min(img.shape[1] - 1, int(bbox[2] + expand_ratio * w)), 80 | min(img.shape[0] - 1, int(bbox[3] + 0.1 * expand_ratio * h)) 81 | ] 82 | crop = img[corpbbox[1]: corpbbox[3], corpbbox[0]:corpbbox[2], :] 83 | return crop 84 | 85 | 86 | def get_square_image(face, type="center"): 87 | """ 88 | face不是bbox, 而是基于bbox在img上crop出来的人脸图像 89 | 基于短边, 缩短长边 90 | """ 91 | face_h, face_w = face.shape[:2] 92 | if type == "center": 93 | if face_h > face_w: 94 | pad = (face_h - face_w) // 2 95 | if pad != 0: 96 | face = face[pad:-pad, :, :] 97 | elif face_h < face_w: 98 | pad = (face_w - face_h) // 2 99 | if pad != 0: 100 | face = face[:, pad:-pad, :] 101 | 102 | elif type == "upper": 103 | if face_h > face_w: 104 | pad = (face_h - face_w) // 2 105 | if pad != 0: 106 | face = face[:-2 * pad, :, :] 107 | elif face_h < face_w: 108 | # 在水平方向的crop方式照常 109 | pad = (face_w - face_h) // 2 110 | if pad != 0: 111 | face = face[:, pad:-pad, :] 112 | return face 113 | 114 | 115 | def read_frame_as_jpeg(ffmpeg_video, frame_num): 116 | """ 117 | ffmpeg_video: 是已经加载完成的视频数据 118 | 指定帧数读取任意帧 119 | """ 120 | out, err = ( 121 | ffmpeg_video.filter('select', 
'gte(n,{})'.format(frame_num)).output('pipe:', vframes=1, format='image2', vcodec='mjpeg').run(capture_stdout=True) 122 | ) 123 | # 将bytes转成nunpy的格式 124 | try: 125 | image_np = bytes_to_numpy(out) 126 | return image_np 127 | except: 128 | return int(-1) 129 | 130 | 131 | def bytes_to_numpy(image_bytes): 132 | image_np = np.frombuffer(image_bytes, dtype=np.uint8) 133 | image_np = cv2.imdecode(image_np, cv2.IMREAD_COLOR) 134 | return image_np 135 | 136 | if __name__ == "__main__": 137 | bs2id = { 138 | 21: 'jawopen', 139 | 23: 'mouthpucker', 140 | 19: 'mouthfunnel', 141 | 12: 'mouthsmileleft', 142 | 29: 'mouthsmileright', 143 | 14: 'mouthfrownleft', 144 | 27: 'mouthfrownright', 145 | 20: 'mouthrolllower', 146 | 24: 'mouthrollupper', 147 | 11: 'mouthupperupleft', 148 | 30: 'mouthupperupright', 149 | } 150 | need_ids = [21, 23, 19, 12, 29, 14, 27, 20, 24, 11, 30] 151 | 152 | 153 | 154 | # video_path = "assets/baijiajiangtan.mp4" 155 | 156 | video_path = r"E:/datasets/audio2face/cctv_short_video_bilibili/Av629249051-P1.mp4" 157 | 158 | flag_name = os.path.basename(video_path) 159 | 160 | # image_save_root = "crop_face_images" 161 | # bs_targets_root = "bs_targets" 162 | 163 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<进行视频的处理<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 164 | mp_face_detection = mp.solutions.face_detection 165 | face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5) 166 | 167 | mocapfacenet = MoCapFace() 168 | 169 | flag, fps, num_frames = get_source_info_ffmpeg(video_path) 170 | fps = int(fps.split("/")[0]) # 视频的fps 171 | num_frames = int(num_frames) # 视频的总帧数 172 | 173 | frames = [] 174 | bs_targets = [] 175 | ffmpeg_video = ffmpeg.input(video_path) 176 | # bs_target_txt = open(os.path.join(bs_targets_root, os.path.basename(video_path).split(".")[0] + ".txt"), "w") 177 | for i in range(num_frames): 178 | frame = read_frame_as_jpeg(ffmpeg_video, i) 179 | 180 | if isinstance(frame, int): 181 | # 当前视频帧损坏的情况 182 | # frames.append(frame) 183 | bs_targets.append(frame) 184 | # bs_target_txt.write(str(frame) + "\n") 185 | else: 186 | results = face_detection.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 187 | if results.detections: 188 | # 只有当有检测到东西的时候才进行下面的操作 189 | h, w = frame.shape[:2] 190 | area = 0 191 | x1y1x2y2 = [0, 0, 0, 0] 192 | for detection in results.detections: 193 | bbox = detection.location_data.relative_bounding_box 194 | x1 = bbox.xmin 195 | y1 = bbox.ymin 196 | width = bbox.width 197 | height = bbox.height 198 | this_area = width * height 199 | if this_area > area: 200 | area = this_area 201 | x1y1x2y2 = [int(x1 * w), int(y1 * h), int((x1 + width) * w), int((y1 + height) * h)] 202 | 203 | crop_face = draw_face(frame, x1y1x2y2) 204 | crop_face = get_square_image(crop_face) 205 | bs_target = mocapfacenet.forword(crop_face) 206 | bs_targets.append(bs_target) 207 | bs_target = list(map(str, bs_target)) 208 | # bs_target_txt.write(" ".join(bs_target) + "\n") 209 | 210 | # 没有检测出人脸的情况 211 | # frames.append(frame) 212 | bs_targets.append(int(0)) 213 | # bs_target_txt.write(str(0) + "\n") 214 | 215 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<进行音频的处理<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 216 | video = VideoFileClip(video_path) 217 | audio = video.audio 218 | audio.write_audiofile("assets/tmp.wav") 219 | rate, signal = wavfile.read("assets/tmp.wav") # rate:采样率 220 | # rate: 是采样率 221 | # signal: 是音频信号 222 | 223 | # vis_audio(rate, signal) 224 | if signal.shape[-1] == 2: 225 | signal = np.mean(signal, axis=-1) 226 | 227 | 
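    # Descriptive note on the audio/frame alignment below:
    #   - frames_step = 1000 / fps ms is the duration of one video frame;
    #   - the signal is zero-padded by chunks_length ms on both sides, and frame i gets the
    #     2 * chunks_length ms window starting at i * frames_step ms of the padded signal;
    #   - each window is then split into 64 overlapping sub-frames (8 ms hop, 16 ms length),
    #     Hanning-windowed, and reduced to 32 LPC coefficients per sub-frame, giving one
    #     (32, 64) feature "image" per video frame.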
frames_per_second = fps # 视频fps(自己的数据集要设置好视频的fps才能和音频一一对应) 228 | chunks_length = 48 # 260音频分割,520ms 前260ms 后260ms 这个是可以自己设置的 229 | 230 | # 每signal个采样一个信号, 这个信号会对应着30帧视频 231 | audio_frameNum = int(len(signal) / rate * frames_per_second) # 计算音频对应的视频帧数(一般这个就等于视频帧数) 232 | 233 | # 前后各添加260ms音频 234 | a = np.zeros(chunks_length * rate // 1000, dtype=np.int16) 235 | 236 | signal = np.hstack((a, signal, a)) 237 | 238 | # signal = signal / (2.**15) 239 | frames_step = 1000.0 / frames_per_second # 视频每帧的时长间隔33.3333ms 240 | rate_kHz = int(rate / 1000) # 采样率:48kHz 241 | 242 | # 开始进行音频的分割 243 | # audio_frames = [signal[int(i * frames_step * rate_kHz): int((i * frames_step + chunks_length * 2) * rate_kHz)] for i 244 | # in range(audio_frameNum)] 245 | audio_frames = [signal[int(i * frames_step * rate / 1000): int((i * frames_step + chunks_length * 2) * rate / 1000)] for i 246 | in range(audio_frameNum)] 247 | 248 | inputData_array = np.zeros(shape=(1, 32, 64)) # 创建一个空3D数组,该数组(1*32*64)最后需要删除 249 | 250 | for i in range(len(audio_frames)): 251 | audio_frame = audio_frames[i] # 每段音频,8320个采样点 252 | 253 | overlap_frames_apart = 0.008 254 | overlap = int(rate * overlap_frames_apart) # 128 samples 255 | frameSize = int(rate * overlap_frames_apart * 2) # 256 samples 256 | numberOfFrames = 64 257 | 258 | frames = np.ndarray( 259 | (numberOfFrames, frameSize)) # initiate a 2D array with numberOfFrames rows and frame size columns 260 | for k in range(0, numberOfFrames): 261 | for i in range(0, frameSize): 262 | if ((k * overlap + i) < len(audio_frame)): 263 | frames[k][i] = audio_frame[k * overlap + i] 264 | else: 265 | frames[k][i] = 0 266 | 267 | frames *= np.hanning(frameSize) 268 | frames_lpc_features = [] 269 | b = (c_double * 32)() 270 | 271 | for k in range(0, numberOfFrames): 272 | a = (c_double * frameSize)(*frames[k]) 273 | dll.LPC(pointer(a), frameSize, 32, pointer(b)) 274 | frames_lpc_features.append(list(b)) 275 | 276 | image_temp1 = np.array(frames_lpc_features) # list2array 277 | image_temp2 = image_temp1.transpose() # array转置 278 | image_temp3 = np.expand_dims(image_temp2, axis=0) # 升维 279 | inputData_array = np.concatenate((inputData_array, image_temp3), axis=0) # array拼接 280 | 281 | # 删除第一行 282 | inputData_array = inputData_array[1:] 283 | 284 | # #扩展为4维:(-1, 32, 64, 1) 285 | inputData_array = np.expand_dims(inputData_array, axis=3) 286 | # print(inputData_array.shape) 287 | # 视频的长度是13831, 基本一致 288 | # (13832, 32, 64, 1) 289 | 290 | # 这里是为了使得视频帧和处理之后的音频长度对齐 291 | max_l = min(inputData_array.shape[0], num_frames) 292 | selected_audio = [] 293 | selected_bs_targets = [] 294 | for index, this_audio in enumerate(inputData_array[:max_l]): 295 | if not isinstance(bs_targets[index], int): 296 | selected_audio.append(this_audio[np.newaxis, :, :, :]) 297 | selected_bs_targets.append(np.array(bs_targets[index])[np.newaxis, :]) 298 | selected_audio = np.concatenate(selected_audio, axis=0) 299 | selected_bs_targets = np.concatenate(selected_bs_targets, axis=0) 300 | 301 | selected_bs_targets = selected_bs_targets[:, need_ids] 302 | 303 | print(selected_audio.shape) 304 | print(selected_bs_targets.shape) 305 | 306 | # 去除共有的前min_len个元素 307 | min_len = min(selected_audio.shape[0], selected_bs_targets.shape[0]) 308 | 309 | np.save("{}_audio.npy".format(flag_name), selected_audio[:min_len]) 310 | np.save("{}_bs_targets.npy".format(flag_name), selected_bs_targets[:min_len]) -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/audio_preprocess.py: 
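The two arrays saved at the end of generate_datasets_v1.py above are index-aligned: row i of the audio features matches row i of the blendshape targets. Below is a minimal loading sketch, assuming PyTorch; the file names follow the `{video}_audio.npy` / `{video}_bs_targets.npy` pattern produced by the script (the same names listed in .gitignore), and this is only an illustration, not the repository's own datasets/dataset.py.
```
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset


class AudioBsPairs(Dataset):
    """Pairs one (32, 64, 1) LPC feature image with one 11-dim mouth blendshape target."""

    def __init__(self, audio_npy, target_npy):
        self.audio = np.load(audio_npy).astype(np.float32)      # (N, 32, 64, 1)
        self.targets = np.load(target_npy).astype(np.float32)   # (N, 11)
        assert len(self.audio) == len(self.targets)

    def __len__(self):
        return len(self.audio)

    def __getitem__(self, i):
        x = torch.from_numpy(self.audio[i]).permute(2, 0, 1)    # channel-first: (1, 32, 64)
        y = torch.from_numpy(self.targets[i])
        return x, y


# file names follow the pattern saved above (also listed in .gitignore)
loader = DataLoader(
    AudioBsPairs("Av629249051-P1.mp4_audio.npy", "Av629249051-P1.mp4_bs_targets.npy"),
    batch_size=64, shuffle=True,
)
for audio_batch, bs_batch in loader:
    print(audio_batch.shape, bs_batch.shape)   # e.g. torch.Size([64, 1, 32, 64]) torch.Size([64, 11])
    break
```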
-------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import math 6 | import librosa 7 | import numpy as np 8 | import pandas as pd 9 | sys.path.append(".") 10 | sys.path.append("..") 11 | import features.util as util 12 | from features import features 13 | 14 | 15 | def wav_signal_to_feat(signal, 16 | ind_path, 17 | num_frames, 18 | fps, 19 | sample_rate, 20 | win_length, 21 | hop_length, 22 | half_chunks_length, 23 | feat_func, 24 | ): 25 | """ 26 | 音频转成输入特征,特征转换函数作为参数传入 27 | 28 | 音频切分成每个样本 -> 每个样本转特征 -> concat -> 返回结果 29 | ↘ 每个样本的过零率 ↗ 30 | 31 | :param signal: 音频时域信号,直接从wav中读取出来的 32 | :param ind_path: 图像帧的时域位置索引文件 33 | :param num_frames: 图像帧总数量 34 | :param fps: 35 | :param sample_rate: 音频采样率 36 | :param win_length: 音频分帧窗口大小 37 | :param hop_length: 音频分帧的帧移 38 | :param half_chunks_length: 单个训练样本的时长的一半,单位:ms 39 | :param feat_func: 特征提取函数,主要是lambda表达式 40 | 41 | :return: numpy.Array 42 | """ 43 | 44 | ### =================== 这个和我的数据处理的唯一区别是这里的rate是设置死的 160000 ==================== 45 | 46 | signal_length = len(signal) 47 | chunks_signal_samples = half_chunks_length * sample_rate // 1000 48 | # 前后各添加 空白 音频 49 | a = np.zeros(chunks_signal_samples, dtype=np.int16) 50 | signal = np.hstack((a, signal, a)) 51 | 52 | if ind_path is not None: 53 | img_frame_ind = np.load(ind_path) 54 | audio_frames = [signal[ind: ind + chunks_signal_samples * 2] for ind in img_frame_ind] 55 | else: 56 | frames_step = 1000.0 / fps # 视频每帧的时长间隔 57 | rate_kHz = sample_rate // 1000 # 1ms的采样数 58 | 59 | # 帧数不一致,无法对齐,丢弃数据,返回None 60 | if math.fabs(int(signal_length / (frames_step * rate_kHz)) - num_frames) > 2: 61 | print("calculate num frames: {}, actual num frames{}".format( 62 | signal_length / (frames_step * rate_kHz), num_frames)) 63 | print("different frames count, skip data.") 64 | return None 65 | 66 | # 按图像帧的位置,切分每个图像对应的输入音频样本 67 | audio_frames = [ 68 | signal[round(i * frames_step * rate_kHz): round((i * frames_step * rate_kHz) + 2 * chunks_signal_samples)] 69 | if round((i * frames_step * rate_kHz) + 2 * chunks_signal_samples) < len(signal) 70 | else signal[-int(2 * chunks_signal_samples):] 71 | for i in range(num_frames) 72 | ] 73 | 74 | audio_frames = np.array(audio_frames, dtype=np.float32) 75 | 76 | # 音频特征 77 | feat = feat_func(audio_frames) 78 | 79 | # 过零率特征 80 | zc_feat = features.zero_crossing_feat(audio_frames, win_length, hop_length) 81 | 82 | # 这里是包括过零率的 83 | # 这里过零率和特征的长度是一样的, cat在feat的前面 84 | feat = np.concatenate([zc_feat[:, np.newaxis, :], feat], axis=1) 85 | return feat 86 | 87 | 88 | def preprocess(wav_path, 89 | ind_path=None, 90 | num_frames: int = None, 91 | fps: float = 30, 92 | sample_rate=16000, 93 | is_add_noise=True, 94 | add_thick_noise=True, 95 | add_env_noise=True, 96 | env_noise=None, 97 | win_length=256, 98 | hop_length=128, 99 | half_chunks_length=48, 100 | feat_func=features.fbank, 101 | ): 102 | """ 103 | 预处理 104 | 105 | :param wav_path: 音频文件路径 106 | :param ind_path: 图像帧的时域位置索引文件 107 | :param num_frames: 108 | :param fps: 109 | :param sample_rate: 110 | :param is_add_noise: 是否添加轻量级噪声 111 | :param add_thick_noise: 是否添加重量级噪声 112 | :param add_env_noise: 是否添加环境音噪声 113 | :param env_noise: 环境音噪声,时域信号 114 | :param win_length: 115 | :param hop_length: 116 | :param half_chunks_length: 117 | :param feat_func: 118 | :return: 119 | """ 120 | 121 | feat, feat_wgn_light, feat_wgn_thick, feat_env_noise = None, None, None, None 122 | 123 | # 读取文件 124 | signal, rate = librosa.load(wav_path, sr=sample_rate) 125 | 
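    # Rough shape bookkeeping for the wav_signal_to_feat call below, given the defaults
    # used in this file (sample_rate=16000, win_length=256, hop_length=128,
    # half_chunks_length=48, n_mels=32): each video frame gets a 2 * 48 ms = 96 ms window
    # (1536 samples); fbank turns that into roughly a (32, 13) mel-band x frame block,
    # zero_crossing_feat adds one more row, so the returned feature is about (num_frames, 33, 13).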
signal_length = len(signal) 126 | print("length: ", signal_length, "rate: ", rate) 127 | 128 | # 空文件,返回None 129 | if signal_length == 0: 130 | return None, None, None, None 131 | 132 | # 特征提取 133 | feat = wav_signal_to_feat( 134 | signal, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 135 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 136 | ) 137 | 138 | # 帧数不一致,返回None 139 | if feat is None: 140 | return None, None, None, None 141 | 142 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<下面这些都不用看了先, 本任务不会有噪声<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 143 | 144 | # 添加高斯白噪声,信噪比分别为12和6 145 | if is_add_noise: 146 | signal_wgn_light = util.add_noise(signal, 12.) 147 | feat_wgn_light = wav_signal_to_feat( 148 | signal_wgn_light, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 149 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 150 | ) 151 | if add_thick_noise: 152 | signal_wgn_thick = util.add_noise(signal, 6.) 153 | feat_wgn_thick = wav_signal_to_feat( 154 | signal_wgn_thick, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 155 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 156 | ) 157 | # 添加真实环境音噪声 158 | if add_env_noise: 159 | signal_env_noise = util.add_other_noise(signal, noise=env_noise) 160 | feat_env_noise = wav_signal_to_feat( 161 | signal_env_noise, ind_path, num_frames=num_frames, fps=fps, sample_rate=sample_rate, 162 | win_length=win_length, hop_length=hop_length, half_chunks_length=half_chunks_length, feat_func=feat_func, 163 | ) 164 | 165 | return feat, feat_wgn_light, feat_wgn_thick, feat_env_noise 166 | 167 | 168 | def selfmade_arkit_preprocess(wav_dir, profile_dir, gt_dir, save_dir, env_noise): 169 | # 文件映射字典 170 | data_files = {os.path.splitext(file)[0]: os.path.join(wav_dir, file) 171 | for file in os.listdir(wav_dir) if file.split(".")[-1] == "wav"} 172 | profiles_dict = {os.path.splitext(file)[0]: os.path.join(profile_dir, file) for file in os.listdir(profile_dir)} 173 | gt_dict = {os.path.splitext(file)[0]: os.path.join(gt_dir, file) for file in os.listdir(gt_dir)} 174 | 175 | for file_id, path in data_files.items(): 176 | if os.path.exists(os.path.join(save_dir, file_id + ".npy")): 177 | print("Exist file {}".format(os.path.join(save_dir, file_id + ".npy"))) 178 | continue 179 | 180 | if file_id not in profiles_dict or file_id not in gt_dict: 181 | continue 182 | 183 | profile = util.load_json_file(profiles_dict[file_id]) 184 | gt_label = np.load(gt_dict[file_id]) 185 | 186 | num_frames = gt_label.shape[0] 187 | # num_frames = int(profile["num_frames"]) 188 | fps = profile["fps"] 189 | 190 | print("Processing {:<15s}, path: {}...".format(file_id, path)) 191 | lpc_feat, lpc_feat_wgn_s, _, feat_env_noise = preprocess( 192 | path, ind_path=None, num_frames=num_frames, fps=fps, 193 | sample_rate=SAMPLE_RATE, add_thick_noise=False, env_noise=env_noise, 194 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 195 | ) 196 | if lpc_feat is not None: 197 | np.save(save_dir + "/" + file_id + ".npy", lpc_feat) 198 | np.save(save_dir + "/" + file_id + ".wgn_s" + ".npy", lpc_feat_wgn_s) 199 | np.save(save_dir + "/" + file_id + ".env_n" + ".npy", feat_env_noise) 200 | 201 | 202 | def facegood_preprocess(wav_dir, label_dir, save_dir, env_noise): 203 | for file in os.listdir(wav_dir): 204 | file_id = file.split(".")[0] 205 | 
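        # Like selfmade_arkit_preprocess above, this writes up to three aligned feature files
        # per clip: "<id>.npy" (clean), "<id>.wgn_s.npy" (white noise at 12 dB SNR) and
        # "<id>.env_n.npy" (real environmental noise), so clean and noise-augmented versions
        # of the same audio can be mixed at training time.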
abs_path = os.path.join(wav_dir, file) 206 | print("Processing {:<15s}, path: {}...".format(file, abs_path)) 207 | 208 | # label数量 == 视频总帧数 209 | label_file = os.path.join(label_dir, "bs_value_{}.npy".format(os.path.splitext(file)[0])) 210 | num_frames = np.load(label_file).shape[0] 211 | 212 | feat, feat_wgn_small, feat_wgn_large, feat_env_noise = preprocess( 213 | abs_path, num_frames=num_frames, fps=30, sample_rate=SAMPLE_RATE, add_thick_noise=False, 214 | env_noise=env_noise, 215 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 216 | ) 217 | if feat is not None: 218 | np.save(os.path.join(save_dir, file_id + ".npy"), feat) 219 | np.save(os.path.join(save_dir, file_id + ".wgn_s" + ".npy"), feat_wgn_small) 220 | # np.save(os.path.join(save_dir, file_id + ".wgn_l" + ".npy"), feat_wgn_large) 221 | np.save(os.path.join(save_dir, file_id + ".env_n" + ".npy"), feat_env_noise) 222 | 223 | 224 | def aiwin_preprocess(aiwin_dir, save_dir, env_noise, is_eval=False): 225 | # 文件映射字典 226 | data_files = {file.split(".")[0].lower(): os.path.join(aiwin_dir, file) 227 | for file in os.listdir(aiwin_dir) if file.split(".")[-1] == "wav"} 228 | gt_files = {file.split(".")[0].lower().replace("_anim", ""): os.path.join(aiwin_dir, file) 229 | for file in os.listdir(aiwin_dir) if file.split(".")[-1] == "csv"} 230 | 231 | for file_id, path in data_files.items(): 232 | print("Processing {:<15s}, path: {}...".format(file_id, path)) 233 | 234 | if file_id not in gt_files: 235 | continue 236 | 237 | # label数量 == 视频总帧数 238 | label_file = gt_files[file_id] 239 | num_frames = pd.read_csv(label_file).shape[0] 240 | 241 | feat, feat_wgn_small, feat_wgn_large, feat_env_noise = preprocess( 242 | path, num_frames=num_frames, fps=25, sample_rate=SAMPLE_RATE, add_thick_noise=False, env_noise=env_noise, 243 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 244 | ) 245 | 246 | if feat is not None: 247 | if not is_eval: 248 | np.save(os.path.join(save_dir, "aiwin_" + file_id + ".npy"), feat) 249 | np.save(os.path.join(save_dir, "aiwin_" + file_id + ".wgn_s" + ".npy"), feat_wgn_small) 250 | # np.save(os.path.join(save_dir, "aiwin_" + file_id + ".wgn_l" + ".npy"), feat_wgn_large) 251 | np.save(os.path.join(save_dir, "aiwin_" + file_id + ".env_n" + ".npy"), feat_env_noise) 252 | else: 253 | np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".npy"), feat) 254 | np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".wgn_s" + ".npy"), feat_wgn_small) 255 | # np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".wgn_l" + ".npy"), feat_wgn_large) 256 | np.save(os.path.join(save_dir, "aiwin_eval_" + file_id + ".env_n" + ".npy"), feat_env_noise) 257 | 258 | 259 | def public_dataset_preprocess(wav_dir, profile_dir, save_dir, env_noise): 260 | profiles_dict = {os.path.splitext(file)[0]: os.path.join(profile_dir, file) for file in os.listdir(profile_dir)} 261 | 262 | # 文件映射字典 263 | data_files = {file.split(".")[0]: os.path.join(wav_dir, file) 264 | for file in os.listdir(wav_dir) if file.split(".")[-1] == "wav"} 265 | 266 | for file_id, path in data_files.items(): 267 | if os.path.exists(os.path.join(save_dir, file_id + ".npy")): 268 | print("Exist file {}".format(os.path.join(save_dir, file_id + ".npy"))) 269 | continue 270 | 271 | if file_id in profiles_dict: 272 | profile = util.load_json_file(profiles_dict[file_id]) 273 | else: 274 | continue 275 | 276 | print("Processing {:<15s}, path: {}...".format(file_id, 
path)) 277 | feat, feat_wgn_small, _, feat_env_noise = preprocess( 278 | path, ind_path=None, num_frames=int(profile["num_frames"]), fps=profile["fps"], 279 | sample_rate=SAMPLE_RATE, add_thick_noise=False, env_noise=env_noise, 280 | win_length=WIN_LENGTH, hop_length=HOP_LENGTH, half_chunks_length=HALF_CHUNKS_LENGTH, feat_func=FEAT_FUNC 281 | ) 282 | 283 | if feat is not None: 284 | np.save(save_dir + "/" + file_id + ".npy", feat) 285 | np.save(save_dir + "/" + file_id + ".wgn_s" + ".npy", feat_wgn_small) 286 | np.save(save_dir + "/" + file_id + ".env_n" + ".npy", feat_env_noise) 287 | 288 | 289 | if __name__ == '__main__': 290 | # 参数配置 291 | use_self_made_arkit = False 292 | use_self_made_mocap = False 293 | 294 | use_facegood = False 295 | use_aiwin = False 296 | use_public = True 297 | 298 | SAMPLE_RATE = 16000 299 | WIN_LENGTH = 256 300 | HOP_LENGTH = 128 301 | N_FEAT = 32 302 | HALF_CHUNKS_LENGTH = 48 303 | FEAT_FUNC = lambda x: features.fbank( 304 | x, sample_rate=SAMPLE_RATE, win_length=WIN_LENGTH, hop_length=HOP_LENGTH, n_mels=N_FEAT) 305 | 306 | # 数据路径 307 | save_dir = "E:/datasets/audio2face/processed_datasets" 308 | # 这里还加上了噪声数据来模仿环境噪声 309 | env_noise_file = "E:/3D_face_reconstruct/audio2face/data/train/environmental_noise_2.wav" 310 | 311 | if not os.path.exists(save_dir): 312 | os.makedirs(save_dir) 313 | 314 | # 对之前从视频中分离出来的音频进解析 315 | # sample rate要用之前生成wav文件的时候使用的sr 316 | env_noise, _ = librosa.load(env_noise_file, sr=16000) 317 | 318 | # 自制数据,ARKIT录制 319 | if use_self_made_arkit: 320 | # clean_data_dir = "E:/数据集/人脸视频口型数据/clean" 321 | # selfmade_dataset_preprocess(clean_data_dir, clean_data_dir, save_dir) 322 | 323 | clean_data_dir = "E:/数据集/人脸视频口型数据/raw_3" 324 | profile_dir = "E:/数据集/人脸视频口型数据/profile_3" 325 | gt_dir = "E:/数据集/chinese_video_process/clean_gt_arkit" 326 | selfmade_arkit_preprocess(clean_data_dir, profile_dir, gt_dir, save_dir, env_noise=env_noise) 327 | 328 | # 自制数据,MocapFace识别 329 | if use_self_made_mocap: 330 | pass 331 | 332 | # FACEGOOD样例数据 333 | if use_facegood: 334 | facegood_wav_dir_path = "D:/projects/AI/research/pose/audio2face/data/train/raw_wav" 335 | facegood_label_dir_path = "D:/projects/AI/research/pose/audio2face/data/train/bs_value" 336 | facegood_preprocess(facegood_wav_dir_path, facegood_label_dir_path, save_dir, env_noise=env_noise) 337 | 338 | # AIWIN训练数据 339 | elif use_aiwin: 340 | aiwin_dir_path = "E:/数据集/chinese_video_process/audio2face_data_for_train" 341 | aiwin_eval_dir_path = "E:/数据集/chinese_video_process/audio2face_data_for_evaluation" 342 | # aiwin_preprocess(aiwin_dir_path, save_dir, env_noise=env_noise, is_eval=False) 343 | aiwin_preprocess(aiwin_eval_dir_path, save_dir, env_noise=env_noise, is_eval=True) 344 | 345 | # 公开数据集 346 | elif use_public: 347 | public_dataset_clean_wav_dir = "E:/datasets/audio2face/wav_base" 348 | public_dataset_profile_path = "E:/datasets/audio2face/profile_base" 349 | public_dataset_preprocess(public_dataset_clean_wav_dir, public_dataset_profile_path, save_dir, 350 | env_noise=env_noise) 351 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/chinese_public_dataset_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | """ 4 | # File : features.py 5 | # Time : 2022/8/6 14:33 6 | # Author : Lu Zeng 7 | # version: python 3.7 8 | """ 9 | 10 | import os 11 | import re 12 | import tqdm 13 | import json 14 | 15 | import cv2 16 | import skvideo.io 17 | import numpy as 
np 18 | import sys 19 | sys.path.append("..") 20 | sys.path.append(".") 21 | from mocap4face.mocap4face import MediapipeFaceDetection 22 | # 23 | MOUTH_RELATED_BLENDSHAPE_LIST = [ 24 | "jawOpen", 25 | "mouthFunnel", 26 | "mouthPucker", 27 | "mouthSmileLeft", 28 | "mouthSmileRight", 29 | "mouthStretchLeft", 30 | "mouthStretchRight", 31 | "mouthRollLower", 32 | "mouthRollUpper", 33 | "mouthShrugUpper", 34 | "mouthPressLeft", 35 | "mouthPressRight", 36 | "mouthLowerDownLeft", 37 | "mouthLowerDownRight", 38 | "mouthUpperUpLeft", 39 | "mouthUpperUpRight", 40 | ] 41 | 42 | 43 | def call_mocapface(model, img_src): 44 | """ 45 | 调用mocapface模型,输出blendshape结果 46 | 只选取嘴型有效的blendshape 47 | 48 | :param model: mocapface模型 49 | :param img_src: 输入图像,BGR通道 50 | :return: 51 | """ 52 | try: 53 | # 先进行人脸的检测, 检测不到人脸 54 | result, _ = model.MediapipeRun(img_src) 55 | except Exception as e: 56 | result = None 57 | 58 | if result is None: 59 | return [0.] * len(MOUTH_RELATED_BLENDSHAPE_LIST) 60 | 61 | face_json = model.jsonFormat(result) 62 | # 从mocapface中将得到的结果从里面拿出来 63 | # gt_label的顺序和MOUTH_RELATED_BLENDSHAPE_LIST中的顺序是一样的 64 | gt_label = [face_json.get(name, 0.) for name in MOUTH_RELATED_BLENDSHAPE_LIST] 65 | return gt_label 66 | 67 | 68 | @DeprecationWarning 69 | def makevideo2(video_file, _model: MediapipeFaceDetection): 70 | """ 71 | 已弃用 72 | cv2读取视频会跳过部分重复帧或者失败帧,导致最后帧数与音频时长对应不上,无法对齐数据 73 | :param video_file: 74 | :param _model: 75 | :return: 76 | """ 77 | capture = cv2.VideoCapture(video_file) 78 | 79 | video_profile_info = { 80 | "width": capture.get(cv2.CAP_PROP_FRAME_WIDTH), 81 | "height": capture.get(cv2.CAP_PROP_FRAME_HEIGHT), 82 | "channel": capture.get(cv2.CAP_PROP_CHANNEL), 83 | "fps": capture.get(cv2.CAP_PROP_FPS), 84 | "num_frames": capture.get(cv2.CAP_PROP_FRAME_COUNT), 85 | } 86 | print(video_profile_info) 87 | 88 | def image_iterator(cap): 89 | while True: 90 | _ret, _img = cap.read() 91 | if not _ret: 92 | break 93 | yield _img 94 | 95 | frame_id = 0 96 | ground_truth_list = [] 97 | if capture.isOpened(): 98 | for img_src in tqdm.tqdm(image_iterator(capture)): 99 | frame_id += 1 100 | gt_label = call_mocapface(_model, img_src) 101 | ground_truth_list.append(gt_label) 102 | else: 103 | print('视频打开失败!') 104 | 105 | gt_matrix = np.array(ground_truth_list) / 100.0 106 | if gt_matrix.shape[0] != video_profile_info["num_frames"]: 107 | video_profile_info["num_frames"] = gt_matrix.shape[0] 108 | 109 | return gt_matrix, video_profile_info 110 | 111 | 112 | def makevideo3(video_file, _model: MediapipeFaceDetection): 113 | """ 114 | cv2 获取视频基本信息 115 | skvideo 读取视频每一帧(注意: 不会漏帧) 116 | 117 | :param video_file: 118 | :param _model: 119 | :return: 120 | """ 121 | capture = cv2.VideoCapture(video_file) 122 | video_profile_info = { 123 | "width": capture.get(cv2.CAP_PROP_FRAME_WIDTH), 124 | "height": capture.get(cv2.CAP_PROP_FRAME_HEIGHT), 125 | "channel": capture.get(cv2.CAP_PROP_CHANNEL), 126 | "fps": capture.get(cv2.CAP_PROP_FPS), 127 | "num_frames": capture.get(cv2.CAP_PROP_FRAME_COUNT), 128 | } 129 | print(video_file, video_profile_info) 130 | capture.release() 131 | 132 | videogen = skvideo.io.vreader(video_file) 133 | 134 | frame_id = 0 135 | ground_truth_list = [] 136 | for img_src in tqdm.tqdm(videogen): 137 | frame_id += 1 138 | # skvideo读取数据为RGB 139 | img_src = cv2.cvtColor(img_src, cv2.COLOR_RGB2BGR) 140 | # 没有人脸的话这里直接返回0, 但是不会丢弃帧 141 | gt_label = call_mocapface(_model, img_src) 142 | ground_truth_list.append(gt_label) 143 | 144 | gt_matrix = np.array(ground_truth_list) / 100.0 145 | if 
gt_matrix.shape[0] != video_profile_info["num_frames"]: 146 | video_profile_info["num_frames"] = gt_matrix.shape[0] 147 | 148 | return gt_matrix, video_profile_info 149 | 150 | 151 | def get_duplicated_name(root_dir, output_name, output_suffix): 152 | """ 153 | 重名文件 加数字编号后缀 154 | 155 | :param root_dir: 156 | :param output_name: 157 | :param output_suffix: 158 | :return: 159 | """ 160 | # 防止重名 161 | output_file = os.path.join(root_dir, output_name + output_suffix) 162 | dup_ind = 1 163 | while os.path.exists(output_file): 164 | output_file = os.path.join(root_dir, output_name + "." + str(dup_ind) + output_suffix) 165 | dup_ind += 1 166 | return output_file 167 | 168 | 169 | def make_dir(dir_path): 170 | if not os.path.exists(dir_path): 171 | os.makedirs(dir_path) 172 | 173 | 174 | if __name__ == '__main__': 175 | video_data_dir_list = [ 176 | "E:/datasets/audio2face/wanghong_short_video", 177 | ] 178 | # 创建生成数据的时候要保存的目录 179 | wav_data_dir = "E:/datasets/audio2face/wav_base" 180 | profile_data_dir = "E:/datasets/audio2face/profile_base" 181 | gt_data_dir = "E:/datasets/audio2face/ground_truth_base" 182 | 183 | make_dir(wav_data_dir) 184 | make_dir(profile_data_dir) 185 | make_dir(gt_data_dir) 186 | 187 | def get_sub_files(path): 188 | """ 189 | 这是一个通过递归来收集一个文件夹中的文件的函数(遇到文件夹就继续递归调用, 遇到文件就将其加入到list中) 190 | """ 191 | sub_files = os.listdir(path) 192 | all_files = [] 193 | for file in sub_files: 194 | abs_file = os.path.join(path, file) 195 | if os.path.isfile(abs_file): 196 | all_files.append(abs_file) 197 | if os.path.isdir(abs_file): 198 | _files = get_sub_files(abs_file) 199 | all_files.extend(_files) 200 | return all_files 201 | 202 | # 建立mocapface的model用于打标注 203 | model = MediapipeFaceDetection( 204 | tflite_path="./mocap4face/2001161359.tflite", 205 | json_path="./mocap4face/2001161359.json") 206 | 207 | # 这里说话的人和他的语音是一一对应的 208 | speakers_mapping = {} 209 | for video_dir in video_data_dir_list: 210 | for i, file in enumerate(get_sub_files(video_dir)): 211 | 212 | # 1. 
得到各种文件的后缀 213 | abs_prefix, suffix = os.path.splitext(file) 214 | prefix, _ = os.path.splitext(os.path.split(file)[-1]) 215 | if suffix != ".avi" and suffix != ".mpg" and suffix != ".mp4": 216 | continue 217 | output_prefix = "_".join(file.removeprefix(video_dir).split(os.path.sep)[1:-1]) + "_" + prefix 218 | 219 | # 音频分离保存,采样率降采样为16kHz,单通道 220 | # 防止重名并更改后缀 221 | output_wav_file = get_duplicated_name(wav_data_dir, output_prefix, ".wav") 222 | # 这里统一将音频的采样率设置为了16k 223 | os.system("ffmpeg -i {} -f wav -ar 16000 -ac 1 {} -y".format(file, output_wav_file)) 224 | 225 | # mocapface 识别结果 226 | # gt_matrix, video_profile_info = makevideo2(file, model) 227 | gt_matrix, video_profile_info = makevideo3(file, model) 228 | 229 | # 保存GT数据 230 | gt_output_file = get_duplicated_name(gt_data_dir, output_prefix, ".npy") 231 | np.save(gt_output_file, gt_matrix) 232 | 233 | # 保存json 234 | profile_output_file = get_duplicated_name(profile_data_dir, output_prefix, ".json") 235 | with open(profile_output_file, "w", encoding="utf-8") as f: 236 | json.dump(video_profile_info, f) 237 | 238 | # 说话人ID 239 | speaker_id = prefix.split("_")[0].lower() 240 | speaker_id = re.sub(r"\d+", "", speaker_id) \ 241 | if re.match(r"^[a-z\u4e00-\u9fa5]+\d+$", speaker_id) \ 242 | else speaker_id 243 | speakers_mapping[output_prefix] = speaker_id 244 | 245 | with open(os.path.join(profile_data_dir, "speakers.json"), "w", encoding="utf-8") as f: 246 | json.dump(speakers_mapping, f, ensure_ascii=False, indent=4) 247 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/data_vad.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | import os 4 | import math 5 | import json 6 | 7 | import librosa 8 | import webrtcvad 9 | import numpy as np 10 | import scipy.io.wavfile as wf 11 | import sys 12 | sys.path.append(".") 13 | sys.path.append("..") 14 | from features.vad import VoiceActivityDetector 15 | 16 | def voice_activate_indices_detect_2(audio_file): 17 | """ 18 | 使用 py-webrtcvad 进行音频VAD处理 19 | 20 | 计算每个子带的对数能量,如果大于阈值,则对当前帧进行处理;否则直接将vad_flag置为0。 21 | 计算每个子带对应的高斯概率,并与子带的权重相乘作为语音/噪声最终的概率。 22 | 计算每个子带的对数似然比, 23 | 每个子带的似然比会和阈值进行比较作为一个局部结果 24 | 所有子带的对数加权似然比之和与阈值比较作一个全局的结果。当全局或局部其中一个为TRUE则认定当前帧是语音帧。 25 | 使用hangover对结果进行平滑 26 | 27 | 超过6个连续音频窗都不是语音,才视为静音片段 28 | 29 | :param audio_file: 30 | :return: 31 | """ 32 | sample_window = 0.03 33 | sample_overlap = 0.03 34 | 35 | v = webrtcvad.Vad(3) 36 | rate, data = wf.read(audio_file) 37 | 38 | sample_start = 0 39 | detected_windows = np.array([]) 40 | sample_window = int(rate * sample_window) 41 | sample_overlap = int(rate * sample_overlap) 42 | # 识别每个音频窗是否为语音 43 | while (sample_start < (len(data) - sample_window)): 44 | sample_end = sample_start + sample_window 45 | if sample_end >= len(data): 46 | sample_end = len(data) - 1 47 | sample_start = sample_end - sample_window - 1 48 | data_window = data[sample_start:sample_end] 49 | detected_windows = np.append(detected_windows, [sample_start, v.is_speech(data_window.tobytes(), rate)]) 50 | sample_start += sample_overlap 51 | detected_windows = detected_windows.reshape(int(len(detected_windows) / 2), 2) 52 | 53 | indices = [] 54 | viol_start, viol_end = -1, -1 55 | act_start, act_end = -1, -1 56 | interval_frames_threshold = 6 57 | for i, (_, flag) in enumerate(detected_windows): 58 | if flag == 0: 59 | viol_start = i if viol_start == -1 else viol_start 60 | viol_end = i 61 | elif viol_end - viol_start >= 
interval_frames_threshold: 62 | if act_start != -1: 63 | act_end = viol_start + 1 64 | indices.append((int(detected_windows[act_start, 0]), int(detected_windows[act_end, 0]))) 65 | act_start = i - 2 66 | viol_start = -1 67 | viol_end = -1 68 | else: 69 | if act_start == -1: 70 | act_start = i 71 | act_end = i 72 | viol_start = -1 73 | viol_end = -1 74 | 75 | indices.append((int(detected_windows[act_start, 0]), int(detected_windows[act_end, 0]))) 76 | 77 | return indices 78 | 79 | 80 | def vad4(): 81 | """ 82 | 音频文件VAD处理,静音片段识别, 83 | 静音片段的ground truth置零,切分训练集时会将全零的样本去除 84 | 置零前先进行标签平滑处理 85 | 86 | :return: 87 | """ 88 | 89 | # 公开数据集 90 | raw_dir = "E:/datasets/audio2face/wav_base" 91 | gt_dir = "E:/datasets/audio2face/ground_truth_base" 92 | profile_dir = "E:/datasets/audio2face/profile_base" 93 | 94 | output_gt_dir = "E:/datasets/audio2face/clean_gt_base" 95 | 96 | rate = 16000 97 | 98 | if not os.path.exists(output_gt_dir): 99 | os.makedirs(output_gt_dir) 100 | 101 | total_duration = 0 102 | skip_video_count = 0 103 | process_video_count = 0 104 | for file in os.listdir(raw_dir): 105 | file_id, suffix = os.path.splitext(file) 106 | 107 | # 跳过非wav格式的文件 108 | if suffix != ".wav": 109 | continue 110 | 111 | # 文件路径 112 | _gt_file = os.path.join(gt_dir, file_id + ".npy") 113 | _profile_file = os.path.join(profile_dir, file_id + ".json") 114 | if not os.path.exists(_gt_file) or not os.path.exists(_profile_file): 115 | continue 116 | 117 | # 加载数据,gt标签,视频基本信息 118 | with open(_profile_file, "r", encoding="utf-8") as f: 119 | profile = json.load(f) 120 | gt_label = np.load(_gt_file) 121 | audio, sr = librosa.load(os.path.join(raw_dir, file), sr=rate) 122 | 123 | fps = profile["fps"] 124 | frames_step = rate / fps 125 | 126 | # 总帧数无法和音频长度对齐,丢弃 127 | if int(math.fabs(len(audio) / frames_step - profile["num_frames"])) > 2: 128 | skip_video_count += 1 129 | print("Skip {:<15s}, num_frames: {:>8d} , calculate frames: {:>8.2f}".format( 130 | file_id, int(profile["num_frames"]), len(audio) / frames_step)) 131 | continue 132 | 133 | print("Processing {:<15s}, fps: {:<4.2f}, num_frames: {:<8d}".format(file_id, fps, int(profile["num_frames"]))) 134 | process_video_count += 1 135 | 136 | # 去除空白音频信号 137 | # internal_clean_ind = voice_activate_indices_detect(os.path.join(raw_dir, file)) 138 | internal_clean_ind = voice_activate_indices_detect_2(os.path.join(raw_dir, file)) 139 | 140 | # label加hamming窗,平滑 141 | win_size = 5 142 | if gt_label.shape[0] >= win_size: 143 | win = np.hamming(win_size) / np.sum(np.hamming(win_size)) 144 | for i in range(gt_label.shape[1]): 145 | gt_label[:, i] = np.convolve(gt_label[:, i], win, mode="same") 146 | 147 | # 重新构造gt标签,静音片段的gt置零 148 | new_gt_label = np.zeros_like(gt_label) 149 | for start, end in internal_clean_ind: 150 | frame_start_ind = round(start / frames_step) 151 | frame_end_ind = int(end // frames_step) 152 | frame_end_ind = min(frame_end_ind, len(gt_label) - 1) 153 | 154 | new_gt_label[frame_start_ind: frame_end_ind + 1] = gt_label[frame_start_ind: frame_end_ind + 1] 155 | 156 | duration = (frame_end_ind - frame_start_ind) / fps 157 | total_duration += duration 158 | 159 | # 输出保存数据 160 | np.save(os.path.join(output_gt_dir, file_id + ".npy"), new_gt_label) 161 | 162 | # 时长统计 163 | hours = int(total_duration // 3600) 164 | minutes = int((total_duration - hours * 3600) // 60) 165 | seconds = int(total_duration - hours * 3600 - minutes * 60) 166 | 167 | print("Total count: {}, Skip count: {}, Process count: {}, Duration: {:>3d}h {:>2d}m {:>2d}s".format( 168 | 
skip_video_count + process_video_count, skip_video_count, process_video_count, hours, minutes, seconds)) 169 | 170 | 171 | if __name__ == '__main__': 172 | vad4() 173 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/LPC.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/LPC.dll -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/__pycache__/features.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/__pycache__/features.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/__pycache__/util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/__pycache__/util.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/__pycache__/vad.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/features/__pycache__/vad.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/doc/bsname.txt: -------------------------------------------------------------------------------- 1 | 116 2 | brow_lower_l 3 | tongue_Scale__X 4 | tongue_Scale_Y 5 | tongue_Scale__Y 6 | tongue_Scale_Z 7 | tongue_Scale__Z 8 | nose_out_l 9 | nose_out_r 10 | tongue_u 11 | tongue_u_u 12 | brow_raise_d 13 | cheek_suck_r 14 | mouth_stretch_u 15 | tongue_u_d 16 | tooth_d_d 17 | tongue_d 18 | tooth_r 19 | tooth_d_u 20 | cheek_UP 21 | eye_blink1_l 22 | eye_blink1_r 23 | eye_blink2_l 24 | eye_blink2_r 25 | eye_lidTight_l 26 | eye_lidTight_r 27 | eye_shutTight_l 28 | eye_shutTight_r 29 | brow_lower_r 30 | eye_upperLidRaise_l 31 | eye_upperLidRaise_r 32 | eye_downLidRaise_l 33 | eye_downLidRaise_r 34 | jaw_sideways_l 35 | jaw_sideways_r 36 | jaw_thrust_c 37 | mouth_chew_c 38 | mouth_chinRaise_d 39 | mouth_chinRaise_u 40 | brow_raise_c 41 | mouth_dimple_l 42 | mouth_dimple_r 43 | mouth_funnel_dl 44 | mouth_funnel_dr 45 | mouth_funnel_ul 46 | mouth_funnel_ur 47 | mouth_lipCornerDepressFix_l 48 | mouth_lipCornerDepressFix_r 49 | mouth_lipCornerDepress_l 50 | mouth_lipCornerDepress_r 51 | brow_raise_l 52 | mouth_lipCornerPullOpen_l 53 | mouth_lipCornerPullOpen_r 54 | mouth_lipCornerPull_l 55 | mouth_lipCornerPull_r 56 | mouth_lipStretchOpen_l 57 | mouth_lipStretchOpen_r 58 | mouth_lipStretch_l 59 | mouth_lipStretch_r 60 | mouth_lowerLipDepress_l 61 | mouth_lowerLipDepress_r 62 | brow_raise_r 63 | mouth_lowerLipProtrude_c 64 | mouth_oh_c 65 | mouth_oo_c 66 | mouth_pressFix_c 67 | mouth_press_l 68 | mouth_press_r 69 | mouth_pucker_l 70 | mouth_pucker_r 71 | mouth_screamFix_c 72 | mouth_sideways_l 73 | cheek_puff_l 74 
| mouth_sideways_r 75 | mouth_stretch_c 76 | mouth_suck_dl 77 | mouth_suck_dr 78 | mouth_suck_ul 79 | mouth_suck_ur 80 | mouth_upperLipRaise_l 81 | mouth_upperLipRaise_r 82 | nose_wrinkle_l 83 | nose_wrinkle_r 84 | cheek_puff_r 85 | tooth_l 86 | eye_lookDown1_l 87 | eye_lookDown2_l 88 | eye_lookLeft_l 89 | eye_lookRight_l 90 | eye_lookUp_l 91 | eye_lookDown1_r 92 | eye_lookDown2_r 93 | eye_lookLeft_r 94 | eye_lookRight_r 95 | cheek_raise_l 96 | eye_lookUp_r 97 | tongue_Rot_1X 98 | tongue_Rot__1X 99 | tongue_Rot_2X 100 | tongue_Rot__2X 101 | tongue_Rot_3X 102 | tongue_Rot__3X 103 | tongue_Rot_1Y 104 | tongue_Rot__1Y 105 | tongue_Rot_2Y 106 | cheek_raise_r 107 | tongue_Rot__2Y 108 | tongue_Rot_3Y 109 | tongue_Rot__3Y 110 | tongue_Rot_1Z 111 | tongue_Rot__1Z 112 | tongue_Rot_2Z 113 | tongue_Rot__2Z 114 | tongue_Rot_3Z 115 | tongue_Rot__3Z 116 | tongue_Scale_X 117 | cheek_suck_l -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/features.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | """ 4 | 音频处理的可以看看这个博客: https://www.cnblogs.com/LXP-Never/p/11561355.html 5 | 天池零基础入门音频的教程: https://pythontechworld.com/article/detail/BCcNjuLDVYa6 6 | 分帧函数: https://blog.csdn.net/qq_37653144/article/details/89045363 7 | """ 8 | 9 | import os 10 | from ctypes import * 11 | 12 | import tqdm 13 | import librosa 14 | import numpy as np 15 | 16 | current_dir = os.path.split(os.path.abspath(__file__))[0] 17 | lpc_dll_file = os.path.join(current_dir, "LPC.dll") 18 | lpc_dll = cdll.LoadLibrary(lpc_dll_file) 19 | 20 | 21 | def lpc(audio_frames, sample_rate=16000): 22 | input_data_list = [] 23 | for audio_frame in tqdm.tqdm(audio_frames): 24 | # 8ms帧移, 16ms帧长 25 | overlap_frames_apart = 0.008 26 | overlap = int(sample_rate * overlap_frames_apart) 27 | frameSize = int(sample_rate * overlap_frames_apart * 2) 28 | numberOfFrames = (len(audio_frame) - frameSize) // overlap + 1 29 | 30 | # 构造音频帧 31 | # print(numberOfFrames, frameSize) 32 | frames = np.ndarray((numberOfFrames, frameSize)) 33 | for j in range(0, numberOfFrames): 34 | frames[j] = audio_frame[j * overlap: j * overlap + frameSize] 35 | 36 | # 加窗 37 | frames *= np.hanning(frameSize) 38 | 39 | # LPC 40 | frames_lpc_features = [] 41 | b = (c_double * 32)() 42 | for fr in frames: 43 | a = (c_double * frameSize)(*fr) 44 | # LPC(float *in, int size, int order, float *out) 45 | lpc_dll.LPC(pointer(a), frameSize, 32, pointer(b)); 46 | frames_lpc_features.append(list(b)) 47 | del a 48 | 49 | del b 50 | 51 | image_temp1 = np.array(frames_lpc_features) 52 | image_temp2 = np.expand_dims(image_temp1, axis=0) # 升维 53 | input_data_list.append(image_temp2) 54 | 55 | if not input_data_list: 56 | return None 57 | 58 | inputData_array = np.concatenate(input_data_list, axis=0) 59 | inputData_array = inputData_array.transpose((0, 2, 1)) 60 | 61 | # 扩展为4维:(,32,64,1) 62 | inputData_array = np.expand_dims(inputData_array, axis=3) 63 | 64 | return inputData_array 65 | 66 | 67 | def zero_crossing_feat(_wav, win_length, hop_length): 68 | """ 69 | 过零率 帧变负数负数变正数的时候要通过0这条线, 70 | :param _wav: [-1, 1536] 71 | :param win_length: 256 72 | :param hop_length: 128(这个应该也同时做为位移) 73 | :return: 74 | """ 75 | padding = [(0, 0) for _ in range(_wav.ndim)] # 不需要padding的维度 76 | padding[-1] = (hop_length, hop_length) # 只有最后一个维度才需要padding 77 | y = np.pad(_wav, padding, mode="constant") 78 | 79 | # sum --> / win_lenght 就是求个平均 80 | zc = np.sum( 81 | 
librosa.zero_crossings( 82 | np.transpose( 83 | # 分帧函数: 将时间序列分割成重叠的帧(所以应该是256帧叠在一起?, 13是特征的维度) 84 | librosa.util.frame(y, frame_length=win_length, hop_length=hop_length), # shape=(-1, 256, 13) 85 | [0, 2, 1]), 86 | pad=False 87 | ), 88 | axis=-1 89 | ) / win_length 90 | 91 | return zc 92 | 93 | 94 | def fbank(_wav, sample_rate, win_length, hop_length, n_mels, window="hann"): 95 | """ 96 | sample_rate: 16000 97 | win_lenght: 256 98 | hop_length: 128 99 | n_mels: 32 100 | window: hann 101 | 这里连续调用了两个音频处理库的包librosa(制作训练的数据集的时候会使用, inference的时候也会使用) 102 | """ 103 | # 如果提供了时间序列输入y,sr,则首先计算其幅值频谱S,然后通过mel_f.dot(S ** power)将其映射到mel scale上 。 104 | # 默认情况下,power= 2在功率谱上运行。 105 | # 这个东西就类似于LPC.dll的功能 106 | mel_spec_feat = librosa.feature.melspectrogram( 107 | y=_wav, 108 | sr=sample_rate, # 160000 109 | win_length=win_length, 110 | hop_length=hop_length, 111 | n_fft=win_length, 112 | window=window, 113 | n_mels=n_mels, 114 | ) 115 | 116 | # 再转换到对数刻度 117 | db_feat = librosa.core.power_to_db( 118 | mel_spec_feat, ref=1.0, amin=1e-10, top_db=None, 119 | ) 120 | feat = (db_feat + 100) / 130. 121 | return feat 122 | 123 | 124 | def mfcc(_wav, sample_rate, win_length, hop_length, n_mels, n_mfcc, window="hann"): 125 | feat = librosa.feature.mfcc( 126 | y=_wav, 127 | sr=sample_rate, 128 | win_length=win_length, 129 | hop_length=hop_length, 130 | n_fft=win_length, 131 | window=window, 132 | n_mels=n_mels, 133 | n_mfcc=n_mfcc, 134 | ) 135 | return feat 136 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/util.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | 4 | import os 5 | import math 6 | import json 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | current_dir = os.path.split(os.path.abspath(__file__))[0] 12 | 13 | FACEGOOD_BS_CONUNT = 116 14 | # the sort of bs name correspond to UE input sort 15 | bs_name_index = [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 16 | 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 17 | 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 18 | 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 105, 19 | 104, 106, 107, 108, 109, 110, 111, 112, 113, 114, 1, 115] 20 | label_name_list = pd.read_csv( 21 | os.path.join(current_dir, "doc", "bsname.txt"), encoding="utf-8").values.transpose()[0].tolist() 22 | 23 | STANDARD_ARKIT_BS_NAME = ["BlendShapeCount", "EyeBlinkLeft", "EyeLookDownLeft", "EyeLookInLeft", "EyeLookOutLeft", 24 | "EyeLookUpLeft", "EyeSquintLeft", "EyeWideLeft", "EyeBlinkRight", "EyeLookDownRight", 25 | "EyeLookInRight", "EyeLookOutRight", "EyeLookUpRight", "EyeSquintRight", "EyeWideRight", 26 | "JawForward", "JawRight", "JawLeft", "JawOpen", "MouthClose", "MouthFunnel", "MouthPucker", 27 | "MouthRight", "MouthLeft", "MouthSmileLeft", "MouthSmileRight", "MouthFrownLeft", 28 | "MouthFrownRight", "MouthDimpleLeft", "MouthDimpleRight", "MouthStretchLeft", 29 | "MouthStretchRight", "MouthRollLower", "MouthRollUpper", "MouthShrugLower", "MouthShrugUpper", 30 | "MouthPressLeft", "MouthPressRight", "MouthLowerDownLeft", "MouthLowerDownRight", 31 | "MouthUpperUpLeft", "MouthUpperUpRight", "BrowDownLeft", "BrowDownRight", "BrowInnerUp", 32 | "BrowOuterUpLeft", "BrowOuterUpRight", "CheekPuff", "CheekSquintLeft", 
"CheekSquintRight", 33 | "NoseSneerLeft", "NoseSneerRight", "TongueOut", "HeadYaw", "HeadPitch", "HeadRoll", 34 | "LeftEyeYaw", "LeftEyePitch", "LeftEyeRoll", "RightEyeYaw", "RightEyePitch", "RightEyeRoll", ] 35 | 36 | VALID_BS_NAME = [ 37 | "JawForward", 38 | "JawLeft", 39 | "JawRight", 40 | "JawOpen", 41 | "MouthFunnel", 42 | "MouthPucker", 43 | "MouthLeft", 44 | "MouthRight", 45 | "MouthSmileLeft", 46 | "MouthSmileRight", 47 | "MouthFrownLeft", 48 | "MouthFrownRight", 49 | "MouthDimpleLeft", 50 | "MouthDimpleRight", 51 | "MouthStretchLeft", 52 | "MouthStretchRight", 53 | "MouthRollLower", 54 | "MouthRollUpper", 55 | "MouthShrugLower", 56 | "MouthShrugUpper", 57 | "MouthPressLeft", 58 | "MouthPressRight", 59 | "MouthLowerDownLeft", 60 | "MouthLowerDownRight", 61 | "MouthUpperUpLeft", 62 | "MouthUpperUpRight" 63 | ] 64 | 65 | SELECT_VALID_BS_NAME = [ 66 | "JawOpen", 67 | "MouthFunnel", 68 | "MouthPucker", 69 | "MouthSmileLeft", 70 | "MouthSmileRight", 71 | "MouthStretchLeft", 72 | "MouthStretchRight", 73 | "MouthRollLower", 74 | "MouthRollUpper", 75 | "MouthShrugUpper", 76 | "MouthPressLeft", 77 | "MouthPressRight", 78 | "MouthLowerDownLeft", 79 | "MouthLowerDownRight", 80 | "MouthUpperUpLeft", 81 | "MouthUpperUpRight", 82 | ] 83 | 84 | 85 | def add_noise(origin_signal, snr): 86 | """ 87 | 添加高斯白噪声,固定信噪比 88 | """ 89 | noise = np.random.normal(0, 1, len(origin_signal)) 90 | 91 | # 计算语音信号功率Ps和噪声功率Pn1 92 | Ps = np.sum(origin_signal ** 2) / len(origin_signal) 93 | Pn1 = np.sum(noise ** 2) / len(noise) 94 | 95 | # 计算k值 96 | k = math.sqrt(Ps / (10 ** (snr / 10) * Pn1)) 97 | 98 | # 将噪声数据乘以k, 99 | random_values_we_need = noise * k 100 | 101 | new_signal = origin_signal.astype(np.float64) + random_values_we_need 102 | 103 | return new_signal 104 | 105 | 106 | def add_other_noise(origin_signal, noise): 107 | """ 108 | 添加指定噪声 109 | """ 110 | if len(origin_signal) / len(noise) > 1: 111 | new_noise = np.concatenate([noise] * int(len(origin_signal) / len(noise) + 1)) 112 | new_noise = new_noise[:len(origin_signal)] 113 | else: 114 | upper = len(noise) - len(origin_signal) 115 | start = np.random.randint(0, upper - 1) 116 | new_noise = noise[start:start + len(origin_signal)] 117 | new_signal = origin_signal + new_noise 118 | return new_signal 119 | 120 | 121 | def load_json_file(file_path): 122 | with open(file_path, "r", encoding="utf-8") as f: 123 | profile = json.load(f) 124 | return profile 125 | 126 | 127 | def rectangle_wav(wav): 128 | """ 129 | 将波形信号变为矩形波信号, 130 | 主要用于将时序BlendShape数值进行增强 131 | """ 132 | rect_wav = np.zeros_like(wav) 133 | extremum_indices = [] 134 | for t in range(1, len(wav) - 2): 135 | # 趋势是否改变 136 | is_change_slope = (wav[t + 1] - wav[t] + 1e-16) / (wav[t] - wav[t - 1] + 1e-16) 137 | if is_change_slope < 0: 138 | extremum_indices.append(t) 139 | 140 | # 常量信号,无波形 141 | if not extremum_indices: 142 | rect_wav[:] = wav[:] 143 | return rect_wav 144 | 145 | # 每个极值区间进行赋值 146 | for i, ind in enumerate(extremum_indices): 147 | if i == 0: 148 | start = 0 149 | else: 150 | start = int((ind + extremum_indices[i - 1]) / 2) 151 | 152 | if i == len(extremum_indices) - 1: 153 | end = wav.shape[0] 154 | else: 155 | end = int((ind + extremum_indices[i + 1]) / 2) 156 | rect_wav[start:end] = wav[ind] 157 | 158 | return rect_wav 159 | 160 | 161 | def facegood_bs_label_to_valid_arkit(label_temp): 162 | """ 163 | FACEGOOD样例数据转换成标准ARKITS表情 164 | 165 | :param label_temp: 166 | :return: 167 | """ 168 | _label = np.zeros((label_temp.shape[0], FACEGOOD_BS_CONUNT)) 169 | for i in 
range(len(bs_name_index)): 170 | _label[:, i] = label_temp[:, bs_name_index[i]] 171 | 172 | num_valid_bs = 26 173 | new_label = np.zeros((_label.shape[0], num_valid_bs), dtype=np.float32) 174 | new_label[:, 0] = _label[:, label_name_list.index("jaw_thrust_c")] 175 | new_label[:, 1] = _label[:, label_name_list.index("jaw_sideways_l")] 176 | new_label[:, 2] = _label[:, label_name_list.index("jaw_sideways_r")] 177 | new_label[:, 3] = _label[:, label_name_list.index("mouth_stretch_c")] 178 | # new_label[:, 4] = _label[:, label_name_list.index("mouth_chew_c")] 179 | new_label[:, 4] = np.max(_label[:, [label_name_list.index(n) 180 | for n in ["mouth_funnel_dl", "mouth_funnel_dr", "mouth_funnel_ul", 181 | "mouth_funnel_ur"]]], axis=1) 182 | new_label[:, 5] = np.max( 183 | _label[:, [label_name_list.index(n) for n in ["mouth_pucker_l", "mouth_pucker_r"]]], axis=1) 184 | new_label[:, 6] = _label[:, label_name_list.index("mouth_sideways_l")] 185 | new_label[:, 7] = _label[:, label_name_list.index("mouth_sideways_r")] 186 | new_label[:, 8] = _label[:, label_name_list.index("mouth_lipCornerPull_l")] 187 | new_label[:, 9] = _label[:, label_name_list.index("mouth_lipCornerPull_r")] 188 | new_label[:, 10] = np.max(_label[:, [label_name_list.index(n) for n in ["mouth_lipCornerDepress_l", 189 | "mouth_lipCornerDepressFix_l"]]], 190 | axis=1) 191 | new_label[:, 11] = np.max(_label[:, [label_name_list.index(n) for n in ["mouth_lipCornerDepress_r", 192 | "mouth_lipCornerDepressFix_r"]]], 193 | axis=1) 194 | new_label[:, 12] = _label[:, label_name_list.index("mouth_dimple_l")] 195 | new_label[:, 13] = _label[:, label_name_list.index("mouth_dimple_r")] 196 | new_label[:, 14] = _label[:, label_name_list.index("mouth_lipStretch_l")] 197 | new_label[:, 15] = _label[:, label_name_list.index("mouth_lipStretch_r")] 198 | new_label[:, 16] = np.max( 199 | _label[:, [label_name_list.index(n) for n in ["mouth_suck_dl", "mouth_suck_dr"]]], axis=1) 200 | new_label[:, 17] = np.max( 201 | _label[:, [label_name_list.index(n) for n in ["mouth_suck_ul", "mouth_suck_ur"]]], axis=1) 202 | new_label[:, 18] = _label[:, label_name_list.index("mouth_chinRaise_d")] 203 | new_label[:, 19] = _label[:, label_name_list.index("mouth_chinRaise_u")] 204 | new_label[:, 20] = _label[:, label_name_list.index("mouth_press_l")] 205 | new_label[:, 21] = _label[:, label_name_list.index("mouth_press_r")] 206 | new_label[:, 22] = _label[:, label_name_list.index("mouth_lowerLipDepress_l")] 207 | new_label[:, 23] = _label[:, label_name_list.index("mouth_lowerLipDepress_r")] 208 | new_label[:, 24] = _label[:, label_name_list.index("mouth_upperLipRaise_l")] 209 | new_label[:, 25] = _label[:, label_name_list.index("mouth_upperLipRaise_r")] 210 | 211 | return new_label 212 | 213 | 214 | def standard_arkit_bs_to_valid(label_temp): 215 | """ 216 | 标准ARKITS表情,抽取有效的嘴部动作 217 | 218 | :param label_temp: 219 | :return: 220 | """ 221 | num_valid_bs = len(VALID_BS_NAME) 222 | indices = [STANDARD_ARKIT_BS_NAME.index(bs) for bs in VALID_BS_NAME] 223 | # indices = [STANDARD_ARKIT_BS_NAME.index(bs) for bs in SELECT_VALID_BS_NAME] 224 | new_label = np.zeros((label_temp.shape[0], num_valid_bs), dtype=np.float32) 225 | new_label[:] = label_temp[:, indices] 226 | return new_label 227 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/features/vad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.io.wavfile as wf 3 | import 
matplotlib.pyplot as plt 4 | 5 | 6 | class VoiceActivityDetector: 7 | """ Use signal energy to detect voice activity in wav file """ 8 | 9 | def __init__(self, 10 | wave_input_filename, 11 | sample_window=0.02, 12 | sample_overlap=0.01, 13 | speech_window=0.5, 14 | speech_energy_threshold=0.6, 15 | speech_start_band=300, 16 | speech_end_band=3000, 17 | ): 18 | self._read_wav(wave_input_filename)._convert_to_mono() 19 | self.sample_window = sample_window # 20 ms 20 | self.sample_overlap = sample_overlap # 10ms 21 | self.speech_window = speech_window # half a second 22 | self.speech_energy_threshold = speech_energy_threshold # 60% of energy in voice band 23 | self.speech_start_band = speech_start_band 24 | self.speech_end_band = speech_end_band 25 | 26 | def _read_wav(self, wave_file): 27 | self.rate, self.data = wf.read(wave_file) 28 | self.channels = len(self.data.shape) 29 | self.filename = wave_file 30 | return self 31 | 32 | def _convert_to_mono(self): 33 | if self.channels == 2: 34 | self.data = np.mean(self.data, axis=1, dtype=self.data.dtype) 35 | self.channels = 1 36 | return self 37 | 38 | def _calculate_frequencies(self, audio_data): 39 | data_freq = np.fft.fftfreq(len(audio_data), 1.0 / self.rate) 40 | data_freq = data_freq[1:] 41 | return data_freq 42 | 43 | def _calculate_amplitude(self, audio_data): 44 | data_ampl = np.abs(np.fft.fft(audio_data)) 45 | data_ampl = data_ampl[1:] 46 | return data_ampl 47 | 48 | def _calculate_energy(self, data): 49 | data_amplitude = self._calculate_amplitude(data) 50 | data_energy = data_amplitude ** 2 51 | return data_energy 52 | 53 | def _znormalize_energy(self, data_energy): 54 | energy_mean = np.mean(data_energy) 55 | energy_std = np.std(data_energy) 56 | energy_znorm = (data_energy - energy_mean) / energy_std 57 | return energy_znorm 58 | 59 | def _connect_energy_with_frequencies(self, data_freq, data_energy): 60 | energy_freq = {} 61 | for (i, freq) in enumerate(data_freq): 62 | if abs(freq) not in energy_freq: 63 | energy_freq[abs(freq)] = data_energy[i] * 2 64 | return energy_freq 65 | 66 | def _calculate_normalized_energy(self, data): 67 | data_freq = self._calculate_frequencies(data) 68 | data_energy = self._calculate_energy(data) 69 | # data_energy = self._znormalize_energy(data_energy) #znorm brings worse results 70 | energy_freq = self._connect_energy_with_frequencies(data_freq, data_energy) 71 | return energy_freq 72 | 73 | def _sum_energy_in_band(self, energy_frequencies, start_band, end_band): 74 | sum_energy = 0 75 | for f in energy_frequencies.keys(): 76 | if start_band < f < end_band: 77 | sum_energy += energy_frequencies[f] 78 | return sum_energy 79 | 80 | def _median_filter(self, x, k): 81 | assert k % 2 == 1, "Median filter length must be odd." 82 | assert x.ndim == 1, "Input must be one-dimensional." 
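        # Sliding-window median filter: each column of y holds a copy of x shifted
        # by one offset in [-k2, k2], with the edges padded by repeating x[0] / x[-1];
        # the per-row median is then the filtered signal.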
83 | k2 = (k - 1) // 2 84 | y = np.zeros((len(x), k), dtype=x.dtype) 85 | y[:, k2] = x 86 | for i in range(k2): 87 | j = k2 - i 88 | y[j:, i] = x[:-j] 89 | y[:j, i] = x[0] 90 | y[:-j, -(i + 1)] = x[j:] 91 | y[-j:, -(i + 1)] = x[-1] 92 | return np.median(y, axis=1) 93 | 94 | def _smooth_speech_detection(self, detected_windows): 95 | median_window = int(self.speech_window / self.sample_window) 96 | if median_window % 2 == 0: median_window = median_window - 1 97 | median_energy = self._median_filter(detected_windows[:, 1], median_window) 98 | return median_energy 99 | 100 | def convert_windows_to_readible_labels(self, detected_windows): 101 | """ Takes as input array of window numbers and speech flags from speech 102 | detection and convert speech flags to time intervals of speech. 103 | Output is array of dictionaries with speech intervals. 104 | """ 105 | speech_time = [] 106 | is_speech = 0 107 | for window in detected_windows: 108 | if (window[1] == 1.0 and is_speech == 0): 109 | is_speech = 1 110 | speech_label = {} 111 | speech_time_start = window[0] / self.rate 112 | speech_label['speech_begin'] = speech_time_start 113 | print(window[0], speech_time_start) 114 | # speech_time.append(speech_label) 115 | if (window[1] == 0.0 and is_speech == 1): 116 | is_speech = 0 117 | speech_time_end = window[0] / self.rate 118 | speech_label['speech_end'] = speech_time_end 119 | speech_time.append(speech_label) 120 | print(window[0], speech_time_end) 121 | return speech_time 122 | 123 | def plot_detected_speech_regions(self): 124 | """ Performs speech detection and plot original signal and speech regions. 125 | """ 126 | data = self.data 127 | detected_windows = self.detect_speech() 128 | data_speech = np.zeros(len(data)) 129 | it = np.nditer(detected_windows[:, 0], flags=['f_index']) 130 | while not it.finished: 131 | data_speech[int(it[0])] = data[int(it[0])] * detected_windows[it.index, 1] 132 | it.iternext() 133 | plt.figure(figsize=(200, 10)) 134 | plt.plot(data_speech) 135 | plt.show() 136 | plt.figure(figsize=(200, 10)) 137 | plt.plot(data) 138 | plt.show() 139 | return self 140 | 141 | def detect_speech(self): 142 | """ Detects speech regions based on ratio between speech band energy 143 | and total energy. 144 | Output is array of window numbers and speech flags (1 - speech, 0 - nonspeech). 
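        Each analysis window spans sample_window seconds and the window start
        advances by sample_overlap seconds. A window counts as speech when the
        energy inside [speech_start_band, speech_end_band] Hz exceeds
        speech_energy_threshold of its total energy; the per-window flags are
        then median-filtered over speech_window / sample_window windows to
        suppress isolated flips.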
145 | """ 146 | detected_windows = np.array([]) 147 | sample_window = int(self.rate * self.sample_window) 148 | sample_overlap = int(self.rate * self.sample_overlap) 149 | data = self.data 150 | sample_start = 0 151 | start_band = self.speech_start_band 152 | end_band = self.speech_end_band 153 | while (sample_start < (len(data) - sample_window)): 154 | sample_end = sample_start + sample_window 155 | if sample_end >= len(data): sample_end = len(data) - 1 156 | data_window = data[sample_start:sample_end] 157 | energy_freq = self._calculate_normalized_energy(data_window) 158 | sum_voice_energy = self._sum_energy_in_band(energy_freq, start_band, end_band) 159 | sum_full_energy = sum(energy_freq.values()) 160 | speech_ratio = sum_voice_energy / sum_full_energy 161 | # Hipothesis is that when there is a speech sequence we have ratio of energies more than Threshold 162 | speech_ratio = speech_ratio > self.speech_energy_threshold 163 | detected_windows = np.append(detected_windows, [sample_start, speech_ratio]) 164 | sample_start += sample_overlap 165 | detected_windows = detected_windows.reshape(int(len(detected_windows) / 2), 2) 166 | detected_windows[:, 1] = self._smooth_speech_detection(detected_windows) 167 | return detected_windows 168 | -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/2001161359.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "is_face.base_model_path": "2001161359_bs_trans__ti_a8u2m2mh2f2__lrd150_augN01A05_brd03__par1222pyr5_ks753_dsS_dbCS1x1_sz16mul11236__split800_4106000_weights.130-0.39-model.h5" 4 | }, 5 | "model_metadata": { 6 | "input": { 7 | "resolution": [ 8 | 256, 9 | 256 10 | ] 11 | }, 12 | "outputs": { 13 | "position": { 14 | "resolution": [ 15 | 256, 16 | 256 17 | ] 18 | }, 19 | "blendshapes": { 20 | "names": [ 21 | "browOutterUpLeft", 22 | "browInnerUp", 23 | "browDownLeft", 24 | "eyeBlinkLeft", 25 | "eyeSquintLeft", 26 | "eyeWideLeft", 27 | "eyeLookUpLeft", 28 | "eyeLookOutLeft", 29 | "eyeLookInLeft", 30 | "eyeLookDownLeft", 31 | "noseSneerLeft", 32 | "mouthUpperUpLeft", 33 | "mouthSmileLeft", 34 | "mouthLeft", 35 | "mouthFrownLeft", 36 | "mouthLowerDownLeft", 37 | "jawLeft", 38 | "cheekPuff", 39 | "mouthShrugUpper", 40 | "mouthFunnel", 41 | "mouthRollLower", 42 | "jawOpen", 43 | "tongueOut", 44 | "mouthPucker", 45 | "mouthRollUpper", 46 | "jawRight", 47 | "mouthLowerDownRight", 48 | "mouthFrownRight", 49 | "mouthRight", 50 | "mouthSmileRight", 51 | "mouthUpperUpRight", 52 | "noseSneerRight", 53 | "eyeLookDownRight", 54 | "eyeLookInRight", 55 | "eyeLookOutRight", 56 | "eyeLookUpRight", 57 | "eyeWideRight", 58 | "eyeSquintRight", 59 | "eyeBlinkRight", 60 | "browDownRight", 61 | "browInnerUp", 62 | "browOutterUpRight" 63 | ], 64 | "count": 42 65 | }, 66 | "transforms": { 67 | "elements": [ 68 | "quat", 69 | "headCenter3", 70 | "scale1", 71 | "nose3", 72 | "rightEar3", 73 | "leftEar3", 74 | "euler3" 75 | ], 76 | "field_count": 20, 77 | "scale_normalization": { 78 | "scale": 2, 79 | "offset": 0.600000023841858 80 | }, 81 | "euler_rotation_ranges": [ 82 | 45, 83 | 88, 84 | 40 85 | ] 86 | } 87 | } 88 | } 89 | } -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/2001161359.tflite: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/mocap4face/2001161359.tflite -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/__pycache__/mocap4face.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/data_generate/generate_datasets_v2/mocap4face/__pycache__/mocap4face.cpython-39.pyc -------------------------------------------------------------------------------- /data_generate/generate_datasets_v2/mocap4face/mocap4face.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import tensorflow as tf 4 | import numpy as np 5 | import cv2 6 | import mediapipe as mp 7 | import time 8 | 9 | addr = '192.168.38.6' 10 | 11 | blendershapes_index_map = [ 12 | "None", 13 | "browInnerUp", 14 | "browOutterUpLeft", 15 | "browOutterUpRight", 16 | "browDownLeft", 17 | "browDownRight", 18 | "eyeWideLeft", 19 | "eyeWideRight", 20 | "cheekSquintLeft", 21 | "cheekSquintRight", 22 | "eyeSquintLeft", 23 | "eyeSquintRight", 24 | "noseSneerLeft", 25 | "noseSneerRight", 26 | "mouthUpperUpLeft", 27 | "mouthUpperUpRight", 28 | "mouthLeft", 29 | "mouthRight", 30 | "mouthSmileLeft", 31 | "mouthSmileRight", 32 | "mouthDimpleLeft", 33 | "mouthDimpleRight", 34 | "mouthFrownLeft", 35 | "mouthFrownRight", 36 | "mouthLowerDownLeft", 37 | "mouthLowerDownRight", 38 | "mouthShrugLower", 39 | "mouthShrugUpper", 40 | "mouthPucker", 41 | "mouthStretchLeft", 42 | "mouthStretchRight", 43 | "mouthFunnel", 44 | "mouthPress", 45 | "jawOpen", 46 | "mouthRollLower", 47 | "mouthRollUpper", 48 | "jawForward", 49 | "jawLeft", 50 | "jawRight", 51 | "cheekPuff", 52 | "eyeBlinkLeft", 53 | "eyeBlinkRight", 54 | "eyeLookDownLeft", 55 | "eyeLookDownRight", 56 | "eyeLookInLeft", 57 | "eyeLookInRight", 58 | "eyeLookOutLeft", 59 | "eyeLookOutRight", 60 | "eyeLookUpLeft", 61 | "eyeLookUpRight", 62 | "mouthPressLeft", 63 | "mouthPressRight", 64 | "headDown", 65 | "headLeft", 66 | "headRight", 67 | "headRollLeft", 68 | "headRollRight", 69 | "headUp", 70 | "tongueOut", 71 | ] 72 | body1 = """{"frame":81,"timestamp":1653020274303}""" 73 | body2 = 
"""#{"cmdList":[{"k":0,"v":{"x":-0.16915,"y":0.44524,"z":-0.14412},"visibility":0.99242},{"k":1,"v":{"x":-0.23624,"y":0.28103,"z":-0.13774},"visibility":0.80215},{"k":2,"v":{"x":-0.25922,"y":0.1164,"z":-0.16851},"visibility":0.3386},{"k":3,"v":{"x":-0.24703,"y":0.10263,"z":-0.18967},"visibility":0.35001},{"k":4,"v":{"x":-0.25588,"y":0.06091,"z":-0.18511},"visibility":0.27888},{"k":5,"v":{"x":0.11057,"y":0.48848,"z":-0.03623},"visibility":0.99176},{"k":6,"v":{"x":0.3029,"y":0.40632,"z":-0.03385},"visibility":0.66099},{"k":7,"v":{"x":0.43823,"y":0.3376,"z":-0.17424},"visibility":0.56821},{"k":8,"v":{"x":0.42734,"y":0.3283,"z":-0.19942},"visibility":0.57001},{"k":9,"v":{"x":0.47437,"y":0.31352,"z":-0.20123},"visibility":0.47294},{"k":10,"v":{"x":0.04314,"y":0.64238,"z":-0.10405},"visibility":0.99465},{"k":11,"v":{"x":0.02436,"y":0.66783,"z":-0.2133},"visibility":0.99679},{"k":12,"v":{"x":-0.08512,"y":0.63474,"z":-0.14779},"visibility":0.9982},{"k":13,"v":{"x":-0.00396,"y":0.66814,"z":-0.22631},"visibility":0.99743},{"k":14,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":15,"v":{"x":-0.10621,"y":0.00281,"z":0.00741},"visibility":1.0},{"k":16,"v":{"x":-0.09716,"y":-0.37997,"z":0.00613},"visibility":1.0},{"k":17,"v":{"x":-0.08832,"y":-0.73587,"z":0.19865},"visibility":1.0},{"k":18,"v":{"x":-0.13203,"y":-0.85234,"z":0.09052},"visibility":1.0},{"k":19,"v":{"x":0.10582,"y":-0.00233,"z":-0.00702},"visibility":1.0},{"k":20,"v":{"x":0.12449,"y":-0.38901,"z":0.0043},"visibility":1.0},{"k":21,"v":{"x":0.15222,"y":-0.72213,"z":0.18485},"visibility":1.0},{"k":22,"v":{"x":0.18626,"y":-0.83152,"z":0.06556},"visibility":1.0},{"k":23,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":24,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":25,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":26,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781},{"k":27,"v":{"x":0.01971,"y":0.63476,"z":-0.22465},"visibility":0.99781}],"status":0,"valid":1}""" 74 | 75 | 76 | def resize_img_keep_ratio(img, target_size=(800, 800)): 77 | old_size = img.shape[0:2] 78 | ratio = min(float(target_size[i]) / (old_size[i]) for i in range(len(old_size))) 79 | new_size = tuple([int(i * ratio) for i in old_size]) 80 | img = cv2.resize(img, (new_size[1], new_size[0])) 81 | pad_w = target_size[1] - new_size[1] 82 | pad_h = target_size[0] - new_size[0] 83 | top, bottom = pad_h // 2, pad_h - (pad_h // 2) 84 | left, right = pad_w // 2, pad_w - (pad_w // 2) 85 | img_new = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, None, (255, 255, 255)) 86 | return img_new 87 | 88 | 89 | class MediapipeFaceDetection: 90 | def __init__(self, tflite_path="./2001161359.tflite", json_path="./2001161359.json"): 91 | self.face_det = self.MediapipeInit() 92 | self.tfliteInit(tflite_path) 93 | self.getMocapDict(json_path) 94 | 95 | def tfliteInit(self, tflite_file): 96 | # Initialize the interpreter 97 | self.interpreter = tf.lite.Interpreter(model_path=str(tflite_file)) 98 | self.interpreter.allocate_tensors() 99 | 100 | self.input_details = self.interpreter.get_input_details()[0] 101 | self.blendershapes_output_details = self.interpreter.get_output_details()[0] 102 | self.transforms_output_details = self.interpreter.get_output_details()[1] 103 | 104 | def MediapipeInit(self): 105 | face_det = mp.solutions.face_detection.FaceDetection( 106 | min_detection_confidence=0.5, 107 | model_selection=0 108 | ) 109 | return face_det 110 | 111 | def 
MediapipeRun(self, image, return_face=False): 112 | # Convert the BGR image to RGB before processing. 113 | results = self.face_det.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) 114 | if results.detections is None: 115 | return None, None 116 | h, w, _ = image.shape 117 | face = results.detections[0] 118 | cx = int(face.location_data.relative_bounding_box.xmin * w) 119 | cy = int(face.location_data.relative_bounding_box.ymin * h) 120 | height = int(face.location_data.relative_bounding_box.height * h) 121 | width = int(face.location_data.relative_bounding_box.width * w) 122 | 123 | side_length = max(height, width) + 60 124 | 125 | y_start = int(max((cy + cy + height) / 2 - side_length / 2, 0.)) 126 | y_end = int(min(y_start + side_length, h)) 127 | x_start = int(max((cx + cx + width) / 2 - side_length / 2, 0.)) 128 | x_end = int(min(x_start + side_length, w)) 129 | 130 | # face_image = image[cy:cy+height, cx:cx+width] 131 | face_image = image[y_start:y_end, x_start:x_end] 132 | 133 | # test_image = face_image 134 | test_image = cv2.resize(face_image, (256, 256)) 135 | # test_image = resize_img_keep_ratio(face_image, (256, 256)) 136 | s1 = time.time() 137 | test_image = np.expand_dims(test_image, axis=0).astype(self.input_details["dtype"]) / 255. 138 | self.interpreter.set_tensor(self.input_details["index"], test_image) 139 | self.interpreter.invoke() 140 | blendershapes = self.interpreter.get_tensor(self.blendershapes_output_details["index"])[0] * 100 141 | transforms = self.interpreter.get_tensor(self.transforms_output_details["index"]) 142 | # print(time.time() - s1) 143 | if return_face: 144 | return blendershapes, transforms, test_image 145 | return blendershapes, transforms 146 | 147 | def MediapipeRunWithoutFaceDetect(self, image): 148 | # Convert the BGR image to RGB before processing. 149 | face_image = image 150 | test_image = cv2.resize(face_image, (256, 256)) 151 | # test_image = resize_img_keep_ratio(face_image, (256, 256)) 152 | s1 = time.time() 153 | test_image = np.expand_dims(test_image, axis=0).astype(self.input_details["dtype"]) / 255. 
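        # The TFLite model takes a single 256x256 face crop with a batch dimension,
        # cast to the interpreter's input dtype and scaled to [0, 1]; the blendshape
        # tensor read back below is multiplied by 100 before being returned.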
154 | self.interpreter.set_tensor(self.input_details["index"], test_image) 155 | self.interpreter.invoke() 156 | blendershapes = self.interpreter.get_tensor(self.blendershapes_output_details["index"])[0] * 100 157 | transforms = self.interpreter.get_tensor(self.transforms_output_details["index"]) 158 | # print(time.time() - s1) 159 | return blendershapes, transforms 160 | 161 | def jsonFormat(self, prediction): 162 | json_kv = {} 163 | for idx, emoji_val in enumerate(prediction): 164 | emoji_name = self.mocap[idx] 165 | json_kv[emoji_name] = emoji_val 166 | return json_kv 167 | 168 | def getMocapDict(self, path): 169 | with open(path, 'r') as f: 170 | j = json.load(f) 171 | self.mocap = j['model_metadata']['outputs']['blendshapes']['names'] 172 | self.blendershapes_map = {} 173 | for index, bs in enumerate(blendershapes_index_map): 174 | self.blendershapes_map[bs] = index 175 | 176 | 177 | if __name__ == "__main__": 178 | fmp = MediapipeFaceDetection() 179 | capture = cv2.VideoCapture(0) 180 | if not capture.isOpened(): 181 | print("打开视频失败!") 182 | 183 | fps = capture.get(cv2.CAP_PROP_FPS) 184 | size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), 185 | int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))) 186 | fNUMS = capture.get(cv2.CAP_PROP_FRAME_COUNT) 187 | print("fps:", fps) 188 | print("size:", size) 189 | print("fNUMS:", fNUMS) 190 | 191 | f_cnt = 0 192 | time_cnt = 0 193 | while True: 194 | _, frame = capture.read() 195 | if frame is None: 196 | break 197 | f_cnt += 1 198 | res, _ = fmp.MediapipeRun(frame) 199 | face_json = fmp.jsonFormat(res) 200 | 201 | capture.release() 202 | -------------------------------------------------------------------------------- /datasets/__pycache__/dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/datasets/__pycache__/dataset.cpython-39.pyc -------------------------------------------------------------------------------- /datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from torch.utils.data import Dataset 4 | import numpy as np 5 | import torch 6 | import cv2 7 | import sys 8 | sys.path.append(".") 9 | sys.path.append("..") 10 | import random 11 | import torchvision.transforms as transforms 12 | import json 13 | import matplotlib.pyplot as plt 14 | from PIL import Image 15 | import math 16 | 17 | class AudioDataset(Dataset): 18 | def __init__(self, target_root, data_root): 19 | """ 20 | :param window: 音频序列的长度为3 21 | """ 22 | self.target_root = target_root 23 | self.data_root = data_root 24 | 25 | self.all_data = [] 26 | self.all_gt = [] 27 | 28 | self.pre_process() 29 | 30 | 31 | def vector_transforms(self, data): 32 | # option(1) 这个是全局的mean和std 33 | # data_mean = np.mean(data) 34 | # data_std = np.std(data) 35 | 36 | # option(2) 这个是针对每个特征的mean和std 37 | num_length = data.shape[-1] 38 | data_mean = np.mean(data.reshape(-1, num_length), axis=0, keepdims=True)[np.newaxis, ...] 39 | data_std = np.std(data.reshape(-1, num_length), axis=0, keepdims=True)[np.newaxis, ...] 
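        # Per-feature z-score: mean and std are computed over all frames for each of
        # the num_length feature columns and broadcast back over the batch below.
        # A feature that never changes gives std == 0, so adding a small epsilon to
        # data_std would be a safe extra guard.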
40 | 41 | # 数据标准化 42 | data = (data - data_mean) / data_std 43 | 44 | return data 45 | 46 | def pre_process(self): 47 | """ 48 | 对数据进行预处理,收集数据 49 | :return: 50 | """ 51 | data_list = os.listdir(self.data_root) 52 | # target_list = os.listdir(self.target_root) 53 | 54 | # for index, item in enumerate(data_list): 55 | # assert item == target_list[index] 56 | 57 | for index, data_name in enumerate(data_list): 58 | data_path = os.path.join(self.data_root, data_name) 59 | target_path = os.path.join(self.target_root, data_name) 60 | data = np.load(data_path) 61 | gt = np.load(target_path) 62 | 63 | # 无口型的片段全部去除,可能是没有人脸,或者噪声数据 64 | # 静音片段的gt也置零 65 | gt_sum = gt.sum(axis=1) 66 | zero_index = np.where(gt_sum == 0)[0] 67 | # 按概率将0标签的输入也置零 68 | # option(2) 69 | if len(zero_index) > 0: 70 | data[zero_index] = 0 71 | 72 | # # option(1) 73 | # select_data = [] 74 | # select_gt = [] 75 | # for i in range(data.shape[0]): 76 | # if i not in zero_index: 77 | # select_data.append(data[i][np.newaxis, ...]) 78 | # select_gt.append(gt[i][np.newaxis, ...]) 79 | # data = np.concatenate(select_data, axis=0) 80 | # gt = np.concatenate(select_gt, axis=0) 81 | 82 | 83 | data = self.vector_transforms(data) 84 | 85 | padding_data = np.zeros(data[0].shape)[np.newaxis, ...] 86 | padding_gt = np.zeros(gt[0].shape)[np.newaxis, ...] 87 | self.all_data.append(data) 88 | self.all_data.append(padding_data) 89 | self.all_gt.append(gt) 90 | self.all_gt.append(padding_gt) 91 | 92 | # 第一个vector是过零率 93 | self.all_data = np.concatenate(self.all_data, axis=0)[:, np.newaxis, :, :] 94 | self.all_gt = np.concatenate(self.all_gt, axis=0) 95 | 96 | def __len__(self): 97 | return len(self.all_data) 98 | 99 | def __getitem__(self, index): 100 | return torch.FloatTensor(np.array(self.all_data[index], dtype=np.float32)), torch.FloatTensor(np.array(self.all_gt[index], dtype=np.float32)) 101 | 102 | if __name__ == "__main__": 103 | target_root = "E:/datasets/audio2face/train_gt" 104 | data_root = "E:/datasets/audio2face/train_data" 105 | trainsets = AudioDataset(target_root, data_root) 106 | trainloader = torch.utils.data.DataLoader(trainsets, batch_size=8, shuffle=True, num_workers=0) 107 | 108 | for batch_idx, (datas, targets) in enumerate(trainloader): 109 | pass 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /model_weights/2001161359.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/model_weights/2001161359.tflite -------------------------------------------------------------------------------- /models/__pycache__/mouth_net.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/models/__pycache__/mouth_net.cpython-39.pyc -------------------------------------------------------------------------------- /models/mouth_net.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | 5 | class Swish(nn.Module): 6 | def __init__(self): 7 | super(Swish, self).__init__() 8 | 9 | def forward(self, x): 10 | x = x * F.sigmoid(x) 11 | return x 12 | 13 | class MouthNet(nn.Module): 14 | def __init__(self, class_num=16): 15 | super(MouthNet, self).__init__() 16 | 17 | # 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 18 | encoder1 = [] 19 | layer1 = [] 20 | layer1.append(nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 1), stride=(2, 1), padding=0)) 21 | layer1.append(nn.BatchNorm2d(32)) 22 | layer1.append(Swish()) 23 | layer1 = nn.Sequential(*layer1) 24 | encoder1.append(layer1) 25 | 26 | layer2 = [] 27 | layer2.append(nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 1), stride=(4, 1), padding=0)) 28 | layer2.append(nn.BatchNorm2d(64)) 29 | layer2.append(Swish()) 30 | layer2 = nn.Sequential(*layer2) 31 | encoder1.append(layer2) 32 | 33 | layer3 = [] 34 | layer3.append(nn.Conv2d(in_channels=64, out_channels=96, kernel_size=(3, 1), stride=(4, 1), padding=0)) 35 | layer3.append(nn.BatchNorm2d(96)) 36 | layer3.append(Swish()) 37 | layer3 = nn.Sequential(*layer3) 38 | encoder1.append(layer3) 39 | 40 | self.encoder1 = nn.Sequential(*encoder1) 41 | 42 | # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 43 | encoder2 = [] 44 | layer1 = [] 45 | layer1.append(nn.Conv2d(in_channels=97, out_channels=128, kernel_size=(1, 3), stride=(1, 3), padding=0)) 46 | layer1.append(nn.BatchNorm2d(128)) 47 | layer1.append(Swish()) 48 | layer1 = nn.Sequential(*layer1) 49 | encoder2.append(layer1) 50 | 51 | layer2 = [] 52 | layer2.append(nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(1, 2), stride=(1, 2), padding=0)) 53 | layer2.append(nn.BatchNorm2d(128)) 54 | layer2.append(Swish()) 55 | layer2 = nn.Sequential(*layer2) 56 | encoder2.append(layer2) 57 | 58 | layer3 = [] 59 | layer3.append(nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(1, 2), stride=(1, 2), padding=0)) 60 | layer3.append(nn.BatchNorm2d(128)) 61 | layer3.append(Swish()) 62 | layer3 = nn.Sequential(*layer3) 63 | encoder2.append(layer3) 64 | 65 | self.encoder2 = nn.Sequential(*encoder2) 66 | 67 | regression = [] 68 | regression.append(nn.Linear(128, 64)) 69 | regression.append(nn.Dropout(0.5)) 70 | regression.append(nn.BatchNorm1d(64)) 71 | regression.append(Swish()) 72 | regression.append(nn.Linear(64, class_num)) 73 | self.regression = nn.Sequential(*regression) 74 | 75 | 76 | def forward(self, x): 77 | # 前向传播 78 | feat_zc = x[:, :, 0, :].unsqueeze(-2) 79 | feat = x[:, :, 1:, :] 80 | encoder1 = self.encoder1(feat) 81 | x2 = torch.cat([encoder1, feat_zc], 1) 82 | encoder2 = self.encoder2(x2) 83 | encoder2 = torch.flatten(encoder2, 1) 84 | 85 | bs = self.regression(encoder2) 86 | 87 | return bs 88 | 89 | 90 | if __name__ == "__main__": 91 | print('##############PyTorch################') 92 | net = MouthNet(class_num=16) 93 | x = torch.randn((8, 1, 33, 13)) 94 | y = net(x) -------------------------------------------------------------------------------- /third_part/LPC.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/third_part/LPC.dll -------------------------------------------------------------------------------- /third_part/__pycache__/moCapFace.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Allen-lz/audio2face_pytorch/8ebb1bc02d590f23a6b5df634f67dece9f97fcdb/third_part/__pycache__/moCapFace.cpython-39.pyc -------------------------------------------------------------------------------- /third_part/moCapFace.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | 这是mocapface中抠出的tflite模型 3 | """ 4 | """ 5 | 头部姿态估计模型 6 | """ 7 | import numpy as np 8 | import cv2 9 | import json 10 | import tensorflow as tf 11 | class MoCapFace(object): 12 | def __init__(self): 13 | """ 14 | 使用.tflite来初始化tflite的模型 15 | """ 16 | self.tfliteInit('model_weights/2001161359.tflite') 17 | 18 | def tfliteInit(self, tflite_file): 19 | # Initialize the interpreter 20 | # 初始化解释器 21 | self.interpreter = tf.lite.Interpreter(model_path=tflite_file) 22 | # 为tensor分配显存 23 | self.interpreter.allocate_tensors() 24 | 25 | # 得到输入的place_hoder 26 | self.input_details = self.interpreter.get_input_details()[0] 27 | # 获得输出的hooker 28 | self.blendershapes_output_details = self.interpreter.get_output_details()[0] # bs系数 29 | # self.transforms_output_details = self.interpreter.get_output_details()[1] # 头部朝向 30 | 31 | def forword(self, img): 32 | """ 33 | img: numpy bgr 34 | :param img: 35 | :return: 36 | """ 37 | test_image = cv2.resize(img, [256, 256]) 38 | test_image = np.expand_dims(test_image, axis=0).astype(self.input_details["dtype"]) / 255. 39 | self.interpreter.set_tensor(self.input_details["index"], test_image) 40 | self.interpreter.invoke() 41 | res = self.interpreter.get_tensor(self.blendershapes_output_details["index"])[0].tolist() 42 | return res 43 | 44 | if __name__ == "__main__": 45 | mocapface = MoCapFace() 46 | with open("all_image_path.json", 'r') as im_f: 47 | images_f = json.load(im_f) 48 | 49 | for img_path in images_f.values(): 50 | img = cv2.imread(img_path) 51 | res = mocapface.forword(img) 52 | print(len(res)) 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /train/coach_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 这个脚本是用来训练面部轮廓的 3 | """ 4 | import sys 5 | sys.path.append(".") 6 | sys.path.append("..") 7 | from models.mouth_net import MouthNet 8 | from torch.utils.tensorboard import SummaryWriter 9 | from datasets.dataset import * 10 | from configs.config_v1 import config as cfg 11 | from torch.utils.data import DataLoader 12 | import os 13 | import torch.optim as optim 14 | import torch.nn as nn 15 | os.environ['CUDA_VISIBLE_DEVICES'] = cfg['gpu_ids'] 16 | 17 | class Coach: 18 | def __init__(self): 19 | self.global_test_loss = float('Inf') 20 | 21 | # 得到配置文件 22 | self.cfg = cfg 23 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 24 | 25 | # 创建主要的网络 26 | self.net = MouthNet(class_num=cfg['class_num']).to(self.device) 27 | self.net.train() 28 | 29 | if cfg['ckpt'] != "": 30 | ckpt = torch.load(cfg['ckpt']) 31 | # 使用不严格的weight加载方式, 并且舍弃shape mismatch的 32 | pretrain_state_dict = ckpt 33 | net_state_dict = self.net.state_dict() 34 | for key in net_state_dict: 35 | if key in pretrain_state_dict.keys(): 36 | if net_state_dict[key].shape != pretrain_state_dict[key].shape: 37 | pretrain_state_dict.pop(key) 38 | self.net.load_state_dict(pretrain_state_dict, strict=False) 39 | 40 | # 使用多卡训练 41 | if torch.cuda.device_count() > 1: 42 | print("Let's use ", torch.cuda.device_count(), "GPUs.") 43 | self.net = nn.DataParallel(self.net) 44 | 45 | # 创建训练日志 46 | if not os.path.exists("experiment/logs"): 47 | os.makedirs("experiment/logs") 48 | self.logger = SummaryWriter(log_dir='./experiment/logs') 49 | 50 | # 创建数据集 51 | trainsets = AudioDataset(cfg['train_target_root'], cfg['train_data_root']) 52 | 
self.trainloader = torch.utils.data.DataLoader(trainsets, batch_size=self.cfg['train_batch_size'], shuffle=True, 53 | num_workers=cfg['num_workers']) 54 | 55 | testsets = AudioDataset(cfg['val_target_root'], cfg['val_data_root']) 56 | self.testloader = torch.utils.data.DataLoader(testsets, batch_size=self.cfg['train_batch_size'], shuffle=True, 57 | num_workers=cfg['num_workers']) 58 | 59 | # 创建优化器(记得加上正则化的参数) 60 | self.optimizer = optim.Adam(self.net.parameters(), lr=self.cfg['lr'], betas=(0.9, 0.999), eps=1e-08, 61 | weight_decay=0.0001) 62 | 63 | # 创建损失函数 64 | self.MSELoss = torch.nn.MSELoss(reduction='sum') 65 | 66 | def MAELoss(self, pred, target): 67 | """ 68 | 2022.05.17增加了一个MAELoss, 这个loss对异常值比较敏感 69 | """ 70 | loss = torch.sum(torch.abs(pred - target), dim=-1) 71 | loss = torch.mean(loss) 72 | return loss 73 | 74 | def criterion(self, pred, target): 75 | loss = self.MSELoss(pred, target) 76 | return loss 77 | 78 | def update_optimizer_lr(self, optimizer, lr): 79 | """ 80 | 为了动态更新learning rate, 加快训练速度 81 | :param optimizer: torch.optim type 82 | :param lr: learning rate 83 | :return: 84 | """ 85 | for group in optimizer.param_groups: 86 | group['lr'] = lr 87 | 88 | def train(self): 89 | iter_num = 0 90 | mean_loss = 0 91 | for i in range(self.cfg['epoch']): 92 | for idx, (datas, targets) in enumerate(self.trainloader): 93 | iter_num += 1 94 | 95 | datas, targets = datas.to(self.device), targets.to(self.device) 96 | self.optimizer.zero_grad() 97 | outputs = self.net(datas) 98 | 99 | # 计算损失 100 | loss = self.criterion(outputs, targets) 101 | mean_loss += loss.item() 102 | loss.backward() 103 | self.optimizer.step() 104 | # 打印loss 105 | if iter_num % self.cfg['print_loss'] == 0: 106 | mean_loss = mean_loss / self.cfg['print_loss'] 107 | # mean_loss = np.array(mean_loss.detach().cpu()) 108 | print("lr = {} total iteration {} epoch {}, iteration {}, loss = {}".format(str(round(self.optimizer.param_groups[0]['lr'], 6)), 109 | str(iter_num), str(i), str(idx), 110 | str(round(mean_loss, 6)))) 111 | self.logger.add_scalar('{}/{}'.format('train', 'loss'), mean_loss, int(iter_num)) 112 | mean_loss = 0 113 | # test 114 | if iter_num % self.cfg['val_interval'] == 0: 115 | self.net.eval() 116 | self.eval(i, idx) 117 | self.net.train() 118 | 119 | # lr decay 120 | # 2022.05.17调整lr下降的幅度, 之前是0.01, 现在是0.9 or 0.5 121 | # (可能是因为lr太大导致后期的训练波动, 使得eval loss比train loss大) 122 | if (iter_num - self.cfg['warmup_steps']) % self.cfg['lr_update_interval'] == 0: 123 | lr = self.optimizer.param_groups[0]['lr'] * 0.9 124 | self.update_optimizer_lr(self.optimizer, lr) 125 | 126 | elif iter_num < self.cfg['warmup_steps']: 127 | lr = self.optimizer.param_groups[0]['lr'] * (iter_num / self.cfg['warmup_steps']) 128 | self.update_optimizer_lr(self.optimizer, lr) 129 | 130 | def eval(self, epoch, iteration): 131 | 132 | test_loss = 0 133 | test_num = 0 134 | for idx, (datas, targets) in enumerate(self.testloader): 135 | test_num += 1 136 | 137 | datas, targets = datas.to(self.device), targets.to(self.device) 138 | 139 | with torch.no_grad(): 140 | outputs = self.net(datas) 141 | 142 | # 计算损失 143 | loss = self.criterion(outputs, targets) 144 | test_loss += loss.item() 145 | 146 | if test_num > 20: 147 | break 148 | 149 | test_loss = test_loss / test_num 150 | if test_loss < self.global_test_loss: 151 | self.global_test_loss = test_loss 152 | 153 | if not os.path.exists("experiment/checkpoints"): 154 | os.makedirs("experiment/checkpoints") 155 | 156 | torch.save(self.net.state_dict(), 157 | 
os.path.join("experiment/checkpoints", 'best_model_loss_{}.pth'.format(str(round(test_loss, 2))))) 158 | 159 | self.logger.add_scalar('{}/{}'.format('test', 'loss'), test_loss, epoch) 160 | print("lr = {} epoch {}, iteration {}, eval loss = {}".format(str(round(self.optimizer.param_groups[0]['lr'], 6)), str(epoch), str(iteration), str(round(test_loss, 6)))) 161 | 162 | if __name__ == '__main__': 163 | coach = Coach() 164 | coach.train() --------------------------------------------------------------------------------
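
Usage note (not part of the repository): as a rough sketch of querying a trained model, the snippet below builds `MouthNet` with `class_num=16`, loads a checkpoint produced by `train/coach_v1.py`, and runs one dummy feature frame through it. The checkpoint path is only an example, PyTorch must be installed, and the script assumes it is run from the repository root; the 16 outputs presumably line up with `SELECT_VALID_BS_NAME` in `data_generate/generate_datasets_v1.py`.

```python
# inference_sketch.py -- illustrative only; the checkpoint path is a placeholder.
import torch

from models.mouth_net import MouthNet

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# class_num=16 follows configs/config_v1.py.
net = MouthNet(class_num=16).to(device)

state = torch.load("experiment/checkpoints/best_model_loss_2.51.pth", map_location=device)
# Strip the "module." prefix in case the weights came from a DataParallel run.
state = {k.replace("module.", "", 1): v for k, v in state.items()}
net.load_state_dict(state)
net.eval()

# One frame of audio features, shaped like the dummy input in models/mouth_net.py:
# (batch, channel, 33, 13); row 0 carries the zero-crossing feature, rows 1-32 the rest.
features = torch.randn(1, 1, 33, 13, device=device)
with torch.no_grad():
    bs = net(features)  # (1, 16) mouth blendshape coefficients
print(bs.shape)
```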
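Similarly, a minimal sketch of running the energy-based VAD on a single clip; `some_clip.wav` is a placeholder, and the `sys.path` line assumes the repository root is the working directory.

```python
# vad_demo.py -- illustrative only; "some_clip.wav" is a placeholder path.
import sys
sys.path.append("data_generate/generate_datasets_v2/features")

from vad import VoiceActivityDetector

detector = VoiceActivityDetector("some_clip.wav")   # 20 ms windows, 10 ms hop by default
windows = detector.detect_speech()                  # (N, 2): [window start sample, speech flag]
segments = detector.convert_windows_to_readible_labels(windows)
for seg in segments:
    print(seg["speech_begin"], seg["speech_end"])
```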