import os

import numpy as np


class MySlience(object):
    """Detect silent stretches in .wav files from MFCC0 and cut them out.

    Every ``.wav`` file found under ``src_dir`` is analysed, a three-panel
    figure of the detection is saved as SVG, and a copy of the audio with
    the silent stretches removed is written into ``des_dir``.
    """

    # Analysis parameters (previously hard-coded inside ``slience``).
    SAMPLE_RATE = 16000      # rate the audio is resampled to for analysis
    N_FFT = 1024             # analysis frame length in samples
    HOP_LENGTH = 512         # hop between consecutive frames in samples
    MEDIAN_KERNEL = 9        # median-filter width applied to MFCC0
    MIN_SILENT_FRAMES = 5    # a run is kept only if end - start exceeds this
    THRESHOLD_RATIO = 0.9    # threshold = min(smoothed MFCC0) * ratio

    def __init__(self, src_dir, des_dir):
        """Store the source and destination folders.

        Args:
            src_dir: folder (searched recursively) holding the input audio.
            des_dir: folder that receives the figures and the trimmed audio.
        """
        self.src_dir = src_dir
        self.des_dir = des_dir

    def file_name(self, file_dir):
        """Return the paths of all .wav files below ``file_dir``.

        The extension is matched case-insensitively, so ``.WAV`` files are
        found as well.
        """
        wav_paths = []
        for root, _dirs, files in os.walk(file_dir):
            for file in files:
                if os.path.splitext(file)[1].lower() == '.wav':
                    wav_paths.append(os.path.join(root, file))
        return wav_paths

    def demain(self):
        """Process every .wav file under ``self.src_dir`` into ``self.des_dir``."""
        for src_filename in self.file_name(self.src_dir):
            self.slience(src_filename, self.des_dir)

    def frame2Time(self, frameNum, framelen, inc, fs):
        """Return the centre time in seconds of each of ``frameNum`` frames.

        Args:
            frameNum: number of frames.
            framelen: frame length in samples.
            inc: hop between frames in samples.
            fs: sample rate in Hz.
        """
        return (np.arange(frameNum) * inc + framelen / 2) / fs

    @staticmethod
    def _base_name(filename):
        """Return the final path component, accepting '/' and '\\' separators."""
        # The original split on '\\' only, which kept the whole path on POSIX.
        return os.path.basename(filename.replace('\\', '/'))

    @staticmethod
    def _find_silent_runs(pic, threshold):
        """Return ``[start, end]`` frame pairs where ``pic`` is below ``threshold``.

        ``start == 0`` doubles as the "no run open" marker, so a run that
        begins at frame 0 is recorded as beginning at frame 1. ``_cut_silence``
        maps frame 1 to sample 0, so such a run is still cut from the very
        beginning of the file.
        """
        points = []
        if len(pic) == 0:
            return points
        start = 0
        end = 0
        for i in range(len(pic)):
            below = pic[i] < threshold
            if below and start == 0:
                start = i
            if below and start != 0:
                end = i
            elif pic[i] > threshold and start != 0:
                points.append([start, end])
                start = 0
        # A run still open at the end means the file finishes in silence.
        if pic[-1] < threshold and start != 0:
            points.append([start, end])
        return points

    @staticmethod
    def _cut_silence(data, segments, hop_length=512, frame_length=1024, scale=1.0):
        """Return ``data`` with the samples covered by ``segments`` removed.

        Args:
            data: samples, 1-D or (samples, channels); cut along axis 0.
            segments: ``[first_frame, last_frame]`` pairs in ascending order.
            hop_length: hop between frames, in analysis-rate samples.
            frame_length: frame length, in analysis-rate samples.
            scale: ratio ``data_rate / analysis_rate``; converts positions
                computed at the analysis rate to indices into ``data``.

        Frame ``k`` is taken to span analysis samples
        ``[(k - 1) * hop_length, (k - 1) * hop_length + frame_length)``,
        the same mapping the original slicing code used.
        """
        data = np.asarray(data)
        kept = []
        cursor = 0
        for first, last in segments:
            cut_from = int(round((first - 1) * hop_length * scale))
            cut_to = int(round(((last - 1) * hop_length + frame_length) * scale))
            cut_from = max(cut_from, cursor)
            kept.append(data[cursor:cut_from])
            cursor = max(cursor, cut_to)
        kept.append(data[cursor:])
        return np.concatenate(kept)

    def _plot_detection(self, plt, y, mfcc0, smoothed, distances):
        """Draw the waveform, raw MFCC0 and smoothed MFCC0 with cut markers.

        ``plt`` is the ``matplotlib.pyplot`` module. Returns the figure.
        """
        sr, n_fft, hop = self.SAMPLE_RATE, self.N_FFT, self.HOP_LENGTH
        sample_times = np.arange(len(y)) / sr
        frame_times = self.frame2Time(len(mfcc0), n_fft, hop, sr)
        t_max = frame_times.max()
        label_style = dict(fontsize=15, fontweight='bold')
        tick_style = dict(fontproperties='Times New Roman', size=15, weight='bold')

        fig = plt.figure(figsize=(10, 7))

        # (a) Waveform.
        plt.subplot(311)
        plt.plot(sample_times, y, 'black')
        plt.xlabel("Time/s\n (a) Voice waveform", **label_style)
        plt.ylabel("amplitude/V", **label_style)
        plt.yticks(**tick_style)
        plt.xticks(**tick_style)
        # NOTE(review): 0.8 * min clips the most negative part of the
        # waveform; kept as in the original figure.
        plt.axis([0, t_max, 0.8 * y.min(), 1.1 * y.max()])

        # (b) Raw MFCC0.
        plt.subplot(312)
        plt.plot(frame_times, mfcc0, 'black')
        plt.xlabel("Time/s\n (b) MFCC0 feature", **label_style)
        plt.ylabel("amplitude", **label_style)
        plt.yticks(**tick_style)
        plt.xticks(**tick_style)
        plt.axis([0, t_max, mfcc0.min() - 10, mfcc0.max() + 10])

        # (c) Median-filtered MFCC0 with the detected silent runs.
        low, high = smoothed.min() - 10, smoothed.max() + 10
        plt.subplot(313)
        plt.plot(frame_times, smoothed, 'black')
        plt.axis([0, t_max, low, high])
        plt.xlabel("Time/s\n (c) MFCC0 median filtering and Voice Activity Detection ", **label_style)
        plt.ylabel("amplitude", **label_style)
        plt.yticks(**tick_style)
        plt.xticks(**tick_style)

        # Indexing an empty array with [:, 0] raises IndexError, so the
        # markers are drawn only when at least one run was detected.
        if distances:
            bounds = np.array(distances)
            starts2time = (bounds[:, 0] * hop + n_fft / 2) / sr
            ends2time = (bounds[:, 1] * hop + n_fft / 2) / sr
            print(ends2time)
            plt.vlines(starts2time, low, high, colors="black", linestyles="solid", lw=2)
            plt.vlines(ends2time, low, high, colors="black", linestyles="dashed", lw=2.5)

        plt.rcParams['font.sans-serif'] = ['Times New Roman']
        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
        plt.tight_layout()  # keep the stacked axis labels from overlapping
        return fig

    def slience(self, filename, slice_dir):
        """Detect silent runs in ``filename``, plot them, and write trimmed audio.

        Step 1 thresholds the median-filtered MFCC0 track at
        ``min * THRESHOLD_RATIO`` and keeps runs longer than
        ``MIN_SILENT_FRAMES``; the figure is saved to
        ``<slice_dir>/<name>.svg`` and shown. Step 2 removes the detected
        runs from the audio and writes it to ``<slice_dir>/<name>`` at the
        file's own sample rate. If nothing is detected the audio is copied
        unchanged.

        Args:
            filename: path of the input audio file.
            slice_dir: output folder; created if it does not exist.
        """
        # Imported here so the class can be imported without the audio and
        # plotting libraries installed.
        import librosa
        import matplotlib.pyplot as plt
        import soundfile as sf
        from scipy.signal import medfilt

        print('当前在操作文件:', filename)

        sr, n_fft, hop = self.SAMPLE_RATE, self.N_FFT, self.HOP_LENGTH

        # Untouched samples at the file's own rate; these are what get cut.
        clean_data, native_sr = sf.read(filename)

        # Step 1: endpoint detection on MFCC0 at the analysis rate.
        y, _ = librosa.load(filename, sr=sr)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=24, n_fft=n_fft, hop_length=hop)
        mfcc0 = mfccs[0, :]
        smoothed = medfilt(mfcc0, self.MEDIAN_KERNEL)

        threshold = smoothed.min() * self.THRESHOLD_RATIO
        print(threshold)
        points = self._find_silent_runs(smoothed, threshold)
        # ``distances`` holds the runs that are actually cut.
        distances = [run for run in points if run[1] - run[0] > self.MIN_SILENT_FRAMES]
        print(distances, np.array(distances).shape)
        print(points)

        fig = self._plot_detection(plt, y, mfcc0, smoothed, distances)

        name = self._base_name(filename)
        os.makedirs(slice_dir, exist_ok=True)
        svg_name = os.path.join(slice_dir, name) + ".svg"
        plt.savefig(svg_name, bbox_inches='tight')
        plt.show()
        # Release the figure so memory does not grow across a batch.
        plt.close(fig)

        # Step 2: remove the detected silent runs from the audio.
        out_path = os.path.join(slice_dir, name)
        if not distances:
            print('检测到的静音段的个数为: %s 未对文件进行处理:' % len(distances))
            sf.write(out_path, clean_data, native_sr)
            return

        # Frame positions refer to the analysis rate; scale them to the
        # native rate of ``clean_data`` before slicing.
        trimmed = self._cut_silence(clean_data, distances, hop, n_fft, native_sr / sr)
        print('去除静音段的片段采样点个数为:', len(trimmed))
        sf.write(out_path, trimmed, native_sr)
# -*- coding:utf-8 -*-
"""Entry script for VAD_python: strip the silent segments from a folder of audio."""

from Myslience import MySlience


def main():
    """Run silence removal on the source folder, writing into the output folder."""
    # Input folder (searched recursively for .wav files) and output folder.
    source_folder = r"./切割源文件"
    output_folder = r"./切割后文件"

    remover = MySlience(source_folder, output_folder)
    remover.demain()


if __name__ == '__main__':
    main()