├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── chapter10_语音识别 ├── DTW │ ├── 10.1DTW孤立自识别.mdown │ ├── DTW.py │ ├── mysound.wav │ ├── p1 │ │ ├── 0.wav │ │ ├── 1.wav │ │ ├── 2.wav │ │ ├── 3.wav │ │ ├── 4.wav │ │ ├── 5.wav │ │ ├── 6.wav │ │ ├── 7.wav │ │ ├── 8.wav │ │ └── 9.wav │ ├── p2 │ │ ├── 0.wav │ │ ├── 1.wav │ │ ├── 2.wav │ │ ├── 3.wav │ │ ├── 4.wav │ │ ├── 5.wav │ │ ├── 6.wav │ │ ├── 7.wav │ │ ├── 8.wav │ │ └── 9.wav │ └── p3 │ │ ├── 0.wav │ │ ├── 1.wav │ │ ├── 2.wav │ │ ├── 3.wav │ │ ├── 4.wav │ │ ├── 5.wav │ │ ├── 6.wav │ │ ├── 7.wav │ │ ├── 8.wav │ │ └── 9.wav └── HMM │ ├── 10.2隐马尔科夫模型孤立字识别.mdown │ ├── HMM.py │ ├── hmm_gmm.py │ ├── rec_data.mat │ ├── test_data │ ├── 1_1.wav │ ├── 1_10.wav │ ├── 1_2.wav │ ├── 1_3.wav │ ├── 1_4.wav │ ├── 1_5.wav │ ├── 1_6.wav │ ├── 1_7.wav │ ├── 1_8.wav │ └── 1_9.wav │ ├── tra_data.mat │ └── trainding_data │ ├── 10_1.wav │ ├── 10_10.wav │ ├── 10_2.wav │ ├── 10_3.wav │ ├── 10_4.wav │ ├── 10_5.wav │ ├── 10_6.wav │ ├── 10_7.wav │ ├── 10_8.wav │ ├── 10_9.wav │ ├── 1_1.wav │ ├── 1_10.wav │ ├── 1_2.wav │ ├── 1_3.wav │ ├── 1_4.wav │ ├── 1_5.wav │ ├── 1_6.wav │ ├── 1_7.wav │ ├── 1_8.wav │ ├── 1_9.wav │ ├── 2_1.wav │ ├── 2_10.wav │ ├── 2_2.wav │ ├── 2_3.wav │ ├── 2_4.wav │ ├── 2_5.wav │ ├── 2_6.wav │ ├── 2_7.wav │ ├── 2_8.wav │ ├── 2_9.wav │ ├── 3_1.wav │ ├── 3_10.wav │ ├── 3_2.wav │ ├── 3_3.wav │ ├── 3_4.wav │ ├── 3_5.wav │ ├── 3_6.wav │ ├── 3_7.wav │ ├── 3_8.wav │ ├── 3_9.wav │ ├── 4_1.wav │ ├── 4_10.wav │ ├── 4_2.wav │ ├── 4_3.wav │ ├── 4_4.wav │ ├── 4_5.wav │ ├── 4_6.wav │ ├── 4_7.wav │ ├── 4_8.wav │ ├── 4_9.wav │ ├── 5_1.wav │ ├── 5_10.wav │ ├── 5_2.wav │ ├── 5_3.wav │ ├── 5_4.wav │ ├── 5_5.wav │ ├── 5_6.wav │ ├── 5_7.wav │ ├── 5_8.wav │ ├── 5_9.wav │ ├── 6_1.wav │ ├── 6_10.wav │ ├── 6_2.wav │ ├── 6_3.wav │ ├── 6_4.wav │ ├── 6_5.wav │ ├── 6_6.wav │ ├── 6_7.wav │ ├── 6_8.wav │ ├── 6_9.wav │ ├── 7_1.wav │ ├── 7_10.wav │ ├── 7_2.wav │ ├── 7_3.wav │ ├── 7_4.wav │ ├── 7_5.wav │ ├── 7_6.wav │ ├── 7_7.wav │ ├── 7_8.wav │ ├── 7_9.wav │ ├── 8_1.wav │ ├── 8_10.wav │ ├── 8_2.wav │ ├── 8_3.wav │ ├── 8_4.wav │ ├── 8_5.wav │ ├── 8_6.wav │ ├── 8_7.wav │ ├── 8_8.wav │ ├── 8_9.wav │ ├── 9_1.wav │ ├── 9_10.wav │ ├── 9_2.wav │ ├── 9_3.wav │ ├── 9_4.wav │ ├── 9_5.wav │ ├── 9_6.wav │ ├── 9_7.wav │ ├── 9_8.wav │ └── 9_9.wav ├── chapter11_说话人识别 ├── 11.1VQ.mdown ├── 11.2基于GMM的说话人识别模型.mdown ├── GMM.py ├── VQ.py ├── VQQ.py ├── VQ_data │ ├── SX1.WAV │ ├── SX2.WAV │ ├── SX3.WAV │ ├── SX4.WAV │ ├── TX1_1.WAV │ ├── TX1_2.WAV │ ├── TX1_3.WAV │ ├── TX1_4.WAV │ ├── TX2_1.WAV │ ├── TX2_2.WAV │ ├── TX2_3.WAV │ ├── TX2_4.WAV │ ├── TX3_1.WAV │ ├── TX3_2.WAV │ ├── TX3_3.WAV │ ├── TX3_4.WAV │ ├── TX4_1.WAV │ ├── TX4_2.WAV │ ├── TX4_3.WAV │ ├── TX4_4.WAV │ ├── TX5_1.WAV │ ├── TX5_2.WAV │ ├── TX5_3.WAV │ ├── TX5_4.WAV │ ├── mysound.wav │ └── process.py ├── gmm_data │ ├── rec_data.mat │ └── tra_data.mat ├── hmm_train_test.py └── myGMM.py ├── chapter12_情感识别 ├── 12.1KNN.mdown ├── 12.2神经网络.mdown ├── 12.3SVM语音感情识别.mdown ├── 12.4降维的语音情感识别.mdown ├── KNN │ ├── A_fear.mat │ ├── F_happiness.mat │ ├── N_neutral.mat │ ├── T_sadness.mat │ ├── W_anger.mat │ └── knn.py ├── NNs │ ├── A_fear.mat │ ├── F_happiness.mat │ ├── LVQ.py │ ├── N_neutral.mat │ ├── T_sadness.mat │ ├── W_anger.mat │ ├── my_pca_lda.py │ ├── pca_lda_sklearn.py │ ├── pnn.py │ └── svm.py └── images │ └── 情感识别框图.jpg ├── chapter2_基础 ├── 2.1采集与读取.mdown ├── 2.2语音编辑.mdown ├── 2.3声强与响度.mdown ├── 2.4语音信号生成.mdown ├── C2_1_y.wav ├── C2_2_1.py ├── C2_2_y.wav ├── C2_2_y_conved.wav ├── C2_2_y_noised.wav ├── C2_3_1.py ├── 
C2_3_y.wav ├── C2_4_s.py ├── a.wav ├── audioplayer.py ├── audiorecorder.py ├── audiowriter.py └── soundBase.py ├── chapter3_分析实验 ├── 3.1语音分帧与加窗.mdown ├── 3.2短时时域分析.mdown ├── 3.3短时频域分析.mdown ├── 3.4倒谱分析与MFCC系数.mdown ├── 3.5线性预测分析.mdown ├── 3.6线谱对转化.mdown ├── C3_1_y.wav ├── C3_1_y_1.py ├── C3_1_y_2.py ├── C3_2_y.py ├── C3_2_y.wav ├── C3_3_y.py ├── C3_3_y.wav ├── C3_4_y_1.py ├── C3_4_y_1.wav ├── C3_4_y_2.py ├── C3_4_y_4.py ├── C3_4_y_4.wav ├── C3_5_y.wav ├── C3_5_y_1.py ├── C3_5_y_2.py ├── C3_5_y_3.py ├── dct.py ├── images │ ├── Amdf.png │ ├── Zcr.png │ ├── corr.png │ ├── dct.png │ ├── en.png │ ├── energy.png │ ├── lpc.png │ ├── lpcff.png │ ├── mel.png │ ├── mfcc.png │ ├── spec.png │ ├── window.png │ ├── 倒谱.png │ └── 同态.png ├── lpc.py ├── mel.py ├── test.py ├── timefeature.py ├── windows.py └── 倒谱计算.py ├── chapter4_特征提取 ├── 4.1语音端点检测.mdown ├── 4.2基音周期检测.mdown ├── 4.3共振峰估计.mdown ├── C4_1_y.wav ├── C4_1_y_1.py ├── C4_1_y_2.py ├── C4_1_y_3.py ├── C4_1_y_4.py ├── C4_1_y_5.py ├── C4_2_y.py ├── C4_2_y.wav ├── C4_3_y.py ├── C4_3_y.wav ├── end_detection.py ├── images │ ├── En.png │ ├── TwoThr.png │ ├── corr.png │ ├── ellip.png │ ├── pitch.png │ ├── 共振峰估计.png │ ├── 对数频率距离.png │ ├── 能熵比.png │ └── 能零比.png ├── pitch_detection.py └── 共振峰估计.py ├── chapter5_语音降噪 ├── 5.1自适应滤波器.mdown ├── 5.2谱减法.mdown ├── 5.3维纳滤波.mdown ├── 5.4小波分解.mdown ├── C5_1_5.py ├── C5_1_y.wav ├── C5_2_y.py ├── C5_2_y.wav ├── C5_3_y.wav ├── C5_4_y.py ├── C5_4_y.wav ├── Wavelet.py ├── images │ ├── LMS.png │ └── wavelet.png ├── wp_mfcc.py └── 自适应滤波.py ├── chapter6_语音编码 ├── 6.1PCM编码.mdown ├── 6.2LPC编码.mdown ├── 6.3ADPCM编码.mdown ├── ADPCM.py ├── C6_1_y.py ├── C6_1_y.wav ├── C6_2_y.py ├── C6_3_y.py ├── C6_3_y.wav ├── LPC解码.png ├── PCM.py └── images │ ├── ADPMC.png │ ├── LPC解码.png │ ├── PCM流程.png │ └── pcm.png ├── chapter7_语音合成 ├── 7.1帧合并.mdown ├── 7.2LPC的语音合成.mdown ├── 7.3共振峰检测和基音参数的语音合成.mdown ├── 7.4语音的变调和变速.mdown ├── C7_1_y.py ├── C7_1_y.wav ├── C7_2_y.py ├── C7_2_y.wav ├── C7_3_y.py ├── C7_3_y.wav ├── flipframe.py ├── myfilter.py └── test.py └── chapter8_隐藏试验 ├── C8_1_y.DAT ├── C8_1_y.py └── C8_1_y.wav /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: ['http://m.qpic.cn/psc?/V12DgbRP0a1TXP/ruAMsa53pVQWN7FLK88i5v4.insf.ZHWVeORtvgzbl.vHII7E8ek5dARBRWrXiQl200fh19JT1dhkfbtR0fFThtQrqHbBn8Xxx3v9blYoEg!/b', 13 | 'http://m.qpic.cn/psc?/V12DgbRP0a1TXP/45NBuzDIW489QBoVep5mcSWetEN.fWtw790qmGKuWtj0GVwIhckkAX.88GSFFJX4PqpthhQGL6oaPvOeVmb8muDqRmmKeUrI9eA.C6DMaQw!/b'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | matlab/ -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
# python_sound_open


![](https://img.shields.io/badge/License-Apache-green)
![](https://img.shields.io/badge/python-3.x-green)


## env
~~~
OS: windows10 x64
pyaudio==0.2.11
librosa==0.7.2
matplotlib==3.2.1
scipy==1.2.1
numpy==1.16.0
pandas==0.25.1

~~~

## blogs


The code for 《语音信号处理试验教程》 (Experimental Course on Speech Signal Processing, Liang Ruiyu et al.) is mostly implemented in Matlab. Since Python is popular these days, most of this project re-implements that material in Python, largely written by hand. The companion articles are on the [CSDN blog](https://blog.csdn.net/sinat_18131557):

[Python speech basics -- 2.1 Audio recording, playback and reading](https://blog.csdn.net/sinat_18131557/article/details/105339507)

[Python speech basics -- 2.2 Speech editing](https://blog.csdn.net/sinat_18131557/article/details/105339686)

[Python speech basics -- 2.3 Sound intensity and loudness](https://blog.csdn.net/sinat_18131557/article/details/105340027)

[Python speech basics -- 2.4 Speech signal generation](https://blog.csdn.net/sinat_18131557/article/details/105340256)

[Python speech basics -- 3.1 Framing and windowing](https://blog.csdn.net/sinat_18131557/article/details/105340416)

[Python speech basics -- 3.2 Short-time time-domain analysis](https://blog.csdn.net/sinat_18131557/article/details/105795509)

[Python speech basics -- 3.3 Short-time frequency-domain analysis](https://blog.csdn.net/sinat_18131557/article/details/105795626)

[Python speech basics -- 3.4 Cepstral analysis and MFCC coefficients](https://blog.csdn.net/sinat_18131557/article/details/105795864)

[Python speech basics -- 3.5 Linear prediction analysis](https://blog.csdn.net/sinat_18131557/article/details/105795944)

[Python speech basics -- 4.1 Speech endpoint detection](https://blog.csdn.net/sinat_18131557/article/details/106017459)

[Python speech basics -- 4.2 Pitch period detection](https://blog.csdn.net/sinat_18131557/article/details/106017542)

[Python speech basics -- 4.3 Formant estimation](https://blog.csdn.net/sinat_18131557/article/details/106017598)

[Python speech basics -- 5.1 Adaptive filtering](https://blog.csdn.net/sinat_18131557/article/details/106440692)

[Python speech basics -- 5.2 Spectral subtraction](https://blog.csdn.net/sinat_18131557/article/details/106440714)

[Python speech basics -- 5.4 Wavelet decomposition](https://blog.csdn.net/sinat_18131557/article/details/106440757)

[Python speech basics -- 6.1 PCM coding](https://blog.csdn.net/sinat_18131557/article/details/106440778)

[Python speech basics -- 6.2 LPC coding](https://blog.csdn.net/sinat_18131557/article/details/106440802)

[Python speech basics -- 6.3 ADPCM coding](https://blog.csdn.net/sinat_18131557/article/details/106440815)

[Python speech basics -- 7.1 Frame merging](https://blog.csdn.net/sinat_18131557/article/details/106440852)

[Python speech basics -- 7.2 LPC-based speech synthesis](https://blog.csdn.net/sinat_18131557/article/details/106440872)

[Python speech basics -- 10.1 Isolated-word recognition with dynamic time warping (DTW)](https://blog.csdn.net/sinat_18131557/article/details/106440909)

[Python speech basics -- 10.2 Isolated-word recognition with hidden Markov models](https://blog.csdn.net/sinat_18131557/article/details/106440938)

[Python speech basics -- 11.1 Speaker recognition with vector quantization (VQ)](https://blog.csdn.net/sinat_18131557/article/details/106440998)

[Python speech basics -- 11.2 GMM-based speaker recognition](https://blog.csdn.net/sinat_18131557/article/details/106441013)

[Python speech basics -- 12.1 KNN-based emotion recognition](https://blog.csdn.net/sinat_18131557/article/details/106441088)

[Python speech basics -- 12.2 Neural-network-based emotion recognition](https://blog.csdn.net/sinat_18131557/article/details/106441104)

[Python speech basics -- 12.3 SVM-based speech emotion recognition](https://blog.csdn.net/sinat_18131557/article/details/106441142)

[Python speech basics -- 12.4 Speech emotion recognition based on LDA and PCA](https://blog.csdn.net/sinat_18131557/article/details/106441186)


# Ref
1. [《语音信号处理试验教程》(梁瑞宇等)](https://github.com/bastamon/sound_signal_process-matlab-)

--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/10.1DTW孤立自识别.mdown:
--------------------------------------------------------------------------------
Isolated-word speech recognition based on dynamic time warping (DTW)
### Template-matching speech recognition
The user speaks every word of the vocabulary once, and each word's feature-vector sequence is stored as a template in a template library. In the recognition stage, the feature-vector sequence of the input speech is compared with every template in the library in turn, and the template with the highest similarity is output as the recognition result.
### Features
The MFCC coefficients together with their first- and second-order differences are used as feature parameters. MFCC is a feature that combines the auditory characteristics of the human ear with speech parameters; its computation is described in section 3.4.
### Dynamic time warping (DTW)
In the pattern-matching stage of recognition, the input cannot simply be compared with the library templates directly, because the speech signal is highly variable: utterances differ not only in intensity and spectral shift, but also in duration, which is never exactly the same, while the stored templates cannot stretch or shrink to follow the duration of the input. Time alignment is therefore indispensable. DTW is a nonlinear alignment technique that combines time warping with distance computation, and it is a template-matching method.

Suppose a reference template has the feature-vector sequence $a_1,...,a_m,...,a_M$ and the input speech has the feature-vector sequence $b_1,...,b_n,...,b_N$, with $M\neq N$. Dynamic time warping then seeks a warping function $m=T(n)$ that nonlinearly maps the input time axis $n$ onto the template time axis $m$ and satisfies:
$$D=\underset{T(n)}{\min}\sum_{n=1}^Nd[n,T(n)]$$

Here $d[n,T(n)]$ is the distance between two feature vectors, for which the Euclidean distance $d(x,y)=\frac{1}{k}\sqrt{\sum_{i=1}^k(x_i-y_i)^2}$ can be used, and $D$ is the distance measure between the two templates along the best time path.

This is a typical optimization problem, usually solved by dynamic programming (DP). To simplify the computation, the $N$-stage decision process is turned into $N$ single-stage decisions, i.e. a decision is made for each of the $N$ subproblems in turn. From the nature of the speech signal, the warping function has to satisfy:
- boundary conditions: $T(1)=1,T(N)=M$
- monotonicity: $T(n+1)-T(n)\geqslant 0$
- continuity: some particular phonemes can contribute a great deal to correct recognition, and the difference of a single phoneme may be the very basis for discrimination, so to keep the loss of information minimal the warping function is generally not allowed to skip any point, i.e. $T(n+1)-T(n)\leqslant 1$

The computation proceeds recursively, i.e. the shortest distance from (0,0) to (N,M) is obtained by first computing the distance from the predecessors of (N,M) to (N,M), and then the shortest distance from (0,0) to those predecessors.
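As a concrete illustration, here is a minimal numpy sketch of this recursion (an illustration only, not the repository's DTW.py; `template` and `inp` are assumed to be feature matrices with one MFCC frame per row):

~~~python
import numpy as np


def dtw(template, inp):
    """template: (M, k) feature matrix, inp: (N, k) feature matrix."""
    M, N, k = len(template), len(inp), template.shape[1]
    # pairwise frame distances d(m, n), with the 1/k factor from the text
    d = np.array([[np.linalg.norm(template[m] - inp[n]) / k for n in range(N)]
                  for m in range(M)])
    # D[m, n] = cost of the best path aligning template[:m+1] with inp[:n+1]
    D = np.full((M, N), np.inf)
    D[0, 0] = d[0, 0]
    for m in range(M):
        for n in range(N):
            if m == 0 and n == 0:
                continue
            # predecessors allowed by the monotonicity/continuity constraints
            prev = []
            if m > 0: prev.append(D[m - 1, n])
            if n > 0: prev.append(D[m, n - 1])
            if m > 0 and n > 0: prev.append(D[m - 1, n - 1])
            D[m, n] = d[m, n] + min(prev)
    return D[M - 1, N - 1]  # distance along the best warping path
~~~

Recognition then amounts to picking the vocabulary template with the smallest DTW distance to the input, e.g. `min(templates, key=lambda w: dtw(templates[w], input_mfcc))` for a hypothetical dict `templates` of MFCC matrices.
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/mysound.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/mysound.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p1/0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/0.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p1/1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/1.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p1/2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/2.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p1/3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/3.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p1/4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/4.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p1/5.wav:
--------------------------------------------------------------------------------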
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p1/6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p1/7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p1/8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p1/9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p1/9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/0.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/6.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p2/9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p2/9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/0.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/DTW/p3/7.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/7.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p3/8.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/8.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/DTW/p3/9.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/DTW/p3/9.wav
--------------------------------------------------------------------------------
/chapter10_语音识别/HMM/10.2隐马尔科夫模型孤立字识别.mdown:
--------------------------------------------------------------------------------
Hidden Markov models (Hidden Markov Models, HMM), as a statistical model of the speech signal, are widely used in speech processing.

An HMM used for speech recognition is usually defined by three groups of model parameters $\bold{M=\{A,B,\pi\}}$. Suppose the HMM has N states $\{S_i\}_{i=1}^N$; the parameters are defined as follows:
$\bold{A}$: the state-transition probability matrix;
$$A=\begin{bmatrix}
a_{11}& ...& a_{1N}\\...&...&...\\a_{N1}&...&a_{NN}
\end{bmatrix}$$

where $a_{ij}$ is the probability of a transition from state $S_i$ to state $S_j$, with $0\leqslant a_{ij}\leqslant 1,\sum_{j=1}^Na_{ij}=1$

$\pi$: the set of initial-state probabilities, where $\{\pi_i\}_{i=1}^N$ gives the probability that the initial state is $s_i$, i.e. $\pi_i=P[S_1=s_i](1\leqslant i\leqslant N),\sum_{i=1}^N\pi_{i}=1$
$\bold{B}$: the set of output observation probabilities, $\bold{B}=\{b_{ij}(k)\}$, where $b_{ij}(k)$ is the probability of emitting observation k on the transition from state $S_i$ to state $S_j$. Depending on the values taken by the observation set $X$, HMMs are divided into continuous and discrete models.

### Forward-backward algorithm
This algorithm computes, for a given observation sequence $\bold{O}=o_1o_2...o_T$ and a model $\bold{M=\{A,B,\pi\}}$, the probability $\bold{P(O|M)}$ that the model $M$ generates $O$. Let $S_1$ be the initial state and $S_N$ the final state; the forward-backward algorithm is then described as follows:
(1) Forward algorithm
The output probability is computed recursively from front to back along the output observation sequence;
|Symbol|Meaning|
|--|--|
|$\bold{O}=o_1o_2...o_T$|output observation sequence|
|$\bold{P(O\|M)}$|probability of the output symbol sequence O given model M|
|$a_{ij}$|transition probability from state $S_i$ to state $S_j$|
|$b_{ij}(o_t)$|probability of emitting $o_t$ on the transition from state $S_i$ to state $S_j$|
|$\bold{\alpha_t(j)}$|probability of emitting the partial symbol sequence $o_1o_2...o_t$ and reaching state $S_j$ (forward probability)|

$\bold{\alpha_t(j)}$ is computed recursively, initialized as $\bold{\alpha_0(1)}=1,\bold{\alpha_0(j)}=0(j\neq 1)$

The recursion is:
$$\bold{\alpha_t}(j)=\sum_i\bold{\alpha_{t-1}}(i)a_{ij}b_{ij}(o_t),(t=1,2,...,T;i,j=1,2,...,N)$$

and the final result:
$$\bold{P(O|M)}=\bold{\alpha_T}(N)$$

At time t, $\bold{\alpha_t}(j)$ equals the sum of $\bold{\alpha_{t-1}}(i)a_{ij}b_{ij}(o_t)$ over all states at time t-1; when there is no transition from state $S_i$ to state $S_j$, $a_{ij}=0$. The forward algorithm greatly reduces the amount of computation, to $N(N+1)(T-1)$ multiplications and $N(N-1)(T+1)$ additions.
(2) Backward algorithm
Define the backward probability $\bold{\beta}_t(i)$ as the probability of emitting the partial symbol sequence $o_{t+1},o_{t+2},...,o_{T}$ starting from state $S_i$ and ending in state $S_N$, initialized as $\bold{\beta_T(N)}=1,\bold{\beta_T(j)}=0,(j\neq N)$
The recursion is:
$$\bold{\beta}_t(i)=\sum_j\beta_{t+1}(j)a_{ij}b_{ij}(o_{t+1}),(t=T-1,T-2,...,0;i,j=1,2,...,N)$$

Hence: $\bold{P(O|M)}=\sum\limits_{i=1}^N\beta_1(i)\pi_i=\beta_0(1)$

The backward computation takes on the order of $N^2T$ operations, and from the definitions it follows that: $\bold{P(O|M)}=\sum\limits_{i=1}^N\sum\limits_{j=1}^N\alpha_t(i)a_{ij}b_{ij}(o_{t+1})\beta_{t+1}(j)$
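For illustration, here is a minimal numpy sketch of the forward recursion for a discrete HMM with arc emissions $b_{ij}(k)$ (hypothetical arrays `A`, `B` and symbol sequence `obs`; the actual recognizer in hmm_gmm.py below relies on hmmlearn instead):

~~~python
import numpy as np


def forward(A, B, obs):
    """A: (N, N) transition matrix, B: (N, N, K) arc emissions, obs: symbol indices."""
    N = A.shape[0]
    alpha = np.zeros(N)
    alpha[0] = 1.0  # alpha_0(1) = 1, alpha_0(j) = 0 for j != 1
    for o in obs:
        # alpha_t(j) = sum_i alpha_{t-1}(i) * a_ij * b_ij(o_t)
        alpha = np.array([alpha @ (A[:, j] * B[:, j, o]) for j in range(N)])
    return alpha[N - 1]  # P(O|M) = alpha_T(N)
~~~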
### Viterbi algorithm
The Viterbi algorithm solves the following problem: given an observation symbol sequence $O=o_1o_2...o_T$ and a model $\bold{M=\{A,B,\pi\}}$, find the state sequence $S=s_1s_2...s_T$. The optimal state sequence is the one determined by maximizing $P(S,O|M)$: when the HMM emits an observation sequence $O=o_1o_2...o_T$, it may pass through many different state sequences, and the one sought is the state sequence with the largest output probability.
Initialization: $\alpha_0'(1)=1,\alpha_0'(j)=0(j\neq 1)$
Recursion: $\alpha_t'(j)=\underset{i}{\max}\alpha_{t-1}'(i)a_{ij}b_{ij}(o_t),(t=1,2,...,T;i,j=1,2,...,N)$
Finally: $P_{\max}(S,O|M)=\alpha_T'(N)$
The states $i$ that maximize $\alpha_t'(j)$ at each step form the desired best state sequence, which is obtained as follows (see the sketch after this list):
1) Prepare an array variable $\alpha_t'(j)$ for each state; at initialization, set $\alpha_0'(1)=1$ for the initial state $S_1$ and $\alpha_0'(j)=0$ for the other states.
2) From the observation symbol $o_t$ emitted at time t, compute $\alpha_t'(j)=\underset{i}{\max}\alpha_{t-1}'(i)a_{ij}b_{ij}(o_t)=\underset{i}{\max}\{\alpha_{t-1}'(1)a_{1j}b_{1j}(o_t),\alpha_{t-1}'(2)a_{2j}b_{2j}(o_t),...,\alpha_{t-1}'(N)a_{Nj}b_{Nj}(o_t)\}$; when there is no transition from state $S_i$ to state $S_j$, $a_{ij}=0$. Set up a symbol array variable, called the state-sequence register, and use it to record the state that maximizes $\alpha_t'(j)$ at every step.
3) If $t\neq T$, go back to 2); otherwise go to 4).
4) Take the final value $\alpha_T'(N)$ out of the register: $P_{\max}(S,O|M)=\alpha_T'(N)$, and the contents of the best-state-sequence register give the desired best state sequence.
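The steps above translate directly into code. Below is a minimal numpy sketch under the same arc-emission convention as the forward sketch, with a backpointer array `psi` playing the role of the state-sequence register:

~~~python
import numpy as np


def viterbi(A, B, obs):
    """Return P_max(S,O|M) and the best state sequence (including the start state)."""
    N = A.shape[0]
    alpha = np.zeros(N)
    alpha[0] = 1.0                                # alpha'_0
    psi = []                                      # backpointers per time step
    for o in obs:
        scores = alpha[:, None] * A * B[:, :, o]  # score of i -> j emitting o
        psi.append(scores.argmax(axis=0))         # state i maximizing alpha'_t(j)
        alpha = scores.max(axis=0)                # alpha'_t(j)
    path = [N - 1]                                # backtrack from the final state S_N
    for bp in reversed(psi):
        path.append(bp[path[-1]])
    return alpha[N - 1], path[::-1]
~~~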
### Baum-Welch algorithm
The Baum-Welch algorithm solves the HMM training problem, i.e. parameter estimation: given an observation sequence $O=o_1o_2...o_T$, it determines a model $M=\{A,B,\pi\}$ that maximizes $P(O|M)$. This is a functional extremum problem. Since the given training sequences are finite, there is no optimal method for estimating the model; Baum-Welch likewise uses a recursive idea to increase $P(O|M)$ locally until the optimized model parameters $M=\{A,B,\pi\}$ are obtained. A new model $\hat M$ obtained from the Baum-Welch reestimation formulas always satisfies $P(O|\hat M)>P(O|M)$; repeating the reestimation until $P(O|\hat M)$ converges and no longer increases noticeably gives the desired model $\hat M$.

Given an observation symbol sequence $O=o_1o_2...o_T$ and an HMM $M=\{A,B,\pi\}$ whose parameters are to be reestimated by training, let $\gamma_t(i,j)$ denote, following the forward-backward algorithm, the probability of a transition from state $S_i$ to state $S_j$ at time t for this symbol sequence:
$$\gamma_t(i,j)=\frac{\alpha_{t-1}(i)a_{ij}b_{ij}(o_t)\beta_t(j)}{\alpha_T(N)}=\frac{\alpha_{t-1}(i)a_{ij}b_{ij}(o_t)\beta_t(j)}{\sum_i\alpha_t(i)\beta_t(i)}$$

Likewise, for the symbol sequence $O=o_1o_2...o_T$, the probability that the Markov chain is in state $S_i$ at time t is:
$$\sum\limits_{j=1}^N\gamma_t(i,j)=\frac{\alpha_t(i)\beta_t(i)}{\sum_i\alpha_t(i)\beta_t(i)}$$

The expected number of transitions from state $S_i$ to state $S_j$ is then $\sum_t\gamma_t(i,j)$, while the expected number of transitions out of state $S_i$ is $\sum_j\sum_t\gamma_t(i,j)$, so the reestimation formulas are:
$$\hat a_{ij}=\frac{\sum_t\gamma_t(i,j)}{\sum_j\sum_t\gamma_t(i,j)}=\frac{\sum_t\alpha_{t-1}(i)a_{ij}b_{ij}(o_t)\beta_t(j)}{\sum_t\alpha_t(i)\beta_t(i)}$$

$$\hat b_{ij}=\frac{\sum\limits_{t:o_t=k}\gamma_t(i,j)}{\sum_t\gamma_t(i,j)}=\frac{\sum\limits_{t:o_t=k}\alpha_{t-1}(i)a_{ij}b_{ij}(o_t)\beta_t(j)}{\sum_t\alpha_{t-1}(i)a_{ij}b_{ij}(o_t)\beta_t(j)}$$

The resulting new model is $\hat M=\{\hat A,\hat B,\hat\pi\}$.
The concrete implementation steps are:
1) Choose suitable initial values of $a_{ij}$ and $b_{ij}(k)$. A common choice is to give each arc leaving state i an equal transition probability, i.e.
$$a_{ij}=\frac{1}{\text{number of arcs leaving state } i}$$

and to give every output observation symbol an equal initial output probability, i.e.:
$$b_{ij}(k)=\frac{1}{\text{number of codewords in the codebook}}$$

with the same output probability matrix on every arc.
2) Given a (training) observation symbol sequence $O=o_1o_2...o_T$, compute $\gamma_t(i,j)$ with the initial model, and compute $\hat a_{ij}$ and $\hat b_{ij}(k)$ from the reestimation formulas.
3) Given another (training) observation sequence $O=o_1o_2...o_T$, take the previous $\hat a_{ij}$ and $\hat b_{ij}(k)$ as the initial model, compute $\gamma_t(i,j)$, and recompute $\hat a_{ij}$ and $\hat b_{ij}(k)$ (see the sketch below).
4) Repeat until $\hat a_{ij}$ and $\hat b_{ij}(k)$ converge.

Speech recognition generally uses a left-to-right HMM, so the initial state probabilities $\pi_i$ need not be estimated and are always set to:
$$\pi_1=1,\pi_i=0,(i=2,...,N)$$
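As a compact sketch of one reestimation pass (transition update $\hat a_{ij}$ only) for the same discrete arc-emission model as in the earlier sketches, again with hypothetical arrays `A`, `B`, `obs`; the GMM-HMM training in hmm_gmm.py below leaves the equivalent updates to hmmlearn:

~~~python
import numpy as np


def reestimate_trans(A, B, obs):
    """One Baum-Welch update of the transition matrix from a single sequence."""
    N, T = A.shape[0], len(obs)
    alpha = np.zeros((T + 1, N))
    alpha[0, 0] = 1.0                                  # forward pass
    for t, o in enumerate(obs):
        alpha[t + 1] = [alpha[t] @ (A[:, j] * B[:, j, o]) for j in range(N)]
    beta = np.zeros((T + 1, N))
    beta[T, N - 1] = 1.0                               # backward pass
    for t in range(T - 1, -1, -1):
        beta[t] = [(A[i] * B[i, :, obs[t]]) @ beta[t + 1] for i in range(N)]
    # gamma_t(i,j) = alpha_{t-1}(i) a_ij b_ij(o_t) beta_t(j) / P(O|M)
    P = alpha[T, N - 1]
    gamma = np.array([alpha[t][:, None] * A * B[:, :, obs[t]] * beta[t + 1]
                      for t in range(T)]) / P
    # a_ij = sum_t gamma_t(i,j) / sum_j sum_t gamma_t(i,j); 0/0 -> 0 for unused states
    with np.errstate(invalid='ignore', divide='ignore'):
        return np.nan_to_num(gamma.sum(axis=0) / gamma.sum(axis=(0, 2))[:, None])
~~~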
43 | """ 44 | fs, audio, bits = wavfile.read(file) 45 | """ 46 | 由于部分信号太短而报错,所以fs//2了 47 | """ 48 | mfcc = Nmfcc(audio, fs // 2, 12, frameSize=int(fs // 2 * 0.025), inc=int(fs // 2 * 0.01)) 49 | return mfcc 50 | 51 | 52 | ''' 53 | &usage: 搭建HMM-GMM的孤立词识别模型 54 | 参数意义: 55 | CATEGORY: 所有标签的列表 56 | n_comp: 每个孤立词中的状态数 57 | n_mix: 每个状态包含的混合高斯数量 58 | cov_type: 协方差矩阵的类型 59 | n_iter: 训练迭代次数 60 | ''' 61 | 62 | 63 | class Model: 64 | def __init__(self, CATEGORY=None, n_comp=3, n_mix=3, cov_type='diag', n_iter=1000): 65 | super(Model, self).__init__() 66 | self.CATEGORY = CATEGORY 67 | self.category = len(CATEGORY) 68 | self.n_comp = n_comp 69 | self.n_mix = n_mix 70 | self.cov_type = cov_type 71 | self.n_iter = n_iter 72 | # 关键步骤,初始化models,返回特定参数的模型的列表 73 | self.models = [] 74 | for k in range(self.category): 75 | model = hmm.GMMHMM(n_components=self.n_comp, n_mix=self.n_mix, covariance_type=self.cov_type, 76 | n_iter=self.n_iter) 77 | self.models.append(model) 78 | 79 | def train(self, tdata): 80 | for i in range(tdata.shape[1]): 81 | model = self.models[i] 82 | for x in range(tdata[0, i].shape[1]): 83 | data = tdata[0, i][0, x].squeeze() 84 | mfcc = Nmfcc(data, 8000, 24, 256, 80) 85 | model.fit(mfcc) 86 | 87 | def test(self, pdata): 88 | label = [] 89 | result = [] 90 | for k in range(pdata.shape[1]): 91 | for i in range(pdata[0, k].shape[1]): 92 | label.append(str(k + 1)) 93 | data = pdata[0, k][0, i].squeeze() 94 | mfcc = Nmfcc(data, 8000, 24, 256, 80) 95 | result_one = [] 96 | for m in range(self.category): 97 | model = self.models[m] 98 | re = model.score(mfcc) 99 | result_one.append(re) 100 | result.append(self.CATEGORY[np.argmax(np.array(result_one))]) 101 | print('识别得到结果:\n', result) 102 | print('原始标签类别:\n', label) 103 | # 检查识别率,为:正确识别的个数/总数 104 | totalnum = len(label) 105 | correctnum = 0 106 | for i in range(totalnum): 107 | if result[i] == label[i]: 108 | correctnum += 1 109 | print('识别率:', correctnum / totalnum) 110 | 111 | def save(self, path="models.pkl"): 112 | joblib.dump(self.models, path) 113 | 114 | def load(self, path="models.pkl"): 115 | self.models = joblib.load(path) 116 | 117 | 118 | tdata = loadmat('tra_data.mat')['tdata'] 119 | pdata = loadmat('rec_data.mat')['rdata'] 120 | 121 | CATEGORY = [str(i + 1) for i in range(tdata.shape[1])] 122 | # 进行训练 123 | models = Model(CATEGORY=CATEGORY) 124 | models.train(tdata) 125 | models.test(tdata) 126 | models.test(pdata) 127 | -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/rec_data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/rec_data.mat -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_10.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_10.wav -------------------------------------------------------------------------------- 
/chapter10_语音识别/HMM/test_data/1_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/test_data/1_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/test_data/1_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/tra_data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/tra_data.mat -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/10_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/10_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/1_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/1_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/2_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/2_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/3_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/3_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/4_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/4_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/5_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/5_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/6_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/6_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/7_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/7_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/8_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/8_9.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_1.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_10.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_10.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_2.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_3.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_4.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_5.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_6.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_7.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_8.wav -------------------------------------------------------------------------------- /chapter10_语音识别/HMM/trainding_data/9_9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter10_语音识别/HMM/trainding_data/9_9.wav -------------------------------------------------------------------------------- /chapter11_说话人识别/11.2基于GMM的说话人识别模型.mdown: -------------------------------------------------------------------------------- 1 | 高斯混合模型(Gaussian Mixture Model, GMM)可以看做一种状态数为1的连续分布隐马尔科夫模型。一个$M$阶混合高斯模型的概率密度函数是$M$个高斯概率密度函数加权求和得到的: 2 | $$P(X|\lambda)=\sum_{i=1}^Mw_ib_i(X)$$ 3 | 4 | 其中$X$是D维随机向量,为说话人识别算法提取的特征矢量,$w_i$是混合权重,满足$\sum_{i=1}^Mw_i=1$,每个分布因子$b_i(X_t)$是D维的联合高斯概率分布: 5 | $$b_i(X)=\frac{1}{(2\pi)^{D/2}|\sum_i|^{1/2}}\exp\{-\frac{1}{2}(X-\mu_i)^t(\sum)_i^{-1}(X-\mu_i)\}$$ 6 | 7 | 
$\mu_i$ is the mean vector and $\Sigma_i$ the covariance matrix, so the model is parameterized as $\lambda=\{w_i,\mu_i,\Sigma_i\},i=1,2,...,M$.

The GMM parameters are usually determined by maximum-likelihood estimation, implemented in practice with the expectation-maximization (EM) algorithm: EM estimates a new parameter set $\hat \lambda$ such that the likelihood under the new model satisfies $P(X|\hat \lambda)\geqslant P(X|\lambda)$, iterating until the model converges. The re-estimation formulas for the three groups of parameters in each iteration are:
Mixture weight re-estimation:
$$w_i=\frac{1}{T}\sum_{t=1}^TP(i|X_t,\lambda)$$

Mean re-estimation:
$$\mu_i=\frac{\sum_{t=1}^TP(i|X_t,\lambda)X_t}{\sum_{t=1}^TP(i|X_t,\lambda)}$$

Variance re-estimation:
$$\sigma_i^2=\frac{\sum_{t=1}^TP(i|X_t,\lambda)(X_t-\mu_i)^2}{\sum_{t=1}^TP(i|X_t,\lambda)}$$

where the posterior probability of component $i$ is:
$$P(i|X_t,\lambda)=\frac{w_ib_i(X_t)}{\sum_{k=1}^Mw_kb_k(X_t)}$$

The GMM-based speaker recognition model is:
- For each speaker $n=1,2,...,N$, extract feature parameters from the training utterances and, under the maximum-likelihood criterion, use the EM algorithm to build the Gaussian mixture model parameters $\lambda_1,\lambda_2,...,\lambda_N$ for that speaker.
- For an utterance of the speaker to be identified, extract the same feature parameters, compute the likelihood $p(X|\lambda_n)$ against every trained model $\lambda_1,\lambda_2,...,\lambda_N$, and take the index of the largest likelihood as the recognition result: $n^*=\arg \underset{n}{\max}p(X|\lambda_n)$

Two further points:
The number of Gaussian components $M$ and the initial model parameters must be fixed first. $M$ is generally chosen by experiment, typically $M=4,8,16$. The parameters are usually initialized by clustering the feature vectors into as many classes as there are mixture components and taking each class's mean and variance as the initial values; the covariance matrices can be full or diagonal.
In practice there is rarely enough training data. With insufficient data, some components of the GMM covariance matrices can become very small, and these severely hurt system performance. To avoid this, a floor is imposed on the covariance values during the EM iterations, so that they never drop below the threshold during training.
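As a minimal sketch of this train/score loop using scikit-learn's `GaussianMixture` (the chapter's `hmm_train_test.py` uses the same API; the feature arrays here are assumed to be one MFCC frame per row, and the function names are illustrative, not from this repository):
~~~py
import numpy as np
from sklearn.mixture import GaussianMixture


def train_speaker_models(features_per_speaker, M=16):
    # features_per_speaker: list of (n_frames, n_dims) arrays, one per speaker
    models = []
    for feats in features_per_speaker:
        gmm = GaussianMixture(n_components=M, covariance_type='diag', max_iter=200)
        gmm.fit(feats)  # EM estimation of lambda = {w, mu, Sigma}
        models.append(gmm)
    return models


def identify(models, feats):
    # per-frame log-likelihoods summed over the utterance, argmax over speakers
    scores = [m.score_samples(feats).sum() for m in models]
    return int(np.argmax(scores))
~~~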
--------------------------------------------------------------------------------
/chapter11_说话人识别/GMM.py:
--------------------------------------------------------------------------------
import numpy as np


def k_means(centres, data, kiter):
    """K-means clustering used to initialize the GMM centres."""
    dim, data_sz = data.T.shape
    ncentres = centres.shape[0]
    # pick ncentres random samples as the initial centres
    per = np.random.permutation(data_sz)
    perm = per[:ncentres]
    centres = data[perm, :]
    id = np.eye(ncentres)
    for n in range(kiter):
        old_centres = centres
        # squared Euclidean distance between every sample and every centre
        d2 = np.multiply(np.ones((ncentres, 1)), np.sum(np.power(data, 2), axis=1, keepdims=True).T).T + \
             np.multiply(np.ones((data_sz, 1)), np.sum(np.power(centres, 2), axis=1, keepdims=True).T) - \
             2 * np.matmul(data, centres.T)
        pos = np.argmin(d2, axis=1)
        post = id[pos, :]
        num_points = np.sum(post, axis=0)
        for j in range(ncentres):
            if num_points[j] > 0:
                p = np.where(post[:, j] > 0.5)[0]
                centres[j, :] = np.sum(data[p, :], axis=0) / num_points[j]
        # total distortion: each sample's distance to its own centre
        e = np.sum(d2[np.arange(data_sz), pos])
        if n > 0:
            if np.max(np.abs(centres - old_centres)) < 0.0001 and abs(old_e - e) < 0.0001:
                return centres, post
        old_e = e
    return centres, post


def gmm_init(ncentres, data, kiter, covar_type):
    """
    Initialize the GMM parameters.
    :param ncentres: number of mixture components
    :param data: training data, one sample per row
    :param kiter: number of k-means iterations
    :param covar_type: covariance type tag stored with the model
    :return: model dict with keys priors/centres/covars
    """
    mix = {}
    dim, data_sz = data.T.shape
    mix['priors'] = [1 / ncentres for _ in range(ncentres)]
    centres = np.random.rand(ncentres, dim)
    mix['covars'] = np.tile(np.eye(dim), (ncentres, 1, 1))
    GMM_WIDTH = 1
    centres, post = k_means(centres, data, kiter)
    # per-cluster sample counts, floored at 1 so no prior becomes zero
    cluster_sizes = np.maximum(np.sum(post, axis=0), 1)
    mix['priors'] = cluster_sizes / np.sum(cluster_sizes)
    for j in range(ncentres):
        p = np.where(post[:, j] > 0.5)[0]
        c = data[p, :]
        diffs = c - np.multiply(np.ones((c.shape[0], 1)), centres[j, :])
        # sample covariance of cluster j (divide by the number of points)
        mix['covars'][j, ...] = np.matmul(diffs.T, diffs) / (c.shape[0] + 1e-7)
        if np.linalg.matrix_rank(mix['covars'][j, ...]) < dim:
            # regularize rank-deficient covariances
            mix['covars'][j, ...] += GMM_WIDTH * np.eye(dim)
    mix['ncentres'] = ncentres
    mix['covar_type'] = covar_type
    mix['centres'] = centres
    return mix


def calcpost(mix, x):
    """
    E step: Gaussian activations act[t, i] = b_i(x_t) and posteriors P(i|x_t, lambda).
    """
    data_sz, dim = x.shape
    ncentres = mix['ncentres']
    act = np.zeros((data_sz, ncentres))
    for j in range(ncentres):
        diffs = x - mix['centres'][j, :]
        c_inv = np.linalg.inv(mix['covars'][j, ...])
        norm = np.sqrt(np.linalg.det(mix['covars'][j, ...]) * (2 * np.pi) ** dim) + 1e-10
        act[:, j] = np.exp(-0.5 * np.sum(np.matmul(diffs, c_inv) * diffs, axis=1)) / norm
    post = act * np.asarray(mix['priors'])
    post /= (np.sum(post, axis=1, keepdims=True) + 1e-10)
    return post, act


def gmm_em(mix, x, emiter):
    """
    EM re-estimation of a full-covariance GMM, following the
    re-estimation formulas of 11.2.
    :param mix: initialized model dict from gmm_init
    :param x: training data, one sample per row
    :param emiter: number of EM iterations
    :return: updated mix, final posteriors, negative log-likelihood per iteration
    """
    dim, data_sz = x.T.shape
    init_covars = mix['covars']
    MIN_COVAR = 0.001
    errlog = np.zeros(emiter)
    for cnt in range(emiter):
        # --- E step: sufficient statistics ---
        post, act = calcpost(mix, x)
        # negative log-likelihood, for monitoring convergence
        prob = np.matmul(act, np.asarray(mix['priors']))
        errlog[cnt] = -np.sum(np.log(prob + 1e-10))
        # --- M step: re-estimate priors, means and covariances ---
        new_pr = np.sum(post, axis=0)
        mix['priors'] = new_pr / data_sz
        mix['centres'] = np.matmul(post.T, x) / new_pr[:, None]
        for j in range(mix['ncentres']):
            diffs = x - mix['centres'][j, :]
            diffs = diffs * np.sqrt(post[:, j])[:, None]
            mix['covars'][j, ...] = np.matmul(diffs.T, diffs) / new_pr[j]
            # guard against collapsing covariances: reset to the initial estimate
            if np.min(np.linalg.svd(mix['covars'][j, ...], compute_uv=False)) < MIN_COVAR:
                mix['covars'][j, ...] = init_covars[j, ...]
    return mix, post, errlog
--------------------------------------------------------------------------------
/chapter11_说话人识别/VQ.py:
--------------------------------------------------------------------------------
import numpy as np


def dis(u, xi):
    """
    Squared Euclidean distance between a vector and every codebook column.
    :param u: codebook, one centroid per column
    :param xi: sample vector
    :return: array of distances to each centroid
    """
    k = u.shape[1]
    xi = xi.squeeze()
    dis = np.zeros(k)
    for i in range(k):
        ui = u[:, i]
        dis[i] = np.sum(np.power(xi - ui, 2))
    return dis


def lbg(x, k):
    """
    LBG (splitting) clustering.
    :param x: row*col matrix, one sample per column, each sample with row elements
    :param k: number of clusters to return
    :return: dict of clusters with member matrix 'ele', count 'num' and centroid 'mean'
    """
    row, col = x.shape
    epsilon = 0.03
    delta = 0.01
    u = np.mean(x, axis=1).reshape(x.shape[0], -1)  # first centroid: the global mean
    v = {}
    for i3 in range(int(np.log2(k))):
        # split every centroid into a (1-eps)/(1+eps) pair
        u = np.hstack((u * (1 - epsilon), u * (1 + epsilon)))
        D = 0
        DD = 1
        while abs((D - DD) / (DD + epsilon)) > delta:
            DD = D
            D = 0  # restart the distortion accumulator each pass
            for i in range(pow(2, i3 + 1)):  # reset the clusters
                vv = {}
                vv['num'] = 0
                vv['ele'] = np.zeros((0, row))
                v[i] = vv
            for i in range(col):  # assign sample i to its nearest centroid
                distance = dis(u, x[:, i])
                pos = np.argmin(distance)
                v[pos]['num'] += 1
                v[pos]['ele'] = np.vstack((v[pos]['ele'], x[:, i]))
            for i in range(pow(2, i3 + 1)):
                if v[i]['num'] > 0:  # keep the old centroid for empty clusters
                    u[:, i] = np.mean(v[i]['ele'], axis=0)
                for m in range(v[i]['ele'].shape[0]):
                    D += np.sum(np.power(v[i]['ele'][m] - u[:, i], 2))
    for i in range(k):
        v[i]['mean'] = u[:, i]
    return v
--------------------------------------------------------------------------------
/chapter11_说话人识别/VQQ.py:
--------------------------------------------------------------------------------
from chapter2_基础.soundBase import *
from chapter11_说话人识别.VQ import *
from chapter10_语音识别.DTW.DTW import mfccf

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

k = 8
N = 4

## build one codebook per enrolled speaker
u = {}
for i in range(N):
    s = 'VQ_data/SX' + str(i + 1) + '.WAV'
    # s = 'VQ_data/mysound.WAV'
    data, fs, bits = soundBase(s).audioread()
    data /= np.max(data)
    mel = mfccf(12, data, fs)
    v = lbg(mel.T, k)
    # stack the k centroids into a codebook matrix, one centroid per column
    u[i] = np.array([v[j]['mean'] for j in range(k)]).T

## recognition
M = 4  # number of test utterances per speaker
l = 5

# This part still needs to be checked against the MATLAB version with new recordings.
for iii in range(l):
    for i in range(M):
        Dstu = np.zeros(N)
        s = 'VQ_data/TX{}_{}.wav'.format(iii + 1, i + 1)
        data, fs, bits = soundBase(s).audioread()
        data /= np.max(data)
        mel = mfccf(12, data, fs)  # test features, one frame per row
        for ii in range(N):
            for jj in range(mel.shape[0]):
                # distance of frame jj to speaker ii's codebook
                distance = dis(u[ii], mel[jj, :])
                Dstu[ii] += np.min(distance)
        pos = np.argmin(Dstu)
        if Dstu[pos] / mel.shape[0] >= 81:
            print('The speaker is not enrolled in the system\n')
        else:
            print('The speaker is No.{}'.format(pos + 1))
--------------------------------------------------------------------------------
/chapter11_说话人识别/VQ_data/SX1.WAV:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/SX1.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/SX2.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/SX2.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/SX3.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/SX3.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/SX4.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/SX4.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX1_1.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX1_1.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX1_2.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX1_2.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX1_3.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX1_3.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX1_4.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX1_4.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX2_1.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX2_1.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX2_2.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX2_2.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX2_3.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX2_3.WAV 
-------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX2_4.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX2_4.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX3_1.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX3_1.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX3_2.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX3_2.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX3_3.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX3_3.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX3_4.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX3_4.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX4_1.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX4_1.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX4_2.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX4_2.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX4_3.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX4_3.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX4_4.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX4_4.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX5_1.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX5_1.WAV -------------------------------------------------------------------------------- /chapter11_说话人识别/VQ_data/TX5_2.WAV: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX5_2.WAV
--------------------------------------------------------------------------------
/chapter11_说话人识别/VQ_data/TX5_3.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX5_3.WAV
--------------------------------------------------------------------------------
/chapter11_说话人识别/VQ_data/TX5_4.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/TX5_4.WAV
--------------------------------------------------------------------------------
/chapter11_说话人识别/VQ_data/mysound.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/VQ_data/mysound.wav
--------------------------------------------------------------------------------
/chapter11_说话人识别/VQ_data/process.py:
--------------------------------------------------------------------------------
from sphfile import SPHFile
import glob

if __name__ == "__main__":
    path = r'*.WAV'
    sph_files = glob.glob(path)
    print(len(sph_files))
    for i in sph_files:
        print(i)
        sph = SPHFile(i)
        filename = i.replace(".WAV", ".wav")
        sph.write_wav(filename)

    print("Completed")
--------------------------------------------------------------------------------
/chapter11_说话人识别/gmm_data/rec_data.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/gmm_data/rec_data.mat
--------------------------------------------------------------------------------
/chapter11_说话人识别/gmm_data/tra_data.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter11_说话人识别/gmm_data/tra_data.mat
--------------------------------------------------------------------------------
/chapter11_说话人识别/hmm_train_test.py:
--------------------------------------------------------------------------------
from chapter2_基础.soundBase import *
from chapter11_说话人识别.GMM import *
from scipy.io import loadmat
from scipy.signal import lfilter
from chapter3_分析实验.C3_1_y_1 import enframe
from chapter3_分析实验.mel import melbankm
# sklearn.mixture.GMM was removed in newer scikit-learn; GaussianMixture is the replacement API
from sklearn.mixture import GaussianMixture

from chapter10_语音识别.DTW.DTW import mfccf
import warnings

warnings.filterwarnings("ignore")


def Nmfcc(x, fs, p, frameSize, inc):
    """
    Compute the MFCC coefficients.
    :param x: input signal
    :param fs: sample rate
    :param p: number of Mel filters
    :param frameSize: frame length
    :param inc: frame shift
    :return:
    """
    # preprocessing - pre-emphasis
    xx = lfilter([1, -0.97], [1], x)
    # preprocessing - framing
    xx = enframe(xx, frameSize, inc)
    # preprocessing - windowing
    xx = np.multiply(xx, np.hanning(frameSize))
    # FFT
    xx = np.fft.fft(xx)
    # power spectrum
    xx = np.multiply(np.abs(xx), np.abs(xx))
    # energy through the Mel filter bank
    xx = xx[:, :frameSize // 2 + 1]
    bank = melbankm(p, frameSize, fs, 0, 0.5 * fs, 0)
    ss = np.matmul(xx, bank.T)
    # DCT cepstrum
    n_dct = 20
    M = bank.shape[0]
    m = np.array([i for i in range(M)])
    mfcc = np.zeros((ss.shape[0], n_dct))
    for n in range(n_dct):
        # DCT-II basis; m is 0-based, hence the (2m+1) term
        mfcc[:, n] = np.sqrt(2 / M) * np.sum(np.multiply(np.log(ss), np.cos((2 * m + 1) * n * np.pi / 2 / M)), axis=1)
    return mfcc


def train():
    tdata = loadmat('gmm_data/tra_data.mat')['tdata']
    models = {}
    for spk_cyc in range(Spk_num):
        print('Training speaker No.{}'.format(spk_cyc))
        ## extract MFCC features of all training utterances of this speaker
        gmm = GaussianMixture(n_components=16, max_iter=200, covariance_type='diag', n_init=3)
        feats = np.asarray(())
        for sp in range(Tra_num):
            speech = tdata[0, spk_cyc][0, sp].squeeze()
            # mfcc = Nmfcc(speech, fs, 20, int(fs * 0.02), int(fs * 0.01))
            mfcc = mfccf(12, speech, fs)
            cc = mfcc[:, :-1]
            feats = cc if feats.size == 0 else np.vstack((feats, cc))
        ## training: fit one model per speaker on the stacked features
        # kiter = 5  # max k-means iterations
        # emiter = 30  # max EM iterations
        # mix = gmm_init(ncentres, cof.T, emiter, 'full')
        # mix, post, errlog = gmm_em(mix, cof.T, emiter)
        gmm.fit(feats)
        models[spk_cyc] = gmm
    return models


def test(models):
    rdata = loadmat('gmm_data/rec_data.mat')['rdata']
    all = 0
    tp = 0
    for i in range(rdata.shape[1]):
        for j in range(rdata[0, i].shape[1]):
            data = rdata[0, i][0, j].squeeze()
            # mfcc = Nmfcc(data, fs, 20, int(fs * 0.02), int(fs * 0.01))
            mfcc = mfccf(12, data, fs)
            cc = mfcc[:, :-1]
            log_likelihood = np.zeros(len(models.values()))
            for key in models.keys():
                # per-frame log-likelihoods, summed over the utterance
                score = models[key].score_samples(cc)
                log_likelihood[key] = np.sum(score)
            r = np.argmax(log_likelihood)
            print('Utterance {j} of speaker {i} recognized as speaker {r}'.format(i=i, j=j, r=r))
            all += 1
            if r == i:
                tp += 1
    print('acc={}'.format(tp / all))


if __name__ == '__main__':
    Spk_num = 6  # number of speakers
    Tra_num = 5  # training utterances per speaker
    ncentres = 16  # number of mixture components
    fs = 16000
    models = train()
    test(models)
--------------------------------------------------------------------------------
/chapter11_说话人识别/myGMM.py:
--------------------------------------------------------------------------------
import pickle
import numpy as np
from scipy.io.wavfile import read
# sklearn.mixture.GMM was removed in newer scikit-learn; GaussianMixture is the replacement API
from sklearn.mixture import GaussianMixture
from speakerfeatures import extract_features
import warnings

warnings.filterwarnings("ignore")

# path to training data
source = "development_set\\"

# path where training speakers will be saved
dest = "speaker_models\\"
train_file = "development_set_enroll.txt"
file_paths = open(train_file, 'r')

count = 1
# Extracting features for each speaker (5 files per speaker)
features = np.asarray(())
for path in file_paths:
    path = path.strip()
    # read the audio
    sr, audio = read(source + path)
    # extract 40 dimensional MFCC & delta MFCC features
    vector = extract_features(audio, sr)

    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))
    # when features of 5 files of a speaker are concatenated, do the model training
    if count == 5:
        gmm = GaussianMixture(n_components=16, max_iter=200, covariance_type='diag', n_init=3)
        gmm.fit(features)

        # dumping the trained gaussian model (binary mode for pickle)
        picklefile = path.split("-")[0] + ".gmm"
        pickle.dump(gmm, open(dest + picklefile, 'wb'))
        print('+ modeling completed for speaker:', picklefile, " with data point = ", features.shape)
        features = np.asarray(())
        count = 0
    count = count + 1
--------------------------------------------------------------------------------
/chapter12_情感识别/12.1KNN.mdown:
--------------------------------------------------------------------------------
KNN-based speech emotion recognition
The block diagram of emotion recognition is shown below: extract features and build a model, then extract the same features from a test sample and feed them to the model for recognition.
![block diagram](./images/情感识别框图.jpg)

### Features
Short-time energy:
$$E_n=\sum_{m=0}^{N-1}x_n^2(m)$$

Short-time energy jitter:
$$E_s=\frac{\frac{1}{M-1}\sum_{n=1}^{M-1}|E_n-E_{n+1}|}{\frac{1}{M}\sum_{n=1}^ME_n}\times 100$$

Linear regression coefficient of the short-time energy:
$$E_r=\frac{\sum_{n=1}^Mn·E_n-\frac{1}{M}\sum_{n=1}^Mn·\sum_{n=1}^ME_n}{\sum_{n=1}^Mn^2-\frac{1}{M}(\sum_{n=1}^M n)^2}$$

Mean squared error of the linear regression of the short-time energy:
$$E_p=\frac{1}{M}\sum_{n=1}^M\big(E_n-(\mu_E-E_r\mu_n)-E_r·n\big)^2$$

where $\mu_n=\frac{1}{M}\sum_{n=1}^Mn$ is the mean frame index and $\mu_E$ is the mean of $E_n$.

Ratio of the short-time energy below 250 Hz to the total short-time energy:
$$E_{250}/E=\frac{\sum_{n=1}^ME_{250,n}}{\sum_{n=1}^ME_{n}}\times 100$$

Pitch frequency and its derived parameters
$$R_n(k)=\sum x_n(m)x_n(m+k)$$

The pitch period can be found from the peak position of $R_n(k)$; its reciprocal is the pitch frequency F. Write the pitch frequency of the i-th voiced frame as $F0_i$, let $M^*$ be the number of voiced frames in the utterance and M the total number of frames:
First-order pitch jitter:
$$F0_{s1}=\frac{\frac{1}{M^*-1}\sum\limits_{i=1}^{M^*-1}|F0_i-F0_{i+1}|}{\frac{1}{M^*}\sum\limits_{i=1}^{M^*}F0_i}\times 100$$

Second-order pitch jitter:
$$F0_{s2}=\frac{\frac{1}{M^*-2}\sum\limits_{i=2}^{M^*-1}|2F0_i-F0_{i-1}-F0_{i+1}|}{\frac{1}{M^*}\sum\limits_{i=1}^{M^*}F0_i}\times 100$$

For any two adjacent frames satisfying $F(i)*F(i+1)\neq 0$, the voiced differential pitch is defined as $dF=F(i)-F(i+1)$.

There are also formant parameters, MFCC, and so on.

### K-nearest-neighbor classification

The idea of KNN classification: given a sample to classify in feature space, if most of the K training samples near it belong to one class, then it belongs to that class as well. With a known feature set $\{X_1,X_2,...,X_K\}$ and a sample $X$ to classify, compute the distance $D(X,X_l)=\sqrt{\sum_{i=1}^N[X(i)-X_l(i)]^2}$; $\min\{D(X,X_l)\}$ is the nearest neighbor; select the K samples with the smallest $D(X,X_l)$ and decide by vote.
- Extract features to form the vectors $X_1,X_2,...,X_n$
- Choose the value of K
- Extract the vector $X$ of the sample to recognize and compute the distances $D(X,X_l)$
- Take the classes of the K smallest $D(X,X_l)$ and decide the class of $X$ by vote


### Implementation
The code loads the mat files directly instead of extracting features from the audio.
~~~py
import numpy as np
from scipy.io import loadmat
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt


def confusion_matrix_info(y_true, y_pred, labels=['fear', 'happy', 'neutr', 'sad', 'anger'], title='confusion matrix'):
    """
    Compute the confusion matrix and some evaluation metrics, and plot the matrix.
    :param y_true: true labels (not one-hot)
    :param y_pred: predicted labels (not one-hot)
    :param labels: meaning of the labels
    :param title: title of the plot
    :return:
    """
    import seaborn as sns
    import pandas as pd
    C2 = confusion_matrix(y_true, y_pred)
    C = pd.DataFrame(C2, columns=labels, index=labels)
    m, _ = C2.shape
    for i in range(m):
        precision = C2[i, i] / sum(C2[:, i])
        recall = C2[i, i] / sum(C2[i, :])
        f1 = 2 * precision * recall / (precision + recall)
        print('In class {}:\t total samples: {}\t true predict samples: {}\t'
              'acc={:.4f},\trecall={:.4f},\tf1-score={:.4f}'.format(
            labels[i], sum(C2[i, :]), C2[i, i], precision, recall, f1))
    print('-' * 100, '\n', 'average f1={:.4f}'.format(f1_score(y_true, y_pred, average='micro')))

    f, ax = plt.subplots()
    sns.heatmap(C, annot=True, ax=ax, cmap=plt.cm.binary)
    ax.set_title(title)
    ax.set_xlabel('predict')
    ax.set_ylabel('true')
    plt.show()


def get_most_label(result):
    rst = {}
    for r in result:
        if r not in rst.keys():
            rst[r] = 1
        else:
            rst[r] += 1
    m = sorted(rst.items(), key=lambda x: x[1], reverse=True)
    return m[0][0]


K = 9

fear = loadmat('A_fear.mat')['fearVec']
happy = loadmat('F_happiness.mat')['hapVec']
neutral = loadmat('N_neutral.mat')['neutralVec']
sadness = loadmat('T_sadness.mat')['sadnessVec']
anger = loadmat('W_anger.mat')['angerVec']

data = np.hstack((fear, happy, neutral, sadness, anger))
y = np.array([[i] * 50 for i in range(5)]).flatten()
per = np.random.permutation(250)
data_train = data[:, per[:180]]
label_train = y[per[:180]]
data_test = data[:, per[180:]]
label_test = y[per[180:]]
label_pred = np.zeros(250 - 180)
j = 0
for test in data_test.T:
    scores = np.zeros(len(data_train.T))
    for i in range(len(data_train.T)):
        scores[i] = np.sum(np.power(test - data_train[:, i], 2))
    pos = np.argsort(scores)[:K]
    result = label_train[pos]
    label = get_most_label(result)
    label_pred[j] = label
    j += 1

confusion_matrix_info(label_test, label_pred)

~~~

--------------------------------------------------------------------------------
/chapter12_情感识别/12.2神经网络.mdown:
--------------------------------------------------------------------------------
Artificial neural networks (ANNs) are parallel, distributed mathematical models built from large numbers of simple processing units. Ever since people became aware of the difference between computation in the human brain and in conventional computers, neural networks have been an object of study for information-processing tasks. ANNs imitate the brain in two main ways: they learn from the external environment, and they store knowledge in synaptic weights. The human brain receives external stimuli, the receptors convert them into electrical impulses passed to the network of neurons, and the effectors convert the impulses into recognizable outputs.

The neuron is the basic information-processing unit of a neural network: a nonlinear model consisting of synaptic weights, an adder and an activation function.

Training a neural network is the process of back-propagation. The final error is $\epsilon(n)=\frac{1}{2}\sum_{j\in C}e_j^2(n)$; the weight $w_{ij}(n)$ is updated via the partial derivative $\Delta w_{ij}=\frac{\partial \epsilon}{\partial w_{ij}}$, using gradient descent:
$$w_{ij}=w_{ij}-\alpha\Delta w_{ij}$$

### Probabilistic neural network
The main idea of the probabilistic neural network (PNN) is to use the Bayes decision rule (minimum expected misclassification) to partition the decision space in a multidimensional input space. It is an ANN based on statistical principles that uses the Parzen window function as its activation function.

Bayes decision theory:
$$\text{if } p(w_i|\overrightarrow{x})>p(w_j|\overrightarrow{x}),\forall j\neq i,\text{ then } \overrightarrow{x}\in w_i$$

where $p(w_i|\overrightarrow{x})=p(w_i)p(\overrightarrow{x}|w_i)$.

Since the class-conditional density $p(\overrightarrow{x}|w_i)$ is unknown, it is estimated with a Gaussian-kernel Parzen window:
$$p(\overrightarrow{x}|w_i)=\frac{1}{N_i}\sum_{k=1}^{N_i}\frac{1}{(2\pi)^{l/2}\sigma^l}\exp\left(-\frac{||\overrightarrow{x}-\overrightarrow{x}_{ik}||^2}{2\sigma^2}\right)$$

where $\overrightarrow{x}_{ik}$ is the k-th training sample of class $w_i$, l is the dimension of the sample vectors, $\sigma$ is the smoothing parameter and $N_i$ is the total number of training samples of class $w_i$. Dropping the constant term, the discriminant function simplifies to:
$$g_i(\overrightarrow{x})=\frac{p(w_i)}{N_i}\sum_{k=1}^{N_i}\exp\left(-\frac{||\overrightarrow{x}-\overrightarrow{x}_{ik}||^2}{2\sigma^2}\right)$$

A PNN has four parts: input layer, pattern layer, summation layer and competition layer. The input vector $\overrightarrow{x}$ first enters the input layer, where the network computes the difference $\overrightarrow{x}-\overrightarrow{x}_{ik}$ between the input and each training sample vector; the magnitude $||\overrightarrow{x}-\overrightarrow{x}_{ik}||$ represents the distance between the two vectors. The difference vectors are passed to the pattern layer, whose number of nodes equals the total number of training samples. The pattern layer determines which classes are related to the input and collects the classes with high similarity; its output expresses the similarity and is passed to the summation layer, which has one node per class. The decision is made through the competitive transfer function of the summation layer and the final result is output by the competition layer: exactly one output is 1 (the class with the largest probability) and the rest are 0.
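The discriminant $g_i(\overrightarrow{x})$ above translates almost line for line into code. Below is a minimal sketch (not from this repository) of a PNN-style classifier that scores each class with the Gaussian-kernel Parzen sum; equal class priors and the value of $\sigma$ are placeholder assumptions:
~~~py
import numpy as np


def pnn_predict(train_X, train_y, x, sigma=1.0):
    # train_X: (n_samples, n_dims); train_y: integer class labels; x: (n_dims,)
    classes = np.unique(train_y)
    scores = np.zeros(len(classes))
    for idx, c in enumerate(classes):
        Xc = train_X[train_y == c]                  # pattern layer: samples of class c
        d2 = np.sum((Xc - x) ** 2, axis=1)          # squared distances ||x - x_ik||^2
        # summation layer: g_i(x) with equal priors absorbed into the mean
        scores[idx] = np.mean(np.exp(-d2 / (2 * sigma ** 2)))
    return classes[np.argmax(scores)]               # competition layer: winner takes all
~~~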
### LVQ neural network
The learning vector quantization (LVQ) network is a feed-forward supervised network type with wide application in pattern recognition and optimization. An LVQ network consists of an input layer, a hidden (competitive) layer and an output layer; the input layer is fully connected to the hidden layer, and each output neuron connects to a different group of hidden neurons. The connection weights between the hidden and output layers are fixed to 1; during training only the weights between the input and hidden layers are modified. When an input pattern is presented to the network, the hidden neuron closest to the input pattern wins the competition and is allowed to produce a "1", while all other hidden neurons are forced to produce "0". The output neuron connected to the group of hidden neurons containing the winner also emits "1", and all other output neurons emit "0".

#### LVQ1
1) Network initialization: set the initial weights between the input and hidden layers to small random numbers.
2) Input: feed the input vector $X=[x_1,x_2,...,x_n]^T$ to the input layer.
3) Compute the distance between each hidden-layer weight vector and the input vector: $d_j=\sqrt{\sum_{i=1}^n(x_i-w_{ij})^2}$
4) Select the neuron whose weight vector has the smallest distance to the input vector; it is called the winning neuron, denoted $j^*$.
5) Update the connection weights. If the winning neuron agrees with the pre-assigned class, the classification is called correct, otherwise incorrect. The weight adjustments for the two cases are:
$$\Delta w_{ij}=\left \{\begin{array}{ll}
+\alpha(x_i-w_{ij})& \text{correct classification}\\-\alpha(x_i-w_{ij})& \text{incorrect classification}
\end{array}\right.$$

6) Stop when the maximum number of iterations is reached.

#### LVQ2
LVQ2 is identical to LVQ1 except for step 5:
5) Update the connection weights: if winning neuron 1 gives the correct class, the update is the same as in LVQ1; if not, select another neuron 2 whose weight vector is only slightly farther from the input vector than neuron 1's, and apply the update when both of the following conditions hold:
- neuron 2 gives the correct class
- the distances of neuron 2 and winning neuron 1 to the input vector differ only slightly.
--------------------------------------------------------------------------------
/chapter12_情感识别/12.3SVM语音感情识别.mdown:
--------------------------------------------------------------------------------
In the 1990s Vapnik and colleagues proposed the support vector machine (SVM), a learning method based on statistical learning theory whose aim was to remedy shortcomings of neural-network learning. SVM is now widely used in data mining, pattern recognition and related fields. It occupies an important place in machine learning, combining maximum-margin hyperplanes, convex quadratic programming and kernel methods, with broad prospects for development and application. Since its introduction the method was progressively refined, and Vapnik argued in *Statistical Learning Theory* that the SVM algorithm improves on the error-rate bounds given by inductive inference. A large body of research shows that SVM is a very effective learning method: it obtains a hyperplane with an optimized generalization bound in a high-dimensional feature space, uses kernel techniques to avoid local minima, and prevents overfitting by controlling capacity through the margin and the number of support vectors. SVM techniques have been applied successfully in many areas, including face recognition, object recognition and text recognition.

The SVM algorithm is one realization of statistical learning theory. The basic idea is to find the optimal hyperplane that minimizes the classification error on test samples: a separating plane such that the training samples are as far from it as possible, i.e. the empty region (margin) on both sides of the plane is maximal. The hyperplane is:
$$wx+b=0$$

The support vectors on the two sides satisfy:
$$\begin{array}{ll}
wx_1+b=1\\wx_2+b=-1
\end{array}$$

The geometric margin from a support vector to the plane is:
$$d^*=\frac{wx+b}{||w||}=\frac{1}{||w||}$$

Maximizing $d^*$ means minimizing $||w||$, so the objective function is:
$$\min\frac{1}{2}||w||^2$$

subject to every point being at distance at least 1 from the hyperplane (for linearly separable data): $y_i(wx_i+b)-1\geqslant 0$

The Lagrangian is:
$$L(w,b,\alpha)=\frac{1}{2}||w||^2-\sum_{i=1}^n\alpha_i(y_i(wx_i+b)-1)
\tag{objective}$$

Setting $\frac{\partial L}{\partial w}=0$ and $\frac{\partial L}{\partial b}=0$ gives:
$$\begin{array}{ll}
w=\sum_{i=1}^n\alpha_iy_ix_i\\\sum\alpha_iy_i=0
\end{array}\tag{stationarity}$$

So $w$ can be expressed linearly by $\{x_1,x_2,...,x_n\}$, and part of the $\alpha_i$ are 0; the sample vectors $x_i$ with $\alpha_i\neq 0$ are the support vectors: $w=\sum_{i\in sv}\alpha_iy_ix_i$
Substituting the stationarity conditions back into the objective, with constraints $\alpha_i\geqslant 0,\sum\alpha_iy_i=0$, gives:
$$\max\{Q(\alpha)\}=\max\{-\frac{1}{2}\sum\limits_{i,j=1}^n\alpha_i\alpha_jy_iy_j(x_i,x_j)+\sum_{i=1}^n\alpha_i\}$$

If the data is not linearly separable, the constraints become (with slack variables $\chi_i$):
$$\begin{array}{ll}
wx_1+b\geqslant1-\chi_1\\wx_2+b\leqslant\chi_i-1
\end{array}$$

so the optimization problem becomes:
$$\begin{array}{ll}
\min\frac{1}{2}||w||^2+C\sum_{i=1}^n\chi_i\\y_i(wx_i+b)\geqslant 1-\chi_i\\\chi_i\geqslant 0
\end{array}$$

The corresponding Lagrangian:
$$L(w,b,\chi,\alpha,\beta)=\frac{1}{2}||w||^2+C\sum_{i=1}^n\chi_i-\sum_{i=1}^n\alpha_i(y_i(wx_i+b)-1+\chi_i)-\sum_{i=1}^n\beta_i\chi_i$$

Setting $\frac{\partial L}{\partial w}=0$, $\frac{\partial L}{\partial b}=0$ and $\frac{\partial L}{\partial \chi}=0$ gives:
$$\begin{array}{ll}
w=\sum_{i=1}^n\alpha_iy_ix_i\\\sum\alpha_iy_i=0\\C-\alpha_i-\beta_i=0
\end{array}\tag{stationarity}$$

So the dual problem is:
$$\max\{-\frac{1}{2}\sum_{i=1}^n\sum_{j=1}^ny_iy_j\alpha_i\alpha_j(x_i,x_j)+\sum_{i=1}^n\alpha_i\}$$

$$s.t. \sum_{i=1}^n\alpha_iy_i=0,0\leqslant \alpha_i\leqslant C$$
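In practice this dual is rarely solved by hand; the chapter's svm.py hands the problem to scikit-learn. A minimal, self-contained sketch, where the random 2-D data stands in for the emotion feature vectors:
~~~py
import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
X = np.vstack((rng.randn(50, 2) + 2, rng.randn(50, 2) - 2))  # two separable clouds
y = np.array([0] * 50 + [1] * 50)

clf = svm.SVC(C=1.0, kernel='rbf')  # C bounds the dual variables: 0 <= alpha_i <= C
clf.fit(X, y)
print(clf.support_vectors_.shape)   # only the support vectors define the decision
print(clf.predict([[1.5, 1.8]]))
~~~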
--------------------------------------------------------------------------------
/chapter12_情感识别/KNN/A_fear.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/KNN/A_fear.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/KNN/F_happiness.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/KNN/F_happiness.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/KNN/N_neutral.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/KNN/N_neutral.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/KNN/T_sadness.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/KNN/T_sadness.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/KNN/W_anger.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/KNN/W_anger.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/KNN/knn.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.io import loadmat
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt


def confusion_matrix_info(y_true, y_pred, labels=['fear', 'happy', 'neutr', 'sad', 'anger'],
                          title='confusion matrix'):
    """
    Compute the confusion matrix and some evaluation metrics, and plot the matrix.
    :param y_true: true labels (not one-hot)
    :param y_pred: predicted labels (not one-hot)
    :param labels: meaning of the labels
    :param title: title of the plot
    :return:
    """
    import seaborn as sns
    import pandas as pd
    C2 = confusion_matrix(y_true, y_pred)
    C = pd.DataFrame(C2, columns=labels, index=labels)
    m, _ = C2.shape
    for i in range(m):
        precision = C2[i, i] / sum(C2[:, i])
        recall = C2[i, i] / sum(C2[i, :])
        f1 = 2 * precision * recall / (precision + recall)
        print('In class {}:\t total samples: {}\t true predict samples: {}\t'
              'acc={:.4f},\trecall={:.4f},\tf1-score={:.4f}'.format(
            labels[i], sum(C2[i, :]), C2[i, i], precision, recall, f1))
    print('-' * 100, '\n', 'average f1={:.4f}'.format(f1_score(y_true, y_pred, average='micro')))

    f, ax = plt.subplots()
    sns.heatmap(C, annot=True, ax=ax, cmap=plt.cm.binary)
    ax.set_title(title)
    ax.set_xlabel('predict')
    ax.set_ylabel('true')
    plt.show()


def get_most_label(result):
    rst = {}
    for r in result:
        if r not in rst.keys():
            rst[r] = 1
        else:
            rst[r] += 1
    m = sorted(rst.items(), key=lambda x: x[1], reverse=True)
    return m[0][0]


K = 9

fear = loadmat('A_fear.mat')['fearVec']
happy = loadmat('F_happiness.mat')['hapVec']
neutral = loadmat('N_neutral.mat')['neutralVec']
sadness = loadmat('T_sadness.mat')['sadnessVec']
anger = loadmat('W_anger.mat')['angerVec']

data = np.hstack((fear, happy, neutral, sadness, anger))
y = np.array([[i] * 50 for i in range(5)]).flatten()
per = np.random.permutation(250)
data_train = data[:, per[:180]]
label_train = y[per[:180]]
data_test = data[:, per[180:]]
label_test = y[per[180:]]
label_pred = np.zeros(250 - 180)
j = 0
for test in data_test.T:
    scores = np.zeros(len(data_train.T))
    for i in range(len(data_train.T)):
        scores[i] = np.sum(np.power(test - data_train[:, i], 2))
    pos = np.argsort(scores)[:K]
    result = label_train[pos]
    label = get_most_label(result)
    label_pred[j] = label
    j += 1

confusion_matrix_info(label_test, label_pred)
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/A_fear.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/NNs/A_fear.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/F_happiness.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/NNs/F_happiness.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/LVQ.py:
--------------------------------------------------------------------------------
import numpy as np
import random
from scipy.io import loadmat
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt


def confusion_matrix_info(y_true, y_pred, labels=['fear', 'happy', 'neutr', 'sad', 'anger'],
                          title='confusion matrix'):
    """
    Compute the confusion matrix and some evaluation metrics, and plot the matrix.
    :param y_true: true labels (not one-hot)
    :param y_pred: predicted labels (not one-hot)
    :param labels: meaning of the labels
    :param title: title of the plot
    :return:
    """
    import seaborn as sns
    import pandas as pd
    C2 = confusion_matrix(y_true, y_pred)
    C = pd.DataFrame(C2, columns=labels, index=labels)
    m, _ = C2.shape
    for i in range(m):
        precision = C2[i, i] / sum(C2[:, i])
        recall = C2[i, i] / sum(C2[i, :])
        f1 = 2 * precision * recall / (precision + recall)
        print('In class {}:\t total samples: {}\t true predict samples: {}\t'
              'acc={:.4f},\trecall={:.4f},\tf1-score={:.4f}'.format(
            labels[i], sum(C2[i, :]), C2[i, i], precision, recall, f1))
    print('-' * 100, '\n', 'average f1={:.4f}'.format(f1_score(y_true, y_pred, average='micro')))

    f, ax = plt.subplots()
    sns.heatmap(C, annot=True, ax=ax, cmap=plt.cm.binary)
    ax.set_title(title)
    ax.set_xlabel('predict')
    ax.set_ylabel('true')
    plt.show()


class LVQnet:
    def __init__(self, input_sz, output_sz, groups):
        '''
        Initialize with the input dimension and the number of output classes.
        groups describes the grouping of the competitive layer, e.g. [1,2,3,2]
        means 8 hidden neurons in total, in 4 output groups.
        '''
        assert len(groups) == output_sz
        self.groups = groups
        self.hidden_sz = sum(groups)
        # randomly initialize the prototype vectors of the neurons
        self.prototype = np.random.rand(self.hidden_sz, input_sz) * 0.01
        self.hidden2out = np.zeros((output_sz, self.hidden_sz))
        cnt = 0
        for i in range(len(groups)):
            for j in range(groups[i]):
                self.hidden2out[i][cnt] = 1
                cnt += 1

    def fit(self, X, Y, lr=0.5, iterations=10000):
        N = len(X)
        for t in range(iterations):
            gamma = lr * (1 - t / iterations)  # learning rate decays to 0
            idx = random.randint(0, N - 1)
            x = X[idx]
            out = self.predict(x)
            y = Y[idx]
            delta = abs(out - y)
            sign = int(np.sum(delta) == 0) * 2 - 1  # +1 if correct, -1 otherwise
            # move the winner's prototype toward (or away from) the sample, per delta
            self.prototype[self.winner] += gamma * sign * self.v[self.winner]

    def predict_mat(self, x):
        l = x.shape[0]
        result = []
        for i in range(l):
            out = self.predict(x[i, :])
            result.append(out)
        return np.array(result)

    def predict(self, x):
        x = np.tile(x, (self.hidden_sz, 1))
        v = x - self.prototype
        self.v = v
        distance = np.sum(v ** 2, axis=1).reshape(-1)
        winner = np.argmin(distance)
        self.winner = winner
        out = np.zeros((self.hidden_sz, 1))
        out[winner][0] = 1
        out = self.hidden2out.dot(out).reshape(-1)
        return out


fear = loadmat('A_fear.mat')['fearVec']
happy = loadmat('F_happiness.mat')['hapVec']
neutral = loadmat('N_neutral.mat')['neutralVec']
sadness = loadmat('T_sadness.mat')['sadnessVec']
anger = loadmat('W_anger.mat')['angerVec']

data = np.hstack((fear, happy, neutral, sadness, anger))
y = np.array([[i] * 50 for i in range(5)]).flatten()
yy = to_categorical(y)
network = LVQnet(140, 5, [50] * 5)
network.fit(data.T, yy, lr=0.05, iterations=100000)
yp = network.predict_mat(data.T)
yp = np.argmax(yp, axis=1)

confusion_matrix_info(y, yp)
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/N_neutral.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/NNs/N_neutral.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/T_sadness.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/NNs/T_sadness.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/W_anger.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/NNs/W_anger.mat
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/pca_lda_sklearn.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.io import loadmat
from sklearn.metrics import confusion_matrix, f1_score, classification_report
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier


def confusion_matrix_info(y_true, y_pred, labels=['fear', 'happy', 'neutr', 'sad', 'anger'],
                          title='confusion matrix'):
    """
    Compute the confusion matrix and some evaluation metrics, and plot the matrix.
    :param y_true: true labels (not one-hot)
    :param y_pred: predicted labels (not one-hot)
    :param labels: meaning of the labels
    :param title: title of the plot
    :return:
    """
    import seaborn as sns
    import pandas as pd
    C2 = confusion_matrix(y_true, y_pred)
    C = pd.DataFrame(C2, columns=labels, index=labels)
    m, _ = C2.shape
    for i in range(m):
        precision = C2[i, i] / sum(C2[:, i])
        recall = C2[i, i] / sum(C2[i, :])
        f1 = 2 * precision * recall / (precision + recall)
        print('In class {}:\t total samples: {}\t true predict samples: {}\t'
              'acc={:.4f},\trecall={:.4f},\tf1-score={:.4f}'.format(
            labels[i], sum(C2[i, :]), C2[i, i], precision, recall, f1))
    print('-' * 100, '\n', 'average f1={:.4f}'.format(f1_score(y_true, y_pred, average='micro')))

    f, ax = plt.subplots()
    sns.heatmap(C, annot=True, ax=ax, cmap=plt.cm.binary)
    ax.set_title(title)
    ax.set_xlabel('predict')
    ax.set_ylabel('true')
    plt.show()


if __name__ == '__main__':
    # load the data
    fear = loadmat('A_fear.mat')['fearVec']
    happy = loadmat('F_happiness.mat')['hapVec']
    neutral = loadmat('N_neutral.mat')['neutralVec']
    sadness = loadmat('T_sadness.mat')['sadnessVec']
    anger = loadmat('W_anger.mat')['angerVec']
    data = np.hstack((fear, happy, neutral, sadness, anger)).T
    y = np.array([[i] * 50 for i in range(5)]).flatten()
    # PCA dimensionality reduction: project the 250 samples onto 10 components
    data_re = PCA(10).fit_transform(data)
    # classification
    clf = svm.SVC()
    clf.fit(data_re[::2], y[::2])
    yp = clf.predict(data_re[::3])
    confusion_matrix_info(y[::3], yp, title='PCA')
    print(classification_report(y[::3], yp, target_names=['fear', 'happy', 'neutr', 'sad', 'anger']))
    print('-' * 100)
    # LDA dimensionality reduction and classification
    lda = LinearDiscriminantAnalysis()
    lda.fit(data[::2], y[::2])
    yp = lda.predict(data[::3])
    confusion_matrix_info(y[::3], yp, title='LDA')
    print(classification_report(y[::3], yp, target_names=['fear', 'happy', 'neutr', 'sad', 'anger']))
--------------------------------------------------------------------------------
/chapter12_情感识别/NNs/svm.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.io import loadmat
from sklearn.metrics import confusion_matrix, f1_score, classification_report
import matplotlib.pyplot as plt
from sklearn import svm


def confusion_matrix_info(y_true, y_pred, labels=['fear', 'happy', 'neutr', 'sad', 'anger'],
                          title='confusion matrix'):
    """
    Compute the confusion matrix and some evaluation metrics, and plot the matrix.
    :param y_true: true labels (not one-hot)
    :param y_pred: predicted labels (not one-hot)
    :param labels: meaning of the labels
    :param title: title of the plot
    :return:
    """
    import seaborn as sns
    import pandas as pd
    C2 = confusion_matrix(y_true, y_pred)
    C = pd.DataFrame(C2, columns=labels, index=labels)
    m, _ = C2.shape
    for i in range(m):
        precision = C2[i, i] / sum(C2[:, i])
        recall = C2[i, i] / sum(C2[i, :])
        f1 = 2 * precision * recall / (precision + recall)
        print('In class {}:\t total samples: {}\t true predict samples: {}\t'
              'acc={:.4f},\trecall={:.4f},\tf1-score={:.4f}'.format(
            labels[i], sum(C2[i, :]), C2[i, i], precision, recall, f1))
    print('-' * 100, '\n', 'average f1={:.4f}'.format(f1_score(y_true, y_pred, average='micro')))

    f, ax = plt.subplots()
    sns.heatmap(C, annot=True, ax=ax, cmap=plt.cm.binary)
    ax.set_title(title)
    ax.set_xlabel('predict')
    ax.set_ylabel('true')
    plt.show()

if __name__ == '__main__': 41 | fear = loadmat('A_Fear.mat')['fearVec'] 42 | happy = loadmat('F_Happiness.mat')['hapVec'] 43 | neutral = loadmat('N_neutral.mat')['neutralVec'] 44 | sadness = loadmat('T_sadness.mat')['sadnessVec'] 45 | anger = loadmat('W_anger.mat')['angerVec'] 46 | 47 | data = np.hstack((fear, happy, neutral, sadness, anger)).T 48 | y = np.array([[i] * 50 for i in range(5)]).flatten() 49 | clf = svm.SVC() 50 | train = [(i + 1) % 5 != 0 for i in range(250)] 51 | test = [(i + 1) % 5 == 0 for i in range(250)] 52 | clf.fit(data[train], y[train]) 53 | yp = clf.predict(data[test]) 54 | confusion_matrix_info(y[test], yp) 55 | print(classification_report(y[test], yp, target_names=['fear', 'happy', 'neutr', 'sad', 'anger'])) 56 | -------------------------------------------------------------------------------- /chapter12_情感识别/images/情感识别框图.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter12_情感识别/images/情感识别框图.jpg -------------------------------------------------------------------------------- /chapter2_基础/2.2语音编辑.mdown: -------------------------------------------------------------------------------- 1 | ## 信号相加 2 | 读取了语音信号之后可以看到是一个一维数组,可以直接通过一维数组(列表)的形式进行操作。在两个序列长度不一样的时候,可以在短的一个序列后补零。 3 | ~~~py 4 | class soundBase: 5 | def __init__(self, path): 6 | self.path = path 7 | 8 | def sound_add(self, data1, data2): 9 | if len(data1) < len(data2): 10 | tmp = np.zeros([len(data2)]) 11 | for i in range(len(data1)): 12 | tmp[i] += data1[i] 13 | return tmp + data2 14 | elif len(data1) > len(data2): 15 | tmp = np.zeros([len(data1)]) 16 | for i in range(len(data2)): 17 | tmp[i] += data2[i] 18 | return tmp + data1 19 | else: 20 | return data1 + data2 21 | ~~~ 22 | 23 | ## 卷积 24 | 卷积是一个常用的计算,两个序列$x_1,x_2$的卷积表达式为: 25 | $$y(n)=\sum\limits_{k=-\infty}^{\infty}x_1(k)x_2(n-k)$$ 26 | 27 | 对于离散信号来说: 28 | $$y(n)=\sum\limits_{k=0}^{N}x_1(k)x_2(n-k)$$ 29 | 30 | 计算方式可以参考[一维信号的卷积认识](https://blog.csdn.net/sinat_18131557/article/details/103432004),在python中可以直接调用`np.convolve`完成。 31 | 32 | ## 采样频率的转化 33 | 采样频率的转化是为了做升采样和降采样。降采样是对序列$x(n)$间隔$D-1$个点进行抽取: 34 | $$x_D(m)=x(Dm)\tag{抽取}$$ 35 | 36 | 其中$D$为正整数。为了避免抽取序列后频谱混叠,通常在抽取前将信号通过一个抗混叠滤波器。 37 | 内插就是在原序列的样本点之间插入$I-1$个值,原始序列为$x(n)$,内插后的序列为$x_I(m)$ 38 | $$x_I(m)=\left\{\begin{array}{ll} 39 | x(\frac{m}{I})&,m=0,±I,±2I...\\ 40 | 0&,others 41 | \end{array}\right.\tag{内插}$$ 42 | 43 | 内插之后,通过低通滤波器,抑制镜像混叠分量。 44 | 45 | 在matlab中都可以通过`resample`函数来进行,通过配置参数的不同即可。在python的实现中,利用`audiowrite`的参数`fs`来实现,在读取信号的时候,输出fs,然后对fs进行变换后写入。 46 | ~~~py 47 | class soundBase: 48 | def __init__(self, path): 49 | self.path = path 50 | 51 | def audiowrite(self, data, fs, binary=True, channel=1, path=[]): 52 | if binary: 53 | wf = wave.open(self.path, 'wb') 54 | wf.setframerate(fs) 55 | wf.setnchannels(channel) 56 | wf.setsampwidth(2) 57 | wf.writeframes(b''.join(data)) 58 | else: 59 | if len(path) == 0: 60 | path = self.path 61 | wavfile.write(path, fs, data) 62 | 63 | def audioread(self): 64 | fs, data = wavfile.read(self.path) 65 | return data, fs 66 | 67 | sb = soundBase('C2_2_y.wav') 68 | data, fs = sb.audioread() 69 | sb_cc = soundBase('C2_2_y_conved_2.wav') 70 | sb_cc.audiowrite(data, fs * 2) 71 | ~~~ 72 | -------------------------------------------------------------------------------- /chapter2_基础/2.3声强与响度.mdown: -------------------------------------------------------------------------------- 1 | ## 声压 2 | 声压是定量描述声波的最基本的物理量,它是由于声扰动产生的逾量压强,是空间位置和时间的函数。由于声压的测量比较易于实现,而且通过声压的测量也可以间接求得质点振速等其他声学参量,因此,声压已成为人们最为普遍采用的定量描述声波性质的物理量。 3 | ### 有效声压 4 | 通常讲的声压指的是有效声压,即在一定时间间隔内将瞬时声压对时间求方均根值所得。设语音长度为$T$, 离散点数为$N$, 则有效声压的计算公式为 5 | $$P_e=\sqrt{\frac{1}{T}\sum\limits_{n=1}^Nx^2\Delta t}=\sqrt{\frac{1}{N\Delta t}\sum\limits_{n=1}^Nx^2\Delta t}=\sqrt{\frac{1}{N}\sum\limits_{n=1}^Nx^2}$$ 6 | 7 | 8 | 9 | 其中$x$表示采样点。只要保证所取的点数$N$足够大,即可保证计算的准确性。用于计算声压级值的语音帧长一般为20ms、50ms、100ms、200ms以及500ms。计算的结果是一个序列。 10 | 11 | ~~~py 12 | class soundBase: 13 | def __init__(self, path): 14 | self.path = path 15 | 16 | def audioread(self, formater='sample'): 17 | """ 18 | 读取语音文件 19 | 2020-2-26 Jie Y. Init 20 | :param formater: 获取数据的格式,为sample时,数据为float32的,[-1,1],同matlab同名函数. 否则为文件本身的数据格式 21 | :return: 语音数据data, 采样率fs 22 | """ 23 | fs, data = wavfile.read(self.path) 24 | if formater == 'sample': 25 | data, _ = librosa.load(self.path, sr=fs) 26 | return data, fs 27 | 28 | def SPL(self, data, fs, frameLen=100, isplot=True): 29 | """ 30 | 计算声压曲线 31 | 2020-2-26 Jie Y. Init 32 | :param data: 语音信号数据 33 | :param fs: 采样率 34 | :param frameLen: 计算声压的时间长度(ms单位) 35 | :param isplot: 是否绘图,默认是 36 | :return: 返回声压列表spls 37 | """ 38 | 39 | def spl_cal(s, fs, frameLen): 40 | """ 41 | 根据数学公式计算单个声压级值 42 | $P_e=\sqrt{\frac{1}{N}\sum_{i=1}^Nx^2(i)}$ 43 | 2020-2-26 Jie Y. Init 44 | :param s: 输入数据 45 | :param fs: 采样率 46 | :param frameLen: 计算声压的时间长度(ms单位) 47 | :return: 单个声压级数值(dB) 48 | """ 49 | l = len(s) 50 | M = frameLen * fs / 1000 51 | if not l == M: 52 | exit('输入信号长度与所定义帧长不等!') 53 | # 计算有效声压 54 | pp = 0 55 | for i in range(int(M)): 56 | pp += (s[i] * s[i]) 57 | pa = np.sqrt(pp / M) 58 | p0 = 2e-5 59 | spl = 20 * np.log10(pa / p0) 60 | return spl 61 | 62 | length = len(data) 63 | M = fs * frameLen // 1000 64 | m = length % M 65 | if not m < M // 2: 66 | # 最后一帧长度不小于M的一半,补零凑成整帧 67 | data = np.hstack((data, np.zeros(M - m))) 68 | else: 69 | # 最后一帧长度小于M的一半,直接舍弃 70 | data = data[:M * (length // M)] 71 | spls = np.zeros(len(data) // M) 72 | for i in range(len(data) // M): 73 | s = data[i * M:(i + 1) * M] 74 | spls[i] = spl_cal(s, fs, frameLen) 75 | 76 | if isplot: 77 | plt.subplot(211) 78 | plt.plot(data) 79 | plt.subplot(212) 80 | plt.step([i for i in range(len(spls))], spls) 81 | plt.show() 82 | return spls 83 | 84 | 85 | sb = soundBase('C2_3_y.wav') 86 | data, fs = sb.audioread() 87 | sb.SPL(data, fs) 88 | ~~~ 89 | ### 声压级 90 | 声音的有效声压与基准声压之比,取以10为底的对数,再乘以20,即为声压级,通常以符号$L_p$表示,单位为dB。 91 | $$L_p=20\lg\frac{P_e}{p_{ref}}(dB)$$ 92 | 93 | 其中,$P_e$为待测声压的有效值,$p_{ref}$为参考声压,在空气中一般取$2\times 10^{-5}Pa$。 94 | 95 | 96 | 97 | ## 声强 98 | 在物理学中,声波在单位时间内作用在与其传递方向垂直的单位面积上的能量称为声强。日常生活中能听到的声音其强度范围很大,最大和最小之间可达$10^{12}$倍。 99 | ### 声强级 100 | 101 | 用声强的物理学单位表示声音强弱很不方便。当人耳听到两个强度不同的声音时,感觉的大小大致上与两个声强比值的对数成比例。因此,用对数尺度来表示声音强度的等级,其单位为分贝(dB)。 102 | $$L_I=10\lg (I/I_0)(dB)$$ 103 | 104 | 在声学中,$I_0=1\times 10^{-12}W/m^2$。 105 | ### 声压与声强的关系 106 | 对于球面波和平面波,声压与声强的关系是: 107 | $$I=P^2/(\rho·c)$$ 108 | 其中,$\rho$为空气密度,$c$为声速,在标准大气压和20摄氏度的环境下,$\rho·c=408 Pa·s/m$,这个数值对应的单位也叫瑞利(rayl),它就是空气对声波的特性阻抗。 109 | 110 | ## 响度 111 | 响度描述的是声音的响亮程度,表示人耳对声音的主观感受,其计量单位是宋。定义声压级为40dB的1 kHz纯音的响度为1 Sone(宋)。人耳对声音的感觉,不仅和声压有关,还和频率有关。声压级相同,频率不同的声音,听起来响亮程度也不同。如空压机与电锯,同是100dB声压级的噪声,听起来电锯声要响得多。按人耳对声音的感觉特性,依据声压和频率定出人对声音的主观音响感觉量,称为响度级,单位为方,符号phon。根据国际协议规定,0dB声级的1000 Hz纯音的响度级定义为0 phon。其他频率声音的声级与响度级的对应关系,要从等响度曲线才能查出。 -------------------------------------------------------------------------------- /chapter2_基础/C2_1_y.wav: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter2_基础/C2_1_y.wav -------------------------------------------------------------------------------- /chapter2_基础/C2_2_1.py: -------------------------------------------------------------------------------- 1 | from soundBase import soundBase 2 | from random import randint, random 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | # 2.2 练习1 7 | sb = soundBase('C2_2_y.wav') 8 | # 读取语音 9 | ''' 10 | 这里的wavfile.read()函数修改了里面的代码,返回项return fs, data 改为了return fs, data, bit_depth 11 | 如果这里报错,可以将wavfile.read()修改。 12 | :param formater: 获取数据的格式,为sample时,数据为float32的,[-1,1],同matlab同名函数. 否则为文件本身的数据格式 13 | 指定formater为任意非sample字符串,则返回原始数据。 14 | :return: 语音数据data, 采样率fs,数据位数bits 15 | ''' 16 | data, fs, nbits = sb.audioread() 17 | print(fs) 18 | max_data = max(data) 19 | noise = [random() * 0.1 for i in range(len(data))] 20 | fixed2 = sb.sound_add(data, noise) 21 | plt.subplot(311) 22 | plt.plot(data) 23 | plt.subplot(312) 24 | plt.plot(noise) 25 | plt.subplot(313) 26 | plt.plot(fixed2) 27 | plt.show() 28 | sb_f = soundBase('C2_2_y_noised.wav') 29 | sb_f.audiowrite(fixed2, fs) 30 | # sb_f.audioplayer() 31 | 32 | # 2.2 练习2 33 | conved = np.convolve(data, noise, 'same') 34 | sb_c = soundBase('C2_2_y_conved.wav') 35 | sb_c.audiowrite(conved, fs) 36 | # sb_c.audioplayer() 37 | 38 | # 2.2 练习3 39 | plt.subplot(211) 40 | x = [i / fs for i in range(len(data))] 41 | plt.plot(x, data) 42 | sb_cc = soundBase('C2_2_y_conved_2.wav') 43 | sb_c.audiowrite(data, fs * 2) 44 | ''' 45 | 这里的wavfile.read()函数修改了里面的代码,返回项return fs, data 改为了return fs, data, bit_depth 46 | 如果这里报错,可以将wavfile.read()修改。 47 | :param formater: 获取数据的格式,为sample时,数据为float32的,[-1,1],同matlab同名函数. 否则为文件本身的数据格式 48 | 指定formater为任意非sample字符串,则返回原始数据。 49 | :return: 语音数据data, 采样率fs,数据位数bits 50 | ''' 51 | data, fs_, nbits = sb_c.audioread() 52 | x = [i / fs_ for i in range(len(data))] 53 | print(fs_) 54 | plt.subplot(212) 55 | plt.plot(x, data) 56 | plt.show() 57 | -------------------------------------------------------------------------------- /chapter2_基础/C2_2_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter2_基础/C2_2_y.wav -------------------------------------------------------------------------------- /chapter2_基础/C2_2_y_conved.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter2_基础/C2_2_y_conved.wav -------------------------------------------------------------------------------- /chapter2_基础/C2_2_y_noised.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter2_基础/C2_2_y_noised.wav -------------------------------------------------------------------------------- /chapter2_基础/C2_3_1.py: -------------------------------------------------------------------------------- 1 | from soundBase import soundBase 2 | from random import randint 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | # 2.2 练习1 7 | sb = soundBase('C2_3_y.wav') 8 | data, fs, nbits = sb.audioread() 9 | # sb.SPL(data, fs) 10 | spl, freq = sb.iso226(50) 11 | -------------------------------------------------------------------------------- 
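上面 C2_3_1.py 中调用的 `sb.iso226(50)` 来自仓库 soundBase 类的等响度曲线计算(源码在 soundBase.py 中,此处未展示)。下面补充一个与 2.3 节有效声压、声压级公式直接对应的最小示意,便于脱离该类快速验证;其中 `spl_demo` 为本文虚构的演示函数,并非仓库内的接口:
~~~py
import numpy as np

def spl_demo(x, p0=2e-5):
    # 按 2.3 节公式:先求有效声压(方均根),再取 20*lg(Pe/p0) 得声压级(dB)
    pe = np.sqrt(np.mean(np.asarray(x, dtype=float) ** 2))
    return 20 * np.log10(pe / p0)

# 一个幅度为 0.02 的 1kHz 正弦(假设采样率 16kHz),声压级约 57dB
x = 0.02 * np.sin(2 * np.pi * 1000 * np.arange(16000) / 16000)
print(spl_demo(x))
~~~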
/chapter2_基础/C2_3_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter2_基础/C2_3_y.wav -------------------------------------------------------------------------------- /chapter2_基础/C2_4_s.py: -------------------------------------------------------------------------------- 1 | from soundBase import soundBase 2 | 3 | sb = soundBase('a.wav') 4 | y = sb.vowel_generate(16000) 5 | sb.audiowrite(y, 16000) 6 | sb.soundplot() 7 | sb.audioplayer() 8 | -------------------------------------------------------------------------------- /chapter2_基础/a.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter2_基础/a.wav -------------------------------------------------------------------------------- /chapter2_基础/audioplayer.py: -------------------------------------------------------------------------------- 1 | """PyAudio Example: Play a WAVE file.""" 2 | 3 | import pyaudio 4 | import wave 5 | 6 | CHUNK = 1024 7 | FILENAME = 'C2_1_y.wav' 8 | 9 | 10 | def player(filename=FILENAME): 11 | wf = wave.open(filename, 'rb') 12 | 13 | p = pyaudio.PyAudio() 14 | 15 | stream = p.open(format=p.get_format_from_width(wf.getsampwidth()), 16 | channels=wf.getnchannels(), 17 | rate=wf.getframerate(), 18 | output=True) 19 | 20 | data = wf.readframes(CHUNK) 21 | 22 | while data != b'': 23 | stream.write(data) 24 | data = wf.readframes(CHUNK) 25 | 26 | stream.stop_stream() 27 | stream.close() 28 | 29 | p.terminate() 30 | 31 | 32 | player(FILENAME) 33 | -------------------------------------------------------------------------------- /chapter2_基础/audiorecorder.py: -------------------------------------------------------------------------------- 1 | import pyaudio 2 | import wave 3 | 4 | CHUNK = 1024 5 | FORMAT = pyaudio.paInt16 6 | CHANNELS = 2 7 | RATE = 16000 8 | RECORD_SECONDS = 2 9 | WAVE_OUTPUT_FILENAME = "Oldboy.wav" 10 | 11 | p = pyaudio.PyAudio() 12 | 13 | stream = p.open(format=FORMAT, 14 | channels=CHANNELS, 15 | rate=RATE, 16 | input=True, 17 | frames_per_buffer=CHUNK) 18 | 19 | print("开始录音,请说话......") 20 | 21 | frames = [] 22 | 23 | for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)): 24 | data = stream.read(CHUNK) 25 | frames.append(data) 26 | 27 | print("录音结束,请闭嘴!") 28 | 29 | stream.stop_stream() 30 | stream.close() 31 | p.terminate() 32 | 33 | wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb') 34 | wf.setnchannels(CHANNELS) 35 | wf.setsampwidth(p.get_sample_size(FORMAT)) 36 | wf.setframerate(RATE) 37 | wf.writeframes(b''.join(frames)) 38 | wf.close() 39 | -------------------------------------------------------------------------------- /chapter2_基础/audiowriter.py: -------------------------------------------------------------------------------- 1 | import librosa # 填充,默认频率为22050,可以改变频率 2 | from scipy.io import wavfile # 原音无损 3 | import numpy as np 4 | import librosa.display 5 | import matplotlib.pyplot as plt 6 | 7 | fs, data = wavfile.read('C2_3_y.wav') # 原始频率,原始数据 8 | print("长度 = {0} 秒".format(len(data) / fs)) 9 | 10 | data1, sample_rate = librosa.load('C2_3_y.wav', sr=fs) 11 | print("长度 = {0} 秒".format(len(data1) / sample_rate)) 12 | plt.plot(data1) 13 | plt.show() 14 | 15 | # path = 'C2_1_y_2.wav' 16 | # librosa.output.write_wav(path, data.astype(np.float32), sr=sample_rate) 17 | 18 | plt.figure(figsize=(14, 5)) 19 | 
librosa.display.waveplot(data1, sample_rate) 20 | plt.show() 21 | -------------------------------------------------------------------------------- /chapter3_分析实验/3.1语音分帧与加窗.mdown: -------------------------------------------------------------------------------- 1 | 一般来讲,语音信号的采样率是挺高的,而且认为语音信号在一定时间段的基本特性不会发生较大的变化,具有一定的短时平稳性。进行“短时分析”就是将信号分解成一段一段地来处理,每一段就叫一帧,大约10~30ms,也就是一秒钟大约33~100帧,对于通常10K的采样率来说,这样也能有一定的信息存在。分帧通常有一定的交叠部分,就是帧移。帧移与帧长的比通常为0~1/2。 2 | 3 | 分帧就是通过加窗函数实现的,假设原始信号为$s(n)$,窗函数为$w(n)$,那么分帧就是$s_w(n)=s(n)*w(n)$,窗函数需要满足:1. 窗口两端不引起急剧变化,应该平滑过渡到0;2. 在频域有较宽的3dB带宽以及较小的边带最大值。窗口的长度一般为10~20ms。有三种常见的窗函数为: 4 | - 矩形窗 5 | $$w(n)=\left\{\begin{array}{ll} 6 | 1&,0\leqslant n \leqslant N-1\\ 7 | 0&,others 8 | \end{array}\right. \tag{矩形窗}$$ 9 | 10 | - 汉明窗 11 | 12 | $$w(n)=\left\{\begin{array}{ll} 13 | 0.54-0.46\cos[2\pi n/(N-1)]&,0\leqslant n \leqslant N-1\\ 14 | 0&,others 15 | \end{array}\right.\tag{汉明窗}$$ 16 | 17 | - 海宁窗 18 | 19 | $$w(n)=\left\{\begin{array}{ll} 20 | 0.5(1-\cos(2\pi n/(N-1)))&,0\leqslant n \leqslant N-1\\ 21 | 0&,others 22 | \end{array}\right.\tag{海宁窗}$$ 23 | 24 | 其图形如下: 25 | ![.\images\window.png](images/window.png) 26 | 27 | ~~~py 28 | import matplotlib.pyplot as plt 29 | import numpy as np 30 | 31 | plt.rcParams['font.family'] = ['sans-serif'] 32 | plt.rcParams['font.sans-serif'] = ['SimHei'] 33 | 34 | N = 32 35 | nn = [i for i in range(N)] 36 | plt.subplot(3, 1, 1) 37 | plt.stem(np.ones(N)) 38 | plt.title('(a)矩形窗') 39 | 40 | w = 0.54 - 0.46 * np.cos(np.multiply(nn, 2 * np.pi) / (N - 1)) 41 | plt.subplot(3, 1, 2) 42 | plt.stem(w) 43 | plt.title('(b)汉明窗') 44 | 45 | w = 0.5 * (1 - np.cos(np.multiply(nn, 2 * np.pi) / (N - 1))) 46 | plt.subplot(3, 1, 3) 47 | plt.stem(w) 48 | plt.title('(c)海宁窗') 49 | # plt.show() 50 | plt.savefig('images/window.png') 51 | plt.close() 52 | ~~~ 53 | 54 | 分帧操作,相当于将信号分解为若干个信号片段,并将片段与窗函数进行对应元素的乘法。 55 | ~~~py 56 | from scipy.io import wavfile 57 | import numpy as np 58 | import matplotlib.pyplot as plt 59 | 60 | plt.rcParams['font.family'] = ['sans-serif'] 61 | plt.rcParams['font.sans-serif'] = ['SimHei'] 62 | plt.rcParams['axes.unicode_minus'] = False 63 | 64 | 65 | def enframe(x, win, inc=None): 66 | nx = len(x) 67 | if isinstance(win, list): 68 | nwin = len(win) 69 | nlen = nwin # 帧长=窗长 70 | elif isinstance(win, int): 71 | nwin = 1 72 | nlen = win # 设置为帧长 73 | if inc is None: 74 | inc = nlen 75 | nf = (nx - nlen + inc) // inc 76 | frameout = np.zeros((nf, nlen)) 77 | indf = np.multiply(inc, np.array([i for i in range(nf)])) 78 | for i in range(nf): 79 | frameout[i, :] = x[indf[i]:indf[i] + nlen] 80 | if isinstance(win, list): 81 | frameout = np.multiply(frameout, np.array(win)) 82 | return frameout 83 | 84 | 85 | fs, data = wavfile.read('C3_1_y.wav') 86 | inc = 100 87 | wlen = 200 88 | en = enframe(data, wlen, inc) 89 | i = input('起始帧(i):') 90 | i = int(i) 91 | tlabel = i 92 | plt.subplot(4, 1, 1) 93 | x = [i for i in range((tlabel - 1) * inc, (tlabel - 1) * inc + wlen)] 94 | plt.plot(x, en[tlabel, :]) 95 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 96 | plt.title('(a)当前波形帧号{}'.format(tlabel)) 97 | 98 | plt.subplot(4, 1, 2) 99 | x = [i for i in range((tlabel + 1 - 1) * inc, (tlabel + 1 - 1) * inc + wlen)] 100 | plt.plot(x, en[i + 1, :]) 101 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 102 | plt.title('(b)当前波形帧号{}'.format(tlabel + 1)) 103 | 104 | plt.subplot(4, 1, 3) 105 | x = [i for i in range((tlabel + 2 - 1) * inc, (tlabel + 2 - 1) * inc + wlen)] 106 | plt.plot(x, en[i + 2, :]) 107 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 108 | plt.title('(c)当前波形帧号{}'.format(tlabel + 2)) 109 | 110 | plt.subplot(4, 1, 4) 111 | x = [i for i in range((tlabel + 3 - 1) * inc, (tlabel + 3 - 1) * inc + wlen)] 112 | plt.plot(x, en[i + 3, :]) 113 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 114 | plt.title('(d)当前波形帧号{}'.format(tlabel + 3)) 115 | 116 | # plt.show() 117 | plt.savefig('images/en.png') 118 | plt.close() 119 | 120 | ~~~ 121 | 122 | ![分帧](images/en.png) 123 | 124 | 另外,也可以直接使用`numpy.hanning(N)`来获得窗函数。其中N就是点数。 -------------------------------------------------------------------------------- /chapter3_分析实验/3.3短时频域分析.mdown: -------------------------------------------------------------------------------- 1 | ## 短时傅里叶变换 2 | 3 | 语音信号是典型的非平稳信号,但是由于其非平稳性由发声器官的物理运动过程而产生,这种过程是相对变化缓慢的,在10~30ms以内可以认为是平稳的。傅里叶分析是分析线性系统和平稳信号稳态特性的手段,而短时傅里叶分析,是用稳态分析方法处理非平稳信号的一种方法。 4 | 5 | 假设语音波形时域信号为$x(l)$,加窗分帧处理后得到的第$n$帧语音信号为$x_n(m)$,那有: 6 | $$x_n(m)=w(m)x(n+m),1\leqslant m\leqslant N$$ 7 | 8 | 对分帧信号进行短时傅里叶变换就是: 9 | $$X_n(e^{jw})=\sum\limits_{m=1}^Nx_n(m)e^{-jwm}$$ 10 | 11 | 其中,定义角频率$w=2\pi k/N$,得到了离散的短时傅里叶变换(DFT)。实际上就是$X_n(e^{jw})$在频域的取样: 12 | $$X_n(e^{j\frac{2\pi k}{N}})=X_n(k)=\sum\limits_{m=1}^Nx_n(m)e^{-j\frac{2\pi km}{N}},1\leqslant k \leqslant N$$ 13 | 14 | 实际中,可以使用FFT算法完成$x_n(m)$到$X_n(k)$的转换。 15 | ~~~py 16 | def STFFT(x, win, nfft, inc): 17 | xn = enframe(x, win, inc) 18 | xn = xn.T 19 | y = np.fft.fft(xn, nfft, axis=0) 20 | return y[:nfft // 2, :] 21 | ~~~ 22 | 输入数据首先分帧处理,使用之前创建过的函数`enframe(x, win, inc)`。然后直接调用`np.fft.fft(xn, nfft, axis=0)`进行fft变换处理,中间有一个转置操作,是为了让时间轴作为横坐标,k作为纵坐标。 23 | 24 | ## 语谱图的表示 25 | 26 | 一般定义$|X_n(k)|$为$x_n(m)$的短时幅度谱估计,而时间$n$处的频谱能量密度函数$P(n,k)$表示为: 27 | $$P(n,k)=|X_n(k)|^2$$ 28 | 29 | 可以看出$P(n,k)$是一个非负的实数矩阵,以时间n作为横坐标,k作为纵坐标,就可以绘制一张热图(或灰度图),这就是语谱图。经过$10\lg P(n,k)$处理后,语谱图的单位就是dB,将变换后的矩阵进行图像和色彩映射后,就能得到彩色的语谱图。 30 | 31 | 语谱图中的横杠表示它们是共振峰,从横杠对应的频率和宽度可以确定相应的共振峰的频率和带宽,在一个语音段中,有没有横杠的出现是判断是不是浊音的重要标志。竖条是语谱图中与时间轴垂直的条纹,每个竖直条表示一个基音,条纹的起点相当于声门脉冲的起点,条纹之间的距离表示基音周期。 32 | 33 | 在python中,读取到语音信号以后可以直接使用 34 | ~~~py 35 | plt.specgram(data, NFFT=256, Fs=fs, window=np.hanning(256)) 36 | plt.ylabel('Frequency') 37 | plt.xlabel('Time(s)') 38 | plt.show() 39 | ~~~ 40 | 绘制语谱图。如果想要使用短时傅里叶变换得到的结果来做,那么首先看下输出的结果是一个复数矩阵,所以先求模后平方`np.abs(y)*np.abs(y)`,再用`plt.matshow`可以得到结果,不过这样的语谱图是上下颠倒的,使用`np.flip(np.abs(y)*np.abs(y), 0)`将数据上下翻转一下。在绘制之前最好转化为dB单位,就是以10取对数,不然啥也看不见,黑乎乎一片。 41 | ~~~py 42 | from chapter3_分析实验.windows import * 43 | from chapter3_分析实验.timefeature import * 44 | from chapter2_基础.soundBase import * 45 | 46 | 47 | def STFFT(x, win, nfft, inc): 48 | xn = enframe(x, win, inc) 49 | xn = xn.T 50 | y = np.fft.fft(xn, nfft, axis=0) 51 | return y[:nfft // 2, :] 52 | 53 | 54 | data, fs = soundBase('C3_3_y.wav').audioread() 55 | 56 | wlen = 256 57 | nfft = wlen 58 | win = hanning_window(wlen) 59 | inc = 128 60 | 61 | y = STFFT(data, win, nfft, inc) 62 | freq = [i * fs / wlen for i in range(wlen // 2)] 63 | frame = FrameTimeC(y.shape[1], wlen, inc, fs) 64 | 65 | plt.matshow(np.log10(np.flip(np.abs(y)*np.abs(y), 0))) 66 | plt.colorbar() 67 | plt.show() 68 | 69 | plt.specgram(data, NFFT=256, Fs=fs, window=np.hanning(256)) 70 | plt.ylabel('Frequency') 71 | plt.xlabel('Time(s)') 72 | plt.show() 73 | ~~~ 74 | 得到的语谱图大约是这样的: 75 | ![spec](images/spec.png) -------------------------------------------------------------------------------- /chapter3_分析实验/3.6线谱对转化.mdown: -------------------------------------------------------------------------------- 1 | 
由于$A(z)=1-\sum\limits_{i=1}^pa_iz^{-i}$作为线性预测误差滤波器,其倒数$H(z)=1/A(z)$为线性预测合成滤波器,这个滤波器常被用于重建语音。直接对$a_i$进行编码时,$H(z)$的稳定性得不到保障。使用线谱对(Line Spectrum Pair, LSP)作为线性预测系数的等价表示,可以提高其鲁棒性。 2 | 3 | LSP通过求解$p+1$阶对称和反对称多项式的共轭复根得到。其中$p+1$阶的对称和反对称多项式表示为: 4 | $$\begin{array}{lc} 5 | P(z)=A(z)+z^{-(p+1)}A(z^{-1})\\Q(z)=A(z)-z^{-(p+1)}A(z^{-1}) 6 | \end{array}$$ 7 | 8 | 其中: 9 | $$z^{-(p+1)}A(z^{-1})=z^{-(p+1)}-a_1z^{-(p)}-a_2z^{-(p-1)}-...a_pz^{-1}$$ 10 | 如果用$\omega_i$和$\theta_i$分别表示$P(z)$和$Q(z)$的第$i$个根,有: 11 | $$\begin{array}{lc} 12 | P(z)=(1+z^{-1})\prod_{i=1}^{p/2}(1-2\cos \omega_iz^{-1}+z^{-2})\\ 13 | Q(z)=(1-z^{-1})\prod_{i=1}^{p/2}(1-2\cos \theta_iz^{-1}+z^{-2}) 14 | \end{array}$$ 15 | 16 | $\omega_i$和$\theta_i$就是LSP系数对应的线谱频率(LSF)。由于LSP参数成对出现,并且反映信号的频谱特性,所以被称为线谱对。 17 | 18 | 根据定义可以很简单地将LSP转化为LPC: 19 | $$A(z)=\frac{1}{2}[P(z)+Q(z)]$$ -------------------------------------------------------------------------------- /chapter3_分析实验/C3_1_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/C3_1_y.wav -------------------------------------------------------------------------------- /chapter3_分析实验/C3_1_y_1.py: -------------------------------------------------------------------------------- 1 | from scipy.io import wavfile 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | plt.rcParams['font.family'] = ['sans-serif'] 6 | plt.rcParams['font.sans-serif'] = ['SimHei'] 7 | plt.rcParams['axes.unicode_minus'] = False 8 | 9 | 10 | def enframe(x, win, inc=None): 11 | nx = len(x) 12 | if isinstance(win, list) or isinstance(win, np.ndarray): 13 | nwin = len(win) 14 | nlen = nwin # 帧长=窗长 15 | elif isinstance(win, int): 16 | nwin = 1 17 | nlen = win # 设置为帧长 18 | if inc is None: 19 | inc = nlen 20 | nf = (nx - nlen + inc) // inc 21 | frameout = np.zeros((nf, nlen)) 22 | indf = np.multiply(inc, np.array([i for i in range(nf)])) 23 | for i in range(nf): 24 | frameout[i, :] = x[indf[i]:indf[i] + nlen] 25 | if isinstance(win, list) or isinstance(win, np.ndarray): 26 | frameout = np.multiply(frameout, np.array(win)) 27 | return frameout 28 | 29 | 30 | if __name__ == "__main__": 31 | fs, data, nbits = wavfile.read('C3_1_y.wav') 32 | 33 | 34 | inc = 100 35 | wlen = 200 36 | en = enframe(data, wlen, inc) 37 | i = input('起始帧(i):') 38 | i = int(i) 39 | tlabel = i 40 | plt.subplot(4, 1, 1) 41 | x = [i for i in range((tlabel - 1) * inc, (tlabel - 1) * inc + wlen)] 42 | plt.plot(x, en[tlabel, :]) 43 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 44 | plt.title('(a)当前波形帧号{}'.format(tlabel)) 45 | 46 | plt.subplot(4, 1, 2) 47 | x = [i for i in range((tlabel + 1 - 1) * inc, (tlabel + 1 - 1) * inc + wlen)] 48 | plt.plot(x, en[i + 1, :]) 49 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 50 | plt.title('(b)当前波形帧号{}'.format(tlabel + 1)) 51 | 52 | plt.subplot(4, 1, 3) 53 | x = [i for i in range((tlabel + 2 - 1) * inc, (tlabel + 2 - 1) * inc + wlen)] 54 | plt.plot(x, en[i + 2, :]) 55 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 56 | plt.title('(c)当前波形帧号{}'.format(tlabel + 2)) 57 | 58 | plt.subplot(4, 1, 4) 59 | x = [i for i in range((tlabel + 3 - 1) * inc, (tlabel + 3 - 1) * inc + wlen)] 60 | plt.plot(x, en[i + 3, :]) 61 | plt.xlim([(i - 1) * inc + 1, (i + 2) * inc + wlen]) 62 | plt.title('(d)当前波形帧号{}'.format(tlabel + 3)) 63 | 64 | # plt.show() 65 | plt.savefig('images/en.png') 66 | plt.close() 67 | 
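结合上面 3.6 节的公式,可以用 numpy 的多项式求根直接验证 LPC 与 LSP 的互相转化。以下是一个最小示意(`lsp_from_lpc`、`lpc_from_pq` 均为本文虚构的演示函数,`ar` 的约定与本仓库 lpc.py 中 `lpc_coeff` 的返回一致,即 `ar[0]=1`):
~~~py
import numpy as np

def lsp_from_lpc(ar):
    # 按定义构造 p+1 阶对称/反对称多项式 P(z)、Q(z)
    a = np.append(ar, 0)      # A(z) 的系数,补一个 0 到 p+2 维
    a_rev = a[::-1]           # 对应 z^{-(p+1)}A(z^{-1}) 的系数
    P, Q = a + a_rev, a - a_rev
    # P(z)、Q(z) 根的辐角即线谱频率 omega_i、theta_i(取正半平面并排序)
    wP = np.sort([w for w in np.angle(np.roots(P)) if w > 0])
    wQ = np.sort([w for w in np.angle(np.roots(Q)) if w > 0])
    return wP, wQ, P, Q

def lpc_from_pq(P, Q):
    # A(z) = [P(z) + Q(z)] / 2,由线谱对还原预测系数
    return (np.asarray(P) + np.asarray(Q)) / 2
~~~
可以用任一帧数据经 `lpc_coeff` 得到 `ar` 后代入检验:`lpc_from_pq` 还原出的系数应与补零后的 `ar` 一致。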
-------------------------------------------------------------------------------- /chapter3_分析实验/C3_1_y_2.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | plt.rcParams['font.family'] = ['sans-serif'] 5 | plt.rcParams['font.sans-serif'] = ['SimHei'] 6 | 7 | N = 32 8 | nn = [i for i in range(N)] 9 | plt.subplot(3, 1, 1) 10 | plt.stem(np.ones(N)) 11 | plt.title('(a)矩形窗') 12 | 13 | w = 0.54 - 0.46 * np.cos(np.multiply(nn, 2 * np.pi) / (N - 1)) 14 | plt.subplot(3, 1, 2) 15 | plt.stem(w) 16 | plt.title('(b)汉明窗') 17 | 18 | w = 0.5 * (1 - np.cos(np.multiply(nn, 2 * np.pi) / (N - 1))) 19 | plt.subplot(3, 1, 3) 20 | plt.stem(w) 21 | plt.title('(c)海宁窗') 22 | # plt.show() 23 | plt.savefig('images/window.png') 24 | plt.close() 25 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_2_y.py: -------------------------------------------------------------------------------- 1 | from scipy.io import wavfile 2 | import matplotlib.pyplot as plt 3 | from chapter3_分析实验.windows import * 4 | from chapter3_分析实验.timefeature import * 5 | from chapter2_基础.soundBase import * 6 | 7 | data, fs = soundBase('C3_2_y.wav').audioread() 8 | 9 | 10 | 11 | inc = 100 12 | wlen = 200 13 | win = hanning_window(wlen) 14 | N = len(data) 15 | time = [i / fs for i in range(N)] 16 | 17 | EN = STEn(data, win, inc) # 短时能量 18 | Mn = STMn(data, win, inc) # 短时平均幅度 19 | Zcr = STZcr(data, win, inc) # 短时过零率 20 | 21 | X = enframe(data, win, inc) 22 | X = X.T 23 | Ac = STAc(X) 24 | Ac = Ac.T 25 | Ac = Ac.flatten() 26 | 27 | Amdf = STAmdf(X) 28 | Amdf = Amdf.flatten() 29 | 30 | fig = plt.figure(figsize=(14, 13)) 31 | plt.subplot(3, 1, 1) 32 | plt.plot(time, data) 33 | plt.title('(a)语音波形') 34 | plt.subplot(3, 1, 2) 35 | frameTime = FrameTimeC(len(EN), wlen, inc, fs) 36 | plt.plot(frameTime, Mn) 37 | plt.title('(b)短时幅值') 38 | plt.subplot(3, 1, 3) 39 | plt.plot(frameTime, EN) 40 | plt.title('(c)短时能量') 41 | plt.savefig('images/energy.png') 42 | 43 | fig = plt.figure(figsize=(10, 13)) 44 | plt.subplot(2, 1, 1) 45 | plt.plot(time, data) 46 | plt.title('(a)语音波形') 47 | plt.subplot(2, 1, 2) 48 | plt.plot(frameTime, Zcr) 49 | plt.title('(b)短时过零率') 50 | plt.savefig('images/Zcr.png') 51 | 52 | fig = plt.figure(figsize=(10, 13)) 53 | plt.subplot(2, 1, 1) 54 | plt.plot(time, data) 55 | plt.title('(a)语音波形') 56 | plt.subplot(2, 1, 2) 57 | plt.plot(Ac) 58 | plt.title('(b)短时自相关') 59 | plt.savefig('images/corr.png') 60 | 61 | fig = plt.figure(figsize=(10, 13)) 62 | plt.subplot(2, 1, 1) 63 | plt.plot(time, data) 64 | plt.title('(a)语音波形') 65 | plt.subplot(2, 1, 2) 66 | plt.plot(Amdf) 67 | plt.title('(b)短时幅度差') 68 | plt.savefig('images/Amdf.png') 69 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_2_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/C3_2_y.wav -------------------------------------------------------------------------------- /chapter3_分析实验/C3_3_y.py: -------------------------------------------------------------------------------- 1 | from chapter3_分析实验.windows import * 2 | from chapter3_分析实验.timefeature import * 3 | from chapter2_基础.soundBase import * 4 | 5 | 6 | def STFFT(x, win, nfft, inc): 7 | xn = enframe(x, win, inc) 8 | xn = xn.T 9 | y = np.fft.fft(xn, nfft, axis=0) 10 | return y[:nfft // 2, :] 11 | 
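# 补充说明(非原作者注释):STFFT 返回形状为 (nfft//2, 帧数) 的复数矩阵,
# 第 k 行对应频率 k*fs/nfft;只取前一半是因为实信号的 FFT 频谱共轭对称,
# 后一半不含新信息。下面脚本据此构造 freq 轴并绘制语谱图。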
12 | 13 | data, fs = soundBase('C3_3_y.wav').audioread() 14 | 15 | wlen = 256 16 | nfft = wlen 17 | win = hanning_window(wlen) 18 | inc = 128 19 | 20 | y = STFFT(data, win, nfft, inc) 21 | freq = [i * fs / wlen for i in range(wlen // 2)] 22 | frame = FrameTimeC(y.shape[1], wlen, inc, fs) 23 | 24 | plt.matshow(np.log10(np.flip(np.abs(y) * np.abs(y), 0))) 25 | plt.colorbar() 26 | plt.savefig('images/spec.png') 27 | plt.close() 28 | 29 | plt.specgram(data, NFFT=256, Fs=fs, window=np.hanning(256)) 30 | plt.ylabel('Frequency') 31 | plt.xlabel('Time(s)') 32 | plt.show() 33 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_3_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/C3_3_y.wav -------------------------------------------------------------------------------- /chapter3_分析实验/C3_4_y_1.py: -------------------------------------------------------------------------------- 1 | from chapter3_分析实验.windows import * 2 | from chapter3_分析实验.timefeature import * 3 | from chapter2_基础.soundBase import * 4 | from chapter3_分析实验.倒谱计算 import * 5 | 6 | data, fs = soundBase('C3_4_y_1.wav').audioread() 7 | nlen = 1000 8 | y = data[:nlen] 9 | N = 1024 10 | time = [i / fs for i in range(nlen)] 11 | z = cceps(y) 12 | zr = rcceps(y) 13 | yy = icceps(z) 14 | 15 | plt.subplot(4, 1, 1) 16 | plt.plot(time, y) 17 | plt.title('原始信号') 18 | plt.subplot(4, 1, 2) 19 | plt.plot(time, z) 20 | plt.title('复倒谱') 21 | plt.subplot(4, 1, 3) 22 | plt.plot(time, zr) 23 | plt.title('实倒谱') 24 | plt.subplot(4, 1, 4) 25 | plt.plot(time, yy) 26 | plt.title('重构信号') 27 | plt.savefig('images/倒谱.png') 28 | plt.close() 29 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_4_y_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/C3_4_y_1.wav -------------------------------------------------------------------------------- /chapter3_分析实验/C3_4_y_2.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter3_分析实验.dct import * 3 | 4 | f = 50 5 | fs = 1000 6 | N = 1000 7 | n = np.array([i for i in range(N)]) 8 | 9 | xn = np.cos(2 * np.pi * f * n / fs) 10 | y = dct(xn) 11 | y = np.where(abs(y) < 5, 0, y) 12 | 13 | zn = idct(y) 14 | 15 | plt.subplot(3, 1, 1) 16 | plt.plot(xn) 17 | plt.subplot(3, 1, 2) 18 | plt.plot(y) 19 | plt.subplot(3, 1, 3) 20 | plt.plot(zn) 21 | plt.savefig('images/dct.png') 22 | plt.close() 23 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_4_y_4.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter3_分析实验.dct import * 3 | from chapter3_分析实验.mel import * 4 | 5 | data, fs, _ = soundBase('C3_4_y_4.wav').audioread() 6 | 7 | wlen = 200 8 | inc = 80 9 | num = 8 10 | data = data / np.max(data) 11 | mfcc = Nmfcc(data, fs, num, wlen, inc) 12 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_4_y_4.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/C3_4_y_4.wav -------------------------------------------------------------------------------- /chapter3_分析实验/C3_5_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/C3_5_y.wav -------------------------------------------------------------------------------- /chapter3_分析实验/C3_5_y_1.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter3_分析实验.lpc import * 3 | from scipy.signal import lfilter 4 | 5 | data, fs = soundBase('C3_5_y.wav').audioread() 6 | L = 240 7 | x = data[8000:8000 + L] 8 | x = (x - np.mean(x)) / np.std(x) 9 | p = 12 10 | ar, G = lpc_coeff(x, p) 11 | b = np.zeros(p + 1) 12 | b[0] = 1 13 | b[1:] = -ar[1:] 14 | est_x = lfilter(b, 1, x) 15 | plt.subplot(2, 1, 1) 16 | plt.plot(x) 17 | plt.subplot(2, 1, 2) 18 | plt.plot(est_x) 19 | plt.savefig('images/lpc.png') 20 | plt.close() 21 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_5_y_2.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter3_分析实验.lpc import * 3 | 4 | data, fs = soundBase('C3_5_y.wav').audioread() 5 | L = 240 6 | p = 12 7 | x = data[8000:8000 + L] 8 | ar, G = lpc_coeff(x, p) 9 | nfft = 512 10 | W2 = nfft // 2 11 | m = np.array([i for i in range(W2)]) 12 | Y = np.fft.fft(x, nfft) 13 | Y1 = lpcff(ar, W2) 14 | plt.subplot(2, 1, 1) 15 | plt.plot(x) 16 | plt.subplot(2, 1, 2) 17 | plt.plot(m, 20 * np.log(np.abs(Y[m]))) 18 | plt.plot(m, 20 * np.log(np.abs(Y1[m]))) 19 | plt.savefig('images/lpcff.png') 20 | plt.close() 21 | -------------------------------------------------------------------------------- /chapter3_分析实验/C3_5_y_3.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter3_分析实验.lpc import * 3 | from chapter3_分析实验.倒谱计算 import * 4 | 5 | data, fs = soundBase('C3_5_y.wav').audioread() 6 | L = 240 7 | p = 12 8 | x = data[8000:8000 + L] 9 | ar, G = lpc_coeff(x, p) 10 | 11 | lpcc1 = lpc_lpccm(ar, p, p) 12 | lpcc2 = rcceps(ar) 13 | plt.subplot(2, 1, 1) 14 | plt.plot(lpcc1) 15 | plt.subplot(2, 1, 2) 16 | plt.plot(lpcc2) 17 | plt.show() 18 | -------------------------------------------------------------------------------- /chapter3_分析实验/dct.py: -------------------------------------------------------------------------------- 1 | # 离散余弦变换 2 | import numpy as np 3 | 4 | 5 | def dct(x): 6 | N = len(x) 7 | X = np.zeros(N) 8 | ts = np.array([i for i in range(N)]) 9 | C = np.ones(N) 10 | C[0] = np.sqrt(2) / 2 11 | for k in range(N): 12 | X[k] = np.sqrt(2 / N) * np.sum(C[k] * np.multiply(x, np.cos((2 * ts + 1) * k * np.pi / 2 / N))) 13 | return X 14 | 15 | 16 | def idct(X): 17 | N = len(X) 18 | x = np.zeros(N) 19 | ts = np.array([i for i in range(N)]) 20 | C = np.ones(N) 21 | C[0] = np.sqrt(2) / 2 22 | for n in range(N): 23 | x[n] = np.sqrt(2 / N) * np.sum(np.multiply(np.multiply(C[ts], X[ts]), np.cos((2 * n + 1) * np.pi * ts / 2 / N))) 24 | return x 25 | -------------------------------------------------------------------------------- /chapter3_分析实验/images/Amdf.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/Amdf.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/Zcr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/Zcr.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/corr.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/dct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/dct.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/en.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/energy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/energy.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/lpc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/lpc.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/lpcff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/lpcff.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/mel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/mel.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/mfcc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/mfcc.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/spec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/spec.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/window.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/window.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/倒谱.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/倒谱.png -------------------------------------------------------------------------------- /chapter3_分析实验/images/同态.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter3_分析实验/images/同态.png -------------------------------------------------------------------------------- /chapter3_分析实验/lpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def lpc_coeff(s, p): 5 | """ 6 | :param s: 一帧数据 7 | :param p: 线性预测的阶数 8 | :return: 9 | """ 10 | n = len(s) 11 | # 计算自相关函数 12 | Rp = np.zeros(p) 13 | for i in range(p): 14 | Rp[i] = np.sum(np.multiply(s[i + 1:n], s[:n - i - 1])) 15 | Rp0 = np.matmul(s, s.T) 16 | Ep = np.zeros((p, 1)) 17 | k = np.zeros((p, 1)) 18 | a = np.zeros((p, p)) 19 | # 处理i=0的情况 20 | Ep0 = Rp0 21 | k[0] = Rp[0] / Rp0 22 | a[0, 0] = k[0] 23 | Ep[0] = (1 - k[0] * k[0]) * Ep0 24 | # i=1开始,递归计算 25 | if p > 1: 26 | for i in range(1, p): 27 | k[i] = (Rp[i] - np.sum(np.multiply(a[:i, i - 1], Rp[i - 1::-1]))) / Ep[i - 1] 28 | a[i, i] = k[i] 29 | Ep[i] = (1 - k[i] * k[i]) * Ep[i - 1] 30 | for j in range(i - 1, -1, -1): 31 | a[j, i] = a[j, i - 1] - k[i] * a[i - j - 1, i - 1] 32 | ar = np.zeros(p + 1) 33 | ar[0] = 1 34 | ar[1:] = -a[:, p - 1] 35 | G = np.sqrt(Ep[p - 1]) 36 | return ar, G 37 | 38 | 39 | def lpcff(ar, npp=None): 40 | """ 41 | :param ar: 线性预测系数 42 | :param npp: FFT阶数 43 | :return: 44 | """ 45 | p1 = ar.shape[0] 46 | if npp is None: 47 | npp = p1 - 1 48 | ff = 1 / np.fft.fft(ar, 2 * npp + 2) 49 | return ff[:len(ff) // 2] 50 | 51 | 52 | def lpc_lpccm(ar, n_lpc, n_lpcc): 53 | lpcc = np.zeros(n_lpcc) 54 | lpcc[0] = ar[0] # 计算n=1的lpcc 55 | for n in range(1, n_lpc): # 计算n=2,,p的lpcc 56 | lpcc[n] = ar[n] 57 | for l in range(n - 1): 58 | lpcc[n] += ar[l] * lpcc[n - 1] * (n - l) / n 59 | for n in range(n_lpc, n_lpcc): # 计算n>p的lpcc 60 | lpcc[n] = 0 61 | for l in range(n_lpc): 62 | lpcc[n] += ar[l] * lpcc[n - 1] * (n - l) / n 63 | return -lpcc 64 | -------------------------------------------------------------------------------- /chapter3_分析实验/mel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pylab as plt 3 | from scipy.signal import lfilter 4 | from .C3_1_y_1 import enframe 5 | 6 | 7 | def melbankm(p, NFFT, fs, fl, fh, w=None): 8 | """ 9 | 计算Mel滤波器组 10 | :param p: 滤波器个数 11 | :param n: 一帧FFT后的数据长度 12 | :param fs: 采样率 13 | :param fl: 最低频率 14 | :param fh: 最高频率 15 | :param w: 窗(没有加窗,无效参数) 16 | :return: 17 | """ 18 | bl = 1125 * np.log(1 + fl / 700) # 把 Hz 变成 Mel 19 | bh = 1125 * np.log(1 + fh / 700) 20 | B = bh - bl # Mel带宽 21 | y = np.linspace(0, B, p + 2) # 将梅尔刻度等间隔 22 | Fb = 700 * (np.exp(y / 1125) - 1) # 把 Mel 变成Hz 23 | W2 = int(NFFT / 2 + 1) 24 | df = fs / NFFT 25 | freq = [int(i * df) for i in range(W2)] # 采样频率值 26 | bank = np.zeros((p, W2)) 27 | for k in range(1, p + 1): 28 | f0, f1, f2 = Fb[k], 
Fb[k - 1], Fb[k + 1] 29 | n1 = np.floor(f1 / df) 30 | n2 = np.floor(f2 / df) 31 | n0 = np.floor(f0 / df) 32 | for i in range(1, W2): 33 | if n1 <= i <= n0: 34 | bank[k - 1, i] = (i - n1) / (n0 - n1) 35 | elif n0 < i <= n2: 36 | bank[k - 1, i] = (n2 - i) / (n2 - n0) 37 | elif i > n2: 38 | break 39 | # plt.plot(freq, bank[k - 1, :], 'r') 40 | # plt.savefig('images/mel.png') 41 | return bank 42 | 43 | 44 | def Nmfcc(x, fs, p, frameSize, inc, nfft=512, n_dct=12): 45 | """ 46 | 计算mfcc系数 47 | :param x: 输入信号 48 | :param fs: 采样率 49 | :param p: Mel滤波器组的个数 50 | :param frameSize: 分帧的每帧长度 51 | :param inc: 帧移 52 | :return: 53 | """ 54 | # 预处理-预加重 55 | xx = lfilter([1, -0.9375], [1], x) 56 | # 预处理-分幀 57 | xx = enframe(xx, frameSize, inc) 58 | # 预处理-加窗 59 | xx = np.multiply(xx, np.hanning(frameSize)) 60 | # 计算FFT 61 | xx = np.fft.rfft(xx, nfft) 62 | # 计算能量谱 63 | xx = (np.abs(xx) ** 2) / nfft 64 | # 计算通过Mel滤波器的能量 65 | bank = melbankm(p, nfft, fs, 0, 0.5 * fs, 0) 66 | ss = np.matmul(xx, bank.T) 67 | # 计算DCT倒谱 68 | M = bank.shape[0] 69 | m = np.array([i for i in range(M)]) 70 | mfcc = np.zeros((ss.shape[0], n_dct)) 71 | for n in range(n_dct): 72 | mfcc[:, n] = np.sqrt(2 / M) * np.sum(np.multiply(np.log(ss), np.cos((2 * m - 1) * n * np.pi / 2 / M)), axis=1) 73 | return mfcc 74 | 75 | 76 | if __name__ == '__main__': 77 | melbankm(24, 256, 8000, 0, 4000) 78 | -------------------------------------------------------------------------------- /chapter3_分析实验/test.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import scipy.io.wavfile 3 | from matplotlib import pyplot as plt 4 | from scipy.fftpack import dct 5 | 6 | sample_rate, signal = scipy.io.wavfile.read('C3_4_y_4.wav') 7 | 8 | print(sample_rate, len(signal)) 9 | # 读取前3.5s 的数据 10 | signal = signal[0:int(3.5 * sample_rate)] 11 | print(signal) 12 | 13 | # 预先处理 14 | pre_emphasis = 0.97 15 | emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) 16 | 17 | frame_size = 0.025 18 | frame_stride = 0.1 19 | frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate 20 | signal_length = len(emphasized_signal) 21 | frame_length = int(round(frame_length)) 22 | frame_step = int(round(frame_step)) 23 | num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step)) 24 | 25 | pad_signal_length = num_frames * frame_step + frame_length 26 | z = numpy.zeros((pad_signal_length - signal_length)) 27 | pad_signal = numpy.append(emphasized_signal, z) 28 | 29 | indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile( 30 | numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T 31 | 32 | frames = pad_signal[numpy.mat(indices).astype(numpy.int32, copy=False)] 33 | 34 | # 加上汉明窗 35 | frames *= numpy.hamming(frame_length) 36 | # frames *= 0.54 - 0.46 * numpy.cos((2 * numpy.pi * n) / (frame_length - 1)) # Explicit Implementation ** 37 | 38 | # 傅立叶变换和功率谱 39 | NFFT = 512 40 | mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT)) # Magnitude of the FFT 41 | # print(mag_frames.shape) 42 | pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum 43 | 44 | low_freq_mel = 0 45 | # 将频率转换为Mel 46 | nfilt = 40 47 | high_freq_mel = (2595 * numpy.log10(1 + (sample_rate / 2) / 700)) 48 | mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale 49 | hz_points = (700 * (10 ** (mel_points / 2595) - 1)) # Convert Mel to Hz 50 | 51 | bin = numpy.floor((NFFT + 1) * hz_points / 
sample_rate) 52 | 53 | fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1)))) 54 | 55 | for m in range(1, nfilt + 1): 56 | f_m_minus = int(bin[m - 1]) # left 57 | f_m = int(bin[m]) # center 58 | f_m_plus = int(bin[m + 1]) # right 59 | for k in range(f_m_minus, f_m): 60 | fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) 61 | for k in range(f_m, f_m_plus): 62 | fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) 63 | filter_banks = numpy.dot(pow_frames, fbank.T) 64 | filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks) # Numerical Stability 65 | filter_banks = 20 * numpy.log10(filter_banks) # dB 66 | 67 | num_ceps = 12 68 | mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)] 69 | (nframes, ncoeff) = mfcc.shape 70 | 71 | n = numpy.arange(ncoeff) 72 | cep_lifter = 22 73 | lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * n / cep_lifter) 74 | mfcc *= lift # * 75 | 76 | # filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8) 77 | mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8) 78 | 79 | print(mfcc.shape) 80 | plt.plot(filter_banks) 81 | 82 | plt.show() -------------------------------------------------------------------------------- /chapter3_分析实验/timefeature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .C3_1_y_1 import enframe 3 | 4 | 5 | def STAc(x): 6 | """ 7 | 计算短时相关函数 8 | :param x: 9 | :return: 10 | """ 11 | para = np.zeros(x.shape) 12 | fn = x.shape[1] 13 | for i in range(fn): 14 | R = np.correlate(x[:, i], x[:, i], 'valid') 15 | para[:, i] = R 16 | return para 17 | 18 | 19 | def STEn(x, win, inc): 20 | """ 21 | 计算短时能量函数 22 | :param x: 23 | :param win: 24 | :param inc: 25 | :return: 26 | """ 27 | X = enframe(x, win, inc) 28 | s = np.multiply(X, X) 29 | return np.sum(s, axis=1) 30 | 31 | 32 | def STMn(x, win, inc): 33 | """ 34 | 计算短时平均幅度计算函数 35 | :param x: 36 | :param win: 37 | :param inc: 38 | :return: 39 | """ 40 | X = enframe(x, win, inc) 41 | s = np.abs(X) 42 | return np.mean(s, axis=1) 43 | 44 | 45 | def STZcr(x, win, inc, delta=0): 46 | """ 47 | 计算短时过零率 48 | :param x: 49 | :param win: 50 | :param inc: 51 | :return: 52 | """ 53 | absx = np.abs(x) 54 | x = np.where(absx < delta, 0, x) 55 | X = enframe(x, win, inc) 56 | X1 = X[:, :-1] 57 | X2 = X[:, 1:] 58 | s = np.multiply(X1, X2) 59 | sgn = np.where(s < 0, 1, 0) 60 | return np.sum(sgn, axis=1) 61 | 62 | 63 | def STAmdf(X): 64 | """ 65 | 计算短时幅度差,好像有点问题 66 | :param X: 67 | :return: 68 | """ 69 | # para = np.zeros(X.shape) 70 | fn = X.shape[1] 71 | wlen = X.shape[0] 72 | para = np.zeros((wlen, wlen)) 73 | for i in range(fn): 74 | u = X[:, i] 75 | for k in range(wlen): 76 | en = len(u) 77 | para[k, :] = np.sum(np.abs(u[k:] - u[:en - k])) 78 | return para 79 | 80 | 81 | def FrameTimeC(frameNum, frameLen, inc, fs): 82 | ll = np.array([i for i in range(frameNum)]) 83 | return ((ll - 1) * inc + frameLen / 2) / fs 84 | -------------------------------------------------------------------------------- /chapter3_分析实验/windows.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def reg_window(N): 5 | return np.ones(N) 6 | 7 | 8 | def hanning_window(N): 9 | nn = [i for i in range(N)] 10 | return 0.5 * (1 - np.cos(np.multiply(nn, 2 * np.pi) / (N - 1))) 11 | 12 | 13 | def hamming_window(N): 14 | nn = [i for i in range(N)] 15 | return 0.54 - 0.46 * np.cos(np.multiply(nn, 2 * np.pi) / (N - 1)) 16 | 
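上面 timefeature.py 中的 `STAmdf` 自带注释也说“好像有点问题”:循环里 `para[k, :]` 对整行赋值,各帧的结果会互相覆盖,且用 `np.sum` 而非均值。下面给出一个逐帧、逐延迟计算短时平均幅度差的参考示意(`STAmdf_fixed` 为本文虚构名称,并非原作者实现):
~~~py
import numpy as np

def STAmdf_fixed(X):
    # X: (帧长, 帧数) 的分帧矩阵,返回同形状的 AMDF 矩阵
    wlen, fn = X.shape
    para = np.zeros((wlen, fn))
    for i in range(fn):
        u = X[:, i]
        for k in range(1, wlen):
            # 延迟 k 下的平均幅度差;基音周期对应的延迟处出现谷值
            para[k, i] = np.mean(np.abs(u[k:] - u[:wlen - k]))
    return para
~~~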
-------------------------------------------------------------------------------- /chapter3_分析实验/倒谱计算.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def cceps(x): 5 | """ 6 | 计算复倒谱 7 | """ 8 | y = np.fft.fft(x) 9 | return np.fft.ifft(np.log(y)) 10 | 11 | 12 | def icceps(y): 13 | """ 14 | 计算复倒谱的逆变换 15 | """ 16 | x = np.fft.fft(y) 17 | return np.fft.ifft(np.exp(x)) 18 | 19 | 20 | def rcceps(x): 21 | """ 22 | 计算实倒谱 23 | """ 24 | y = np.fft.fft(x) 25 | return np.fft.ifft(np.log(np.abs(y))) 26 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_1_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/C4_1_y.wav -------------------------------------------------------------------------------- /chapter4_特征提取/C4_1_y_1.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.end_detection import * 3 | 4 | data, fs = soundBase('C4_1_y.wav').audioread() 5 | data /= np.max(data) 6 | N = len(data) 7 | wlen = 200 8 | inc = 80 9 | IS = 0.1 10 | overlap = wlen - inc 11 | NIS = int((IS * fs - wlen) // inc + 1) 12 | fn = (N - wlen) // inc + 1 13 | 14 | frameTime = FrameTimeC(fn, wlen, inc, fs) 15 | time = [i / fs for i in range(N)] 16 | 17 | voiceseg, vsl, SF, NF, amp, zcr = vad_TwoThr(data, wlen, inc, NIS) 18 | 19 | plt.subplot(3, 1, 1) 20 | plt.plot(time, data) 21 | 22 | plt.subplot(3, 1, 2) 23 | plt.plot(frameTime, amp) 24 | 25 | plt.subplot(3, 1, 3) 26 | plt.plot(frameTime, zcr) 27 | 28 | for i in range(vsl): 29 | plt.subplot(3, 1, 1) 30 | plt.plot(frameTime[voiceseg[i]['start']], 1, '.k') 31 | plt.plot(frameTime[voiceseg[i]['end']], 1, 'or') 32 | 33 | plt.subplot(3, 1, 2) 34 | plt.plot(frameTime[voiceseg[i]['start']], 1, '.k') 35 | plt.plot(frameTime[voiceseg[i]['end']], 1, 'or') 36 | 37 | plt.subplot(3, 1, 3) 38 | plt.plot(frameTime[voiceseg[i]['start']], 1, '.k') 39 | plt.plot(frameTime[voiceseg[i]['end']], 1, 'or') 40 | 41 | plt.savefig('images/TwoThr.png') 42 | plt.close() 43 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_1_y_2.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.end_detection import * 3 | 4 | data, fs = soundBase('C4_1_y.wav').audioread() 5 | data -= np.mean(data) 6 | data /= np.max(data) 7 | IS = 0.25 8 | wlen = 200 9 | inc = 80 10 | N = len(data) 11 | time = [i / fs for i in range(N)] 12 | wnd = np.hamming(wlen) 13 | NIS = int((IS * fs - wlen) // inc + 1) 14 | thr1 = 1.1 15 | thr2 = 1.3 16 | voiceseg, vsl, SF, NF, Rum = vad_corr(data, wnd, inc, NIS, thr1, thr2) 17 | fn = len(SF) 18 | frameTime = FrameTimeC(fn, wlen, inc, fs) 19 | 20 | plt.subplot(2, 1, 1) 21 | plt.plot(time, data) 22 | plt.subplot(2, 1, 2) 23 | plt.plot(frameTime, Rum) 24 | 25 | for i in range(vsl): 26 | plt.subplot(2, 1, 1) 27 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 28 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 29 | plt.legend(['signal', 'start', 'end']) 30 | 31 | plt.subplot(2, 1, 2) 32 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 33 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 34 | plt.legend(['xcorr', 'start', 'end']) 35 | 36 | plt.savefig('images/corr.png') 37 | plt.close() 
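本段开头 倒谱计算.py 中的 `cceps` 直接对复频谱取 `np.log`,相位存在 $2\pi$ 模糊,严格的复倒谱(如 MATLAB 的 `cceps`)需要先做相位展开。下面是一个加入 `np.unwrap` 的最小示意(`cceps_unwrap` 为本文虚构名称,未包含 MATLAB 版本中的线性相位校正):
~~~py
import numpy as np

def cceps_unwrap(x):
    # 对数幅度 + 展开后的相位,再做 IFFT,得到(近似的)复倒谱
    y = np.fft.fft(x)
    log_y = np.log(np.abs(y)) + 1j * np.unwrap(np.angle(y))
    return np.real(np.fft.ifft(log_y))
~~~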
38 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_1_y_3.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.end_detection import * 3 | 4 | data, fs = soundBase('C4_1_y.wav').audioread() 5 | data -= np.mean(data) 6 | data /= np.max(data) 7 | IS = 0.25 8 | wlen = 200 9 | inc = 80 10 | N = len(data) 11 | time = [i / fs for i in range(N)] 12 | wnd = np.hamming(wlen) 13 | overlap = wlen - inc 14 | NIS = int((IS * fs - wlen) // inc + 1) 15 | thr1 = 0.99 16 | thr2 = 0.96 17 | voiceseg, vsl, SF, NF, Enm = vad_specEN(data, wnd, inc, NIS, thr1, thr2, fs) 18 | 19 | fn = len(SF) 20 | frameTime = FrameTimeC(fn, wlen, inc, fs) 21 | 22 | plt.subplot(2, 1, 1) 23 | plt.plot(time, data) 24 | plt.subplot(2, 1, 2) 25 | plt.plot(frameTime, Enm) 26 | 27 | for i in range(vsl): 28 | plt.subplot(2, 1, 1) 29 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 30 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 31 | plt.legend(['signal', 'start', 'end']) 32 | 33 | plt.subplot(2, 1, 2) 34 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 35 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 36 | plt.legend(['熵谱', 'start', 'end']) 37 | 38 | plt.savefig('images/En.png') 39 | plt.close() 40 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_1_y_4.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.end_detection import * 3 | 4 | data, fs = soundBase('C4_1_y.wav').audioread() 5 | data -= np.mean(data) 6 | data /= np.max(data) 7 | IS = 0.25 8 | wlen = 200 9 | inc = 80 10 | N = len(data) 11 | time = [i / fs for i in range(N)] 12 | wnd = np.hamming(wlen) 13 | overlap = wlen - inc 14 | NIS = int((IS * fs - wlen) // inc + 1) 15 | 16 | mode = 2 17 | if mode == 1: 18 | thr1 = 3 19 | thr2 = 4 20 | tlabel = '能零比' 21 | elif mode == 2: 22 | thr1 = 0.05 23 | thr2 = 0.1 24 | tlabel = '能熵比' 25 | voiceseg, vsl, SF, NF, Epara = vad_pro(data, wnd, inc, NIS, thr1, thr2, mode) 26 | 27 | fn = len(SF) 28 | frameTime = FrameTimeC(fn, wlen, inc, fs) 29 | 30 | plt.subplot(2, 1, 1) 31 | plt.plot(time, data) 32 | plt.subplot(2, 1, 2) 33 | plt.plot(frameTime, Epara) 34 | 35 | for i in range(vsl): 36 | plt.subplot(2, 1, 1) 37 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 38 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 39 | plt.legend(['signal', 'start', 'end']) 40 | 41 | plt.subplot(2, 1, 2) 42 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 43 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 44 | plt.legend([tlabel, 'start', 'end']) 45 | 46 | plt.savefig('images/{}.png'.format(tlabel)) 47 | plt.close() 48 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_1_y_5.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.end_detection import * 3 | 4 | 5 | def awgn(x, snr): 6 | snr = 10 ** (snr / 10.0) 7 | xpower = np.sum(x ** 2) / len(x) 8 | npower = xpower / snr 9 | return np.random.randn(len(x)) * np.sqrt(npower) + x 10 | 11 | 12 | data, fs = soundBase('C4_1_y.wav').audioread() 13 | data -= np.mean(data) 14 | data /= np.max(data) 15 | IS = 0.25 16 | wlen = 200 17 | inc = 80 18 | SNR = 10 19 | N = len(data) 20 | time = [i / fs for i in range(N)] 21 | wnd = np.hamming(wlen) 22 | overlap = wlen - 
inc 23 | NIS = int((IS * fs - wlen) // inc + 1) 24 | signal = awgn(data, SNR) 25 | 26 | y = enframe(signal, wnd, inc) 27 | frameTime = FrameTimeC(y.shape[0], wlen, inc, fs) 28 | 29 | Y = np.abs(np.fft.fft(y, axis=1)) 30 | Y = Y[:, :wlen // 2] 31 | N = np.mean(Y[:NIS, :], axis=0) 32 | NoiseCounter = 0 33 | SF = np.zeros(y.shape[0]) 34 | NF = np.zeros(y.shape[0]) 35 | D = np.zeros(y.shape[0]) 36 | # 前导段设置NF=1,SF=0 37 | SF[:NIS] = 0 38 | NF[:NIS] = 1 39 | for i in range(NIS, y.shape[0]): 40 | NoiseFlag, SpeechFlag, NoiseCounter, Dist = vad_LogSpec(Y[i, :], N, NoiseCounter, 2.5, 8) 41 | SF[i] = SpeechFlag 42 | NF[i] = NoiseFlag 43 | D[i] = Dist 44 | sindex = np.where(SF == 1) 45 | voiceseg = findSegment(np.where(SF == 1)[0]) 46 | vosl = len(voiceseg) 47 | 48 | plt.subplot(3, 1, 1) 49 | plt.plot(time, data) 50 | plt.subplot(3, 1, 2) 51 | plt.plot(time, signal) 52 | plt.subplot(3, 1, 3) 53 | plt.plot(frameTime, D) 54 | 55 | for i in range(vosl): 56 | plt.subplot(3, 1, 1) 57 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 58 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 59 | plt.legend(['signal', 'start', 'end']) 60 | 61 | plt.subplot(3, 1, 2) 62 | plt.plot(frameTime[voiceseg[i]['start']], 0, '.k') 63 | plt.plot(frameTime[voiceseg[i]['end']], 0, 'or') 64 | plt.legend(['noised', 'start', 'end']) 65 | 66 | plt.subplot(3, 1, 3) 67 | plt.plot(frameTime[voiceseg[i]['start']], 1, '.k') 68 | plt.plot(frameTime[voiceseg[i]['end']], 1, 'or') 69 | plt.legend(['对数频率距离', 'start', 'end']) 70 | 71 | plt.savefig('images/对数频率距离.png') 72 | plt.close() 73 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_2_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.pitch_detection import * 3 | 4 | data, fs = soundBase('C4_2_y.wav').audioread() 5 | data -= np.mean(data) 6 | data /= np.max(np.abs(data)) 7 | wlen = 320 8 | inc = 80 9 | N = len(data) 10 | time = [i / fs for i in range(N)] 11 | T1 = 0.05 12 | 13 | # 4.2.1 14 | voiceseg, vosl, SF, Ef = pitch_vad(data, wlen, inc, T1) 15 | fn = len(SF) 16 | frameTime = FrameTimeC(fn, wlen, inc, fs) 17 | 18 | plt.figure(figsize=(14, 8)) 19 | 20 | plt.subplot(5, 1, 1) 21 | plt.plot(time, data) 22 | plt.subplot(5, 1, 2) 23 | plt.plot(frameTime, Ef) 24 | for i in range(vosl): 25 | plt.subplot(5, 1, 2) 26 | plt.plot(frameTime[voiceseg[i]['start']], Ef[voiceseg[i]['start']], '.k') 27 | plt.plot(frameTime[voiceseg[i]['end']], Ef[voiceseg[i]['start']], 'or') 28 | plt.legend(['能熵比', 'start', 'end']) 29 | 30 | # 4.2.3 31 | voiceseg, vsl, SF, Ef, period = pitch_Ceps(data, wlen, inc, T1, fs, miniL=10) 32 | plt.subplot(5, 1, 3) 33 | plt.plot(frameTime, period) 34 | for i in range(vsl): 35 | plt.subplot(5, 1, 3) 36 | plt.plot(frameTime[voiceseg[i]['start']], Ef[voiceseg[i]['start']], '.k') 37 | plt.plot(frameTime[voiceseg[i]['end']], Ef[voiceseg[i]['start']], 'or') 38 | plt.legend(['倒谱法', 'start', 'end']) 39 | 40 | # 4.2.4 41 | voiceseg, vsl, SF, Ef, period = pitch_Corr(data, wlen, inc, T1, fs) 42 | plt.subplot(5, 1, 4) 43 | plt.plot(frameTime, period) 44 | for i in range(vsl): 45 | plt.subplot(5, 1, 4) 46 | plt.plot(frameTime[voiceseg[i]['start']], Ef[voiceseg[i]['start']], '.k') 47 | plt.plot(frameTime[voiceseg[i]['end']], Ef[voiceseg[i]['start']], 'or') 48 | plt.legend(['自相关', 'start', 'end']) 49 | 50 | # 4.2.5 51 | p = 12 52 | voiceseg, vsl, SF, Ef, period = pitch_Lpc(data, wlen, inc, T1, fs, p) 53 | plt.subplot(5, 1, 5) 54 | 
plt.plot(frameTime, period) 55 | for i in range(vsl): 56 |     plt.subplot(5, 1, 5) 57 |     plt.plot(frameTime[voiceseg[i]['start']], Ef[voiceseg[i]['start']], '.k') 58 |     plt.plot(frameTime[voiceseg[i]['end']], Ef[voiceseg[i]['start']], 'or') 59 |     plt.legend(['线性预测', 'start', 'end']) 60 | 61 | plt.savefig('images/pitch.png') 62 | plt.close() 63 | 64 | # 4.2.2 65 | from scipy.signal import ellipord, ellip, freqz 66 | 67 | fs = 8000 68 | fs2 = fs / 2 69 | Wp = np.array([60, 500]) / fs2 70 | Ws = np.array([20, 1500]) / fs2 71 | Rp = 1 72 | Rs = 40 73 | n, Wn = ellipord(Wp, Ws, Rp, Rs) 74 | b, a = ellip(n, Rp, Rs, Wn, 'bandpass') 75 | print(b) 76 | print(a) 77 | 78 | w, H = freqz(b, a, 1000) 79 | H, w = H[:500], w[:500] 80 | mag = np.abs(H) 81 | db = 20 * np.log10((mag + 1e-20) / np.max(mag)) 82 | 83 | plt.plot(w / np.pi * fs2, db) 84 | plt.ylim([-90, 10]) 85 | plt.title('椭圆滤波器频率响应') 86 | plt.savefig('images/ellip.png') 87 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_2_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/C4_2_y.wav -------------------------------------------------------------------------------- /chapter4_特征提取/C4_3_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.共振峰估计 import * 3 | from scipy.signal import lfilter 4 | 5 | plt.figure(figsize=(14, 12)) 6 | 7 | data, fs = soundBase('C4_3_y.wav').audioread() 8 | # 预处理-预加重 9 | u = lfilter([1, -0.99], [1], data) 10 | 11 | cepstL = 6 12 | wlen = len(u) 13 | wlen2 = wlen // 2 14 | # 预处理-加窗 15 | u2 = np.multiply(u, np.hamming(wlen)) 16 | # 预处理-FFT,取对数 17 | U_abs = np.log(np.abs(np.fft.fft(u2))[:wlen2]) 18 | # 4.3.1 19 | freq = [i * fs / wlen for i in range(wlen2)] 20 | val, loc, spec = Formant_Cepst(u, cepstL) 21 | plt.subplot(4, 1, 1) 22 | plt.plot(freq, U_abs, 'k') 23 | plt.title('频谱') 24 | plt.subplot(4, 1, 2) 25 | plt.plot(freq, spec, 'k') 26 | plt.title('倒谱法共振峰估计') 27 | for i in range(len(loc)): 28 |     plt.subplot(4, 1, 2) 29 |     plt.plot([freq[loc[i]], freq[loc[i]]], [np.min(spec), spec[loc[i]]], '-.k') 30 |     plt.text(freq[loc[i]], spec[loc[i]], 'Freq={}'.format(int(freq[loc[i]]))) 31 | # 4.3.2 32 | p = 12 33 | freq = [i * fs / 512 for i in range(256)] 34 | F, Bw, pp, U, loc = Formant_Interpolation(u, p, fs) 35 | 36 | plt.subplot(4, 1, 3) 37 | plt.plot(freq, U) 38 | plt.title('LPC内插法的共振峰估计') 39 | 40 | for i in range(len(Bw)): 41 |     plt.subplot(4, 1, 3) 42 |     plt.plot([freq[loc[i]], freq[loc[i]]], [np.min(U), U[loc[i]]], '-.k') 43 |     plt.text(freq[loc[i]], U[loc[i]], 'Freq={:.0f}\nHp={:.2f}\nBw={:.2f}'.format(F[i], pp[i], Bw[i])) 44 | 45 | # 4.3.3 46 | 47 | p = 12 48 | freq = [i * fs / 512 for i in range(256)] 49 | 50 | n_frmnt = 4 51 | F, Bw, U = Formant_Root(u, p, fs, n_frmnt) 52 | 53 | plt.subplot(4, 1, 4) 54 | plt.plot(freq, U) 55 | plt.title('LPC求根法的共振峰估计') 56 | 57 | for i in range(len(Bw)): 58 |     plt.subplot(4, 1, 4) 59 |     idx = int(np.argmin(np.abs(np.array(freq) - F[i])))  # 求根法不返回loc,由共振峰频率F[i]反查最近的频谱索引,避免误用4.3.2遗留的loc 60 |     plt.plot([freq[idx], freq[idx]], [np.min(U), U[idx]], '-.k') 61 |     plt.text(freq[idx], U[idx], 'Freq={:.0f}\nBw={:.2f}'.format(F[i], Bw[i])) 62 | 63 | plt.savefig('images/共振峰估计.png') 64 | plt.close() 65 | -------------------------------------------------------------------------------- /chapter4_特征提取/C4_3_y.wav: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/C4_3_y.wav -------------------------------------------------------------------------------- /chapter4_特征提取/images/En.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/En.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/TwoThr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/TwoThr.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/corr.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/ellip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/ellip.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/pitch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/pitch.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/共振峰估计.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/共振峰估计.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/对数频率距离.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/对数频率距离.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/能熵比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/能熵比.png -------------------------------------------------------------------------------- /chapter4_特征提取/images/能零比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter4_特征提取/images/能零比.png -------------------------------------------------------------------------------- /chapter4_特征提取/pitch_detection.py: -------------------------------------------------------------------------------- 1 | from chapter3_分析实验.C3_1_y_1 import enframe 2 | from chapter3_分析实验.timefeature import * 3 | from chapter4_特征提取.end_detection import findSegment 4 | 5 | 6 | def pitch_vad(x, wnd, inc, T1, miniL=10): 7 | """ 8 | 使用能熵比检测基音,实际上就是语音分段 9 | :param x: 10 | :param wnd: 11 | :param inc: 12 | 
:param T1: 13 | :param miniL: 14 | :return: 15 | """ 16 | y = enframe(x, wnd, inc) 17 | fn = y.shape[0] 18 | if isinstance(wnd, int): 19 | wlen = wnd 20 | else: 21 | wlen = len(wnd) 22 | 23 | Sp = np.abs(np.fft.fft(y, axis=1)) 24 | Sp = Sp[:, :wlen // 2 + 1] 25 | Esum = np.sum(np.multiply(Sp, Sp), axis=1) 26 | prob = Sp / np.sum(Sp, axis=1, keepdims=True) 27 | H = -np.sum(np.multiply(prob, np.log10(prob + 1e-16)), axis=1) 28 | H = np.where(H < 0.1, np.max(H), H) 29 | Ef = np.sqrt(1 + np.abs(Esum / H)) 30 | Ef = Ef / np.max(Ef) 31 | 32 | zseg = findSegment(np.where(Ef > T1)[0]) 33 | zsl = len(zseg.keys()) 34 | SF = np.zeros(fn) 35 | for k in range(zsl): 36 | if zseg[k]['duration'] < miniL: 37 | zseg.pop(k) 38 | else: 39 | SF[zseg[k]['start']:zseg[k]['end']] = 1 40 | return zseg, len(zseg.keys()), SF, Ef 41 | 42 | 43 | def pitch_Ceps(x, wnd, inc, T1, fs, miniL=10): 44 | """ 45 | 倒谱法基音周期检测函数 46 | :param x: 47 | :param wnd: 48 | :param inc: 49 | :param T1: 50 | :param fs: 51 | :param miniL: 52 | :return: 53 | """ 54 | y = enframe(x, wnd, inc) 55 | fn = y.shape[0] 56 | if isinstance(wnd, int): 57 | wlen = wnd 58 | else: 59 | wlen = len(wnd) 60 | voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL) 61 | lmin = fs // 500 # 基音周期的最小值 62 | lmax = fs // 60 # 基音周期的最大值 63 | period = np.zeros(fn) 64 | y1 = y[np.where(SF == 1)[0], :] 65 | y1 = np.multiply(y1, np.hamming(wlen)) 66 | xx = np.fft.fft(y1, axis=1) 67 | b = np.fft.ifft(2 * np.log(np.abs(xx) + 1e-10)) 68 | Lc = np.argmax(b[:, lmin:lmax], axis=1) + lmin - 1 69 | period[np.where(SF == 1)[0]] = Lc 70 | return voiceseg, vsl, SF, Ef, period 71 | 72 | 73 | def pitch_Corr(x, wnd, inc, T1, fs, miniL=10): 74 | """ 75 | 自相关法基音周期检测函数 76 | :param x: 77 | :param wnd: 78 | :param inc: 79 | :param T1: 80 | :param fs: 81 | :param miniL: 82 | :return: 83 | """ 84 | y = enframe(x, wnd, inc) 85 | fn = y.shape[0] 86 | if isinstance(wnd, int): 87 | wlen = wnd 88 | else: 89 | wlen = len(wnd) 90 | voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL) 91 | lmin = fs // 500 # 基音周期的最小值 92 | lmax = fs // 60 # 基音周期的最大值 93 | period = np.zeros(fn) 94 | for i in range(vsl): 95 | ixb = voiceseg[i]['start'] 96 | ixd = voiceseg[i]['duration'] 97 | for k in range(ixd): 98 | ru = np.correlate(y[k + ixb, :], y[k + ixb, :], 'full') 99 | ru = ru[wlen:] 100 | tloc = np.argmax(ru[lmin:lmax]) 101 | period[k + ixb] = lmin + tloc 102 | 103 | return voiceseg, vsl, SF, Ef, period 104 | 105 | 106 | def pitch_Lpc(x, wnd, inc, T1, fs, p, miniL=10): 107 | """ 108 | 线性预测法基音周期检测函数 109 | :param x: 110 | :param wnd: 111 | :param inc: 112 | :param T1: 113 | :param fs: 114 | :param p: 115 | :param miniL: 116 | :return: 117 | """ 118 | from scipy.signal import lfilter 119 | from chapter3_分析实验.lpc import lpc_coeff 120 | y = enframe(x, wnd, inc) 121 | fn = y.shape[0] 122 | if isinstance(wnd, int): 123 | wlen = wnd 124 | else: 125 | wlen = len(wnd) 126 | voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL) 127 | lmin = fs // 500 # 基音周期的最小值 128 | lmax = fs // 60 # 基音周期的最大值 129 | period = np.zeros(fn) 130 | for k in range(y.shape[0]): 131 | if SF[k] == 1: 132 | u = np.multiply(y[k, :], np.hamming(wlen)) 133 | ar, _ = lpc_coeff(u, p) 134 | ar[0] = 0 135 | z = lfilter(-ar, [1], u) 136 | E = u - z 137 | xx = np.fft.fft(E) 138 | b = np.fft.ifft(2 * np.log(np.abs(xx) + 1e-20)) 139 | lc = np.argmax(b[lmin:lmax]) 140 | period[k] = lc + lmin 141 | return voiceseg, vsl, SF, Ef, period 142 | -------------------------------------------------------------------------------- /chapter4_特征提取/共振峰估计.py: 
-------------------------------------------------------------------------------- 1 | # 共振峰估计函数 2 | import numpy as np 3 | from chapter3_分析实验.timefeature import * 4 | from chapter3_分析实验.lpc import lpc_coeff 5 | 6 | 7 | def local_maxium(x): 8 |     """ 9 |     求序列的极大值 10 |     :param x: 11 |     :return: 12 |     """ 13 |     d = np.diff(x) 14 |     l_d = len(d) 15 |     maxium = [] 16 |     loc = [] 17 |     for i in range(l_d - 1): 18 |         if d[i] > 0 and d[i + 1] <= 0: 19 |             maxium.append(x[i + 1]) 20 |             loc.append(i + 1) 21 |     return maxium, loc 22 | 23 | 24 | def Formant_Cepst(u, cepstL): 25 |     """ 26 |     倒谱法共振峰估计函数 27 |     :param u: 28 |     :param cepstL: 29 |     :return: 30 |     """ 31 |     wlen2 = len(u) // 2 32 |     U = np.log(np.abs(np.fft.fft(u)[:wlen2])) 33 |     Cepst = np.fft.ifft(U) 34 |     cepst = np.zeros(wlen2, dtype=complex)  # 用内置complex,np.complex别名在新版NumPy中已移除 35 |     cepst[:cepstL] = Cepst[:cepstL] 36 |     cepst[-cepstL + 1:] = Cepst[-cepstL + 1:] 37 |     spec = np.real(np.fft.fft(cepst)) 38 |     val, loc = local_maxium(spec) 39 |     return val, loc, spec 40 | 41 | 42 | def Formant_Interpolation(u, p, fs): 43 |     """ 44 |     插值法估计共振峰函数 45 |     :param u: 46 |     :param p: 47 |     :param fs: 48 |     :return: 49 |     """ 50 |     ar, _ = lpc_coeff(u, p) 51 |     U = np.power(np.abs(np.fft.rfft(ar, 2 * 255)), -2) 52 |     df = fs / 512 53 |     val, loc = local_maxium(U) 54 |     ll = len(loc) 55 |     pp = np.zeros(ll) 56 |     F = np.zeros(ll) 57 |     Bw = np.zeros(ll) 58 |     for k in range(ll): 59 |         m = loc[k] 60 |         m1, m2 = m - 1, m + 1 61 |         pv = val[k]  # 峰值,命名为pv以免覆盖LPC阶数参数p 62 |         p1, p2 = U[m1], U[m2] 63 |         aa = (p1 + p2) / 2 - pv 64 |         bb = (p2 - p1) / 2 65 |         cc = pv 66 |         dm = -bb / 2 / aa 67 |         pp[k] = -bb * bb / 4 / aa + cc 68 |         m_new = m + dm 69 |         bf = -np.sqrt(bb * bb - 4 * aa * (cc - pp[k] / 2)) / aa 70 |         F[k] = (m_new - 1) * df 71 |         Bw[k] = bf * df 72 |     return F, Bw, pp, U, loc 73 | 74 | 75 | def Formant_Root(u, p, fs, n_frmnt): 76 |     """ 77 |     LPC求根法的共振峰估计函数 78 |     :param u: 79 |     :param p: 80 |     :param fs: 81 |     :param n_frmnt: 82 |     :return: 83 |     """ 84 |     ar, _ = lpc_coeff(u, p) 85 |     U = np.power(np.abs(np.fft.rfft(ar, 2 * 255)), -2) 86 |     const = fs / (2 * np.pi) 87 |     rts = np.roots(ar) 88 |     yf = [] 89 |     Bw = [] 90 |     for i in range(len(ar) - 1): 91 |         re = np.real(rts[i]) 92 |         im = np.imag(rts[i]) 93 |         fromn = const * np.arctan2(im, re) 94 |         bw = -2 * const * np.log(np.abs(rts[i])) 95 |         if fromn > 150 and bw < 700 and fromn < fs / 2: 96 |             yf.append(fromn) 97 |             Bw.append(bw) 98 |     return yf[:min(len(yf), n_frmnt)], Bw[:min(len(Bw), n_frmnt)], U 99 | -------------------------------------------------------------------------------- /chapter5_语音降噪/5.1自适应滤波器.mdown: -------------------------------------------------------------------------------- 1 | 语音降噪主要研究如何利用信号处理技术消除信号中的强噪声干扰,从而提高输出信噪比以提取出有用信号的技术。消除信号中噪声污染的通常方法是让受污染的信号通过一个能抑制噪声而让信号相对不变的滤波器,此滤波器从信号不可检测的噪声场中取得输入,将此输入加以滤波,抵消其中的原始噪声,从而达到提高信噪比的目的。 2 | 3 | 然而,由于干扰通常都是随机的,从带噪语音中提取完全纯净的语音几乎不可能。在这种情况下,语音增强的目的主要有两个:一是改进语音质量,消除背景噪声,使听者乐于接受,不感觉疲劳,这是一种主观度量;二是提高语音可懂度,这是一种客观度量。这两个目的往往不能兼得,所以实际应用中总是视具体情况而有所侧重的。 4 | 5 | 根据语音和噪声的特点,出现了很多种语音增强算法。比较常用的有谱减法、维纳滤波法、卡尔曼滤波法、自适应滤波法等。此外,随着科学技术的发展又出现了一些新的增强技术,如基于神经网络的语音增强、基于HMM的语音增强、基于听觉感知的语音增强、基于多分辨率分析的语音增强、基于语音产生模型的线性滤波法、基于小波变换的语音增强方法、梳状滤波法、自相关法、基于语音模型的语音增强方法等。 6 | 7 | ### 带噪语音模型 8 | 9 | 通常所说的噪声是局部平稳的,是指一段带噪语音中的噪声,具有和语音段开始前那段噪声相同的统计特性,且在整个语音段中保持不变。也就是说,可以根据语音开始前那段噪声来估计语音中所叠加的噪声统计特性。 10 | 11 | ### LMS自适应滤波器 12 | 所谓自适应滤波器,就是利用前一时刻已获得的滤波器参数等结果,自动地调节现时刻的滤波器参数,以适应信号和噪声未知的随机变化的统计特性,从而实现最优滤波。 13 | 14 | 最小均方(LMS)自适应算法就是以已知期望响应和滤波器输出信号之间误差的均方值最小为准则的,依据输入信号在迭代过程中估计梯度矢量,并更新权系数以达到最优的自适应迭代算法。LMS算法是一种梯度最速下降方法,其显著的优点是它的简单性,这种算法不需要计算相应的相关函数,也不需要进行矩阵运算。 15 | 16 | 滤波器的输出$y(n)$表示为: 17 | 
$$y(n)=\bold{W^T}(n)\bold{X}(n)=\sum_{i=0}^{N-1}w_i(n)\bold{x}(n-i)$$ 18 | 19 | 对于LMS滤波器的结构,误差为:$\bold{e}(n)=\bold{d}(n)-\bold{y}(n)$。均方误差为: 20 | $$\epsilon=\bold{E}[\bold{e}^2(n)]=\bold{E}[(\bold{d}(n)-\bold{y}(n))^2]=\bold{E}[\bold{d}^2(n)]+\bold{W^T}(n)\bold{R}\bold{W}(n)-2\bold{PW}(n)$$ 21 | 22 | 其中$\bold{R}=\bold{E}[\bold{X}\bold{X^T}]$,是$N\times N$的自相关矩阵,$\bold{P}=\bold{E}[\bold{d}(n)\bold{X^T}(n)]$为互相关矩阵,代表理想信号$\bold{d}(n)$与输入矢量$\bold{X}(n)$的相关性。 23 | 24 | 在达到误差$\epsilon$最小时,有: 25 | $$\frac{\partial \epsilon}{\partial\bold{W}(n)}|_{\bold{W}(n)=W^*}=0$$ 26 | 27 | 有: 28 | $$\bold{RW^*-P}=0\rightarrow\bold{W^*=\bold{R^{-1}P}}$$ 29 | 30 | LMS算法使用梯度下降来解,即$\bold{W}:=\bold{W}-\mu\nabla W(n)$ 31 | 32 | $$\nabla W(n)=\frac{\partial E[e^2(n)]}{\partial W(n)}=E\Big[2e(n)\frac{\partial e(n)}{\partial W(n)}\Big]=E\Big[2e(n)\frac{\partial (\bold{d}(n)-\bold{y}(n))}{\partial W(n)}\Big]=-2E[e(n)\bold{x}(n)]$$ 33 | 34 | 那么: 35 | $$W(n+1)=W(n)+2\mu e(n)\bold{x}(n)$$ 36 | 37 | ### 语音质量性能指标 38 | - 信噪比 39 | $$SNR=10\lg \frac{\sum\limits_{n=1}^Ns^2(n)}{\sum\limits_{n=1}^Nd^2(n)}$$ 40 | $s$表示信号,$d$表示噪声。 41 | - PESQ(Perceptual Evaluation of Speech Quality) 42 | PESQ算法需要带噪的衰减信号和一个原始的参考信号。开始时将两个待比较的语音信号经过电平调整、输入滤波器滤波、时间对准和补偿、听觉变换之后,分别提取两路信号的参数,综合其时频特性,得到PESQ分数,最终将这个分数映射到主观平均意见分(MOS)。PESQ得分范围在-0.5到4.5之间,得分越高表示语音质量越好。 43 | -------------------------------------------------------------------------------- /chapter5_语音降噪/5.2谱减法.mdown: -------------------------------------------------------------------------------- 1 | ### 谱减法 2 | 对于任何一帧信号$x_i(m)$做FFT变换后: 3 | $$X_i(k)=\sum_{m=1}^Nx_i(m)\exp(-j\frac{2\pi mk}{N})$$ 4 | 5 | 对于$X_i(k)$的幅值为$|X_i(k)|$,角度为$X^i_{angle}(k)=\arctan[\frac{Im(X_i(k))}{Re(X_i(k))}]$,前导噪声段时长为IS,对应帧数为NIS,可以得到该噪声段的平均能量为: 6 | $$D(k)=\frac{1}{NIS}\sum_{i=1}^{NIS}|X_i(k)|^2$$ 7 | 8 | 谱减公式为: 9 | $$|\hat X_i(k)|^2=\left \{\begin{array}{ll} 10 | |X_i(k)|^2-a\times D(k)& |X_i(k)|^2\geqslant a \times D(k)\\ 11 | b\times D(k)&|X_i(k)|^2< a \times D(k) 12 | \end{array} \right.$$ 13 | 14 | 其中,$a,b$是两个常数,$a$为过减因子,$b$为增益补偿因子。 15 | 16 | 利用谱减后的幅值$|\hat X_i(k)|$,以及原先的相位角$X^i_{angle}(k)$,可以利用iFFT求出增强后的语音序列$\hat x_i(m)$。 17 | 18 | ### Boll改进谱减法 19 | (一)谱减公式改为: 20 | $$|\hat X_i(k)|^{\gamma}=\left \{\begin{array}{ll} 21 | |X_i(k)|^{\gamma}-a\times D(k)& |X_i(k)|^{\gamma}\geqslant a \times D(k)\\ 22 | b\times D(k)&|X_i(k)|^{\gamma}< a \times D(k) 23 | \end{array} \right.$$ 24 | 25 | $$D(k)=\frac{1}{NIS}\sum_{i=1}^{NIS}|X_i(k)|^{\gamma}$$ 26 | 27 | 当$\gamma=1$,算法相当于用谱幅值做谱减法,当$\gamma=2$,算法相当于用功率谱幅值做谱减法。 28 | 29 | (二)计算平均谱值代替 30 | $$Y_i(k)=\frac{1}{2M+1}\sum_{j=-M}^MX_{i+j}(k)$$ 31 | 32 | 使用$Y_i(k)$代替$X_i(k)$,可以得到较小的谱估算方差。 33 | 34 | (三)减小噪声残留 35 | $$D_i(k)=\left \{\begin{array}{ll} 36 | D_i(k)& D_i(k)\geqslant \max|N_R(k)|\\ 37 | \min\{D_j(k)|j \in [i-1,i,i+1]\}&D_i(k)< \max|N_R(k)| 38 | \end{array} \right.$$ 39 | 40 | 其中,$\max|N_R(k)|$为最大残余噪声。 41 | -------------------------------------------------------------------------------- /chapter5_语音降噪/5.3维纳滤波.mdown: -------------------------------------------------------------------------------- 1 | 基本维纳滤波就是用来解决从噪声中提取信号问题的一种过滤(或滤波)方法。它基于平稳随机过程模型,且假设退化模型为线性空间不变系统。实际上这种线性滤波问题,可以看成是一种估计问题或一种线性估计问题。基本的维纳滤波是根据全部过去的和当前的观察数据来估计信号的当前值,它的解是以均方误差最小条件下所得到的系统的传递函数H(z)或单位样本响应h(n)的形式给出的,因此更常称这种系统为最佳线性滤波器。设计维纳滤波器的过程就是寻求在最小均方误差下滤波器的单位样本响应h(n)或传递函数H(z)的表达式,其实质是解维纳-霍夫(Wiener-Hopf)方程。 2 | 3 | 带噪语音信号为: 4 | $$x(n)=s(n)+v(n)$$ 5 | 6 | 经过维纳滤波器$h(n)$的输出响应$y(n)$为: 7 | $$y(n)=x(n)*h(n)=\sum_mh(m)x(n-m)$$ 8 | 9 | 理论上,x(n)通过线性系统h(n)后得到的y(n)应尽量接近于s(n),因此y(n)为s(n)的估计值,可用$\hat s(n)$表示,即$y(n)=\hat s(n)$。$\hat s(n)$按最小均方误差准则使$\hat 
s(n)$和$s(n)$的均方误差$E[e^2(n)]=E[[s(n)-\hat s(n)]^2]$最小。 10 | $$\frac{\partial E[e^2(n)]}{\partial h(n)}=E[2e(n)\frac{\partial e(n)}{\partial h(n)}]=E[2e(n)x(n-m)]=0$$ 11 | 12 | 带入$e(n)$的式子: 13 | $$E[(s(n)-\hat s(n))x(n-m)]=0$$ 14 | 15 | 用$R_x(m-l)=E[x(n-m)x(n-l)]$表示$x(n)$的自相关函数,$R_{sx}(m)=E[s(n)x(n-m)]$表示$s(n)$和$x(n)$的互相关函数。 16 | 17 | 那么期望方程可以写成: 18 | $$\sum_lh(l)R_x(m-l)=R_{sx}(m)\tag{Wiener-Hopf方程}$$ 19 | 20 | 如果$R_{sx}(m)$和$R_x(m-l)$是已知的,那么解这个方程就是求维纳滤波器的冲击响应。 21 | 22 | 当l从0到N-1取有限个整数值时,设滤波器冲击响应序列的长度为N,冲击响应为:$\bold{h}=[h(0)h(1)...h(N-1)]^T$,滤波器数据输入矢量$\bold{x}(n)=[x(n)x(n-1)...x(n-N+1)]^T$,滤波器的输出为:$\bold{y}(n)=\hat s(n)=x^T(n)h(n)=h^Tx(n)$。 23 | 用$\bold{Q}=E[x(n)s(n)]$表示互相关函数,$\bold{R}=E[x(n)x^T(n)]$是$x(n)$的自相关函数,所以Wiener-Hopf方程可以写成: 24 | $$\bold{Q=Rh}$$ 25 | 26 | 那么$\bold{h_{opt}=R^{-1}Q}$ -------------------------------------------------------------------------------- /chapter5_语音降噪/C5_1_5.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter4_特征提取.共振峰估计 import * 3 | from chapter5_语音降噪.自适应滤波 import * 4 | 5 | 6 | def awgn(x, snr): 7 | snr = 10 ** (snr / 10.0) 8 | xpower = np.sum(x ** 2) / len(x) 9 | npower = xpower / snr 10 | return x + np.random.randn(len(x)) * np.sqrt(npower) 11 | 12 | 13 | data, fs = soundBase('C5_1_y.wav').audioread() 14 | data -= np.mean(data) 15 | data /= np.max(np.abs(data)) 16 | 17 | # 5.1.1 18 | N = len(data) 19 | time = [i / fs for i in range(N)] 20 | SNR = 5 21 | 22 | r1 = awgn(data, SNR) 23 | M, mu = 64, 0.001 24 | itr = len(r1) 25 | snr1 = SNR_Calc(data, r1 - data) 26 | [y, W, e] = LMS(r1, data, M, mu, itr) 27 | [yn, Wn, en] = NLMS(r1, data, M, mu, itr) 28 | output = e / np.max(np.abs(e)) 29 | outputn = en / np.max(np.abs(en)) 30 | snr2 = SNR_Calc(data, data - output) 31 | snr2n = SNR_Calc(data, data - outputn) 32 | plt.subplot(4, 1, 1) 33 | plt.plot(time, data) 34 | plt.ylabel('原始信号') 35 | plt.subplot(4, 1, 2) 36 | plt.ylabel('加入噪声') 37 | plt.plot(time, r1) 38 | plt.subplot(4, 1, 3) 39 | plt.ylabel('LMS去噪') 40 | plt.plot(time, output) 41 | 42 | plt.subplot(4, 1, 4) 43 | plt.ylabel('NLMS去噪') 44 | plt.plot(time, outputn) 45 | plt.savefig('images/LMS.png') 46 | plt.close() 47 | 48 | print('加入噪声SNR:{:.4f}\tLMS滤波后噪声SNR:{:.4f}\t下降SNR:{:.4f}'.format(snr1, snr2, snr2 - snr1)) 49 | print('加入噪声SNR:{:.4f}\tNLMS滤波后噪声SNR:{:.4f}\t下降SNR:{:.4f}'.format(snr1, snr2n, snr2n - snr1)) 50 | -------------------------------------------------------------------------------- /chapter5_语音降噪/C5_1_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter5_语音降噪/C5_1_y.wav -------------------------------------------------------------------------------- /chapter5_语音降噪/C5_2_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter5_语音降噪.自适应滤波 import * 3 | 4 | 5 | def awgn(x, snr): 6 | snr = 10 ** (snr / 10.0) 7 | xpower = np.sum(x ** 2) / len(x) 8 | npower = xpower / snr 9 | return x + np.random.randn(len(x)) * np.sqrt(npower) 10 | 11 | 12 | data, fs = soundBase('C5_1_y.wav').audioread() 13 | data -= np.mean(data) 14 | data /= np.max(np.abs(data)) 15 | IS = 0.25 # 设置前导无话段长度 16 | wlen = 200 # 设置帧长为25ms 17 | inc = 80 # 设置帧移为10ms 18 | SNR = 5 # 设置信噪比SNR 19 | N = len(data) # 信号长度 20 | time = [i / fs for i in range(N)] # 设置时间 21 | r1 = awgn(data, SNR) 22 | NIS = int((IS * fs - wlen) // inc 
+ 1) 23 | # 5.2.1 24 | snr1 = SNR_Calc(r1, r1 - data) 25 | a, b = 4, 0.001 26 | output = SpectralSub(r1, wlen, inc, NIS, a, b) 27 | if len(output) < len(r1): 28 | filted = np.zeros(len(r1)) 29 | filted[:len(output)] = output 30 | elif len(output) > len(r1): 31 | filted = output[:len(r1)] 32 | else: 33 | filted = output 34 | 35 | plt.subplot(4, 1, 1) 36 | plt.plot(time, data) 37 | plt.ylabel('原始信号') 38 | plt.subplot(4, 1, 2) 39 | plt.plot(time, r1) 40 | plt.ylabel('加噪声信号') 41 | plt.subplot(4, 1, 3) 42 | plt.ylabel('滤波信号') 43 | plt.plot(time, filted) 44 | 45 | # 5.2.2 46 | 47 | 48 | 49 | 50 | plt.show() 51 | -------------------------------------------------------------------------------- /chapter5_语音降噪/C5_2_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter5_语音降噪/C5_2_y.wav -------------------------------------------------------------------------------- /chapter5_语音降噪/C5_3_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter5_语音降噪/C5_3_y.wav -------------------------------------------------------------------------------- /chapter5_语音降噪/C5_4_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter5_语音降噪.Wavelet import * 3 | 4 | plt.rcParams['font.sans-serif'] = ['SimHei'] 5 | plt.rcParams['axes.unicode_minus'] = False 6 | 7 | 8 | def awgn(x, snr): 9 | snr = 10 ** (snr / 10.0) 10 | xpower = np.sum(x ** 2) / len(x) 11 | npower = xpower / snr 12 | return x + np.random.randn(len(x)) * np.sqrt(npower) 13 | 14 | 15 | data, fs = soundBase('C5_4_y.wav').audioread() 16 | data -= np.mean(data) 17 | data /= np.max(np.abs(data)) 18 | SNR = 5 19 | N = len(data) 20 | s = awgn(data, SNR) 21 | time = [i / fs for i in range(N)] # 设置时间 22 | 23 | wname = 'db7' 24 | jN = 6 25 | 26 | res_s = Wavelet_Soft(s, jN, wname) 27 | res_d = Wavelet_Hard(s, jN, wname) 28 | res_hs = Wavelet_hardSoft(s, jN, wname) 29 | res_a = Wavelet_average(s, jN, wname) 30 | 31 | plt.figure(figsize=(14, 10)) 32 | plt.subplot(3, 2, 1) 33 | plt.plot(time, data) 34 | plt.ylabel('原始信号') 35 | plt.subplot(3, 2, 2) 36 | plt.plot(time, s) 37 | plt.ylabel('加噪声信号') 38 | plt.subplot(3, 2, 3) 39 | plt.ylabel('小波软阈值滤波') 40 | plt.plot(time, res_s) 41 | 42 | plt.subplot(3, 2, 4) 43 | plt.ylabel('小波硬阈值滤波') 44 | plt.plot(time, res_d) 45 | 46 | plt.subplot(3, 2, 5) 47 | plt.ylabel('小波折中阈值滤波') 48 | plt.plot(time, res_hs) 49 | 50 | plt.subplot(3, 2, 6) 51 | plt.ylabel('小波加权滤波') 52 | plt.plot(time, res_a) 53 | 54 | plt.savefig('images/wavelet.png') 55 | plt.close() 56 | -------------------------------------------------------------------------------- /chapter5_语音降噪/C5_4_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter5_语音降噪/C5_4_y.wav -------------------------------------------------------------------------------- /chapter5_语音降噪/images/LMS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter5_语音降噪/images/LMS.png -------------------------------------------------------------------------------- 
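上面 C5_2_y.py 的 5.2.2 小节只留了空注释,后文 自适应滤波.py 里的 SpectralSubIm 也只是一个 pass 占位。下面给出按 5.2 节 Boll 改进公式补全该占位的一个参考草稿(并非原作者实现):假设参数 Gamma 即公式中的 γ、Beta 即增益补偿因子 b,过减因子 a 在函数内取 4(假设值),帧间平滑取 M=1,重构沿用 SpectralSub 的直接重叠相加。

~~~py
import numpy as np
from chapter3_分析实验.C3_1_y_1 import enframe


def SpectralSubIm(signal, wind, inc, NIS, Gamma, Beta):
    a = 4  # 过减因子,假设值
    y = enframe(signal, wind, inc)
    fn, flen = y.shape
    Y = np.fft.fft(y, axis=1)
    Y_a = np.abs(Y) ** Gamma  # Gamma=1为幅度谱,Gamma=2为功率谱
    Y_angle = np.angle(Y)
    # (二)相邻帧平滑(M=1),减小谱估计方差
    Y_s = np.copy(Y_a)
    Y_s[1:fn - 1, :] = (Y_a[:fn - 2, :] + Y_a[1:fn - 1, :] + Y_a[2:, :]) / 3
    # 用前导无话段估计噪声平均谱
    D = np.mean(Y_s[:NIS, :], axis=0)
    # (一)带过减与增益补偿的谱减
    hatY = np.where(Y_s >= a * D, Y_s - a * D, Beta * D)
    # (三)减小噪声残留:低于最大残余噪声处取相邻三帧的最小值
    NR = np.max(np.abs(Y_s[:NIS, :] - D), axis=0)
    for i in range(1, fn - 1):
        idx = hatY[i, :] < NR
        hatY[i, idx] = np.min(np.stack((hatY[i - 1, idx], hatY[i, idx], hatY[i + 1, idx])), axis=0)
    # 幅度开Gamma次方根,配合原相位重构,再重叠相加
    X = hatY ** (1 / Gamma) * np.exp(1j * Y_angle)
    hatx = np.real(np.fft.ifft(X, axis=1))
    sig = np.zeros((fn - 1) * inc + flen)
    for i in range(fn):
        sig[i * inc:i * inc + flen] += hatx[i, :]
    return sig
~~~

与 SpectralSub 一样,这里的重叠相加没有做窗函数能量归一化,输出只保证相对幅度正确,听前可再做一次峰值归一化。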
/chapter5_语音降噪/images/wavelet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter5_语音降噪/images/wavelet.png -------------------------------------------------------------------------------- /chapter5_语音降噪/wp_mfcc.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @application: 4 | @file: wp_mfcc.py 5 | @time: 2020/9/11 下午 14:39 6 | @desc: 测试小波包-MFCC文件 7 | ''' 8 | 9 | from chapter2_基础.soundBase import * 10 | from chapter5_语音降噪.Wavelet import * 11 | 12 | data, fs = soundBase('C5_4_y.wav').audioread()  # 与C5_4_y.py一致,audioread默认返回(data, fs)两个值 13 | data -= np.mean(data) 14 | data /= np.max(np.abs(data)) 15 | 16 | wpcoeff = wavePacketDec(data, 3, 'db7') 17 | dd = wavePacketRec(wpcoeff, 'db7') 18 | 19 | for i in range(len(dd)): 20 |     plt.subplot(len(dd), 2, 2 * i + 1) 21 |     plt.plot(dd[i]) 22 |     plt.subplot(len(dd), 2, 2 * i + 2) 23 |     plt.plot(np.linspace(0, fs / 2, len(dd[i]) // 2 + 1), np.abs(np.fft.rfft(dd[i]) / (len(dd[i]) // 2 + 1)) ** 2) 24 | plt.show() 25 | 26 | # 使用小波包-MFCC结合提取特征 27 | wmfcc = WPMFCC(data, fs, 12, 200, 100, 3, 'db7') 28 | mfcc = Nmfcc(data, fs, 12, 200, 100) 29 | print(1) 30 | -------------------------------------------------------------------------------- /chapter5_语音降噪/自适应滤波.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from chapter3_分析实验.C3_1_y_1 import enframe 3 | 4 | 5 | def SNR_Calc(s, r): 6 |     """ 7 |     计算信号的信噪比 8 |     :param s: 信号 9 |     :param r: 噪声 10 |     :return: 11 |     """ 12 |     Ps = np.sum(np.power(s - np.mean(s), 2)) 13 |     Pr = np.sum(np.power(r - np.mean(r), 2)) 14 |     return 10 * np.log10(Ps / Pr) 15 | 16 | 17 | def LMS(xn, dn, M, mu, itr): 18 |     """ 19 |     使用LMS自适应滤波 20 |     :param xn:输入的信号序列 21 |     :param dn:所期望的响应序列 22 |     :param M:滤波器的阶数 23 |     :param mu:收敛因子(步长) 24 |     :param itr:迭代次数 25 |     :return: 26 |     """ 27 |     en = np.zeros(itr)  # 误差序列,en(k)表示第k次迭代时预期输出与实际输入的误差 28 |     W = np.zeros((M, itr))  # 每一行代表一个加权参量,每一列代表一次迭代,初始为0 29 |     # 迭代计算 30 |     for k in range(M, itr): 31 |         x = xn[k:k - M:-1] 32 |         y = np.matmul(W[:, k - 1], x) 33 |         en[k] = dn[k] - y 34 |         W[:, k] = W[:, k - 1] + 2 * mu * en[k] * x 35 |     # 求最优输出序列 36 |     yn = np.inf * np.ones(len(xn)) 37 |     for k in range(M, len(xn)): 38 |         x = xn[k:k - M:-1] 39 |         yn[k] = np.matmul(W[:, -1], x) 40 |     return yn, W, en 41 | 42 | 43 | def NLMS(xn, dn, M, mu, itr): 44 |     """ 45 |     使用Normalized LMS自适应滤波 46 |     :param xn:输入的信号序列 47 |     :param dn:所期望的响应序列 48 |     :param M:滤波器的阶数 49 |     :param mu:收敛因子(步长) 50 |     :param itr:迭代次数 51 |     :return: 52 |     """ 53 |     en = np.zeros(itr)  # 误差序列,en(k)表示第k次迭代时预期输出与实际输入的误差 54 |     W = np.zeros((M, itr))  # 每一行代表一个加权参量,每一列代表一次迭代,初始为0 55 |     # 迭代计算 56 |     for k in range(M, itr): 57 |         x = xn[k:k - M:-1] 58 |         y = np.matmul(W[:, k - 1], x) 59 |         en[k] = dn[k] - y 60 |         W[:, k] = W[:, k - 1] + 2 * mu * en[k] * x / (np.sum(np.multiply(x, x)) + 1e-10) 61 |     # 求最优输出序列 62 |     yn = np.inf * np.ones(len(xn)) 63 |     for k in range(M, len(xn)): 64 |         x = xn[k:k - M:-1] 65 |         yn[k] = np.matmul(W[:, -1], x) 66 |     return yn, W, en 67 | 68 | 69 | def SpectralSub(signal, wlen, inc, NIS, a, b): 70 |     """ 71 |     谱减法滤波 72 |     :param signal: 73 |     :param wlen: 74 |     :param inc: 75 |     :param NIS: 76 |     :param a: 77 |     :param b: 78 |     :return: 79 |     """ 80 |     wnd = np.hamming(wlen) 81 |     y = enframe(signal, wnd, inc) 82 |     fn, flen = y.shape 83 |     y_a = np.abs(np.fft.fft(y, axis=1)) 84 |     y_a2 = np.power(y_a, 2) 85 |     y_angle = np.angle(np.fft.fft(y, axis=1)) 86 |     Nt = 
np.mean(y_a2[:NIS, ], axis=0) 87 | 88 | y_a2 = np.where(y_a2 >= a * Nt, y_a2 - a * Nt, b * Nt) 89 | 90 | X = y_a2 * np.cos(y_angle) + 1j * y_a2 * np.sin(y_angle) 91 | hatx = np.real(np.fft.ifft(X, axis=1)) 92 | 93 | sig = np.zeros(int((fn - 1) * inc + wlen)) 94 | 95 | for i in range(fn): 96 | start = i * inc 97 | sig[start:start + flen] += hatx[i, :] 98 | return sig 99 | 100 | 101 | def SpectralSubIm(signal, wind, inc, NIS, Gamma, Beta): 102 | pass 103 | -------------------------------------------------------------------------------- /chapter6_语音编码/6.1PCM编码.mdown: -------------------------------------------------------------------------------- 1 | 脉冲编码调制(Pulse Code Modulation, PCM)是语音信号的重要编码方式之一。语音编码是将模拟信号转为数字信号的语音通信技术,分为波形编码、参量编码和混合编码等类型。波形编码针对语音波形进行,在降低量化样本比特数的同时保持了良好的语音质量。PCM编码就是一种波形编码方法,通过每隔一段时间对模拟语音信号采样,将其取整量化,用二进制码表示抽样量化的隔值,实现将语音数字化的编码调制。 2 | PCM是现代数字传输系统普遍采用的调制方式。PCM可以向用户提供多种业务,包括2M-155Mbit/s速率的数字数据专线业务和语音、图像、远程教学等业务,适用于传输速率要求高、需要更高带宽的应用,在语音信号处理中有着广泛的运用。PCM 分为抽样、量化和编码三个步骤。 3 | ![PCM](images/PCM流程.png) 4 | 抽样过程就是抽取某点的频率值的过程。显然,在1 s 内抽取的点越多,获取的频率信息越丰富。为了复原波形, 一次振动中必须有2 个点的采样,人耳能够感觉到的最高频率为20 kHz ,因此要满足人耳的听觉要求,则需要至少每秒进行40000 次采样,即采样率为44.1 kHz 。光有频率信息是不够的,还必须获得该频率的能量值并量化,用于表示信号强度。量化电平数为2 的整数次幕,常用16 bit 的采样大小即$2^{16}$。例如对一个语音信号进行8次采样,采样点分别对应的能量值分别为A1-A8,使用2 bit的采样大小只能保留A1-A8中4个点,而进行3 bit的采样则刚好记录下8 点的所有信息。采样率和采样大小的值越大,记录的波形越接近原始信号。 5 | 6 | 抽样量化就是常规A/D的过程,量化后的抽样信号在一定范围内仅有有限个可取的样值,且信号正负幅度分布的对称性使正负样值个数相等,若将有限个量化样值绝对值从小到大排列,依次赋予十进制数字,以﹢、﹣号为前缀,则量化后的抽样信号就转化为按时序排列的十进制数字码流。将数字转化为二进制编码,根据十进制代码总个数确定二进制位数,即字长。这样将量化的抽样信号变成给定字长的二进制码流的过程就是编码。 7 | 8 | ~~~py 9 | import numpy as np 10 | 11 | 12 | def PCM_encode(x): 13 | n = len(x) 14 | out = np.zeros((n, 8)) 15 | for i in range(n): 16 | # 符号位 17 | if x[i] > 0: 18 | out[i, 0] = 1 19 | else: 20 | out[i, 0] = 0 21 | # 数据位 22 | if abs(x[i]) < 32: 23 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 0, 0, 2, 0 24 | elif abs(x[i]) < 64: 25 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 0, 1, 2, 32 26 | elif abs(x[i]) < 128: 27 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 1, 0, 4, 64 28 | elif abs(x[i]) < 256: 29 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 1, 1, 8, 128 30 | elif abs(x[i]) < 512: 31 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 0, 0, 16, 256 32 | elif abs(x[i]) < 1024: 33 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 0, 1, 32, 512 34 | elif abs(x[i]) < 2048: 35 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 1, 0, 64, 1024 36 | else: 37 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 1, 1, 128, 2048 38 | 39 | if abs(x[i]) >= 4096: 40 | out[i, 1:] = np.array([1, 1, 1, 1, 1, 1]) 41 | else: 42 | tmp = bin(int((abs(x[i]) - st) / step)).replace('0b', '') 43 | tmp = '0' * (4 - len(tmp)) + tmp 44 | t = [int(k) for k in tmp] 45 | out[i, 4:] = t 46 | return out.reshape(8 * n) 47 | 48 | 49 | def PCM_decode(ins, v): 50 | inn = ins.reshape(len(ins) // 8, 8) 51 | slot = np.array([0, 32, 64, 128, 256, 512, 1024, 2048]) 52 | step = np.array([2, 2, 4, 8, 16, 32, 64, 128]) 53 | out = np.zeros(len(ins) // 8) 54 | for i in range(inn.shape[0]): 55 | sgn = 2 * inn[i, 0] - 1 56 | tmp = int(inn[i, 1] * 4 + inn[i, 2] * 2 + inn[i, 3]) 57 | st = slot[tmp] 58 | dt = (inn[i, 4] * 8 + inn[i, 5] * 4 + inn[i, 6] * 2 + inn[i, 7]) * step[tmp] + 0.5 * step[tmp] 59 | out[i] = sgn * (st + dt) / 4096 * v 60 | return out 61 | 62 | ~~~ 63 | 64 | ~~~py 65 | from chapter2_基础.soundBase import * 66 | from chapter6_语音编码.PCM import * 67 | 68 | plt.rcParams['font.sans-serif'] = ['SimHei'] 69 | plt.rcParams['axes.unicode_minus'] = False 70 | 71 | data, fs = 
soundBase('C6_1_y.wav').audioread() 72 | 73 | sxx = np.array(list(map(int, data * 4096))) 74 | 75 | y = PCM_encode(sxx) 76 | 77 | yy = PCM_decode(y, 1) 78 | 79 | plt.subplot(3, 1, 1) 80 | plt.plot(data) 81 | plt.title('编码前') 82 | 83 | plt.subplot(3, 1, 2) 84 | plt.plot(yy) 85 | plt.title('解码后') 86 | 87 | plt.subplot(3, 1, 3) 88 | plt.plot(yy - data) 89 | plt.title('误差') 90 | 91 | plt.savefig('images/pcm.png') 92 | plt.close() 93 | 94 | ~~~ 95 | 96 | ![pcm](images/pcm.png) 97 | -------------------------------------------------------------------------------- /chapter6_语音编码/6.2LPC编码.mdown: -------------------------------------------------------------------------------- 1 | 线性预测编码(linear predictive coding, LPC) 是运用于音频信号处理与语音处理的压缩编码方式,根据线性预测模型的信息表示数字语音信号谱包络。它是最有效的语音分析技术之一,也是低位速高质量语音编码的最有用的方法之一,能够提供非常精确的语音参数预测。线性预测编码通过估计共振峰剔除它们在语音信号中的作用,估计保留的蜂鸣音强度与频率来分析语音信号;同时,使用蜂呜参数与残余信号生成源信号,使用共振峰生成表示声道的滤波器,源、信号经过滤坡器的处理来逆向合成语音信号。由于语音信号随着时间变化这个过程是在一段段的语音信号帧上进行处理的,通常每秒30-50帧就能对可理解的信号进行很好的压缩。 2 | 3 | 线性预测编码通常用于语音的重新合成,它是电话公司使用的声音压缩格式,如GSM标准就在使用LPC编码格式。它还用作安全无线通信中的格式,在安全的无线通信中,声音必须进行数字化、加密然后通过狭窄的语音信道传输。 4 | 5 | 线性预测分析的基本思想是:由于语音样点之间存在相关性,所以可以用过去的样点值来预测现在或将来的样点值,即一个语音抽样可以用过去若干个语音抽样或它们的线性组合来逼近。通过使实现语音抽样与线性预测抽样之间的误差在某个准则(通常为最小均方误差准则)下达到最小值来决定一组预测系数。这一组预测系数就反映了语音信号的特性,可以作为语音信号的特征参数用于语音合成和语音识别等。 6 | 7 | 线性预测分析的基本假设是认为语音信号是一个激励信号通过一个滤波器得到的。采样点的输出$s(n)$可以用前p个样本的线性组合来表示: 8 | $$\hat s(n)=\sum^p a_is(n-i)$$ 9 | 10 | 通过$E(n)=s(n)-\hat s(n)$最小来求解。LPC参数是模拟人的发声器官,是基于语音合成的模型参数,每个声管对应一个LPC模型的极点,一般情况下,极点个数为12~16之间,就可以足够清晰地描述语音信号的特征,选择p=12可以对绝大多数语音信号的声道模型取得足够的近似,增大p可以改善近似效果,但是计算量变大。 11 | 12 | 13 | ~~~py 14 | from scipy.signal import filtfilt 15 | 16 | from chapter2_基础.soundBase import * 17 | from chapter3_分析实验.lpc import lpc_coeff 18 | 19 | plt.rcParams['font.sans-serif'] = ['SimHei'] 20 | plt.rcParams['axes.unicode_minus'] = False 21 | 22 | data, fs = soundBase('C6_1_y.wav').audioread() 23 | N = len(data) 24 | time = [i / fs for i in range(N)] # 设置时间 25 | p = 12 26 | ar, g = lpc_coeff(data, p) 27 | ar[0] = 0 28 | est_x = filtfilt(-ar, [1], data) 29 | 30 | plt.subplot(2, 1, 1) 31 | plt.plot(time, data, 'k') 32 | plt.plot(time, est_x, 'c') 33 | plt.title('LPC解码') 34 | plt.legend(['信号', '解码信号']) 35 | plt.subplot(2, 1, 2) 36 | plt.plot(est_x - data) 37 | plt.title('误差') 38 | plt.savefig('LPC解码.png') 39 | plt.close() 40 | 41 | ~~~ 42 | 43 | ![LPC](images/LPC解码.png) -------------------------------------------------------------------------------- /chapter6_语音编码/ADPCM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def adpcm_decoder(code, sign_bit): 5 | """ 6 | APDCM解码函数 7 | :param code: 8 | :param sign_bit: 9 | :return: 10 | """ 11 | l = len(code) 12 | y = np.zeros(l) 13 | ss2 = np.zeros(l) 14 | ss2[0] = 1 15 | # 生成步长查找表 16 | index = [-1, 4] 17 | currentIndex = 0 18 | startval = 1 19 | endval = 127 20 | base = np.exp(np.log(2) / 8) 21 | # 近似步长 22 | const = startval / base 23 | numSteps = int(round(np.log(endval / const) / np.log(base))) 24 | n = [i + 1 for i in range(numSteps)] 25 | base = np.exp(np.log(endval / startval) / (numSteps - 1)) 26 | const = startval / base 27 | tabel2 = np.round(const * np.power(base, n)) 28 | 29 | for n in range(1, l): 30 | neg = code[n] >= sign_bit 31 | if neg: 32 | temp = code[n] - sign_bit 33 | else: 34 | temp = code[n] 35 | temp2 = (temp + 0.5) * ss2[n - 1] 36 | if neg: 37 | temp2 = -temp2 38 | y[n] = y[n - 1] + temp2 39 | if y[n] > 127: 40 | y[n] = 127 41 | elif y[n] < -127: 42 | y[n] = -127 43 | # 计算新的步长 44 | currentIndex += 
index[int(temp)] 45 | if currentIndex < 0: 46 | currentIndex = 0 47 | elif currentIndex > numSteps: 48 | currentIndex = numSteps 49 | ss2[n] = tabel2[currentIndex] 50 | return y / 128 51 | 52 | 53 | def adpcm_encoder(x, sign_bit): 54 | """ 55 | APDCM编码函数 56 | :param x: 57 | :param sign_bit: 58 | :return: 59 | """ 60 | x *= 128 61 | l = len(x) 62 | # 生成步长查找表 63 | index = [-1, 4] 64 | currentIndex = 1 65 | startval = 1 66 | endval = 127 67 | base = np.exp(np.log(2) / 8) 68 | # 近似步长 69 | const = startval / base 70 | numSteps = int(round(np.log(endval / const) / np.log(base))) 71 | n = [i + 1 for i in range(numSteps)] 72 | base = np.exp(np.log(endval / startval) / (numSteps - 1)) 73 | const = startval / base 74 | tabel2 = np.round(const * np.power(base, n)) 75 | 76 | ss = np.zeros(l) 77 | ss[0] = tabel2[0] 78 | z = np.zeros(l) 79 | code = np.zeros(l) 80 | d = np.zeros(l) 81 | neg = 0 82 | for n in range(1, l): 83 | d[n] = x[n] - z[n - 1] 84 | if d[n] < 0: 85 | neg = 1 86 | code[n] += sign_bit 87 | d[n] = -d[n] 88 | else: 89 | neg = 0 90 | if d[n] >= ss[n - 1]: 91 | code[n] += 1 92 | if neg: 93 | temp = code[n] - sign_bit 94 | else: 95 | temp = code[n] 96 | temp2 = (temp + 0.5) * ss[n - 1] 97 | if neg: 98 | temp2 = -temp2 99 | z[n] = z[n - 1] + temp2 100 | if z[n] > 127: 101 | z[n] = 127 102 | elif z[n] < -127: 103 | z[n] = -127 104 | # 计算新的步长 105 | currentIndex += index[int(temp)] 106 | if currentIndex < 0: 107 | currentIndex = 0 108 | elif currentIndex > numSteps: 109 | currentIndex = numSteps 110 | ss[n] = tabel2[currentIndex] 111 | return code 112 | -------------------------------------------------------------------------------- /chapter6_语音编码/C6_1_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter6_语音编码.PCM import * 3 | 4 | plt.rcParams['font.sans-serif'] = ['SimHei'] 5 | plt.rcParams['axes.unicode_minus'] = False 6 | 7 | data, fs = soundBase('C6_1_y.wav').audioread() 8 | 9 | sxx = np.array(list(map(int, data * 4096))) 10 | 11 | y = PCM_encode(sxx) 12 | 13 | yy = PCM_decode(y, 1) 14 | 15 | plt.subplot(3, 1, 1) 16 | plt.plot(data) 17 | plt.title('编码前') 18 | 19 | plt.subplot(3, 1, 2) 20 | plt.plot(yy) 21 | plt.title('解码后') 22 | 23 | plt.subplot(3, 1, 3) 24 | plt.plot(yy - data) 25 | plt.title('误差') 26 | 27 | plt.savefig('images/pcm.png') 28 | plt.close() 29 | -------------------------------------------------------------------------------- /chapter6_语音编码/C6_1_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter6_语音编码/C6_1_y.wav -------------------------------------------------------------------------------- /chapter6_语音编码/C6_2_y.py: -------------------------------------------------------------------------------- 1 | from scipy.signal import filtfilt 2 | 3 | from chapter2_基础.soundBase import * 4 | from chapter3_分析实验.lpc import lpc_coeff 5 | 6 | plt.rcParams['font.sans-serif'] = ['SimHei'] 7 | plt.rcParams['axes.unicode_minus'] = False 8 | 9 | data, fs = soundBase('C6_1_y.wav').audioread() 10 | N = len(data) 11 | time = [i / fs for i in range(N)] # 设置时间 12 | p = 12 13 | ar, g = lpc_coeff(data, p) 14 | ar[0] = 0 15 | est_x = filtfilt(-ar, [1], data) 16 | 17 | plt.subplot(2, 1, 1) 18 | plt.plot(time, data, 'k') 19 | plt.plot(time, est_x, 'c') 20 | plt.title('LPC解码') 21 | plt.legend(['信号', '解码信号']) 22 | plt.subplot(2, 1, 2) 23 | plt.plot(est_x - data) 24 | 
plt.title('误差') 25 | plt.savefig('images/LPC解码.png') 26 | plt.close() 27 | -------------------------------------------------------------------------------- /chapter6_语音编码/C6_3_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter6_语音编码.ADPCM import * 3 | 4 | plt.rcParams['font.sans-serif'] = ['SimHei'] 5 | plt.rcParams['axes.unicode_minus'] = False 6 | 7 | data, fs = soundBase('C6_3_y.wav').audioread() 8 | N = len(data) 9 | time = [i / fs for i in range(N)] # 设置时间 10 | sig_bit = 2 11 | 12 | ss = adpcm_encoder(data, sig_bit) 13 | yy = adpcm_decoder(ss, sig_bit) 14 | 15 | plt.subplot(2, 1, 1) 16 | plt.plot(time, data / np.max(data), 'k') 17 | plt.plot(time, yy / np.max(yy), 'c') 18 | plt.title('ADPCM解码') 19 | plt.legend(['信号', '解码信号']) 20 | plt.subplot(2, 1, 2) 21 | plt.plot(data / np.max(data) - yy / np.max(yy)) 22 | plt.title('误差') 23 | plt.savefig('images/ADPMC.png') 24 | plt.close() 25 | -------------------------------------------------------------------------------- /chapter6_语音编码/C6_3_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter6_语音编码/C6_3_y.wav -------------------------------------------------------------------------------- /chapter6_语音编码/LPC解码.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter6_语音编码/LPC解码.png -------------------------------------------------------------------------------- /chapter6_语音编码/PCM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def PCM_encode(x): 5 | n = len(x) 6 | out = np.zeros((n, 8)) 7 | for i in range(n): 8 | # 符号位 9 | if x[i] > 0: 10 | out[i, 0] = 1 11 | else: 12 | out[i, 0] = 0 13 | # 数据位 14 | if abs(x[i]) < 32: 15 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 0, 0, 2, 0 16 | elif abs(x[i]) < 64: 17 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 0, 1, 2, 32 18 | elif abs(x[i]) < 128: 19 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 1, 0, 4, 64 20 | elif abs(x[i]) < 256: 21 | out[i, 1], out[i, 2], out[i, 3], step, st = 0, 1, 1, 8, 128 22 | elif abs(x[i]) < 512: 23 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 0, 0, 16, 256 24 | elif abs(x[i]) < 1024: 25 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 0, 1, 32, 512 26 | elif abs(x[i]) < 2048: 27 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 1, 0, 64, 1024 28 | else: 29 | out[i, 1], out[i, 2], out[i, 3], step, st = 1, 1, 1, 128, 2048 30 | 31 | if abs(x[i]) >= 4096: 32 | out[i, 1:] = np.array([1, 1, 1, 1, 1, 1]) 33 | else: 34 | tmp = bin(int((abs(x[i]) - st) / step)).replace('0b', '') 35 | tmp = '0' * (4 - len(tmp)) + tmp 36 | t = [int(k) for k in tmp] 37 | out[i, 4:] = t 38 | return out.reshape(8 * n) 39 | 40 | 41 | def PCM_decode(ins, v): 42 | inn = ins.reshape(len(ins) // 8, 8) 43 | slot = np.array([0, 32, 64, 128, 256, 512, 1024, 2048]) 44 | step = np.array([2, 2, 4, 8, 16, 32, 64, 128]) 45 | out = np.zeros(len(ins) // 8) 46 | for i in range(inn.shape[0]): 47 | sgn = 2 * inn[i, 0] - 1 48 | tmp = int(inn[i, 1] * 4 + inn[i, 2] * 2 + inn[i, 3]) 49 | st = slot[tmp] 50 | dt = (inn[i, 4] * 8 + inn[i, 5] * 4 + inn[i, 6] * 2 + inn[i, 7]) * step[tmp] + 0.5 * step[tmp] 51 | out[i] = sgn * (st + dt) / 4096 * v 52 | return out 53 | 
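PCM.py 的查表逻辑可以用一个具体样值来核对(以下数值为手算结果,导入路径按本仓库结构假设):取 x=100,符号位为 1;因 64≤100<128,段落码为 010,对应起始值 st=64、量化间隔 step=4;段内码为 (100-64)//4=9,即 1001。解码取量化区间中点,得 64+(9+0.5)×4=102。

~~~py
import numpy as np
from chapter6_语音编码.PCM import PCM_encode, PCM_decode

x = np.array([100])         # 落在[64, 128)段的样值
code = PCM_encode(x)        # 符号位1 + 段落码010 + 段内码1001
print(code)                 # [1. 0. 1. 0. 1. 0. 0. 1.]
y = PCM_decode(code, 4096)  # v=4096时抵消内部的/4096,直接得到量化中点
print(y)                    # [102.]
~~~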
-------------------------------------------------------------------------------- /chapter6_语音编码/images/ADPMC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter6_语音编码/images/ADPMC.png -------------------------------------------------------------------------------- /chapter6_语音编码/images/LPC解码.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter6_语音编码/images/LPC解码.png -------------------------------------------------------------------------------- /chapter6_语音编码/images/PCM流程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter6_语音编码/images/PCM流程.png -------------------------------------------------------------------------------- /chapter6_语音编码/images/pcm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter6_语音编码/images/pcm.png -------------------------------------------------------------------------------- /chapter7_语音合成/7.2LPC的语音合成.mdown: -------------------------------------------------------------------------------- 1 | 对于线性预测器$\hat s(n)=\sum\limits_{i=1}^pa_is(n-i)$,其误差为$e(n)=s(n)-\hat s(n)$,那么在预测误差$e(n)$和预测系数$a_i$已知的情况下,就可以求出合成语音。 2 | ~~~py 3 | from chapter2_基础.soundBase import * 4 | from chapter7_语音合成.flipframe import * 5 | from chapter3_分析实验.C3_1_y_1 import enframe 6 | from chapter3_分析实验.lpc import lpc_coeff 7 | 8 | from scipy.signal import lfilter 9 | 10 | plt.rcParams['font.sans-serif'] = ['SimHei'] 11 | plt.rcParams['axes.unicode_minus'] = False 12 | 13 | data, fs = soundBase('C7_2_y.wav').audioread() 14 | 15 | data -= np.mean(data) 16 | data /= np.max(np.abs(data)) 17 | N = len(data) 18 | time = [i / fs for i in range(N)] # 设置时间 19 | p = 12 20 | wlen, inc = 200, 80 21 | msoverlap = wlen - inc 22 | y = enframe(data, wlen, inc) 23 | fn = y.shape[0] 24 | Acoef = np.zeros((y.shape[0], p + 1)) 25 | resid = np.zeros(y.shape) 26 | synFrame = np.zeros(y.shape) 27 | ## 7.2.1 28 | 29 | # 求每帧的LPC系数与预测误差 30 | for i in range(fn): 31 | a, _ = lpc_coeff(y[i, :], p) 32 | Acoef[i, :] = a 33 | resid[i, :] = lfilter(a, [1], y[i, :]) 34 | 35 | # 语音合成 36 | for i in range(fn): 37 | synFrame[i, :] = lfilter([1], Acoef[i, :], resid[i, :]) 38 | 39 | outspeech = Filpframe_OverlapS(synFrame, np.hamming(wlen), inc) 40 | plt.subplot(2, 1, 1) 41 | plt.plot(data / np.max(np.abs(data)), 'k') 42 | plt.title('原始信号') 43 | plt.subplot(2, 1, 2) 44 | plt.title('还原信号-LPC与误差') 45 | plt.plot(outspeech / np.max(np.abs(outspeech)), 'c') 46 | 47 | plt.show() 48 | 49 | ~~~ 50 | 51 | ![LPC](images/LPC与误差.png) -------------------------------------------------------------------------------- /chapter7_语音合成/7.3共振峰检测和基音参数的语音合成.mdown: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter7_语音合成/7.3共振峰检测和基音参数的语音合成.mdown -------------------------------------------------------------------------------- /chapter7_语音合成/7.4语音的变调和变速.mdown: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter7_语音合成/7.4语音的变调和变速.mdown -------------------------------------------------------------------------------- /chapter7_语音合成/C7_1_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter7_语音合成.flipframe import * 3 | from chapter3_分析实验.C3_1_y_1 import enframe 4 | 5 | plt.rcParams['font.sans-serif'] = ['SimHei'] 6 | plt.rcParams['axes.unicode_minus'] = False 7 | 8 | data, fs = soundBase('C7_1_y.wav').audioread() 9 | 10 | wlen = 256 11 | wnd = np.hamming(wlen) 12 | overlap = 100 13 | f = enframe(data, wnd, overlap) 14 | plt.figure(figsize=(14, 12)) 15 | # 7.1.1 16 | fn_overlap = Filpframe_OverlapA(f, wnd, overlap) 17 | plt.subplot(3, 2, 1) 18 | plt.plot(data / np.max(np.abs(data)), 'k') 19 | plt.title('原始信号') 20 | plt.subplot(3, 2, 2) 21 | plt.title('还原信号-重叠相加法') 22 | plt.plot(fn_overlap / np.max(np.abs(fn_overlap)), 'c') 23 | 24 | # 7.1.2 25 | fn_s = Filpframe_OverlapS(f, wnd, overlap) 26 | plt.subplot(3, 2, 3) 27 | plt.plot(data / np.max(np.abs(data)), 'k') 28 | plt.title('原始信号') 29 | plt.subplot(3, 2, 4) 30 | plt.title('还原信号-重叠存储法') 31 | plt.plot(fn_s / np.max(np.abs(fn_s)), 'c') 32 | 33 | # 7.1.3 34 | fn_l = Filpframe_LinearA(f, wnd, overlap) 35 | plt.subplot(3, 2, 5) 36 | plt.plot(data / np.max(np.abs(data)), 'k') 37 | plt.title('原始信号') 38 | plt.subplot(3, 2, 6) 39 | plt.title('还原信号-线性叠加法') 40 | plt.plot(fn_l / np.max(np.abs(fn_l)), 'c') 41 | 42 | plt.savefig('images/flipframe.png') 43 | plt.close() 44 | -------------------------------------------------------------------------------- /chapter7_语音合成/C7_1_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter7_语音合成/C7_1_y.wav -------------------------------------------------------------------------------- /chapter7_语音合成/C7_2_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter7_语音合成.flipframe import * 3 | from chapter3_分析实验.C3_1_y_1 import enframe 4 | from chapter3_分析实验.lpc import lpc_coeff 5 | 6 | from scipy.signal import lfilter 7 | 8 | plt.rcParams['font.sans-serif'] = ['SimHei'] 9 | plt.rcParams['axes.unicode_minus'] = False 10 | 11 | data, fs = soundBase('C7_2_y.wav').audioread() 12 | 13 | data -= np.mean(data) 14 | data /= np.max(np.abs(data)) 15 | N = len(data) 16 | time = [i / fs for i in range(N)] # 设置时间 17 | p = 12 18 | wlen, inc = 200, 80 19 | msoverlap = wlen - inc 20 | y = enframe(data, wlen, inc) 21 | fn = y.shape[0] 22 | Acoef = np.zeros((y.shape[0], p + 1)) 23 | resid = np.zeros(y.shape) 24 | synFrame = np.zeros(y.shape) 25 | ## 7.2.1 26 | 27 | # 求每帧的LPC系数与预测误差 28 | for i in range(fn): 29 | a, _ = lpc_coeff(y[i, :], p) 30 | Acoef[i, :] = a 31 | resid[i, :] = lfilter(a, [1], y[i, :]) 32 | 33 | # 语音合成 34 | for i in range(fn): 35 | synFrame[i, :] = lfilter([1], Acoef[i, :], resid[i, :]) 36 | 37 | outspeech = Filpframe_OverlapS(synFrame, np.hamming(wlen), inc) 38 | plt.subplot(2, 1, 1) 39 | plt.plot(data / np.max(np.abs(data)), 'k') 40 | plt.title('原始信号') 41 | plt.subplot(2, 1, 2) 42 | plt.title('还原信号-LPC与误差') 43 | plt.plot(outspeech / np.max(np.abs(outspeech)), 'c') 44 | 45 | plt.savefig('images/LPC与误差.png') 46 | plt.close() 47 | -------------------------------------------------------------------------------- 
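C7_2_y.py 用真实预测误差做激励,所以能几乎无损地还原语音;而 7.3 节要做的共振峰/基音参数合成在 C7_3_y.py 中尚未写完(## To Be continue)。下面是一个用基音脉冲串(浊音)或白噪声(清音)代替真实残差的单帧再合成草稿,属于经典 LPC 声码器思路,并非原作者的实现:假设 lpc_coeff 的系数约定与 C7_2_y.py 一致,返回的 a 以 1 开头,lfilter([1], a, e) 即合成滤波。

~~~py
import numpy as np
from scipy.signal import lfilter
from chapter3_分析实验.lpc import lpc_coeff


def synth_frame_pitch(frame, p, T0):
    """
    单帧LPC参数合成(示意):T0为基音周期(样点数),T0<=0时按清音处理
    """
    a, _ = lpc_coeff(frame, p)
    if T0 > 0:
        exc = np.zeros(len(frame))
        exc[::int(T0)] = 1.0  # 浊音:周期脉冲串激励
    else:
        exc = np.random.randn(len(frame))  # 清音:白噪声激励
    resid = lfilter(a, [1], frame)  # 真实残差仅用来估计激励增益
    g = np.sqrt(np.sum(resid ** 2) / (np.sum(exc ** 2) + 1e-20))
    return lfilter([1], a, g * exc)  # 合成滤波 1/A(z)
~~~

逐帧调用后仍需用 flipframe.py 中的重叠方法拼回整段语音;基音周期可直接用 C7_3_y.py 里 pitch_Ceps 加 pitfilterm1 平滑后的 Dpitch。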
/chapter7_语音合成/C7_2_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter7_语音合成/C7_2_y.wav -------------------------------------------------------------------------------- /chapter7_语音合成/C7_3_y.py: -------------------------------------------------------------------------------- 1 | from chapter2_基础.soundBase import * 2 | from chapter3_分析实验.timefeature import * 3 | from chapter7_语音合成.flipframe import * 4 | from chapter3_分析实验.C3_1_y_1 import enframe 5 | from chapter3_分析实验.lpc import lpc_coeff 6 | from chapter4_特征提取.共振峰估计 import * 7 | 8 | from chapter4_特征提取.pitch_detection import * 9 | 10 | from chapter7_语音合成.myfilter import * 11 | 12 | from scipy.signal import lfilter 13 | 14 | plt.rcParams['font.sans-serif'] = ['SimHei'] 15 | plt.rcParams['axes.unicode_minus'] = False 16 | 17 | data, fs = soundBase('C7_3_y.wav').audioread() 18 | data -= np.mean(data) 19 | data /= np.max(np.abs(data)) 20 | data = lfilter([1, -0.99], 1, data) 21 | N = len(data) 22 | time = [i / fs for i in range(N)] # 设置时间 23 | wlen = 240 24 | inc = 80 25 | overlap = wlen - inc 26 | n2 = [i for i in range(wlen // 2)] 27 | w1 = [i / overlap for i in range(overlap)] 28 | w2 = [i / overlap for i in range(overlap - 1, -1, -1)] 29 | wnd = np.hamming(wlen) 30 | X = enframe(data, wnd, inc) 31 | fn = X.shape[0] 32 | Etmp = np.sum(np.power(X, 2), axis=1) 33 | Etmp /= np.max(Etmp) 34 | T1, r2 = 0.1, 0.5 35 | miniL = 10 36 | mnlong = 5 37 | ThrC = [10, 15] 38 | p = 12 39 | 40 | frameTime = FrameTimeC(fn, wlen, inc, fs) 41 | Doption = 0 42 | 43 | voiceseg, vosl, SF, Ef, period = pitch_Ceps(data, wlen, inc, T1, fs) 44 | Dpitch = pitfilterm1(period, voiceseg, vosl) 45 | ## 共振峰检测 46 | Frmt = np.zeros((3, fn)) 47 | Bw = np.zeros((3, fn)) 48 | U = np.zeros((3, fn)) 49 | for i in range(len(SF)): 50 | Frmt[:, i], Bw[:, i], U[:, i] = Formant_Root(X[:, ], p, fs, 3) 51 | 52 | # 语音合成 53 | 54 | zint = np.zeros((2, 4)) 55 | tal = 0 56 | for i in range(fn): 57 | yf = Frmt[:, i] 58 | bw = Bw[:, i] 59 | ## To Be continue 60 | 61 | -------------------------------------------------------------------------------- /chapter7_语音合成/C7_3_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter7_语音合成/C7_3_y.wav -------------------------------------------------------------------------------- /chapter7_语音合成/flipframe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def Filpframe_OverlapA(x, win, inc): 5 | """ 6 | 基于重叠相加法的信号还原函数 7 | :param x: 分帧数据 8 | :param win: 窗 9 | :param inc: 帧移 10 | :return: 11 | """ 12 | nf, slen = x.shape 13 | nx = (nf - 1) * inc + slen 14 | frameout = np.zeros(nx) 15 | x = x / win 16 | for i in range(nf): 17 | start = i * inc 18 | frameout[start:start + slen] += x[i, :] 19 | return frameout 20 | 21 | 22 | def Filpframe_OverlapS(x, win, inc): 23 | """ 24 | 基于重叠存储法的信号还原函数 25 | :param x: 分帧数据 26 | :param win: 窗 27 | :param inc: 帧移 28 | :return: 29 | """ 30 | nf, slen = x.shape 31 | nx = (nf - 1) * inc + slen 32 | frameout = np.zeros(nx) 33 | x = x / win 34 | for i in range(nf): 35 | frameout[slen + (i - 1) * inc:slen + i * inc] += x[i, slen - inc:] 36 | return frameout 37 | 38 | 39 | def Filpframe_LinearA(x, win, inc): 40 | """ 41 | 基于比例重叠相加法的信号还原函数 42 | :param x: 分帧数据 43 | :param win: 窗 44 | 
:param inc: 帧移 45 | :return: 46 | """ 47 | nf, slen = x.shape 48 | nx = (nf - 1) * inc + slen 49 | frameout = np.zeros(nx) 50 | overlap = len(win) - inc 51 | x = x / win 52 | w1 = [i / overlap for i in range(overlap)] 53 | w2 = [i / overlap for i in range(overlap - 1, -1, -1)] 54 | for i in range(nf): 55 | if i == 0: 56 | frameout[:slen] = x[i, :] 57 | else: 58 | M = slen + (i - 1) * inc 59 | y = frameout[M - overlap:M] * w2 + x[i, :overlap] * w1 60 | xn = x[i, overlap:] 61 | yy = np.hstack((y, xn)) 62 | frameout[M - overlap:M - overlap + slen] += yy 63 | return frameout 64 | -------------------------------------------------------------------------------- /chapter7_语音合成/myfilter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import medfilt 3 | 4 | 5 | def linsmoothm(x, n=3): 6 | win = np.hamming(n) 7 | win /= np.sum(win) 8 | l = len(x) 9 | y = np.zeros(l) 10 | if np.mod(n, 2) == 0: 11 | ll = n // 2 12 | xx = np.hstack((x[0], x, x[-1] * np.ones(ll))) 13 | else: 14 | ll = (n - 1) // 2 15 | xx = np.hstack((x[0], x, x[-1] * np.ones(ll + 1))) 16 | for i in range(l): 17 | y[i] = np.matmul(win, xx[i:i + n]) 18 | return y 19 | 20 | 21 | def pitfilterm1(x, vseg, vsl): 22 | y = np.zeros(len(x)) 23 | for i in vseg.keys(): 24 | ixb = vseg[i]['start'] 25 | ixe = vseg[i]['end'] 26 | u = x[ixb:ixe] 27 | u = medfilt(u, 5) 28 | v0 = linsmoothm(u, 5) 29 | y[ixb:ixe] = v0 30 | return y 31 | -------------------------------------------------------------------------------- /chapter7_语音合成/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter7_语音合成/test.py -------------------------------------------------------------------------------- /chapter8_隐藏试验/C8_1_y.DAT: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter8_隐藏试验/C8_1_y.DAT -------------------------------------------------------------------------------- /chapter8_隐藏试验/C8_1_y.py: -------------------------------------------------------------------------------- 1 | from scipy.io import loadmat 2 | from chapter2_基础.soundBase import * 3 | 4 | plt.rcParams['font.sans-serif'] = ['SimHei'] 5 | plt.rcParams['axes.unicode_minus'] = False 6 | 7 | 8 | def hide_message(x, meg, nBits=1): 9 | if nBits != 1: 10 | exit('Only nBits=1 support now......') 11 | xx = np.zeros(len(x)) 12 | xx[:] = x[:] 13 | l = len(meg) 14 | pads = np.mod(l, nBits) 15 | if pads: 16 | l += nBits - pads 17 | meg_l = np.zeros(l) 18 | meg_l[:l] = meg 19 | meg = meg_l 20 | m_len = l // nBits 21 | meg_n = meg.reshape(m_len, nBits) 22 | for i in range(nBits): 23 | for j in range(m_len): 24 | if meg_n[j, i]: 25 | xx[j] = x[j] // 2 * 2 26 | else: 27 | xx[j] = x[j] // 2 * 2 + 1 28 | return xx, m_len 29 | 30 | 31 | def extract_message(x, m_len, nBits=1): 32 | if nBits != 1: 33 | exit('Only nBits=1 support now......') 34 | meg = np.zeros((m_len, nBits)) 35 | for i in range(nBits): 36 | for j in range(m_len): 37 | meg[j, i] = x[j] % 2 38 | return meg 39 | 40 | 41 | data, fs, bits = soundBase('C8_1_y.wav').audioread(return_nbits=True) 42 | data16 = (data + 1) * np.power(2, bits - 1) 43 | nBits = 1 44 | s = loadmat('C8_1_y.DAT') 45 | 46 | x_embed, m_len = hide_message(data16, s['message'][0], 1) 47 | meg_rec = extract_message(x_embed, m_len, 1) 48 | 
49 | plt.figure(figsize=(14, 12)) 50 | plt.subplot(3, 1, 1) 51 | plt.plot(data16) 52 | plt.subplot(3, 1, 2) 53 | plt.plot(x_embed) 54 | plt.subplot(3, 1, 3) 55 | plt.plot(data16 - x_embed) 56 | 57 | plt.show() 58 | 59 | plt.subplot(2, 1, 1) 60 | plt.imshow(s['message'][0].reshape(s['n_mess'][0][0], s['m_mess'][0][0]).T) 61 | plt.subplot(2, 1, 2) 62 | plt.imshow(meg_rec.reshape(s['n_mess'][0][0], s['m_mess'][0][0]).T) 63 | plt.show() 64 | -------------------------------------------------------------------------------- /chapter8_隐藏试验/C8_1_y.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/busyyang/python_sound_open/39dcb99a1085512120f1732ec4f75eb78e833213/chapter8_隐藏试验/C8_1_y.wav --------------------------------------------------------------------------------
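C8_1_y.py 的 hide_message/extract_message 目前只支持 nBits=1,而且两者极性相反:嵌入时比特 1 被写成偶数(LSB=0)、比特 0 写成奇数,提取却直接读 x%2,所以提出的消息相对原消息是按位取反的,对比两幅 imshow 时需要留意。下面给出一个统一按"低 nBits 位直接存放比特"约定的多比特 LSB 草稿(非原实现,函数名仅为示意):

~~~py
import numpy as np


def hide_message_nbits(x, meg, nBits=2):
    """每个样点的低nBits位依次写入nBits个消息比特(假设x为整型化样本)"""
    xx = np.array(x, dtype=np.int64)
    meg = np.asarray(meg, dtype=np.int64)
    pads = (-len(meg)) % nBits
    meg = np.concatenate((meg, np.zeros(pads, dtype=np.int64)))
    m_len = len(meg) // nBits
    for j in range(m_len):
        v = 0
        for b in meg[j * nBits:(j + 1) * nBits]:
            v = (v << 1) | int(b)  # 比特打包成整数
        xx[j] = (xx[j] >> nBits << nBits) | v  # 清低位后写入
    return xx, m_len


def extract_message_nbits(x, m_len, nBits=2):
    meg = np.zeros((m_len, nBits))
    for j in range(m_len):
        v = int(x[j]) & ((1 << nBits) - 1)
        for i in range(nBits - 1, -1, -1):  # 按嵌入顺序还原比特
            meg[j, i] = v & 1
            v >>= 1
    return meg
~~~

嵌入容量随 nBits 线性增加,但低位改动幅度从 ±1 变成最多 ±(2^nBits−1),听感失真随之增大,nBits 一般不宜超过 2~3。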