├── README.md ├── audio.py ├── hparams.py ├── main.py ├── test.wav └── wavs ├── error.wav ├── orig.wav └── pred.wav /README.md: -------------------------------------------------------------------------------- 1 | # LPC_for_TTS 2 | Linear Prediction Coefficients estimation from mel-spectrogram implemented in Python based on Levinson-Durbin algorithm. 3 | 4 | 基于Levinson-Durbin归纳法来做线性预测系数的估计。此代码可用于LPC系数的估计,也可用于LPCNet等合成器的特征提取。流程是从音频得到梅尔谱,梅尔谱得到LPC。 5 | 6 | ```Python 7 | from audio import * 8 | import numpy as np 9 | from hparams import Hparams as hparams 10 | 11 | input_wav_file = 'test.wav' 12 | sample_rate = 24000 13 | lpc_order = 8 14 | 15 | orig_audio, pred_audio, residual, lpcs = lpc_audio(input_wav_file, lpc_order, hparams) 16 | 17 | save_wav(pred_audio, 'wavs/pred.wav', hparams) 18 | save_wav(orig_audio, 'wavs/orig.wav', hparams) 19 | save_wav(residual, 'wavs/error.wav', hparams) 20 | ``` 21 | 22 | Raw audio: 23 | ![image](https://user-images.githubusercontent.com/11649939/111761869-562df580-88db-11eb-933f-be4c07712d25.png) 24 | 25 | Predicted audio: 26 | ![image](https://user-images.githubusercontent.com/11649939/111761943-67770200-88db-11eb-957d-73197d9e4e46.png) 27 | 28 | Prediction error: 29 | ![image](https://user-images.githubusercontent.com/11649939/111762018-7b226880-88db-11eb-9efb-43c32ff942ae.png) 30 | -------------------------------------------------------------------------------- /audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | import librosa 4 | import librosa.filters 5 | from scipy.io import wavfile 6 | from scipy.fftpack import ifft 7 | 8 | 9 | def load_wav(filefilepath, hparams): 10 | """ 这里推荐用wavfile并手动scale的方式,而librosa.core.load会产生较大误差 """ 11 | 12 | _, raw_data = wavfile.read(filefilepath) 13 | raw_data = raw_data.astype(np.float32) 14 | float_data = (raw_data + 32768) / 65535. * 2 - 1 15 | 16 | return float_data 17 | 18 | 19 | def save_wav(float_data, filepath, hparams): 20 | 21 | data = (float_data + 1) / 2 * 65535. - 32768 22 | wavfile.write(filepath, hparams.sample_rate, data.astype(np.int16)) 23 | 24 | def melspectrogram(wav, hparams): 25 | 26 | D = _stft(wav, hparams) 27 | 28 | S = _linear_to_mel(np.abs(D), hparams) 29 | 30 | S = normalize_spec(S) 31 | 32 | return S.astype(np.float32) 33 | 34 | def normalize_spec(spectrogram): 35 | return np.log(1. + 10000 * spectrogram) 36 | 37 | def denormalize_spec(spectrogram): 38 | return (np.exp(spectrogram) - 1.) / 10000 39 | 40 | def denormed_melsp_to_linearsp(denormed_mel, hparams): 41 | """ 梅尔谱转化成线性谱 """ 42 | denormed_mel = denormalize_spec(denormed_mel) 43 | 44 | return _mel_to_linear(denormed_mel, hparams) 45 | 46 | def linearsp_to_autocorr(linearsp): 47 | power = linearsp**2 48 | fft_power = np.concatenate([power, power[::-1, :][1:-1, :]], axis=0) 49 | return ifft(fft_power, n=fft_power.shape[0], axis=0).real 50 | 51 | def autocorr_to_lpc(ac, hparams, lpc_order): 52 | sample_rate = hparams.sample_rate 53 | ac = ac[0:lpc_order + 1, :] 54 | theta = (2 * np.pi * 40 / sample_rate)**2 55 | 56 | # 对自相关系数做平滑化处理 57 | lag_window = np.exp([[-0.5 * theta * i**2] for i in range(lpc_order + 1)]) 58 | ac = ac * lag_window 59 | 60 | return levinson_durbin(lpc_order, ac) 61 | 62 | def levinson_durbin(lpc_order, auto_corr): 63 | """ lpc_order: 阶数, auto_corr: 自相关系数 """ 64 | 65 | num_frames = auto_corr.shape[-1] 66 | 67 | # 假设a_0=1,因此这里会多出一个元素 68 | Ak = np.zeros((lpc_order+1, num_frames), dtype=np.float32) 69 | Ak[0, :] = 1.0 70 | 71 | # 根据递推公式,可以反推出必有E0=R0 72 | E0 = np.copy(auto_corr[0, :]) 73 | Ek = E0 74 | 75 | for k in range(lpc_order): 76 | lamb = 0. 77 | for j in range(k+1): 78 | lamb -= Ak[j, :] * auto_corr[k+1-j, :] 79 | 80 | lamb /= np.maximum(1e-6, Ek) 81 | 82 | # 根据a[n] = a[n] +lambda*a[k+1-n] 83 | # 每次赋值两个元素 84 | for n in range((k+1)//2+1): 85 | temp = Ak[k+1-n, :] + lamb * Ak[n, :] 86 | Ak[n, :] = Ak[n, :]+lamb*Ak[k+1-n, :] 87 | Ak[k+1-n, :] = temp 88 | 89 | Ek = Ek * (1-lamb**2) 90 | 91 | # 返回值不包含第一行的0 92 | return Ak[1:, :] 93 | 94 | 95 | def lpc_predict(lpcs, signal_slice, clip_lpc=True): 96 | 97 | # 自回归线性组合 98 | pred = np.sum(lpcs * signal_slice, axis=0) 99 | 100 | if clip_lpc: 101 | pred = np.clip(pred, -1., 1.) 102 | 103 | return pred 104 | 105 | 106 | def lpc_reconstruction(lpcs, lpc_order, audio): 107 | """ 从LPC中去恢复音频,并计算误差 """ 108 | 109 | num_points = lpcs.shape[-1] 110 | 111 | if audio.shape[0] == num_points: 112 | # 起始点以0作为填充 113 | audio = np.pad(audio, ((lpc_order, 0)), 'constant') 114 | 115 | elif audio.shape[0] != num_points + lpc_order: 116 | raise RuntimeError('dimensions of lpcs and audio must match') 117 | 118 | indices = np.reshape(np.arange(lpc_order), [-1, 1]) + np.arange( 119 | lpcs.shape[-1]) 120 | 121 | signal_slices = audio[indices] 122 | pred = lpc_predict(lpcs, signal_slices) 123 | origin_audio = audio[lpc_order:] 124 | 125 | error = origin_audio - pred 126 | 127 | return origin_audio, pred, error 128 | 129 | def lpc_audio(input_wav_file, lpc_order, hparams): 130 | wav = load_wav(input_wav_file, hparams) 131 | 132 | # 根据自相关系数与功率谱之间的关系 133 | mel = melspectrogram(wav, hparams) 134 | linear = denormed_melsp_to_linearsp(mel, hparams) 135 | autocorr = linearsp_to_autocorr(linear) 136 | 137 | # 根据LD归纳算法计算LPC 138 | lpcs = autocorr_to_lpc(autocorr, hparams, lpc_order) 139 | 140 | # 根据自回归性质以及线性组合公式 141 | lpcs = -1 * lpcs[::-1, :] 142 | 143 | # 对每一帧都进行同样的自回归计算 144 | lpcs = np.repeat(lpcs, 240, axis=-1) 145 | lpcs = lpcs[:, :wav.shape[-1]] 146 | 147 | orig_audio, pred, error = lpc_reconstruction(lpcs, lpc_order, wav) 148 | 149 | return orig_audio, pred, error, lpcs 150 | 151 | def _stft(y, hparams): 152 | 153 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=hparams.hop_size, win_length=hparams.win_size) 154 | 155 | def _linear_to_mel(spectogram, hparams): 156 | 157 | mel_basis = _build_mel_basis(hparams) 158 | 159 | return np.dot(mel_basis, spectogram) 160 | 161 | def _mel_to_linear(mel_spectrogram, hparams): 162 | 163 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 164 | 165 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 166 | 167 | def _build_mel_basis(hparams): 168 | 169 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 170 | fmin=20, fmax=hparams.sample_rate/2) 171 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Hparams(object): 4 | 5 | num_mels = 80 6 | n_fft = 2048 7 | hop_size = 240 8 | win_size = None 9 | sample_rate = 24000 10 | 11 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from audio import * 4 | import numpy as np 5 | from hparams import Hparams as hparams 6 | 7 | input_wav_file = 'test.wav' 8 | sample_rate = 24000 9 | lpc_order = 8 10 | 11 | orig_audio, pred_audio, residual, lpcs = lpc_audio(input_wav_file, lpc_order, hparams) 12 | 13 | save_wav(pred_audio, 'wavs/pred.wav', hparams) 14 | save_wav(orig_audio, 'wavs/orig.wav', hparams) 15 | save_wav(residual, 'wavs/error.wav', hparams) 16 | -------------------------------------------------------------------------------- /test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzw922cn/LPC_for_TTS/1b26cc8cf89f88a353c70a5ea0e2603b8470dd43/test.wav -------------------------------------------------------------------------------- /wavs/error.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzw922cn/LPC_for_TTS/1b26cc8cf89f88a353c70a5ea0e2603b8470dd43/wavs/error.wav -------------------------------------------------------------------------------- /wavs/orig.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzw922cn/LPC_for_TTS/1b26cc8cf89f88a353c70a5ea0e2603b8470dd43/wavs/orig.wav -------------------------------------------------------------------------------- /wavs/pred.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzw922cn/LPC_for_TTS/1b26cc8cf89f88a353c70a5ea0e2603b8470dd43/wavs/pred.wav --------------------------------------------------------------------------------