├── Pseudo-whisper.png ├── list_example ├── data_gen.py ├── rq2_gen.py ├── README.md └── utils.py /Pseudo-whisper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chaufanglin/Normal2Whisper/HEAD/Pseudo-whisper.png -------------------------------------------------------------------------------- /list_example: -------------------------------------------------------------------------------- 1 | s116u180n /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u180n.WAV 2 | s116u195n /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u195n.WAV 3 | s116u198n /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u198n.WAV 4 | s116u227n /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u227n.WAV 5 | s116u245n /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u245n.WAV -------------------------------------------------------------------------------- /data_gen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import argparse 4 | import soundfile as sf 5 | import librosa 6 | from utils import pseudo_whisper_gen 7 | 8 | def generate(data_list, output_dir): 9 | 10 | if not os.path.exists(output_dir): 11 | os.makedirs(output_dir) 12 | 13 | with open(data_list, 'r') as f1: 14 | content = f1.readlines() 15 | content = [x.strip() for x in content] 16 | for line in content: 17 | name, filepath = line.split(' ',1) # check one/two space 18 | # filepath, _ = filepath.split(' |', 1) 19 | 20 | # spkpath, _ = filepath.split('normal/', 1)[1].split('/s', 1) 21 | # spkpath = os.path.join(output_dir, spkpath) 22 | filename_split = os.path.basename(filepath).split('.', 1)[0].split('-') 23 | spkpath = os.path.join(output_dir, filename_split[0], filename_split[1]) 24 | 25 | # name 
def generate(data_list, output_dir, generating_mode):
    """
    Convert every utterance listed in `data_list` and save the result.

    Each non-empty line of the list has the form "<name> <filepath>".
    The audio is loaded and resampled to 16 kHz, processed according to
    `generating_mode` and written to "<output_dir>/<sub-path>/<name>.wav",
    where <sub-path> is the part of `filepath` between 'normal/' and the
    first following '/s'. Utterances whose output file already exists are
    skipped, so an interrupted run can be resumed.

    Parameters:
    ----------
    data_list: Path of the text file listing the utterances
    output_dir: Root directory for the generated audio
    generating_mode: '1' selects glottal_remove_gen; any other value
                     selects bandwidth_widen_gen

    Raises:
    ------
    ValueError: a list line has no file path, or the path does not follow
                the '.../normal/<sub-path>/s<file>' layout
    """
    os.makedirs(output_dir, exist_ok=True)

    # The file is closed by the `with` block; no explicit close() is needed.
    with open(data_list, 'r') as f1:
        content = [x.strip() for x in f1]

    for line in content:
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue

        # Split on any run of whitespace so one or two separating spaces both work
        fields = line.split(None, 1)
        if len(fields) != 2:
            raise ValueError("Malformed list line (expected '<name> <path>'): %r" % line)
        name, filepath = fields

        # Mirror the corpus sub-directory (e.g. 'US/116') below output_dir
        _, marker, remainder = filepath.partition('normal/')
        spkpath, sep, _ = remainder.partition('/s')
        if not marker or not sep:
            raise ValueError("Cannot derive the speaker directory from path: %r" % filepath)
        spkpath = os.path.join(output_dir, spkpath)

        out_path = os.path.join(spkpath, name) + '.wav'
        if os.path.exists(out_path):
            continue

        if not os.path.exists(spkpath):
            os.makedirs(spkpath, exist_ok=True)
            print("make dir: %s" % spkpath)

        s_n, fs = librosa.load(filepath, sr=16000, dtype=np.float64)  # resample to 16k
        if generating_mode == '1':
            s_pw = glottal_remove_gen(s_n, fs)
        else:
            s_pw = bandwidth_widen_gen(s_n, fs)

        # Persist the converted utterance; without this the run produces no output
        sf.write(out_path, s_pw, fs)
Recognition Performance using Pseudo-whispered based Data Augmentation ([pdf](https://arxiv.org/pdf/2311.05179.pdf); to appear in ASRU 2023). 3 | 4 | 5 | 6 | ## Dependencies 7 | * Python 3.9 8 | * Numpy 9 | * soundfile 10 | * librosa 11 | * [PyWorld](https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder) 12 | 13 | ## Functions 14 | 1. `utils.py` 15 | 16 | This script has all the essential functions used in our proposed method. 17 | 18 | > **Note:** In our work, the speech files are sampled at or re-sampled to 16 kHz, so the parameters of GFM-IAIF-GC are based on this sample rate. 19 | 20 | 2. `data_gen.py` 21 | 22 | This script is used to convert normal speech into pseudo-whispered speech. 23 | 24 | 3. `rq2_gen.py` 25 | 26 | This script is used to convert normal speech into: 27 | 1) normal speech without glottal contributions; 28 | 2) normal speech with widened formant bandwidth and shifted formant frequencies. 29 | 30 | ## Usage 31 | **1. Convert normal speech into pseudo-whispered speech from your dataset:** 32 | 33 | ```Bash 34 | python data_gen.py --data_list './list_example(PATH TO THE LIST OF SOURCE TRAINING DATA)' \ 35 | --output_dir './data/training/wTIMIT/PW(PATH TO OUTPUT PW DIRECTORY)' 36 | ``` 37 | 38 | **2. Convert normal speech into 1) normal speech without glottal contributions:** 39 | 40 | ```Bash 41 | python rq2_gen.py --data_list './list_example(PATH TO THE LIST OF SOURCE TRAINING DATA)' \ 42 | --output_dir './data/training/wTIMIT/s1(PATH TO OUTPUT DIRECTORY)' \ 43 | --generating_mode '1' 44 | ``` 45 | 46 | **3. Convert normal speech into 2) normal speech with widened formant bandwidth and shifted formant frequencies:** 47 | 48 | ```Bash 49 | python rq2_gen.py --data_list './list_example(PATH TO THE LIST OF SOURCE TRAINING DATA)' \ 50 | --output_dir './data/training/wTIMIT/s2(PATH TO OUTPUT DIRECTORY)' \ 51 | --generating_mode '2' 52 | ``` 53 | 54 | >**Note:** you can check `./list_example` to see an example of the input data list. 
def wav2world(x, fs, fft_size=None):
    """Convenience function to do all WORLD analysis steps in a single call.
    F0 is estimated with Harvest (`pw.harvest`); the spectral envelope and
    aperiodicity are then computed from that F0 track with CheapTrick
    (`pw.cheaptrick`) and D4C (`pw.d4c`). Only `fft_size` can be configured
    here; no other analysis option is passed to the library calls.
    Parameters
    ----------
    x : ndarray
        Input waveform signal.
    fs : int
        Sample rate of input signal in Hz.
    fft_size : int
        Length of Fast Fourier Transform (in number of samples)
        The resulting dimension of `ap` and `sp` will be `fft_size` // 2 + 1
    Returns
    -------
    f0 : ndarray
        F0 contour.
    sp : ndarray
        Spectral envelope.
    ap : ndarray
        Aperiodicity.
    t : ndarray
        Temporal position of each frame.
    """
    # Harvest returns both the F0 contour and the frame time stamps that the
    # two following analysis steps are aligned to.
    f0, t = pw.harvest(x, fs)
    sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size)
    ap = pw.d4c(x, f0, t, fs, fft_size=fft_size)
    return f0, sp, ap, t
Hanning) 64 | 65 | Returns: 66 | ------- 67 | av: LP coefficients of vocal tract contribution 68 | ag: LP coefficients of glottis contribution 69 | al: LP coefficients of lip radiation contribution 70 | """ 71 | 72 | # ----- Set default parameters ------------------------------------------- 73 | if win is None: 74 | # Window for LPC estimation 75 | win = np.hanning(len(s_gvl)) 76 | 77 | # ----- Addition of pre-frame -------------------------------------------- 78 | # For the successive removals of the estimated LPC envelopes, a 79 | # mean-normalized pre-frame ramp is added at the beginning of the frame 80 | # in order to diminish ripple. The ramp is removed after each filtering. 81 | Lpf = nv + 1 # Pre-frame length 82 | x_gvl = np.concatenate([np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl]) # Prepend 83 | idx_pf = np.arange(Lpf, len(x_gvl)) # Indexes that exclude the pre-frame 84 | 85 | # ----- Cancel lip radiation contribution -------------------------------- 86 | # Define lip radiation filter 87 | al = [1, -d] 88 | 89 | # Integration of signal using filter 1/[1 -d z^(-1)] 90 | # - Input signal (for LPC estimation) 91 | s_gv = lfilter([1], al, s_gvl) 92 | # - Pre-framed input signal (for LPC envelope removal) 93 | x_gv = lfilter([1], al, x_gvl) 94 | 95 | # ----- Gross glottis estimation ----------------------------------------- 96 | # Iterative estimation of glottis with ng first order filters 97 | ag1 = lpc(s_gv*win, order=1) # First 1st order LPC estimation 98 | 99 | for i in range(ng-2): 100 | # Cancel current estimate of glottis contribution from speech signal 101 | x_v1x = lfilter(ag1,1,x_gv) # Inverse filtering 102 | s_v1x = x_v1x[idx_pf] # Remove pre-ramp 103 | 104 | # Next 1st order LPC estimation 105 | ag1x = lpc(s_v1x*win, order=1) # 1st order LPC 106 | 107 | # Update gross estimate of glottis contribution 108 | ag1 = np.convolve(ag1,ag1x) # Combine 1st order estimation with previous 109 | 110 | 111 | # ----- Gross vocal tract estimation 
def gfm_iaif_glottal_remove(s_gvl, nv=48, ng=3, d=0.99, win=None):
    """
    Glottal removal function based on GFM-IAIF.

    Runs the GFM-IAIF estimation chain on one frame and returns the frame
    after inverse filtering with the fine glottis estimate, instead of
    returning the LP coefficients.

    Note:
        Function originally coded by Olivier Perrotin (https://github.com/operrotin/GFM-IAIF).
        This code is translated to Python and adapted by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        s_gvl: Speech signal frame
        nv: Order of LP analysis for vocal tract (def. 48)
        ng: Order of LP analysis for glottal source (def. 3)
        d: Leaky integration coefficient (def. 0.99)
        win: Window used before LPC (def. Hanning)

    Returns:
    -------
        s_v: Speech signal with glottis contribution cancelled
             (same length as `s_gvl`; the pre-frame ramp is stripped)
    """

    # ----- Set default parameters -------------------------------------------
    if win is None:
        # Window for LPC estimation
        win = np.hanning(len(s_gvl))

    # ----- Addition of pre-frame --------------------------------------------
    # For the successive removals of the estimated LPC envelopes, a
    # mean-normalized pre-frame ramp is added at the beginning of the frame
    # in order to diminish ripple. The ramp is removed after each filtering.
    Lpf = nv + 1                    # Pre-frame length
    x_gvl = np.concatenate([np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl]) # Prepend
    idx_pf = np.arange(Lpf, len(x_gvl)) # Indexes that exclude the pre-frame

    # ----- Cancel lip radiation contribution --------------------------------
    # Define lip radiation filter
    al = [1, -d]

    # Integration of signal using filter 1/[1 -d z^(-1)]
    # - Input signal (for LPC estimation)
    s_gv = lfilter([1], al, s_gvl)
    # - Pre-framed input signal (for LPC envelope removal)
    x_gv = lfilter([1], al, x_gvl)

    # ----- Gross glottis estimation -----------------------------------------
    # Iterative estimation of glottis with ng first order filters
    ag1 = lpc(s_gv*win, order=1)    # First 1st order LPC estimation

    # NOTE(review): this loop runs ng-2 times, so together with the initial
    # estimate above `ag1` combines ng-1 first-order filters (two for the
    # default ng=3), not ng as the comment above says. Confirm against the
    # reference GFM-IAIF implementation before changing the count, because
    # it alters the numerical output.
    for i in range(ng-2):
        # Cancel current estimate of glottis contribution from speech signal
        x_v1x = lfilter(ag1,1,x_gv) # Inverse filtering
        s_v1x = x_v1x[idx_pf]       # Remove pre-ramp

        # Next 1st order LPC estimation
        ag1x = lpc(s_v1x*win, order=1)  # 1st order LPC

        # Update gross estimate of glottis contribution
        ag1 = np.convolve(ag1,ag1x) # Combine 1st order estimation with previous


    # ----- Gross vocal tract estimation -------------------------------------
    # Cancel gross estimate of glottis contribution from speech signal
    x_v1 = lfilter(ag1,1,x_gv)      # Inverse filtering
    s_v1 = x_v1[idx_pf]             # Remove pre-ramp

    # Gross estimate of the vocal tract filter
    av1 = lpc(s_v1*win, order=nv)   # nv order LPC estimation

    # ----- Fine glottis estimation ------------------------------------------
    # Cancel gross estimate of vocal tract contribution from speech signal
    x_g1 = lfilter(av1,1,x_gv)      # Inverse filtering
    s_g1 = x_g1[idx_pf]             # Remove pre-ramp

    # Fine estimate of the glottis filter
    ag = lpc(s_g1*win, order=ng)    # ng order LPC estimation

    # ----- Fine vocal tract estimation --------------------------------------
    # Cancel fine estimate of glottis contribution from speech signal
    x_v = lfilter(ag,1,x_gv)        # Inverse filtering
    s_v = x_v[idx_pf]               # Remove pre-ramp

    # Unlike gfm_iaif, no final vocal-tract LPC is computed here: the
    # glottis-cancelled frame itself is the result.
    return s_v
def glottal_remove_gen(s_n, fs, Lv=16):
    """
    Speech without glottal contribution generating function, using GFM-IAIF.

    The waveform is cut into 30 ms Hamming-windowed frames with 50 % overlap,
    the glottal contribution is removed from each frame with
    gfm_iaif_glottal_remove, and the frames are overlap-added. The result is
    analysed with WORLD and re-synthesised with (near-)zero F0 and
    (near-)unit aperiodicity.

    Note:
        This code is written by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        s_n: Normal speech waveform (1-D array)
        fs: Sample rate
        Lv: order of LP analysis for vocal tract (default: 16)

    Returns:
    -------
        y_no_glottal: Speech waveform without glottal contribution
    """

    EPSILON = 1e-8

    # Overlap-add (OLA) method
    win_length = int(30*fs/1000)    # 30ms * fs / 1000
    nhop = round(win_length / 2)
    window = np.hamming(win_length)
    nframes = int(np.ceil(s_n.size / nhop))

    s_gfm = np.zeros(s_n.shape)     # allocate output speech without glottal source

    for n in range(nframes):
        startPoint = n * nhop       # starting point of windowing
        if startPoint + win_length > s_n.size:
            # Incomplete frame: fill everything after the last complete
            # frame with EPSILON. All later frames are incomplete too and
            # would only re-write part of this tail, so stop here.
            s_gfm[startPoint - nhop + win_length:] = EPSILON
            break

        sn_frame = s_n[startPoint: startPoint + win_length] * window

        # Lv is passed as the vocal-tract LP order (nv)
        s_gfm_frame = gfm_iaif_glottal_remove(sn_frame, Lv)

        s_gfm[startPoint: startPoint + win_length] += s_gfm_frame

    # Extract GFM
    f0_gfm, sp_gfm, ap_gfm, _ = wav2world(s_gfm, fs)

    # Zero F0 and unit Ap
    f0_zero = np.zeros(f0_gfm.shape) + EPSILON
    ap_unit = np.ones(ap_gfm.shape) - EPSILON

    y_no_glottal = pw.synthesize(f0_zero, sp_gfm, ap_unit, fs, pw.default_frame_period)

    return y_no_glottal
def lpcfit(x, p=12, h=128, w=None, ov=1):
    """
    Fit LPC to short-time segments.

    The signal is pre-emphasised with [1, -0.9], cut into Hanning-windowed
    frames of `w` samples every `h` samples, and an order-`p` all-pole model
    is fitted to each frame with the autocorrelation method.

    Note:
        Function originally coded by Dan Ellis (http://labrosa.org/~dpwe/resources/matlab/polewarp/lpcfit.m).
        This code is translated to Python and adapted by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        x: a stretch of signal (1-D, or 2-D with a single row/column)
        p: LPC order (default: 12)
        h: hopping size (default: 128)
        w: window size (default: 2*h)
        ov: overlap-add parameter (default: 1)

    Returns:
    -------
        a: successive all-pole coefficients in rows, shape (nhops, p+1)
        g: per-frame gains, shape (nhops,)
        e: residual excitation (unit-gain per frame)

    Raises:
    ------
        ValueError: `x` is not a vector, or `p` is not in the range 1..w-1
    """
    if w is None:
        w = 2*h

    x = np.asarray(x, dtype=float)
    if x.ndim > 1:
        if x.size != max(x.shape):
            raise ValueError("lpcfit expects a single-channel signal")
        x = x.ravel()
    if not 0 < p < w:
        raise ValueError("LPC order p must satisfy 0 < p < w")

    npts = x.size
    nhops = npts // h

    # Pad x with zeros so that we can extract complete w-length windows
    # from it (the trailing pad is generous; extra zeros are never read)
    lead = (w - h) // 2
    x = np.concatenate((np.zeros(lead), x, np.zeros(w - lead)))

    a = np.zeros((nhops, p+1))
    g = np.zeros(nhops)
    if ov == 0:
        e = np.zeros(npts)
    else:
        e = np.zeros((nhops-1)*h+w)

    # Pre-emphasis
    pre = np.array([1, -0.9])
    x = lfilter(pre, 1, x)

    win = np.hanning(w)
    # |i - j| index table: R[i, j] = rxx[|i - j|] is the Toeplitz matrix
    lags = np.abs(np.subtract.outer(np.arange(p), np.arange(p)))

    for hop in range(nhops):
        # Extract segment of signal
        xx = x[hop*h: hop*h + w]
        # Apply hanning window
        wxx = xx * win
        # Autocorrelation at lags 0..p only (lag 0 first)
        rxx = np.array([np.dot(wxx[:w - k], wxx[k:]) for k in range(p + 1)])

        if rxx[0] > 0:
            # Setup and solve the normal equations R an = r[1..p]
            R = rxx[lags]
            try:
                an = np.linalg.solve(R, rxx[1:])
            except np.linalg.LinAlgError:
                # Degenerate frame: fall back to the least-squares solution
                an = np.linalg.lstsq(R, rxx[1:], rcond=None)[0]
        else:
            # Silent frame: nothing to model, use the identity filter
            an = np.zeros(p)

        # Calculate residual by filtering (windowed) xx
        aa = np.concatenate(([1.0], -an))
        if ov == 0:
            rs = lfilter(aa, 1, xx[lead: lead + h])
        else:
            rs = lfilter(aa, 1, wxx)
        G = np.sqrt(np.mean(rs**2))

        # Save filter, gain and residual
        a[hop, :] = aa
        g[hop] = G
        # A zero-gain frame contributes a zero residual instead of NaNs
        unit = rs / G if G > 0 else np.zeros_like(rs)
        if ov == 0:
            e[hop*h: (hop+1)*h] = unit
        else:
            e[hop*h: hop*h + w] += unit

    # Throw away first (win-hop)/2 pts if in overlap mode
    # for proper synchronization of resynth
    if ov != 0:
        e = e[lead:]

    return a, g, e
def lpcsynth(a, g, e=None, h=128, ov=1):
    """
    Resynthesize from LPC representation.

    Each row of a is an LPC fit to a h-point (non-overlapping)
    frame of data. g gives the overall gains for each frame and
    e is an excitation signal (if e is None or empty, white noise is used;
    if e is a scalar, a pulse train is used with that period).
    ov nonzero selects overlap-add of reconstructed
    windows, else e is assumed to consist of independent hop-sized
    segments that will line up correctly without cross-fading
    (matching the ov option to lpcfit; default is ov = 1).

    Parameters:
    ----------
        a: all-pole coefficients, one row per frame (as returned by lpcfit)
        g: per-frame gains
        e: excitation: None/empty, a scalar pulse period, or a 1-D signal
        h: hopping size (default: 128)
        ov: overlap-add parameter (default: 1)

    Returns:
    -------
        d: the resulting LPC resynthesis, nhops*h samples long

    Raises:
    ------
        ValueError: a scalar pulse period smaller than 1 was given
    """
    a = np.atleast_2d(np.asarray(a, dtype=float))
    g = np.asarray(g, dtype=float).ravel()

    w = 2 * h
    nhops = a.shape[0]
    npts = nhops * h
    # Overlap mode needs an extra half window after the last hop
    nepts = npts + ((w - h) if ov else 0)

    if e is None or np.size(e) == 0:
        # No excitation supplied: white noise
        e = randn(nepts)
    elif np.ndim(e) == 0:
        # Scalar: pulse train with that period
        pd = int(e)
        if pd < 1:
            raise ValueError("pulse period must be a positive integer")
        e = np.zeros(nepts)
        e[::pd] = np.sqrt(pd)
    else:
        e = np.asarray(e, dtype=float).ravel()

    # Make sure we don't run out of e for the last frame(s)
    if e.size < nepts:
        e = np.concatenate((e, np.zeros(nepts - e.size)))

    d = np.zeros(nepts)
    win = hann(w)

    for hop in range(nhops):
        hbase = hop * h     # 0-based start of this frame
        aa = a[hop]
        G = g[hop]
        # Synthesis is the all-pole filter 1/A(z), the inverse of the
        # analysis filter A(z) used to compute the residual in lpcfit
        if ov == 0:
            d[hbase: hbase + h] = G * lfilter([1.0], aa, e[hbase: hbase + h])
        else:
            d[hbase: hbase + w] += win * (G * lfilter([1.0], aa, e[hbase: hbase + w]))

    # De-emphasis (must match pre-emphasis in lpcfit): 1 / (1 - 0.9 z^-1)
    pre = [1, -0.9]
    d = lfilter([1.0], pre, d)

    # Drop the overlap tail that extends past the last hop
    return d[:npts]