├── Pseudo-whisper.png
├── list_example
├── data_gen.py
├── rq2_gen.py
├── README.md
└── utils.py


/Pseudo-whisper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chaufanglin/Normal2Whisper/HEAD/Pseudo-whisper.png


--------------------------------------------------------------------------------
/list_example:
--------------------------------------------------------------------------------
1 | s116u180n  /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u180n.WAV 
2 | s116u195n  /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u195n.WAV 
3 | s116u198n  /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u198n.WAV 
4 | s116u227n  /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u227n.WAV 
5 | s116u245n  /tudelft.net/staff-bulk/ewi/insy/SpeechLab/corpora/wTIMIT/nist/TRAIN/normal/US/116/s116u245n.WAV 


--------------------------------------------------------------------------------
/data_gen.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import argparse
 4 | import soundfile as sf
 5 | import librosa
 6 | from utils import pseudo_whisper_gen
 7 | 
 8 | def generate(data_list, output_dir):
 9 | 
10 |     if not os.path.exists(output_dir):
11 |         os.makedirs(output_dir)
12 | 
13 |     with open(data_list, 'r') as f1:
14 |         content = f1.readlines()
15 |         content = [x.strip() for x in content] 
16 |         for line in content:
17 |             name, filepath = line.split('  ',1) # check one/two space
18 |             # filepath, _ = filepath.split(' |', 1)
19 | 
20 |             # spkpath, _ = filepath.split('normal/', 1)[1].split('/s', 1)
21 |             # spkpath = os.path.join(output_dir, spkpath)
22 |             filename_split = os.path.basename(filepath).split('.', 1)[0].split('-')
23 |             spkpath = os.path.join(output_dir, filename_split[0], filename_split[1])
24 | 
25 |             # name = name.replace("n","pw")
26 |             if os.path.exists(os.path.join(spkpath, name) + '-pw.wav'): # '.wav'): #
27 |                 continue
28 | 
29 |             if not os.path.exists(spkpath):
30 |                 os.makedirs(spkpath, exist_ok=True)
31 |                 print("make dir: %s" % spkpath)
32 |             
33 |             s_n, fs = librosa.load(filepath, sr=16000, dtype=np.float64)  # resample to 16k
34 |             # s_n, fs = sf.read(filepath)
35 |             s_pw = pseudo_whisper_gen(s_n, fs)
36 |             
37 |             sf.write(os.path.join(spkpath, name) + '-pw.wav', s_pw, fs) #'.wav', s_pw, fs)    # '-pw.wav', s_pw, fs)
38 |     f1.close()
39 | 
40 | 
if __name__ == '__main__':
    # Command-line entry point: read the list/output locations, echo the
    # list path, then run the conversion.
    parser = argparse.ArgumentParser(description='Generate pseudo whispered speech data')
    parser.add_argument(
        '--data_list',
        type=str,
        default='./list_example',
        help='List for the normal speech directories.',
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='./output_dir',
        help='Directory for the output pseudo whispered speech.',
    )
    args = parser.parse_args()

    print(args.data_list)

    generate(args.data_list, args.output_dir)


--------------------------------------------------------------------------------
/rq2_gen.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import argparse
 4 | import soundfile as sf
 5 | import librosa
 6 | from utils import glottal_remove_gen, bandwidth_widen_gen
 7 | 
 8 | def generate(data_list, output_dir, generating_mode):
 9 | 
10 |     if not os.path.exists(output_dir):
11 |         os.makedirs(output_dir)
12 | 
13 |     with open(data_list, 'r') as f1:
14 |         content = f1.readlines()
15 |         content = [x.strip() for x in content] 
16 |         for line in content:
17 |             name, filepath = line.split('  ',1)
18 |             # filepath, _ = filepath.split(' |', 1)
19 | 
20 |             spkpath, _ = filepath.split('normal/', 1)[1].split('/s', 1)
21 |             spkpath = os.path.join(output_dir, spkpath)
22 |             # filename_split = os.path.basename(filepath).split('.', 1)[0].split('-')
23 |             # spkpath = os.path.join(output_dir, filename_split[0], filename_split[1])
24 | 
25 |             # name = name.replace("n","pw")
26 |             if os.path.exists(os.path.join(spkpath, name) + '.wav'):
27 |                 continue
28 | 
29 |             if not os.path.exists(spkpath):
30 |                 os.makedirs(spkpath, exist_ok=True)
31 |                 print("make dir: %s" % spkpath)
32 |             
33 |             s_n, fs = librosa.load(filepath, sr=16000, dtype=np.float64) # resample to 16k
34 |             # s_n, fs = sf.read(filepath)
35 |             if generating_mode == '1':
36 |                 s_pw = glottal_remove_gen(s_n, fs)
37 |             else:
38 |                 s_pw = bandwidth_widen_gen(s_n, fs)
39 |             
40 |             # sf.write(os.path.join(spkpath, name) + '.wav', s_pw, fs)
41 |     f1.close()
42 | 
43 | 
if __name__ == '__main__':
    # Command-line entry point: collect list, output directory and mode,
    # then run the conversion.
    parser = argparse.ArgumentParser(description='Generate pseudo whispered speech data')
    parser.add_argument(
        '--data_list',
        type=str,
        default='./list_example',
        help='List for the normal speech directories.',
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='./output_dir',
        help='Directory for the output modified speech.',
    )
    parser.add_argument(
        '--generating_mode',
        type=str,
        default='1',
        help='Generating mode: 1) glottal contribution removal; 2) formant bandwidth widen.',
    )
    args = parser.parse_args()

    generate(args.data_list, args.output_dir, args.generating_mode)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Normal2Whisper
 2 | This is an implementation of our pseudo-whispered speech conversion method in the paper Improving Whispered Speech Recognition Performance using Pseudo-whispered based Data Augmentation ([pdf](https://arxiv.org/pdf/2311.05179.pdf); to appear in ASRU 2023).
 3 | 
 4 | <img src="Pseudo-whisper.png" width="100%">
 5 | 
 6 | ## Dependencies
 7 | * Python 3.9 
 8 | * Numpy
 9 | * soundfile
10 | * librosa
11 | * [PyWorld](https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder)
12 | 
13 | ## Functions
14 | 1. `utils.py`
15 | 
16 |     This script has all the essential functions used in our proposed method.
17 | 
18 |     > **Note:** In our work, the speech files are sampled at, or re-sampled to, 16 kHz, so the parameters of GFM-IAIF-GC are based on this sample rate.
19 | 
20 | 2. `data_gen.py`
21 | 
22 |     This script is used to convert normal speech into pseudo-whispered speech. 
23 | 
24 | 3. `rq2_gen.py`
25 | 
26 |     This script is used to convert normal speech into: 
27 |     1) normal speech without glottal contributions; 
28 |     2) normal speech with widened formant bandwidth and shifted formant frequencies. 
29 | 
30 | ## Usage
31 | **1. Convert normal speech into pseudo-whispered speech from your dataset:**
32 | 
33 | ```Bash
34 | python data_gen.py --data_list './list_example(PATH TO THE LIST OF SOURCE TRAINING DATA)' \
35 |                    --output_dir './data/training/wTIMIT/PW(PATH TO OUTPUT PW DIRECTORY)' 
36 | ```
37 | 
38 | **2. Convert normal speech into 1) normal speech without glottal contributions:**
39 | 
40 | ```Bash
41 | python rq2_gen.py --data_list './list_example(PATH TO THE LIST OF SOURCE TRAINING DATA)' \
42 |                   --output_dir './data/training/wTIMIT/s1(PATH TO OUTPUT DIRECTORY)' \
43 |                   --generating_mode '1'
44 | ```
45 | 
46 | **3. Convert normal speech into 2) normal speech with widened formant bandwidth and shifted formant frequencies:**
47 | 
48 | ```Bash
49 | python rq2_gen.py --data_list './list_example(PATH TO THE LIST OF SOURCE TRAINING DATA)' \
50 |                   --output_dir './data/training/wTIMIT/s2(PATH TO OUTPUT DIRECTORY)' \
51 |                   --generating_mode '2'
52 | ```
53 | 
54 | >**Note:** you can check `./list_example` to see an example of the input data list. You can get the list by using this command: 
55 | >```Bash
56 | >find ./corpora/wTIMIT/nist/TRAIN/normal/US/(PATH TO SOURCE TRAINING DATA) -name "*.WAV" | awk '{split($0,a,"/");split(a[14],b,"."); print b[1] "  " $0}' > list
57 | >```
58 | >You may need to check your data directory and change `a[14]` in the command.
59 | 
60 | ## Citation
61 | ```bibtex
62 | @INPROCEEDINGS{10389801,
63 |   author={Lin, Zhaofeng and Patel, Tanvina and Scharenborg, Odette},
64 |   booktitle={2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)}, 
65 |   title={Improving Whispered Speech Recognition Performance Using Pseudo-Whispered Based Data Augmentation}, 
66 |   year={2023},
67 |   volume={},
68 |   number={},
69 |   pages={1-8},
70 |   keywords={Error analysis;Databases;Conferences;Training data;Transforms;Data augmentation;Acoustics;Whispered speech;pseudo-whisper;end-to-end speech recognition;wTIMIT;signal processing},
71 |   doi={10.1109/ASRU57964.2023.10389801}}
72 | ```
73 | 
74 | ## Contact
75 | If you have any questions, feel free to open an issue or send me an email linzh (at) tcd.ie
76 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from numpy.random import randn
  3 | import soundfile as sf
  4 | from scipy.signal import lfilter
  5 | from scipy.signal.windows import hann
  6 | from librosa import lpc
  7 | import pyworld as pw
  8 | 
  9 | 
 10 | def wav2world(x, fs, fft_size=None):
 11 |     """Convenience function to do all WORLD analysis steps in a single call.
 12 |     In this case only `frame_period` can be configured and other parameters
 13 |     are fixed to their defaults. Likewise, F0 estimation is fixed to
 14 |     DIO plus StoneMask refinement.
 15 |     Parameters
 16 |     ----------
 17 |     x : ndarray
 18 |         Input waveform signal.
 19 |     fs : int
 20 |         Sample rate of input signal in Hz.
 21 |     fft_size : int
 22 |         Length of Fast Fourier Transform (in number of samples)
 23 |         The resulting dimension of `ap` adn `sp` will be `fft_size` // 2 + 1
 24 |     Returns
 25 |     -------
 26 |     f0 : ndarray
 27 |         F0 contour.
 28 |     sp : ndarray
 29 |         Spectral envelope.
 30 |     ap : ndarray
 31 |         Aperiodicity.
 32 |     t  : ndarray
 33 |         Temporal position of each frame.
 34 |     """
 35 |     f0, t = pw.harvest(x, fs)
 36 |     sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size)
 37 |     ap = pw.d4c(x, f0, t, fs, fft_size=fft_size)
 38 |     return f0, sp, ap, t
 39 | 
 40 | 
 41 | def moving_average(data, length):
 42 |     output = np.empty(data.shape)
 43 |     maf = np.bartlett(length)/length  # Bartlett window is a triangular window
 44 |     for i in range(data.shape[0]):
 45 |         output[i,:] = np.convolve(data[i,:], maf,'same')
 46 |     return output
 47 | 
 48 | 
 49 | def gfm_iaif(s_gvl, nv=48, ng=3, d=0.99, win=None):
 50 |     """
 51 |     Glottal Flow Model-based Iterative Adaptive Inverse Filtering.
 52 | 
 53 |     Note:
 54 |     Function originally coded by Olivier Perrotin (https://github.com/operrotin/GFM-IAIF). 
 55 |     This code is translated to Python and adapted by Zhaofeng Lin (linzh@tcd.ie)
 56 | 
 57 |     Parameters:
 58 |     ----------
 59 |         s_gvl: Speech signal frame
 60 |         nv: Order of LP analysis for vocal tract (def. 48)
 61 |         ng: Order of LP analysis for glottal source (def. 3)
 62 |         d: Leaky integration coefficient (def. 0.99)
 63 |         win: Window used before LPC (def. Hanning)
 64 | 
 65 |     Returns:
 66 |     -------
 67 |         av: LP coefficients of vocal tract contribution
 68 |         ag: LP coefficients of glottis contribution
 69 |         al: LP coefficients of lip radiation contribution
 70 |     """
 71 | 
 72 |     # ----- Set default parameters -------------------------------------------
 73 |     if win is None:
 74 |         # Window for LPC estimation
 75 |         win = np.hanning(len(s_gvl))
 76 | 
 77 |     # ----- Addition of pre-frame --------------------------------------------
 78 |     # For the successive removals of the estimated LPC envelopes, a
 79 |     # mean-normalized pre-frame ramp is added at the beginning of the frame
 80 |     # in order to diminish ripple. The ramp is removed after each filtering.
 81 |     Lpf = nv + 1  # Pre-frame length
 82 |     x_gvl = np.concatenate([np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl])  # Prepend
 83 |     idx_pf = np.arange(Lpf, len(x_gvl))  # Indexes that exclude the pre-frame
 84 | 
 85 |     # ----- Cancel lip radiation contribution --------------------------------
 86 |     # Define lip radiation filter
 87 |     al = [1, -d]
 88 | 
 89 |     # Integration of signal using filter 1/[1 -d z^(-1)]
 90 |     # - Input signal (for LPC estimation)
 91 |     s_gv = lfilter([1], al, s_gvl)
 92 |     # - Pre-framed input signal (for LPC envelope removal)
 93 |     x_gv = lfilter([1], al, x_gvl)
 94 | 
 95 |     # ----- Gross glottis estimation -----------------------------------------
 96 |     # Iterative estimation of glottis with ng first order filters
 97 |     ag1 = lpc(s_gv*win, order=1)         # First 1st order LPC estimation
 98 | 
 99 |     for i in range(ng-2):
100 |         # Cancel current estimate of glottis contribution from speech signal
101 |         x_v1x = lfilter(ag1,1,x_gv)        # Inverse filtering
102 |         s_v1x = x_v1x[idx_pf]        # Remove pre-ramp
103 | 
104 |         # Next 1st order LPC estimation
105 |         ag1x = lpc(s_v1x*win, order=1)        # 1st order LPC
106 | 
107 |         # Update gross estimate of glottis contribution
108 |         ag1 = np.convolve(ag1,ag1x)        # Combine 1st order estimation with previous
109 | 
110 | 
111 |     # ----- Gross vocal tract estimation -------------------------------------
112 |     # Cancel gross estimate of glottis contribution from speech signal
113 |     x_v1 = lfilter(ag1,1,x_gv)       # Inverse filtering
114 |     s_v1 = x_v1[idx_pf]         # Remove pre-ramp
115 | 
116 |     # Gross estimate of the vocal tract filter
117 |     av1 = lpc(s_v1*win, order=nv)        # nv order LPC estimation
118 | 
119 |     # ----- Fine glottis estimation ------------------------------------------
120 |     # Cancel gross estimate of vocal tract contribution from speech signal
121 |     x_g1 = lfilter(av1,1,x_gv)       # Inverse filtering
122 |     s_g1 = x_g1[idx_pf]         # Remove pre-ramp
123 | 
124 |     # Fine estimate of the glottis filter
125 |     ag = lpc(s_g1*win, order=ng)        # ng order LPC estimation
126 | 
127 |     # ----- Fine vocal tract estimation --------------------------------------
128 |     # Cancel fine estimate of glottis contribution from speech signal
129 |     x_v = lfilter(ag,1,x_gv)       # Inverse filtering
130 |     s_v = x_v[idx_pf]         # Remove pre-ramp
131 | 
132 |     # Fine estimate of the vocal tract filter
133 |     av = lpc(s_v*win, order=nv)        # nv order LPC estimation
134 | 
135 | 
136 |     return av, ag, al
137 | 
138 | 
def gfm_iaif_glottal_remove(s_gvl, nv=48, ng=3, d=0.99, win=None):
    """
    Glottal removal function based on GFM-IAIF.

    Note:
    Function originally coded by Olivier Perrotin (https://github.com/operrotin/GFM-IAIF). 
    This code is translated to Python and adapted by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        s_gvl: Speech signal frame
        nv: Order of LP analysis for vocal tract (def. 48)
        ng: Order of LP analysis for glottal source (def. 3)
        d: Leaky integration coefficient (def. 0.99)
        win: Window used before LPC (def. Hanning)

    Returns:
    -------
        s_v: Speech signal with glottis contribution cancelled 
    """

    # Default analysis window: Hanning over the whole frame
    if win is None:
        win = np.hanning(len(s_gvl))

    # Pre-frame: a linear ramp from -s_gvl[0] to s_gvl[0] is prepended so the
    # inverse filters start on a ramp instead of the raw frame edge; those
    # Lpf samples are discarded again after every filtering step.
    Lpf = nv + 1
    ramp = np.linspace(-s_gvl[0], s_gvl[0], Lpf)
    x_gvl = np.concatenate([ramp, s_gvl])
    idx_pf = np.arange(Lpf, len(x_gvl))

    # Lip radiation: integrate with 1 / (1 - d z^-1), once for the bare
    # frame (LPC estimation) and once for the pre-framed signal (filtering)
    al = [1, -d]
    s_gv = lfilter([1], al, s_gvl)
    x_gv = lfilter([1], al, x_gvl)

    def remove_envelope(coeffs):
        # Inverse-filter the integrated signal and drop the pre-frame ramp
        return lfilter(coeffs, 1, x_gv)[idx_pf]

    # Gross glottis estimate: cascade of first-order LPC fits
    ag1 = lpc(s_gv * win, order=1)
    for _ in range(ng - 2):
        ag1 = np.convolve(ag1, lpc(remove_envelope(ag1) * win, order=1))

    # Gross vocal tract estimate, after cancelling the gross glottis
    av1 = lpc(remove_envelope(ag1) * win, order=nv)

    # Fine glottis estimate, after cancelling the gross vocal tract
    ag = lpc(remove_envelope(av1) * win, order=ng)

    # The frame with the fine glottis estimate inverse-filtered out
    return remove_envelope(ag)
220 | 
221 | 
def pseudo_whisper_gen(s_n, fs, Lv=16, maf_freq=400):
    """
    Pseudo whispered speech generating function, using GFM-IAIF and moving average filtering.

    Note:
    This code is written by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        s_n: Normal speech waveform (1-D ndarray)
        fs: Sample rate
        Lv: order of LP analysis for vocal tract (default: 16); passed as
            `nv` to gfm_iaif_glottal_remove
        maf_freq: Moving average filtering window length in Hz (default: 400)

    Returns:
    -------
        y_pw: Pseudo whispered speech waveform
    """

    EPSILON = 1e-8

    # Overlap-add (OLA) framing: 30 ms Hamming windows with half-window hop
    nfft = pw.get_cheaptrick_fft_size(fs)
    win_length = int(30*fs/1000)  # 30ms * fs / 1000
    nhop = round(win_length / 2)
    window = np.hamming(win_length)
    nframes = int(np.ceil(s_n.size / nhop))

    s_gfm = np.zeros(s_n.shape)     # allocate output speech without glottal source

    for n in range(nframes):
        startPoint = n * nhop     # starting point of windowing
        endPoint = startPoint + win_length
        if endPoint > s_n.size:
            # Incomplete trailing frame: it is not processed; the samples
            # from the end of the previous frame onwards are set to EPSILON.
            s_gfm[endPoint - nhop:] = EPSILON
            continue

        sn_frame = s_n[startPoint:endPoint] * window
        s_gfm[startPoint:endPoint] += gfm_iaif_glottal_remove(sn_frame, Lv)

    # Extract GFM
    f0_gfm, sp_gfm, ap_gfm, _ = wav2world(s_gfm, fs)

    # Moving average filtering of the spectral envelope. Convert the window
    # width from Hz to FFT bins; keep at least one tap, because a
    # zero-length window makes moving_average fail.
    maf_w_len = max(1, round(maf_freq/fs * nfft))
    sp_maf = moving_average(sp_gfm, maf_w_len)

    # Zero F0 and unit Ap
    f0_zero = np.zeros(f0_gfm.shape) + EPSILON
    ap_unit = np.ones(ap_gfm.shape) - EPSILON

    y_pw = pw.synthesize(f0_zero, sp_maf, ap_unit, fs, pw.default_frame_period)

    return y_pw
278 | 
279 | 
def glottal_remove_gen(s_n, fs, Lv=16):
    """
    Speech without glottal contribution generating function, using GFM-IAIF.

    Note:
    This code is written by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        s_n: Normal speech waveform (1-D ndarray)
        fs: Sample rate
        Lv: order of LP analysis for vocal tract (default: 16); passed as
            `nv` to gfm_iaif_glottal_remove

    Returns:
    -------
        y_no_glottal: Speech waveform without glottal contribution
    """

    EPSILON = 1e-8

    # Overlap-add (OLA) framing: 30 ms Hamming windows with half-window hop.
    # (The FFT size that used to be queried here was never used.)
    win_length = int(30*fs/1000)  # 30ms * fs / 1000
    nhop = round(win_length / 2)
    window = np.hamming(win_length)
    nframes = int(np.ceil(s_n.size / nhop))

    s_gfm = np.zeros(s_n.shape)     # allocate output speech without glottal source

    for n in range(nframes):
        startPoint = n * nhop     # starting point of windowing
        endPoint = startPoint + win_length
        if endPoint > s_n.size:
            # Incomplete trailing frame: it is not processed; the samples
            # from the end of the previous frame onwards are set to EPSILON.
            s_gfm[endPoint - nhop:] = EPSILON
            continue

        sn_frame = s_n[startPoint:endPoint] * window
        s_gfm[startPoint:endPoint] += gfm_iaif_glottal_remove(sn_frame, Lv)

    # Extract GFM
    f0_gfm, sp_gfm, ap_gfm, _ = wav2world(s_gfm, fs)

    # Zero F0 and unit Ap
    f0_zero = np.zeros(f0_gfm.shape) + EPSILON
    ap_unit = np.ones(ap_gfm.shape) - EPSILON

    y_no_glottal = pw.synthesize(f0_zero, sp_gfm, ap_unit, fs, pw.default_frame_period)

    return y_no_glottal
330 | 
331 | 
def bandwidth_widen_gen(s_n, fs, maf_freq=400):
    """
    Generate speech with expanded formant bandwidth by moving average
    filtering of the WORLD spectral envelope.

    Note:
    This code is written by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        s_n: Normal speech waveform
        fs: Sample rate
        maf_freq: Moving average filtering window length (default: 400 Hz)

    Returns:
    -------
        y_bandwidth: Speech waveform with expanded formant bandwidth
    """

    # WORLD analysis of the unmodified speech
    f0_n, sp_n, ap_n, _ = wav2world(s_n, fs)

    # Window width converted from Hz to FFT bins
    smoothing_bins = round(maf_freq/fs * pw.get_cheaptrick_fft_size(fs))

    # Resynthesise with the smoothed envelope; F0 and aperiodicity are kept
    return pw.synthesize(
        f0_n,
        moving_average(sp_n, smoothing_bins),
        ap_n,
        fs,
        pw.default_frame_period,
    )
361 | 
362 | 
def lpcfit(x, p=12, h=128, w=None, ov=1):
    """
    Fit LPC to short-time segments.

    Note:
    Function originally coded by Dan Ellis (http://labrosa.org/~dpwe/resources/matlab/polewarp/lpcfit.m). 
    This code is translated to Python and adapted by Zhaofeng Lin (linzh@tcd.ie)

    Parameters:
    ----------
        x: a stretch of signal (1-D, or 2-D of which only the first row is analysed)
        p: LPC order (default: 12)
        h: hopping size (default: 128)
        w: window size (default: 2*h)
        ov: overlap-add parameter (default: 1)
    
    Returns:
    -------
        a: successive all-pole coefficients in rows, shape (nhops, p+1)
        g: per-frame gains, shape (nhops,)
        e: residual excitation

    Raises:
    ------
        numpy.linalg.LinAlgError: if a frame's autocorrelation matrix is
            singular (e.g. an all-zero frame)
    """
    # numpy has no toeplitz(); it lives in scipy.linalg
    from scipy.linalg import toeplitz

    if w is None:
        w = 2*h

    x = np.asarray(x, dtype=float)
    if x.ndim == 1:
        x = x[np.newaxis, :]

    npts = x.shape[1]
    nhops = npts // h

    # Pad x with zeros so that we can extract complete w-length windows
    # from it
    x = np.pad(x, ((0, 0), ((w-h)//2, (w-h)//2 + h%2)), mode='constant')

    a = np.zeros((nhops, p+1))
    g = np.zeros(nhops)
    if ov == 0:
        e = np.zeros(npts)
    else:
        e = np.zeros((nhops-1)*h+w)

    # Pre-emphasis
    pre = np.array([1, -0.9])
    x = lfilter(pre, 1, x, axis=1)

    window = np.hanning(w)  # same window for every frame

    for hop in range(nhops):
        # Extract segment of signal (first row only)
        xx = x[0, hop*h : hop*h+w]
        # Apply hanning window
        wxx = xx * window
        # Form autocorrelation (calculates *way* too many points)
        rxx = np.correlate(wxx, wxx, mode='full')
        # Keep lags 0..p: in the 2w-1 point 'full' output lag 0 sits at index w-1
        rxx = rxx[w-1 : w+p]
        # Setup the normal equations
        R = toeplitz(rxx[:p])
        # Solve for the predictor coefficients
        an = np.linalg.solve(R, rxx[1:p+1])
        # Calculate residual by filtering with the prediction-error filter
        aa = np.concatenate(([1], -an))
        if ov == 0:
            rs = lfilter(aa, 1, xx[(w-h)//2:(w+h)//2])
        else:
            rs = lfilter(aa, 1, wxx)
        G = np.sqrt(np.mean(rs**2))
        # Save filter, gain and residual
        a[hop, :] = aa
        g[hop] = G
        if ov == 0:
            e[hop*h : (hop+1)*h] = rs / G
        else:
            e[hop*h : hop*h+w] += rs / G

    # Throw away first (win-hop)/2 pts if in overlap mode
    # for proper synchronization of resynth
    if ov != 0:
        e = e[(w-h)//2:]

    return a, g, e
443 | 
444 | 
def warppoles(a, alpha):
    """
    Warp an all-pole polynomial by first-order substitution.

    Every z^-n term of each row of `a` is replaced by
    (z^-1 - alpha)^n / (1 - alpha z^-1)^n and the result is brought onto
    the common denominator (1 - alpha z^-1)^(order-1).

    Parameters:
    ----------
        a (numpy.ndarray): all-pole polynomials, one per row (a single 1-D
            polynomial is accepted as one row).
        alpha (float): first-order warp factor. Negative alpha shifts poles up in frequency.

    Returns:
    -------
        tuple: the warped filters have zeros too, hence B and A.
        B (numpy.ndarray): (1 - alpha z^-1)^(order-1), repeated for every row.
        A (numpy.ndarray): warped polynomial of each row of `a`.

    """
    a = np.atleast_2d(np.asarray(a, dtype=float))

    # Construct z-hat^-1 polynomial: numerator d, denominator c
    d = np.array([-alpha, 1.0])
    c = np.array([1.0, -alpha])

    nrows, order = a.shape

    warped = np.zeros((nrows, order))
    warped[:, 0] = a[:, 0]

    dd = d  # d raised to the current power n
    for n in range(1, order):
        for row in range(nrows):
            # Raise the terms accumulated so far onto one more factor of c
            warped[row, :n + 1] = np.convolve(warped[row, :n], c)

        # Accumulate this power's term for every row: the outer product
        # gives each row its own coefficient a[row, n] times d^n
        warped[:, :n + 1] += np.outer(a[:, n], dd)

        dd = np.convolve(dd, d)

    # Construct the uniform polynomial c^(order-1) (same for all rows)
    AA = np.ones(1)
    for _ in range(order - 1):
        AA = np.convolve(AA, c)
    uniform = np.tile(AA, (nrows, 1))

    # Zeros and poles are exchanged on return: B is the uniform polynomial,
    # A the per-row warped one.
    return uniform, warped
505 | 
506 | 
def lpcsynth(a, g, e=None, h=128, ov=1):
    """
    Resynthesize from LPC representation.

    Each row of a is an LPC fit to a h-point (non-overlapping) 
    frame of data.  g gives the overall gains for each frame and 
    e is an excitation signal (if e is None or empty, white noise is used; 
    if e is a scalar, a pulse train is used with that period).
    ov nonzero selects overlap-add of reconstructed 
    windows, else e is assumed to consist of independent hop-sized 
    segments that will line up correctly without cross-fading
    (matching the ov option to lpcfit; default is ov = 1).

    Parameters:
    ----------
        a: all-pole coefficients, one frame per row
        g: per-frame gains
        e: excitation (None/empty: white noise; scalar: pulse period in
           samples; array: used as is, zero-padded if too short)
        h: hopping size (default: 128); the synthesis window is 2*h
        ov: overlap-add parameter (default: 1)

    Returns:
    -------
        d: the resulting LPC resynthesis, of length nhops*h when ov == 0
           and nhops*h + h otherwise

    Raises:
    ------
        ValueError: if a scalar pulse period is not positive
    """
    a = np.atleast_2d(np.asarray(a, dtype=float))
    g = np.atleast_1d(np.asarray(g, dtype=float))

    w = 2 * h
    nhops = a.shape[0]
    # Overlap mode needs an extra half window after the last hop
    nepts = nhops * h + ((w - h) if ov != 0 else 0)

    if e is None or np.size(e) == 0:
        # White-noise excitation
        e = randn(nepts)
    elif np.size(e) == 1:
        # Pulse train with period pd, scaled by sqrt(pd)
        pd = int(np.ravel(e)[0])
        if pd <= 0:
            raise ValueError("pulse period must be a positive number of samples")
        e = np.zeros(nepts)
        e[::pd] = np.sqrt(pd)
    else:
        e = np.ravel(np.asarray(e, dtype=float))

    # Make sure we don't run out of excitation for the last frame
    if e.size < nepts:
        e = np.concatenate([e, np.zeros(nepts - e.size)])

    d = np.zeros(nepts)
    taper = hann(w)

    for hop in range(nhops):
        hbase = hop * h
        aa = a[hop]
        G = g[hop]
        if ov == 0:
            # Independent hop-sized segments through the all-pole filter 1/A(z)
            d[hbase : hbase + h] = G * lfilter([1.0], aa, e[hbase : hbase + h])
        else:
            # Window-sized segment, tapered and overlap-added
            newbit = G * lfilter([1.0], aa, e[hbase : hbase + w])
            d[hbase : hbase + w] += taper * newbit

    # De-emphasis (must match pre-emphasis in lpcfit): inverse of [1, -0.9]
    pre = [1, -0.9]
    d = lfilter([1.0], pre, d)

    return d
559 | 


--------------------------------------------------------------------------------