├── .gitignore
├── IOMethods.py
├── LICENSE
├── MaskingMethods.py
├── QMF
│   ├── __init__.py
│   ├── old_vers
│   │   ├── IOMethods.py
│   │   ├── __init__.py
│   │   ├── foldingmat_filterbank.py
│   │   ├── mixed.wav
│   │   ├── mixedrek.wav
│   │   ├── qmf1024_8x.mat
│   │   ├── qmf1024qn.mat
│   │   ├── qmf_realtime.py
│   │   ├── qmf_realtime_analysis.py
│   │   ├── qmf_realtime_sinmod.py
│   │   └── qmf_realtime_synthesis.py
│   ├── qmf.dat
│   ├── qmf1024.mat
│   ├── qmf1024_8x.mat
│   ├── qmf1024qn.mat
│   ├── qmf_comp_call.py
│   ├── qmf_realtime_class.py
│   └── xn.mat
├── README.md
├── TFMethods.py
└── testFiles
    ├── mixed.wav
    ├── mt_test.py
    ├── pulse.wav
    ├── ramp.wav
    └── sc03_16m.wav

/.gitignore:
--------------------------------------------------------------------------------
1 | # OS generated files #
2 | ######################
3 | .DS_Store
4 | .DS_Store?
5 | ._*
6 | .Spotlight-V100
7 | .Trashes
8 | ehthumbs.db
9 | Thumbs.db
10 | *~
11 | 
12 | # Py-Generated files #
13 | ######################
14 | *.pyc
15 | *.py~
16 | *.md~
17 | __pycache__/
18 | *.py[cod]
19 | *.bin
20 | 
--------------------------------------------------------------------------------
/IOMethods.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'S.I. Mimilakis'
3 | __copyright__ = 'MacSeNet'
4 | 
5 | import os, subprocess, csv
6 | import numpy as np
7 | import wave as _wave
8 | from scipy.io.wavfile import write, read
9 | from sys import platform
10 | 
11 | class AudioIO:
12 |     """ Class for handling audio input/output operations.
13 |         It supports reading and writing of various audio formats
14 |         via the 'audioRead' & 'audioWrite' methods. Moreover, playback
15 |         can be performed using the 'sound' method. For formats
16 |         other than '.wav' a codec is needed; in this case
17 |         libffmpeg is used, and the absolute path of the
18 |         static build should be given to the class variable 'pathToffmpeg'.
19 |         Finally, playback can be terminated from the keyboard
20 |         via the 'stop' method.
21 | 
22 |         Basic usage examples:
23 |         Import the class:
24 |             import IOMethods as IO
25 |         - For loading wav files:
26 |             x, fs = IO.AudioIO.wavRead('myWavFile.wav', mono = True)
27 |         - In case compressed files are to be read, specify
28 |           the path to the libffmpeg build by changing the 'pathToffmpeg'
29 |           variable and then type:
30 |             x, fs = IO.AudioIO.audioRead('myCompressedFile.mp3')
31 |         - For writing wav files:
32 |             IO.AudioIO.audioWrite(x, fs, 16, 'myNewWavFile.wav', 'wav')
33 | 
34 |         - For listening to wav files:
35 |             IO.AudioIO.sound(x,fs)
36 | 
37 |     """
38 |     # Normalisation parameters for wav reading and writing
39 |     normFact = {'int8' : (2**7) -1,
40 |                 'int16': (2**15)-1,
41 |                 'int24': (2**23)-1,
42 |                 'int32': (2**31)-1,
43 |                 'int64': (2**63)-1,
44 |                 'float32': 1.0,
45 |                 'float64': 1.0}
46 | 
47 |     # 'Silence' the bash output
48 |     FNULL = open(os.devnull, 'w')
49 | 
50 |     # Absolute path needed here
51 |     pathToffmpeg = '/home/mis/Documents/Python/Projects/SourceSeparation/MiscFiles'
52 | 
53 | 
54 |     def __init__(self):
55 |         pass
56 | 
57 |     @staticmethod
58 |     def audioRead(fileName, mono=False):
59 |         """ Function to load audio files such as *.mp3, *.au, *.wma, *.m4a, *.x-wav & *.aiff.
60 |             It first converts them to .wav and reads them with the methods below.
61 |             Currently, it uses a static build of ffmpeg.
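            A usage sketch (the file name here is only an illustration):
                x, fs = AudioIO.audioRead('mySong.mp3', mono = True)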
62 | 63 | Args: 64 | fileName: (str) Absolute filename of WAV file 65 | mono: (bool) Switch if samples should be converted to mono 66 | Returns: 67 | samples: (np array) Audio samples (between [-1,1] 68 | (if stereo: numSamples x numChannels, 69 | if mono: numSamples) 70 | sampleRate: (float): Sampling frequency [Hz] 71 | """ 72 | 73 | # Get the absolute path 74 | fileName = os.path.abspath(fileName) 75 | 76 | # Linux 77 | if (platform == "linux") or (platform == "linux2"): 78 | convDict = { 79 | 'm4a':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 80 | + ' -i ' + fileName + ' ', -3], 81 | 'mp3':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 82 | + ' -i ' + fileName + ' ', -3], 83 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 84 | + ' -i ' + fileName + ' ', -2], 85 | 'wma':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 86 | + ' -i ' + fileName + ' ', -3], 87 | 'aiff':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 88 | + ' -i ' + fileName + ' ', -4], 89 | 'wav':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 90 | + ' -i ' + fileName + ' ', -3] 91 | } 92 | 93 | # MacOSX 94 | elif (platform == "darwin"): 95 | convDict = { 96 | 'm4a':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 97 | + ' -i ' + fileName + ' ', -3], 98 | 'mp3':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 99 | + ' -i ' + fileName + ' ', -3], 100 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 101 | + ' -i ' + fileName + ' ', -2], 102 | 'wma':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 103 | + ' -i ' + fileName + ' ', -3], 104 | 'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 105 | + ' -i ' + fileName + ' ', -4], 106 | 'wav': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 107 | + ' -i ' + fileName + ' ', -3] 108 | } 109 | # Add windows support! 
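        # The branch below is only an illustrative sketch of such support; it is
        # untested, and the binary name 'ffmpeg.exe' inside 'pathToffmpeg' is an
        # assumption:
        elif (platform == "win32"):
            convDict = {
                'm4a': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg.exe') + ' -i ' + fileName + ' ', -3],
                'mp3': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg.exe') + ' -i ' + fileName + ' ', -3],
                'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg.exe') + ' -i ' + fileName + ' ', -2],
                'wma': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg.exe') + ' -i ' + fileName + ' ', -3],
                'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg.exe') + ' -i ' + fileName + ' ', -4],
                'wav': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg.exe') + ' -i ' + fileName + ' ', -3]
            }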
110 | else : 111 | raise Exception('This OS is not supported.') 112 | 113 | # Construct 114 | 115 | if fileName[convDict['mp3'][1]:] == 'mp3': 116 | print(fileName[convDict['mp3'][1]:]) 117 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['mp3'][1]] + 'wav')) 118 | subprocess.call(convDict['mp3'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 119 | samples, sampleRate = AudioIO.wavRead(modfileName, mono) 120 | os.remove(modfileName) 121 | 122 | elif fileName[convDict['au'][1]:] == 'au': 123 | print(fileName[convDict['au'][1]:]) 124 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['au'][1]] + 'wav')) 125 | subprocess.call(convDict['au'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 126 | samples, sampleRate = AudioIO.wavRead(modfileName, mono) 127 | os.remove(modfileName) 128 | 129 | elif fileName[convDict['wma'][1]:] == 'wma': 130 | print(fileName[convDict['wma'][1]:]) 131 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['wma'][1]] + 'wav')) 132 | subprocess.call(convDict['wma'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 133 | samples, sampleRate = AudioIO.wavRead(modfileName, mono) 134 | os.remove(modfileName) 135 | 136 | elif fileName[convDict['aiff'][1]:] == 'aiff': 137 | print(fileName[convDict['aiff'][1]:]) 138 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['aiff'][1]] + 'wav')) 139 | subprocess.call(convDict['aiff'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 140 | samples, sampleRate = AudioIO.wavRead(modfileName, mono) 141 | os.remove(modfileName) 142 | 143 | elif fileName[convDict['wav'][1]:] == 'wav': 144 | """ 145 | General purpose reading of wav files that do not contain the RIFF header. 146 | """ 147 | print('x-wav') 148 | modfileName = os.path.join(os.path.abspath(fileName[:-4] + '_temp.wav')) 149 | subprocess.call(convDict['wav'][0] + modfileName, shell=True, stdout=AudioIO.FNULL, 150 | stderr=subprocess.STDOUT) 151 | samples, sampleRate = AudioIO.wavRead(modfileName, mono) 152 | os.remove(modfileName) 153 | 154 | elif fileName[convDict['m4a'][1]:] == 'm4a': 155 | print(fileName[convDict['m4a'][1]:]) 156 | modfileName = os.path.join(os.path.abspath(fileName[:-4] + '_temp.wav')) 157 | subprocess.call(convDict['m4a'][0] + modfileName, shell=True, stdout=AudioIO.FNULL, 158 | stderr=subprocess.STDOUT) 159 | samples, sampleRate = AudioIO.wavRead(modfileName, mono) 160 | os.remove(modfileName) 161 | 162 | else : 163 | raise Exception('This format is not supported.') 164 | 165 | return samples, sampleRate 166 | 167 | @staticmethod 168 | def audioWrite(y, fs, nbits, audioFile, format): 169 | """ Write samples to WAV file and then converts to selected 170 | format using ffmpeg. 
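            For instance, mirroring the class-level examples (output name illustrative):
                AudioIO.audioWrite(x, fs, 16, 'myNewFile.mp3', 'mp3')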
171 | Args: 172 | samples: (ndarray / 2D ndarray) (floating point) sample vector 173 | mono: DIM: nSamples 174 | stereo: DIM: nSamples x nChannels 175 | 176 | fs: (int) Sample rate in Hz 177 | nBits: (int) Number of bits 178 | audioFile: (string) File name to write 179 | format: (string) Selected format 180 | 'mp3' : Writes to .mp3 181 | 'wma' : Writes to .wma 182 | 'wav' : Writes to .wav 183 | 'aiff' : Writes to .aiff 184 | 'au' : Writes to .au 185 | 'm4a' : Writes to .m4a 186 | """ 187 | 188 | # Linux 189 | if (platform == "linux") or (platform == "linux2"): 190 | convDict = { 191 | 'm4a': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -3], 192 | 'mp3': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -3], 193 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -2], 194 | 'wma': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -3], 195 | 'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -4] 196 | } 197 | 198 | # MacOSX 199 | elif (platform == "darwin"): 200 | convDict = { 201 | 'm4a': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -3], 202 | 'mp3': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -3], 203 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -2], 204 | 'wma': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -3], 205 | 'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -4] 206 | } 207 | 208 | else : 209 | raise Exception('This OS is not supported.') 210 | 211 | if (format == 'mp3'): 212 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['mp3'][1]] + 'wav')) 213 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 214 | subprocess.call(convDict['mp3'][0] + prmfileName + ' ' + audioFile, 215 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 216 | os.remove(prmfileName) 217 | 218 | elif (format == 'wav'): 219 | AudioIO.wavWrite(y, fs, nbits, audioFile) 220 | 221 | elif (format == 'wma'): 222 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['wma'][1]] + 'wav')) 223 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 224 | subprocess.call(convDict['wma'][0] + prmfileName + ' ' + audioFile, 225 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 226 | os.remove(prmfileName) 227 | 228 | elif (format == 'aiff'): 229 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['aiff'][1]] + 'wav')) 230 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 231 | subprocess.call(convDict['aiff'][0] + prmfileName + ' ' + audioFile, 232 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 233 | os.remove(prmfileName) 234 | 235 | elif (format == 'au'): 236 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['au'][1]] + 'wav')) 237 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 238 | subprocess.call(convDict['au'][0] + prmfileName + ' ' + audioFile, 239 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 240 | os.remove(prmfileName) 241 | 242 | elif (format == 'm4a'): 243 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['m4a'][1]] + 'wav')) 244 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 245 | subprocess.call(convDict['m4a'][0] + prmfileName + ' -b:a 320k ' + audioFile, 246 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 247 | os.remove(prmfileName) 248 | else : 249 | raise Exception('This format is not supported.') 250 | 251 | @staticmethod 252 | def wavRead(fileName, mono=False): 253 | """ Function to load WAV file. 
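            For example, as in the class docstring:
                x, fs = AudioIO.wavRead('myWavFile.wav', mono = True)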
254 | 
255 |         Args:
256 |             fileName:   (str)   Absolute filename of WAV file
257 |             mono:       (bool)  Switch if samples should be converted to mono
258 |         Returns:
259 |             samples:    (np array)  Audio samples (between [-1, 1])
260 |                         (if stereo: numSamples x numChannels,
261 |                          if mono:   numSamples)
262 |             sampleRate: (float) Sampling frequency [Hz]
263 |         """
264 |         try:
265 |             samples, sampleRate = AudioIO._loadWAVWithWave(fileName)
266 |             sWidth = _wave.open(fileName).getsampwidth()
267 |             if sWidth == 1:
268 |                 #print('8bit case')
269 |                 samples = samples.astype(float) / AudioIO.normFact['int8'] - 1.0
270 |             elif sWidth == 2:
271 |                 #print('16bit case')
272 |                 samples = samples.astype(float) / AudioIO.normFact['int16']
273 |             elif sWidth == 3:
274 |                 #print('24bit case')
275 |                 samples = samples.astype(float) / AudioIO.normFact['int24']
276 |         except:
277 |             #print('32bit case')
278 |             samples, sampleRate = AudioIO._loadWAVWithScipy(fileName)
279 | 
280 |         # mono conversion
281 |         if mono:
282 |             if samples.ndim == 2 and samples.shape[1] > 1:
283 |                 samples = (samples[:, 0] + samples[:, 1])*0.5
284 | 
285 |         return samples, sampleRate
286 | 
287 |     @staticmethod
288 |     def _loadWAVWithWave(fileName):
289 |         """ Load samples & sample rate from 24 bit WAV file """
290 |         wav = _wave.open(fileName)
291 |         rate = wav.getframerate()
292 |         nchannels = wav.getnchannels()
293 |         sampwidth = wav.getsampwidth()
294 |         nframes = wav.getnframes()
295 |         data = wav.readframes(nframes)
296 |         wav.close()
297 |         array = AudioIO._wav2array(nchannels, sampwidth, data)
298 | 
299 |         return array, rate
300 | 
301 |     @staticmethod
302 |     def _loadWAVWithScipy(fileName):
303 |         """ Load samples & sample rate from WAV file """
304 |         inputData = read(fileName)
305 |         samples = inputData[1]
306 |         sampleRate = inputData[0]
307 | 
308 |         return samples, sampleRate
309 | 
310 |     @staticmethod
311 |     def _wav2array(nchannels, sampwidth, data):
312 |         """data must be the string containing the bytes from the wav file."""
313 |         num_samples, remainder = divmod(len(data), sampwidth * nchannels)
314 |         if remainder > 0:
315 |             raise ValueError('The length of data is not a multiple of '
316 |                              'sampwidth * num_channels.')
317 |         if sampwidth > 4:
318 |             raise ValueError("sampwidth must not be greater than 4.")
319 | 
320 |         if sampwidth == 3:
321 |             a = np.empty((num_samples, nchannels, 4), dtype = np.uint8)
322 |             raw_bytes = np.fromstring(data, dtype = np.uint8)
323 |             a[:, :, :sampwidth] = raw_bytes.reshape(-1, nchannels, sampwidth)
324 |             a[:, :, sampwidth:] = (a[:, :, sampwidth - 1:sampwidth] >> 7) * 255
325 |             result = a.view('<i4').reshape(a.shape[:-1])
326 |         else:
327 |             # 8 bit samples are stored as unsigned ints; others as signed ints.
328 |             dt_char = 'u' if sampwidth == 1 else 'i'
329 |             a = np.fromstring(data, dtype = '<%s%d' % (dt_char, sampwidth))
330 |             result = a.reshape(-1, nchannels)
331 | 
332 |         return result
333 | 
334 |     @staticmethod
335 |     def wavWrite(y, fs, nbits, audioFile):
336 |         """ Write (floating point) samples to a WAV file.
337 |         Args:
338 |             y:         (ndarray / 2D ndarray) sample vector
339 |                        mono:   DIM: nSamples
340 |                        stereo: DIM: nSamples x nChannels
341 | 
342 |             fs:        (int) Sample rate in Hz
343 |             nbits:     (int) Number of bits
344 |             audioFile: (string) WAV file name to write
345 |         """
346 |         if nbits == 8:
347 |             intsamples = (y + 1.0) * AudioIO.normFact['int' + str(nbits)]
348 |             fX = np.int8(intsamples)
349 |         elif nbits == 16:
350 |             fX = np.int16(y * AudioIO.normFact['int' + str(nbits)])
351 |         elif nbits > 16:
352 |             fX = y
353 | 
354 |         write(audioFile, fs, fX)
355 | 
356 |     @staticmethod
357 |     def sound(x,fs):
358 |         """ Plays back the given samples using the pyglet library; they are first written to a temporary wav file.
359 |             Playback is terminated by typing any keyboard input followed by Enter.
360 | Args: 361 | x: (array) Floating point samples 362 | fs: (int) The sampling rate 363 | """ 364 | import pyglet as pg 365 | global player 366 | # Call the writing function 367 | AudioIO.wavWrite(x, fs, 16, 'testPlayback.wav') 368 | # Initialize playback engine 369 | player = pg.media.Player() 370 | # Initialize the object with the audio file 371 | playback = pg.media.load('testPlayback.wav') 372 | # Set it to player 373 | player.queue(playback) 374 | # Sound call 375 | player.play() 376 | # Killed by "keyboard" 377 | kill = raw_input() 378 | if kill or kill == '': 379 | AudioIO.stop() 380 | # Remove the dummy wave write 381 | os.remove('testPlayback.wav') 382 | 383 | @staticmethod 384 | def stop(): 385 | """ Stops a playback object of the pyglet library. 386 | It does not accept arguments, but a player has to be 387 | already initialized by the above "sound" method. 388 | """ 389 | global player 390 | # Just Pause & Destruct 391 | player.pause() 392 | player = None 393 | return None 394 | 395 | if __name__ == "__main__": 396 | # Define File 397 | myReadFile = 'EnterYourWavFile.wav' 398 | 399 | # Read the file 400 | x, fs = AudioIO.wavRead(myReadFile, mono = True) 401 | 402 | # Gain parameter 403 | g = 0.2 404 | 405 | # Listen to it 406 | AudioIO.sound(x*g,fs) 407 | 408 | # Make it better and write it to disk 409 | x2 = np.empty((len(x),2), dtype = np.float32) 410 | try : 411 | x2[:,0] = x * g 412 | x2[:,1] = np.roll(x*g, 512) 413 | except ValueError: 414 | x2[:,0] = x[:,0] * g 415 | x2[:,1] = np.roll(x[:,0] * g, 256) 416 | 417 | # Listen to stereo processed 418 | AudioIO.sound(x2*g,fs) 419 | AudioIO.audioWrite(x2, fs, 16, 'myNewWavFile.wav', 'wav') -------------------------------------------------------------------------------- /MaskingMethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'S.I. Mimilakis' 3 | __copyright__ = 'MacSeNet' 4 | 5 | import numpy as np 6 | from scipy.fftpack import fft, ifft 7 | from TFMethods import TimeFrequencyDecomposition as TF 8 | 9 | class FrequencyMasking: 10 | """Class containing various time-frequency masking methods, for processing Time-Frequency representations. 
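       A minimal usage sketch, mirroring the test at the bottom of this file
       (mX, sT and nR denote magnitude spectrograms of the mixture, the target
       and the residual, respectively):
           mask = FrequencyMasking(mX, sT, [nR], [], [], alpha = 2., method = 'alphaWiener')
           targetEst = mask()                  # masked target estimate
           residualEst = mask(reverse = True)  # apply (1 - mask) instead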
11 |     """
12 | 
13 |     def __init__(self, mX, sTarget, nResidual, psTarget = [], pnResidual = [], alpha = 1.2, method = 'Wiener'):
14 |         self._mX = mX
15 |         self._eps = np.finfo(np.float).eps
16 |         self._sTarget = sTarget
17 |         self._nResidual = nResidual
18 |         self._pTarget = psTarget
19 |         self._pY = pnResidual
20 |         self._mask = []
21 |         self._Out = []
22 |         self._alpha = alpha
23 |         self._method = method
24 |         self._iterations = 200
25 |         self._lr = 3e-3    # learning rate for optAlpha (an alternative value: 2e-3)
26 |         self._hetaplus = 1.2
27 |         self._hetaminus = 0.5
28 | 
29 |     def __call__(self, reverse = False):
30 | 
31 |         if (self._method == 'Phase'):
32 |             if not self._pTarget.size or not self._pY.size:
33 |                 raise ValueError('Phase-sensitive masking cannot be performed without phase information.')
34 |             else:
35 |                 FrequencyMasking.phaseSensitive(self)
36 |             if not(reverse) :
37 |                 FrequencyMasking.applyMask(self)
38 |             else :
39 |                 FrequencyMasking.applyReverseMask(self)
40 | 
41 |         elif (self._method == 'IRM'):
42 |             FrequencyMasking.IRM(self)
43 |             if not(reverse) :
44 |                 FrequencyMasking.applyMask(self)
45 |             else :
46 |                 FrequencyMasking.applyReverseMask(self)
47 | 
48 |         elif (self._method == 'IAM'):
49 |             FrequencyMasking.IAM(self)
50 |             if not(reverse) :
51 |                 FrequencyMasking.applyMask(self)
52 |             else :
53 |                 FrequencyMasking.applyReverseMask(self)
54 | 
55 |         elif (self._method == 'IBM'):
56 |             FrequencyMasking.IBM(self)
57 |             if not(reverse) :
58 |                 FrequencyMasking.applyMask(self)
59 |             else :
60 |                 FrequencyMasking.applyReverseMask(self)
61 | 
62 |         elif (self._method == 'UBBM'):
63 |             FrequencyMasking.UBBM(self)
64 |             if not(reverse) :
65 |                 FrequencyMasking.applyMask(self)
66 |             else :
67 |                 FrequencyMasking.applyReverseMask(self)
68 | 
69 | 
70 |         elif (self._method == 'Wiener'):
71 |             FrequencyMasking.Wiener(self)
72 |             if not(reverse) :
73 |                 FrequencyMasking.applyMask(self)
74 |             else :
75 |                 FrequencyMasking.applyReverseMask(self)
76 | 
77 |         elif (self._method == 'alphaWiener'):
78 |             FrequencyMasking.alphaHarmonizableProcess(self)
79 |             if not(reverse) :
80 |                 FrequencyMasking.applyMask(self)
81 |             else :
82 |                 FrequencyMasking.applyReverseMask(self)
83 | 
84 |         elif (self._method == 'expMask'):
85 |             FrequencyMasking.ExpM(self)
86 |             if not(reverse) :
87 |                 FrequencyMasking.applyMask(self)
88 |             else :
89 |                 FrequencyMasking.applyReverseMask(self)
90 | 
91 |         elif (self._method == 'MWF'):
92 |             print('Multichannel Wiener Filtering')
93 |             FrequencyMasking.MWF(self)
94 | 
95 |         return self._Out
96 | 
97 |     def IRM(self):
98 |         """
99 |         Computation of the Ideal Amplitude Ratio Mask. As appears in :
100 |         H. Erdogan, J. R. Hershey, S. Watanabe, and J. Le Roux,
101 |         "Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks,"
102 |         in ICASSP 2015, Brisbane, April, 2015.
103 |         Args:
104 |             sTarget:   (2D ndarray) Magnitude Spectrogram of the target component
105 |             nResidual: (2D ndarray) Magnitude Spectrogram of the residual component
106 |         Returns:
107 |             mask:      (2D ndarray) Array that contains time frequency gain values
108 | 
109 |         """
110 |         print('Ideal Amplitude Ratio Mask')
111 |         self._mask = np.divide(self._sTarget, (self._eps + self._sTarget + self._nResidual))
112 | 
113 |     def IAM(self):
114 |         """
115 |         Computation of the Ideal Amplitude Mask. As appears in :
116 |         H. Erdogan, J. R. Hershey, S. Watanabe, and J. Le Roux,
117 |         "Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks,"
118 |         in ICASSP 2015, Brisbane, April, 2015.
119 | Args: 120 | sTarget: (2D ndarray) Magnitude Spectrogram of the target component 121 | nResidual: (2D ndarray) Magnitude Spectrogram of the residual component 122 | (In this case the observed mixture should be placed) 123 | Returns: 124 | mask: (2D ndarray) Array that contains time frequency gain values 125 | 126 | """ 127 | print('Ideal Amplitude Mask') 128 | self._mask = np.divide(self._sTarget, (self._eps + self._nResidual)) 129 | 130 | def ExpM(self): 131 | """ 132 | Approximate a signal via element-wise exponentiation. As appears in : 133 | S.I. Mimilakis, K. Drossos, T. Virtanen, and G. Schuller, 134 | "Deep Neural Networks for Dynamic Range Compression in Mastering Applications," 135 | in proc. of the 140th Audio Engineering Society Convention, Paris, 2016. 136 | Args: 137 | sTarget: (2D ndarray) Magnitude Spectrogram of the target component 138 | nResidual: (2D ndarray) Magnitude Spectrogram of the residual component 139 | Returns: 140 | mask: (2D ndarray) Array that contains time frequency gain values 141 | 142 | """ 143 | print('Exponential mask') 144 | self._mask = np.divide(np.log(self._sTarget.clip(self._eps, np.inf)**self._alpha),\ 145 | np.log(self._nResidual.clip(self._eps, np.inf)**self._alpha)) 146 | 147 | def IBM(self): 148 | """ 149 | Computation of Ideal Binary Mask. 150 | Args: 151 | sTarget: (2D ndarray) Magnitude Spectrogram of the target component 152 | nResidual: (2D ndarray) Magnitude Spectrogram of the residual component 153 | Returns: 154 | mask: (2D ndarray) Array that contains time frequency gain values 155 | 156 | """ 157 | print('Ideal Binary Mask') 158 | theta = 0.5 159 | mask = np.divide(self._sTarget ** self._alpha, (self._eps + self._nResidual ** self._alpha)) 160 | bg = np.where(mask >= theta) 161 | sm = np.where(mask < theta) 162 | mask[bg[0],bg[1]] = 1. 163 | mask[sm[0], sm[1]] = 0. 164 | self._mask = mask 165 | 166 | def UBBM(self): 167 | """ 168 | Computation of Upper Bound Binary Mask. As appears in : 169 | - J.J. Burred, "From Sparse Models to Timbre Learning: New Methods for Musical Source Separation", PhD Thesis, 170 | TU Berlin, 2009. 171 | 172 | Args: 173 | sTarget: (2D ndarray) Magnitude Spectrogram of the target component 174 | nResidual: (2D ndarray) Magnitude Spectrogram of the residual component (Should not contain target source!) 175 | Returns: 176 | mask: (2D ndarray) Array that contains time frequency gain values 177 | """ 178 | print('Upper Bound Binary Mask') 179 | mask = 20. * np.log(self._eps + np.divide((self._eps + (self._sTarget ** self._alpha)), 180 | ((self._eps + (self._nResidual ** self._alpha))))) 181 | bg = np.where(mask >= 0) 182 | sm = np.where(mask < 0) 183 | mask[bg[0],bg[1]] = 1. 184 | mask[sm[0], sm[1]] = 0. 185 | self._mask = mask 186 | 187 | def Wiener(self): 188 | """ 189 | Computation of Wiener-like Mask. As appears in : 190 | H Erdogan, John R. Hershey, Shinji Watanabe, and Jonathan Le Roux, 191 | "Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks," 192 | in ICASSP 2015, Brisbane, April, 2015. 193 | Args: 194 | sTarget: (2D ndarray) Magnitude Spectrogram of the target component 195 | nResidual: (2D ndarray) Magnitude Spectrogram of the residual component 196 | Returns: 197 | mask: (2D ndarray) Array that contains time frequency gain values 198 | """ 199 | print('Wiener-like Mask') 200 | localsTarget = self._sTarget ** 2. 201 | numElements = len(self._nResidual) 202 | if numElements > 1: 203 | localnResidual = self._nResidual[0] ** 2. 
+ localsTarget
204 |             for indx in range(1, numElements):
205 |                 localnResidual += self._nResidual[indx] ** 2.
206 |         else :
207 |             localnResidual = self._nResidual[0] ** 2. + localsTarget
208 | 
209 |         self._mask = np.divide((localsTarget + self._eps), (self._eps + localnResidual))
210 | 
211 |     def alphaHarmonizableProcess(self):
212 |         """
213 |         Computation of a Wiener-like mask using fractional power spectrograms. As appears in :
214 |         A. Liutkus, R. Badeau, "Generalized Wiener filtering with fractional power spectrograms",
215 |         40th International Conference on Acoustics, Speech and Signal Processing (ICASSP),
216 |         Apr 2015, Brisbane, Australia.
217 |         Args:
218 |             sTarget:   (2D ndarray) Magnitude Spectrogram of the target component
219 |             nResidual: (2D ndarray) Magnitude Spectrogram of the residual component or a list
220 |                        of 2D ndarrays which will be added together
221 |         Returns:
222 |             mask:      (2D ndarray) Array that contains time frequency gain values
223 | 
224 |         """
225 |         print('Harmonizable Process with alpha:', str(self._alpha))
226 |         localsTarget = self._sTarget ** self._alpha
227 |         numElements = len(self._nResidual)
228 |         if numElements > 1:
229 |             localnResidual = self._nResidual[0] ** self._alpha + localsTarget
230 |             for indx in range(1, numElements):
231 |                 localnResidual += self._nResidual[indx] ** self._alpha
232 |         else :
233 |             localnResidual = self._nResidual[0] ** self._alpha + localsTarget
234 | 
235 |         self._mask = np.divide((localsTarget + self._eps), (self._eps + localnResidual))
236 | 
237 |     def phaseSensitive(self):
238 |         """
239 |         Computation of the Phase Sensitive Mask. As appears in :
240 |         H. Erdogan, J. R. Hershey, S. Watanabe, and J. Le Roux,
241 |         "Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks,"
242 |         in ICASSP 2015, Brisbane, April, 2015.
243 | 
244 |         Args:
245 |             mTarget: (2D ndarray) Magnitude Spectrogram of the target component
246 |             pTarget: (2D ndarray) Phase Spectrogram of the target component
247 |             mY:      (2D ndarray) Magnitude Spectrogram of the residual component
248 |             pY:      (2D ndarray) Phase Spectrogram of the residual component
249 |         Returns:
250 |             mask:    (2D ndarray) Array that contains time frequency gain values
251 | 
252 |         """
253 |         print('Phase Sensitive Masking.')
254 |         # Compute Phase Difference
255 |         Theta = (self._pTarget - self._pY)
256 |         self._mask = 2./ (1. + np.exp(-np.multiply(np.divide(self._sTarget, self._eps + self._nResidual), np.cos(Theta)))) - 1.
257 | 
258 |     def optAlpha(self, initloss):
259 |         """
260 |         A simple gradient descent method using the RProp algorithm,
261 |         for finding optimum power-spectral density exponents (alpha) for generalized Wiener filtering.
262 |         Args:
263 |             sTarget :  (2D ndarray) Magnitude Spectrogram of the target component
264 |             nResidual: (2D ndarray) Magnitude Spectrogram of the residual component or a list
265 |                        of 2D ndarrays which will be added together
266 |             initloss : (float) Initial loss, for comparison
267 |         Returns:
268 |             mask:      (2D ndarray) Array that contains time frequency gain values
269 | 
270 |         """
271 |         # Initialization of the parameters
272 |         # Put every source spectrogram into an array, given an input list.
273 |         slist = list(self._nResidual)
274 |         slist.insert(0, self._sTarget)
275 |         numElements = len(slist)
276 |         slist = np.asarray(slist)
277 | 
278 |         alpha = np.array([1.15] * (numElements))    # Initialize an array of alpha values to be found.
279 |         dloss = np.array([0.] * (numElements))      # Initialize an array of loss derivatives, one per source.
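        # RProp in brief (see the update block further below): after each iteration
        # the sign of the change in the Itakura-Saito loss is inspected; when the
        # loss has decreased, the per-source learning rates grow by _hetaplus (1.2),
        # and when it has increased, they shrink by _hetaminus (0.5).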
280 |         lrs = np.array([self._lr] * (numElements))  # Initialize an array of learning rates to be applied to each source.
281 | 
282 |         # Begin optimization
283 |         isloss = []
284 |         for iter in xrange(self._iterations):
285 |             # The actual function of additive power spectrograms
286 |             Xhat = np.sum(np.power(slist, np.reshape(alpha, (numElements, 1, 1))), axis=0)
287 |             for source in xrange(numElements):
288 |                 # Derivative with respect to the function of additive power spectrograms
289 |                 dX = (slist[source, :, :]**alpha[source]) * np.log(slist[source, :, :] + self._eps)
290 | 
291 |                 # Chain rule between the above derivative and the IS derivative
292 |                 dloss[source] = self._dIS(Xhat) * np.mean(dX)
293 | 
294 |             alpha -= (lrs*dloss)
295 | 
296 |             # Keep the alpha values inside a reasonable range
297 |             alpha = np.clip(alpha, a_min = 0.5, a_max = 2.)
298 | 
299 |             # Check IS Loss by computing Xhat
300 |             Xhat = 0
301 |             for source in xrange(numElements):
302 |                 Xhat += slist[source, :, :] ** alpha[source]
303 | 
304 |             isloss.append(self._IS(Xhat))
305 |             if (iter > 2):
306 |                 # Apply RProp
307 |                 if (isloss[-2] - isloss[-1] > 0):
308 |                     lrs *= self._hetaplus
309 | 
310 |                 if (isloss[-2] - isloss[-1] < 0):
311 |                     lrs *= self._hetaminus
312 | 
313 |             if (iter > 4):
314 |                 if (np.abs(isloss[-2] - isloss[-1]) < 1e-4 and np.abs(isloss[-3] - isloss[-2]) < 1e-4):
315 |                     print('Local Minimum Found')
316 |                     print('Final Loss: ' + str(isloss[-1]) + ' with characteristic exponent(s): ' + str(alpha))
317 |                     break
318 | 
319 |             print('Loss: ' + str(isloss[-1]) + ' with characteristic exponent(s): ' + str(alpha))
320 | 
321 |         # Evaluate Xhat for the mask update
322 |         self._mask = np.divide((slist[0, :, :] ** alpha[0] + self._eps), (self._mX ** self._alpha + self._eps))
323 |         self._closs = isloss[-1]
324 |         self._alpha = alpha
325 | 
326 |     def MWF(self):
327 |         """ Multi-channel Wiener filtering as appears in:
328 |             I. Cohen, J. Benesty, and S. Gannot, Speech Processing in Modern
329 |             Communication, Springer, Berlin, Heidelberg, 2010, Chapter 9.
330 |         Args:
331 |             mTarget: (3D ndarray) Magnitude Spectrogram of the target component
332 |             mY:      (3D ndarray) Magnitude Spectrogram of the output component
333 |                      (M channels x F frequency samples x T time-frames).
334 |         Returns:
335 |             _Out:    (3D ndarray) Array that contains the estimated source.
336 |         """
337 |         # Parameter for the update
338 |         flambda = 0.99    # Forgetting Factor
339 | 
340 |         cX = self._sTarget ** self._alpha
341 |         cN = self._nResidual ** self._alpha
342 | 
343 |         M = self._mX.shape[0]    # Number of channels
344 |         gF = 1./M                # Gain factor
345 |         eM = cX.shape[0]         # Number of estimated channels
346 |         F = cX.shape[1]          # Number of frequency samples
347 |         T = cX.shape[2]          # Number of time-frames
348 |         fout = np.zeros((M,F,T), dtype = np.float32)    # Initializing output
349 |         I = np.eye(M)            # Identity matrix
350 | 
351 |         # Initialization of covariance matrices
352 |         Rxx = np.repeat(np.reshape(I, (M,M,1)), F, axis = -1)
353 |         Rnn = np.repeat(np.reshape(I, (M,M,1)), F, axis = -1)
354 | 
355 |         # Recursive updates
356 |         for t in xrange(T):
357 |             for f in xrange(F):
358 |                 if eM == 1:
359 |                     Rxx[:, :, f] = flambda * Rxx[:, :, f] + (1. - flambda) * (cX[:, f, t])
360 |                     Rnn[:, :, f] = flambda * Rnn[:, :, f] + (1.
- flambda) * (cN[:, f, t]) 361 | else: 362 | Rxx[:, :, f] = (np.dot(cX[:, f:f+1, t], cX[:, f:f+1, t].T))/np.sum(cX[:,f,t], axis = 0) 363 | Rnn[:, :, f] = (np.dot(cN[:, f:f+1, t], cN[:, f:f+1, t].T))/np.sum(cN[:,f,t], axis = 0) 364 | 365 | inv = np.dot(np.linalg.pinv(Rnn[:, :, f]), (Rnn[:, :, f] + Rxx[:, :, f])) 366 | if eM == 1: 367 | Wf = ((inv - I)/( (cN[:,f,t] + cX[:, f, t] + 1e-16)/(cX[:,f,t] + 1e-16) + np.trace(inv) * gF)) 368 | else : 369 | Wf = ((inv - I)/(gF * np.trace(inv))) 370 | 371 | fout[:, f, t] = np.dot(Wf.T, self._mX[:, f, t]) 372 | 373 | self._Out = np.abs(fout) 374 | 375 | def applyMask(self): 376 | """ Compute the filtered output spectrogram. 377 | Args: 378 | mask: (2D ndarray) Array that contains time frequency gain values 379 | mX: (2D ndarray) Input Magnitude Spectrogram 380 | Returns: 381 | Y: (2D ndarray) Filtered version of the Magnitude Spectrogram 382 | """ 383 | if self._method == 'expMask': 384 | self._Out = (self._mX ** self._alpha) ** self._mask 385 | else : 386 | self._Out = np.multiply(self._mask, self._mX) 387 | 388 | def applyReverseMask(self): 389 | """ Compute the filtered output spectrogram, reversing the gain values. 390 | Args: 391 | mask: (2D ndarray) Array that contains time frequency gain values 392 | mX: (2D ndarray) Input Magnitude Spectrogram 393 | Returns: 394 | Y: (2D ndarray) Filtered version of the Magnitude Spectrogram 395 | """ 396 | if self._method == 'expMask': 397 | raise ValueError('Cannot compute that using such masking method.') 398 | else : 399 | self._Out = np.multiply( (1. - self._mask), self._mX) 400 | 401 | def _IS(self, Xhat): 402 | """ Compute the Itakura-Saito distance between the observed magnitude spectrum 403 | and the estimated one. 404 | Args: 405 | mX : (2D ndarray) Input Magnitude Spectrogram 406 | Xhat : (2D ndarray) Estimated Magnitude Spectrogram 407 | Returns: 408 | dis : (float) Average Itakura-Saito distance 409 | """ 410 | r1 = (np.abs(self._mX)**self._alpha + self._eps) / (np.abs(Xhat) + self._eps) 411 | lg = np.log((np.abs(self._mX)**self._alpha + self._eps)) - np.log((np.abs(Xhat) + self._eps)) 412 | return np.mean(r1 - lg - 1.) 413 | 414 | def _dIS(self, Xhat): 415 | """ Computation of the first derivative of Itakura-Saito function. As appears in : 416 | Cedric Fevotte and Jerome Idier, "Algorithms for nonnegative matrix factorization 417 | with the beta-divergence", in CoRR, vol. abs/1010.1763, 2010. 418 | Args: 419 | mX : (2D ndarray) Input Magnitude Spectrogram 420 | Xhat : (2D ndarray) Estimated Magnitude Spectrogram 421 | Returns: 422 | dis' : (float) Average of first derivative of Itakura-Saito distance. 
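                   In closed form, as implemented below: dIS'(Xhat) = mean( |Xhat|^(-2) * (|Xhat| - |mX|^alpha) ).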
423 | """ 424 | dis = (np.abs(Xhat + self._eps) ** (-2.)) * (np.abs(Xhat) - np.abs(self._mX)**self._alpha) 425 | return (np.mean(dis)) 426 | 427 | if __name__ == "__main__": 428 | 429 | # Small test 430 | kSin = (0.5 * np.cos(np.arange(4096) * (1000.0 * (3.1415926 * 2.0) / 44100))) 431 | noise = (np.random.uniform(-0.25,0.25,4096)) 432 | # Noisy observation 433 | obs = (kSin + noise) 434 | 435 | kSinX = fft(kSin, 4096) 436 | noisX = fft(noise, 4096) 437 | obsX = fft(obs, 4096) 438 | 439 | # Wiener Case 440 | mask = FrequencyMasking(np.abs(obsX), np.abs(kSinX), [np.abs(noisX)], [], [], alpha = 2., method = 'alphaWiener') 441 | sinhat = mask() 442 | noisehat = mask(reverse = True) 443 | # Access the mask if needed 444 | ndmask = mask._mask -------------------------------------------------------------------------------- /QMF/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'S.I. Mimilakis' 3 | __copyright__ = 'MacSeNet' -------------------------------------------------------------------------------- /QMF/old_vers/IOMethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'S.I. Mimilakis' 3 | __copyright__ = 'MacSeNet' 4 | 5 | import os, subprocess, csv 6 | import numpy as np 7 | import wave as _wave 8 | from scipy.io.wavfile import write, read 9 | from sys import platform 10 | 11 | class AudioIO: 12 | """ Class for handling audio input/output operations. 13 | It supports reading and writing of various audio formats 14 | via 'audioRead' & 'audioWrite' methods. Moreover playback 15 | can be performed by using 'sound' method. For formats 16 | different than '.wav' a coder is needed. In this case 17 | libffmpeg is being used, where the absolute path of 18 | the static build should be given to the class variable. 19 | Finally, energy normalisation and anti-clipping methods 20 | are also covered in the last two methods. 21 | 22 | Basic Usage examples: 23 | Import the class : 24 | import IOMethods as IO 25 | -For loading wav files: 26 | x, fs = IO.AudioIO.wavRead('myWavFile.wav', mono = True) 27 | -In case that compressed files are about to be read specify 28 | the path to the libffmpeg library by changing the 'pathToffmpeg' 29 | variable and then type: 30 | x, fs = IO.AudioIO.audioRead() 31 | -For writing wav files: 32 | IO.AudioIO.audioWrite(x, fs, 16, 'myNewWavFile.wav', 'wav') 33 | 34 | -For listening wav files: 35 | IO.AudioIO.sound(x,fs) 36 | 37 | """ 38 | # Normalisation parameters for wavreading and writing 39 | normFact = {'int8' : (2**7) -1, 40 | 'int16': (2**15)-1, 41 | 'int24': (2**23)-1, 42 | 'int32': (2**31)-1, 43 | 'int64': (2**63)-1, 44 | 'float32': 1.0, 45 | 'float64': 1.0} 46 | 47 | # 'Silence' the bash output 48 | FNULL = open(os.devnull, 'w') 49 | 50 | # Absolute path needed here 51 | pathToffmpeg = '/home/mis/Documents/Python/Projects/SourceSeparation/MiscFiles' 52 | 53 | def __init__(self): 54 | pass 55 | 56 | @staticmethod 57 | def audioRead(fileName, mono=False, startSec=None, endSec=None): 58 | """ Function to load audio files such as *.mp3, *.au, *.wma & *.aiff. 59 | It first converts them to .wav and reads them with the methods below. 60 | Currently, it uses a static build of ffmpeg. 
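            A usage sketch (file name and segment bounds are illustrative):
                x, fs = AudioIO.audioRead('mySong.mp3', mono = True, startSec = 0.0, endSec = 10.0)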
61 | 62 | Args: 63 | fileName: (str) Absolute filename of WAV file 64 | mono: (bool) Switch if samples should be converted to mono 65 | startSec: (float) Segment start time in seconds (if None, segment starts at the beginning of the WAV file) 66 | endSec: (float) Segment end time in seconds (if None, segment ends at the end of the WAV file) 67 | Returns: 68 | samples: (np array) Audio samples (between [-1,1] 69 | (if stereo: numSamples x numChannels, 70 | if mono: numSamples) 71 | sampleRate: (float): Sampling frequency [Hz] 72 | """ 73 | 74 | # Get the absolute path 75 | fileName = os.path.abspath(fileName) 76 | 77 | # Linux 78 | if (platform == "linux") or (platform == "linux2"): 79 | convDict = { 80 | 'mp3':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 81 | + ' -i ' + fileName + ' ', -3], 82 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 83 | + ' -i ' + fileName + ' ', -2], 84 | 'wma':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 85 | + ' -i ' + fileName + ' ', -3], 86 | 'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') 87 | + ' -i ' + fileName + ' ', -4] 88 | } 89 | 90 | # MacOSX 91 | elif (platform == "darwin"): 92 | convDict = { 93 | 'mp3':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 94 | + ' -i ' + fileName + ' ', -3], 95 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 96 | + ' -i ' + fileName + ' ', -2], 97 | 'wma':[os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 98 | + ' -i ' + fileName + ' ', -3], 99 | 'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') 100 | + ' -i ' + fileName + ' ', -4] 101 | } 102 | # Add windows support! 103 | else : 104 | raise Exception('This OS is not supported.') 105 | 106 | # Construct 107 | 108 | if fileName[convDict['mp3'][1]:] == 'mp3': 109 | print(fileName[convDict['mp3'][1]:]) 110 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['mp3'][1]] + 'wav')) 111 | subprocess.call(convDict['mp3'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 112 | samples, sampleRate = AudioIO.wavRead(modfileName, mono, startSec, endSec) 113 | os.remove(modfileName) 114 | 115 | elif fileName[convDict['au'][1]:] == 'au': 116 | print(fileName[convDict['au'][1]:]) 117 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['au'][1]] + 'wav')) 118 | subprocess.call(convDict['au'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 119 | samples, sampleRate = AudioIO.wavRead(modfileName, mono, startSec, endSec) 120 | os.remove(modfileName) 121 | 122 | elif fileName[convDict['wma'][1]:] == 'wma': 123 | print(fileName[convDict['wma'][1]:]) 124 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['wma'][1]] + 'wav')) 125 | subprocess.call(convDict['wma'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 126 | samples, sampleRate = AudioIO.wavRead(modfileName, mono, startSec, endSec) 127 | os.remove(modfileName) 128 | 129 | elif fileName[convDict['aiff'][1]:] == 'aiff': 130 | print(fileName[convDict['aiff'][1]:]) 131 | modfileName = os.path.join(os.path.abspath(fileName[:convDict['aiff'][1]] + 'wav')) 132 | subprocess.call(convDict['aiff'][0]+modfileName, shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 133 | samples, sampleRate = AudioIO.wavRead(modfileName, mono, startSec, endSec) 134 | os.remove(modfileName) 135 | 136 | else : 137 | raise Exception('This format is not supported.') 138 | 139 | return samples, sampleRate 140 | 141 | @staticmethod 142 | def audioWrite(y, fs, nbits, audioFile, 
format): 143 | """ Write samples to WAV file and then converts to selected 144 | format using ffmpeg. 145 | Args: 146 | samples: (ndarray / 2D ndarray) (floating point) sample vector 147 | mono: DIM: nSamples 148 | stereo: DIM: nSamples x nChannels 149 | 150 | fs: (int) Sample rate in Hz 151 | nBits: (int) Number of bits 152 | audioFile: (string) WAV file name to write 153 | format: (string) Selected format 154 | 'mp3' : Writes to .mp3 155 | 'wma' : Writes to .wma 156 | 'wav' : Writes to .wav 157 | 'aiff' : Writes to .aiff 158 | 'au' : Writes to .au 159 | """ 160 | 161 | # Linux 162 | if (platform == "linux") or (platform == "linux2"): 163 | convDict = { 164 | 'mp3': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -3], 165 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -2], 166 | 'wma': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -3], 167 | 'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_linux') + ' -i ', -4] 168 | } 169 | 170 | # MacOSX 171 | elif (platform == "darwin"): 172 | convDict = { 173 | 'mp3': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -3], 174 | 'au': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -2], 175 | 'wma': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -3], 176 | 'aiff': [os.path.join(AudioIO.pathToffmpeg, 'ffmpeg_osx') + ' -i ', -4] 177 | } 178 | 179 | # Add windows support! 180 | else : 181 | raise Exception('This OS is not supported.') 182 | 183 | if (format == 'mp3'): 184 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['mp3'][1]] + 'wav')) 185 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 186 | subprocess.call(convDict['mp3'][0] + prmfileName + ' ' + audioFile, 187 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 188 | os.remove(prmfileName) 189 | 190 | elif (format == 'wav'): 191 | AudioIO.wavWrite(y, fs, nbits, audioFile) 192 | 193 | elif (format == 'wma'): 194 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['wma'][1]] + 'wav')) 195 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 196 | subprocess.call(convDict['wma'][0] + prmfileName + ' ' + audioFile, 197 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 198 | os.remove(prmfileName) 199 | 200 | elif (format == 'aiff'): 201 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['aiff'][1]] + 'wav')) 202 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 203 | subprocess.call(convDict['aiff'][0] + prmfileName + ' ' + audioFile, 204 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 205 | os.remove(prmfileName) 206 | 207 | elif (format == 'au'): 208 | prmfileName = os.path.join(os.path.abspath(audioFile[:convDict['au'][1]] + 'wav')) 209 | AudioIO.wavWrite(y, fs, nbits, prmfileName) 210 | subprocess.call(convDict['au'][0] + prmfileName + ' ' + audioFile, 211 | shell = True, stdout=AudioIO.FNULL, stderr=subprocess.STDOUT) 212 | os.remove(prmfileName) 213 | else : 214 | raise Exception('This format is not supported.') 215 | 216 | @staticmethod 217 | def wavRead(fileName, mono=False, startSec=None, endSec=None): 218 | """ Function to load WAV file. 
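            Reading only a segment is also supported, e.g. (illustrative values):
                x, fs = AudioIO.wavRead('myWavFile.wav', mono = True, startSec = 1.0, endSec = 3.5)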
219 | 
220 |         Args:
221 |             fileName:   (str)   Absolute filename of WAV file
222 |             mono:       (bool)  Switch if samples should be converted to mono
223 |             startSec:   (float) Segment start time in seconds (if None, segment starts at the beginning of the WAV file)
224 |             endSec:     (float) Segment end time in seconds (if None, segment ends at the end of the WAV file)
225 |         Returns:
226 |             samples:    (np array)  Audio samples (between [-1, 1])
227 |                         (if stereo: numSamples x numChannels,
228 |                          if mono:   numSamples)
229 |             sampleRate: (float) Sampling frequency [Hz]
230 |         """
231 |         try:
232 |             samples, sampleRate = AudioIO._loadWAVWithWave(fileName)
233 |             sWidth = _wave.open(fileName).getsampwidth()
234 |             if sWidth == 1:
235 |                 #print('8bit case')
236 |                 samples = samples.astype(float) / AudioIO.normFact['int8'] - 1.0
237 |             elif sWidth == 2:
238 |                 #print('16bit case')
239 |                 samples = samples.astype(float) / AudioIO.normFact['int16']
240 |             elif sWidth == 3:
241 |                 #print('24bit case')
242 |                 samples = samples.astype(float) / AudioIO.normFact['int24']
243 |         except:
244 |             #print('32bit case')
245 |             samples, sampleRate = AudioIO._loadWAVWithScipy(fileName)
246 | 
247 |         # mono conversion
248 |         if mono:
249 |             if samples.ndim == 2 and samples.shape[1] > 1:
250 |                 samples = (samples[:, 0] + samples[:, 1])*0.5
251 | 
252 |         # segment selection
253 |         songLenSamples = samples.shape[0]
254 |         if startSec is None:
255 |             startIdx = 0
256 |         else:
257 |             startIdx = int(round(startSec*sampleRate))
258 |         if endSec is None:
259 |             endIdx = songLenSamples-1
260 |         else:
261 |             endIdx = int(round(endSec*sampleRate))
262 |         if startIdx < 0 or startIdx > songLenSamples:
263 |             raise Exception("Segment start sample index out of song boundaries!")
264 |         if endIdx < startIdx or endIdx > songLenSamples:
265 |             raise Exception("Segment end sample index out of song boundaries!")
266 |         if samples.ndim == 1:
267 |             samples = samples[startIdx:endIdx]
268 |         else:
269 |             samples = samples[startIdx:endIdx, :]
270 | 
271 |         return samples, sampleRate
272 | 
273 |     @staticmethod
274 |     def _loadWAVWithWave(fileName):
275 |         """ Load samples & sample rate from 24 bit WAV file """
276 |         wav = _wave.open(fileName)
277 |         rate = wav.getframerate()
278 |         nchannels = wav.getnchannels()
279 |         sampwidth = wav.getsampwidth()
280 |         nframes = wav.getnframes()
281 |         data = wav.readframes(nframes)
282 |         wav.close()
283 |         array = AudioIO._wav2array(nchannels, sampwidth, data)
284 | 
285 |         return array, rate
286 | 
287 |     @staticmethod
288 |     def _loadWAVWithScipy(fileName):
289 |         """ Load samples & sample rate from WAV file """
290 |         inputData = read(fileName)
291 |         samples = inputData[1]
292 |         sampleRate = inputData[0]
293 | 
294 |         return samples, sampleRate
295 | 
296 |     @staticmethod
297 |     def _wav2array(nchannels, sampwidth, data):
298 |         """data must be the string containing the bytes from the wav file."""
299 |         num_samples, remainder = divmod(len(data), sampwidth * nchannels)
300 |         if remainder > 0:
301 |             raise ValueError('The length of data is not a multiple of '
302 |                              'sampwidth * num_channels.')
303 |         if sampwidth > 4:
304 |             raise ValueError("sampwidth must not be greater than 4.")
305 | 
306 |         if sampwidth == 3:
307 |             a = np.empty((num_samples, nchannels, 4), dtype = np.uint8)
308 |             raw_bytes = np.fromstring(data, dtype = np.uint8)
309 |             a[:, :, :sampwidth] = raw_bytes.reshape(-1, nchannels, sampwidth)
310 |             a[:, :, sampwidth:] = (a[:, :, sampwidth - 1:sampwidth] >> 7) * 255
311 |             result = a.view('<i4').reshape(a.shape[:-1])
312 |         else:
313 |             # 8 bit samples are stored as unsigned ints; others as signed ints.
314 |             dt_char = 'u' if sampwidth == 1 else 'i'
315 |             a = np.fromstring(data, dtype = '<%s%d' % (dt_char, sampwidth))
316 |             result = a.reshape(-1, nchannels)
317 | 
318 |         return result
319 | 
320 |     @staticmethod
321 |     def wavWrite(y, fs, nbits, audioFile):
322 |         """ Write (floating point) samples to a WAV file.
323 |         Args:
324 |             y:         (ndarray / 2D ndarray) sample vector
325 |                        mono:   DIM: nSamples
326 |                        stereo: DIM: nSamples x nChannels
327 | 
328 |             fs:        (int) Sample rate in Hz
329 |             nbits:     (int) Number of bits
330 |             audioFile: (string) WAV file name to write
331 |         """
332 |         if nbits == 8:
333 |             intsamples = (y + 1.0) * AudioIO.normFact['int' + str(nbits)]
334 |             fX = np.int8(intsamples)
335 |         elif nbits == 16:
336 |             fX = np.int16(y * AudioIO.normFact['int' + str(nbits)])
337 |         elif nbits > 16:
338 |             fX = y
339 | 
340 |         write(audioFile, fs, fX)
341 | 
342 |     @staticmethod
343 |     def sound(x,fs):
344 |         """ Plays back the given samples using the pyglet library; they are first written to a temporary wav file.
345 |         Args:
346 |             x:  (array) Floating point samples
347 |             fs: (int)   The sampling rate
348 |         """
349 |         import pyglet as pg
350 |         # Call the writing function
351 |         AudioIO.wavWrite(x, fs, 16, 'testPlayback.wav')
352 |         # Initialize the playback object with the audio file
353 |         playback = pg.media.load('testPlayback.wav')
354 |         # Sound call
355 |         playback.play()
356 |         # Remove the dummy wave write
357 |         os.remove('testPlayback.wav')
358 | 
359 |     @staticmethod
360 |     def energyNormalisation(x1, x2, wsz = 1024):
361 |         """ Function to perform energy normalisation of two audio signals,
362 |             based on envelopes acquired by Hilbert transformation.
363 | 
364 |         Args:
365 |             x1  : (np array) First input signal
366 |             x2  : (np array) Second input signal
367 |             wsz : (int) Number of samples to take into account for the
368 |                         computation of the analytic function. If set
369 |                         to zero the whole signal will be analysed
370 |                         at once.
371 |         Returns:
372 |             y1  : (np array) Energy normalised output signal
373 |             y2  : (np array) Energy normalised output signal
374 |         """
375 |         x1.shape = (len(x1), 1)
376 |         x2.shape = (len(x2), 1)
377 | 
378 |         if wsz == 0:
379 |             xa1 = AF.HilbertTransformation(x1, mode = 'global', wsz = wsz)  # 'AF': external Hilbert-transform helper, not included in this folder
380 |             xa2 = AF.HilbertTransformation(x2, mode = 'global', wsz = wsz)
381 | 
382 |             energy1 = np.mean(np.abs(xa1) ** 2.0)
383 |             energy2 = np.mean(np.abs(xa2) ** 2.0)
384 | 
385 |             if energy1 > energy2:
386 |                 rt = energy1/energy2
387 |                 y2 = x2 * rt
388 |                 y1 = x1
389 |             else :
390 |                 rt = energy2/energy1
391 |                 y1 = x1 * rt
392 |                 y2 = x2
393 | 
394 |             y1 = AudioIO.twoSideClip(y1, -1.0, 1.0)
395 |             y2 = AudioIO.twoSideClip(y2, -1.0, 1.0)
396 | 
397 |         else:
398 | 
399 |             if len(x1) > len(x2):
400 |                 x1 = np.append(x1, np.zeros(len(x1)%wsz))
401 |                 x2 = np.append(x2, np.zeros(len(x1) - len(x2)))
402 |             else:
403 |                 x2 = np.append(x2, np.zeros(len(x2)%wsz))
404 |                 x1 = np.append(x1, np.zeros(len(x2) - len(x1)))
405 | 
406 |             xa1 = AF.HilbertTransformation(x1, mode = 'local', wsz = wsz)
407 |             xa2 = AF.HilbertTransformation(x2, mode = 'local', wsz = wsz)
408 | 
409 |             y1 = np.empty(len(x1))
410 |             y2 = np.empty(len(x2))
411 | 
412 |             energy1 = np.abs(xa1)
413 |             energy2 = np.abs(xa2)
414 | 
415 |             pin = 0
416 |             pend = len(x1) - wsz
417 | 
418 |             while pin <= pend :
419 | 
420 |                 lclE1 = np.mean(energy1[pin : pin + wsz])
421 |                 lclE2 = np.mean(energy2[pin : pin + wsz])
422 | 
423 |                 if (lclE1 > lclE2) and (lclE1 > 1e-4) and (lclE2 > 1e-4):
424 |                     rt = lclE1/lclE2
425 |                     bufferY2 = x2[pin : pin + wsz] * rt
426 |                     bufferY1 = x1[pin : pin + wsz]
427 | 
428 |                 elif (lclE1 < lclE2) and (lclE1 > 1e-4) and (lclE2 > 1e-4):
429 |                     rt = lclE2/lclE1
430 |                     bufferY1 = x1[pin : pin + wsz] * rt
431 |                     bufferY2 = x2[pin : pin + wsz]
432 | 
433 |                 else:
434 |                     bufferY1 = x1[pin : pin + wsz]
435 |                     bufferY2 = x2[pin : pin + wsz]
436 | 
437 |                 y1[pin : pin + wsz] = AudioIO.twoSideClip(bufferY1, -1.0, 1.0)
438 |                 y2[pin : pin + wsz] = AudioIO.twoSideClip(bufferY2, -1.0, 1.0)
439 | 
440 |                 pin += wsz
441 | 
442 |         y1.shape = (len(y1),1)
443 |         y2.shape = (len(y2),1)
444 |         return y1, y2
445 | 
446 |     @staticmethod
447 |     def twoSideClip(x, minimum, maximum):
448 |         """ Method to limit an input array inside a given
449 |             range.
450 |         Args:
451 |             x :       (np array) Input array to be limited
452 |             minimum : (int) Minimum value to be considered for clipping.
453 |             maximum : (int) Maximum value to be considered for clipping.
454 | 
455 |         Returns:
456 |             x :       (np array) Limited output array
457 |         """
458 | 
459 |         for indx in range(len(x)):
460 |             if x[indx] < minimum:
461 |                 x[indx] = minimum
462 |             elif x[indx] > maximum:
463 |                 x[indx] = maximum
464 | 
465 |         return x
466 | 
467 | if __name__ == "__main__":
468 |     # Define File
469 |     myReadFile = 'EnterYourWavFile.wav'
470 |     # Read the file
471 |     x, fs = AudioIO.wavRead(myReadFile, mono = True)
472 |     # Gain parameter
473 |     g = 0.5
474 |     # Listen to it
475 |     AudioIO.sound(x*g,fs)
476 |     # Make a pseudo-stereo version and write it to disk
477 |     x2 = np.empty((len(x),2), dtype = np.float32)
478 |     try :
479 |         x2[:,0] = x * g
480 |         x2[:,1] = np.roll(x*g, 512)
481 |     except ValueError:
482 |         x2[:,0] = x[:,0] * g
483 |         x2[:,1] = np.roll(x[:,0] * g, 256)
484 |     # Listen to the stereo processed version
485 |     AudioIO.sound(x2*g,fs)
486 |     AudioIO.audioWrite(x2, fs, 16, 'myNewWavFile.wav', 'wav')
487 | 
488 | 
--------------------------------------------------------------------------------
/QMF/old_vers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'S.I. Mimilakis'
3 | __copyright__ = 'MacSeNet'
--------------------------------------------------------------------------------
/QMF/old_vers/foldingmat_filterbank.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.fftpack as spfft
3 | 
4 | def ha2Pa3d(ha,N):
5 |     #usage: Pa=ha2Pa3d(ha,N);
6 |     #produces the analysis polyphase matrix Pa
7 |     #in 3D matrix representation
8 |     #from a baseband filter ha with
9 |     #a cosine modulation
10 |     #N: Blocklength
11 |     #Gerald Schuller
12 |     #shl@idmt.fhg.de
13 |     #Dec-2-15
14 | 
15 |     import numpy as np
16 | 
17 |     L=len(ha);
18 | 
19 |     blocks=int(np.ceil(L/N));
20 |     #print(blocks)
21 | 
22 |     Pa=np.zeros((N,N,blocks));
23 | 
24 |     for k in range(N): #subband
25 |         for m in range(blocks): #m: block number
26 |             for nphase in range(N): #nphase: Phase
27 |                 n=m*N+nphase;
28 |                 #indexing like impulse response, phase index is reversed (N-np):
29 |                 Pa[N-1-nphase,k,m]=ha[n]*np.sqrt(2.0/N)*np.cos(np.pi/N*(k+0.5)*(blocks*N-1-n-N/2.0+0.5));
30 | 
31 |     return Pa
32 | 
33 | def hs2Ps3d(hs,N):
34 |     #usage: Ps=hs2Ps3d(hs,N);
35 |     #produces the synthesis polyphase matrix Ps
36 |     #in 3D matrix representation
37 |     #from a baseband filter hs with
38 |     #a cosine modulation
39 |     #N: Blocklength
40 |     #Gerald Schuller
41 |     #shl@idmt.fhg.de
42 |     #Dec-2-15
43 | 
44 |     import numpy as np
45 | 
46 |     L=len(hs);
47 | 
48 |     blocks=int(np.ceil(L/N));
49 |     #print(blocks)
50 | 
51 |     Ps=np.zeros((N,N,blocks));
52 | 
53 |     for k in range(N): #subband
54 |         for m in range(blocks): #m: block number
55 |             for nphase in range(N): #nphase: Phase
56 |                 n=m*N+nphase;
57 |                 #synthesis:
58 |                 Ps[k,nphase,m]=hs[n]*np.sqrt(2.0/N)*np.cos(np.pi/N*(k+0.5)*(n-N/2.0+0.5));
59 | 
60 |     return Ps
61 | 
62 | def ha2Fa3d(ha,N):
63 |     #usage: Fa=ha2Fa3d(ha,N);
64 |     #produces the analysis polyphase folding matrix Fa with all polyphase components
65 |     #in 3D matrix representation
66 |     #from a baseband filter ha with
67 |     #a cosine modulation
68 |     #N: Blocklength
69 |     #Gerald Schuller
70 |     #shl@idmt.fhg.de
71 |     #Dec-2-15
72 | 
73 |     Pa=ha2Pa3d(ha,N);
74 |     Fa=polmatmult(Pa,DCToMatrix(N))
75 |     #round the polyphase components to 8 decimals after the point:
76 |     Fa=np.around(Fa,8)
77 | 
78 |     return Fa
79 | 
80 | def hs2Fs3d(hs,N):
81 |     #usage: Fs=hs2Fs3d(hs,N);
82 |     #produces the synthesis polyphase folding matrix Fs with all polyphase components
83 |     #in 3D matrix representation
84 |     #from a baseband filter hs with
85 |     #a cosine modulation
86 |     #N: Blocklength
87 |     #Gerald Schuller
88 |     #shl@idmt.fhg.de
89 |     #Dec-2-15
90 | 
91 |     Ps=hs2Ps3d(hs,N);
92 |     Fs=polmatmult(DCToMatrix(N),Ps)
93 |     #round the polyphase components to 8 decimals after the point:
94 |     Fs=np.around(Fs,8)
95 | 
96 |     return Fs
97 | 
98 | def DCToMatrix(N):
99 |     #produces an odd DCT matrix with size NxN
100 |     #Gerald Schuller, Dec. 2015
101 | 
102 |     import numpy as np
103 | 
104 |     y=np.zeros((N,N,1));
105 | 
106 |     for n in range(N):
107 |         for k in range(N):
108 |             y[n,k,0]=np.sqrt(2.0/N)*np.cos(np.pi/N*(k+0.5)*(n+0.5));
109 |             #y(n,k)=cos(pi/N*(k-0.5)*(n-1));
110 |     return y
111 | 
112 | def polmatmult(A,B):
113 |     #function C=polmatmult(A,B)
114 |     #multiplies 2 polynomial matrices A and B, where each matrix entry is a polynomial, e.g. in z^-1.
115 |     #Those polynomial entries are in the 3rd dimension
116 |     #The third dimension can also be interpreted as containing the (2D) coefficient matrices for each
117 |     #exponent of z^-1.
118 |     #Result is C=A*B;
119 | 
120 |     import numpy as np
121 |     from scipy import sparse
122 | 
123 |     [NAx,NAy,NAz]=A.shape;
124 |     [NBx,NBy,NBz]=B.shape;
125 | 
126 |     #Degree +1 of the resulting polynomial, with NAz-1 and NBz-1 being the degrees of the input polynomials:
127 |     Deg=NAz+NBz-1;
128 | 
129 |     C=np.zeros((NAx,NBy,Deg));
130 | 
131 |     for n in range(Deg):
132 |         for m in range(n+1):
133 |             if ((n-m)<NAz) and (m<NBz):
134 |                 C[:,:,n]=C[:,:,n]+np.dot(A[:,:,(n-m)],B[:,:,m]);
135 | 
136 |     return C
>= 0 ]))
121 | 
122 |     if option == 'matplotlib' :
123 |         # Matplotlib
124 |         line.set_ydata(20. * np.log10(b_mX[0, :-1] + 1e-16))
125 |         # Check for scaling the noise!
126 |         line2.set_ydata(20. * np.log10(mt[0, :-1] + 1e-16))
127 |         plt.draw()
128 |         plt.pause(0.00001)
129 | 
130 |     else :
131 |         # Pygame
132 |         screen.fill(background_color)
133 |         prv_pos = (60, 480)
134 |         prv_pos2 = (60, 480)
135 |         prv_pos3 = (60, 480)
136 |         for n in xrange(0, wsz):
137 |             val = 20. * np.log10(b_mX[0, n] + 1e-16)
138 |             val2 = 20. * np.log10(mt[0, n] + 1e-16)
139 |             val3 = 20.
* np.log10(b_nX[0, n] * mt[0, n] + 1e-16) 140 | val3 /= -120 141 | val /= -120 142 | val2/= -120 143 | val *= 480 144 | val2 *= 480 145 | val3 *= 480 146 | position = (n + 60, int(val)) 147 | position2 = (n + 60, int(val2)) 148 | position3 = (n + 60, int(val3)) 149 | pygame.draw.line(screen, color, prv_pos, position) 150 | pygame.draw.line(screen, color2, prv_pos2, position2) 151 | pygame.draw.line(screen, color3, prv_pos3, position3) 152 | prv_pos = position 153 | prv_pos2 = position2 154 | prv_pos3 = position3 155 | 156 | # Print the surface 157 | screen.blit(xlabel, (895, 460)) 158 | screen.blit(ylabel, (0, 5)) 159 | screen.blit(legendA, (800, 0)) 160 | screen.blit(legendB, (800, 15)) 161 | offset = font.render("Masking Threshold Offset in dB: " + str(gain), 1, (0, 250, 0)) 162 | bpc = font.render("Average bits per subband: " + str(bc), 1, (190, 160, 110)) 163 | helptext = font.render("(Adjust the threshold by pressing 'Up' & 'Down' Arrow keys)", 1, (0, 250, 0)) 164 | screen.blit(offset, (300, 0)) 165 | screen.blit(helptext, (300, 15)) 166 | screen.blit(bpc, (300, 30)) 167 | 168 | for n2 in xrange(len(dBpos)): 169 | dB = font.render(str(np.int(dBScales[n2])), 1, (0,120,120)) 170 | screen.blit(dB, (20, int(dBpos[n2]))) 171 | 172 | # Display 173 | pygame.display.flip() 174 | 175 | # Acquire Segment 176 | xSeg = x[pin:pin+wsz, 0] 177 | nSeg = noise[pin:pin+wsz, 0] 178 | 179 | # Perform DFT on segment 180 | mX, pX = TF.TimeFrequencyDecomposition.DFT(xSeg, w, N) 181 | nX, npX = TF.TimeFrequencyDecomposition.DFT(nSeg, w, N) 182 | 183 | # Set it to buffer 184 | b_mX[0, :] = mX 185 | b_nX[0, :] = nX 186 | # Masking threshold 187 | mt = pm.maskingThreshold(b_mX) * (10**(gain/20.)) 188 | 189 | # Resynthesize 190 | nSeg = TF.TimeFrequencyDecomposition.iDFT(nX * mt[0, :], npX, wsz) 191 | xSeg = TF.TimeFrequencyDecomposition.iDFT(mX, pX, wsz) 192 | mix = (xSeg + nSeg) * hop 193 | 194 | ola_buffer[0, 0:wsz] = prv_seg 195 | ola_buffer[0, hop:hop+wsz] += mix 196 | 197 | # Place it to output buffer 198 | output_buffer = ola_buffer[:, :wsz] 199 | 200 | # Playback 201 | writedata = output_buffer[0, :].astype(np.float32).tostring() 202 | stream.write(writedata, num_frames = wsz/2, exception_on_underflow = False) 203 | 204 | # Store previous frame samples 205 | prv_seg = ola_buffer[:, hop:wsz+hop] 206 | 207 | # Clear the overlap 208 | ola_buffer = np.zeros((1, wsz+hop), dtype = np.float32) 209 | 210 | # Update pointer and index 211 | pin += hop 212 | indx += 1 213 | 214 | plt.close() 215 | stream.stop_stream() 216 | stream.close() 217 | p.terminate() 218 | pygame.display.quit() 219 | pygame.quit() 220 | -------------------------------------------------------------------------------- /testFiles/pulse.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/Audio-Masking-Methods/64782c3ff9564b06677c2a2112b2b2bdb48dc2a1/testFiles/pulse.wav -------------------------------------------------------------------------------- /testFiles/ramp.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/Audio-Masking-Methods/64782c3ff9564b06677c2a2112b2b2bdb48dc2a1/testFiles/ramp.wav -------------------------------------------------------------------------------- /testFiles/sc03_16m.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ashishpatel26/Audio-Masking-Methods/64782c3ff9564b06677c2a2112b2b2bdb48dc2a1/testFiles/sc03_16m.wav --------------------------------------------------------------------------------