# ├── requirements.txt ├── .gitattributes ├── data ├── 000001.wav └── 000002.wav ├── hparams.py ├── readme.md ├── LICENSE ├── LWS.py ├── test.py ├── .gitignore ├── griffin_lim.py └── utils └── audio.py /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=1.3.0 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /data/000001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/candlewill/Griffin_lim/HEAD/data/000001.wav -------------------------------------------------------------------------------- /data/000002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/candlewill/Griffin_lim/HEAD/data/000002.wav -------------------------------------------------------------------------------- /hparams.py: --------------------------------------------------------------------------------
import tensorflow as tf

# Default hyperparameters:
hparams = tf.contrib.training.HParams(
    # Audio:
    num_mels=80,
    num_freq=513,
    sample_rate=22050,
    frame_length_ms=50,
    frame_shift_ms=12.5,
    preemphasis=0.97,
    min_level_db=-100,
    ref_level_db=20,
    max_abs_value=4,
    power=1.5,
    fft_size=1024,
    hop_size=256,

    # Eval:
    griffin_lim_iters=60
)


def hparams_debug_string():
    """Return a printable listing of every hyperparameter.

    The result is a 'Hyperparameters:' header followed by one
    ' name: value' line per entry of ``hparams.values()``, sorted by name.
    """
    values = hparams.values()
    lines = []
    for name in sorted(values):
        lines.append(' %s: %s' % (name, values[name]))
    return 'Hyperparameters:\n' + '\n'.join(lines)
# -------------------------------------------------------------------------------- /readme.md:
-------------------------------------------------------------------------------- 1 | # Griffin Lim 2 | 3 | This repo is a TensorFlow implementation of the Griffin-Lim algorithm for voice reconstruction. 4 | 5 | For comparison, a librosa implementation is also included in this repo. 6 | 7 | ### Dependencies 8 | 9 | * TensorFlow >=1.3 10 | * Python 3.x 11 | * scipy 12 | * numpy 13 | * librosa 14 | 15 | ### Run 16 | To test the performance, run the following command: 17 | ``` 18 | python test.py 19 | ``` 20 | 21 | The `test.py` script does the following things in order: 22 | 1. Extract the spectrogram features from the wave files 23 | 2. Reconstruct the voice using the Griffin-Lim algorithm 24 | 25 | ### Data 26 | All data used is in the `data` folder. To use other data, replace the files in that folder. 27 | 28 | ### Acknowledgement 29 | Some code is borrowed from the following repos: 30 | 31 | * [Kyubyong/tensorflow-exercises](https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb) 32 | * [keithito/tacotron](https://github.com/keithito/tacotron) 33 | 34 | ### Contact 35 | [Yunchao He](mailto:yunchaohe@gmail.com) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 heyunchao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
# 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /LWS.py: --------------------------------------------------------------------------------
# encoding: utf-8
from utils import audio
from hparams import hparams
import numpy as np
import os
import lws


def main():
    """Reconstruct every input wav in ``data`` from its magnitude STFT with LWS.

    For each ``<name>.wav`` the script computes the STFT, discards the phase,
    runs ``run_lws`` on the magnitudes, prints the consistency before and
    after, and writes the result to ``<name>.lws.gen.wav`` in the same folder.

    Files ending in ``.gen.wav`` are skipped: outputs are written next to the
    inputs, so without the filter a second run would treat earlier outputs as
    new inputs.
    """
    data_folder = "data"
    wav_suffix = ".wav"
    # Sort so that processing order does not depend on os.listdir ordering.
    stems = sorted(
        os.path.join(data_folder, name[:-len(wav_suffix)])
        for name in os.listdir(data_folder)
        if name.endswith(wav_suffix) and not name.endswith(".gen.wav")
    )
    output_paths = [stem + ".lws.gen.wav" for stem in stems]
    waveforms = [audio.load_wav(stem + wav_suffix, hparams.sample_rate) for stem in stems]

    lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift
    for x, output_path in zip(waveforms, output_paths):
        X = lws_processor.stft(x)  # where x is a single-channel waveform
        X0 = np.abs(X)  # Magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))
        # Reconstruction from magnitude (in general, one can reconstruct from
        # an initial complex spectrogram).
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))
        wav = lws_processor.istft(X1).astype(np.float32)

        audio.save_wav(wav, output_path)


if __name__ == '__main__':
    main()
# -------------------------------------------------------------------------------- /test.py:
# --------------------------------------------------------------------------------
# encoding: utf-8
from utils import audio
from hparams import hparams
import numpy as np
from griffin_lim import inv_spectrogram, tf
import os


def main():
    """Round-trip every input wav in ``data`` through both Griffin-Lim versions.

    Steps:
      1. Extract the normalized linear spectrogram of each wav.
      2. Invert it with the librosa implementation -> ``<name>.py.gen.wav``.
      3. Invert it with the TensorFlow implementation -> ``<name>.tf.gen.wav``.

    Files ending in ``.gen.wav`` are skipped: outputs are written next to the
    inputs, so without the filter a second run would treat earlier outputs as
    new inputs.
    """
    data_folder = "data"
    wav_suffix = ".wav"
    # Sort so that processing order (and which shape gets printed) does not
    # depend on os.listdir ordering.
    stems = sorted(
        os.path.join(data_folder, name[:-len(wav_suffix)])
        for name in os.listdir(data_folder)
        if name.endswith(wav_suffix) and not name.endswith(".gen.wav")
    )
    if not stems:
        # Previously this case crashed with IndexError on spectrogram[0].
        print("No input .wav files found in '%s'" % data_folder)
        return

    outputs_py = [stem + ".py.gen.wav" for stem in stems]
    outputs_tf = [stem + ".tf.gen.wav" for stem in stems]
    wavs = [audio.load_wav(stem + wav_suffix, hparams.sample_rate) for stem in stems]
    spectrograms = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print("Linear spectrograms dim: ")
    print(spectrograms[0].shape)

    # --------------------------------- librosa Version ---------------------------------
    # convert back
    gens = [audio.inv_spectrogram(spec) for spec in spectrograms]
    for gen, output in zip(gens, outputs_py):
        audio.save_wav(gen, output)

    # --------------------------------- TensorFlow Version ---------------------------------
    # Build one graph op per spectrogram, then evaluate them in one session.
    sample_ops = [inv_spectrogram(spec) for spec in spectrograms]
    with tf.Session() as sess:
        samples = [sess.run(op) for op in sample_ops]

    for gen, output in zip(samples, outputs_tf):
        audio.save_wav(gen, output)

    print("Done!")


if __name__ == '__main__':
    main()
# -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | #
# Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | .venv/ 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | __pycache__ 93 | .idea 94 | data 95 | -------------------------------------------------------------------------------- /griffin_lim.py: --------------------------------------------------------------------------------
# encoding: utf-8
"""
TensorFlow implementation of Griffin-lim Algorithm for voice reconstruction
"""
import tensorflow as tf
from hparams import hparams


def _default_hop_length():
    """Return the STFT hop in samples, resolved like ``utils.audio.get_hop_size``.

    ``hparams.hop_size`` wins when set; otherwise the hop is derived from
    ``hparams.frame_shift_ms`` and ``hparams.sample_rate``.
    """
    hop_size = hparams.hop_size
    if hop_size is None:
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size


# TF
def spectrogram2wav(spectrogram, n_iter=hparams.griffin_lim_iters, n_fft=(hparams.num_freq - 1) * 2,
                    win_length=None, hop_length=None):
    '''Converts a magnitude spectrogram into a waveform using Griffin-Lim.

    Args:
      spectrogram: magnitude spectrogram; it is transposed on entry, so the
        input is expected as [f, t] and is processed as [t, f].
      n_iter: number of phase-estimation iterations.
      n_fft: FFT size.
      win_length: STFT frame length in samples. Defaults to ``n_fft``.
      hop_length: STFT hop in samples. Defaults to ``hparams.hop_size``
        (falling back to ``hparams.frame_shift_ms``).

    Returns:
      A 1-D real tensor holding the reconstructed waveform.

    The defaults used to be derived from ``frame_length_ms`` and
    ``frame_shift_ms`` (1102 and 275 samples with the default hparams), while
    ``utils/audio.py`` analyses with ``fft_size`` and ``hop_size`` (1024 and
    256). Inverting with a different framing than the analysis does not
    reproduce the analysed signal, so the defaults now follow the analysis.
    Explicitly passed values behave exactly as before.

    NOTE(review): ``inverse_stft`` is called with its default window; TF
    versions that provide ``inverse_stft_window_fn`` may give a cleaner
    overlap-add -- confirm against the minimum supported TF version.
    '''
    if win_length is None:
        win_length = n_fft
    if hop_length is None:
        hop_length = _default_hop_length()

    def invert_spectrogram(spec):
        '''
        spec: [t, f] complex spectrogram -> 1-D waveform
        '''
        batched = tf.expand_dims(spec, 0)  # inverse_stft is fed a leading batch axis
        inversed = tf.contrib.signal.inverse_stft(batched, win_length, hop_length, n_fft)
        return tf.squeeze(inversed, 0)

    spectrogram = tf.transpose(spectrogram)
    spectrogram = tf.cast(spectrogram, dtype=tf.complex64)  # [t, f]

    X_best = tf.identity(spectrogram)
    for _ in range(n_iter):
        X_t = invert_spectrogram(X_best)
        est = tf.contrib.signal.stft(X_t, win_length, hop_length, n_fft, pad_end=False)  # [t, f]
        # Keep only the phase of the re-analysed signal; the 1e-8 floor avoids
        # dividing by zero in silent bins.
        phase = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)  # [t, f]
        X_best = spectrogram * phase  # [t, f]

    # inverse_stft already yields a real signal, so no tf.real() is needed.
    return invert_spectrogram(X_best)


def inv_spectrogram(spectrogram):
    '''Converts a normalized dB spectrogram back into a waveform tensor.'''
    S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db)  # Convert back to linear
    return _inv_preemphasis(spectrogram2wav(S ** hparams.power))  # Reconstruct phase


def _denormalize(D):
    '''Maps values in [-max_abs_value, max_abs_value] back to dB in [min_level_db, 0].

    Inputs outside that range are clipped first.
    '''
    return (((tf.clip_by_value(D, -hparams.max_abs_value,
                               hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (
                     2 * hparams.max_abs_value)) + hparams.min_level_db)


def _db_to_amp(x):
    '''Converts decibels to linear amplitude: 10 ** (x / 20).'''
    # The scalar base broadcasts against x; no need to build a ones tensor.
    return tf.pow(10.0, x * 0.05)


def _inv_preemphasis(x):
    '''Undoes pre-emphasis: y[n] = x[n] + preemphasis * y[n - 1], with y[-1] = 0.

    The previous implementation accumulated shifted copies of ``x`` in a
    ``tf.while_loop``, which computes the same recurrence but costs O(N^2) for
    N samples. ``tf.scan`` evaluates it in a single O(N) pass.
    '''
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    return tf.scan(
        lambda previous, sample: sample + hparams.preemphasis * previous,
        x,
        initializer=tf.zeros([], dtype=tf.float32),
    )
# -------------------------------------------------------------------------------- /utils/audio.py:
# --------------------------------------------------------------------------------
import librosa
import librosa.filters
import numpy as np
from scipy import signal

from scipy.io import wavfile

from hparams import hparams


def load_wav(path, sr):
    """Load the audio file at `path` with sampling rate `sr`; return the waveform only."""
    return librosa.core.load(path, sr=sr)[0]


def save_wav(wav, path):
    """Peak-normalize `wav` to the int16 range and write it to `path`.

    The file is written at ``hparams.sample_rate``. The 0.01 floor on the peak
    bounds the gain applied to near-silent input. The caller's array is left
    untouched (it used to be scaled in place).
    """
    wav = np.asarray(wav)
    scale = 32767 / max(0.01, float(np.max(np.abs(wav))))
    wavfile.write(path, hparams.sample_rate, (wav * scale).astype(np.int16))


def spectrogram(y):
    """Return the normalized linear spectrogram (dB relative to ref_level_db) of `y`."""
    D = _stft(_preemphasis(y))
    S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
    return _normalize(S)


def inv_spectrogram(spectrogram):
    """Reconstruct a waveform from a normalized linear spectrogram."""
    S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db)  # Convert back to linear
    return _inv_preemphasis(_griffin_lim(S ** hparams.power))  # Reconstruct phase


def melspectrogram(y):
    """Return the normalized mel spectrogram (in dB) of `y`."""
    D = _stft(_preemphasis(y))
    S = _amp_to_db(_linear_to_mel(np.abs(D)))
    return _normalize(S)


def inv_melspectrogram(melspectrogram):
    """Reconstruct a waveform from a normalized mel spectrogram."""
    S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram)))  # Convert back to linear
    # Use the configured exponent (was a hard-coded 1.5), as inv_spectrogram does.
    return _inv_preemphasis(_griffin_lim(S ** hparams.power))  # Reconstruct phase


# Based on https://github.com/librosa/librosa/issues/434
def _griffin_lim(S):
    """Estimate a waveform for magnitude spectrogram `S` with Griffin-Lim.

    Starts from random phases and performs ``hparams.griffin_lim_iters``
    inversions in total, re-estimating the phase from the previous waveform
    before each inversion after the first. At least one inversion is always
    performed, so a setting of 0 no longer fails with an unbound ``y``.
    """
    # np.complex was removed in NumPy 1.24; complex128 is the type it aliased.
    S_complex = np.abs(S).astype(np.complex128)
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    y = _istft(S_complex * angles)
    for _ in range(hparams.griffin_lim_iters - 1):
        angles = np.exp(1j * np.angle(_stft(y)))
        y = _istft(S_complex * angles)
    return y


def _stft(y):
    """STFT of `y` with n_fft=hparams.fft_size and hop get_hop_size()."""
    return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size())


def _istft(y):
    """Inverse STFT of the complex spectrogram `y` with hop get_hop_size()."""
    return librosa.istft(y, hop_length=get_hop_size())


# Conversions:

# Lazily built caches for the mel filterbank and its pseudo-inverse.
_mel_basis = None
_inv_mel_basis = None


def _linear_to_mel(spectrogram):
    """Project a linear-frequency spectrogram onto the mel filterbank."""
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis()
    return np.dot(_mel_basis, spectrogram)


def _mel_to_linear(mel_spectrogram):
    """Approximate the linear spectrogram via the filterbank pseudo-inverse, floored at 1e-10."""
    global _inv_mel_basis
    if _inv_mel_basis is None:
        _inv_mel_basis = np.linalg.pinv(_build_mel_basis())
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))


def _build_mel_basis():
    """Build the mel filterbank for n_fft = (num_freq - 1) * 2."""
    n_fft = (hparams.num_freq - 1) * 2
    # Keyword arguments: newer librosa releases reject positional sr / n_fft,
    # and the keyword form is accepted by older ones as well.
    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=n_fft, n_mels=hparams.num_mels)


def _amp_to_db(x):
    """Convert linear amplitude to dB, flooring the amplitude at 1e-5 (-100 dB)."""
    return 20 * np.log10(np.maximum(1e-5, x))


def _db_to_amp(x):
    """Convert dB to linear amplitude: 10 ** (x / 20)."""
    return np.power(10.0, x * 0.05)


def _preemphasis(x):
    """Apply the FIR pre-emphasis filter y[n] = x[n] - preemphasis * x[n - 1]."""
    return signal.lfilter([1, -hparams.preemphasis], [1], x)


def _inv_preemphasis(x):
    """Apply the inverse (IIR) filter y[n] = x[n] + preemphasis * y[n - 1]."""
    return signal.lfilter([1], [1, -hparams.preemphasis], x)


def _normalize(S):
    """Map dB values from [min_level_db, 0] to [-max_abs_value, max_abs_value], clipping outliers."""
    return np.clip(
        (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
        -hparams.max_abs_value, hparams.max_abs_value)


def _denormalize(D):
    """Inverse of _normalize: map [-max_abs_value, max_abs_value] back to [min_level_db, 0] dB."""
    return (((np.clip(D, -hparams.max_abs_value,
                      hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (
                     2 * hparams.max_abs_value))
            + hparams.min_level_db)


def get_hop_size():
    """Return the STFT hop in samples.

    ``hparams.hop_size`` is used when set; otherwise the hop is derived from
    ``hparams.frame_shift_ms`` and ``hparams.sample_rate``.
    """
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size
# --------------------------------------------------------------------------------