├── requirements.txt
├── .gitattributes
├── data
    ├── 000001.wav
    └── 000002.wav
├── hparams.py
├── readme.md
├── LICENSE
├── LWS.py
├── test.py
├── .gitignore
├── griffin_lim.py
└── utils
    └── audio.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=1.3.0


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto


--------------------------------------------------------------------------------
/data/000001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/candlewill/Griffin_lim/HEAD/data/000001.wav


--------------------------------------------------------------------------------
/data/000002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/candlewill/Griffin_lim/HEAD/data/000002.wav


--------------------------------------------------------------------------------
/hparams.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
# Default hyperparameters:
# Shared settings object imported by utils/audio.py, griffin_lim.py, test.py and LWS.py.
hparams = tf.contrib.training.HParams(
    # Audio:
    num_mels=80,  # passed as n_mels when utils/audio.py builds the mel filterbank
    num_freq=513,  # linear frequency bins; n_fft is derived elsewhere as (num_freq - 1) * 2
    sample_rate=22050,  # rate used when loading and when writing wav files
    frame_length_ms=50,  # window length (ms) behind griffin_lim.spectrogram2wav's default win_length
    frame_shift_ms=12.5,  # hop (ms) for spectrogram2wav's default; fallback hop in utils/audio.py
    preemphasis=0.97,  # coefficient of the pre-emphasis / de-emphasis filters
    min_level_db=-100,  # dB floor used by the normalize / denormalize helpers
    ref_level_db=20,  # dB offset subtracted in spectrogram() and added back on inversion
    max_abs_value=4,  # normalized spectrograms are clipped to [-max_abs_value, max_abs_value]
    power=1.5,  # exponent applied to linear magnitudes before Griffin-Lim
    fft_size=1024,  # n_fft passed to librosa.stft in utils/audio.py
    hop_size=256,  # hop in samples for utils/audio.py; used instead of frame_shift_ms when not None

    # Eval:
    griffin_lim_iters=60  # number of Griffin-Lim iterations
)
22 | 
23 | 
def hparams_debug_string():
    """Return a multi-line, name-sorted listing of every hyperparameter and its value."""
    settings = hparams.values()
    lines = []
    for key in sorted(settings):
        lines.append('  %s: %s' % (key, settings[key]))
    return 'Hyperparameters:\n' + '\n'.join(lines)
28 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # Griffin Lim
 2 | 
 3 | This repo is a TensorFlow implementation of the Griffin-Lim algorithm for voice reconstruction.
 4 | 
 5 | For comparison, a librosa-based implementation is also included in this repo.
 6 | 
 7 | ### Dependencies
 8 | 
 9 | * TensorFlow  >=1.3
10 | * Python 3.x
11 | * scipy
12 | * numpy
13 | * librosa
14 | 
15 | ### Run
16 | To try out both implementations, run the following command:
17 | ```
18 | python test.py
19 | ```
20 | 
21 | The `test.py` script does the following things in order:
22 | 1. Extract the spectrogram features from the waveforms
23 | 2. Reconstruct the voice using the Griffin-Lim algorithm
24 | 
25 | ### Data
26 | All data used is in the `data` folder. To use other data, replace the `.wav` files in that folder.
27 | 
28 | ### Acknowledgement
29 | Some code is borrowed from the following repos:
30 | 
31 | * [Kyubyong/tensorflow-exercises](https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb)
32 | * [keithito/tacotron](https://github.com/keithito/tacotron)
33 | 
34 | ### Contact
35 | [Yunchao He](mailto:yunchaohe@gmail.com)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 heyunchao
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/LWS.py:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | from utils import audio
 3 | from hparams import hparams
 4 | import numpy as np
 5 | import os
 6 | import lws
 7 | 
 8 | 
 9 | def main():
10 |     data_foler = "data"
11 |     wavs = [os.path.join(data_foler, file[:-4]) for file in os.listdir(data_foler) if file.endswith(".wav")]
12 |     outputs_lws = [file + ".lws.gen.wav" for file in wavs]
13 |     wavs = [audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs]
14 | 
15 |     lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift
16 |     i = 0
17 |     for x in wavs:
18 |         X = lws_processor.stft(x)  # where x is a single-channel waveform
19 |         X0 = np.abs(X)  # Magnitude spectrogram
20 |         print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))
21 |         X1 = lws_processor.run_lws(
22 |             X0)  # reconstruction from magnitude (in general, one can reconstruct from an initial complex spectrogram)
23 |         print(X1.shape)
24 |         print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))
25 |         print(X1.shape)
26 |         wav = lws_processor.istft(X1).astype(np.float32)
27 | 
28 |         audio.save_wav(wav, outputs_lws[i])
29 |         i += 1
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     main()
34 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | from utils import audio
 3 | from hparams import hparams
 4 | import numpy as np
 5 | from griffin_lim import inv_spectrogram, tf
 6 | import os
 7 | 
 8 | if __name__ == '__main__':
 9 |     data_foler = "data"
10 |     wavs = [os.path.join(data_foler, file[:-4]) for file in os.listdir(data_foler) if file.endswith(".wav")]
11 |     outputs_py = [file + ".py.gen.wav" for file in wavs]
12 |     outputs_tf = [file + ".tf.gen.wav" for file in wavs]
13 |     wavs = [audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs]
14 |     spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
15 |     print("Linear spectrograms dim: ")
16 |     print(spectrogram[0].shape)
17 |     # --------------------------------- librosa Version ---------------------------------
18 |     # convert back
19 |     gens = [audio.inv_spectrogram(s) for s in spectrogram]
20 | 
21 |     for gen, output in zip(gens, outputs_py):
22 |         audio.save_wav(gen, output)
23 | 
24 |     # --------------------------------- TensorFlow Version ---------------------------------
25 | 
26 |     samples = [inv_spectrogram(spec) for spec in spectrogram]
27 | 
28 |     with tf.Session() as sess:
29 |         samples = [sess.run(sample) for sample in samples]
30 | 
31 |     for gen, output in zip(samples, outputs_tf):
32 |         audio.save_wav(gen, output)
33 | 
34 |     print("Done!")
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # dotenv
80 | .env
81 | 
82 | # virtualenv
83 | .venv/
84 | venv/
85 | ENV/
86 | 
87 | # Spyder project settings
88 | .spyderproject
89 | 
90 | # Rope project settings
91 | .ropeproject
92 | __pycache__
93 | .idea
94 | data
95 | 


--------------------------------------------------------------------------------
/griffin_lim.py:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | import tensorflow as tf
 3 | from hparams import hparams
 4 | 
 5 | """
 6 | TensorFlow implementation of Griffin-lim Algorithm for voice reconstruction
 7 | """
 8 | 
 9 | 
10 | # TF
def spectrogram2wav(spectrogram, n_iter=hparams.griffin_lim_iters, n_fft=(hparams.num_freq - 1) * 2,
                    win_length=int(hparams.frame_length_ms / 1000 * hparams.sample_rate),
                    hop_length=int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)):
    """Build TensorFlow ops that estimate a waveform from a magnitude spectrogram.

    The Griffin-Lim loop is unrolled ``n_iter`` times in Python: invert the
    current estimate, re-analyse the signal, keep only the phase of the result
    and re-attach it to the given magnitudes.

    Args:
        spectrogram: magnitude spectrogram. It is transposed on entry, so it is
            presumably laid out [f, t] like the output of
            utils/audio.spectrogram -- TODO confirm against callers.
        n_iter: number of phase-refinement iterations.
        n_fft: FFT length handed to ``stft`` / ``inverse_stft``.
        win_length: frame length in samples.
        hop_length: frame step in samples.

    Returns:
        Tensor holding the reconstructed signal.

    NOTE(review): the default ``win_length`` / ``hop_length`` come from
    ``frame_length_ms`` / ``frame_shift_ms``, whereas utils/audio.py analyses
    with ``hparams.fft_size`` / ``hparams.hop_size``. Confirm that the two are
    meant to differ.
    """

    def _frames_to_signal(frames):
        # inverse_stft is fed a batch of one: add the leading axis, drop it afterwards.
        batched = tf.expand_dims(frames, 0)
        signal = tf.contrib.signal.inverse_stft(batched, win_length, hop_length, n_fft)
        return tf.squeeze(signal, 0)

    magnitude = tf.cast(tf.transpose(spectrogram), dtype=tf.complex64)  # [t, f]
    estimate = tf.identity(magnitude)
    for _ in range(n_iter):
        signal = _frames_to_signal(estimate)
        reanalysed = tf.contrib.signal.stft(signal, win_length, hop_length, n_fft, pad_end=False)
        # Divide by the (floored) modulus so only the phase of the re-analysis survives.
        modulus = tf.maximum(1e-8, tf.abs(reanalysed))
        unit_phase = reanalysed / tf.cast(modulus, tf.complex64)
        estimate = magnitude * unit_phase

    return tf.real(_frames_to_signal(estimate))
39 | 
40 | 
def inv_spectrogram(spectrogram):
    """Build TensorFlow ops that turn a normalized dB spectrogram back into audio.

    Mirrors utils/audio.inv_spectrogram: denormalize, add the reference level
    back, convert dB to linear amplitude, raise to ``hparams.power``, run
    Griffin-Lim and finally undo the pre-emphasis.
    """
    level_db = _denormalize(spectrogram) + hparams.ref_level_db
    S = _db_to_amp(level_db)  # Convert back to linear
    waveform = spectrogram2wav(S ** hparams.power)  # Reconstruct phase
    return _inv_preemphasis(waveform)
44 | 
45 | 
def _denormalize(D):
    """Map values clipped to [-max_abs_value, max_abs_value] onto [min_level_db, 0] dB."""
    limit = hparams.max_abs_value
    floor_db = hparams.min_level_db
    shifted = tf.clip_by_value(D, -limit, limit) + limit
    return shifted * -floor_db / (2 * limit) + floor_db
50 | 
51 | 
def _db_to_amp(x):
    """Convert decibels to linear amplitude, i.e. 10 ** (x / 20), element-wise."""
    base = tf.ones(tf.shape(x)) * 10.0  # tensor of tens with the same shape as x
    exponent = x * 0.05
    return tf.pow(base, exponent)
54 | 
55 | 
def _inv_preemphasis(x):
    """Undo pre-emphasis on a 1-D signal tensor.

    Computes the recursion ``y[n] = x[n] + hparams.preemphasis * y[n - 1]`` with
    ``y[-1] = 0``. This is the same filter as
    ``scipy.signal.lfilter([1], [1, -preemphasis], x)`` in utils/audio.py.

    The previous implementation accumulated shifted, zero-padded copies of the
    whole signal inside a ``tf.while_loop``: N iterations, each doing O(N)
    work. ``tf.scan`` evaluates the same recursion with one scalar update per
    sample.

    Args:
        x: 1-D tensor (or array-like) holding the pre-emphasized signal.

    Returns:
        1-D tensor with the same length as ``x``.
    """
    x = tf.convert_to_tensor(x)

    def step(previous, sample):
        # previous is y[n - 1], sample is x[n].
        return hparams.preemphasis * previous + sample

    return tf.scan(step, x, initializer=tf.zeros([], dtype=x.dtype))
76 | 


--------------------------------------------------------------------------------
/utils/audio.py:
--------------------------------------------------------------------------------
  1 | import librosa
  2 | import librosa.filters
  3 | import numpy as np
  4 | from scipy import signal
  5 | 
  6 | from scipy.io import wavfile
  7 | 
  8 | from hparams import hparams
  9 | 
 10 | 
 11 | def load_wav(path, sr):
 12 |     return librosa.core.load(path, sr=sr)[0]
 13 | 
 14 | 
 15 | def save_wav(wav, path):
 16 |     wav *= 32767 / max(0.01, np.max(np.abs(wav)))
 17 |     wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
 18 | 
 19 | 
 20 | def spectrogram(y):
 21 |     D = _stft(_preemphasis(y))
 22 |     S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
 23 |     return _normalize(S)
 24 | 
 25 | 
 26 | def inv_spectrogram(spectrogram):
 27 |     S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db)  # Convert back to linear
 28 |     return _inv_preemphasis(_griffin_lim(S ** hparams.power))  # Reconstruct phase
 29 | 
 30 | 
 31 | def melspectrogram(y):
 32 |     D = _stft(_preemphasis(y))
 33 |     S = _amp_to_db(_linear_to_mel(np.abs(D)))
 34 |     return _normalize(S)
 35 | 
 36 | 
 37 | def inv_melspectrogram(melspectrogram):
 38 |     S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram)))  # Convert back to linear
 39 |     return _inv_preemphasis(_griffin_lim(S ** 1.5))  # Reconstruct phase
 40 | 
 41 | 
 42 | # Based on https://github.com/librosa/librosa/issues/434
 43 | def _griffin_lim(S):
 44 |     angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
 45 |     S_complex = np.abs(S).astype(np.complex)
 46 |     for i in range(hparams.griffin_lim_iters):
 47 |         if i > 0:
 48 |             angles = np.exp(1j * np.angle(_stft(y)))
 49 |         y = _istft(S_complex * angles)
 50 |     return y
 51 | 
 52 | 
 53 | def _stft(y):
 54 |     return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size())
 55 | 
 56 | 
 57 | def _istft(y):
 58 |     return librosa.istft(y, hop_length=get_hop_size())
 59 | 
 60 | 
# Conversions:

# Lazily built caches; each stays None until the first call that needs it.
_mel_basis = None  # mel filterbank, filled in by _linear_to_mel
_inv_mel_basis = None  # pseudo-inverse of the mel filterbank, filled in by _mel_to_linear
 65 | 
 66 | 
 67 | def _linear_to_mel(spectrogram):
 68 |     global _mel_basis
 69 |     if _mel_basis is None:
 70 |         _mel_basis = _build_mel_basis()
 71 |     return np.dot(_mel_basis, spectrogram)
 72 | 
 73 | 
 74 | def _mel_to_linear(mel_spectrogram):
 75 |     global _inv_mel_basis
 76 |     if _inv_mel_basis is None:
 77 |         _inv_mel_basis = np.linalg.pinv(_build_mel_basis())
 78 |     return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
 79 | 
 80 | 
 81 | def _build_mel_basis():
 82 |     n_fft = (hparams.num_freq - 1) * 2
 83 |     return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
 84 | 
 85 | 
 86 | def _amp_to_db(x):
 87 |     return 20 * np.log10(np.maximum(1e-5, x))
 88 | 
 89 | 
 90 | def _db_to_amp(x):
 91 |     return np.power(10.0, x * 0.05)
 92 | 
 93 | 
 94 | def _preemphasis(x):
 95 |     return signal.lfilter([1, -hparams.preemphasis], [1], x)
 96 | 
 97 | 
 98 | def _inv_preemphasis(x):
 99 |     return signal.lfilter([1], [1, -hparams.preemphasis], x)
100 | 
101 | 
def _normalize(S):
    """Rescale dB values from [min_level_db, 0] to [-max_abs_value, max_abs_value], clipping the rest."""
    limit = hparams.max_abs_value
    floor_db = hparams.min_level_db
    fraction = (S - floor_db) / (-floor_db)  # 0 at the dB floor, 1 at 0 dB
    scaled = (2 * limit) * fraction - limit
    return np.clip(scaled, -limit, limit)
106 | 
107 | 
def _denormalize(D):
    """Inverse of ``_normalize``: map clipped [-max_abs_value, max_abs_value] values to [min_level_db, 0] dB."""
    limit = hparams.max_abs_value
    floor_db = hparams.min_level_db
    shifted = np.clip(D, -limit, limit) + limit
    return shifted * -floor_db / (2 * limit) + floor_db
113 | 
114 | 
def get_hop_size():
    """Return the STFT hop in samples.

    ``hparams.hop_size`` is used when it is set; otherwise the hop is derived
    from ``hparams.frame_shift_ms`` and ``hparams.sample_rate``.
    """
    configured = hparams.hop_size
    if configured is not None:
        return configured
    assert hparams.frame_shift_ms is not None
    return int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
121 | 


--------------------------------------------------------------------------------