├── README.md ├── _pesq_itu_results.txt ├── _pesq_results.txt ├── avr_pesq ├── ca_pesq.sh ├── cln.txt ├── config.py ├── cut_data ├── cut_cln_wav.m └── cut_wav.m ├── data ├── batch_num.txt ├── cv.list ├── cv │ ├── all_real_cln_dt.txt │ ├── all_real_rev_dt.txt │ ├── cln_dt.txt │ ├── far_dt │ ├── far_dt.txt │ ├── inputs.txt │ ├── inputs2.txt │ ├── inputs_dir.txt │ ├── inputs_feat.txt │ ├── nall_real_cln_dt.txt │ ├── name │ ├── near_dt │ └── near_dt.txt ├── test │ ├── far │ │ ├── cln_et.txt │ │ ├── far_dir.txt │ │ ├── inputs.txt │ │ ├── inputs_dir.txt │ │ ├── inputs_feat.txt │ │ ├── name │ │ ├── pro.sh │ │ └── test.list │ ├── near │ │ ├── cln_et.txt │ │ ├── inputs.txt │ │ ├── inputs_dir.txt │ │ ├── inputs_feat.txt │ │ ├── name │ │ ├── near_dir.txt │ │ └── test.list │ └── real │ │ ├── all_real_cln_et.txt │ │ ├── cln │ │ ├── inputs.txt │ │ ├── inputs_dir.txt │ │ ├── inputs_feat.txt │ │ ├── name │ │ ├── rev_dir.txt │ │ └── test.list ├── tr.list ├── tr │ ├── inputs.txt │ ├── inputs2.txt │ ├── inputs_feat.txt │ └── inputs_nmae └── train_cmvn.npz ├── evaluate.py ├── ex_trac.sh ├── inputs.scp ├── io_funcs ├── __init__.py ├── __init__.pyc ├── __pycache__ │ └── kaldi_io.cpython-35.pyc ├── convert_cmvn_to_numpy.py ├── cut_cln_wav.m ├── kaldi_io.py ├── kaldi_io.pyc ├── make_sete.py ├── make_setf.py ├── make_tfrecords.py ├── make_tfrecords_rta.py ├── teconvert_cmvn_to_numpy.py ├── test.py ├── test2.py ├── tfrecords_dataset.py ├── tfrecords_dataset.pyc ├── tfrecords_dataset_test.py ├── tfrecords_io.py ├── tfrecords_io.pyc ├── tfrecords_io_test.py └── verify_tfrecords.py ├── mini_data ├── Noise │ ├── Babble.wav │ ├── Babble2.wav │ ├── F16.wav │ ├── F162.wav │ ├── Factory1.wav │ ├── Factory2.wav │ ├── Pink.wav │ ├── Pink2.wav │ ├── Volvo.wav │ ├── Volvo2.wav │ ├── White.wav │ └── White2.wav ├── cln_dt.txt ├── test_noise │ ├── n64.wav │ └── n71.wav ├── test_speech │ └── cln_et.txt ├── train_noise │ ├── n1.wav │ ├── n49.wav │ └── n95.wav └── train_speech │ ├── TRAIN_DR1_FCJF0_SA1.WAV │ ├── TRAIN_DR1_FKFB0_SX348.WAV │ ├── TRAIN_DR1_MPGR0_SX150.WAV │ ├── TRAIN_DR1_MRDD0_SI1680.WAV │ ├── cleandata.txt │ └── cleandata_test.txt ├── models ├── __init__.py ├── attention_dir │ └── resnet2_rced.py ├── dnn.py ├── dnn_trainer.py └── resnet_rced.py ├── pesq ├── pre_process_data.py ├── pre_process_test.py ├── scripts ├── audio_utilities.py ├── audio_utilities.pyc ├── config.py ├── config.pyc ├── dataset_test.sh ├── datasets │ ├── __init__.py │ ├── __init__.pyc │ ├── audio.py │ ├── audio.pyc │ ├── preprocessor.py │ └── wavenet_preprocessor.py ├── get_train_val_scp.py ├── hparams.py ├── io_test.sh ├── parse_options.sh ├── prepare_data.py ├── prepare_data.pyc ├── spectrogram_to_wave.py ├── spectrogram_to_wave.pyc └── train_dnn.py ├── train.sh ├── train.txt └── utils ├── __init__.py ├── __init__.pyc ├── add_additive_noise.py ├── bnorm.py ├── bnorm.pyc ├── common.py ├── generate_plots.py ├── misc.py ├── misc.pyc ├── ops.py ├── ops.pyc └── select_data.py /README.md: -------------------------------------------------------------------------------- 1 | [English](https://github.com/linan2/TensorFlow-speech-enhancement.git) | 中文 2 | # 基于深度特征映射的语音增强方法 3 | 本项目为可以利用DNN和CNN的方法来进行语音增强,其中DNN使用的三个隐层每个隐层512个节点,CNN使用的是R-CED的网络结构并且加入了一些resnet来防止过拟合。你也可以选择是否使用dropout或者l2等。 4 | 5 | ## 注意: 6 | requirements:TensorFlow1.5 Python2.7 7 | 8 | [制造数据](https://github.com/linan2/add_reverb2.git) 在运行此代码之前你应该先准备好干净和相应的含噪或者含混响的语音; 或者运行utils/add_additive_noise.py 添加相应信噪比的加性噪声,这里使用的事NOISEX-92噪声库在目录mini_data/Noise底下,带2的是8k采样率其他是相应的16k采样率噪音。 9 | 10 | 如果你的任务是去混响,在运行此代码之前你需要将含混响的语音剪切的和干净的语音一样长,cut_wav里面的脚本可能对你有用。 11 | 12 | 如果你的任务是做特征增强(不需要还原到语音),你可以把log spectragram特征替换成其他特征(比如MFCC)。 13 | 14 | ## 使用: 15 | 第一步. 运行 ex_trac.sh 数据准备并将数据分成训练集和交叉验证集,然后提取 log spectragram 特征. 16 | 17 | 第二步. 运行 train.sh 来训练和测试模型. 18 | 19 | 第三步. 运行 ca_pesq.sh 使用PESQ来评价你的结果。 20 | 21 | ## 补充: 22 | 代码还不完善,持续更新ing…大家如果发现有什么bug可以在代码上直接更改,然后更新。科研任务重,更新慢大家见谅。 23 | 24 | 本人在 REVERB challenge 数据集上测试了此代码的效果,PESQ能提高大约0.6—0.8。 25 | 26 | 过段时间我会继续更新一些比如生成对抗网络、 多任务学习和多目标学习的模型, 一些基于注意力机制的模型也会进行更新,敬请期待… 27 | 28 | 在解码阶段,您可以选择G&L声码器,也可以使用有噪声的语音原始的相位来合成语音,但是我已经尝试过G&L方法,与原始的相位的使用相比,它不会获得更好的性能。 29 | 30 | 运行环境教程: 31 | https://github.com/linan2/tensorflow-1.4.0.git 32 | 33 | [1] Li N., Ge M., Wang L., Dang J. (2019) [A Fast Convolutional Self-attention Based Speech Dereverberation Method for Robust Speech Recognition](https://link.springer.com/chapter/10.1007/978-3-030-36718-3_25). In: Gedeon T., Wong K., Lee M. (eds) Neural Information Processing. ICONIP 2019. Lecture Notes in Computer Science, vol 11955. Springer, Cham 34 | 35 | [2] Wang, K., Zhang, J., Sun, S., Wang, Y., Xiang, F., Xie, L. (2018) Investigating Generative Adversarial Networks Based Speech Dereverberation for Robust Speech Recognition. Proc. Interspeech 2018, 1581-1585, DOI: 10.21437/Interspeech.2018-1780. 36 | 37 | [3] Ge, M., Wang, L., Li, N., Shi, H., Dang, J., Li, X. (2019) Environment-Dependent Attention-Driven Recurrent Convolutional Neural Network for Robust Speech Enhancement. Proc. Interspeech 2019, 3153-3157, DOI: 10.21437/Interspeech.2019-1477. 38 | 39 | 40 | Email: linanvae@163.com 41 | -------------------------------------------------------------------------------- /avr_pesq: -------------------------------------------------------------------------------- 1 | far_pesq = 2.25776 2 | near_pesq = 2.34079 3 | real_pesq = 1.83094 4 | -------------------------------------------------------------------------------- /ca_pesq.sh: -------------------------------------------------------------------------------- 1 | stage=0 2 | if [ $stage -le 0 ]; then 3 | python evaluate.py calculate_pesq --workspace='workspace' --speech_dir='cut_data/SIMU/cln_evl' --type='test2/far' 4 | 5 | cat _pesq_results.txt|tail -n 539 |head -n 538|awk '{sum+=$2} END {print "far_pesq = ", sum/NR}' > avr_pesq 6 | wait 7 | fi 8 | if [ $stage -le 1 ]; then 9 | python evaluate.py calculate_pesq --workspace='workspace' --speech_dir='cut_data/SIMU/cln_evl' --type='test2/near' 10 | 11 | cat _pesq_results.txt|tail -n 539 |head -n 538|awk '{sum+=$2} END {print "near_pesq = ", sum/NR}' >> avr_pesq 12 | wait 13 | fi 14 | 15 | if [ $stage -le 2 ]; then 16 | python evaluate.py calculate_pesq --workspace='workspace' --speech_dir='cut_data/Real/cln_et' --type='test2/real' 17 | 18 | cat _pesq_results.txt|tail -n 373 |head -n 372|awk '{sum+=$2} END {print "real_pesq = ", sum/NR}' >> avr_pesq 19 | wait 20 | fi 21 | cat avr_pesq 22 | rm avr_pesq 23 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | sample_rate = 16000 2 | n_window = 400 # windows size for FFT 25ms 3 | n_overlap = 160 # overlap of window 10ms 4 | -------------------------------------------------------------------------------- /cut_data/cut_cln_wav.m: -------------------------------------------------------------------------------- 1 | % ---------------------------------------------------------------------------------------------------- 2 | % parameters and configures 3 | % ---------------------------------------------------------------------------------------------------- 4 | %dir_name = {'c31/','c34/','c35/','c38/','c3c/','c3d/','c3f/','c3j/','c3k/','c3l/','c3p/','c3s/','c3t/','c3w/','c3z/','c40/','c41/','c42/','c45/','c49/'}; 5 | %dir_name = {'c30/','c32/','c33/','c37/','c39/','c3b/','c3h/','c3o/','c3q/','c3r/','c3y/','c46/','c48/','c4a/'}; 6 | %dir_name = {'c36/','c3a/','c3e/','c3g/','c3i/','c3m/','c3n/','c3u/','c3v/','c3x/','c43/','c44/','c47/','c4b/'}; 7 | %dir_name = {'c02/','c05/','c08/','c0b/','c0e/','c0h/','c0k/','c0n/','c0q/','c0t/','c0w/','c0z/','c12/','c15/','c18/','c1b/','c1e/','c1h/','c1k/','c1n/','c1q/','c1t/','c1w/','c1z/','c22/','c25/','c28/','c2b/','c2e/','c2h/','c2k/','c03/','c06/','c09/','c0c/','c0f/','c0i/','c0l/','c0o/','c0r/','c0u/','c0x/','c10/','c13/','c16/','c19/','c1c/','c1f/','c1i/','c1l/','c1o/','c1r/','c1u/','c1x/','c20/','c23/','c26/','c29/','c2c/','c2f/','c2i/','c2l/','c04/','c07/','c0a/','c0d/','c0g/','c0j/','c0m/','c0p/','c0s/','c0v/','c0y/','c11/','c14/','c17/','c1a/','c1d/','c1g/','c1j/','c1m/','c1p/','c1s/','c1v/','c1y/','c21/','c24/','c27/','c2a/','c2d/','c2g/','c2j/'}; 8 | dir_name ={'c02/'}; 9 | %disp(length(dir_name)); 10 | 11 | % --------------------------------------------------------------------------------------------------- 12 | % cut wavforms 13 | % -------------------------------------------------------------------------------------------------- 14 | for t=1:length(dir_name) 15 | % get the current sub-directory 16 | tempdir=dir_name{t}; 17 | disp(tempdir); 18 | % define the path of reverberation wavforms and enhanced wavforms 19 | clean_filedir = ['/CDShare/REVERB_DATA/raw_wsj0_data/data/primary_microphone/si_tr/',tempdir]; 20 | enh_filedir = ['/Work18/2015/gemeng/se/mydnn/tools/MSLP/MCMSLP_L750_D512/dereverb_GSSn1a1b0.15/si_tr/',tempdir,'/1/RAW/']; 21 | % get all the file names of enhanced wavforms 22 | dirOutput = dir([enh_filedir, '*_2.wav']); 23 | file_name = {dirOutput.name}'; 24 | disp(file_name); 25 | [rows,cols] = size(file_name); 26 | 27 | % cut the reverberation wavforms based on the length of the corresponding enhanced wavforms 28 | save_path = ['/Work18/2015/gemeng/se/mydnn/tools/MSLP/MCMSLP_L750_D512/dereverb_GSSn1a1b0.15/cln_cut/si_tr/',tempdir]; 29 | mkdir(save_path); 30 | for i=1:rows 31 | enh_na = file_name{i}; 32 | clean_na = [enh_na(1:8),'.wav']; 33 | disp(clean_na) 34 | %na = file_name(i); 35 | %audiopath=dir([filedir,file_name{i}]); 36 | [clean_x, Fs] = audioread([clean_filedir, clean_na]); 37 | [enh_x,Fs] = audioread([enh_filedir, enh_na]); 38 | %[r,c]=size(x); 39 | %if c > 1 40 | % disp(na); 41 | %end; 42 | y = clean_x(1:length(enh_x)); 43 | wrt_path = [save_path, clean_na]; 44 | audiowrite(wrt_path,y,Fs); 45 | end; 46 | %disp(x); 47 | %disp(Fs); 48 | end; 49 | -------------------------------------------------------------------------------- /cut_data/cut_wav.m: -------------------------------------------------------------------------------- 1 | filename1 = 'si_tr.txt' 2 | [name1,path1] = textread(filename1,'%s %s') 3 | filename2 = 'REVERB_WSJCAM0_tr.txt' 4 | [name2,path2] = textread(filename2,'%s %s') 5 | wavlist1 = path1; 6 | wavlist1 = [wavlist1]; 7 | wavlist2 = path2; 8 | wavlist2 = [wavlist2]; 9 | 10 | 11 | for i=1:length(wavlist2) 12 | wav_cln = audioread(wavlist1{i}); 13 | wav_rev = audioread(wavlist2{i}); 14 | disp(wavlist1{i}); 15 | disp(wavlist2{i}); 16 | Fs = 16000 17 | disp(length(wav_cln)); 18 | disp(length(wav_rev)) 19 | y = wav_rev(1:length(wav_cln)); 20 | str1 = '.wav' 21 | wrt_path = ['reverb/',name2{i},str1]; 22 | audiowrite(wrt_path,y,Fs); 23 | end; 24 | -------------------------------------------------------------------------------- /data/batch_num.txt: -------------------------------------------------------------------------------- 1 | 11043 54451 -------------------------------------------------------------------------------- /data/cv.list: -------------------------------------------------------------------------------- 1 | data/tfrecords/cv.tfrecords 2 | -------------------------------------------------------------------------------- /data/cv/all_real_rev_dt.txt: -------------------------------------------------------------------------------- 1 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0201.wav 2 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0202.wav 3 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0205.wav 4 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0206.wav 5 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0208.wav 6 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0209.wav 7 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020b.wav 8 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020c.wav 9 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020g.wav 10 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020i.wav 11 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020j.wav 12 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020l.wav 13 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020n.wav 14 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020q.wav 15 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0201.wav 16 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0203.wav 17 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0205.wav 18 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0207.wav 19 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0208.wav 20 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020b.wav 21 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020c.wav 22 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020e.wav 23 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020g.wav 24 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020h.wav 25 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020i.wav 26 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020l.wav 27 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020m.wav 28 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020n.wav 29 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020o.wav 30 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020p.wav 31 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020r.wav 32 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0201.wav 33 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0202.wav 34 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0203.wav 35 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0204.wav 36 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0205.wav 37 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0206.wav 38 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0207.wav 39 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0208.wav 40 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0209.wav 41 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020a.wav 42 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020b.wav 43 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020c.wav 44 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020d.wav 45 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020e.wav 46 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020f.wav 47 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020g.wav 48 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020k.wav 49 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020l.wav 50 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020m.wav 51 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020n.wav 52 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020p.wav 53 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020q.wav 54 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020r.wav 55 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0201.wav 56 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0202.wav 57 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0203.wav 58 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0204.wav 59 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0205.wav 60 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0206.wav 61 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0209.wav 62 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020a.wav 63 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020b.wav 64 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020d.wav 65 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020e.wav 66 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020f.wav 67 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020g.wav 68 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020h.wav 69 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020i.wav 70 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020k.wav 71 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020l.wav 72 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020m.wav 73 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020o.wav 74 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020p.wav 75 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020q.wav 76 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020r.wav 77 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0202.wav 78 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0203.wav 79 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0207.wav 80 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0208.wav 81 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020a.wav 82 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020b.wav 83 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020d.wav 84 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020e.wav 85 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020g.wav 86 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020i.wav 87 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020l.wav 88 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020m.wav 89 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020o.wav 90 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020q.wav 91 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0203.wav 92 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0204.wav 93 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0207.wav 94 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020a.wav 95 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020d.wav 96 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020e.wav 97 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020f.wav 98 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020k.wav 99 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020m.wav 100 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020o.wav 101 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020p.wav 102 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020r.wav 103 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020s.wav 104 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020t.wav 105 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020u.wav 106 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020v.wav 107 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020w.wav 108 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020x.wav 109 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020y.wav 110 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c020z.wav 111 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0210.wav 112 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0211.wav 113 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t10c0212.wav 114 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0202.wav 115 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0204.wav 116 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0206.wav 117 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020d.wav 118 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020f.wav 119 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020j.wav 120 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020k.wav 121 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020q.wav 122 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020s.wav 123 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020t.wav 124 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020u.wav 125 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020v.wav 126 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020y.wav 127 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c020z.wav 128 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0210.wav 129 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0211.wav 130 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0212.wav 131 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t6c0213.wav 132 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020h.wav 133 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020i.wav 134 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020j.wav 135 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020o.wav 136 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020s.wav 137 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020t.wav 138 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020u.wav 139 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020w.wav 140 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020x.wav 141 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020y.wav 142 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c020z.wav 143 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0210.wav 144 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0211.wav 145 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0212.wav 146 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t7c0213.wav 147 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0207.wav 148 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0208.wav 149 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020c.wav 150 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020j.wav 151 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020n.wav 152 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020t.wav 153 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020u.wav 154 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020v.wav 155 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020w.wav 156 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020x.wav 157 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c020z.wav 158 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0210.wav 159 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0211.wav 160 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0212.wav 161 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t8c0213.wav 162 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0205.wav 163 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0206.wav 164 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0209.wav 165 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020c.wav 166 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020f.wav 167 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020h.wav 168 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020j.wav 169 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020n.wav 170 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020p.wav 171 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020r.wav 172 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020s.wav 173 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020t.wav 174 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020u.wav 175 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020v.wav 176 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020w.wav 177 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c020x.wav 178 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0210.wav 179 | /Work18/2017/linan/SE/my_enh/cut_data/Real/real_dev/t9c0211.wav 180 | -------------------------------------------------------------------------------- /data/test/far/name: -------------------------------------------------------------------------------- 1 | c30c0201 2 | c30c0202 3 | c30c0203 4 | c30c0204 5 | c30c0205 6 | c30c0206 7 | c30c0207 8 | c30c0208 9 | c30c0209 10 | c30c020a 11 | c30c020b 12 | c30c020c 13 | c30c020d 14 | c30c020e 15 | c30c020f 16 | c30c020g 17 | c30c020h 18 | c30c020i 19 | c30c020j 20 | c30c020k 21 | c30c020l 22 | c30c020m 23 | c30c020n 24 | c30c020o 25 | c30c020p 26 | c30c020q 27 | c30c020r 28 | c30c020s 29 | c30c020t 30 | c30c020u 31 | c30c020v 32 | c30c020w 33 | c30c020x 34 | c30c020y 35 | c30c020z 36 | c30c0210 37 | c30c0211 38 | c30c0212 39 | c30c0213 40 | c30c0214 41 | c30c0215 42 | c32c0201 43 | c32c0202 44 | c32c0203 45 | c32c0204 46 | c32c0205 47 | c32c0206 48 | c32c0207 49 | c32c0208 50 | c32c0209 51 | c32c020a 52 | c32c020b 53 | c32c020c 54 | c32c020d 55 | c32c020e 56 | c32c020f 57 | c32c020g 58 | c32c020h 59 | c32c020i 60 | c32c020j 61 | c32c020k 62 | c32c020l 63 | c32c020m 64 | c32c020n 65 | c32c020o 66 | c32c020p 67 | c32c020q 68 | c32c020r 69 | c32c020s 70 | c32c020t 71 | c32c020u 72 | c32c020v 73 | c32c020w 74 | c32c020x 75 | c32c020y 76 | c32c020z 77 | c32c0210 78 | c32c0211 79 | c32c0212 80 | c32c0213 81 | c32c0214 82 | c32c0215 83 | c33c0201 84 | c33c0202 85 | c33c0203 86 | c33c0204 87 | c33c0205 88 | c33c0206 89 | c33c0207 90 | c33c0208 91 | c33c0209 92 | c33c020a 93 | c33c020b 94 | c33c020c 95 | c33c020d 96 | c33c020e 97 | c33c020f 98 | c33c020g 99 | c33c020h 100 | c33c020i 101 | c33c020j 102 | c33c020k 103 | c33c020l 104 | c33c020m 105 | c33c020n 106 | c33c020o 107 | c33c020p 108 | c33c020q 109 | c33c020r 110 | c33c020s 111 | c33c020t 112 | c33c020u 113 | c33c020v 114 | c33c020w 115 | c33c020x 116 | c33c020y 117 | c33c020z 118 | c33c0210 119 | c33c0211 120 | c33c0212 121 | c33c0213 122 | c37c0201 123 | c37c0202 124 | c37c0203 125 | c37c0204 126 | c37c0205 127 | c37c0206 128 | c37c0207 129 | c37c0208 130 | c37c0209 131 | c37c020a 132 | c37c020b 133 | c37c020c 134 | c37c020d 135 | c37c020e 136 | c37c020f 137 | c37c020g 138 | c37c020h 139 | c37c020i 140 | c37c020j 141 | c37c020k 142 | c37c020l 143 | c37c020m 144 | c37c020n 145 | c37c020o 146 | c37c020p 147 | c37c020q 148 | c37c020r 149 | c37c020s 150 | c37c020t 151 | c37c020u 152 | c37c020v 153 | c37c020w 154 | c37c020x 155 | c37c020y 156 | c37c020z 157 | c37c0210 158 | c37c0211 159 | c37c0212 160 | c37c0213 161 | c39c0201 162 | c39c0202 163 | c39c0203 164 | c39c0204 165 | c39c0205 166 | c39c0206 167 | c39c0207 168 | c39c0208 169 | c39c0209 170 | c39c020a 171 | c39c020b 172 | c39c020c 173 | c39c020d 174 | c39c020e 175 | c39c020f 176 | c39c020g 177 | c39c020h 178 | c39c020i 179 | c39c020j 180 | c39c020k 181 | c39c020l 182 | c39c020m 183 | c39c020n 184 | c39c020o 185 | c39c020p 186 | c39c020q 187 | c39c020r 188 | c39c020s 189 | c39c020t 190 | c39c020u 191 | c39c020v 192 | c39c020w 193 | c39c020x 194 | c39c020y 195 | c39c020z 196 | c39c0210 197 | c39c0211 198 | c39c0212 199 | c3bc0201 200 | c3bc0202 201 | c3bc0203 202 | c3bc0204 203 | c3bc0205 204 | c3bc0206 205 | c3bc0207 206 | c3bc0208 207 | c3bc0209 208 | c3bc020a 209 | c3bc020b 210 | c3bc020c 211 | c3bc020d 212 | c3bc020e 213 | c3bc020f 214 | c3bc020g 215 | c3bc020h 216 | c3bc020i 217 | c3bc020j 218 | c3bc020k 219 | c3bc020l 220 | c3bc020m 221 | c3bc020n 222 | c3bc020o 223 | c3bc020p 224 | c3bc020q 225 | c3bc020r 226 | c3bc020s 227 | c3bc020t 228 | c3bc020u 229 | c3bc020v 230 | c3bc020w 231 | c3bc020x 232 | c3bc020y 233 | c3bc020z 234 | c3bc0210 235 | c3bc0211 236 | c3bc0212 237 | c3hc0201 238 | c3hc0202 239 | c3hc0203 240 | c3hc0204 241 | c3hc0205 242 | c3hc0206 243 | c3hc0207 244 | c3hc0208 245 | c3hc0209 246 | c3hc020a 247 | c3hc020b 248 | c3hc020c 249 | c3hc020d 250 | c3hc020e 251 | c3hc020f 252 | c3hc020g 253 | c3hc020h 254 | c3hc020i 255 | c3hc020j 256 | c3hc020k 257 | c3hc020l 258 | c3hc020m 259 | c3hc020n 260 | c3hc020o 261 | c3hc020p 262 | c3hc020q 263 | c3hc020r 264 | c3hc020s 265 | c3hc020t 266 | c3hc020u 267 | c3hc020v 268 | c3hc020w 269 | c3hc020x 270 | c3hc020y 271 | c3hc020z 272 | c3hc0210 273 | c3hc0211 274 | c3oc0201 275 | c3oc0202 276 | c3oc0203 277 | c3oc0204 278 | c3oc0205 279 | c3oc0206 280 | c3oc0207 281 | c3oc0208 282 | c3oc0209 283 | c3oc020a 284 | c3oc020b 285 | c3oc020c 286 | c3oc020d 287 | c3oc020e 288 | c3oc020f 289 | c3oc020g 290 | c3oc020h 291 | c3oc020i 292 | c3oc020j 293 | c3oc020k 294 | c3oc020l 295 | c3oc020m 296 | c3oc020n 297 | c3oc020o 298 | c3oc020p 299 | c3oc020q 300 | c3oc020r 301 | c3oc020s 302 | c3oc020t 303 | c3oc020u 304 | c3oc020v 305 | c3oc020w 306 | c3oc020x 307 | c3oc020y 308 | c3oc020z 309 | c3oc0210 310 | c3oc0212 311 | c3qc0201 312 | c3qc0202 313 | c3qc0203 314 | c3qc0204 315 | c3qc0205 316 | c3qc0206 317 | c3qc0207 318 | c3qc0208 319 | c3qc0209 320 | c3qc020a 321 | c3qc020b 322 | c3qc020c 323 | c3qc020d 324 | c3qc020e 325 | c3qc020f 326 | c3qc020g 327 | c3qc020h 328 | c3qc020i 329 | c3qc020j 330 | c3qc020k 331 | c3qc020l 332 | c3qc020m 333 | c3qc020n 334 | c3qc020o 335 | c3qc020p 336 | c3qc020q 337 | c3qc020r 338 | c3qc020s 339 | c3qc020t 340 | c3qc020u 341 | c3qc020v 342 | c3qc020w 343 | c3qc020x 344 | c3qc020y 345 | c3qc020z 346 | c3qc0210 347 | c3qc0211 348 | c3qc0212 349 | c3rc0201 350 | c3rc0202 351 | c3rc0203 352 | c3rc0204 353 | c3rc0205 354 | c3rc0206 355 | c3rc0207 356 | c3rc0208 357 | c3rc0209 358 | c3rc020a 359 | c3rc020b 360 | c3rc020c 361 | c3rc020d 362 | c3rc020e 363 | c3rc020f 364 | c3rc020g 365 | c3rc020h 366 | c3rc020i 367 | c3rc020j 368 | c3rc020k 369 | c3rc020l 370 | c3rc020m 371 | c3rc020n 372 | c3rc020o 373 | c3rc020p 374 | c3rc020q 375 | c3rc020r 376 | c3rc020s 377 | c3rc020t 378 | c3rc020u 379 | c3rc020v 380 | c3rc020w 381 | c3yc0201 382 | c3yc0202 383 | c3yc0203 384 | c3yc0204 385 | c3yc0205 386 | c3yc0206 387 | c3yc0207 388 | c3yc0208 389 | c3yc0209 390 | c3yc020a 391 | c3yc020b 392 | c3yc020c 393 | c3yc020d 394 | c3yc020e 395 | c3yc020f 396 | c3yc020g 397 | c3yc020h 398 | c3yc020i 399 | c3yc020j 400 | c3yc020k 401 | c3yc020l 402 | c3yc020m 403 | c3yc020n 404 | c3yc020o 405 | c3yc020p 406 | c3yc020q 407 | c3yc020r 408 | c3yc020s 409 | c3yc020t 410 | c3yc020u 411 | c3yc020v 412 | c3yc020w 413 | c3yc020x 414 | c3yc020y 415 | c3yc020z 416 | c3yc0210 417 | c3yc0211 418 | c3yc0212 419 | c3yc0213 420 | c3yc0214 421 | c46c0201 422 | c46c0202 423 | c46c0203 424 | c46c0204 425 | c46c0205 426 | c46c0206 427 | c46c0207 428 | c46c0208 429 | c46c0209 430 | c46c020a 431 | c46c020b 432 | c46c020c 433 | c46c020d 434 | c46c020e 435 | c46c020f 436 | c46c020g 437 | c46c020h 438 | c46c020i 439 | c46c020j 440 | c46c020k 441 | c46c020l 442 | c46c020m 443 | c46c020n 444 | c46c020o 445 | c46c020p 446 | c46c020q 447 | c46c020r 448 | c46c020s 449 | c46c020t 450 | c46c020u 451 | c46c020v 452 | c46c020w 453 | c46c020x 454 | c46c020y 455 | c46c020z 456 | c46c0210 457 | c46c0211 458 | c46c0212 459 | c46c0213 460 | c46c0214 461 | c48c0201 462 | c48c0202 463 | c48c0203 464 | c48c0204 465 | c48c0205 466 | c48c0206 467 | c48c0207 468 | c48c0208 469 | c48c0209 470 | c48c020a 471 | c48c020b 472 | c48c020c 473 | c48c020d 474 | c48c020e 475 | c48c020f 476 | c48c020g 477 | c48c020h 478 | c48c020i 479 | c48c020j 480 | c48c020k 481 | c48c020l 482 | c48c020m 483 | c48c020n 484 | c48c020o 485 | c48c020p 486 | c48c020q 487 | c48c020r 488 | c48c020s 489 | c48c020t 490 | c48c020u 491 | c48c020v 492 | c48c020w 493 | c48c020x 494 | c48c020y 495 | c48c020z 496 | c48c0210 497 | c48c0211 498 | c48c0212 499 | c48c0213 500 | c4ac0201 501 | c4ac0202 502 | c4ac0203 503 | c4ac0204 504 | c4ac0205 505 | c4ac0206 506 | c4ac0207 507 | c4ac0208 508 | c4ac0209 509 | c4ac020a 510 | c4ac020b 511 | c4ac020c 512 | c4ac020d 513 | c4ac020e 514 | c4ac020f 515 | c4ac020g 516 | c4ac020h 517 | c4ac020i 518 | c4ac020j 519 | c4ac020k 520 | c4ac020l 521 | c4ac020m 522 | c4ac020n 523 | c4ac020o 524 | c4ac020p 525 | c4ac020q 526 | c4ac020r 527 | c4ac020s 528 | c4ac020t 529 | c4ac020u 530 | c4ac020v 531 | c4ac020w 532 | c4ac020x 533 | c4ac020y 534 | c4ac020z 535 | c4ac0210 536 | c4ac0211 537 | c4ac0212 538 | c4ac0213 539 | -------------------------------------------------------------------------------- /data/test/far/pro.sh: -------------------------------------------------------------------------------- 1 | find /Work18/2017/linan/SE/my_enh/cut_data/SIMU/far_evl/ -name *.wav > far_dir 2 | paste -d ' ' far_dir cln_et.txt > inputs_dir.txt 3 | paste -d name inputs_feat.txt > inputs.txt 4 | awk -F / {print } inputs_feat.txt | awk -F . {print } > name 5 | find /Work18/2017/linan/SE/my_enh/workspace/features/spectrogram/test/far/ -name *.p > inputs_feat.txt 6 | -------------------------------------------------------------------------------- /data/test/far/test.list: -------------------------------------------------------------------------------- 1 | data/test/far/tfrecords/test.tfrecords 2 | -------------------------------------------------------------------------------- /data/test/near/name: -------------------------------------------------------------------------------- 1 | c30c0201 2 | c30c0202 3 | c30c0203 4 | c30c0204 5 | c30c0205 6 | c30c0206 7 | c30c0207 8 | c30c0208 9 | c30c0209 10 | c30c020a 11 | c30c020b 12 | c30c020c 13 | c30c020d 14 | c30c020e 15 | c30c020f 16 | c30c020g 17 | c30c020h 18 | c30c020i 19 | c30c020j 20 | c30c020k 21 | c30c020l 22 | c30c020m 23 | c30c020n 24 | c30c020o 25 | c30c020p 26 | c30c020q 27 | c30c020r 28 | c30c020s 29 | c30c020t 30 | c30c020u 31 | c30c020v 32 | c30c020w 33 | c30c020x 34 | c30c020y 35 | c30c020z 36 | c30c0210 37 | c30c0211 38 | c30c0212 39 | c30c0213 40 | c30c0214 41 | c30c0215 42 | c32c0201 43 | c32c0202 44 | c32c0203 45 | c32c0204 46 | c32c0205 47 | c32c0206 48 | c32c0207 49 | c32c0208 50 | c32c0209 51 | c32c020a 52 | c32c020b 53 | c32c020c 54 | c32c020d 55 | c32c020e 56 | c32c020f 57 | c32c020g 58 | c32c020h 59 | c32c020i 60 | c32c020j 61 | c32c020k 62 | c32c020l 63 | c32c020m 64 | c32c020n 65 | c32c020o 66 | c32c020p 67 | c32c020q 68 | c32c020r 69 | c32c020s 70 | c32c020t 71 | c32c020u 72 | c32c020v 73 | c32c020w 74 | c32c020x 75 | c32c020y 76 | c32c020z 77 | c32c0210 78 | c32c0211 79 | c32c0212 80 | c32c0213 81 | c32c0214 82 | c32c0215 83 | c33c0201 84 | c33c0202 85 | c33c0203 86 | c33c0204 87 | c33c0205 88 | c33c0206 89 | c33c0207 90 | c33c0208 91 | c33c0209 92 | c33c020a 93 | c33c020b 94 | c33c020c 95 | c33c020d 96 | c33c020e 97 | c33c020f 98 | c33c020g 99 | c33c020h 100 | c33c020i 101 | c33c020j 102 | c33c020k 103 | c33c020l 104 | c33c020m 105 | c33c020n 106 | c33c020o 107 | c33c020p 108 | c33c020q 109 | c33c020r 110 | c33c020s 111 | c33c020t 112 | c33c020u 113 | c33c020v 114 | c33c020w 115 | c33c020x 116 | c33c020y 117 | c33c020z 118 | c33c0210 119 | c33c0211 120 | c33c0212 121 | c33c0213 122 | c37c0201 123 | c37c0202 124 | c37c0203 125 | c37c0204 126 | c37c0205 127 | c37c0206 128 | c37c0207 129 | c37c0208 130 | c37c0209 131 | c37c020a 132 | c37c020b 133 | c37c020c 134 | c37c020d 135 | c37c020e 136 | c37c020f 137 | c37c020g 138 | c37c020h 139 | c37c020i 140 | c37c020j 141 | c37c020k 142 | c37c020l 143 | c37c020m 144 | c37c020n 145 | c37c020o 146 | c37c020p 147 | c37c020q 148 | c37c020r 149 | c37c020s 150 | c37c020t 151 | c37c020u 152 | c37c020v 153 | c37c020w 154 | c37c020x 155 | c37c020y 156 | c37c020z 157 | c37c0210 158 | c37c0211 159 | c37c0212 160 | c37c0213 161 | c39c0201 162 | c39c0202 163 | c39c0203 164 | c39c0204 165 | c39c0205 166 | c39c0206 167 | c39c0207 168 | c39c0208 169 | c39c0209 170 | c39c020a 171 | c39c020b 172 | c39c020c 173 | c39c020d 174 | c39c020e 175 | c39c020f 176 | c39c020g 177 | c39c020h 178 | c39c020i 179 | c39c020j 180 | c39c020k 181 | c39c020l 182 | c39c020m 183 | c39c020n 184 | c39c020o 185 | c39c020p 186 | c39c020q 187 | c39c020r 188 | c39c020s 189 | c39c020t 190 | c39c020u 191 | c39c020v 192 | c39c020w 193 | c39c020x 194 | c39c020y 195 | c39c020z 196 | c39c0210 197 | c39c0211 198 | c39c0212 199 | c3bc0201 200 | c3bc0202 201 | c3bc0203 202 | c3bc0204 203 | c3bc0205 204 | c3bc0206 205 | c3bc0207 206 | c3bc0208 207 | c3bc0209 208 | c3bc020a 209 | c3bc020b 210 | c3bc020c 211 | c3bc020d 212 | c3bc020e 213 | c3bc020f 214 | c3bc020g 215 | c3bc020h 216 | c3bc020i 217 | c3bc020j 218 | c3bc020k 219 | c3bc020l 220 | c3bc020m 221 | c3bc020n 222 | c3bc020o 223 | c3bc020p 224 | c3bc020q 225 | c3bc020r 226 | c3bc020s 227 | c3bc020t 228 | c3bc020u 229 | c3bc020v 230 | c3bc020w 231 | c3bc020x 232 | c3bc020y 233 | c3bc020z 234 | c3bc0210 235 | c3bc0211 236 | c3bc0212 237 | c3hc0201 238 | c3hc0202 239 | c3hc0203 240 | c3hc0204 241 | c3hc0205 242 | c3hc0206 243 | c3hc0207 244 | c3hc0208 245 | c3hc0209 246 | c3hc020a 247 | c3hc020b 248 | c3hc020c 249 | c3hc020d 250 | c3hc020e 251 | c3hc020f 252 | c3hc020g 253 | c3hc020h 254 | c3hc020i 255 | c3hc020j 256 | c3hc020k 257 | c3hc020l 258 | c3hc020m 259 | c3hc020n 260 | c3hc020o 261 | c3hc020p 262 | c3hc020q 263 | c3hc020r 264 | c3hc020s 265 | c3hc020t 266 | c3hc020u 267 | c3hc020v 268 | c3hc020w 269 | c3hc020x 270 | c3hc020y 271 | c3hc020z 272 | c3hc0210 273 | c3hc0211 274 | c3oc0201 275 | c3oc0202 276 | c3oc0203 277 | c3oc0204 278 | c3oc0205 279 | c3oc0206 280 | c3oc0207 281 | c3oc0208 282 | c3oc0209 283 | c3oc020a 284 | c3oc020b 285 | c3oc020c 286 | c3oc020d 287 | c3oc020e 288 | c3oc020f 289 | c3oc020g 290 | c3oc020h 291 | c3oc020i 292 | c3oc020j 293 | c3oc020k 294 | c3oc020l 295 | c3oc020m 296 | c3oc020n 297 | c3oc020o 298 | c3oc020p 299 | c3oc020q 300 | c3oc020r 301 | c3oc020s 302 | c3oc020t 303 | c3oc020u 304 | c3oc020v 305 | c3oc020w 306 | c3oc020x 307 | c3oc020y 308 | c3oc020z 309 | c3oc0210 310 | c3oc0212 311 | c3qc0201 312 | c3qc0202 313 | c3qc0203 314 | c3qc0204 315 | c3qc0205 316 | c3qc0206 317 | c3qc0207 318 | c3qc0208 319 | c3qc0209 320 | c3qc020a 321 | c3qc020b 322 | c3qc020c 323 | c3qc020d 324 | c3qc020e 325 | c3qc020f 326 | c3qc020g 327 | c3qc020h 328 | c3qc020i 329 | c3qc020j 330 | c3qc020k 331 | c3qc020l 332 | c3qc020m 333 | c3qc020n 334 | c3qc020o 335 | c3qc020p 336 | c3qc020q 337 | c3qc020r 338 | c3qc020s 339 | c3qc020t 340 | c3qc020u 341 | c3qc020v 342 | c3qc020w 343 | c3qc020x 344 | c3qc020y 345 | c3qc020z 346 | c3qc0210 347 | c3qc0211 348 | c3qc0212 349 | c3rc0201 350 | c3rc0202 351 | c3rc0203 352 | c3rc0204 353 | c3rc0205 354 | c3rc0206 355 | c3rc0207 356 | c3rc0208 357 | c3rc0209 358 | c3rc020a 359 | c3rc020b 360 | c3rc020c 361 | c3rc020d 362 | c3rc020e 363 | c3rc020f 364 | c3rc020g 365 | c3rc020h 366 | c3rc020i 367 | c3rc020j 368 | c3rc020k 369 | c3rc020l 370 | c3rc020m 371 | c3rc020n 372 | c3rc020o 373 | c3rc020p 374 | c3rc020q 375 | c3rc020r 376 | c3rc020s 377 | c3rc020t 378 | c3rc020u 379 | c3rc020v 380 | c3rc020w 381 | c3yc0201 382 | c3yc0202 383 | c3yc0203 384 | c3yc0204 385 | c3yc0205 386 | c3yc0206 387 | c3yc0207 388 | c3yc0208 389 | c3yc0209 390 | c3yc020a 391 | c3yc020b 392 | c3yc020c 393 | c3yc020d 394 | c3yc020e 395 | c3yc020f 396 | c3yc020g 397 | c3yc020h 398 | c3yc020i 399 | c3yc020j 400 | c3yc020k 401 | c3yc020l 402 | c3yc020m 403 | c3yc020n 404 | c3yc020o 405 | c3yc020p 406 | c3yc020q 407 | c3yc020r 408 | c3yc020s 409 | c3yc020t 410 | c3yc020u 411 | c3yc020v 412 | c3yc020w 413 | c3yc020x 414 | c3yc020y 415 | c3yc020z 416 | c3yc0210 417 | c3yc0211 418 | c3yc0212 419 | c3yc0213 420 | c3yc0214 421 | c46c0201 422 | c46c0202 423 | c46c0203 424 | c46c0204 425 | c46c0205 426 | c46c0206 427 | c46c0207 428 | c46c0208 429 | c46c0209 430 | c46c020a 431 | c46c020b 432 | c46c020c 433 | c46c020d 434 | c46c020e 435 | c46c020f 436 | c46c020g 437 | c46c020h 438 | c46c020i 439 | c46c020j 440 | c46c020k 441 | c46c020l 442 | c46c020m 443 | c46c020n 444 | c46c020o 445 | c46c020p 446 | c46c020q 447 | c46c020r 448 | c46c020s 449 | c46c020t 450 | c46c020u 451 | c46c020v 452 | c46c020w 453 | c46c020x 454 | c46c020y 455 | c46c020z 456 | c46c0210 457 | c46c0211 458 | c46c0212 459 | c46c0213 460 | c46c0214 461 | c48c0201 462 | c48c0202 463 | c48c0203 464 | c48c0204 465 | c48c0205 466 | c48c0206 467 | c48c0207 468 | c48c0208 469 | c48c0209 470 | c48c020a 471 | c48c020b 472 | c48c020c 473 | c48c020d 474 | c48c020e 475 | c48c020f 476 | c48c020g 477 | c48c020h 478 | c48c020i 479 | c48c020j 480 | c48c020k 481 | c48c020l 482 | c48c020m 483 | c48c020n 484 | c48c020o 485 | c48c020p 486 | c48c020q 487 | c48c020r 488 | c48c020s 489 | c48c020t 490 | c48c020u 491 | c48c020v 492 | c48c020w 493 | c48c020x 494 | c48c020y 495 | c48c020z 496 | c48c0210 497 | c48c0211 498 | c48c0212 499 | c48c0213 500 | c4ac0201 501 | c4ac0202 502 | c4ac0203 503 | c4ac0204 504 | c4ac0205 505 | c4ac0206 506 | c4ac0207 507 | c4ac0208 508 | c4ac0209 509 | c4ac020a 510 | c4ac020b 511 | c4ac020c 512 | c4ac020d 513 | c4ac020e 514 | c4ac020f 515 | c4ac020g 516 | c4ac020h 517 | c4ac020i 518 | c4ac020j 519 | c4ac020k 520 | c4ac020l 521 | c4ac020m 522 | c4ac020n 523 | c4ac020o 524 | c4ac020p 525 | c4ac020q 526 | c4ac020r 527 | c4ac020s 528 | c4ac020t 529 | c4ac020u 530 | c4ac020v 531 | c4ac020w 532 | c4ac020x 533 | c4ac020y 534 | c4ac020z 535 | c4ac0210 536 | c4ac0211 537 | c4ac0212 538 | c4ac0213 539 | -------------------------------------------------------------------------------- /data/test/near/test.list: -------------------------------------------------------------------------------- 1 | data/test/near/tfrecords/test.tfrecords 2 | -------------------------------------------------------------------------------- /data/test/real/name: -------------------------------------------------------------------------------- 1 | t21c0201 2 | t21c0202 3 | t21c0204 4 | t21c0205 5 | t21c0209 6 | t21c020b 7 | t21c020d 8 | t21c020e 9 | t21c020g 10 | t21c020i 11 | t21c020j 12 | t21c020k 13 | t21c020m 14 | t21c020n 15 | t21c020o 16 | t21c020q 17 | t22c0202 18 | t22c0205 19 | t22c0206 20 | t22c0208 21 | t22c020b 22 | t22c020d 23 | t22c020e 24 | t22c020f 25 | t22c020h 26 | t22c020i 27 | t22c020k 28 | t22c020m 29 | t22c020n 30 | t22c020p 31 | t23c0201 32 | t23c0202 33 | t23c0203 34 | t23c020b 35 | t23c020c 36 | t23c020d 37 | t23c020e 38 | t23c020f 39 | t23c020h 40 | t23c020i 41 | t23c020j 42 | t23c020k 43 | t23c020m 44 | t23c020n 45 | t23c020o 46 | t23c020p 47 | t23c020q 48 | t24c0201 49 | t24c0202 50 | t24c0203 51 | t24c0204 52 | t24c0205 53 | t24c0206 54 | t24c0207 55 | t24c0208 56 | t24c0209 57 | t24c020a 58 | t24c020b 59 | t24c020d 60 | t24c020e 61 | t24c020f 62 | t24c020h 63 | t24c020j 64 | t24c020k 65 | t24c020l 66 | t24c020m 67 | t24c020n 68 | t24c020o 69 | t24c020p 70 | t24c020q 71 | t25c0201 72 | t25c0203 73 | t25c0204 74 | t25c0205 75 | t25c0207 76 | t25c0208 77 | t25c0209 78 | t25c020b 79 | t25c020d 80 | t25c020g 81 | t25c020h 82 | t25c020i 83 | t25c020j 84 | t25c020l 85 | t25c020o 86 | t25c020p 87 | t25c020r 88 | t36c0201 89 | t36c0202 90 | t36c0203 91 | t36c0204 92 | t36c0206 93 | t36c0207 94 | t36c0208 95 | t36c020g 96 | t36c020k 97 | t36c020n 98 | t37c0202 99 | t37c0204 100 | t37c0205 101 | t37c0209 102 | t37c020a 103 | t37c020g 104 | t37c020h 105 | t37c020i 106 | t37c020j 107 | t37c020l 108 | t37c020n 109 | t37c020p 110 | t37c020r 111 | t38c0201 112 | t38c0202 113 | t38c0203 114 | t38c0204 115 | t38c0205 116 | t38c0206 117 | t38c0207 118 | t38c0208 119 | t38c0209 120 | t38c020a 121 | t38c020b 122 | t38c020c 123 | t38c020d 124 | t38c020e 125 | t38c020f 126 | t38c020h 127 | t38c020i 128 | t38c020j 129 | t38c020k 130 | t38c020l 131 | t38c020m 132 | t38c020o 133 | t38c020r 134 | t38c020s 135 | t39c0201 136 | t39c0202 137 | t39c0203 138 | t39c0204 139 | t39c0205 140 | t39c0206 141 | t39c0207 142 | t39c0208 143 | t39c0209 144 | t39c020a 145 | t39c020b 146 | t39c020c 147 | t39c020d 148 | t39c020e 149 | t39c020f 150 | t39c020g 151 | t39c020h 152 | t39c020k 153 | t39c020l 154 | t39c020n 155 | t39c020o 156 | t39c020p 157 | t39c020q 158 | t39c020r 159 | t39c020s 160 | t40c0201 161 | t40c0202 162 | t40c0203 163 | t40c0204 164 | t40c0205 165 | t40c0206 166 | t40c0207 167 | t40c0208 168 | t40c0209 169 | t40c020a 170 | t40c020b 171 | t40c020c 172 | t40c020d 173 | t40c020e 174 | t40c020f 175 | t40c020g 176 | t40c020h 177 | t40c020i 178 | t40c020k 179 | t40c020l 180 | t40c020m 181 | t40c020n 182 | t40c020o 183 | t40c020p 184 | t40c020q 185 | t40c020r 186 | t40c020s 187 | t21c0206 188 | t21c0207 189 | t21c0208 190 | t21c020a 191 | t21c020c 192 | t21c020f 193 | t21c020h 194 | t21c020l 195 | t21c020p 196 | t21c020r 197 | t21c020s 198 | t21c020t 199 | t21c020u 200 | t21c020v 201 | t21c020w 202 | t21c020x 203 | t21c020y 204 | t21c020z 205 | t21c0210 206 | t21c0211 207 | t21c0212 208 | t22c0201 209 | t22c0203 210 | t22c0204 211 | t22c0207 212 | t22c0209 213 | t22c020a 214 | t22c020c 215 | t22c020g 216 | t22c020j 217 | t22c020l 218 | t22c020o 219 | t22c020q 220 | t22c020r 221 | t22c020s 222 | t22c020t 223 | t22c020u 224 | t22c020v 225 | t22c020w 226 | t22c020x 227 | t22c020y 228 | t22c020z 229 | t22c0210 230 | t22c0211 231 | t23c0204 232 | t23c0205 233 | t23c0206 234 | t23c0207 235 | t23c0208 236 | t23c0209 237 | t23c020a 238 | t23c020g 239 | t23c020l 240 | t23c020r 241 | t23c020s 242 | t23c020t 243 | t23c020u 244 | t23c020v 245 | t23c020w 246 | t23c020x 247 | t23c020y 248 | t23c020z 249 | t23c0210 250 | t23c0211 251 | t23c0212 252 | t24c020c 253 | t24c020i 254 | t24c020r 255 | t24c020s 256 | t24c020t 257 | t24c020u 258 | t24c020v 259 | t24c020w 260 | t24c020y 261 | t24c020z 262 | t24c0210 263 | t24c0211 264 | t24c0212 265 | t25c0202 266 | t25c0206 267 | t25c020a 268 | t25c020c 269 | t25c020e 270 | t25c020f 271 | t25c020k 272 | t25c020m 273 | t25c020n 274 | t25c020q 275 | t25c020s 276 | t25c020t 277 | t25c020u 278 | t25c020v 279 | t25c020w 280 | t25c020x 281 | t25c020y 282 | t25c020z 283 | t25c0210 284 | t25c0211 285 | t25c0212 286 | t25c0213 287 | t25c0214 288 | t25c0215 289 | t36c0205 290 | t36c0209 291 | t36c020a 292 | t36c020b 293 | t36c020c 294 | t36c020d 295 | t36c020e 296 | t36c020f 297 | t36c020h 298 | t36c020i 299 | t36c020l 300 | t36c020m 301 | t36c020o 302 | t36c020p 303 | t36c020r 304 | t36c020s 305 | t36c020t 306 | t36c020u 307 | t36c020w 308 | t36c020x 309 | t36c020y 310 | t36c020z 311 | t36c0210 312 | t36c0211 313 | t36c0212 314 | t36c0213 315 | t36c0214 316 | t37c0203 317 | t37c0206 318 | t37c0208 319 | t37c020b 320 | t37c020c 321 | t37c020d 322 | t37c020e 323 | t37c020f 324 | t37c020k 325 | t37c020m 326 | t37c020o 327 | t37c020q 328 | t37c020s 329 | t37c020t 330 | t37c020u 331 | t37c020v 332 | t37c020w 333 | t37c020x 334 | t37c020y 335 | t37c020z 336 | t37c0210 337 | t37c0212 338 | t37c0213 339 | t38c020n 340 | t38c020t 341 | t38c020u 342 | t38c020v 343 | t38c020w 344 | t38c020x 345 | t38c020y 346 | t38c020z 347 | t38c0210 348 | t38c0211 349 | t38c0212 350 | t38c0214 351 | t39c020t 352 | t39c020u 353 | t39c020v 354 | t39c020x 355 | t39c020y 356 | t39c020z 357 | t39c0210 358 | t39c0211 359 | t39c0212 360 | t39c0213 361 | t39c0214 362 | t40c020t 363 | t40c020u 364 | t40c020v 365 | t40c020w 366 | t40c020x 367 | t40c020y 368 | t40c020z 369 | t40c0210 370 | t40c0211 371 | t40c0213 372 | t40c0214 373 | -------------------------------------------------------------------------------- /data/test/real/test.list: -------------------------------------------------------------------------------- 1 | data/test/real/tfrecords/test.tfrecords 2 | -------------------------------------------------------------------------------- /data/tr.list: -------------------------------------------------------------------------------- 1 | data/tfrecords/tr.tfrecords 2 | -------------------------------------------------------------------------------- /data/train_cmvn.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/data/train_cmvn.npz -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019.7 Nan LEE 2 | 3 | import argparse 4 | import os 5 | import csv 6 | import numpy as np 7 | import cPickle 8 | #import matplotlib.pyplot as plt 9 | 10 | 11 | def calculate_pesq(args): 12 | """Calculate PESQ of all enhaced speech. 13 | 14 | Args: 15 | workspace: str, path of workspace. 16 | speech_dir: str, path of clean speech. 17 | te_snr: float, testing SNR. 18 | """ 19 | workspace = args.workspace 20 | speech_dir = args.speech_dir 21 | #te_snr = args.te_snr 22 | type = args.type 23 | # Remove already existed file. 24 | os.system('rm _pesq_itu_results.txt') 25 | os.system('rm _pesq_results.txt') 26 | 27 | # Calculate PESQ of all enhaced speech. 28 | enh_speech_dir = os.path.join(workspace, "enh_wavs", type) 29 | names = os.listdir(enh_speech_dir) 30 | for (cnt, na) in enumerate(names): 31 | print(cnt, na) 32 | enh_path = os.path.join(enh_speech_dir, na) 33 | 34 | speech_na = na.split('.')[0] 35 | speech_path = os.path.join(speech_dir, "%s.wav" % speech_na) 36 | print(speech_path) 37 | print(enh_path) 38 | # Call executable PESQ tool. 39 | cmd = ' '.join(["./pesq", speech_path, enh_path, "+16000"]) 40 | os.system(cmd) 41 | 42 | 43 | def get_stats(args): 44 | """Calculate stats of PESQ. 45 | """ 46 | pesq_path = "_pesq_results.txt" 47 | with open(pesq_path, 'rb') as f: 48 | reader = csv.reader(f, delimiter='\t') 49 | lis = list(reader) 50 | 51 | pesq_dict = {} 52 | for i1 in xrange(1, len(lis) - 1): 53 | li = lis[i1] 54 | na = li[0] 55 | pesq = float(li[1]) 56 | noise_type = na.split('.')[1] 57 | if noise_type not in pesq_dict.keys(): 58 | pesq_dict[noise_type] = [pesq] 59 | else: 60 | pesq_dict[noise_type].append(pesq) 61 | 62 | avg_list, std_list = [], [] 63 | f = "{0:<16} {1:<16}" 64 | print(f.format("Noise", "PESQ")) 65 | print("---------------------------------") 66 | for noise_type in pesq_dict.keys(): 67 | pesqs = pesq_dict[noise_type] 68 | avg_pesq = np.mean(pesqs) 69 | std_pesq = np.std(pesqs) 70 | avg_list.append(avg_pesq) 71 | std_list.append(std_pesq) 72 | print(f.format(noise_type, "%.2f +- %.2f" % (avg_pesq, std_pesq))) 73 | print("---------------------------------") 74 | print(f.format("Avg.", "%.2f +- %.2f" % (np.mean(avg_list), np.mean(std_list)))) 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = argparse.ArgumentParser() 79 | subparsers = parser.add_subparsers(dest='mode') 80 | 81 | parser_plot_training_stat = subparsers.add_parser('plot_training_stat') 82 | parser_plot_training_stat.add_argument('--workspace', type=str, required=True) 83 | #parser_plot_training_stat.add_argument('--tr_snr', type=float, required=True) 84 | parser_plot_training_stat.add_argument('--bgn_iter', type=int, required=True) 85 | parser_plot_training_stat.add_argument('--fin_iter', type=int, required=True) 86 | parser_plot_training_stat.add_argument('--interval_iter', type=int, required=True) 87 | 88 | parser_calculate_pesq = subparsers.add_parser('calculate_pesq') 89 | parser_calculate_pesq.add_argument('--type', type=str, required=True) 90 | parser_calculate_pesq.add_argument('--workspace', type=str, required=True) 91 | parser_calculate_pesq.add_argument('--speech_dir', type=str, required=True) 92 | #parser_calculate_pesq.add_argument('--te_snr', type=float, required=True) 93 | 94 | parser_get_stats = subparsers.add_parser('get_stats') 95 | 96 | args = parser.parse_args() 97 | 98 | if args.mode == 'plot_training_stat': 99 | plot_training_stat(args) 100 | elif args.mode == 'calculate_pesq': 101 | calculate_pesq(args) 102 | elif args.mode == 'get_stats': 103 | get_stats(args) 104 | else: 105 | raise Exception("Error!") 106 | -------------------------------------------------------------------------------- /ex_trac.sh: -------------------------------------------------------------------------------- 1 | train_file_dir=data/train_dir 2 | test_file_dir=data/test_dir 3 | val_size=1000 4 | #####################Data prepare######################## 5 | find $train_file_dir -name '*.wav' > data/all_wav 6 | find $test_file_dir -name '*.wav' > data/test_wav 7 | for dataset in data/all_wav data/test_wav;do 8 | awk -F '/' '{print $NF}' $dataset | awk -F '.' '{print $1}' > data/wav_name 9 | past -d ' ' data/wav_name $dataset > data/`echo $dataset | awk -F '/' '{print $2}'`.txt 10 | rm data/wav_name 11 | done 12 | python scripts/get_train_val_scp.py --data_dir=$data --val_size $val_size 13 | echo "Finish data prepare!" 14 | date 15 | ######################################################## 16 | 17 | python2.7 pre_process_data.py calculate_train_features --train_speech_path="data/tr/inputs.txt" --data_type=train 18 | python2.7 pre_process_data.py calculate_train_features --train_speech_path="data/cv/inputs.txt" --data_type=cv 19 | python2.7 pre_process_test.py calculate_train_features --train_speech_path="data/test/real/rev_dir.txt" --data_type=test/real 20 | -------------------------------------------------------------------------------- /io_funcs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/io_funcs/__init__.py -------------------------------------------------------------------------------- /io_funcs/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/io_funcs/__init__.pyc -------------------------------------------------------------------------------- /io_funcs/__pycache__/kaldi_io.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/io_funcs/__pycache__/kaldi_io.cpython-35.pyc -------------------------------------------------------------------------------- /io_funcs/convert_cmvn_to_numpy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang 5 | 6 | """Convert inputs and lables GLOBAL cmvns to a Numpy file.""" 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import argparse 13 | import os 14 | import sys 15 | import struct 16 | 17 | import numpy as np 18 | 19 | def convert_cmvn_to_numpy(inputs_cmvn, labels_cmvn, save_dir): 20 | """Convert global binary ark cmvn to numpy format.""" 21 | 22 | print("Convert %s and %s to Numpy format" % (inputs_cmvn, labels_cmvn)) 23 | inputs_filename = inputs_cmvn 24 | labels_filename = labels_cmvn 25 | 26 | inputs = read_binary_file(inputs_filename, 0) 27 | labels = read_binary_file(labels_filename, 0) 28 | 29 | inputs_frame = inputs[0][-1] 30 | labels_frame = labels[0][-1] 31 | 32 | # assert inputs_frame == labels_frame 33 | 34 | cmvn_inputs = np.hsplit(inputs, [inputs.shape[1] - 1])[0] 35 | cmvn_labels = np.hsplit(labels, [labels.shape[1] - 1])[0] 36 | 37 | mean_inputs = cmvn_inputs[0] / inputs_frame 38 | stddev_inputs = np.sqrt(cmvn_inputs[1] / inputs_frame - mean_inputs ** 2) 39 | mean_labels = cmvn_labels[0] / labels_frame 40 | stddev_labels = np.sqrt(cmvn_labels[1] / labels_frame - mean_labels ** 2) 41 | 42 | cmvn_name = os.path.join(save_dir, "train_cmvn.npz") 43 | np.savez(cmvn_name, 44 | mean_inputs=mean_inputs, 45 | stddev_inputs=stddev_inputs, 46 | mean_labels=mean_labels, 47 | stddev_labels=stddev_labels) 48 | 49 | print("Write to %s" % cmvn_name) 50 | 51 | 52 | def read_binary_file(filename, offset=0): 53 | """Read data from matlab binary file (row, col and matrix). 54 | 55 | Returns: 56 | A numpy matrix containing data of the given binary file. 57 | """ 58 | read_buffer = open(filename, 'rb') 59 | read_buffer.seek(int(offset), 0) 60 | header = struct.unpack(' 1 40 | % disp(na); 41 | %end; 42 | y = clean_x(1:length(enh_x)); 43 | wrt_path = [save_path, clean_na]; 44 | audiowrite(wrt_path,y,Fs); 45 | end; 46 | %disp(x); 47 | %disp(Fs); 48 | end; 49 | -------------------------------------------------------------------------------- /io_funcs/kaldi_io.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang Xiaomi 5 | 6 | """IO classes for reading and writing kaldi .ark 7 | 8 | This module provides io interfaces for reading and writing kaldi .ark files. 9 | Currently, this module only supports binary-formatted .ark files. Text .ark 10 | files are not supported. 11 | 12 | To use this module, you need to provide kaldi .scp files only. The .ark 13 | locations with corresponding offsets can be retrieved from .scp files. 14 | """ 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import sys 20 | import struct 21 | import random 22 | import numpy as np 23 | 24 | class GlobalHeader(object): 25 | """ Compress ark format header. """ 26 | def __init__(self, format, header): 27 | self.format = format 28 | self.min_value = header[0] 29 | self.range = header[1] 30 | self.num_rows = header[2] 31 | self.num_cols = header[3] 32 | 33 | class PerColHeader(object): 34 | """ Compress ark format per column header. """ 35 | def __init__(self, header): 36 | self.percentile_0 = header[0] 37 | self.percentile_25 = header[1] 38 | self.percentile_75 = header[2] 39 | self.percentile_100 = header[3] 40 | 41 | class ArkReader(object): 42 | """ Class to read Kaldi ark format. 43 | 44 | Each time, it reads one line of the .scp file and reads in the 45 | corresponding features into a numpy matrix. It only supports 46 | binary-formatted .ark files. Text files are not supported. 47 | 48 | Attributes: 49 | utt_ids: A list saving utterance identities. 50 | scp_data: A list saving .ark path and offset for items in utt_ids. 51 | scp_position: An integer indicating which utt_id and correspoding 52 | scp_data will be read next. 53 | """ 54 | 55 | def __init__(self, name="ArkReader"): 56 | self.name = name 57 | 58 | def __call__(self, scp_path): 59 | """Init utt_ids along with scp_data according to .scp file.""" 60 | self.scp_position = 0 61 | fin = open(scp_path,"r") 62 | self.utt_ids = [] 63 | self.scp_data = [] 64 | line = fin.readline() 65 | while line != '' and line != None: 66 | utt_id, path_pos = line.replace('\n','').split(' ') 67 | path, pos = path_pos.split(':') 68 | self.utt_ids.append(utt_id) 69 | self.scp_data.append((path, pos)) 70 | line = fin.readline() 71 | 72 | fin.close() 73 | 74 | def shuffle(self): 75 | """Shuffle utt_ids along with scp_data and reset scp_position.""" 76 | zipped = zip(self.utt_ids, self.scp_data) 77 | random.shuffle(zipped) 78 | self.utt_ids, self.scp_data = zip(*zipped) # unzip and assign 79 | self.scp_position = 0 80 | 81 | def read_ark(self, ark_file, ark_offset=0): 82 | """Read data from the archive (.ark from kaldi). 83 | 84 | Returns: 85 | A numpy matrix containing data of ark_file. 86 | """ 87 | ark_read_buffer = open(ark_file, 'rb') 88 | ark_read_buffer.seek(int(ark_offset), 0) 89 | header = struct.unpack('= len(self.scp_data): #if at end of file loop around 175 | looped = True 176 | self.scp_position = 0 177 | else: 178 | looped = False 179 | 180 | self.scp_position += 1 181 | 182 | utt_ids = self.utt_ids[self.scp_position-1] 183 | utt_data = self.read_utt_data_from_index(self.scp_position-1) 184 | 185 | return utt_ids, utt_data, looped 186 | 187 | def read_next_scp(self): 188 | """Read the next utterance ID but don't read the data. 189 | 190 | Returns: 191 | The utterance ID of the utterance that was read. 192 | """ 193 | if self.scp_position >= len(self.scp_data): #if at end of file loop around 194 | self.scp_position = 0 195 | 196 | self.scp_position += 1 197 | 198 | return self.utt_ids[self.scp_position-1] 199 | 200 | def read_previous_scp(self): 201 | """Read the previous utterance ID but don't read the data. 202 | 203 | Returns: 204 | The utterance ID of the utterance that was read. 205 | """ 206 | if self.scp_position < 0: #if at beginning of file loop around 207 | self.scp_position = len(self.scp_data) - 1 208 | 209 | self.scp_position -= 1 210 | 211 | return self.utt_ids[self.scp_position+1] 212 | 213 | def read_utt_data_from_id(self, utt_id): 214 | """Read the data of a certain utterance ID. 215 | 216 | Args: 217 | utt_id: A string indicating a certain utterance ID. 218 | 219 | Returns: 220 | A numpy array containing the utterance data corresponding to the ID. 221 | """ 222 | utt_mat = self.read_utt_data_from_index(self.utt_ids.index(utt_id)) 223 | 224 | return utt_mat 225 | 226 | def read_utt_data_from_index(self, index): 227 | """Read the data of a certain index. 228 | 229 | Args: 230 | index: A integer index corresponding to a certain utterance ID. 231 | 232 | Returns: 233 | A numpy array containing the utterance data corresponding to the 234 | index. 235 | """ 236 | return self.read_ark(self.scp_data[index][0], self.scp_data[index][1]) 237 | 238 | def split(self): 239 | """Split of the data that was read so far.""" 240 | self.scp_data = self.scp_data[self.scp_position:-1] 241 | self.utt_ids = self.utt_ids[self.scp_position:-1] 242 | 243 | 244 | class ArkWriter(object): 245 | """Class to write numpy matrices into Kaldi .ark file and create the 246 | corresponding .scp file. It only supports binary-formatted .ark files. 247 | Text and compressed .ark files are not supported. 248 | 249 | Attributes: 250 | scp_path: The path to the .scp file that will be written. 251 | scp_file_write: The file object corresponds to scp_path. 252 | 253 | """ 254 | 255 | def __init__(self, scp_path): 256 | """Arkwriter constructor.""" 257 | self.scp_path = scp_path 258 | self.scp_file_write = open(self.scp_path, "w") 259 | 260 | def write_next_utt(self, ark_path, utt_id, utt_mat): 261 | """Read an utterance to the archive. 262 | 263 | Args: 264 | ark_path: Path to the .ark file that will be used for writing. 265 | utt_id: The utterance ID. 266 | utt_mat: A numpy array containing the utterance data. 267 | """ 268 | ark_file_write = open(ark_path,"ab") 269 | utt_mat = np.asarray(utt_mat, dtype=np.float32) 270 | rows, cols = utt_mat.shape 271 | ark_file_write.write(struct.pack('<%ds'%(len(utt_id)), utt_id)) 272 | pos = ark_file_write.tell() 273 | ark_file_write.write(struct.pack(' 0.0 and is_training: 68 | weights_regularizer = l2_regularizer(dnn.l2_scale) 69 | else: 70 | weights_regularizer = None 71 | dnn.keep_prob = 1.0 72 | 73 | if not reuse: 74 | print("****************************************") 75 | print("*** Generator summary ***") 76 | print("G inputs shape: {}".format(inputs.get_shape())) 77 | sys.stdout.flush() 78 | 79 | h = fully_connected(inputs, units, 80 | activation_fn=activation_fn, 81 | normalizer_fn=normalizer_fn, 82 | normalizer_params=normalizer_params, 83 | weights_initializer=xavier_initializer(), 84 | weights_regularizer=weights_regularizer, 85 | biases_initializer=tf.zeros_initializer()) 86 | h = self.dropout(h, dnn.keep_prob) 87 | if not reuse: 88 | print("G layer 1 output shape: {}".format(h.get_shape()), end=" *** ") 89 | self.nnet_info(normalizer_fn, dnn.keep_prob, weights_regularizer) 90 | 91 | for layer in range(hidden_layers): 92 | h = fully_connected(h, units, 93 | activation_fn=activation_fn, 94 | normalizer_fn=normalizer_fn, 95 | normalizer_params=normalizer_params, 96 | weights_initializer=xavier_initializer(), 97 | weights_regularizer=weights_regularizer, 98 | biases_initializer=tf.zeros_initializer()) 99 | h = self.dropout(h, dnn.keep_prob) 100 | if not reuse: 101 | print("G layer {} output shape: {}".format( 102 | layer+2, h.get_shape()), end=" *** ") 103 | self.nnet_info(normalizer_fn, dnn.keep_prob, weights_regularizer) 104 | 105 | # Linear output 106 | y = fully_connected(h, dnn.output_dim, 107 | activation_fn=None, 108 | weights_initializer=xavier_initializer(), 109 | weights_regularizer=weights_regularizer, 110 | biases_initializer=tf.zeros_initializer()) 111 | if not reuse: 112 | print("G output shape: {}".format(y.get_shape())) 113 | sys.stdout.flush() 114 | return y 115 | 116 | def dropout(self, x, keep_prob): 117 | if keep_prob != 1.0: 118 | y = tf.nn.dropout(x, keep_prob) 119 | else: 120 | y = x 121 | return y 122 | 123 | def nnet_info(self, batch_norm, keep_prob, weights_regularizer): 124 | if batch_norm is not None: 125 | print("use batch normalization", end=" *** ") 126 | if keep_prob != 1.0: 127 | print("keep prob is {}".format(keep_prob), end=" *** ") 128 | if weights_regularizer is not None: 129 | print("L2 regularizer scale is {}".format(self.dnn.l2_scale), end=" *** ") 130 | print() 131 | -------------------------------------------------------------------------------- /models/dnn_trainer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019.1 Nan LEE 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os 11 | import sys 12 | 13 | import numpy as np 14 | 15 | sys.path.append(os.path.dirname(sys.path[0])) 16 | from models.dnn import * 17 | from models.resnet_rced import * 18 | from utils.ops import * 19 | from keras.backend.tensorflow_backend import set_session 20 | config = tf.ConfigProto() 21 | config.gpu_options.per_process_gpu_memory_fraction = 0.4 22 | # config.gpu_options.allow_growth = True 23 | set_session(tf.Session(config=config)) 24 | 25 | class Model(object): 26 | 27 | def __init__(self, name='BaseModel'): 28 | self.name = name 29 | 30 | def save(self, save_dir, step): 31 | model_name = self.name 32 | if not os.path.exists(save_dir): 33 | os.makedirs(save_dir) 34 | if not hasattr(self, 'saver'): 35 | self.saver = tf.train.Saver() 36 | self.saver.save(self.sess, 37 | os.path.join(save_dir, model_name), 38 | global_step=step) 39 | 40 | def load(self, save_dir, model_file=None, moving_average=False): 41 | if not os.path.exists(save_dir): 42 | print('[!] Checkpoints path does not exist...') 43 | return False 44 | print('[*] Reading checkpoints...') 45 | if model_file is None: 46 | ckpt = tf.train.get_checkpoint_state(save_dir) 47 | if ckpt and ckpt.model_checkpoint_path: 48 | ckpt_name = os.path.basename(ckpt.model_checkpoint_path) 49 | else: 50 | return False 51 | else: 52 | ckpt_name = model_file 53 | 54 | if moving_average: 55 | # Restore the moving average version of the learned variables for eval. 56 | variable_averages = tf.train.ExponentialMovingAverage( 57 | self.MOVING_AVERAGE_DECAY) 58 | variables_to_restore = variable_averages.variables_to_restore() 59 | saver = tf.train.Saver(variables_to_restore) 60 | else: 61 | saver = tf.train.Saver() 62 | saver.restore(self.sess, os.path.join(save_dir, ckpt_name)) 63 | print('[*] Read {}'.format(ckpt_name)) 64 | return True 65 | 66 | 67 | class DNNTrainer(Model): 68 | def __init__(self, sess, args, devices, 69 | inputs, labels, cross_validation=False, name='DNNTrainer'): 70 | super(DNNTrainer, self).__init__(name) 71 | self.sess = sess 72 | self.cross_validation = cross_validation 73 | self.MOVING_AVERAGE_DECAY = 0.9999 74 | if cross_validation: 75 | self.keep_prob = 1.0 76 | else: 77 | self.keep_prob = args.keep_prob 78 | self.batch_norm = args.batch_norm 79 | self.batch_size = args.batch_size 80 | self.devices = devices 81 | self.save_dir = args.save_dir 82 | self.writer = tf.summary.FileWriter(os.path.join( 83 | args.save_dir,'train'), sess.graph) 84 | self.l2_scale = args.l2_scale 85 | # data 86 | self.input_dim = args.input_dim 87 | self.output_dim = args.output_dim 88 | self.left_context = args.left_context 89 | self.right_context = args.right_context 90 | self.batch_size = args.batch_size 91 | # Batch Normalization 92 | self.batch_norm = args.batch_norm 93 | self.g_disturb_weights = False 94 | # define the functions 95 | self.g_learning_rate = tf.Variable(args.g_learning_rate, trainable=False) 96 | if args.g_type == 'dnn': 97 | self.generator = DNN(self) 98 | elif args.g_type == 'res_rced': 99 | self.generator = R_RCED(self) 100 | else: 101 | raise ValueError('Unrecognized G type {}'.format(args.g_type)) 102 | if labels is None: 103 | self.generator(inputs, labels, reuse=False) 104 | else: 105 | self.build_model(inputs, labels) 106 | 107 | def build_model(self, inputs, labels): 108 | all_g_grads = [] 109 | # g_opt = tf.train.RMSPropOptimizer(self.g_learning_rate) 110 | # g_opt = tf.train.GradientDescentOptimizer(self.g_learning_rate) 111 | g_opt = tf.train.AdamOptimizer(self.g_learning_rate) 112 | # Track the moving averages of all trainable variables. 113 | variable_averages = tf.train.ExponentialMovingAverage( 114 | self.MOVING_AVERAGE_DECAY) 115 | 116 | with tf.variable_scope(tf.get_variable_scope()): 117 | for idx, device in enumerate(self.devices): 118 | with tf.device("/%s" % device): 119 | with tf.name_scope("device_%s" % idx): 120 | with variables_on_gpu(): 121 | self.build_model_single_gpu(idx, inputs, labels) 122 | tf.get_variable_scope().reuse_variables() 123 | if not self.cross_validation: 124 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 125 | with tf.control_dependencies(update_ops): 126 | g_grads = g_opt.compute_gradients( 127 | self.g_losses[-1], var_list=self.g_vars) 128 | all_g_grads.append(g_grads) 129 | if not self.cross_validation: 130 | avg_g_grads = average_gradients(all_g_grads) 131 | g_apply_gradient_op = g_opt.apply_gradients(avg_g_grads) 132 | variables_averages_op = variable_averages.apply( 133 | tf.trainable_variables()) 134 | # Group all updates to into a single train op. 135 | self.g_opt = tf.group(g_apply_gradient_op, variables_averages_op) 136 | 137 | 138 | def build_model_single_gpu(self, gpu_idx, inputs, labels): 139 | if gpu_idx == 0: 140 | g = self.generator(inputs, labels, reuse=False) 141 | 142 | g = self.generator(inputs, labels, reuse=True) 143 | 144 | if gpu_idx == 0: 145 | self.g_losses = [] 146 | self.g_mse_losses = [] 147 | self.g_l2_losses = [] 148 | 149 | g_mse_loss = tf.losses.mean_squared_error(g, labels) 150 | if not self.cross_validation and self.l2_scale > 0.0: 151 | reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, '.*g_model') 152 | g_l2_loss = tf.reduce_sum(reg_losses) 153 | else: 154 | g_l2_loss = tf.constant(0.0) 155 | g_loss = g_mse_loss + g_l2_loss 156 | 157 | self.g_mse_losses.append(g_mse_loss) 158 | self.g_l2_losses.append(g_l2_loss) 159 | self.g_losses.append(g_loss) 160 | 161 | self.g_mse_loss_summ = scalar_summary("g_mse_loss", g_mse_loss) 162 | self.g_l2_loss_summ = scalar_summary("g_l2_loss", g_l2_loss) 163 | self.g_loss_summ = scalar_summary("g_loss", g_loss) 164 | 165 | summaries = [self.g_mse_loss_summ, 166 | self.g_l2_loss_summ, 167 | self.g_loss_summ] 168 | 169 | self.summaries = tf.summary.merge(summaries) 170 | 171 | if gpu_idx == 0: 172 | self.get_vars() 173 | 174 | def get_vars(self): 175 | t_vars = tf.trainable_variables() 176 | self.g_vars_dict = {} 177 | for var in t_vars: 178 | if var.name.startswith('g_'): 179 | self.g_vars_dict[var.name] = var 180 | self.g_vars = self.g_vars_dict.values() 181 | self.all_vars = t_vars 182 | if self.g_disturb_weights and not self.cross_validation: 183 | stddev = 0.00001 184 | print("Add Gaussian noise to G weights (stddev = %s)" % (stddev)) 185 | sys.stdout.flush() 186 | self.g_disturb = [v.assign( 187 | tf.add(v, tf.truncated_normal([], 0, stddev))) for v in self.g_vars] 188 | else: 189 | print("Not add noise to G weights") 190 | -------------------------------------------------------------------------------- /models/resnet_rced.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019.7 Lee 5 | 6 | """Redundant Convolutional Encoder Decoder (R-CED) 7 | A fully convolutional neural network for speech enhancement(https://arxiv.org/pdf/1609.07132). 8 | """ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import sys 15 | import numpy as np 16 | import tensorflow as tf 17 | from tensorflow.contrib.layers import xavier_initializer, l2_regularizer 18 | from tensorflow.contrib.layers import batch_norm, fully_connected 19 | 20 | class R_RCED(object): 21 | 22 | def __init__(self, rced): 23 | self.rced = rced 24 | 25 | def __call__(self, inputs, labels, reuse=False): 26 | """Build CNN models. On first pass will make vars.""" 27 | self.inputs = inputs 28 | self.labels = labels 29 | print("-----------------------------inputs--------") 30 | print(np.shape(inputs)) 31 | self.inputs_O = inputs 32 | outputs = self.infer(reuse) 33 | print(np.shape(outputs)) 34 | return outputs 35 | 36 | def infer(self, reuse): 37 | rced = self.rced 38 | activation_fn = tf.nn.relu 39 | is_training = True 40 | 41 | input_dim = rced.input_dim 42 | left_context = rced.left_context 43 | right_context = rced.right_context 44 | splice_dim = left_context + 1 + right_context 45 | #inputs_O = self.inputs 46 | in_dims = self.inputs.get_shape().as_list() 47 | if len(in_dims) == 2: 48 | # shape format [batch, width] 49 | dims = self.inputs.get_shape().as_list() 50 | assert dims[0] == rced.batch_size 51 | inputs = tf.reshape(self.inputs, [dims[0], splice_dim, input_dim]) 52 | inputs = tf.expand_dims(inputs, -1) 53 | elif len(in_dims) == 3: 54 | # shape format [batch, length, width] 55 | dims = self.inputs.get_shape().as_list() 56 | assert dims[0] == 1 57 | inputs = tf.squeeze(self.inputs, [0]) 58 | inputs = tf.reshape(self.inputs, [-1, splice_dim, input_dim]) 59 | inputs = tf.expand_dims(inputs, -1) 60 | 61 | # If test of cv , BN should use global mean / stddev 62 | if rced.cross_validation: 63 | is_training = False 64 | 65 | with tf.variable_scope('g_model') as scope: 66 | if reuse: 67 | scope.reuse_variables() 68 | 69 | if rced.batch_norm: 70 | normalizer_fn = batch_norm 71 | normalizer_params = { 72 | "is_training": is_training, 73 | "scale": True, 74 | "renorm": True 75 | } 76 | else: 77 | normalizer_fn = None 78 | normalizer_params = None 79 | 80 | if rced.l2_scale > 0.0 and is_training: 81 | weights_regularizer = l2_regularizer(rced.l2_scale) 82 | else: 83 | weights_regularizer = None 84 | keep_prob = 1.0 85 | 86 | if not reuse: 87 | print("*** Generator summary ***") 88 | print("G inputs shape: {}".format(inputs.get_shape())) 89 | 90 | # inputs format [batch, in_height, in_width, in_channels] 91 | # filters format [filter_height, filter_width, in_channels, out_channels] 92 | filters_num = [12, 12, 24, 24, 32, 64, 32, 24, 24, 12, 12] 93 | filters_width = [13, 11, 9, 7, 7, 7, 7 ,7, 9, 11, 13] 94 | assert len(filters_num) == len(filters_num) 95 | inputs_O = tf.reshape(inputs, [-1, splice_dim * input_dim]) 96 | inputs_0 = tf.contrib.layers.conv2d(inputs, filters_num[0],[splice_dim, filters_width[0]],activation_fn=activation_fn, 97 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 98 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 99 | #inputs_333 = inputs + inputs_0 100 | inputs_1 = tf.contrib.layers.conv2d(inputs_0, filters_num[1],[splice_dim, filters_width[1]],activation_fn=activation_fn, 101 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 102 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 103 | # #inputs_1 = inputs_1 + inputs_0 104 | #inputs_1=tf.layers.max_pooling2d(inputs=inputs_1, pool_size=[2, 2], strides=2,padding = 'valid') 105 | inputs_2 = tf.contrib.layers.conv2d(inputs_1, filters_num[2],[splice_dim, filters_width[2]],activation_fn=activation_fn, 106 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 107 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 108 | #inputs_2 = inputs_2 + inputs_1 109 | inputs_3 = tf.contrib.layers.conv2d(inputs_2, filters_num[3],[splice_dim, filters_width[3]],activation_fn=activation_fn, 110 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 111 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 112 | # #inputs_3 = inputs_3 + inputs_2 113 | #inputs_3=tf.layers.max_pooling2d(inputs=inputs_3, pool_size=[2, 2], strides=2,padding = 'valid') 114 | inputs_4 = tf.contrib.layers.conv2d(inputs_3, filters_num[4],[splice_dim, filters_width[4]],activation_fn=activation_fn, 115 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 116 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 117 | inputs_5 = tf.contrib.layers.conv2d(inputs_3, filters_num[5],[splice_dim, filters_width[5]],activation_fn=activation_fn, 118 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 119 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 120 | #inputs_4 = inputs_4 + inputs_3 121 | inputs_6 = tf.contrib.layers.conv2d(inputs_4, filters_num[6],[splice_dim, filters_width[6]],activation_fn=activation_fn, 122 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 123 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 124 | # #inputs_5 = inputs_5 + inputs_4 125 | #inputs_5=tf.layers.max_pooling2d(inputs=inputs_5, pool_size=[2, 2], strides=2) 126 | #inputs_5=tf.layers.conv2d_transpose(inputs_5,filters = filters_num[6],kernel_size= (2,2),strides= (2,2),padding= 'valid',activation= tf.nn.relu) 127 | inputs_7 = tf.contrib.layers.conv2d(inputs_5, filters_num[7],[splice_dim, filters_width[7]],activation_fn=activation_fn, 128 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 129 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 130 | 131 | #inputs_7 = inputs_7 + inputs_3 132 | inputs_8 = tf.contrib.layers.conv2d(inputs_6, filters_num[8],[splice_dim, filters_width[8]],activation_fn=activation_fn, 133 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 134 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 135 | 136 | #inputs_8 = inputs_8 + inputs_6 137 | #inputs_7=tf.layers.conv2d_transpose(inputs_7,filters = filters_num[6],kernel_size= (2,2),strides= (2,2),padding= 'valid',activation= tf.nn.relu) 138 | #inputs_7=tf.layers.max_pooling2d(inputs=inputs_7, pool_size=[2, 2], strides=2) 139 | inputs_8 = tf.contrib.layers.conv2d(inputs_7, filters_num[8],[splice_dim, filters_width[8]],activation_fn=activation_fn, 140 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 141 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 142 | 143 | inputs_9 = tf.contrib.layers.conv2d(inputs_8, filters_num[9],[splice_dim, filters_width[9]],activation_fn=activation_fn, 144 | normalizer_fn=normalizer_fn, normalizer_params=normalizer_params,weights_initializer=xavier_initializer(), 145 | weights_regularizer=weights_regularizer,biases_initializer=tf.zeros_initializer()) 146 | #inputs_9=tf.layers.max_pooling2d(inputs=inputs_9, pool_size=[2, 2], strides=2) 147 | #inputs_9 = inputs_9 + inputs_8 148 | print("***********shaper---------------------") 149 | print(np.shape(inputs_9)) 150 | 151 | # name_I = "inputs_"+str(len(filters_num)+1) 152 | # inputs = name_I 153 | # Linear output 154 | # inputs = tf.reshape(inputs, [rced.batch_size, -1]) 155 | inputs_D = tf.reshape(inputs_9, [-1, 11 * 257 * filters_num[-1]]) 156 | print("***********reshaper------------after---------") 157 | print(np.shape(inputs_D)) 158 | 159 | inputs_D = tf.concat([inputs_D, inputs_O],1) 160 | y = fully_connected(inputs_D, 257, 161 | activation_fn=None, 162 | weights_initializer=xavier_initializer(), 163 | weights_regularizer=weights_regularizer, 164 | biases_initializer=tf.zeros_initializer()) 165 | if not reuse: 166 | print("G output shape: {}".format(y.get_shape())) 167 | sys.stdout.flush() 168 | 169 | return y 170 | 171 | def nnet_info(self, batch_norm, keep_prob, weights_regularizer): 172 | if batch_norm is not None: 173 | print("use batch normalization", end=" *** ") 174 | if keep_prob != 1.0: 175 | print("keep prob is {}".format(keep_prob), 176 | end=" *** ") 177 | if weights_regularizer is not None: 178 | print("L2 regularizer scale is {}".format(self.rced.l2_scale), 179 | end=" *** ") 180 | 181 | print() 182 | -------------------------------------------------------------------------------- /pesq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/pesq -------------------------------------------------------------------------------- /pre_process_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019.7 Nan Lee 5 | 6 | import os 7 | import soundfile 8 | import numpy as np 9 | import argparse 10 | import csv 11 | import time 12 | #import matplotlib.pyplot as plt 13 | from scipy import signal 14 | import pickle 15 | import cPickle 16 | import h5py 17 | from sklearn import preprocessing 18 | import fnmatch 19 | # import prepare_data as pp_data 20 | import config as cfg 21 | 22 | def create_folder(fd): 23 | if not os.path.exists(fd): 24 | os.makedirs(fd) 25 | 26 | def read_audio(path, target_fs=None): 27 | (audio, fs) = soundfile.read(path) 28 | if audio.ndim > 1: 29 | audio = np.mean(audio, axis=1) 30 | if target_fs is not None and fs != target_fs: 31 | audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs) 32 | fs = target_fs 33 | return audio, fs 34 | 35 | def write_audio(path, audio, sample_rate): 36 | soundfile.write(file=path, data=audio, samplerate=sample_rate) 37 | 38 | def calculate_train_features(args): 39 | """Calculate spectrogram for mixed, speech and noise audio. Then write the 40 | features to disk. 41 | 42 | Args: 43 | workspace: str, path of workspace. 44 | data_type: str, 'train' | 'test'. 45 | speech_path:str, noisy_speech_dir clean_speech_dir 46 | """ 47 | data_type = args.data_type 48 | fs = cfg.sample_rate 49 | train_speech_path = args.train_speech_path 50 | cnt =0 51 | t1 = time.time() 52 | with open(train_speech_path,'r') as speech_org_path: 53 | for ii in speech_org_path: 54 | #read clean and noisy speech 55 | path_tmp = ii.split() 56 | noise_path = path_tmp[0] 57 | #out_feature_name = noise_path.split("/")[-1] 58 | cln_path = path_tmp[1] 59 | #out_feature_name = cln_path.split("/")[-1] 60 | out_feature_name = noise_path.split("/")[-1] 61 | (reverb_speech_audio, _) = read_audio(noise_path, target_fs=fs) 62 | (clean_speech_audio, _) = read_audio(cln_path, target_fs=fs) 63 | #extract logspectram feature 64 | mixed_complx_x = calc_sp(reverb_speech_audio, mode='magnitude') 65 | mixed_complx_x = np.log(mixed_complx_x + 1e-08).astype(np.float32) 66 | speech_x = calc_sp(clean_speech_audio, mode='magnitude') 67 | #print(mixed_complx_x) 68 | speech_x = np.log(speech_x + 1e-08).astype(np.float32) 69 | # the output feature path 70 | out_feat_path = os.path.join("workspace", "features", "spectrogram",data_type,"%s.p" % out_feature_name) 71 | create_folder(os.path.dirname(out_feat_path)) 72 | data = [mixed_complx_x, speech_x] 73 | cPickle.dump(data, open(out_feat_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) 74 | cnt += 1 75 | print cnt 76 | print("Extracting feature time: %s" % (time.time() - t1)) 77 | def rms(y): 78 | """Root mean square. 79 | """ 80 | return np.sqrt(np.mean(np.abs(y) ** 2, axis=0, keepdims=False)) 81 | 82 | def get_amplitude_scaling_factor(s, n, snr, method='rms'): 83 | """Given s and n, return the scaler s according to the snr. 84 | 85 | Args: 86 | s: ndarray, source1. 87 | n: ndarray, source2. 88 | snr: float, SNR. 89 | method: 'rms'. 90 | 91 | Outputs: 92 | float, scaler. 93 | """ 94 | original_sn_rms_ratio = rms(s) / rms(n) 95 | target_sn_rms_ratio = 10. ** (float(snr) / 20.) # snr = 20 * lg(rms(s) / rms(n)) 96 | signal_scaling_factor = target_sn_rms_ratio / original_sn_rms_ratio 97 | return signal_scaling_factor 98 | 99 | def calc_sp(audio, mode): 100 | """Calculate spectrogram. 101 | 102 | Args: 103 | audio: 1darray. 104 | mode: string, 'magnitude' | 'complex' 105 | 106 | Returns: 107 | spectrogram: 2darray, (n_time, n_freq). 108 | """ 109 | n_window = cfg.n_window 110 | n_overlap = cfg.n_overlap 111 | ham_win = np.hamming(n_window) 112 | [f, t, x] = signal.spectral.spectrogram( 113 | audio, 114 | window=ham_win, 115 | nperseg=n_window, 116 | noverlap=n_overlap, 117 | detrend=False, 118 | return_onesided=True, 119 | mode=mode) 120 | x = x.T 121 | if mode == 'magnitude': 122 | x = x.astype(np.float32) 123 | elif mode == 'complex': 124 | x = x.astype(np.complex64) 125 | else: 126 | raise Exception("Incorrect mode!") 127 | return x 128 | def log_sp(x): 129 | return np.log(x + 1e-08) 130 | if __name__ == '__main__': 131 | parser = argparse.ArgumentParser() 132 | subparsers = parser.add_subparsers(dest='mode') 133 | 134 | parser_calculate_train_features = subparsers.add_parser('calculate_train_features') 135 | parser_calculate_train_features.add_argument('--train_speech_path', type=str, required=True) 136 | parser_calculate_train_features.add_argument('--data_type', type=str, required=True) 137 | 138 | args = parser.parse_args() 139 | if args.mode == 'create_mixture_csv': 140 | create_mixture_csv(args) 141 | elif args.mode == 'calculate_train_features': 142 | calculate_train_features(args) 143 | elif args.mode == 'calculate_test_features': 144 | calculate_test_features(args) 145 | elif args.mode == 'pack_features': 146 | pack_features(args) 147 | elif args.mode == 'compute_scaler': 148 | compute_scaler(args) 149 | else: 150 | raise Exception("Error!") 151 | -------------------------------------------------------------------------------- /pre_process_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019.7 Nan Lee 5 | 6 | import os 7 | import soundfile 8 | import numpy as np 9 | import argparse 10 | import csv 11 | import time 12 | #import matplotlib.pyplot as plt 13 | from scipy import signal 14 | import pickle 15 | import cPickle 16 | import h5py 17 | from sklearn import preprocessing 18 | import fnmatch 19 | # import prepare_data as pp_data 20 | import config as cfg 21 | 22 | def create_folder(fd): 23 | if not os.path.exists(fd): 24 | os.makedirs(fd) 25 | 26 | def read_audio(path, target_fs=None): 27 | (audio, fs) = soundfile.read(path) 28 | if audio.ndim > 1: 29 | audio = np.mean(audio, axis=1) 30 | if target_fs is not None and fs != target_fs: 31 | audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs) 32 | fs = target_fs 33 | return audio, fs 34 | 35 | def write_audio(path, audio, sample_rate): 36 | soundfile.write(file=path, data=audio, samplerate=sample_rate) 37 | 38 | def calculate_train_features(args): 39 | """Calculate spectrogram for mixed, speech and noise audio. Then write the 40 | features to disk. 41 | 42 | Args: 43 | workspace: str, path of workspace. 44 | data_type: str, 'train' | 'test'. 45 | speech_path:str, noisy_speech_dir clean_speech_dir 46 | """ 47 | data_type = args.data_type 48 | fs = cfg.sample_rate 49 | train_speech_path = args.train_speech_path 50 | cnt =0 51 | t1 = time.time() 52 | with open(train_speech_path,'r') as speech_org_path: 53 | for ii in speech_org_path: 54 | #read clean and noisy speech 55 | path_tmp = ii.split() 56 | noise_path = path_tmp[0] 57 | #out_feature_name = noise_path.split("/")[-1] 58 | out_feature_name = noise_path.split("/")[-1] 59 | (reverb_speech_audio, _) = read_audio(noise_path, target_fs=fs) 60 | #extract logspectram feature 61 | mixed_complx_x = calc_sp(reverb_speech_audio, mode='complex') 62 | #mixed_complx_x = np.log(mixed_complx_x + 1e-08).astype(np.float32) 63 | # the output feature path 64 | out_feat_path = os.path.join("workspace", "features", "spectrogram",data_type,"%s.p" % out_feature_name) 65 | create_folder(os.path.dirname(out_feat_path)) 66 | data = [mixed_complx_x] 67 | cPickle.dump(data, open(out_feat_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) 68 | cnt += 1 69 | print cnt 70 | print("Extracting feature time: %s" % (time.time() - t1)) 71 | def rms(y): 72 | """Root mean square. 73 | """ 74 | return np.sqrt(np.mean(np.abs(y) ** 2, axis=0, keepdims=False)) 75 | 76 | def get_amplitude_scaling_factor(s, n, snr, method='rms'): 77 | """Given s and n, return the scaler s according to the snr. 78 | 79 | Args: 80 | s: ndarray, source1. 81 | n: ndarray, source2. 82 | snr: float, SNR. 83 | method: 'rms'. 84 | 85 | Outputs: 86 | float, scaler. 87 | """ 88 | original_sn_rms_ratio = rms(s) / rms(n) 89 | target_sn_rms_ratio = 10. ** (float(snr) / 20.) # snr = 20 * lg(rms(s) / rms(n)) 90 | signal_scaling_factor = target_sn_rms_ratio / original_sn_rms_ratio 91 | return signal_scaling_factor 92 | 93 | def calc_sp(audio, mode): 94 | """Calculate spectrogram. 95 | 96 | Args: 97 | audio: 1darray. 98 | mode: string, 'magnitude' | 'complex' 99 | 100 | Returns: 101 | spectrogram: 2darray, (n_time, n_freq). 102 | """ 103 | n_window = cfg.n_window 104 | n_overlap = cfg.n_overlap 105 | ham_win = np.hamming(n_window) 106 | [f, t, x] = signal.spectral.spectrogram( 107 | audio, 108 | window=ham_win, 109 | nperseg=n_window, 110 | noverlap=n_overlap, 111 | detrend=False, 112 | return_onesided=True, 113 | mode=mode) 114 | x = x.T 115 | if mode == 'magnitude': 116 | x = x.astype(np.float32) 117 | elif mode == 'complex': 118 | x = x.astype(np.complex64) 119 | else: 120 | raise Exception("Incorrect mode!") 121 | return x 122 | def log_sp(x): 123 | return np.log(x + 1e-08) 124 | if __name__ == '__main__': 125 | parser = argparse.ArgumentParser() 126 | subparsers = parser.add_subparsers(dest='mode') 127 | 128 | parser_calculate_train_features = subparsers.add_parser('calculate_train_features') 129 | parser_calculate_train_features.add_argument('--train_speech_path', type=str, required=True) 130 | parser_calculate_train_features.add_argument('--data_type', type=str, required=True) 131 | 132 | args = parser.parse_args() 133 | if args.mode == 'create_mixture_csv': 134 | create_mixture_csv(args) 135 | elif args.mode == 'calculate_train_features': 136 | calculate_train_features(args) 137 | elif args.mode == 'calculate_test_features': 138 | calculate_test_features(args) 139 | elif args.mode == 'pack_features': 140 | pack_features(args) 141 | elif args.mode == 'compute_scaler': 142 | compute_scaler(args) 143 | else: 144 | raise Exception("Error!") 145 | -------------------------------------------------------------------------------- /scripts/audio_utilities.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/scripts/audio_utilities.pyc -------------------------------------------------------------------------------- /scripts/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Summary: Config file. 3 | Author: Qiuqiang Kong 4 | Created: 2017.12.21 5 | Modified: - 6 | """ 7 | 8 | sample_rate = 16000 9 | n_window = 512 # windows size for FFT 10 | n_overlap = 256 # overlap of window 11 | -------------------------------------------------------------------------------- /scripts/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/scripts/config.pyc -------------------------------------------------------------------------------- /scripts/dataset_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Ke Wang 4 | 5 | set -euo pipefail 6 | 7 | stage=2 8 | 9 | train_dir=data/train/io_test 10 | #train_dir=data/train/train_100h 11 | 12 | if [ $stage -le 0 ]; then 13 | python io_funcs/convert_cmvn_to_numpy.py \ 14 | --inputs=$train_dir/inputs.cmvn \ 15 | --labels=$train_dir/labels.cmvn \ 16 | --save_dir=$train_dir 17 | fi 18 | 19 | if [ $stage -le 1 ]; then 20 | nj=1 21 | logdir=exp/ 22 | if [ ! -d $logdir ]; then 23 | mkdir -p $logdir 24 | fi 25 | 26 | rm -rf $logdir/.error || exit 1; 27 | bash scripts/split_scp.sh --nj $nj $train_dir 28 | for i in $(seq $nj); do 29 | ( 30 | python io_funcs/make_tfrecords.py \ 31 | --inputs=$train_dir/split${nj}/inputs${i}.scp \ 32 | --labels=$train_dir/split${nj}/labels${i}.scp \ 33 | --cmvn_dir=$train_dir \ 34 | --apply_cmvn=true \ 35 | --output_dir=$train_dir/tfrecords \ 36 | --name="train${i}" 37 | ) || touch $logdir/.error & 38 | done 39 | wait 40 | [ -f $logdir/.error ] && \ 41 | echo "$0: there was a problem while making TFRecords" && exit 1 42 | echo "Making TFRecords done." 43 | fi 44 | 45 | if [ $stage -le 2 ]; then 46 | CUDA_VISIBLE_DEVICES="3" python io_funcs/tfrecords_dataset_test.py \ 47 | --batch_size=128 \ 48 | --input_dim=257 \ 49 | --output_dim=40 \ 50 | --num_threads=32 \ 51 | --num_epochs=1 \ 52 | --data_dir=$train_dir/tfrecords 53 | fi 54 | 55 | -------------------------------------------------------------------------------- /scripts/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /scripts/datasets/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/scripts/datasets/__init__.pyc -------------------------------------------------------------------------------- /scripts/datasets/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr): 18 | librosa.output.write_wav(path, wav, sr=sr) 19 | 20 | def preemphasis(wav, k, preemphasize=True): 21 | if preemphasize: 22 | return signal.lfilter([1, -k], [1], wav) 23 | return wav 24 | 25 | def inv_preemphasis(wav, k, inv_preemphasize=True): 26 | if inv_preemphasize: 27 | return signal.lfilter([1], [1, -k], wav) 28 | return wav 29 | 30 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 31 | def start_and_end_indices(quantized, silence_threshold=2): 32 | for start in range(quantized.size): 33 | if abs(quantized[start] - 127) > silence_threshold: 34 | break 35 | for end in range(quantized.size - 1, 1, -1): 36 | if abs(quantized[end] - 127) > silence_threshold: 37 | break 38 | 39 | assert abs(quantized[start] - 127) > silence_threshold 40 | assert abs(quantized[end] - 127) > silence_threshold 41 | 42 | return start, end 43 | 44 | def trim_silence(wav, hparams): 45 | '''Trim leading and trailing silence 46 | 47 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silence at beginning and end. 48 | ''' 49 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset. 50 | return librosa.effects.trim(wav, top_db= hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] 51 | 52 | def get_hop_size(hparams): 53 | hop_size = hparams.hop_size 54 | if hop_size is None: 55 | assert hparams.frame_shift_ms is not None 56 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 57 | return hop_size 58 | 59 | def linearspectrogram(wav, hparams): 60 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 61 | S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db 62 | 63 | if hparams.signal_normalization: 64 | return _normalize(S, hparams) 65 | return S 66 | 67 | def melspectrogram(wav, hparams): 68 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 69 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db 70 | 71 | if hparams.signal_normalization: 72 | return _normalize(S, hparams) 73 | return S 74 | 75 | def inv_linear_spectrogram(linear_spectrogram, hparams): 76 | '''Converts linear spectrogram to waveform using librosa''' 77 | if hparams.signal_normalization: 78 | D = _denormalize(linear_spectrogram, hparams) 79 | else: 80 | D = linear_spectrogram 81 | 82 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 83 | 84 | if hparams.use_lws: 85 | processor = _lws_processor(hparams) 86 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 87 | y = processor.istft(D).astype(np.float32) 88 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 89 | else: 90 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 91 | 92 | 93 | def inv_mel_spectrogram(mel_spectrogram, hparams): 94 | '''Converts mel spectrogram to waveform using librosa''' 95 | if hparams.signal_normalization: 96 | D = _denormalize(mel_spectrogram, hparams) 97 | else: 98 | D = mel_spectrogram 99 | 100 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear 101 | 102 | if hparams.use_lws: 103 | processor = _lws_processor(hparams) 104 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 105 | y = processor.istft(D).astype(np.float32) 106 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 107 | else: 108 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 109 | 110 | def _lws_processor(hparams): 111 | import lws 112 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 113 | 114 | def _griffin_lim(S): 115 | '''librosa implementation of Griffin-Lim 116 | Based on https://github.com/librosa/librosa/issues/434 117 | ''' 118 | #angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 119 | S_complex = np.abs(S) 120 | #y = _istft(S_complex * angles) 121 | y = signal.spectral.istft(S,nperseg=512,noverlap=256) 122 | for i in range(100): 123 | theta = np.angle(signal.spectral.stft(y,nperseg=512,noverlap=256)) 124 | tmp = S_complex * np.exp(1j * theta) 125 | y = signal.spectral.istft(tmp) 126 | return y 127 | #angles = np.exp(2j * np.pi * np.random.rand(*S.shape) 128 | #S_complex = np.abs(S).astype(np.complex) 129 | #y = _istft(S_complex * angles) 130 | #for i in range(30): 131 | # angles = np.exp(1j * np.angle(_stft(y))) 132 | # y = _istft(S_complex * angles) 133 | # return y 134 | 135 | def _stft(y): 136 | 137 | #if hparams.use_lws: 138 | # return _lws_processor(hparams).stft(y).T 139 | #else: 140 | return librosa.stft(y=y, hop_length=256, win_length=512) 141 | 142 | def _istft(y): 143 | return librosa.istft(y, hop_length=256, win_length=512) 144 | 145 | ########################################################## 146 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 147 | def num_frames(length, fsize, fshift): 148 | """Compute number of time frames of spectrogram 149 | """ 150 | pad = (fsize - fshift) 151 | if length % fshift == 0: 152 | M = (length + pad * 2 - fsize) // fshift + 1 153 | else: 154 | M = (length + pad * 2 - fsize) // fshift + 2 155 | return M 156 | 157 | 158 | def pad_lr(x, fsize, fshift): 159 | """Compute left and right padding 160 | """ 161 | M = num_frames(len(x), fsize, fshift) 162 | pad = (fsize - fshift) 163 | T = len(x) + 2 * pad 164 | r = (M - 1) * fshift + fsize - T 165 | return pad, pad + r 166 | ########################################################## 167 | #Librosa correct padding 168 | def librosa_pad_lr(x, fsize, fshift): 169 | '''compute right padding (final frame) 170 | ''' 171 | return int(fsize // 2) 172 | 173 | 174 | # Conversions 175 | _mel_basis = None 176 | _inv_mel_basis = None 177 | 178 | def _linear_to_mel(spectogram, hparams): 179 | global _mel_basis 180 | if _mel_basis is None: 181 | _mel_basis = _build_mel_basis(hparams) 182 | return np.dot(_mel_basis, spectogram) 183 | 184 | def _mel_to_linear(mel_spectrogram, hparams): 185 | global _inv_mel_basis 186 | if _inv_mel_basis is None: 187 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 188 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 189 | 190 | def _build_mel_basis(hparams): 191 | assert hparams.fmax <= hparams.sample_rate // 2 192 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 193 | fmin=hparams.fmin, fmax=hparams.fmax) 194 | 195 | def _amp_to_db(x, hparams): 196 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 197 | return 20 * np.log10(np.maximum(min_level, x)) 198 | 199 | def _db_to_amp(x): 200 | return np.power(10.0, (x) * 0.05) 201 | 202 | def _normalize(S, hparams): 203 | if hparams.allow_clipping_in_normalization: 204 | if hparams.symmetric_mels: 205 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 206 | -hparams.max_abs_value, hparams.max_abs_value) 207 | else: 208 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 209 | 210 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 211 | if hparams.symmetric_mels: 212 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 213 | else: 214 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 215 | 216 | def _denormalize(D, hparams): 217 | if hparams.allow_clipping_in_normalization: 218 | if hparams.symmetric_mels: 219 | return (((np.clip(D, -hparams.max_abs_value, 220 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 221 | + hparams.min_level_db) 222 | else: 223 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 224 | 225 | if hparams.symmetric_mels: 226 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 227 | else: 228 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 229 | -------------------------------------------------------------------------------- /scripts/datasets/audio.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/scripts/datasets/audio.pyc -------------------------------------------------------------------------------- /scripts/datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | 5 | import numpy as np 6 | from datasets import audio 7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize 8 | 9 | 10 | def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - hparams: hyper parameters 16 | - input_dir: input directory that contains the files to prerocess 17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 19 | - wav_dir: output directory of the preprocessed speech audio dataset 20 | - n_jobs: Optional, number of worker process to parallelize across 21 | - tqdm: Optional, provides a nice progress bar 22 | 23 | Returns: 24 | - A list of tuple describing the train examples. this should be written to train.txt 25 | """ 26 | 27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 28 | # optimization purposes and it can be omited 29 | executor = ProcessPoolExecutor(max_workers=n_jobs) 30 | futures = [] 31 | index = 1 32 | for input_dir in input_dirs: 33 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 34 | for line in f: 35 | parts = line.strip().split('|') 36 | basename = parts[0] 37 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(basename)) 38 | text = parts[2] 39 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, basename, wav_path, text, hparams))) 40 | index += 1 41 | 42 | return [future.result() for future in tqdm(futures) if future.result() is not None] 43 | 44 | 45 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): 46 | """ 47 | Preprocesses a single utterance wav/text pair 48 | 49 | this writes the mel scale spectogram to disk and return a tuple to write 50 | to the train.txt file 51 | 52 | Args: 53 | - mel_dir: the directory to write the mel spectograms into 54 | - linear_dir: the directory to write the linear spectrograms into 55 | - wav_dir: the directory to write the preprocessed wav into 56 | - index: the numeric index to use in the spectogram filename 57 | - wav_path: path to the audio file containing the speech input 58 | - text: text spoken in the input audio file 59 | - hparams: hyper parameters 60 | 61 | Returns: 62 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 63 | """ 64 | try: 65 | # Load the audio as numpy array 66 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 67 | except FileNotFoundError: #catch missing wav exception 68 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 69 | wav_path)) 70 | return None 71 | 72 | #rescale wav 73 | if hparams.rescale: 74 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 75 | 76 | #M-AILABS extra silence specific 77 | if hparams.trim_silence: 78 | wav = audio.trim_silence(wav, hparams) 79 | 80 | #Mu-law quantize 81 | if is_mulaw_quantize(hparams.input_type): 82 | #[0, quantize_channels) 83 | out = mulaw_quantize(wav, hparams.quantize_channels) 84 | 85 | #Trim silences 86 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 87 | wav = wav[start: end] 88 | out = out[start: end] 89 | 90 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 91 | out_dtype = np.int16 92 | 93 | elif is_mulaw(hparams.input_type): 94 | #[-1, 1] 95 | out = mulaw(wav, hparams.quantize_channels) 96 | constant_values = mulaw(0., hparams.quantize_channels) 97 | out_dtype = np.float32 98 | 99 | else: 100 | #[-1, 1] 101 | out = wav 102 | constant_values = 0. 103 | out_dtype = np.float32 104 | 105 | # Compute the mel scale spectrogram from the wav 106 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 107 | mel_frames = mel_spectrogram.shape[1] 108 | 109 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 110 | return None 111 | 112 | #Compute the linear scale spectrogram from the wav 113 | linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) 114 | linear_frames = linear_spectrogram.shape[1] 115 | 116 | #sanity check 117 | assert linear_frames == mel_frames 118 | 119 | if hparams.use_lws: 120 | #Ensure time resolution adjustement between audio and mel-spectrogram 121 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 122 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 123 | 124 | #Zero pad audio signal 125 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 126 | else: 127 | #Ensure time resolution adjustement between audio and mel-spectrogram 128 | pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) 129 | 130 | #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) 131 | out = np.pad(out, pad, mode='reflect') 132 | 133 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 134 | 135 | #time resolution adjustement 136 | #ensure length of raw audio is multiple of hop size so that we can use 137 | #transposed convolution to upsample 138 | out = out[:mel_frames * audio.get_hop_size(hparams)] 139 | assert len(out) % audio.get_hop_size(hparams) == 0 140 | time_steps = len(out) 141 | 142 | # Write the spectrogram and audio to disk 143 | audio_filename = 'audio-{}.npy'.format(index) 144 | mel_filename = 'mel-{}.npy'.format(index) 145 | linear_filename = 'linear-{}.npy'.format(index) 146 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 147 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 148 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 149 | 150 | # Return a tuple describing this training example 151 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) 152 | -------------------------------------------------------------------------------- /scripts/datasets/wavenet_preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | 5 | import numpy as np 6 | from datasets import audio 7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize 8 | 9 | 10 | def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - hparams: hyper parameters 16 | - input_dir: input directory that contains the files to prerocess 17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 19 | - wav_dir: output directory of the preprocessed speech audio dataset 20 | - n_jobs: Optional, number of worker process to parallelize across 21 | - tqdm: Optional, provides a nice progress bar 22 | 23 | Returns: 24 | - A list of tuple describing the train examples. this should be written to train.txt 25 | """ 26 | 27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 28 | # optimization purposes and it can be omited 29 | executor = ProcessPoolExecutor(max_workers=n_jobs) 30 | futures = [] 31 | for file in os.listdir(input_dir): 32 | wav_path = os.path.join(input_dir, file) 33 | basename = os.path.basename(wav_path).replace('.wav', '') 34 | futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams))) 35 | 36 | return [future.result() for future in tqdm(futures) if future.result() is not None] 37 | 38 | 39 | def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams): 40 | """ 41 | Preprocesses a single utterance wav/text pair 42 | 43 | this writes the mel scale spectogram to disk and return a tuple to write 44 | to the train.txt file 45 | 46 | Args: 47 | - mel_dir: the directory to write the mel spectograms into 48 | - linear_dir: the directory to write the linear spectrograms into 49 | - wav_dir: the directory to write the preprocessed wav into 50 | - index: the numeric index to use in the spectrogram filename 51 | - wav_path: path to the audio file containing the speech input 52 | - text: text spoken in the input audio file 53 | - hparams: hyper parameters 54 | 55 | Returns: 56 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 57 | """ 58 | try: 59 | # Load the audio as numpy array 60 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 61 | except FileNotFoundError: #catch missing wav exception 62 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 63 | wav_path)) 64 | return None 65 | 66 | #rescale wav 67 | if hparams.rescale: 68 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 69 | 70 | #M-AILABS extra silence specific 71 | if hparams.trim_silence: 72 | wav = audio.trim_silence(wav, hparams) 73 | 74 | #Mu-law quantize 75 | if is_mulaw_quantize(hparams.input_type): 76 | #[0, quantize_channels) 77 | out = mulaw_quantize(wav, hparams.quantize_channels) 78 | 79 | #Trim silences 80 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 81 | wav = wav[start: end] 82 | out = out[start: end] 83 | 84 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 85 | out_dtype = np.int16 86 | 87 | elif is_mulaw(hparams.input_type): 88 | #[-1, 1] 89 | out = mulaw(wav, hparams.quantize_channels) 90 | constant_values = mulaw(0., hparams.quantize_channels) 91 | out_dtype = np.float32 92 | 93 | else: 94 | #[-1, 1] 95 | out = wav 96 | constant_values = 0. 97 | out_dtype = np.float32 98 | 99 | # Compute the mel scale spectrogram from the wav 100 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 101 | mel_frames = mel_spectrogram.shape[1] 102 | 103 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 104 | return None 105 | 106 | if hparams.use_lws: 107 | #Ensure time resolution adjustement between audio and mel-spectrogram 108 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 109 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 110 | 111 | #Zero pad audio signal 112 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 113 | else: 114 | #Ensure time resolution adjustement between audio and mel-spectrogram 115 | pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) 116 | 117 | #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) 118 | out = np.pad(out, pad, mode='reflect') 119 | 120 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 121 | 122 | #time resolution adjustement 123 | #ensure length of raw audio is multiple of hop size so that we can use 124 | #transposed convolution to upsample 125 | out = out[:mel_frames * audio.get_hop_size(hparams)] 126 | assert len(out) % audio.get_hop_size(hparams) == 0 127 | time_steps = len(out) 128 | 129 | # Write the spectrogram and audio to disk 130 | audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index)) 131 | mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index)) 132 | np.save(audio_filename, out.astype(out_dtype), allow_pickle=False) 133 | np.save(mel_filename, mel_spectrogram.T, allow_pickle=False) 134 | 135 | #global condition features 136 | if hparams.gin_channels > 0: 137 | raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training') 138 | speaker_id = '' #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable) 139 | else: 140 | speaker_id = '' 141 | 142 | # Return a tuple describing this training example 143 | return (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames) 144 | -------------------------------------------------------------------------------- /scripts/get_train_val_scp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang 5 | 6 | """Get train and validation set.""" 7 | 8 | from __future__ import absolute_import 9 | from __future__ import print_function 10 | 11 | import argparse 12 | import os 13 | import pprint 14 | import random 15 | import sys 16 | 17 | 18 | def main(): 19 | inputs_scp = os.path.join(FLAGS.data_dir, "all_wav.txt") 20 | tr_dir = os.path.join(FLAGS.data_dir, "tr") 21 | cv_dir = os.path.join(FLAGS.data_dir, "cv") 22 | tr_inputs_scp = os.path.join(tr_dir, "inputs.txt") 23 | cv_inputs_scp = os.path.join(cv_dir, "inputs.txt") 24 | 25 | print("Split to %s and %s" % (tr_dir, cv_dir)) 26 | 27 | if not os.path.exists(tr_dir): 28 | os.makedirs(tr_dir) 29 | if not os.path.exists(cv_dir): 30 | os.makedirs(cv_dir) 31 | 32 | with open(inputs_scp, 'r') as fr_inputs, \ 33 | open(tr_inputs_scp, 'w') as fw_tr_inputs, \ 34 | open(cv_inputs_scp, 'w') as fw_cv_inputs: 35 | lists_inputs = fr_inputs.readlines() 36 | if len(lists_inputs) <= FLAGS.val_size: 37 | print(("Validation size %s is bigger than inputs scp length %s." 38 | " Please reduce validation size.") % ( 39 | FLAGS.val_size, len(lists_inputs))) 40 | 41 | lists = range(len(lists_inputs)) 42 | random.shuffle(lists) 43 | # print(lists) 44 | for i in lists: 45 | line_input = lists_inputs[i] 46 | print(line_input) 47 | if i < FLAGS.val_size: 48 | fw_cv_inputs.write(line_input) 49 | else: 50 | fw_tr_inputs.write(line_input) 51 | 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument( 56 | '--data_dir', 57 | type=str, 58 | required=True, 59 | help="Directory name of data to spliting." 60 | "(Note: inputs.scp and labels.scp)" 61 | ) 62 | parser.add_argument( 63 | '--val_size', 64 | type=int, 65 | default=361, 66 | help="Validation set size." 67 | ) 68 | 69 | FLAGS, unparsed = parser.parse_known_args() 70 | # pp = pprint.PrettyPrinter() 71 | # pp.pprint(FLAGS.__dict__) 72 | main() 73 | -------------------------------------------------------------------------------- /scripts/io_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Ke Wang 4 | 5 | set -euo pipefail 6 | 7 | stage=2 8 | 9 | # train_dir=data/train/io_test 10 | train_dir=data/train/train_100h 11 | 12 | if [ $stage -le 0 ]; then 13 | python io_funcs/convert_cmvn_to_numpy.py \ 14 | --inputs=$train_dir/inputs.cmvn \ 15 | --labels=$train_dir/labels.cmvn \ 16 | --save_dir=$train_dir 17 | fi 18 | 19 | if [ $stage -le 1 ]; then 20 | nj=1 21 | logdir=exp/ 22 | if [ ! -d $logdir ]; then 23 | mkdir -p $logdir 24 | fi 25 | 26 | rm -rf $logdir/.error || exit 1; 27 | bash scripts/split_scp.sh --nj $nj $train_dir 28 | for i in $(seq $nj); do 29 | ( 30 | python io_funcs/make_tfrecords.py \ 31 | --inputs=$train_dir/split${nj}/inputs${i}.scp \ 32 | --labels=$train_dir/split${nj}/labels${i}.scp \ 33 | --cmvn_dir=$train_dir \ 34 | --apply_cmvn=True \ 35 | --output_dir=$train_dir/tfrecords \ 36 | --name="train${i}" 37 | ) || touch $logdir/.error & 38 | done 39 | wait 40 | [ -f $logdir/.error ] && \ 41 | echo "$0: there was a problem while making TFRecords" && exit 1 42 | echo "Making TFRecords done." 43 | fi 44 | 45 | if [ $stage -le 2 ]; then 46 | CUDA_VISIBLE_DEVICES="3" python io_funcs/tfrecords_io_test.py \ 47 | --batch_size=128 \ 48 | --input_dim=257 \ 49 | --output_dim=40 \ 50 | --num_threads=32 \ 51 | --num_epochs=1 \ 52 | --data_dir=$train_dir/tfrecords 53 | fi 54 | 55 | -------------------------------------------------------------------------------- /scripts/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); 4 | # Arnab Ghoshal, Karel Vesely 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | # Parse command-line options. 21 | # To be sourced by another script (as in ". parse_options.sh"). 22 | # Option format is: --option-name arg 23 | # and shell variable "option_name" gets set to value "arg." 24 | # The exception is --help, which takes no arguments, but prints the 25 | # $help_message variable (if defined). 26 | 27 | 28 | ### 29 | ### The --config file options have lower priority to command line 30 | ### options, so we need to import them first... 31 | ### 32 | 33 | # Now import all the configs specified by command-line, in left-to-right order 34 | for ((argpos=1; argpos<$#; argpos++)); do 35 | if [ "${!argpos}" == "--config" ]; then 36 | argpos_plus1=$((argpos+1)) 37 | config=${!argpos_plus1} 38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 39 | . $config # source the config file. 40 | fi 41 | done 42 | 43 | 44 | ### 45 | ### No we process the command line options 46 | ### 47 | while true; do 48 | [ -z "${1:-}" ] && break; # break if there are no arguments 49 | case "$1" in 50 | # If the enclosing script is called with --help option, print the help 51 | # message and exit. Scripts should put help messages in $help_message 52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; 53 | else printf "$help_message\n" 1>&2 ; fi; 54 | exit 0 ;; 55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 56 | exit 1 ;; 57 | # If the first command-line argument begins with "--" (e.g. --foo-bar), 58 | # then work out the variable name as $name, which will equal "foo_bar". 59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 60 | # Next we test whether the variable in question is undefned-- if so it's 61 | # an invalid option and we die. Note: $0 evaluates to the name of the 62 | # enclosing script. 63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar 64 | # is undefined. We then have to wrap this test inside "eval" because 65 | # foo_bar is itself inside a variable ($name). 66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; 67 | 68 | oldval="`eval echo \\$$name`"; 69 | # Work out whether we seem to be expecting a Boolean argument. 70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | eval $name=\"$2\"; 79 | 80 | # Check that Boolean-valued arguments are really Boolean. 81 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 82 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 83 | exit 1; 84 | fi 85 | shift 2; 86 | ;; 87 | *) break; 88 | esac 89 | done 90 | 91 | 92 | # Check for an empty argument to the --cmd option, which can easily occur as a 93 | # result of scripting errors. 94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 95 | 96 | 97 | true; # so this script returns exit code 0. 98 | -------------------------------------------------------------------------------- /scripts/prepare_data.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/scripts/prepare_data.pyc -------------------------------------------------------------------------------- /scripts/spectrogram_to_wave.py: -------------------------------------------------------------------------------- 1 | """ 2 | Summary: Recover spectrogram to wave. 3 | Author: Qiuqiang Kong 4 | Created: 2017.09 5 | Modified: - 6 | """ 7 | import numpy as np 8 | import numpy 9 | import decimal 10 | 11 | def recover_wav(pd_abs_x, gt_x, n_overlap, winfunc, wav_len=None): 12 | """Recover wave from spectrogram. 13 | If you are using scipy.signal.spectrogram, you may need to multipy a scaler 14 | to the recovered audio after using this function. For example, 15 | recover_scaler = np.sqrt((ham_win**2).sum()) 16 | 17 | Args: 18 | pd_abs_x: 2d array, (n_time, n_freq) 19 | gt_x: 2d complex array, (n_time, n_freq) 20 | n_overlap: integar. 21 | winfunc: func, the analysis window to apply to each frame. 22 | wav_len: integer. Pad or trunc to wav_len with zero. 23 | 24 | Returns: 25 | 1d array. 26 | """ 27 | x = real_to_complex(pd_abs_x, gt_x) 28 | x = half_to_whole(x) 29 | frames = ifft_to_wav(x) 30 | (n_frames, n_window) = frames.shape 31 | s = deframesig(frames=frames, siglen=0, frame_len=n_window, 32 | frame_step=n_window-n_overlap, winfunc=winfunc) 33 | if wav_len: 34 | s = pad_or_trunc(s, wav_len) 35 | return s 36 | 37 | def real_to_complex(pd_abs_x, gt_x): 38 | """Recover pred spectrogram's phase from ground truth's phase. 39 | 40 | Args: 41 | pd_abs_x: 2d array, (n_time, n_freq) 42 | gt_x: 2d complex array, (n_time, n_freq) 43 | 44 | Returns: 45 | 2d complex array, (n_time, n_freq) 46 | """ 47 | theta = np.angle(gt_x) 48 | cmplx = pd_abs_x * np.exp(1j * theta) 49 | return cmplx 50 | 51 | def half_to_whole(x): 52 | """Recover whole spectrogram from half spectrogram. 53 | """ 54 | return np.concatenate((x, np.fliplr(np.conj(x[:, 1:-1]))), axis=1) 55 | 56 | def ifft_to_wav(x): 57 | """Recover wav from whole spectrogram""" 58 | return np.real(np.fft.ifft(x)) 59 | 60 | def pad_or_trunc(s, wav_len): 61 | if len(s) >= wav_len: 62 | s = s[0 : wav_len] 63 | else: 64 | s = np.concatenate((s, np.zeros(wav_len - len(s)))) 65 | return s 66 | 67 | def recover_gt_wav(x, n_overlap, winfunc, wav_len=None): 68 | """Recover ground truth wav. 69 | """ 70 | x = half_to_whole(x) 71 | frames = ifft_to_wav(x) 72 | (n_frames, n_window) = frames.shape 73 | s = deframesig(frames=frames, siglen=0, frame_len=n_window, 74 | frame_step=n_window-n_overlap, winfunc=winfunc) 75 | if wav_len: 76 | s = pad_or_trunc(s, wav_len) 77 | return s 78 | 79 | def deframesig(frames,siglen,frame_len,frame_step,winfunc=lambda x:numpy.ones((x,))): 80 | """Does overlap-add procedure to undo the action of framesig. 81 | Ref: From https://github.com/jameslyons/python_speech_features 82 | 83 | :param frames: the array of frames. 84 | :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. 85 | :param frame_len: length of each frame measured in samples. 86 | :param frame_step: number of samples after the start of the previous frame that the next frame should begin. 87 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. 88 | :returns: a 1-D signal. 89 | """ 90 | frame_len = round_half_up(frame_len) 91 | frame_step = round_half_up(frame_step) 92 | numframes = numpy.shape(frames)[0] 93 | assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' 94 | 95 | indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T 96 | indices = numpy.array(indices,dtype=numpy.int32) 97 | padlen = (numframes-1)*frame_step + frame_len 98 | 99 | if siglen <= 0: siglen = padlen 100 | 101 | rec_signal = numpy.zeros((padlen,)) 102 | window_correction = numpy.zeros((padlen,)) 103 | win = winfunc(frame_len) 104 | 105 | for i in range(0,numframes): 106 | window_correction[indices[i,:]] = window_correction[indices[i,:]] + win + 1e-15 #add a little bit so it is never zero 107 | rec_signal[indices[i,:]] = rec_signal[indices[i,:]] + frames[i,:] 108 | 109 | rec_signal = rec_signal/window_correction 110 | return rec_signal[0:siglen] 111 | 112 | def round_half_up(number): 113 | return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) -------------------------------------------------------------------------------- /scripts/spectrogram_to_wave.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/scripts/spectrogram_to_wave.pyc -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019.7 Nan LEE 4 | 5 | set -euo pipefail 6 | 7 | stage=0 8 | 9 | nj=1 10 | val_size=500 11 | train_dir=data 12 | test_dir=data/test 13 | logdir=exp 14 | tr_list=$train_dir/tr.list 15 | cv_list=$train_dir/cv.list 16 | test_list=$test_dir/test.list 17 | save_dir=exp/dnn 18 | 19 | # Data prepare 20 | if [ $stage -le 0 ]; then 21 | # Make TFRecords file 22 | echo "Begin making TFRecords files ..." 23 | if [ ! -d $logdir ]; then 24 | mkdir -p $logdir || exit 1; 25 | fi 26 | 27 | # cv set 28 | declare -i verbose=30 29 | [ -d $train_dir/tfrecords ] && (rm -rf $train_dir/tfrecords || exit 1;) 30 | mkdir -p $train_dir/tfrecords || exit 1; 31 | 32 | TF_CPP_MIN_LOG_LEVEL=1 python io_funcs/make_setf.py --inputs=$train_dir/cv/inputs_feat.txt --name="cv" 33 | echo "$train_dir/tfrecords/cv.tfrecords" > $cv_list 34 | wait 35 | date 36 | 37 | TF_CPP_MIN_LOG_LEVEL=1 python io_funcs/make_setf.py --inputs=$train_dir/tr/inputs_feat.txt --name="tr" 38 | echo "$train_dir/tfrecords/tr.tfrecords" > $tr_list 39 | wait 40 | date 41 | 42 | [ -f $train_dir/batch_num.txt ] && rm $train_dir/batch_num.txt 43 | echo "Make train TFRecords files sucessed." 44 | echo "" 45 | fi 46 | #exit 0; 47 | # Train model 48 | if [ $stage -le 2 ]; then 49 | echo "$(date): $(hostname)" 50 | CUDA_VISIBLE_DEVICES="1,2,3" TF_CPP_MIN_LOG_LEVEL=2 \ 51 | python scripts/train_dnn.py \ 52 | --data_dir=$train_dir \ 53 | --tr_list_file=$tr_list \ 54 | --cv_list_file=$cv_list \ 55 | --g_type="res_rced" \ 56 | --save_dir=$save_dir \ 57 | --batch_size=64 \ 58 | --g_learning_rate=0.001 \ 59 | --keep_lr=2 \ 60 | --batch_norm=true \ 61 | --keep_prob=1 \ 62 | --l2_scale=0 \ 63 | --input_dim=257 \ 64 | --output_dim=257 \ 65 | --left_context=5 \ 66 | --right_context=5 \ 67 | --min_epoches=30 \ 68 | --max_epoches=35 \ 69 | --decay_factor=0.8 \ 70 | --start_halving_impr=0.01 \ 71 | --end_halving_impr=0.001 \ 72 | --num_threads=1 \ 73 | --num_gpu=1 || exit 1; 74 | 75 | echo "Finished training successfully on $(date)" 76 | echo "" 77 | fi 78 | # exit 0; 79 | 80 | # Decode 81 | 82 | if [ $stage -le 4 ]; then 83 | echo "Prepare test data" 84 | if [ -f $logdir/.test.error ]; then 85 | rm -rf $logdir/.test.error || exit 1; 86 | fi 87 | declare -i verbose=30 88 | # [ -d $test_dir/tfrecords ] && (rm -rf $test_dir/tfrecords || exit 1;) 89 | # mkdir -p $test_dir/tfrecords || exit 1; 90 | for datase in data/test/*;do 91 | # for datase in data/simusi;do 92 | rm -rf $datase/tfrecords 93 | TF_CPP_MIN_LOG_LEVEL=1 python io_funcs/make_sete.py \ 94 | --inputs=$datase/inputs.txt \ 95 | --output_dir=$datase/tfrecords \ 96 | --name="test" || touch $logdir/.test.error & 97 | echo "$datase/tfrecords/test.tfrecords" > $datase/test.list 98 | # exit 0; 99 | wait 100 | done 101 | fi 102 | # Decode 103 | if [ $stage -le 5 ]; then 104 | 105 | echo "Start decoding test data" 106 | for datase in data/test/*;do 107 | # for datase in data/simusi;do 108 | CUDA_VISIBLE_DEVICES="1" TF_CPP_MIN_LOG_LEVEL=2 python scripts/train_dnn.py \ 109 | --decode \ 110 | --data_dir=$train_dir \ 111 | --test_list_file=$datase/test.list \ 112 | --g_type="res_rced" \ 113 | --save_dir=$save_dir \ 114 | --g_learning_rate=0.001 \ 115 | --batch_norm=true \ 116 | --input_dim=257 \ 117 | --output_dim=257 \ 118 | --left_context=5 \ 119 | --right_context=5 \ 120 | --batch_size=1 \ 121 | --keep_prob=1 \ 122 | --l2_scale=0 \ 123 | --num_threads=1 \ 124 | --savetestdir=$datase || exit 1; 125 | echo "Decoding done" 126 | wait 127 | done 128 | fi 129 | 130 | exit 0 131 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang 5 | 6 | 7 | -------------------------------------------------------------------------------- /utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/utils/__init__.pyc -------------------------------------------------------------------------------- /utils/add_additive_noise.py: -------------------------------------------------------------------------------- 1 | """ 2 | LI, Nan 3 | 2019.08 4 | """ 5 | import os 6 | import soundfile 7 | import numpy as np 8 | import argparse 9 | import csv 10 | import time 11 | #import matplotlib.pyplot as plt 12 | from scipy import signal 13 | #import pickle 14 | #import cPickle 15 | import h5py 16 | from sklearn import preprocessing 17 | import librosa 18 | import prepare_data as pp_data 19 | import config as cfg 20 | import math 21 | from utils.tools import * 22 | import random 23 | def create_folder(fd): 24 | if not os.path.exists(fd): 25 | os.makedirs(fd) 26 | 27 | def read_audio(path, target_fs=None): 28 | (audio, fs) = soundfile.read(path) 29 | if audio.ndim > 1: 30 | audio = np.mean(audio, axis=1) 31 | if target_fs is not None and fs != target_fs: 32 | audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs) 33 | fs = target_fs 34 | print(fs) 35 | return audio, fs 36 | 37 | def write_audio(path, audio, sample_rate): 38 | soundfile.write(file=path, data=audio, samplerate=sample_rate) 39 | 40 | ### 41 | def create_mixture_csv(args): 42 | """Create csv containing mixture information. 43 | Each line in the .csv file contains [speech_name, noise_name, noise_onset, noise_offset] 44 | 45 | Args: 46 | workspace: str, path of workspace. 47 | speech_dir: str, path of speech data. 48 | noise_dir: str, path of noise data. 49 | data_type: str, 'train' | 'test'. 50 | magnification: int, only used when data_type='train', number of noise 51 | selected to mix with a speech. E.g., when magnication=3, then 4620 52 | speech with create 4620*3 mixtures. magnification should not larger 53 | than the species of noises. 54 | """ 55 | workspace = args.workspace 56 | speech_dir = args.speech_dir 57 | noise_dir = args.noise_dir 58 | data_type = args.data_type 59 | magnification = args.magnification 60 | fs = cfg.sample_rate 61 | 62 | speech_names = [na for na in os.listdir(speech_dir) if na.lower().endswith(".wav")] 63 | noise_names = [na for na in os.listdir(noise_dir) if na.lower().endswith(".wav")] 64 | 65 | rs = np.random.RandomState(0) 66 | out_csv_path = os.path.join(workspace, "mixture_csvs", "%s.csv" % data_type) 67 | pp_data.create_folder(os.path.dirname(out_csv_path)) 68 | 69 | cnt = 0 70 | f = open(out_csv_path, 'w') 71 | f.write("%s\t%s\t%s\t%s\n" % ("speech_name", "noise_name", "noise_onset", "noise_offset")) 72 | for speech_na in speech_names: 73 | # Read speech. 74 | speech_path = os.path.join(speech_dir, speech_na) 75 | (speech_audio, _) = read_audio(speech_path) 76 | len_speech = len(speech_audio) 77 | 78 | # For training data, mix each speech with randomly picked #magnification noises. 79 | if data_type == 'train': 80 | selected_noise_names = rs.choice(noise_names, size=magnification, replace=False) 81 | # For test data, mix each speech with all noises. 82 | elif data_type == 'test': 83 | selected_noise_names = noise_names 84 | else: 85 | raise Exception("data_type must be train | test!") 86 | 87 | # Mix one speech with different noises many times. 88 | for noise_na in selected_noise_names: 89 | noise_path = os.path.join(noise_dir, noise_na) 90 | (noise_audio, _) = read_audio(noise_path) 91 | 92 | len_noise = len(noise_audio) 93 | 94 | if len_noise <= len_speech: 95 | noise_onset = 0 96 | nosie_offset = len_speech 97 | # If noise longer than speech then randomly select a segment of noise. 98 | else: 99 | noise_onset = rs.randint(0, len_noise - len_speech, size=1)[0] 100 | nosie_offset = noise_onset + len_speech 101 | 102 | if cnt % 100 == 0: 103 | print cnt 104 | 105 | cnt += 1 106 | f.write("%s\t%s\t%d\t%d\n" % (speech_na, noise_na, noise_onset, nosie_offset)) 107 | f.close() 108 | print(out_csv_path) 109 | print("Create %s mixture csv finished!" % data_type) 110 | 111 | ### 112 | def calculate_mixture_features(args): 113 | mixture_csv_path = os.path.join("mini_data_bak","train_speech/cleandata.txt") 114 | out_dir = "/Work18/2017/linan/ASR/data/aur/train" 115 | with open(mixture_csv_path, 'rb') as f: 116 | lis = list(f) 117 | for x in lis: 118 | x.replace("\n", "") 119 | print(x) 120 | print("finish read") 121 | noise_dir = "mini_data/Noise" 122 | all_noise_na = ["Babble2.wav", "F162.wav", "Factory2.wav", "Pink2.wav", "Volvo2.wav", "White2.wav"] 123 | all_snr = [-10, -5, 0, 5, 10, 15, 20] 124 | t1 = time.time() 125 | cnt = 0 126 | fs = 8000 127 | for i1 in xrange(0, len(lis)): 128 | speech_path = lis[i1].replace("\n", "") 129 | 130 | # Read speech audio. 131 | (speech_audio, _) = read_audio(speech_path, target_fs=fs) 132 | name = speech_path.split("/")[-1] 133 | # Read noise audio. 134 | rrr = random.randint(0,5) 135 | noise_na = all_noise_na[rrr] 136 | noise_path = os.path.join(noise_dir, noise_na) 137 | (noise_audio, _) = read_audio(noise_path, target_fs=fs) 138 | 139 | noise_len = np.shape(noise_audio)[0] 140 | speech_len = np.shape(speech_audio)[0] 141 | 142 | rdm = random.randint(0,noise_len-speech_len) 143 | noise_audio = noise_audio[rdm:(rdm+speech_len)] 144 | rrr2 = random.randint(0,6) 145 | snr = all_snr[rrr2] 146 | print("all_snr:",all_snr[rrr2]) 147 | # Scale speech to given snr. 148 | scaler = get_amplitude_scaling_factor(speech_audio, noise_audio, snr=snr) 149 | #speech_audio /= scaler 150 | 151 | noise_audio*=scaler 152 | # Get normalized mixture, speech, noise. 153 | print("speech audio shape:",np.shape(speech_audio)) 154 | print("noise audio shape:",np.shape(noise_audio)) 155 | (mixed_audio, speech_audio, noise_audio, alpha) = additive_mixing(speech_audio, noise_audio) 156 | print(np.shape(speech_audio)) 157 | print(np.shape(mixed_audio)) 158 | tmp1 = np.sum(speech_audio**2) 159 | tmp2 = np.sum((mixed_audio-speech_audio)**2) 160 | 161 | noise2 = "noise" + str(snr) 162 | if snr < 0: 163 | snr2 = -snr 164 | noise2 = "noise_" + str(snr2) 165 | out_noise_path = os.path.join(out_dir,"noise",name) 166 | #audiowrite('test_speech.wav', speech_audio, samp_rate=16000) 167 | audiowrite(out_noise_path, noise_audio, samp_rate=fs) 168 | snr_bi = tmp1/tmp2 169 | labels = 10*np.log10(snr_bi) 170 | print("cacu:snr",labels) 171 | snr2 = "snr" + str(snr) 172 | if snr < 0: 173 | snr2 = -snr 174 | snr2 = "snr_" + str(snr2) 175 | out_put_path = os.path.join(out_dir,snr2,name) 176 | print(out_put_path) 177 | audiowrite(out_put_path, mixed_audio, samp_rate=fs) 178 | def rms(y): 179 | """Root mean square. 180 | """ 181 | return np.sum(y**2) 182 | #return np.sqrt(sum(np.abs(y) ** 2, axis=0, keepdims=False)) 183 | 184 | def get_amplitude_scaling_factor(s, n, snr, method='rms'): 185 | """Given s and n, return the scaler s according to the snr. 186 | 187 | Args: 188 | s: ndarray, source1. 189 | n: ndarray, source2. 190 | snr: float, SNR. 191 | method: 'rms'. 192 | 193 | Outputs: 194 | float, scaler. 195 | """ 196 | original_sn_rms_ratio = rms(s) / rms(n) 197 | target_sn_rms_ratio = 10. ** (float(snr) / 10.) # snr = 10 * lg(rms(s) / rms(n)) 198 | signal_scaling_factor = np.sqrt(original_sn_rms_ratio/target_sn_rms_ratio) 199 | return signal_scaling_factor 200 | 201 | def additive_mixing(s, n): 202 | """Mix normalized source1 and source2. 203 | 204 | Args: 205 | s: ndarray, source1. 206 | n: ndarray, source2. 207 | 208 | Returns: 209 | mix_audio: ndarray, mixed audio. 210 | s: ndarray, pad or truncated and scalered source1. 211 | n: ndarray, scaled source2. 212 | alpha: float, normalize coefficient. 213 | """ 214 | mixed_audio = s + n 215 | 216 | alpha = 1. / np.max(np.abs(mixed_audio)) 217 | mixed_audio *= alpha 218 | s *= alpha 219 | n *= alpha 220 | return mixed_audio, s, n, alpha 221 | 222 | def calc_sp(audio, mode): 223 | """Calculate spectrogram. 224 | 225 | Args: 226 | audio: 1darray. 227 | mode: string, 'magnitude' | 'complex' 228 | 229 | Returns: 230 | spectrogram: 2darray, (n_time, n_freq). 231 | """ 232 | n_window = cfg.n_window 233 | n_overlap = cfg.n_overlap 234 | ham_win = np.hamming(n_window) 235 | [f, t, x] = signal.spectral.spectrogram( 236 | audio, 237 | window=ham_win, 238 | nperseg=n_window, 239 | noverlap=n_overlap, 240 | detrend=False, 241 | return_onesided=True, 242 | mode=mode) 243 | x = x.T 244 | if mode == 'magnitude': 245 | x = x.astype(np.float32) 246 | elif mode == 'complex': 247 | x = x.astype(np.complex64) 248 | else: 249 | raise Exception("Incorrect mode!") 250 | return x 251 | 252 | ### 253 | 254 | def log_sp(x): 255 | return np.log(x + 1e-08) 256 | 257 | ### 258 | def load_hdf5(hdf5_path): 259 | """Load hdf5 data. 260 | """ 261 | with h5py.File(hdf5_path, 'r') as hf: 262 | x = hf.get('x') 263 | y = hf.get('y') 264 | x = np.array(x) # (n_segs, n_concat, n_freq) 265 | y = np.array(y) # (n_segs, n_freq) 266 | return x, y 267 | 268 | def np_mean_absolute_error(y_true, y_pred): 269 | return np.mean(np.abs(y_pred - y_true)) 270 | 271 | ### 272 | if __name__ == '__main__': 273 | parser = argparse.ArgumentParser() 274 | subparsers = parser.add_subparsers(dest='mode') 275 | 276 | parser_create_mixture_csv = subparsers.add_parser('create_mixture_csv') 277 | parser_create_mixture_csv.add_argument('--workspace', type=str, required=True) 278 | parser_create_mixture_csv.add_argument('--speech_dir', type=str, required=True) 279 | parser_create_mixture_csv.add_argument('--noise_dir', type=str, required=True) 280 | parser_create_mixture_csv.add_argument('--data_type', type=str, required=True) 281 | parser_create_mixture_csv.add_argument('--magnification', type=int, default=1) 282 | 283 | parser_calculate_mixture_features = subparsers.add_parser('calculate_mixture_features') 284 | parser_calculate_mixture_features.add_argument('--workspace', type=str, required=True) 285 | parser_calculate_mixture_features.add_argument('--speech_dir', type=str, required=True) 286 | parser_calculate_mixture_features.add_argument('--noise_dir', type=str, required=True) 287 | parser_calculate_mixture_features.add_argument('--data_type', type=str, required=True) 288 | parser_calculate_mixture_features.add_argument('--snr', type=float, required=True) 289 | 290 | parser_pack_features = subparsers.add_parser('pack_features') 291 | parser_pack_features.add_argument('--workspace', type=str, required=True) 292 | parser_pack_features.add_argument('--data_type', type=str, required=True) 293 | parser_pack_features.add_argument('--snr', type=float, required=True) 294 | parser_pack_features.add_argument('--n_concat', type=int, required=True) 295 | parser_pack_features.add_argument('--n_hop', type=int, required=True) 296 | 297 | parser_compute_scaler = subparsers.add_parser('compute_scaler') 298 | parser_compute_scaler.add_argument('--workspace', type=str, required=True) 299 | parser_compute_scaler.add_argument('--data_type', type=str, required=True) 300 | parser_compute_scaler.add_argument('--snr', type=float, required=True) 301 | 302 | args = parser.parse_args() 303 | if args.mode == 'create_mixture_csv': 304 | create_mixture_csv(args) 305 | elif args.mode == 'calculate_mixture_features': 306 | calculate_mixture_features(args) 307 | elif args.mode == 'pack_features': 308 | pack_features(args) 309 | elif args.mode == 'compute_scaler': 310 | compute_scaler(args) 311 | else: 312 | raise Exception("Error!") 313 | -------------------------------------------------------------------------------- /utils/bnorm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import tensorflow as tf 9 | 10 | 11 | class VBN(object): 12 | """ 13 | Virtual Batch Normalization 14 | (modified from https://github.com/openai/improved-gan/ definition) 15 | """ 16 | 17 | def __init__(self, x, name, epsilon=1e-5): 18 | """ 19 | x is the reference batch 20 | """ 21 | assert isinstance(epsilon, float) 22 | 23 | shape = x.get_shape().as_list() 24 | assert len(shape) == 3, shape 25 | with tf.variable_scope(name) as scope: 26 | assert name.startswith("d_") or name.startswith("g_") 27 | self.epsilon = epsilon 28 | self.name = name 29 | self.mean = tf.reduce_mean(x, [0, 1], keep_dims=True) 30 | self.mean_sq = tf.reduce_mean(tf.square(x), [0, 1], keep_dims=True) 31 | self.batch_size = int(x.get_shape()[0]) 32 | assert x is not None 33 | assert self.mean is not None 34 | assert self.mean_sq is not None 35 | out = self._normalize(x, self.mean, self.mean_sq, "reference") 36 | self.reference_output = out 37 | 38 | def __call__(self, x): 39 | 40 | shape = x.get_shape().as_list() 41 | with tf.variable_scope(self.name) as scope: 42 | new_coeff = 1. / (self.batch_size + 1.) 43 | old_coeff = 1. - new_coeff 44 | new_mean = tf.reduce_mean(x, [0, 1], keep_dims=True) 45 | new_mean_sq = tf.reduce_mean(tf.square(x), [0, 1], keep_dims=True) 46 | mean = new_coeff * new_mean + old_coeff * self.mean 47 | mean_sq = new_coeff * new_mean_sq + old_coeff * self.mean_sq 48 | out = self._normalize(x, mean, mean_sq, "live") 49 | return out 50 | 51 | def _normalize(self, x, mean, mean_sq, message): 52 | # make sure this is called with a variable scope 53 | shape = x.get_shape().as_list() 54 | assert len(shape) == 3 55 | self.gamma = tf.get_variable("gamma", [shape[-1]], 56 | initializer=tf.random_normal_initializer(1., 0.02)) 57 | gamma = tf.reshape(self.gamma, [1, 1, -1]) 58 | self.beta = tf.get_variable("beta", [shape[-1]], 59 | initializer=tf.constant_initializer(0.)) 60 | beta = tf.reshape(self.beta, [1, 1, -1]) 61 | assert self.epsilon is not None 62 | assert mean_sq is not None 63 | assert mean is not None 64 | std = tf.sqrt(self.epsilon + mean_sq - tf.square(mean)) 65 | out = x - mean 66 | out = out / std 67 | out = out * gamma 68 | out = out + beta 69 | return out 70 | 71 | def vbn(self, tensor, name): 72 | if self.disable_vbn: 73 | class Dummy(object): 74 | # Do nothing here, no bnorm 75 | def __init__(self, tensor, ignored): 76 | self.reference_output=tensor 77 | def __call__(self, x): 78 | return x 79 | VBN_cls = Dummy 80 | else: 81 | VBN_cls = VBN 82 | if not hasattr(self, name): 83 | vbn = VBN_cls(tensor, name) 84 | setattr(self, name, vbn) 85 | return vbn.reference_output 86 | vbn = getattr(self, name) 87 | return vbn(tensor) 88 | -------------------------------------------------------------------------------- /utils/bnorm.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/utils/bnorm.pyc -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang 5 | 6 | """ This module contains several utility functions and classes that are 7 | commonly used in every scripts. 8 | https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/libs/common.py 9 | """ 10 | 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import logging 16 | import subprocess 17 | 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(logging.NullHandler()) 20 | 21 | 22 | def execute_command(command): 23 | """ Runs a job in the foreground and waits for it to complete; raises an 24 | exception if its return status is nonzero. The command is executed in 25 | 'shell' mode so 'command' can involve things like pipes. 26 | See also: get_command_stdout 27 | """ 28 | p = subprocess.Popen(command, shell=True) 29 | p.communicate() 30 | if p.returncode is not 0: 31 | raise Exception("Command exited with status {0}: {1}".format( 32 | p.returncode, command)) 33 | 34 | 35 | def get_command_stdout(command, require_zero_status = True): 36 | """ Executes a command and returns its stdout output as a string. The 37 | command is executed with shell=True, so it may contain pipes and 38 | other shell constructs. 39 | If require_zero_stats is True, this function will raise an exception if 40 | the command has nonzero exit status. If False, it just prints a warning 41 | if the exit status is nonzero. 42 | See also: execute_command 43 | """ 44 | p = subprocess.Popen(command, shell=True, 45 | stdout=subprocess.PIPE) 46 | 47 | stdout = p.communicate()[0] 48 | if p.returncode is not 0: 49 | output = "Command exited with status {0}: {1}".format( 50 | p.returncode, command) 51 | if require_zero_status: 52 | raise Exception(output) 53 | else: 54 | logger.warning(output) 55 | return stdout if type(stdout) is str else stdout.decode() 56 | -------------------------------------------------------------------------------- /utils/generate_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import argparse 11 | import errno 12 | import logging 13 | import os 14 | import sys 15 | import warnings 16 | 17 | sys.path.append(os.path.dirname(sys.path[0])) 18 | import common as common 19 | 20 | 21 | try: 22 | import matplotlib as mpl 23 | mpl.use('Agg') 24 | import matplotlib.pyplot as plt 25 | import numpy as np 26 | from matplotlib.patches import Rectangle 27 | g_plot = True 28 | except ImportError: 29 | warnings.warn( 30 | """This script requires matplotlib and numpy. 31 | Please install them to generate plots. 32 | Proceeding with generation of tables. 33 | If you are on a cluster where you do not have admin rights you could 34 | try using virtualenv.""") 35 | g_plot = False 36 | 37 | 38 | logger = logging.getLogger('utils') 39 | logger.setLevel(logging.INFO) 40 | handler = logging.StreamHandler() 41 | handler.setLevel(logging.INFO) 42 | formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " 43 | "%(funcName)s - %(levelname)s ] %(message)s") 44 | handler.setFormatter(formatter) 45 | logger.addHandler(handler) 46 | logger.info('Generating plots') 47 | 48 | 49 | def get_args(): 50 | parser = argparse.ArgumentParser( 51 | description="""Parses the training logs and generates a variety of 52 | plots. 53 | e.g. utils/generate_plots.py train_dnn.log exp/train_dnn. 54 | Look for the report.pdf in the output (report) directory.""") 55 | parser.add_argument("--adversarial", 56 | default=False, 57 | action="store_true", 58 | help="Flag indicating parse adversarial model or not." 59 | ) 60 | parser.add_argument("log_file", 61 | # required=True, 62 | help="name of log file." 63 | ) 64 | parser.add_argument("output_dir", 65 | # required=True, 66 | help="report directory." 67 | ) 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | g_plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan'] 73 | 74 | 75 | def generate_loss_plots(adversarial, log_file, output_dir, plot): 76 | train_key = "TRAIN" 77 | valid_key = "CROSS" 78 | try: 79 | os.makedirs(output_dir) 80 | except OSError as e: 81 | if e.errno == errno.EEXIST and os.path.isdir(output_dir): 82 | pass 83 | else: 84 | raise e 85 | logger.info("Generating loss plots") 86 | if adversarial: 87 | tr_losses = parse_loss_log_adversarial(log_file, train_key) 88 | cv_losses = parse_loss_log_adversarial(log_file, valid_key) 89 | else: 90 | tr_losses = parse_loss_log(log_file, train_key) 91 | cv_losses = parse_loss_log(log_file, valid_key) 92 | 93 | if plot: 94 | fig = plt.figure() 95 | plots = [] 96 | 97 | for key_word in sorted(tr_losses.keys()): 98 | name = key_word 99 | tr_data = tr_losses[key_word] 100 | tr_data = np.array(tr_data) 101 | tr_iters = np.arange(1, tr_data.size+1) 102 | color_val = g_plot_colors[0] 103 | plot_handle, = plt.plot(tr_iters[:], tr_data[:], color=color_val, 104 | linestyle="--", label="train") 105 | plots.append(plot_handle) 106 | color_val = g_plot_colors[1] 107 | cv_data = cv_losses[key_word] 108 | cv_data = np.array(cv_data) 109 | cv_iters = np.linspace(0, tr_data.size, num=cv_data.size, dtype=int) 110 | plot_handle, = plt.plot(cv_iters[:], cv_data[:], color=color_val, 111 | label="valid") 112 | plots.append(plot_handle) 113 | if plot: 114 | plt.xlabel("Iteration") 115 | plt.ylabel("Loss") 116 | lgd = plt.legend(handles=plots, loc="upper right", 117 | ncol=1, borderaxespad=0.) 118 | plt.grid(True) 119 | plt.title(key_word) 120 | figfile_name = "{0}/{1}.pdf".format(output_dir, key_word) 121 | plt.savefig(figfile_name, bbox_extra_artists=(lgd,), 122 | bbox_inches="tight") 123 | fig = plt.figure() 124 | plots = [] 125 | 126 | 127 | def parse_loss_log_adversarial(log_file, key): 128 | """Parse adversarial model loss log file. 129 | train_loss_string format: 130 | 1/821 (TRAIN AVG.LOSS): d_rl_loss = 0.32810, d_fk_loss = 0.32194, d_loss = 0.65004, g_adv_loss = 0.50822, g_mse_loss = 7.11048, g_l2_loss = 0.00000, g_loss = 36.06060 131 | valid_loss_string format: 132 | 1/821 (CROSS AVG.LOSS): d_rl_loss = 0.34894, d_fk_loss = 0.17205, d_loss = 0.52099, g_adv_loss = 0.39619, g_mse_loss = 8.70989, g_l2_loss = 0.00000, g_loss = 43.94563 133 | """ 134 | d_rl_losses = [] 135 | d_fk_losses = [] 136 | d_losses = [] 137 | g_adv_losses = [] 138 | g_mse_losses = [] 139 | g_l2_losses = [] 140 | g_losses = [] 141 | key_word = ["d_rl_loss", "d_fk_loss", "d_loss", 142 | "g_adv_loss", "g_mse_loss", "g_l2_loss", "g_loss"] 143 | losses = {key_word[0]: d_rl_losses, 144 | key_word[1]: d_fk_losses, 145 | key_word[2]: d_losses, 146 | key_word[3]: g_adv_losses, 147 | key_word[4]: g_mse_losses, 148 | key_word[5]: g_l2_losses, 149 | key_word[6]: g_losses} 150 | 151 | train_loss_strings = common.get_command_stdout( 152 | "grep -e {} {}".format(key, log_file)) 153 | for line in train_loss_strings.strip().split("\n"): 154 | line = line.split(",") 155 | assert len(line) == 7 156 | for i in range(7): 157 | sub_line = line[i].split() 158 | assert key_word[i] in sub_line 159 | losses[key_word[i]].append(float(sub_line[-1])) 160 | 161 | return losses 162 | 163 | 164 | def parse_loss_log(log_file, key): 165 | """Parse loss log file. 166 | train_loss_string format: 167 | 1/178 (TRAIN AVG.LOSS): g_mse_loss = 12.76571, g_l2_loss = 0.00000, g_loss = 12.76571, learning_rate= 1.200e-03 168 | valid_loss_string format: 169 | 1/178 (CROSS AVG.LOSS): g_mse_loss = 9.99273, g_l2_loss = 0.00000, g_loss = 9.99273, time = 3.52 min 170 | """ 171 | g_mse_losses = [] 172 | g_l2_losses = [] 173 | g_losses = [] 174 | key_word = ["g_mse_loss", "g_l2_loss", "g_loss"] 175 | losses = {key_word[0]: g_mse_losses, 176 | key_word[1]: g_l2_losses, 177 | key_word[2]: g_losses} 178 | 179 | train_loss_strings = common.get_command_stdout( 180 | "grep -e {} {}".format(key, log_file)) 181 | for line in train_loss_strings.strip().split("\n"): 182 | line = line.split(",") 183 | assert len(line) == 4 184 | for i in range(3): 185 | sub_line = line[i].split() 186 | assert key_word[i] in sub_line 187 | losses[key_word[i]].append(float(sub_line[-1])) 188 | 189 | return losses 190 | 191 | 192 | def main(): 193 | args = get_args() 194 | generate_loss_plots(args.adversarial, args.log_file, args.output_dir, g_plot) 195 | logger.info("Generating loss plots sucessfully.") 196 | 197 | 198 | if __name__ == "__main__": 199 | main() 200 | -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang Xiaomi 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import argparse 11 | import sys 12 | import pprint 13 | 14 | import tensorflow as tf 15 | import tensorflow.contrib.slim as slim 16 | 17 | 18 | pp = pprint.PrettyPrinter() 19 | 20 | def check_tensorflow_version(): 21 | if tf.__version__ < "1.3.0": 22 | raise EnvironmentError("Tensorflow version must >= 1.3.0") 23 | else: 24 | print(tf.__version__) 25 | 26 | 27 | def read_list(filename): 28 | data_list = [] 29 | with open(filename, 'r') as fr: 30 | lines = fr.readlines() 31 | for line in lines: 32 | line = line.strip() 33 | data_list.append(line) 34 | return data_list 35 | 36 | 37 | def show_all_variables(): 38 | model_vars = tf.trainable_variables() 39 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 40 | sys.stdout.flush() 41 | 42 | 43 | def str2bool(v): 44 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 45 | return True 46 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 47 | return False 48 | else: 49 | raise argparse.ArgumentTypeError('Boolean value expected.') 50 | -------------------------------------------------------------------------------- /utils/misc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/utils/misc.pyc -------------------------------------------------------------------------------- /utils/ops.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linan2/TensorFlow-speech-enhancement-Chinese/7033215c086efea8bf0fb56319f4185d7fdb5754/utils/ops.pyc -------------------------------------------------------------------------------- /utils/select_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2017 Ke Wang 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import sys 11 | 12 | if __name__ == "__main__": 13 | if len(sys.argv) != 4: 14 | print("Error parameter numbers.") 15 | print("Usage: python select_data.py infile1(key) ", end='') 16 | print("infile2(text_raw) outfile(text)") 17 | sys.exit(1) 18 | file_key = open(sys.argv[1], 'r') 19 | file_raw = open(sys.argv[2], 'r') 20 | file_text = open(sys.argv[3], 'w') 21 | 22 | key = [] 23 | 24 | key_lines = file_key.readlines() 25 | for line in key_lines: 26 | line = line.decode('utf-8').strip() 27 | key.append(line) 28 | 29 | raw_lines = file_raw.readlines() 30 | line_num = 0 31 | line_total = len(key) 32 | for line in raw_lines: 33 | line_back = line.decode('utf-8').strip() 34 | line = line_back.split() 35 | if line[0] == key[line_num]: 36 | file_text.write(line_back.encode('utf-8')) 37 | file_text.write('\n') 38 | line_num += 1 39 | if line_num >= line_total: 40 | break 41 | --------------------------------------------------------------------------------