├── README.md
├── RNNoise-License
├── Scripts
├── DataPreparation.sh
└── WaveConcat.sh
├── TRAINING-README
├── c-audio-experiments
└── main.c
├── evaluation
├── CDenoiseClassicMethods.m
├── CDownsample_WAV_Classic.m
├── CDownsample_WAV_Noisy.m
├── CEVALUATION_script_Classic.m
├── Downsample_WAV.m
├── EVALUATION_script.m
├── EvalPlotFun.m
├── EvalStatFun.m
├── README.txt
├── spectrograms.m
└── stoi.m
├── evaluation_batch.sh
├── featureExtractor
├── .deps
│ ├── .dirstamp
│ └── rnnoise_demo.Po
├── .dirstamp
├── .libs
│ ├── lt-rnnoise_demo
│ └── rnnoise_demo
├── denoise.c
├── denoise.h
├── denoise.o
├── feature_extractor
└── feature_extractor.c
├── log_testset.csv
├── src
├── denoise.c
├── denoise.h
└── featureExtraction.py
└── training
├── bin2hdf5.py
├── evaluation.py
├── evaluation_batch.py
└── rnn_train.py
/README.md:
--------------------------------------------------------------------------------
1 | # RNNoise-Ex
2 | RNNoise-Ex is an attempt to extend RNNoise into a hybrid speech enhancement system that combines an RNN with traditional spectral features.
3 |
4 | ## Network Architecture
5 |
6 |
7 | ## Usage
8 | See [TRAINING-README](TRAINING-README) for instructions on how to train and run the model.
9 |
10 | ## Paper
11 | You can find the [RNNoise-Ex paper on arXiv](https://arxiv.org/abs/2105.11813).
12 | Citation:
13 | ```
14 | @misc{rnnoise-ex,
15 | doi = {10.48550/ARXIV.2105.11813},
16 | url = {https://arxiv.org/abs/2105.11813},
17 | author = {Doumanidis, Constantine C. and Anagnostou, Christina and Arvaniti, Evangelia-Sofia and Papadopoulou, Anthi},
18 | title = {RNNoise-Ex: Hybrid Speech Enhancement System based on RNN and Spectral Features},
19 | publisher = {arXiv},
20 | year = {2021},
21 | copyright = {arXiv.org perpetual, non-exclusive license}
22 | }
23 | ```
24 |
--------------------------------------------------------------------------------
/RNNoise-License:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017, Mozilla
2 | Copyright (c) 2007-2017, Jean-Marc Valin
3 | Copyright (c) 2005-2017, Xiph.Org Foundation
4 | Copyright (c) 2003-2004, Mark Borgerding
5 |
6 | This is the original license of RNNoise. Files modified from the
7 | original RNNoise project are licensed under the following terms.
8 | Other files included in the rnnoise-ex project are licensed under
9 | different terms.
10 |
11 | Redistribution and use in source and binary forms, with or without
12 | modification, are permitted provided that the following conditions
13 | are met:
14 |
15 | - Redistributions of source code must retain the above copyright
16 | notice, this list of conditions and the following disclaimer.
17 |
18 | - Redistributions in binary form must reproduce the above copyright
19 | notice, this list of conditions and the following disclaimer in the
20 | documentation and/or other materials provided with the distribution.
21 |
22 | - Neither the name of the Xiph.Org Foundation nor the names of its
23 | contributors may be used to endorse or promote products derived from
24 | this software without specific prior written permission.
25 |
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION
30 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 |
--------------------------------------------------------------------------------
/Scripts/DataPreparation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Bash script to prepare noisy audio data for analysis
3 |
4 | echo "[$(date +%T)]: Starting..."
5 |
6 | while IFS=, read -r file scene db;
7 | do
8 | echo "[$(date +%T)]: Processing $file";
9 | # Create Directories
10 | mkdir -p $scene/"$db DB"/noisy_wav/
11 | mkdir -p $scene/"$db DB"/noisy_raw/
12 | mkdir -p $scene/"$db DB"/clean_raw/
13 | mkdir -p $scene/"$db DB"/clean_wav/
14 | # Categorize based on log file
15 | cp noisy_testset_wav/"$file.wav" $scene/"$db DB"/noisy_wav/"$file.wav"
16 | # Convert wave to raw
17 | sox $scene/"$db DB"/noisy_wav/"$file.wav" $scene/"$db DB"/noisy_raw/"$file.raw"
18 | # Use RNNoise to clean the audio
19 | rnnoise/rnnoise_demo $scene/"$db DB"/noisy_raw/"$file.raw" $scene/"$db DB"/clean_raw/"$file.raw"
20 | # Convert clean raw sample to wave for analysis
21 | sox -t raw -r 48000 -b 16 -e signed-integer -c 1 $scene/"$db DB"/clean_raw/"$file.raw" $scene/"$db DB"/clean_wav/"$file.wav"
22 | done < log_testset.csv
--------------------------------------------------------------------------------
/Scripts/WaveConcat.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Bash script to prepare two big audio files for training the RNNoise
3 |
4 | echo "[$(date +%T)]: Starting..."
5 |
6 | # Concatenate all speech wave files into one
7 | sox speech/*.wav speech_concatenated.wav
8 |
9 | # Concatenate all noise wave files into one
10 | sox noise/*.wav noise_concatenated.wav
--------------------------------------------------------------------------------
/TRAINING-README:
--------------------------------------------------------------------------------
1 | # Training
2 |
3 | (1) Compile Valin's feature extraction application as modified by us:
4 | cd src ; ./compile.sh
5 |
6 | (2) Concatenate speech and noise tracks and convert them to raw format
7 | cd ../speech; sox $(ls *.wav) trainingSpeech.wav; sox trainingSpeech.wav trainingSpeech.raw
8 | cd ../noise; sox $(ls *.wav) trainingNoise.wav; sox trainingNoise.wav trainingNoise.raw
9 |
10 | (3) Run the feature extraction application that also dumps the raw noisy samples in ./src/noisySpeechSamples.raw
11 | It is advised to use 500000 or more for count
12 | cd ../src;
13 | ./denoise_training ../speech/trainingSpeech.raw ../noise/trainingNoise.raw count > trainingRNNoiseFeatures.f32
14 |
15 | (4) Extract extended features by running featureExtraction.py. Features will be saved as trainingExtendedFeatures.h5
16 | cd ../src; python ./featureExtraction.py training noisySpeechSamples.raw trainingExtendedFeatures.h5 count
17 |
18 | (5) Join feature sets using bin2hdf5
19 | cd ../training; ./bin2hdf5.py ../src/trainingRNNoiseFeatures.f32 ../src/trainingExtendedFeatures.h5 trainingFeatures.h5 training
20 |
21 | (6) Train RNNoise
22 | ./rnn_train.py trainingFeatures.h5 trainedModel.hdf5
23 |
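The six training steps above can also be chained into one shell script. The following is a minimal sketch, assuming the directory layout used above (speech/, noise/, src/, training/), a sample count of 500000, and that it is run from the repository root:

#!/bin/bash
# Sketch of training steps (1)-(6); adjust COUNT and paths to your setup.
set -e
COUNT=500000
(cd src && ./compile.sh)
(cd speech && sox $(ls *.wav) trainingSpeech.wav && sox trainingSpeech.wav trainingSpeech.raw)
(cd noise && sox $(ls *.wav) trainingNoise.wav && sox trainingNoise.wav trainingNoise.raw)
cd src
./denoise_training ../speech/trainingSpeech.raw ../noise/trainingNoise.raw $COUNT > trainingRNNoiseFeatures.f32
python ./featureExtraction.py training noisySpeechSamples.raw trainingExtendedFeatures.h5 $COUNT
cd ../training
./bin2hdf5.py ../src/trainingRNNoiseFeatures.f32 ../src/trainingExtendedFeatures.h5 trainingFeatures.h5 training
./rnn_train.py trainingFeatures.h5 trainedModel.hdf5
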
24 | # Evaluation
25 |
26 | (1) Compile the RNNoise feature extractor
27 | cd ../featureExtraction
28 | gcc -Wall -o feature_extractor -lm feature_extractor.c denoise.c ../src/kiss_fft.c ../src/rnn_data.c ../src/pitch.c ../src/rnn.c ../src/celt_lpc.c
29 |
30 | (2) Convert desired audio track to raw
31 | sox testAudio.wav testAudio.raw
32 |
33 | (3) Extract RNNoise features using the feature extractor
34 | ./feature_extractor testAudio.raw testAudioFeatures.f32
35 |
36 | (4) Extract extended features using featureExtraction.py
37 | python ../training/featureExtraction.py testing testAudio.wav testAudioExtendedFeatures.h5
38 |
39 | (5) Join feature sets using bin2hdf5
40 | cd ../training; ./bin2hdf5.py ../featureExtraction/testAudioFeatures.f32 ../featureExtraction/testAudioExtendedFeatures.h5 testAudioFeatures.h5 testing
41 |
42 | (6) Run RNNoise using evaluation.py
43 | python ./evaluation.py trainedModel.hdf5 testAudioFeatures.h5 ../featureExtraction/testAudio.wav testAudioClean.wav
44 |
45 | (6a) If you want to batch evaluate a folder with wav files use
46 | python ./evaluation_batch.py noisyWavsDirectory/ trainedModel.hdf5
47 |
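Evaluation steps (2)-(6) can likewise be chained for a single file. A minimal sketch, assuming the paths named above, that feature_extractor has already been built as in step (1), and that testAudio.wav sits in the featureExtraction directory:

#!/bin/bash
# Sketch of evaluation steps (2)-(6) for one input file (testAudio.wav).
set -e
cd featureExtraction
sox testAudio.wav testAudio.raw
./feature_extractor testAudio.raw testAudioFeatures.f32
python ../training/featureExtraction.py testing testAudio.wav testAudioExtendedFeatures.h5
cd ../training
./bin2hdf5.py ../featureExtraction/testAudioFeatures.f32 ../featureExtraction/testAudioExtendedFeatures.h5 testAudioFeatures.h5 testing
python ./evaluation.py trainedModel.hdf5 testAudioFeatures.h5 ../featureExtraction/testAudio.wav testAudioClean.wav

For batch evaluation of a whole folder, see evaluation_batch.sh in the repository root.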
--------------------------------------------------------------------------------
/c-audio-experiments/main.c:
--------------------------------------------------------------------------------
1 | #define FRAME_SIZE 236300
2 | #include <stdio.h>   /* required: FILE, fopen, fread, fwrite */
3 | #include <stdlib.h>  /* assumed; the original header names are missing */
4 | #include <math.h>    /* assumed */
5 |
6 | int main(){
7 |
8 | // Import wave file converted to raw to a 16bit array
9 | FILE *fin, *foutShort, *foutFloat;
10 | fin = fopen("sample.raw", "r");
11 | foutShort = fopen("processedShort.raw", "w");
12 | foutFloat = fopen("processedFloat.raw", "w");
13 | short shortRaw[FRAME_SIZE];
14 | float floatRaw[FRAME_SIZE];
15 | fread(shortRaw, sizeof(short), FRAME_SIZE, fin);
16 |
17 | // Copy to 32 bit array
18 | for(int i=0; i < FRAME_SIZE; i++){
19 | floatRaw[i] = shortRaw[i];
20 | }
21 |
22 | // Dump both to raw files
23 | fwrite(shortRaw, sizeof(short), FRAME_SIZE, foutShort);
24 | fwrite(floatRaw, sizeof(float), FRAME_SIZE, foutFloat);
25 |
26 | return 0;
27 | }
28 |
29 | /*
30 | Pre-processing:
31 | 1. Converted to raw: sox sample.wav sample.raw
32 | 2. Opened raw with audacity 16bit signed mono 48khz little endian
33 |
34 | 3. Compile: gcc main.c -o experiment
35 | 4. Processing: ./experiment
36 |
37 | Post Processing:
38 | 5. Opening processedShort.raw with previously mentioned settings in audacity yields desired results
39 | 6. Opening processedFloat.raw with previously mentioned settings in audacity yields desired results
40 | but has some trailing garbage after the full length of the useful file has been played. To trim them
41 | out just use:
42 |
43 | sox -t raw -r 48000 -b 16 -e signed-integer -c 1 processedFloat.raw processedFloat.wav
44 |
45 | to convert the file to wav and then to trim it use:
46 |
47 | sox processedFloat.wav processedFloatTrimmed.wav trim 0 4.92
48 |
49 | (keeps file from 0s to 4.92s)
50 |
51 | */
--------------------------------------------------------------------------------
/evaluation/CDenoiseClassicMethods.m:
--------------------------------------------------------------------------------
1 | clear all;
2 | close all;
3 |
4 | technic = "wiener-as";
5 |
6 | %% Store filename of Enhanced Signals
7 | folders = {'bus','cafe','living','office','psquare'};
8 | folders2 = {'2.5 DB','7.5 DB','12.5 DB','17.5 DB'};
9 |
10 | k = 0; %counter for the 20 conditions
11 | data_Noisy = cell(length(folders)*length(folders2),1); %filenames of enhanced
12 | data_Noisy_16 = cell(length(folders)*length(folders2),1); %filenames of enhanced
13 | descriptor = cell(length(folders)*length(folders2),1); %descriptors of conditions
14 | for i=1:length(folders) %folders of noise types
15 | for j = 1:length(folders2) %folders of dB
16 | k = k+1;
17 | names = "RNNoise2\" + folders{i} + "\ " + folders2{j} +"\noisy_wav"; %folder of enhanced signals
18 | names = dir(names + "\p*"); %initial storage of files
19 | data_cell = regexpi({names.name},'p\w*_down16.wav','match'); %get only wanted filenames
20 | data_cell = vertcat(data_cell{:}); %discard empty records
21 | data_Noisy_16(k) = {data_cell}; %store them in a cell array
22 | data_cell = regexpi({names.name},'p\d{3}_\d{3}.wav','match'); %get only wanted filenames
23 | data_cell = vertcat(data_cell{:}); %discard empty records
24 | data_Noisy(k) = {data_cell}; %store them in a cell array
25 | descriptor(k) = {folders{i} + "_" + folders2{j}}; %create a descriptor for the conditions
26 | end
27 | end
28 |
29 | %% Denoise
30 | for i=1:size(data_Noisy,1)%for every condition (20 in total)
31 | files = data_Noisy{i}; %filenames for enhanced
32 | files_16 = data_Noisy_16{i};
33 | for j=1:length(data_Noisy{i}) %for every file in the folder
34 | %get full path to .wav of clean and enhanced signals
35 | desc = strsplit(descriptor{i},'_'); %split descriptor (type of noise & SNR)
36 | name_Noisy = "RNNoise2\"+ desc{1} + "\ " + desc{2} + "\noisy_wav\" + files{j}; %full path to enhanced file
37 | name_Denoised = "RNNoise2\clean_ClassicMeth\" + technic + "\" + desc{1} + "\" + desc{2} + "\noisy_wav\" + files{j};
38 | %16 KHz
39 | name_Noisy_16 = "RNNoise2\"+ desc{1} + "\ " + desc{2} + "\noisy_wav\" + files_16{j}; %full path to enhanced file
40 | name_Denoised_16 = "RNNoise2\clean_ClassicMeth\" + technic + "\"+ desc{1} + "\" + desc{2} + "\noisy_wav\" + files_16{j};
41 | % logmmse(convertStringsToChars(name_Noisy),convertStringsToChars(name_Denoised));
42 | % logmmse(convertStringsToChars(name_Noisy_16),convertStringsToChars(name_Denoised_16));
43 | wiener_as(convertStringsToChars(name_Noisy),convertStringsToChars(name_Denoised));
44 | wiener_as(convertStringsToChars(name_Noisy_16),convertStringsToChars(name_Denoised_16));
45 | end
46 | end
--------------------------------------------------------------------------------
/evaluation/CDownsample_WAV_Classic.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | %for the downsampling of denoised 48KHz signals
4 | technic = "wiener-as";
5 | %% FOR ALL FOLDERS EXCEPT "clean"
6 | folders = {'bus','cafe','living','office','psquare'};
7 | folders2 = {'2.5 DB','7.5 DB','12.5 DB','17.5 DB'};
8 | Fresample = 16000; %final sampling frequency
9 |
10 | for k=1:length(folders) %for every noise type folder
11 | for j=1:length(folders2) %for every SNR
12 | sAudioFolder="RNNoise2\clean_ClassicMeth\" + technic + "\" + folders{k} + "\" + folders2{j} +"\noisy_wav"; %path of files
13 | eFiles=dir(sAudioFolder+"\*.wav"); %get all .wav files
14 | for i=1:length(eFiles) %for every file
15 | if(~isempty(regexpi(eFiles(i).name,'p\d{3}_\d{3}.wav','match')))
16 | sAudioFile=fullfile(sAudioFolder,eFiles(i).name); %full path to file
17 | [y,Fs] = audioread(sAudioFile); %read file
18 | y_resamp = resample(y,Fresample,Fs); %resample at Fresample frequency
19 | sAudioFileOut=fullfile(sAudioFolder,[strrep(eFiles(i).name,'.wav','') '_down16_AfterDen.wav']); %create new filename
20 | audiowrite(convertStringsToChars(sAudioFileOut),y_resamp,Fresample); %store the downsampled signal
21 | end
22 | end
23 | end
24 | end
--------------------------------------------------------------------------------
/evaluation/CDownsample_WAV_Noisy.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | %creates noisy wav files downsampled to 16KHz
4 | %% FOR ALL FOLDERS EXCEPT "clean"
5 | folders = {'bus','cafe','living','office','psquare'};
6 | folders2 = {'2.5 DB','7.5 DB','12.5 DB','17.5 DB'};
7 | Fresample = 16000; %final sampling frequency
8 |
9 | for k=1:length(folders) %for every noise type folder
10 | for j=1:length(folders2) %for every SNR
11 | sAudioFolder="RNNoise2\" + folders{k} + "\ " + folders2{j} +"\noisy_wav"; %path of files
12 | eFiles=dir(sAudioFolder+"\*.wav"); %get all .wav files
13 | for i=1:length(eFiles) %for every file
14 | sAudioFile=fullfile(sAudioFolder,eFiles(i).name); %full path to file
15 | [y,Fs] = audioread(sAudioFile); %read file
16 | y_resamp = resample(y,Fresample,Fs); %resample at Fresample frequency
17 | sAudioFileOut=fullfile(sAudioFolder,[strrep(eFiles(i).name,'.wav','') '_down16.wav']); %create new filename
18 | audiowrite(convertStringsToChars(sAudioFileOut),y_resamp,Fresample); %store the downsampled signal
19 | end
20 | end
21 | end
--------------------------------------------------------------------------------
/evaluation/CEVALUATION_script_Classic.m:
--------------------------------------------------------------------------------
1 | clear all;
2 | close all;
3 |
4 | technic = "wiener-as";
5 | DOWNSAMPLED_BEFORE = 0;
6 | %% Store filename of Enhanced Signals
7 | folders = {'bus','cafe','living','office','psquare'};
8 | folders2 = {'2.5 DB','7.5 DB','12.5 DB','17.5 DB'};
9 |
10 | k = 0; %counter for the 20 conditions
11 | data_Enh = cell(length(folders)*length(folders2),1); %filenames of enhanced
12 | data_Enh_16 = cell(length(folders)*length(folders2),1); %filenames of enhanced 16 KHz
13 | data_Enh_16_After = cell(length(folders)*length(folders2),1); %filenames of enhanced 16 KHz
14 | descriptor = cell(length(folders)*length(folders2),1); %descriptors of conditions
15 | for i=1:length(folders) %folders of noise types
16 | for j = 1:length(folders2) %folders of dB
17 | k = k+1;
18 | names = "RNNoise2\clean_ClassicMeth\" + technic +"\"+ folders{i} + "\" + folders2{j} +"\noisy_wav"; %folder of enhanced signals
19 | names = dir(names + "\p*"); %initial storage of files
20 | data_cell = regexpi({names.name},'p\w*_down16.wav','match'); %get only wanted filenames
21 | data_cell = vertcat(data_cell{:}); %discard empty records
22 | data_Enh_16(k) = {data_cell}; %store them in a cell array
23 | data_cell = regexpi({names.name},'p\d{3}_\d{3}.wav','match'); %get only wanted filenames
24 | data_cell = vertcat(data_cell{:}); %discard empty records
25 | data_Enh(k) = {data_cell}; %store them in a cell array
26 | data_cell = regexpi({names.name},'p\w*_down16_AfterDen.wav','match'); %get only wanted filenames
27 | data_cell = vertcat(data_cell{:}); %discard empty records
28 | data_Enh_16_After(k) = {data_cell}; %store them in a cell array
29 | descriptor(k) = {folders{i} + "_" + folders2{j}}; %create a descriptor for the conditions
30 | end
31 | end
32 |
33 | %% Compute Metrics
34 | %find number of files in every folder
35 | a = [];
36 | for i=1:length(data_Enh)
37 | a(i) = length(data_Enh{i});
38 | end
39 |
40 | %initialization of metrics' arrays
41 | Csig = zeros(max(a),length(data_Enh));
42 | Cbak = zeros(max(a),length(data_Enh));
43 | Covl = zeros(max(a),length(data_Enh));
44 | PESQ = zeros(max(a),length(data_Enh),2);
45 | STOI = zeros(max(a),length(data_Enh));
46 | LLR = zeros(max(a),length(data_Enh));
47 | fwSNRseg = zeros(max(a),length(data_Enh));
48 | Snr_mean = zeros(max(a),length(data_Enh));
49 | SegSNR_mean = zeros(max(a),length(data_Enh));
50 | Pers = zeros(max(a),length(data_Enh));
51 |
52 | %computation of metrics
53 | for i=1:size(data_Enh,1)%for every condition (20 in total)
54 | files = data_Enh{i}; %filenames for enhanced
55 | files_16 = data_Enh_16{i};
56 | files_16_After = data_Enh_16_After{i};
57 | for j=1:length(data_Enh{i}) %for every file in the folder
58 | %get full path to .wav of clean and enhanced signals
59 | desc = strsplit(descriptor{i},'_'); %split descriptor (type of noise & SNR)
60 | % name_Enh = "RNNoise2\"+ desc{1} + "\ " + desc{2} + "\clean_wav\" + files{j}; %full path to enhanced file
61 | name_Enh = "RNNoise2\clean_ClassicMeth\" + technic + "\"+ desc{1} + "\" + desc{2} + "\noisy_wav\" + files{j}; %full path to enhanced file
62 | name_Clean = "RNNoise2\clean\" + files{j}; %full path to clear file
63 | %16 KHz
64 | % name_Enh_16 = "RNNoise2\"+ desc{1} + "\ " + desc{2} + "\clean_wav\" + files_16{j}; %full path to enhanced file
65 | name_Enh_16 = "RNNoise2\clean_ClassicMeth\" + technic + "\"+ desc{1} + "\" + desc{2} + "\noisy_wav\" + files_16{j}; %full path to enhanced file
66 | name_Clean_16 = "RNNoise2\clean\" + files_16{j}; %full path to clear file
67 | %16 KHz After Denoising
68 | name_Enh_16_After = "RNNoise2\clean_ClassicMeth\" + technic+ "\" + desc{1} + "\" + desc{2} + "\noisy_wav\" + files_16_After{j}; %full path to enhanced file
69 |
70 | if(DOWNSAMPLED_BEFORE == 1) %16KHz from the start
71 | %Csig/Cbak/Covl
72 | [Csig(j,i),Cbak(j,i),Covl(j,i)]=composite(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16));
73 | %PESQ
74 | PESQ(j,i,:) = pesq(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16));
75 | %Log-likelihood ratio (LLR)
76 | LLR(j,i) = comp_llr(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16));
77 | %fwSNRseg
78 | fwSNRseg(j,i) = comp_fwseg(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16));
79 | %SNR
80 | [Snr_mean(j,i), SegSNR_mean(j,i)]= comp_snr(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16));
81 | %STOI
82 | clean_audio = audioread(name_Clean_16); %get clean audio
83 | [enh_audio,fs] = audioread(name_Enh_16); %get enhanced audio
84 | samples = min(length(clean_audio),length(enh_audio)); %find minimum number of samples between clean and enhanced
85 | STOI(j,i) = stoi(clean_audio(1:samples),enh_audio(1:samples),fs);
86 | %Pearson
87 | Pers(j,i) = corr(clean_audio(1:samples),enh_audio(1:samples));
88 | else
89 | %Csig/Cbak/Covl
90 | [Csig(j,i),Cbak(j,i),Covl(j,i)]=composite(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16_After));
91 | %PESQ
92 | PESQ(j,i,:) = pesq(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16_After));
93 | %Log-likelihood ratio (LLR)
94 | LLR(j,i) = comp_llr(convertStringsToChars(name_Clean),convertStringsToChars(name_Enh));
95 | %fwSNRseg
96 | fwSNRseg(j,i) = comp_fwseg(convertStringsToChars(name_Clean),convertStringsToChars(name_Enh));
97 | %SNR
98 | [Snr_mean(j,i), SegSNR_mean(j,i)]= comp_snr(convertStringsToChars(name_Clean),convertStringsToChars(name_Enh));
99 | %STOI
100 | clean_audio = audioread(name_Clean); %get clean audio
101 | [enh_audio,fs] = audioread(name_Enh); %get enhanced audio
102 | samples = min(length(clean_audio),length(enh_audio)); %find minimum number of samples between clean and enhanced
103 | STOI(j,i) = stoi(clean_audio(1:samples),enh_audio(1:samples),fs);
104 | %Pearson
105 | Pers(j,i) = corr(clean_audio(1:samples),enh_audio(1:samples));
106 | end
107 |
108 |
109 | end
110 | end
111 |
112 | % %% Statistics
113 | % %initializaton
114 | % Csig_mean = zeros(size(Csig,2),1);
115 | % Cbak_mean = zeros(size(Csig,2),1);
116 | % Covl_mean = zeros(size(Csig,2),1);
117 | % STOI_mean = zeros(size(Csig,2),1);
118 | % PESQ_mean = zeros(size(Csig,2),2);
119 | % LLR_mean = zeros(size(Csig,2),1);
120 | % fwSNRseg_mean = zeros(size(Csig,2),1);
121 | % SNR_mean = zeros(size(Csig,2),1); %SNR_mean
122 | % SNRseg_mean = zeros(size(Csig,2),1); %segSNR_mean
123 | % Corr_mean = zeros(size(Csig,2),1); %Pearson correlation
124 | %
125 | %
126 | % for i=1:size(Csig,2) %for every condition (noise type & SNR)
127 | % %for all metrics remove 0s (missing records) elements and then
128 | % %compute the mean for every condition
129 | % Csig_mean(i) = mean(Csig(Csig(:,i) ~= 0,i),1);
130 | % Cbak_mean(i) = mean(Cbak(Cbak(:,i) ~= 0,i),1);
131 | % Covl_mean(i) = mean(Covl(Covl(:,i) ~= 0,i),1);
132 | % PESQ_mean(i,1) = mean(PESQ(PESQ(:,i,1)~= 0,i,1),1); %2 for narrow- and wideband
133 | % PESQ_mean(i,2) = mean(PESQ(PESQ(:,i,2)~= 0,i,2),1);
134 | % STOI_mean(i) = mean(STOI(STOI(:,i) ~= 0,i),1);
135 | % LLR_mean(i) = mean(LLR(LLR(:,i) ~= 0,i),1);
136 | % fwSNRseg_mean(i) = mean(fwSNRseg(fwSNRseg(:,i) ~= 0,i),1);
137 | % SNR_mean(i) = mean(Snr_mean(Snr_mean(:,i) ~= 0,i),1);
138 | % SNRseg_mean(i) = mean(SegSNR_mean(SegSNR_mean(:,i) ~= 0,i),1);
139 | % Corr_mean(i) = mean(Pers(Pers(:,i) ~= 0,i),1);
140 | % end
141 |
142 | ALL_Metrics=[Csig Cbak Covl PESQ(:,:,1) PESQ(:,:,2) STOI LLR fwSNRseg Snr_mean SegSNR_mean Pers];
143 | numberOfMetrics=11;
144 |
145 | AllResults1_MEAN=zeros(1,11);
146 | AllResults2_MEAN=zeros(20,11);
147 | AllResults3_MEAN=zeros(4,11);
148 | AllResults4_MEAN=zeros(5,11);
149 | AllResults1_STD=zeros(1,11);
150 | AllResults2_STD=zeros(20,11);
151 | AllResults3_STD=zeros(4,11);
152 | AllResults4_STD=zeros(5,11);
153 |
154 | for i=1:numberOfMetrics
155 | [AllResults1_MEAN(i),AllResults2_MEAN(:,i),AllResults3_MEAN(:,i),AllResults4_MEAN(:,i),AllResults1_STD(i),AllResults2_STD(:,i),AllResults3_STD(:,i),AllResults4_STD(:,i),]=EvalPlotFun(ALL_Metrics(:,(i-1)*20+(1:20)),i);
156 | end
157 |
158 | FigList = findobj(allchild(0), 'flat', 'Type', 'figure');
159 | %FigList = flip(FigList);
160 | for iFig = 1:1:length(FigList)
161 | FigHandle = FigList(iFig);
162 | FigHandle.WindowState = 'maximized';
163 | saveas(FigHandle,fullfile("resultsClassicWiener-AS48KHz",num2str(length(FigList)-iFig+1)+".png"));
164 | end
--------------------------------------------------------------------------------
/evaluation/Downsample_WAV.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | %% FOR ALL FOLDERS EXCEPT "clean"
4 | folders = {'bus','cafe','living','office','psquare'};
5 | folders2 = {'2.5 DB','7.5 DB','12.5 DB','17.5 DB'};
6 | Fresample = 16000; %final sampling frequency
7 |
8 | for k=1:length(folders) %for every noise type folder
9 | for j=1:length(folders2) %for every SNR
10 | sAudioFolder="RNNoise2\" + folders{k} + "\ " + folders2{j} +"\clean_wav"; %path of files
11 | eFiles=dir(sAudioFolder+"\*.wav"); %get all .wav files
12 | for i=1:length(eFiles) %for every file
13 | sAudioFile=fullfile(sAudioFolder,eFiles(i).name); %full path to file
14 | [y,Fs] = audioread(sAudioFile); %read file
15 | y_resamp = resample(y,Fresample,Fs); %resample at Fresample frequency
16 | sAudioFileOut=fullfile(sAudioFolder,[strrep(eFiles(i).name,'.wav','') '_down16.wav']); %create new filename
17 | audiowrite(convertStringsToChars(sAudioFileOut),y_resamp,Fresample); %store the downsampled signal
18 | end
19 | end
20 | end
21 |
22 | % %% FOR FOLDER "clean"
23 | % sAudioFolder="RNNoise2\clean"; %path to files
24 | % eFiles=dir(sAudioFolder+"\*.wav"); %get all .wav files of folder
25 | % for i=1:length(eFiles) %for every file
26 | % sAudioFile=fullfile(sAudioFolder,eFiles(i).name); %full path to file
27 | % [y,Fs] = audioread(sAudioFile); %read file
28 | % y_resamp = resample(y,Fresample,Fs); %resample at Fresample frequency
29 | % sAudioFileOut=fullfile(sAudioFolder,[strrep(eFiles(i).name,'.wav','') '_down16.wav']); %create new filename
30 | % audiowrite(convertStringsToChars(sAudioFileOut),y_resamp,Fresample); %store the downsampled signal
31 | % end
--------------------------------------------------------------------------------
/evaluation/EVALUATION_script.m:
--------------------------------------------------------------------------------
1 | clear all;
2 | close all;
3 |
4 | %% Store filename of Enhanced Signals
5 | folders = {'bus','cafe','living','office','psquare'};
6 | folders2 = {'2.5 DB','7.5 DB','12.5 DB','17.5 DB'};
7 |
8 | k = 0; %counter for the 20 conditions
9 | data_Enh = cell(length(folders)*length(folders2),1); %filenames of enhanced
10 | data_Enh_16 = cell(length(folders)*length(folders2),1); %filenames of enhanced 16 KHz
11 | descriptor = cell(length(folders)*length(folders2),1); %descriptors of conditions
12 | for i=1:length(folders) %folders of noise types
13 | for j = 1:length(folders2) %folders of dB
14 | k = k+1;
15 | names = "RNNoise2\" + folders{i} + "\ " + folders2{j} +"\clean_wav"; %folder of enhanced signals
16 | names = dir(names + "\p*"); %initial storage of files
17 | data_cell = regexpi({names.name},'p\w*_down16.wav','match'); %get only wanted filenames
18 | data_cell = vertcat(data_cell{:}); %discard empty records
19 | data_Enh_16(k) = {data_cell}; %store them in a cell array
20 | data_cell = regexpi({names.name},'p\d{3}_\d{3}.wav','match'); %get only wanted filenames
21 | data_cell = vertcat(data_cell{:}); %discard empty records
22 | data_Enh(k) = {data_cell}; %store them in a cell array
23 | descriptor(k) = {folders{i} + "_" + folders2{j}}; %create a descriptor for the conditions
24 | end
25 | end
26 |
27 | %% Compute Metrics
28 | %find number of files in every folder
29 | a = [];
30 | for i=1:length(data_Enh)
31 | a(i) = length(data_Enh{i});
32 | end
33 |
34 | %initialization of metrics' arrays
35 | Csig = zeros(max(a),length(data_Enh));
36 | Cbak = zeros(max(a),length(data_Enh));
37 | Covl = zeros(max(a),length(data_Enh));
38 | PESQ = zeros(max(a),length(data_Enh),2);
39 | STOI = zeros(max(a),length(data_Enh));
40 | LLR = zeros(max(a),length(data_Enh));
41 | fwSNRseg = zeros(max(a),length(data_Enh));
42 | Snr_mean = zeros(max(a),length(data_Enh));
43 | SegSNR_mean = zeros(max(a),length(data_Enh));
44 | Pers = zeros(max(a),length(data_Enh));
45 |
46 | %computation of metrics
47 | for i=1:size(data_Enh,1)%for every condition (20 in total)
48 | files = data_Enh{i}; %filenames for enhanced
49 | files_16 = data_Enh_16{i};
50 | for j=1:length(data_Enh{i}) %for every file in the folder
51 | %get full path to .wav of clean and enhanced signals
52 | desc = strsplit(descriptor{i},'_'); %split descriptor (type of noise & SNR)
53 | name_Enh = "RNNoise2\"+ desc{1} + "\ " + desc{2} + "\clean_wav\" + files{j}; %full path to enhanced file
54 | name_Clean = "RNNoise2\clean\" + files{j}; %full path to clear file
55 | %16 KHz
56 | name_Enh_16 = "RNNoise2\"+ desc{1} + "\ " + desc{2} + "\clean_wav\" + files_16{j}; %full path to enhanced file
57 | name_Clean_16 = "RNNoise2\clean\" + files_16{j}; %full path to clear file
58 |
59 | %Csig/Cbak/Covl
60 | [Csig(j,i),Cbak(j,i),Covl(j,i)]=composite(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16));
61 |
62 | %PESQ
63 | PESQ(j,i,:) = pesq(convertStringsToChars(name_Clean_16),convertStringsToChars(name_Enh_16));
64 |
65 | %Log-likelihood ratio (LLR)
66 | LLR(j,i) = comp_llr(convertStringsToChars(name_Clean),convertStringsToChars(name_Enh));
67 |
68 | %fwSNRseg
69 | fwSNRseg(j,i) = comp_fwseg(convertStringsToChars(name_Clean),convertStringsToChars(name_Enh));
70 |
71 | %SNR
72 | [Snr_mean(j,i), SegSNR_mean(j,i)]= comp_snr(convertStringsToChars(name_Clean),convertStringsToChars(name_Enh));
73 |
74 | %STOI
75 | clean_audio = audioread(name_Clean); %get clean audio
76 | [enh_audio,fs] = audioread(name_Enh); %get enhanced audio
77 | samples = min(length(clean_audio),length(enh_audio)); %find minimum number of samples between clean and enhanced
78 | STOI(j,i) = stoi(clean_audio(1:samples),enh_audio(1:samples),fs);
79 |
80 | %Pearson
81 | Pers(j,i) = corr(clean_audio(1:samples),enh_audio(1:samples));
82 |
83 | end
84 | end
85 |
86 | % %% Statistics
87 | % %initializaton
88 | % Csig_mean = zeros(size(Csig,2),1);
89 | % Cbak_mean = zeros(size(Csig,2),1);
90 | % Covl_mean = zeros(size(Csig,2),1);
91 | % STOI_mean = zeros(size(Csig,2),1);
92 | % PESQ_mean = zeros(size(Csig,2),2);
93 | % LLR_mean = zeros(size(Csig,2),1);
94 | % fwSNRseg_mean = zeros(size(Csig,2),1);
95 | % SNR_mean = zeros(size(Csig,2),1); %SNR_mean
96 | % SNRseg_mean = zeros(size(Csig,2),1); %segSNR_mean
97 | % Corr_mean = zeros(size(Csig,2),1); %Pearson correlation
98 | %
99 | %
100 | % for i=1:size(Csig,2) %for every condition (noise type & SNR)
101 | % %for all metrics remove 0s (missing records) elements and then
102 | % %compute the mean for every condition
103 | % Csig_mean(i) = mean(Csig(Csig(:,i) ~= 0,i),1);
104 | % Cbak_mean(i) = mean(Cbak(Cbak(:,i) ~= 0,i),1);
105 | % Covl_mean(i) = mean(Covl(Covl(:,i) ~= 0,i),1);
106 | % PESQ_mean(i,1) = mean(PESQ(PESQ(:,i,1)~= 0,i,1),1); %2 for narrow- and wideband
107 | % PESQ_mean(i,2) = mean(PESQ(PESQ(:,i,2)~= 0,i,2),1);
108 | % STOI_mean(i) = mean(STOI(STOI(:,i) ~= 0,i),1);
109 | % LLR_mean(i) = mean(LLR(LLR(:,i) ~= 0,i),1);
110 | % fwSNRseg_mean(i) = mean(fwSNRseg(fwSNRseg(:,i) ~= 0,i),1);
111 | % SNR_mean(i) = mean(Snr_mean(Snr_mean(:,i) ~= 0,i),1);
112 | % SNRseg_mean(i) = mean(SegSNR_mean(SegSNR_mean(:,i) ~= 0,i),1);
113 | % Corr_mean(i) = mean(Pers(Pers(:,i) ~= 0,i),1);
114 | % end
115 |
116 | ALL_Metrics=[Csig Cbak Covl PESQ(:,:,1) PESQ(:,:,2) STOI LLR fwSNRseg Snr_mean SegSNR_mean Pers];
117 | numberOfMetrics=11;
118 |
119 | AllResults1_MEAN=zeros(1,11);
120 | AllResults2_MEAN=zeros(20,11);
121 | AllResults3_MEAN=zeros(4,11);
122 | AllResults4_MEAN=zeros(5,11);
123 | AllResults1_STD=zeros(1,11);
124 | AllResults2_STD=zeros(20,11);
125 | AllResults3_STD=zeros(4,11);
126 | AllResults4_STD=zeros(5,11);
127 |
128 | for i=1:numberOfMetrics
129 | [AllResults1_MEAN(i),AllResults2_MEAN(:,i),AllResults3_MEAN(:,i),AllResults4_MEAN(:,i),AllResults1_STD(i),AllResults2_STD(:,i),AllResults3_STD(:,i),AllResults4_STD(:,i),]=EvalPlotFun(ALL_Metrics(:,(i-1)*20+(1:20)),i);
130 | end
131 |
132 | FigList = findobj(allchild(0), 'flat', 'Type', 'figure');
133 | %FigList = flip(FigList);
134 | for iFig = 1:1:length(FigList)
135 | FigHandle = FigList(iFig);
136 | FigHandle.WindowState = 'maximized';
137 | saveas(FigHandle,fullfile("resultsRNNoise",num2str(iFig)+".png"));
138 | end
139 | % FigList = findobj(allchild(0), 'flat', 'Type', 'figure');
140 | % %FigList = flip(FigList);
141 | % for iFig = 1:1:length(FigList)
142 | % FigHandle = FigList(iFig);
143 | % FigHandle.WindowState = 'maximized';
144 | % saveas(FigHandle,fullfile("resultsRNNoiseModified_500.000",num2str(length(FigList)-iFig)+".png"));
145 | % end
--------------------------------------------------------------------------------
/evaluation/EvalPlotFun.m:
--------------------------------------------------------------------------------
1 | function [out1, out2, out3, out4, out5, out6, out7, out8]=EvalPlotFun(Vmatrix,metricType)
2 | %% Preamble
3 | metricNames={"Csig","Cbak","Covl","narrow-band PESQ","wide-band PESQ","STOI","LLR","fwSNRseg","SNR","SegSNR","Pearson's Coefficient"}
4 |
5 | [out1, out2, out3, out4]=EvalStatFun(Vmatrix,metricType,1);
6 | [out5, out6, out7, out8]=EvalStatFun(Vmatrix,metricType,2);
7 |
8 |
9 | %% 1st Figure
10 | databar=[out2(1),out2(2),out2(3),out2(4);out2(5),out2(6),out2(7),out2(8);out2(9),out2(10),out2(11),out2(12);out2(13),out2(14),out2(15),out2(16);out2(17),out2(18),out2(19),out2(20)];
11 | stdbar=[out6(1),out6(2),out6(3),out6(4);out6(5),out6(6),out6(7),out6(8);out6(9),out6(10),out6(11),out6(12);out6(13),out6(14),out6(15),out6(16);out6(17),out6(18),out6(19),out6(20)];
12 |
13 | h1=figure
14 | hb=bar(databar);
15 |
16 | hold on
17 | [ngroups,nbars]=size(databar);
18 | groupwidth = min(0.8, nbars/(nbars + 1.5));
19 | % Set the position of each error bar in the centre of the main bar
20 | % Based on barweb.m by Bolu Ajiboye from MATLAB File Exchange
21 | for i = 1:nbars
22 | % Calculate center of each bar
23 | x = (1:ngroups) - groupwidth/2 + (2*i-1) * groupwidth / (2*nbars);
24 | errorbar(x, databar(:,i), stdbar(:,i), 'k', 'linestyle', 'none');
25 | end
26 |
27 | if (metricType==1)
28 | ylim([0 5.2])
29 | elseif (metricType==2)||(metricType==3)
30 | ylim([0 5])
31 | elseif ((metricType==4)||(metricType==5))
32 | ylim([-0.5 4.5])
33 | elseif (metricType==6)
34 | ylim([0 1.1])
35 | elseif (metricType==7)
36 | ylim([0 2.2])
37 | elseif (metricType==8)
38 | ylim([0 23.5])
39 | elseif (metricType==9)
40 | ylim([0 25])
41 | elseif (metricType==10)
42 | ylim([-5 16])
43 | else
44 | ylim([0 1])
45 | end
46 |
47 |
48 | % title({'Performance in different noise conditions at various SNR levels'
49 | % ['Metric: ',convertStringsToChars(metricNames(metricType))]})
50 | grid on
51 | h=suptitle({'Performance in different noise conditions at various SNR levels'
52 | ['Metric: ',char(cellstr(metricNames(metricType)))]
53 | 'Modified RNNoise'});% --> change this part of the title manually
54 | set(h,'FontSize',16,'FontWeight','bold')
55 |
56 | HeightScaleFactor = 1.5;
57 | NewHeight = h.Position(2) * HeightScaleFactor;
58 | %h.Position(2) = h.Position(2) - (NewHeight - h.Position(4));
59 | h.Position(2) = NewHeight;
60 |
61 |
62 | name={'bus'; 'cafe'; 'living'; 'office'; 'psquare'};
63 | set(gca,'xticklabel',name,'FontWeight', 'bold','FontSize',12);
64 | lgd=legend('2.5dB','7.5dB','12.5dB','17.5dB', 'Location', 'northeastoutside');
65 | title(lgd,'SNR Levels')
66 |
67 | hold off
68 |
69 | %saveas(h1,sprintf(convertCharsToStrings(['Figure',(3*metricType-2)])),'png');
70 |
71 |
72 | %% 2nd Figure
73 |
74 | h2=figure (3*metricType-1)
75 | snrX=[2.5,7.5,12.5,17.5];
76 |
77 | subplot(2,6,[1,2])
78 | errorbar(snrX,out2(1:4),out6(1:4), 'LineWidth', 1.5);
79 | xticks([2.5 7.5 12.5 17.5])
80 | xlabel('SNR levels', 'FontSize', 8)
81 | title('Noise Profile: BUS','FontSize',9, 'FontWeight', 'bold')
82 | grid on
83 | set(gca,'FontWeight', 'bold','FontSize',9);
84 |
85 | subplot(2,6,[3,4])
86 | errorbar(snrX,out2(5:8),out6(5:8), 'LineWidth', 1.5);
87 | xticks([2.5 7.5 12.5 17.5])
88 | xlabel('SNR levels', 'FontSize', 8)
89 | title('Noise Profile: CAFE','FontSize',9, 'FontWeight', 'bold')
90 | grid on
91 | set(gca,'FontWeight', 'bold','FontSize',9);
92 |
93 | subplot(2,6,[5,6])
94 | errorbar(snrX,out2(9:12),out6(9:12), 'LineWidth', 1.5);
95 | xticks([2.5 7.5 12.5 17.5])
96 | xlabel('SNR levels', 'FontSize', 8)
97 | title('Noise Profile: LIVING','FontSize',9, 'FontWeight', 'bold')
98 | grid on
99 | set(gca,'FontWeight', 'bold','FontSize',9);
100 |
101 | subplot(2,6,[8,9])
102 | errorbar(snrX,out2(13:16),out6(13:16), 'LineWidth', 1.5);
103 | xticks([2.5 7.5 12.5 17.5])
104 | xlabel('SNR levels', 'FontSize', 8)
105 | title('Noise Profile: OFFICE','FontSize',9, 'FontWeight', 'bold')
106 | grid on
107 | set(gca,'FontWeight', 'bold','FontSize',9);
108 |
109 | subplot(2,6,[10,11])
110 | errorbar(snrX,out2(17:20),out6(17:20), 'LineWidth', 1.5);
111 | xticks([2.5 7.5 12.5 17.5])
112 | xlabel('SNR levels', 'FontSize', 8)
113 | title('Noise Profile: PSQUARE','FontSize',9, 'FontWeight', 'bold')
114 | grid on
115 | set(gca,'FontWeight', 'bold','FontSize',9);
116 |
117 | h=suptitle({'System Performance for Specific Noise Conditions at Different SNR Levels'
118 | ['Metric: ',char(cellstr(metricNames(metricType))), ' - Modified RNNoise']
119 | });
120 | set(h,'FontSize',14,'FontWeight','bold')
121 |
122 | %% 3rd Figure
123 |
124 | figure (3*metricType)
125 | x=1:5;
126 |
127 | subplot(2,2,1)
128 | databar1=[out2(1),out2(5),out2(9),out2(13),out2(17)];
129 | stdbar1=[out6(1),out6(5),out6(9),out6(13),out6(17)];
130 | bar(x,databar1,'FaceColor', [0.00 0.45 0.74])
131 | hold on
132 | er1=errorbar(x,databar1,stdbar1);
133 | er1.Color = [0 0 0];
134 | er1.LineStyle = 'none';
135 | name={'bus'; 'cafe'; 'living'; 'office'; 'psquare'};
136 | set(gca,'xticklabel',name,'FontWeight','bold');
137 | hold off
138 | title('SNR Level: 2.5dB')
139 | grid on
140 |
141 | subplot(2,2,2)
142 | databar2=[out2(2),out2(6),out2(10),out2(14),out2(18)];
143 | stdbar2=[out6(2),out6(6),out6(10),out6(14),out6(18)];
144 | bar(x,databar2,'FaceColor', [0.85 0.33 0.10])
145 | hold on
146 | er2=errorbar(x,databar2,stdbar2);
147 | er2.Color = [0 0 0];
148 | er2.LineStyle = 'none';
149 | name={'bus'; 'cafe'; 'living'; 'office'; 'psquare'};
150 | set(gca,'xticklabel',name,'FontWeight','bold');
151 | hold off
152 | title('SNR Level: 7.5dB')
153 | grid on
154 |
155 | subplot(2,2,3)
156 | databar3=[out2(3),out2(7),out2(11),out2(15),out2(19)];
157 | stdbar3=[out6(3),out6(7),out6(11),out6(15),out6(19)];
158 | bar(x,databar3,'FaceColor', [0.93 0.69 0.13])
159 | hold on
160 | er3=errorbar(x,databar3,stdbar3);
161 | er3.Color = [0 0 0];
162 | er3.LineStyle = 'none';
163 | name={'bus'; 'cafe'; 'living'; 'office'; 'psquare'};
164 | set(gca,'xticklabel',name,'FontWeight','bold');
165 | hold off
166 | title('SNR Level: 12.5dB')
167 | grid on
168 |
169 | subplot(2,2,4)
170 | databar4=[out2(4),out2(8),out2(12),out2(16),out2(20)];
171 | stdbar4=[out6(4),out6(8),out6(12),out6(16),out6(20)];
172 | bar(x,databar4,'FaceColor', [0.49 0.18 0.56])
173 | hold on
174 | er4=errorbar(x,databar4,stdbar4);
175 | er4.Color = [0 0 0];
176 | er4.LineStyle = 'none';
177 | name={'bus'; 'cafe'; 'living'; 'office'; 'psquare'};
178 | set(gca,'xticklabel',name,'FontWeight','bold');
179 | hold off
180 | title('SNR Level: 17.5dB')
181 | grid on
182 |
183 | h=suptitle({'System Performance at Specific SNR Levels for Different Noise Profiles'
184 | ['Metric: ',char(cellstr(metricNames(metricType))),' - Modified RNNoise']
185 | });
186 | set(h,'FontSize',14,'FontWeight','bold')
187 |
--------------------------------------------------------------------------------
/evaluation/EvalStatFun.m:
--------------------------------------------------------------------------------
1 | function [out1, out2, out3, out4]=EvalStatFun(Vmatrix,metricType,statType)
2 | %% Instructions
3 | % statType:
4 | % 1 for mean
5 | % 2 for std
6 |
7 | % metricsType
8 | % 1 for Csig
9 | % 2 for Cbak
10 | % 3 for Covl
11 | % 4 for narrowband PESQ
12 | % 5 for wideband PESQ
13 | % 6 for STOI
14 | % 7 for LLR
15 | % 8 for fwSNRseg
16 | % 9 for Snr_mean
17 | % 10 for SegSNR_mean
18 | % 11 for Pers
19 |
20 | %% How did the system do overall? (all noise types at all SNRs)
21 | Allvalues=Vmatrix(:);
22 | if (statType==1)
23 | out1=mean(nonzeros(Allvalues));
24 | else
25 | out1=std(nonzeros(Allvalues));
26 | end
27 |
28 | %% How did the system do for every kind of noise at every different SNR level?
29 |
30 | out2 = zeros(size(Vmatrix,2),1);
31 | for i=1:size(Vmatrix,2)
32 | if(statType==1)
33 | out2(i) = mean(nonzeros(Vmatrix(:,i)));
34 | else
35 | out2(i) = std(nonzeros(Vmatrix(:,i)));
36 | end
37 | end
38 |
39 | %% How did the system do for each SNR level, taking into consideration all types of noise
40 | %BAR??
41 |
42 | SNR2_5=[Vmatrix(:,1)',Vmatrix(:,5)',Vmatrix(:,9)',Vmatrix(:,13)',Vmatrix(:,17)'];
43 | SNR7_5=[Vmatrix(:,2)',Vmatrix(:,6)',Vmatrix(:,10)',Vmatrix(:,14)',Vmatrix(:,18)'];
44 | SNR12_5=[Vmatrix(:,3)',Vmatrix(:,7)',Vmatrix(:,11)',Vmatrix(:,15)',Vmatrix(:,19)'];
45 | SNR17_5=[Vmatrix(:,4)',Vmatrix(:,8)',Vmatrix(:,12)',Vmatrix(:,16)',Vmatrix(:,20)'];
46 |
47 | if(statType==1)
48 | out3=[mean(nonzeros(SNR2_5));mean(nonzeros(SNR7_5));mean(nonzeros(SNR12_5));mean(nonzeros(SNR17_5))];
49 | else
50 | out3=[std(nonzeros(SNR2_5));std(nonzeros(SNR7_5));std(nonzeros(SNR12_5));std(nonzeros(SNR17_5))];
51 | end
52 |
53 | %% How did the system do for each noise type, taking into consideration all SNRs
54 | %LINEEE
55 |
56 | if((metricType~=8)&&(metricType~=9)&&(metricType~=10))
57 |
58 | bus=[Vmatrix(:,1)',Vmatrix(:,2)',Vmatrix(:,3)',Vmatrix(:,4)'];
59 | cafe=[Vmatrix(:,5)',Vmatrix(:,6)',Vmatrix(:,7)',Vmatrix(:,8)'];
60 | living=[Vmatrix(:,9)',Vmatrix(:,10)',Vmatrix(:,11)',Vmatrix(:,12)'];
61 | office=[Vmatrix(:,13)',Vmatrix(:,14)',Vmatrix(:,15)',Vmatrix(:,16)'];
62 | psquare=[Vmatrix(:,17)',Vmatrix(:,18)',Vmatrix(:,19)',Vmatrix(:,20)'];
63 |
64 | if(statType==1)
65 | out4=[mean(nonzeros(bus));mean(nonzeros(cafe));mean(nonzeros(living));mean(nonzeros(office));mean(nonzeros(psquare))];
66 | else
67 | out4=[std(nonzeros(bus));std(nonzeros(cafe));std(nonzeros(living));std(nonzeros(office));std(nonzeros(psquare))];
68 | end
69 |
70 | else
71 | out4=zeros(5,1);
72 | end
--------------------------------------------------------------------------------
/evaluation/README.txt:
--------------------------------------------------------------------------------
1 | %RNNoise and modified
2 | (1) Downsample noisy/cleaned samples
3 | Downsample_WAV.m
4 | (have to change folder to noisy_wav or clean_wav)
5 | (commented part is for folder "clean" containing original clean data)
6 |
7 | (2) Evaluation
8 | EVALUATION_script.m (uses EvalPlotFun.m and EvalStatFun.m)
9 | (change folder to store results - end of file)
10 |
11 | %Classic Methods
12 | (1) CDownsample_WAV_Noisy.m
13 | ('_down16.wav': creates 16KHz downsampled noisy data)
14 |
15 | (2) Denoise with Classic Methods
16 | CDenoiseClassicMethods.m
17 | (choose folder to store results using variable "technic". Also change function in for loop to change method)
18 |
19 | (3) CDownsample_WAV_Classic.m
20 | ('_down16_AfterDen.wav': downsampled denoised signals for PESQ and composite)
21 |
22 | (4) CEVALUATION_script_Classic.m (uses EvalPlotFun.m and EvalStatFun.m)
23 | (change folder to store results - end of file and also variable "technic")
24 |
25 | % We used Loizou's functions (comp_fwseg,comp_llr,comp_snr,composite,logmmse,pesq,wiener_as) from https://www.crcpress.com/downloads/K14513/K14513_CD_Files.zip
26 | % and some audio I/O functions (v_readwav,v_writewav,wavread,wavwrite) from VOICEBOX, which is under the GNU Public Licence (http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html)
27 | % The above-mentioned functions can be found at the links given.
28 | % The function stoi in the folder is taken from http://insy.ewi.tudelft.nl/content/short-time-objective-intelligibility-measure
--------------------------------------------------------------------------------
/evaluation/spectrograms.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | %% 1
4 |
5 | % [y1,fs] = audioread('p232_019clean.wav');
6 | % y2 = audioread('p232_019noisy.wav');
7 | % y3 = audioread('p232_019rnnoise.wav');
8 | % y4 = audioread('p232_019wiener.wav');
9 |
10 |
11 | %% 2
12 |
13 | % [y1,fs] = audioread('p232_006clean.wav');
14 | % y2 = audioread('p232_006noisy.wav');
15 | % y3 = audioread('p232_006rnnoise2.5.wav');
16 | % y4 = audioread('p232_006wiener.wav');
17 |
18 |
19 | %% 3
20 | % [y1,fs] = audioread('p232_019clean.wav');
21 | % y2 = audioread('p232_019noisy.wav');
22 | % y3 = audioread('p232_019rnnoise.wav');
23 | % y4 = audioread('p232_019logmmse.wav');
24 |
25 | %% 4
26 | [y1,fs] = audioread('p232_006clean.wav');
27 | y2 = audioread('p232_006noisy.wav');
28 | y3 = audioread('p232_006rnnoise.wav');
29 | y4 = audioread('p232_006logmmse.wav');
30 |
31 | %% 5
32 | % [y1,fs] = audioread('p232_003_clean.wav');
33 | % y2 = audioread('p232_003_bus7.5_noisy.wav');
34 | % y3 = audioread('p232_003_bus7.5_reference.wav');
35 | % y4 = audioread('p232_003_bus7.5_modified.wav');
36 |
37 |
38 | %% plot
39 | figure
40 | subplot(221)
41 | spectrogram(y1,2048,1024,[],fs,'yaxis')
42 | ax1 = gca;
43 | ax1.YScale = 'log';
44 | title("Clean Speech")
45 | subplot(222)
46 | spectrogram(y2,2048,1024,[],fs,'yaxis')
47 | ax2 = gca;
48 | ax2.YScale = 'log';
49 | title("Noisy Speech")
50 | subplot(223)
51 | spectrogram(y3,2048,1024,[],fs,'yaxis')
52 | ax3 = gca;
53 | ax3.YScale = 'log';
54 | title("Denoised Speech using RNNoise")
55 | %title("Denoised Speech using Reference RNNoise")
56 | subplot(224)
57 | spectrogram(y4,2048,1024,[],fs,'yaxis')
58 | ax4 = gca;
59 | ax4.YScale = 'log';
60 | %title("Denoised Speech using Wiener Filter")
61 | title("Denoised Speech using logMMSE Method")
62 | %title("Denoised Speech using Modified RNNoise")
63 | % suptitle({"Output Signal Comparison"
64 | % "Noise Type: OFFICE, SNR: 2.5 dB"})
65 | suptitle({"Output Signal Comparison"
66 | "Noise Type: CAFE, SNR: 17.5 dB"})
67 | % suptitle({"Output Signal Comparison"
68 | % "Noise Type: BUS, SNR: 7.5 dB"})
--------------------------------------------------------------------------------
/evaluation/stoi.m:
--------------------------------------------------------------------------------
1 | function d = stoi(x, y, fs_signal)
2 | % d = stoi(x, y, fs_signal) returns the output of the short-time
3 | % objective intelligibility (STOI) measure described in [1, 2], where x
4 | % and y denote the clean and processed speech, respectively, with sample
5 | % rate fs_signal in Hz. The output d is expected to have a monotonic
6 | % relation with the subjective speech-intelligibility, where a higher d
7 | % denotes better intelligible speech. See [1, 2] for more details.
8 | %
9 | % References:
10 | % [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
11 | % Objective Intelligibility Measure for Time-Frequency Weighted Noisy
12 | % Speech', ICASSP 2010, Texas, Dallas.
13 | %
14 | % [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
15 | % Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
16 | % IEEE Transactions on Audio, Speech, and Language Processing, 2011.
17 | %
18 | %
19 | % Copyright 2009: Delft University of Technology, Signal & Information
20 | % Processing Lab. The software is free for non-commercial use. This program
21 | % comes WITHOUT ANY WARRANTY.
22 | %
23 | %
24 | %
25 | % Updates:
26 | % 2011-04-26 Using the more efficient 'taa_corr' instead of 'corr'
27 |
28 | if length(x)~=length(y)
29 | error('x and y should have the same length');
30 | end
31 |
32 | % initialization
33 | x = x(:); % clean speech column vector
34 | y = y(:); % processed speech column vector
35 |
36 | fs = 10000; % sample rate of proposed intelligibility measure
37 | N_frame = 256; % window support
38 | K = 512; % FFT size
39 | J = 15; % Number of 1/3 octave bands
40 | mn = 150; % Center frequency of first 1/3 octave band in Hz.
41 | H = thirdoct(fs, K, J, mn); % Get 1/3 octave band matrix
42 | N = 30; % Number of frames for intermediate intelligibility measure (Length analysis window)
43 | Beta = -15; % lower SDR-bound
44 | dyn_range = 40; % speech dynamic range
45 |
46 | % resample signals if other samplerate is used than fs
47 | if fs_signal ~= fs
48 | x = resample(x, fs, fs_signal);
49 | y = resample(y, fs, fs_signal);
50 | end
51 |
52 | % remove silent frames
53 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2);
54 |
55 | % apply 1/3 octave band TF-decomposition
56 | x_hat = stdft(x, N_frame, N_frame/2, K); % apply short-time DFT to clean speech
57 | y_hat = stdft(y, N_frame, N_frame/2, K); % apply short-time DFT to processed speech
58 |
59 | x_hat = x_hat(:, 1:(K/2+1)).'; % take clean single-sided spectrum
60 | y_hat = y_hat(:, 1:(K/2+1)).'; % take processed single-sided spectrum
61 |
62 | X = zeros(J, size(x_hat, 2)); % init memory for clean speech 1/3 octave band TF-representation
63 | Y = zeros(J, size(y_hat, 2)); % init memory for processed speech 1/3 octave band TF-representation
64 |
65 | for i = 1:size(x_hat, 2)
66 | X(:, i) = sqrt(H*abs(x_hat(:, i)).^2); % apply 1/3 octave bands as described in Eq.(1) [1]
67 | Y(:, i) = sqrt(H*abs(y_hat(:, i)).^2);
68 | end
69 |
70 | % loop over all segments of length N and obtain intermediate intelligibility measure for all TF-regions
71 | d_interm = zeros(J, length(N:size(X, 2))); % init memory for intermediate intelligibility measure
72 | c = 10^(-Beta/20); % constant for clipping procedure
73 |
74 | for m = N:size(X, 2)
75 | X_seg = X(:, (m-N+1):m); % region with length N of clean TF-units for all j
76 | Y_seg = Y(:, (m-N+1):m); % region with length N of processed TF-units for all j
77 | alpha = sqrt(sum(X_seg.^2, 2)./sum(Y_seg.^2, 2)); % obtain scale factor for normalizing processed TF-region for all j
78 | aY_seg = Y_seg.*repmat(alpha, [1 N]); % obtain \alpha*Y_j(n) from Eq.(2) [1]
79 | for j = 1:J
80 | Y_prime = min(aY_seg(j, :), X_seg(j, :)+X_seg(j, :)*c); % apply clipping from Eq.(3)
81 | d_interm(j, m-N+1) = taa_corr(X_seg(j, :).', Y_prime(:)); % obtain correlation coefficient from Eq.(4) [1]
82 | end
83 | end
84 |
85 | d = mean(d_interm(:)); % combine all intermediate intelligibility measures as in Eq.(4) [1]
86 |
87 | %%
88 | function [A cf] = thirdoct(fs, N_fft, numBands, mn)
89 | % [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix
90 | % inputs:
91 | % FS: samplerate
92 | % N_FFT: FFT size
93 | % NUMBANDS: number of bands
94 | % MN: center frequency of first 1/3 octave band
95 | % outputs:
96 | % A: octave band matrix
97 | % CF: center frequencies
98 |
99 | f = linspace(0, fs, N_fft+1);
100 | f = f(1:(N_fft/2+1));
101 | k = 0:(numBands-1);
102 | cf = 2.^(k/3)*mn;
103 | fl = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn);
104 | fr = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn);
105 | A = zeros(numBands, length(f));
106 |
107 | for i = 1:(length(cf))
108 | [a b] = min((f-fl(i)).^2);
109 | fl(i) = f(b);
110 | fl_ii = b;
111 |
112 | [a b] = min((f-fr(i)).^2);
113 | fr(i) = f(b);
114 | fr_ii = b;
115 | A(i,fl_ii:(fr_ii-1)) = 1;
116 | end
117 |
118 | rnk = sum(A, 2);
119 | numBands = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1;
120 | A = A(1:numBands, :);
121 | cf = cf(1:numBands);
122 |
123 | %%
124 | function x_stdft = stdft(x, N, K, N_fft)
125 | % X_STDFT = X_STDFT(X, N, K, N_FFT) returns the short-time
126 | % hanning-windowed dft of X with frame-size N, overlap K and DFT size
127 | % N_FFT. The columns and rows of X_STDFT denote the frame-index and
128 | % dft-bin index, respectively.
129 |
130 | frames = 1:K:(length(x)-N);
131 | x_stdft = zeros(length(frames), N_fft);
132 |
133 | w = hanning(N);
134 | x = x(:);
135 |
136 | for i = 1:length(frames)
137 | ii = frames(i):(frames(i)+N-1);
138 | x_stdft(i, :) = fft(x(ii).*w, N_fft);
139 | end
140 |
141 | %%
142 | function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
143 | % [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y
144 | % are segmented with frame-length N and overlap K, where the maximum energy
145 | % of all frames of X is determined, say X_MAX. X_SIL and Y_SIL are the
146 | % reconstructed signals, excluding the frames, where the energy of a frame
147 | % of X is smaller than X_MAX-RANGE
148 |
149 | x = x(:);
150 | y = y(:);
151 |
152 | frames = 1:K:(length(x)-N);
153 | w = hanning(N);
154 | msk = zeros(size(frames));
155 |
156 | for j = 1:length(frames)
157 | jj = frames(j):(frames(j)+N-1);
158 | msk(j) = 20*log10(norm(x(jj).*w)./sqrt(N));
159 | end
160 |
161 | msk = (msk-max(msk)+range)>0;
162 | count = 1;
163 |
164 | x_sil = zeros(size(x));
165 | y_sil = zeros(size(y));
166 |
167 | for j = 1:length(frames)
168 | if msk(j)
169 | jj_i = frames(j):(frames(j)+N-1);
170 | jj_o = frames(count):(frames(count)+N-1);
171 | x_sil(jj_o) = x_sil(jj_o) + x(jj_i).*w;
172 | y_sil(jj_o) = y_sil(jj_o) + y(jj_i).*w;
173 | count = count+1;
174 | end
175 | end
176 |
177 | x_sil = x_sil(1:jj_o(end));
178 | y_sil = y_sil(1:jj_o(end));
179 |
180 | %%
181 | function rho = taa_corr(x, y)
182 | % RHO = TAA_CORR(X, Y) Returns correlation coefficient between column
183 | % vectors x and y. Gives same results as 'corr' from statistics toolbox.
184 | xn = x-mean(x);
185 | xn = xn/sqrt(sum(xn.^2));
186 | yn = y-mean(y);
187 | yn = yn/sqrt(sum(yn.^2));
188 | rho = sum(xn.*yn);
--------------------------------------------------------------------------------
/evaluation_batch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Bash script to evaluate a given set of audio files using RNNoise
3 |
4 | # Convert all files to raw
5 | cd noisy_testset_wav
6 | while IFS=, read -r file scene db;
7 | do
8 | # Print currently processed file
9 | echo "[$(date +%T)]: Processing $file";
10 |
11 | # Convert desired audio track to raw
12 | sox $file.wav $file.raw
13 |
14 | # Extract RNNoise features using the feature extractor
15 | ../featureExtractor/feature_extractor $file.raw $file.features.f32
16 |
17 | # Extract extended features
18 | python3.8 ../src/featureExtraction.py testing $file.wav $file.extendedFeatures.bin
19 |
20 | # Join feature sets using bin2hdf5
21 | python3.8 ../training/bin2hdf5.py $file.features.f32 $file.extendedFeatures.bin $file.fullFeatures.h5 testing
22 | done < ../log_testset.csv
23 |
24 | # Clean up work directory
25 | # rm *.f32; rm *.raw; rm *.h5; rm *.bin;
26 |
27 | cd ..
28 |
29 | # Run RNNoise
30 | python3.8 ./training/evaluation_batch.py noisy_testset_wav model_ex.hdf5
31 |
32 | # Categorize files
33 | while IFS=, read -r file scene db;
34 | do
35 | echo "[$(date +%T)]: Processing $file";
36 | # Create Directories
37 | mkdir -p $scene/"$db DB"/noisy_wav/
38 | mkdir -p $scene/"$db DB"/clean_wav/
39 | # Categorize based on log file
40 | cp noisy_testset_wav/"$file.wav" $scene/"$db DB"/noisy_wav/"$file.wav"
41 | cp noisy_testset_wav/"$file.clean.wav" $scene/"$db DB"/clean_wav/"$file.wav"
42 | done < log_testset.csv
43 |
44 | rm *.clean.wav
--------------------------------------------------------------------------------
/featureExtractor/.deps/.dirstamp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CedArctic/rnnoise-ex/16affa8f70ec84196b411f042a5682802c6857f2/featureExtractor/.deps/.dirstamp
--------------------------------------------------------------------------------
/featureExtractor/.deps/rnnoise_demo.Po:
--------------------------------------------------------------------------------
1 | examples/rnnoise_demo.o: examples/rnnoise_demo.c \
2 | /usr/include/stdc-predef.h /usr/include/stdio.h \
3 | /usr/include/bits/libc-header-start.h /usr/include/features.h \
4 | /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
5 | /usr/include/bits/long-double.h /usr/include/gnu/stubs.h \
6 | /usr/include/gnu/stubs-64.h \
7 | /usr/lib/gcc/x86_64-pc-linux-gnu/10.2.0/include/stddef.h \
8 | /usr/lib/gcc/x86_64-pc-linux-gnu/10.2.0/include/stdarg.h \
9 | /usr/include/bits/types.h /usr/include/bits/timesize.h \
10 | /usr/include/bits/typesizes.h /usr/include/bits/time64.h \
11 | /usr/include/bits/types/__fpos_t.h /usr/include/bits/types/__mbstate_t.h \
12 | /usr/include/bits/types/__fpos64_t.h /usr/include/bits/types/__FILE.h \
13 | /usr/include/bits/types/FILE.h /usr/include/bits/types/struct_FILE.h \
14 | /usr/include/bits/stdio_lim.h /usr/include/bits/floatn.h \
15 | /usr/include/bits/floatn-common.h /usr/include/bits/stdio.h \
16 | include/rnnoise.h
17 | /usr/include/stdc-predef.h:
18 | /usr/include/stdio.h:
19 | /usr/include/bits/libc-header-start.h:
20 | /usr/include/features.h:
21 | /usr/include/sys/cdefs.h:
22 | /usr/include/bits/wordsize.h:
23 | /usr/include/bits/long-double.h:
24 | /usr/include/gnu/stubs.h:
25 | /usr/include/gnu/stubs-64.h:
26 | /usr/lib/gcc/x86_64-pc-linux-gnu/10.2.0/include/stddef.h:
27 | /usr/lib/gcc/x86_64-pc-linux-gnu/10.2.0/include/stdarg.h:
28 | /usr/include/bits/types.h:
29 | /usr/include/bits/timesize.h:
30 | /usr/include/bits/typesizes.h:
31 | /usr/include/bits/time64.h:
32 | /usr/include/bits/types/__fpos_t.h:
33 | /usr/include/bits/types/__mbstate_t.h:
34 | /usr/include/bits/types/__fpos64_t.h:
35 | /usr/include/bits/types/__FILE.h:
36 | /usr/include/bits/types/FILE.h:
37 | /usr/include/bits/types/struct_FILE.h:
38 | /usr/include/bits/stdio_lim.h:
39 | /usr/include/bits/floatn.h:
40 | /usr/include/bits/floatn-common.h:
41 | /usr/include/bits/stdio.h:
42 | include/rnnoise.h:
43 |
--------------------------------------------------------------------------------
/featureExtractor/.dirstamp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CedArctic/rnnoise-ex/16affa8f70ec84196b411f042a5682802c6857f2/featureExtractor/.dirstamp
--------------------------------------------------------------------------------
/featureExtractor/.libs/lt-rnnoise_demo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CedArctic/rnnoise-ex/16affa8f70ec84196b411f042a5682802c6857f2/featureExtractor/.libs/lt-rnnoise_demo
--------------------------------------------------------------------------------
/featureExtractor/.libs/rnnoise_demo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CedArctic/rnnoise-ex/16affa8f70ec84196b411f042a5682802c6857f2/featureExtractor/.libs/rnnoise_demo
--------------------------------------------------------------------------------
/featureExtractor/denoise.c:
--------------------------------------------------------------------------------
1 | /* Copyright (c) 2018 Gregor Richards
2 | * Copyright (c) 2017 Mozilla */
3 | /*
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions
6 | are met:
7 |
8 | - Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | - Redistributions in binary form must reproduce the above copyright
12 | notice, this list of conditions and the following disclaimer in the
13 | documentation and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 |
28 | #ifdef HAVE_CONFIG_H
29 | #include "config.h"
30 | #endif
31 |
32 | #include "denoise.h"
33 |
34 |
35 | /* The built-in model, used if no file is given as input */
36 | extern const struct RNNModel rnnoise_model_orig;
37 |
38 |
39 | static const opus_int16 eband5ms[] = {
40 | /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/
41 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
42 | };
43 |
44 |
45 | typedef struct {
46 | int init;
47 | kiss_fft_state *kfft;
48 | float half_window[FRAME_SIZE];
49 | float dct_table[NB_BANDS*NB_BANDS];
50 | } CommonState;
51 |
52 | void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
53 | int i;
54 | float sum[NB_BANDS] = {0};
55 | for (i=0;irnn.model = model;
216 | else
217 | st->rnn.model = &rnnoise_model_orig;
218 | st->rnn.vad_gru_state = calloc(sizeof(float), st->rnn.model->vad_gru_size);
219 | st->rnn.noise_gru_state = calloc(sizeof(float), st->rnn.model->noise_gru_size);
220 | st->rnn.denoise_gru_state = calloc(sizeof(float), st->rnn.model->denoise_gru_size);
221 | return 0;
222 | }
223 |
224 | DenoiseState *rnnoise_create(RNNModel *model) {
225 | DenoiseState *st;
226 | st = malloc(rnnoise_get_size());
227 | rnnoise_init(st, model);
228 | return st;
229 | }
230 |
231 | void rnnoise_destroy(DenoiseState *st) {
232 | free(st->rnn.vad_gru_state);
233 | free(st->rnn.noise_gru_state);
234 | free(st->rnn.denoise_gru_state);
235 | free(st);
236 | }
237 |
238 | #if TRAINING
239 | int lowpass = FREQ_SIZE;
240 | int band_lp = NB_BANDS;
241 | #endif
242 |
243 | static void frame_analysis(DenoiseState *st, kiss_fft_cpx *X, float *Ex, const float *in) {
244 | int i;
245 | float x[WINDOW_SIZE];
246 | RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
247 | for (i=0;ianalysis_mem, in, FRAME_SIZE);
249 |
250 | // Apply Vorbis window
251 | apply_window(x);
252 |
253 | // Fourier Transform FFT
254 | forward_transform(X, x);
255 | #if TRAINING
256 | for (i=lowpass;i>1];
273 | int pitch_index;
274 | float gain;
275 | float *(pre[1]);
276 | // float** pre;
277 | float tmp[NB_BANDS];
278 | float follow, logMax;
279 |
280 | // FFT and energy
281 | frame_analysis(st, X, Ex, in);
282 |
283 | // Copy last part of pitch buffer to the beginning and fill the rest with noisy frame
284 | RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
285 | RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);
286 |
287 | // Pointer to pitch buffer
288 | pre[0] = &st->pitch_buf[0];
289 |
290 | // Find pitch
291 | pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
292 | pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
293 | PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
294 | pitch_index = PITCH_MAX_PERIOD-pitch_index;
295 |
296 |
297 | gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD,
298 | PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
299 | st->last_period = pitch_index;
300 | st->last_gain = gain;
301 |
302 | // x(n) = x(n-T)
303 | for (i=0;ipitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
305 | // p[i] = st->pitch_buf[pitch_index_old + i];
306 |
307 |
308 | apply_window(p);
309 | forward_transform(P, p);
310 | compute_band_energy(Ep, P);
311 | compute_band_corr(Exp, X, P);
312 | for (i=0;icepstral_mem[st->memid];
337 | ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
338 | ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
339 | for (i=0;imemid++;
341 | for (i=0;imemid == CEPS_MEM) st->memid = 0;
348 | for (i=0;icepstral_mem[i][k] - st->cepstral_mem[j][k];
360 | dist += tmp*tmp;
361 | }
362 | if (j!=i)
363 | mindist = MIN32(mindist, dist);
364 | }
365 | spec_variability += mindist;
366 | }
367 | features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
368 | return TRAINING && E < 0.1;
369 | }
370 |
371 | static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y) {
372 | float x[WINDOW_SIZE];
373 | int i;
374 | inverse_transform(x, y);
375 | apply_window(x);
376 | for (i=0;isynthesis_mem[i];
377 | RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);
378 | }
379 |
380 | void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
381 | int i;
382 | for (i=0;ig[i]) r[i] = 1;
400 | else r[i] = Exp[i]*(1-g[i])/(.001 + g[i]*(1-Exp[i]));
401 | r[i] = MIN16(1, MAX16(0, r[i]));
402 | #else
403 | if (Exp[i]>g[i]) r[i] = 1;
404 | else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
405 | r[i] = sqrt(MIN16(1, MAX16(0, r[i])));
406 | #endif
407 | r[i] *= sqrt(Ex[i]/(1e-8+Ep[i]));
408 | }
409 | interp_band_gain(rf, r);
410 | for (i=0;imem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
443 |
444 | silence = compute_frame_features(st, X, P, Ex, Ep, Exp, features, x);
445 |
446 | if (!silence) {
447 |
448 | compute_rnn(&st->rnn, g, &vad_prob, features);
449 |
450 | pitch_filter(X, P, Ex, Ep, Exp, g);
451 | for (i=0;ilastg[i]);
454 | st->lastg[i] = g[i];
455 | }
456 | interp_band_gain(gf, g);
457 | #if 1
458 | for (i=0;i
--------------------------------------------------------------------------------
/featureExtractor/denoise.h:
--------------------------------------------------------------------------------
29 | #include
30 | #include
31 | #include "../src/kiss_fft.h"
32 | #include "../src/common.h"
33 | #include
34 | #include "rnnoise.h"
35 | #include "../src/pitch.h"
36 | #include "../src/arch.h"
37 | #include "../src/rnn.h"
38 | #include "../src/rnn_data.h"
39 |
40 | struct DenoiseState {
41 | float analysis_mem[FRAME_SIZE];
42 | float cepstral_mem[CEPS_MEM][NB_BANDS];
43 | int memid;
44 | float synthesis_mem[FRAME_SIZE];
45 | float pitch_buf[PITCH_BUF_SIZE];
46 | float pitch_enh_buf[PITCH_BUF_SIZE];
47 | float last_gain;
48 | int last_period;
49 | float mem_hp_x[2];
50 | float lastg[NB_BANDS];
51 | RNNState rnn;
52 | };
53 |
54 | int compute_frame_features(DenoiseState *st, kiss_fft_cpx *X, kiss_fft_cpx *P,
55 | float *Ex, float *Ep, float *Exp, float *features, const float *in);
56 |
57 | void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N);
58 |
59 | #endif
--------------------------------------------------------------------------------
/featureExtractor/denoise.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CedArctic/rnnoise-ex/16affa8f70ec84196b411f042a5682802c6857f2/featureExtractor/denoise.o
--------------------------------------------------------------------------------
/featureExtractor/feature_extractor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CedArctic/rnnoise-ex/16affa8f70ec84196b411f042a5682802c6857f2/featureExtractor/feature_extractor
--------------------------------------------------------------------------------
/featureExtractor/feature_extractor.c:
--------------------------------------------------------------------------------
1 | /* Copyright (c) 2018 Gregor Richards
2 | * Copyright (c) 2017 Mozilla */
3 | /*
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions
6 | are met:
7 |
8 | - Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | - Redistributions in binary form must reproduce the above copyright
12 | notice, this list of conditions and the following disclaimer in the
13 | documentation and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 |
28 | #include
29 | #include
30 | #include
31 | #include
32 |
33 | #include "rnnoise.h"
34 | #include "../config.h"
35 | #include "../src/kiss_fft.h"
36 | #include "../src/common.h"
37 | #include "../src/pitch.h"
38 | #include "../src/arch.h"
39 | #include "../src/rnn.h"
40 | #include "../src/rnn_data.h"
41 | #include "../src/denoise.h"
42 |
43 |
44 | #define NB_BANDS 22
45 | #define NB_DELTA_CEPS 6
46 | #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)
47 | #define FRAME_SIZE_SHIFT 2
48 | #define FRAME_SIZE (120<mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
63 |
64 | compute_frame_features(st, X, P, Ex, Ep, Exp, features, x);
65 |
66 | return;
67 | }
68 |
69 | int main(int argc, char **argv) {
70 |
71 | float x[FRAME_SIZE];
72 | FILE *f1, *fout;
73 |
74 | float features[NB_FEATURES];
75 |
76 | // Create and allocate state
77 | DenoiseState *st;
78 | st = rnnoise_create(NULL);
79 |
80 | // Invalid input error message
81 | if (argc!=3) {
82 | fprintf(stderr, "usage: %s <input raw file> <output feature file>\n", argv[0]);
83 | return 1;
84 | }
85 |
86 | // Open file I/O streams
87 | f1 = fopen(argv[1], "rb");
88 | fout = fopen(argv[2], "wb");
89 |
90 | while (1) {
91 |
92 | // Allocate temporary space for the current frame and load it to memory
93 | short tmp[FRAME_SIZE];
94 | fread(tmp, sizeof(short), FRAME_SIZE, f1);
95 |
96 | // Check if we reached end of file
97 | if (feof(f1)) break;
98 |
99 | // Copy frame to x[]
100 | for (int i=0;i
--------------------------------------------------------------------------------
/src/denoise.c:
--------------------------------------------------------------------------------
33 | #include
34 | #include
35 | #include "kiss_fft.h"
36 | #include "common.h"
37 | #include
38 | #include "rnnoise.h"
39 | #include "pitch.h"
40 | #include "arch.h"
41 | #include "rnn.h"
42 | #include "rnn_data.h"
43 | #include "denoise.h"
44 |
45 |
46 | /* The built-in model, used if no file is given as input */
47 | extern const struct RNNModel rnnoise_model_orig;
48 |
49 |
50 | static const opus_int16 eband5ms[] = {
51 | /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/
52 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
53 | };
54 |
55 |
56 | typedef struct {
57 | int init;
58 | kiss_fft_state *kfft;
59 | float half_window[FRAME_SIZE];
60 | float dct_table[NB_BANDS*NB_BANDS];
61 | } CommonState;
62 |
63 | void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
64 | int i;
65 | float sum[NB_BANDS] = {0};
66 | for (i=0;irnn.model = model;
227 | else
228 | st->rnn.model = &rnnoise_model_orig;
229 | st->rnn.vad_gru_state = calloc(sizeof(float), st->rnn.model->vad_gru_size);
230 | st->rnn.noise_gru_state = calloc(sizeof(float), st->rnn.model->noise_gru_size);
231 | st->rnn.denoise_gru_state = calloc(sizeof(float), st->rnn.model->denoise_gru_size);
232 | return 0;
233 | }
234 |
235 | DenoiseState *rnnoise_create(RNNModel *model) {
236 | DenoiseState *st;
237 | st = malloc(rnnoise_get_size());
238 | rnnoise_init(st, model);
239 | return st;
240 | }
241 |
242 | void rnnoise_destroy(DenoiseState *st) {
243 | free(st->rnn.vad_gru_state);
244 | free(st->rnn.noise_gru_state);
245 | free(st->rnn.denoise_gru_state);
246 | free(st);
247 | }
248 |
249 | #if TRAINING
250 | int lowpass = FREQ_SIZE;
251 | int band_lp = NB_BANDS;
252 | #endif
253 |
254 | static void frame_analysis(DenoiseState *st, kiss_fft_cpx *X, float *Ex, const float *in) {
255 | int i;
256 | float x[WINDOW_SIZE];
257 | RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
258 | for (i=0;ianalysis_mem, in, FRAME_SIZE);
260 |
261 | // Apply Vorbis window
262 | apply_window(x);
263 |
264 | // Fourier Transform FFT
265 | forward_transform(X, x);
266 | #if TRAINING
267 | for (i=lowpass;i>1];
284 | int pitch_index;
285 | float gain;
286 | float *(pre[1]);
287 | // float** pre;
288 | float tmp[NB_BANDS];
289 | float follow, logMax;
290 |
291 | // FFT and energy
292 | frame_analysis(st, X, Ex, in);
293 |
294 | // Copy last part of pitch buffer to the beginning and fill the rest with noisy frame
295 | RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
296 | RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);
297 |
298 | // Pointer to pitch buffer
299 | pre[0] = &st->pitch_buf[0];
300 |
301 | // Find pitch
302 | pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
303 | pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
304 | PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
305 | pitch_index = PITCH_MAX_PERIOD-pitch_index;
306 |
307 |
308 | gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD,
309 | PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
310 | st->last_period = pitch_index;
311 | st->last_gain = gain;
312 |
313 | // x(n) = x(n-T)
314 | for (i=0;ipitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
316 | // p[i] = st->pitch_buf[pitch_index_old + i];
317 |
318 |
319 | apply_window(p);
320 | forward_transform(P, p);
321 | compute_band_energy(Ep, P);
322 | compute_band_corr(Exp, X, P);
323 | for (i=0;icepstral_mem[st->memid];
348 | ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
349 | ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
350 | for (i=0;imemid++;
352 | for (i=0;imemid == CEPS_MEM) st->memid = 0;
359 | for (i=0;icepstral_mem[i][k] - st->cepstral_mem[j][k];
371 | dist += tmp*tmp;
372 | }
373 | if (j!=i)
374 | mindist = MIN32(mindist, dist);
375 | }
376 | spec_variability += mindist;
377 | }
378 | features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
379 | return TRAINING && E < 0.1;
380 | }
381 |
382 | static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y) {
383 | float x[WINDOW_SIZE];
384 | int i;
385 | inverse_transform(x, y);
386 | apply_window(x);
387 | for (i=0;isynthesis_mem[i];
388 | RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);
389 | }
390 |
391 | void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
392 | int i;
393 | for (i=0;ig[i]) r[i] = 1;
411 | else r[i] = Exp[i]*(1-g[i])/(.001 + g[i]*(1-Exp[i]));
412 | r[i] = MIN16(1, MAX16(0, r[i]));
413 | #else
414 | if (Exp[i]>g[i]) r[i] = 1;
415 | else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
416 | r[i] = sqrt(MIN16(1, MAX16(0, r[i])));
417 | #endif
418 | r[i] *= sqrt(Ex[i]/(1e-8+Ep[i]));
419 | }
420 | interp_band_gain(rf, r);
421 | for (i=0;imem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
454 |
455 | silence = compute_frame_features(st, X, P, Ex, Ep, Exp, features, x);
456 |
457 | if (!silence) {
458 |
459 | compute_rnn(&st->rnn, g, &vad_prob, features);
460 |
461 | pitch_filter(X, P, Ex, Ep, Exp, g);
462 | for (i=0;ilastg[i]);
465 | st->lastg[i] = g[i];
466 | }
467 | interp_band_gain(gf, g);
468 | #if 1
469 | for (i=0;i \n", argv[0]);
525 | return 1;
526 | }
527 |
528 | // Speech file pointer
529 | f1 = fopen(argv[1], "r");
530 |
531 | // Noise file pointer
532 | f2 = fopen(argv[2], "r");
533 |
534 | // Noisy speech output pointer
535 | fout = fopen("noisySpeechSamples.raw", "w");
536 |
537 | // Declare number of samples
538 | maxCount = atoi(argv[3]);
539 |
540 | // Read 150 noise frames
541 | for(i=0;i<150;i++) {
542 | short tmp[FRAME_SIZE];
543 | fread(tmp, sizeof(short), FRAME_SIZE, f2);
544 | }
545 |
546 | while (1) {
547 |
548 | // Declare arrays & variables
549 | kiss_fft_cpx X[FREQ_SIZE], Y[FREQ_SIZE], N[FREQ_SIZE], P[WINDOW_SIZE];
550 | float Ex[NB_BANDS], Ey[NB_BANDS], En[NB_BANDS], Ep[NB_BANDS];
551 | float Exp[NB_BANDS];
552 | float Ln[NB_BANDS];
553 | float features[NB_FEATURES];
554 | float g[NB_BANDS];
555 | short tmp[FRAME_SIZE];
556 | float vad=0;
557 | float E=0;
558 |
559 | // End while loop when number of samples has been satisfied
560 | if (count==maxCount) break;
561 |
562 | // Print progress
563 | if ((count%1000)==0) fprintf(stderr, "%d\r", count);
564 |
565 |
566 | if (++gain_change_count > 2821) {
567 | speech_gain = pow(10., (-40+(rand()%60))/20.);
568 | noise_gain = pow(10., (-30+(rand()%50))/20.);
569 | if (rand()%10==0) noise_gain = 0;
570 | noise_gain *= speech_gain;
571 | if (rand()%10==0) speech_gain = 0;
572 | gain_change_count = 0;
573 | rand_resp(a_noise, b_noise);
574 | rand_resp(a_sig, b_sig);
575 | lowpass = FREQ_SIZE * 3000./24000. * pow(50., rand()/(double)RAND_MAX);
576 | for (i=0;i lowpass) {
578 | band_lp = i;
579 | break;
580 | }
581 | }
582 | }
583 |
584 | if (speech_gain != 0) {
585 | fread(tmp, sizeof(short), FRAME_SIZE, f1);
586 | if (feof(f1)) {
587 | rewind(f1);
588 | fread(tmp, sizeof(short), FRAME_SIZE, f1);
589 | }
590 | for (i=0;i 1e9f) {
622 | vad_cnt=0;
623 | } else if (E > 1e8f) {
624 | vad_cnt -= 5;
625 | } else if (E > 1e7f) {
626 | vad_cnt++;
627 | } else {
628 | vad_cnt+=2;
629 | }
630 | if (vad_cnt < 0) vad_cnt = 0;
631 | if (vad_cnt > 15) vad_cnt = 15;
632 |
633 | if (vad_cnt >= 10) vad = 0;
634 | else if (vad_cnt > 0) vad = 0.5f;
635 | else vad = 1.f;
636 |
637 | frame_analysis(st, Y, Ey, x);
638 | frame_analysis(noise_state, N, En, n);
639 |
640 | for (i=0;ilast_gain, noisy->last_period);
644 |
645 | for (i=0;i 1) g[i] = 1;
648 | if (silence || i > band_lp) g[i] = -1;
649 | if (Ey[i] < 5e-2 && Ex[i] < 5e-2) g[i] = -1;
650 | if (vad==0 && noise_gain==0) g[i] = -1;
651 | }
652 |
653 | count++;
654 | #if 1
655 | fwrite(features, sizeof(float), NB_FEATURES, stdout);
656 | fwrite(g, sizeof(float), NB_BANDS, stdout);
657 | fwrite(Ln, sizeof(float), NB_BANDS, stdout);
658 | fwrite(&vad, sizeof(float), 1, stdout);
659 | #endif
660 | }
661 | fprintf(stderr, "matrix size: %d x %d\n", count, NB_FEATURES + 2*NB_BANDS + 1);
662 | fclose(f1);
663 | fclose(f2);
664 | return 0;
665 | }
666 |
667 | #endif
668 |
--------------------------------------------------------------------------------
/src/denoise.h:
--------------------------------------------------------------------------------
1 | #ifndef DENOISE_H
2 | #define DENOISE_H
3 |
4 | #define FRAME_SIZE_SHIFT 2
5 | #define FRAME_SIZE (120<
29 | #include
30 | #include
31 | #include "../src/kiss_fft.h"
32 | #include "../src/common.h"
33 | #include
34 | #include "rnnoise.h"
35 | #include "../src/pitch.h"
36 | #include "../src/arch.h"
37 | #include "../src/rnn.h"
38 | #include "../src/rnn_data.h"
39 |
40 | struct DenoiseState {
41 | float analysis_mem[FRAME_SIZE];
42 | float cepstral_mem[CEPS_MEM][NB_BANDS];
43 | int memid;
44 | float synthesis_mem[FRAME_SIZE];
45 | float pitch_buf[PITCH_BUF_SIZE];
46 | float pitch_enh_buf[PITCH_BUF_SIZE];
47 | float last_gain;
48 | int last_period;
49 | float mem_hp_x[2];
50 | float lastg[NB_BANDS];
51 | RNNState rnn;
52 | };
53 |
54 | int compute_frame_features(DenoiseState *st, kiss_fft_cpx *X, kiss_fft_cpx *P,
55 | float *Ex, float *Ep, float *Exp, float *features, const float *in);
56 |
57 | void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N);
58 |
59 | #endif
--------------------------------------------------------------------------------
/src/featureExtraction.py:
--------------------------------------------------------------------------------
1 | # Feature extraction example
2 | import numpy as np
3 | import librosa
4 | import soundfile as sf
5 | import sys
6 | import h5py
7 |
8 | # Sampling rate
9 | samplingRate = 48000
10 |
11 | # Set up for training
12 | if sys.argv[1] == 'training':
13 | # Total number of 10ms frames that will be processed
14 | totalFrames = int(sys.argv[4])
15 |
16 | # Number of samples to process per batch
17 | batch_size = 1000 * samplingRate
18 |
19 | # Batches of samples. The division by 100 reflects the 10ms duration of each frame in totalFrames
20 | batches = int(totalFrames * samplingRate / (batch_size * 100))
21 |
22 | # Number of RNNoise frames to which each batch of samples maps
23 | frames_per_batch = int(totalFrames / batches)
24 |
25 | # Initialize vectors
26 | spectral_centroid = np.zeros(shape=(totalFrames))
27 | spectral_bandwidth = np.zeros(shape=(totalFrames))
28 | spectral_rolloff = np.zeros(shape=(totalFrames))
29 |
30 | # Set up for testing
31 | else:
32 | # Load the sample wav file with its sampling rate
33 | y, sr = sf.read(sys.argv[2])
34 |
35 | # Each frame is 10ms
36 | totalFrames = int(len(y) / 48000) * 100
37 |
38 | batch_size = len(y)
39 |
40 | batches = 1
41 |
42 | frames_per_batch = totalFrames
43 |
44 |
45 | # Open an h5 file for output
46 | hf = h5py.File(sys.argv[3], 'w')
47 |
48 | # Process batches
49 | for batch_num in range(batches):
50 |
51 | print("Processing batch", batch_num, "out of", batches)
52 |
53 | # Load input
54 | if sys.argv[1] == 'training':
55 | # For training:
56 | y, sr = sf.read(sys.argv[2], channels=1, samplerate=samplingRate, subtype='PCM_16', start=batch_num*batch_size, frames=batch_size)
57 | else:
58 | # Load the sample wav file with its sampling rate
59 | y, sr = sf.read(sys.argv[2])
60 |
61 | # Split into 20ms overlapping frames
62 | # 960 is 20ms for 48000 sampling rate, 480 adds 10ms overlap with the previous frame
63 | # frames = librosa.util.frame(x=y, frame_length=960, hop_length=480, axis=0)
64 |
65 | # Root Mean Square
66 | #rms = librosa.feature.rms(y=y, frame_length=960, hop_length=480)
67 |
68 | # Spectral centroid
69 | spectral_centroid_t = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=960, hop_length=480)
70 | spectral_centroid_t = np.reshape(spectral_centroid_t, newshape=(-1))
71 | if sys.argv[1] == 'training':
72 | spectral_centroid[batch_num*frames_per_batch:(batch_num+1)*frames_per_batch] = spectral_centroid_t[:-1]
73 | else:
74 | spectral_centroid = spectral_centroid_t[:-1]
75 |
76 | # Spectral bandwidth
77 | spectral_bandwidth_t = librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=960, hop_length=480)
78 | spectral_bandwidth_t = np.reshape(spectral_bandwidth_t, newshape=(-1))
79 | if sys.argv[1] == 'training':
80 | spectral_bandwidth[batch_num*frames_per_batch:(batch_num+1)*frames_per_batch] = spectral_bandwidth_t[:-1]
81 | else:
82 | spectral_bandwidth = spectral_bandwidth_t[:-1]
83 |
84 | # Spectral flatness
85 | #spectral_flatness = librosa.feature.spectral_flatness(y=y, n_fft=960, hop_length=480)
86 |
87 | # Spectral roll-off frequency
88 | spectral_rolloff_t = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=960, hop_length=480)
89 | spectral_rolloff_t = np.reshape(spectral_rolloff_t, newshape=(-1))
90 | if sys.argv[1] == 'training':
91 | spectral_rolloff[batch_num*frames_per_batch:(batch_num+1)*frames_per_batch] = spectral_rolloff_t[:-1]
92 | else:
93 | spectral_rolloff = spectral_rolloff_t[:-1]
94 |
95 |
96 |
97 | # Normalize and save data
98 | if sys.argv[1] == 'training':
99 | spectral_centroid_std = np.std(spectral_centroid)
100 | spectral_centroid_mean = np.mean(spectral_centroid)
101 | spectral_centroid -= spectral_centroid_mean
102 | spectral_centroid /= spectral_centroid_std
103 | print('Spectral Centroid Std:', spectral_centroid_std)
104 | print('Spectral Centroid Mean:', spectral_centroid_mean)
105 | else:
106 | spectral_centroid -= 4112.5994
107 | spectral_centroid /= 2842.3116
108 | hf.create_dataset('centroid', data=spectral_centroid)
109 |
110 | if sys.argv[1] == 'training':
111 | spectral_bandwidth_std = np.std(spectral_bandwidth)
112 | spectral_bandwidth_mean = np.mean(spectral_bandwidth)
113 | spectral_bandwidth -= spectral_bandwidth_mean
114 | spectral_bandwidth /= spectral_bandwidth_std
115 | print('Spectral Bandwidth Std:', spectral_bandwidth_std)
116 | print('Spectral Bandwidth Mean:', spectral_bandwidth_mean)
117 | else:
118 | spectral_bandwidth -= 4952.0481
119 | spectral_bandwidth /= 1936.2998
120 | hf.create_dataset('bandwidth', data=spectral_bandwidth)
121 |
122 | if sys.argv[1] == 'training':
123 | spectral_rolloff_std = np.std(spectral_rolloff)
124 | spectral_rolloff_mean = np.mean(spectral_rolloff)
125 | spectral_rolloff -= spectral_rolloff_mean
126 | spectral_rolloff /= spectral_rolloff_std
127 | print('Spectral Rolloff Std:', spectral_rolloff_std)
128 | print('Spectral Rolloff Mean:', spectral_rolloff_mean)
129 | else:
130 | spectral_rolloff -= 8670.3725
131 | spectral_rolloff /= 6298.5521
132 | hf.create_dataset('rolloff', data=spectral_rolloff)
133 |
134 | # Close h5
135 | hf.close()
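
A minimal sketch of reading back the three datasets written above; the file name is only an example of the `*.extendedFeatures.bin` outputs produced when this script is driven by `evaluation_batch.sh`:

```python
# Minimal sketch: inspect the extended features written by featureExtraction.py.
# The dataset names match the create_dataset() calls above; the file name is an example.
import h5py

with h5py.File("p232_001.extendedFeatures.bin", "r") as hf:  # example file name
    centroid = hf["centroid"][:]
    bandwidth = hf["bandwidth"][:]
    rolloff = hf["rolloff"][:]

print(centroid.shape, bandwidth.shape, rolloff.shape)
```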
--------------------------------------------------------------------------------
/training/bin2hdf5.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function
4 |
5 | import numpy as np
6 | import h5py
7 | import sys
8 |
9 | if sys.argv[4] == 'training':
10 | ORIGINAL_FEATURES = 87
11 | else:
12 | ORIGINAL_FEATURES = 42
13 |
14 | # Load original RNNoise features
15 | vdata = np.fromfile(sys.argv[1], dtype='float32')
16 | vdataY = int(vdata.size/ORIGINAL_FEATURES)
17 | vdata = np.reshape(vdata, (vdataY, ORIGINAL_FEATURES))
18 |
19 | # Load extended features
20 | exFeatFile = h5py.File(sys.argv[2], 'r')
21 | exdata = np.array([exFeatFile.get('centroid'), exFeatFile.get('bandwidth'), exFeatFile.get('rolloff')]).T
22 | exFeatFile.close()
23 |
24 | # Concatenate the matrices
25 | data = np.concatenate((vdata[:,:ORIGINAL_FEATURES], exdata, vdata[:,ORIGINAL_FEATURES:]), axis=1)
26 |
27 | # Write feature file
28 | h5f = h5py.File(sys.argv[3], 'w')
29 | h5f.create_dataset('data', data=data)
30 | h5f.close()
31 |
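
A minimal sketch of checking the merged matrix this script writes; the file name is an example, while the dataset name `data` and the expected column count (`ORIGINAL_FEATURES` plus the three extended features) follow directly from the code above:

```python
# Minimal sketch: verify the merged feature matrix written by bin2hdf5.py.
import h5py

with h5py.File("p232_001.fullFeatures.h5", "r") as hf:  # example output name
    data = hf["data"][:]

# Expect (frames, ORIGINAL_FEATURES + 3): 45 columns in testing mode, 90 in training mode.
print(data.shape)
```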
--------------------------------------------------------------------------------
/training/evaluation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function
4 |
5 | import sys
6 | import h5py
7 | import numpy as np
8 | import librosa
9 | import soundfile as sf
10 | import math
11 | from tensorflow import keras
12 | from tensorflow.keras.constraints import Constraint
13 |
14 | # Global constants declarations
15 | NB_BANDS = 22
16 | FRAME_SIZE = 480
17 | WINDOW_SIZE = FRAME_SIZE * 2
18 | FREQ_SIZE = FRAME_SIZE + 1
19 | MAX_PITCH = 768
20 | FRAME_SIZE_SHIFT = 2
21 | eband5ms= np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100])
22 |
23 | # Calculate energy for each subband
24 | def energy(X):
25 | sum = [0] * NB_BANDS
26 | for i in range(NB_BANDS - 1):
27 | band_size = (eband5ms[i+1]-eband5ms[i]) << FRAME_SIZE_SHIFT
28 | for j in range(band_size):
29 | frac = j/band_size
30 | tmp = (X[(eband5ms[i]<g[i] :
71 | r[i] = 1
72 | else:
73 | r[i] = ((Exp[i])**2) *(1-(g[i])**2)/(0.001 + (g[i]**2) * (1-(Exp[i])**2))
74 |
75 | r[i] = math.sqrt(min(1, max(0, r[i])))
76 | r[i] *= math.sqrt(Ex[i]/(1e-8+Ep[i]))
77 |
78 | rf = interp_band_gain(r)
79 |
80 | for i in range(FREQ_SIZE):
81 | resultX.append(complex((X[i].real + rf[i]*P[i].real),(X[i].imag + rf[i]*P[i].imag)))
82 | # resultX[i].real += rf[i]*P[i].real
83 | # resultX[i].imag += rf[i]*P[i].imag
84 |
85 | newE = energy(resultX)
86 |
87 | norm = [(math.sqrt(Ex[i]/(1e-8+newE[i]))) for i in range(NB_BANDS)]
88 |
89 | normf = interp_band_gain(norm)
90 |
91 | for i in range(FREQ_SIZE):
92 | resultX[i] = complex(resultX[i].real * normf[i], resultX[i].imag * normf[i])
93 | # resultX[i].real *= normf[i]
94 | # resultX[i].imag *= normf[i]
95 |
96 | return resultX
97 |
98 | # Compute band correlation
99 | def compute_band_corr(X, P):
100 |
101 | sum = [0] * NB_BANDS
102 |
103 | for i in range(NB_BANDS - 1):
104 | band_size = (eband5ms[i+1]-eband5ms[i]) << FRAME_SIZE_SHIFT
105 | for j in range(band_size):
106 | frac = j / band_size
107 | tmp = X[(eband5ms[i]< 1:
214 | # [max_pitch-frame_size][frame][frame][frame]
215 | pitchBuffer = np.concatenate((inWindows[windowIndex - 2][(WINDOW_SIZE - MAX_PITCH):FRAME_SIZE], inWindows[windowIndex - 1][:FRAME_SIZE], window), axis=0)
216 | elif windowIndex == 1:
217 | pitchBuffer = np.concatenate((np.zeros(MAX_PITCH - FRAME_SIZE), inWindows[windowIndex - 1][:FRAME_SIZE], window), axis=0)
218 | elif windowIndex == 0:
219 | pitchBuffer = np.concatenate((np.zeros(MAX_PITCH), window), axis=0)
220 |
221 | # Calculate p buffer
222 | p = pitchBuffer[pitches[windowIndex] : pitches[windowIndex] + WINDOW_SIZE]
223 |
224 | # Apply vorbis window on p
225 | vP = vorbis_window(p)
226 |
227 | # FFT of vP
228 | fftP = np.fft.fft(vP, n=WINDOW_SIZE)
229 |
230 | # Zero padding on fftP
231 | fftP[FRAME_SIZE+1:] = 0
232 |
233 | # Energy of vP
234 | EvP = energy(fftP)
235 |
236 | # Compute ExP
237 | ExP = compute_band_corr(fftWindow, fftP)
238 |
239 | # Normalize ExP
240 | for i in range(NB_BANDS):
241 | ExP[i] = ExP[i]/math.sqrt(0.001+EvWindow[i]*EvP[i])
242 |
243 | # Apply pitch filter
244 | X = pitch_filter(fftWindow, fftP, EvWindow, EvP, ExP, gainsOutput[windowIndex,:])
245 | # Disable pitch filter
246 | #X = fftWindow
247 |
248 | # Gain Smoothing
249 | alpha = 0.6
250 | for i in range(NB_BANDS):
251 | finalGains[i] = max(gainsOutput[windowIndex,i], alpha * finalGains[i])
252 |
253 | # Interpolate band gains
254 | gf = interp_band_gain(finalGains)
255 |
256 | # Apply gains
257 | for i in range(FREQ_SIZE):
258 | X[i] = complex((X[i].real * gf[i]), (X[i].imag * gf[i]))
259 |
260 | # Synthesize frames
261 | X = np.concatenate([X, np.zeros(FRAME_SIZE-1)])
262 | for i in range(FREQ_SIZE, WINDOW_SIZE):
263 | X[i] = complex(X[WINDOW_SIZE-i].real, -X[WINDOW_SIZE-i].imag)
264 | x = np.fft.ifft(X, n=WINDOW_SIZE)
265 | vx = vorbis_window(x)
266 | outData[(windowIndex * FRAME_SIZE):((windowIndex + 2) * FRAME_SIZE)] = np.add(outData[(windowIndex * FRAME_SIZE):((windowIndex + 2) * FRAME_SIZE)], vx)
267 |
268 | # Increment frame index
269 | windowIndex += 1
270 |
271 | # Normalize energy
272 | # eClean = 0
273 | # for element in outData:
274 | # eClean += element ** 2
275 | # eNoisy = 0
276 | # for element in y:
277 | # eNoisy += element ** 2
278 | # ratio = (eNoisy / eClean)
279 | # outData = outData * ratio
280 | # outData /= max(outData)
281 |
282 | # Write output file
283 | sf.write(sys.argv[4], outData, sr, subtype='PCM_16')
--------------------------------------------------------------------------------
/training/evaluation_batch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function
4 |
5 | import sys
6 | import h5py
7 | import numpy as np
8 | import librosa
9 | import soundfile as sf
10 | import math
11 | from tensorflow import keras
12 | from tensorflow.keras.constraints import Constraint
13 | import os
14 | from os.path import splitext
15 | import glob
16 |
17 | # Global constants declarations
18 | NB_BANDS = 22
19 | FRAME_SIZE = 480
20 | WINDOW_SIZE = FRAME_SIZE * 2
21 | FREQ_SIZE = FRAME_SIZE + 1
22 | MAX_PITCH = 768
23 | FRAME_SIZE_SHIFT = 2
24 | eband5ms= np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100])
25 |
26 | # Calculate energy for each subband
27 | def energy(X):
28 | sum = [0] * NB_BANDS
29 | for i in range(NB_BANDS - 1):
30 | band_size = (eband5ms[i+1]-eband5ms[i]) << FRAME_SIZE_SHIFT
31 | for j in range(band_size):
32 | frac = j/band_size
33 | tmp = (X[(eband5ms[i]<g[i] :
74 | r[i] = 1
75 | else:
76 | r[i] = ((Exp[i])**2) *(1-(g[i])**2)/(0.001 + (g[i]**2) * (1-(Exp[i])**2))
77 |
78 | r[i] = math.sqrt(min(1, max(0, r[i])))
79 | r[i] *= math.sqrt(Ex[i]/(1e-8+Ep[i]))
80 |
81 | rf = interp_band_gain(r)
82 |
83 | for i in range(FREQ_SIZE):
84 | resultX.append(complex((X[i].real + rf[i]*P[i].real),(X[i].imag + rf[i]*P[i].imag)))
85 | # resultX[i].real += rf[i]*P[i].real
86 | # resultX[i].imag += rf[i]*P[i].imag
87 |
88 | newE = energy(resultX)
89 |
90 | norm = [(math.sqrt(Ex[i]/(1e-8+newE[i]))) for i in range(NB_BANDS)]
91 |
92 | normf = interp_band_gain(norm)
93 |
94 | for i in range(FREQ_SIZE):
95 | resultX[i] = complex(resultX[i].real * normf[i], resultX[i].imag * normf[i])
96 | # resultX[i].real *= normf[i]
97 | # resultX[i].imag *= normf[i]
98 |
99 | return resultX
100 |
101 | # Compute band correlation
102 | def compute_band_corr(X, P):
103 |
104 | sum = [0] * NB_BANDS
105 |
106 | for i in range(NB_BANDS - 1):
107 | band_size = (eband5ms[i+1]-eband5ms[i]) << FRAME_SIZE_SHIFT
108 | for j in range(band_size):
109 | frac = j / band_size
110 | tmp = X[(eband5ms[i]< 1:
227 | # [max_pitch-frame_size][frame][frame][frame]
228 | pitchBuffer = np.concatenate((inWindows[windowIndex - 2][(WINDOW_SIZE - MAX_PITCH):FRAME_SIZE], inWindows[windowIndex - 1][:FRAME_SIZE], window), axis=0)
229 | elif windowIndex == 1:
230 | pitchBuffer = np.concatenate((np.zeros(MAX_PITCH - FRAME_SIZE), inWindows[windowIndex - 1][:FRAME_SIZE], window), axis=0)
231 | elif windowIndex == 0:
232 | pitchBuffer = np.concatenate((np.zeros(MAX_PITCH), window), axis=0)
233 |
234 | # Calculate p buffer
235 | p = pitchBuffer[pitches[windowIndex] : pitches[windowIndex] + WINDOW_SIZE]
236 |
237 | # Apply vorbis window on p
238 | vP = vorbis_window(p)
239 |
240 | # FFT of vP
241 | fftP = np.fft.fft(vP, n=WINDOW_SIZE)
242 |
243 | # Zero padding on fftP
244 | fftP[FRAME_SIZE+1:] = 0
245 |
246 | # Energy of vP
247 | EvP = energy(fftP)
248 |
249 | # Compute ExP
250 | ExP = compute_band_corr(fftWindow, fftP)
251 |
252 | # Normalize ExP
253 | for i in range(NB_BANDS):
254 | ExP[i] = ExP[i]/math.sqrt(0.001+EvWindow[i]*EvP[i])
255 |
256 | # Apply pitch filter
257 | X = pitch_filter(fftWindow, fftP, EvWindow, EvP, ExP, gainsOutput[windowIndex,:])
258 | # Disable pitch filter
259 | #X = fftWindow
260 |
261 | # Gain Smoothing
262 | alpha = 0.6
263 | for i in range(NB_BANDS):
264 | finalGains[i] = max(gainsOutput[windowIndex,i], alpha * finalGains[i])
265 |
266 | # Interpolate band gains
267 | gf = interp_band_gain(finalGains)
268 |
269 | # Apply gains
270 | for i in range(FREQ_SIZE):
271 | X[i] = complex((X[i].real * gf[i]), (X[i].imag * gf[i]))
272 |
273 | # Synthesize frames
274 | X = np.concatenate([X, np.zeros(FRAME_SIZE-1)])
275 | for i in range(FREQ_SIZE, WINDOW_SIZE):
276 | X[i] = complex(X[WINDOW_SIZE-i].real, -X[WINDOW_SIZE-i].imag)
277 | x = np.fft.ifft(X, n=WINDOW_SIZE)
278 | vx = vorbis_window(x)
279 | outData[(windowIndex * FRAME_SIZE):((windowIndex + 2) * FRAME_SIZE)] = np.add(outData[(windowIndex * FRAME_SIZE):((windowIndex + 2) * FRAME_SIZE)], vx)
280 |
281 | # Increment frame index
282 | windowIndex += 1
283 |
284 | # Normalize energy
285 | # eClean = 0
286 | # for element in outData:
287 | # eClean += element ** 2
288 | # eNoisy = 0
289 | # for element in y:
290 | # eNoisy += element ** 2
291 | # ratio = (eNoisy / eClean)
292 | # outData = outData * ratio
293 | # outData /= max(outData)
294 |
295 | # Write output file
296 | sf.write(os.path.splitext(audioFile)[0] + ".clean.wav", outData, sr, subtype='PCM_16')
297 |
298 | fileCounter += 1
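
For reference, `evaluation_batch.sh` drives this script as sketched below; the output naming follows the `sf.write` call above, which places a `.clean.wav` file next to each processed input:

```python
# Assumed invocation (taken from evaluation_batch.sh):
#   python3.8 ./training/evaluation_batch.py noisy_testset_wav model_ex.hdf5
#
# Each processed file gets a denoised sibling named "<name>.clean.wav".
# Quick way to list the results afterwards:
import glob

for path in sorted(glob.glob("noisy_testset_wav/*.clean.wav")):
    print(path)
```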
--------------------------------------------------------------------------------
/training/rnn_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function
4 |
5 | import keras
6 | from keras.models import Sequential
7 | from keras.models import Model
8 | from keras.layers import Input
9 | from keras.layers import Dense
10 | from keras.layers import LSTM
11 | from keras.layers import GRU
12 | from keras.layers import SimpleRNN
13 | from keras.layers import Dropout
14 | from keras.layers import concatenate
15 | from keras import losses
16 | from keras import regularizers
17 | from keras.constraints import min_max_norm
18 | import h5py
19 |
20 | from keras.constraints import Constraint
21 | from keras import backend as K
22 | import numpy as np
23 | import sys
24 |
25 | #import tensorflow as tf
26 | #from keras.backend.tensorflow_backend import set_session
27 | #config = tf.ConfigProto()
28 | #config.gpu_options.per_process_gpu_memory_fraction = 0.42
29 | #set_session(tf.Session(config=config))
30 |
31 |
32 | def my_crossentropy(y_true, y_pred):
33 | return K.mean(2*K.abs(y_true-0.5) * K.binary_crossentropy(y_pred, y_true), axis=-1)
34 |
35 | def mymask(y_true):
36 | return K.minimum(y_true+1., 1.)
37 |
38 | def msse(y_true, y_pred):
39 | return K.mean(mymask(y_true) * K.square(K.sqrt(y_pred) - K.sqrt(y_true)), axis=-1)
40 |
41 | def mycost(y_true, y_pred):
42 | return K.mean(mymask(y_true) * (10*K.square(K.square(K.sqrt(y_pred) - K.sqrt(y_true))) + K.square(K.sqrt(y_pred) - K.sqrt(y_true)) + 0.01*K.binary_crossentropy(y_pred, y_true)), axis=-1)
43 |
44 | def my_accuracy(y_true, y_pred):
45 | return K.mean(2*K.abs(y_true-0.5) * K.equal(y_true, K.round(y_pred)), axis=-1)
46 |
47 | class WeightClip(Constraint):
48 | '''Clips the weights incident to each hidden unit to be inside a range
49 | '''
50 | def __init__(self, c=2):
51 | self.c = c
52 |
53 | def __call__(self, p):
54 | return K.clip(p, -self.c, self.c)
55 |
56 | def get_config(self):
57 | return {'name': self.__class__.__name__,
58 | 'c': self.c}
59 |
60 | reg = 0.000001
61 | constraint = WeightClip(0.499)
62 |
63 | print('Loading data...')
64 | with h5py.File(sys.argv[1], 'r') as hf:
65 | all_data = hf['data'][:]
66 | print('done.')
67 |
68 | extraFeatures = 3
69 |
70 | print('Build model...')
71 | main_input = Input(shape=(None, 42+extraFeatures), name='main_input')
72 | tmp = Dense(24, activation='tanh', name='input_dense', kernel_constraint=constraint, bias_constraint=constraint)(main_input)
73 | vad_gru = GRU(24, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, name='vad_gru', kernel_regularizer=regularizers.l2(reg), recurrent_regularizer=regularizers.l2(reg), kernel_constraint=constraint, recurrent_constraint=constraint, bias_constraint=constraint)(tmp)
74 | vad_output = Dense(1, activation='sigmoid', name='vad_output', kernel_constraint=constraint, bias_constraint=constraint)(vad_gru)
75 | noise_input = keras.layers.concatenate([tmp, vad_gru, main_input])
76 | noise_gru = GRU(48, activation='relu', recurrent_activation='sigmoid', return_sequences=True, name='noise_gru', kernel_regularizer=regularizers.l2(reg), recurrent_regularizer=regularizers.l2(reg), kernel_constraint=constraint, recurrent_constraint=constraint, bias_constraint=constraint)(noise_input)
77 | denoise_input = keras.layers.concatenate([vad_gru, noise_gru, main_input])
78 |
79 | denoise_gru = GRU(96, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, name='denoise_gru', kernel_regularizer=regularizers.l2(reg), recurrent_regularizer=regularizers.l2(reg), kernel_constraint=constraint, recurrent_constraint=constraint, bias_constraint=constraint)(denoise_input)
80 |
81 | denoise_output = Dense(22, activation='sigmoid', name='denoise_output', kernel_constraint=constraint, bias_constraint=constraint)(denoise_gru)
82 |
83 | model = Model(inputs=main_input, outputs=[denoise_output, vad_output])
84 |
85 | model.compile(loss=[mycost, my_crossentropy],
86 | metrics=[msse],
87 | optimizer='adam', loss_weights=[10, 0.5])
88 |
89 |
90 | batch_size = 32
91 |
92 | window_size = 2000
93 |
94 | nb_sequences = len(all_data)//window_size
95 | print(nb_sequences, ' sequences')
96 | x_train = all_data[:nb_sequences*window_size, :42+extraFeatures]
97 | x_train = np.reshape(x_train, (nb_sequences, window_size, 42+extraFeatures))
98 |
99 | y_train = np.copy(all_data[:nb_sequences*window_size, 42+extraFeatures:42+extraFeatures + 22])
100 | y_train = np.reshape(y_train, (nb_sequences, window_size, 22))
101 |
102 | noise_train = np.copy(all_data[:nb_sequences*window_size, 42+extraFeatures + 22:42+extraFeatures + 44])
103 | noise_train = np.reshape(noise_train, (nb_sequences, window_size, 22))
104 |
105 | vad_train = np.copy(all_data[:nb_sequences*window_size, 42+extraFeatures + 44:42+extraFeatures + 45])
106 | vad_train = np.reshape(vad_train, (nb_sequences, window_size, 1))
107 |
108 | all_data = 0
109 | #x_train = x_train.astype('float32')
110 | #y_train = y_train.astype('float32')
111 |
112 | print(len(x_train), 'train sequences. x shape =', x_train.shape, 'y shape = ', y_train.shape)
113 |
114 | print('Train...')
115 | model.fit(x_train, [y_train, vad_train],
116 | batch_size=batch_size,
117 | epochs=120,
118 | validation_split=0.1)
119 |
120 | # Save full model
121 | model.save(sys.argv[2])
122 |
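
A minimal usage sketch; the model file name matches the one loaded by `evaluation_batch.sh`, while the training-feature file name is only an example. The per-row layout restated below follows the slicing used above:

```python
# Assumed invocation (the .h5 file name is an example):
#   python3.8 training/rnn_train.py trainingFeatures.h5 model_ex.hdf5
#
# Each row of the 'data' matrix is expected to hold, in order:
#   42 RNNoise features + 3 extended features (x_train),
#   22 gain targets (y_train), 22 noise values (noise_train),
#   and 1 VAD value (vad_train) -- 90 columns in total.
```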
--------------------------------------------------------------------------------