├── README.md
├── mcd.py
├── plot_tsne_visualization.py
├── ppg_plot_from_npy.py
├── ppg_resample_interpolate
├── tsne_visualization.py
└── yin.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Evaluation methods in speech synthesis (including VC)


## Subjective
### CMOS: https://ecs.utdallas.edu/loizou/cimplants/quality_assessment_chapter.pdf


## Objective evaluation in speech synthesis
### Overview
![image](https://user-images.githubusercontent.com/38338826/137138608-f137d3be-be9b-4144-8c89-4dbee631a211.png)
👉 https://arxiv.org/pdf/2104.00355.pdf


### Definitions in papers
[MSD, F0 RMSE, F0 corr, GPE, FPE](https://arxiv.org/pdf/1904.02790.pdf)
![image](https://user-images.githubusercontent.com/38338826/137137367-62739b52-0d72-49da-9550-06efd37d842a.png)

[MCD, GPE, VDE, FFE](http://proceedings.mlr.press/v80/skerry-ryan18a/skerry-ryan18a.pdf)
[MCD](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/2016_paper_297.pdf)


## Training
### MCD
#### Definition
![image](https://user-images.githubusercontent.com/38338826/137128924-048a5bdb-bf64-4ed1-90b5-94f34f54ffaa.png)
http://www1.se.cuhk.edu.hk/~hccl/publications/pub/2016_paper_297.pdf section 4.2
![image](https://user-images.githubusercontent.com/38338826/137129723-8b4e670e-0c3f-42b2-a727-b8971b3de4b8.png)
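Written out (the standard form, which matches `log_spec_dB_dist` in `mcd.py` below):

MCD [dB] = (10 / ln 10) · √( 2 · Σ_{d=1}^{D} (mc_d^target − mc_d^converted)² )

where mc_d is the d-th mel-cepstral coefficient and the 0th (energy) coefficient is conventionally excluded.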
#### Code
[mcd (34-dim MCEP)](https://github.com/inconnu11/tsne-and-mcd/blob/main/mcd.py)
[another implementation](https://github.com/MattShannon/mcd)

#### Typical value range
around 4.4 dB


### F0 RMSE / corr / VDE / FFE
[GPE](https://github.com/bastibe/MAPS-Scripts)

#### Typical value range
F0 RMSE around 22.386


### VUV


## Test


## Common
### Content
PER, WER
### Speaker similarity
https://github.com/resemble-ai/Resemblyzer
[t-SNE](tsne_visualization.py)

In the paper [Building Bilingual and Code-Switched Voice Conversion with Limited Training Data Using Embedding Consistency Loss](https://arxiv.org/pdf/2104.10832.pdf):
![image](https://user-images.githubusercontent.com/38338826/141472983-dd92c5e9-f9de-4912-91ca-979a8b451107.png)
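A minimal speaker-similarity sketch with Resemblyzer (the wav paths are placeholders; Resemblyzer embeddings are L2-normalized, so the inner product is the cosine similarity):

```python
import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav

encoder = VoiceEncoder()
emb_target = encoder.embed_utterance(preprocess_wav("target.wav"))        # reference speaker
emb_converted = encoder.embed_utterance(preprocess_wav("converted.wav"))  # converted speech
print("speaker similarity (cosine):", np.inner(emb_target, emb_converted))
```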
--------------------------------------------------------------------------------
/mcd.py:
--------------------------------------------------------------------------------
import os
import math
import glob
import librosa
import pyworld
import pysptk
import numpy as np

# from binary_io import BinaryIOCollection


def load_wav(wav_file, sr):
    """
    Load a wav file with librosa.

    :param wav_file: path to wav file
    :param sr: sampling rate
    :return: audio time series numpy array
    """
    wav, _ = librosa.load(wav_file, sr=sr, mono=True)

    return wav


def log_spec_dB_dist(x, y):
    # 10 / ln(10) * sqrt(2) converts the Euclidean distance between two
    # cepstral vectors into a log-spectral distance in dB
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y

    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))


SAMPLING_RATE = 22050
FRAME_PERIOD = 5.0

# Paths to target reference and converted synthesised wavs are globbed
# below as vc_trg_wavs / vc_conv_wavs.

# Example of loading a single pair:
# vc2_trg_ref = load_wav('/datapool/home/zxt20/MYDATA/VCTK/wav16/p250/p250_450.wav', sr=SAMPLING_RATE)
# vc2_conv_synth = load_wav('/datapool/home/zxt20/JieWang2020ICASSP/orth_frame_level_5mask_trimdata/frame_5mask_1kRLoss_4FCsLayer/With_pretrained_ecoder/With_content_predictor_wordcount_EmbedLayer/results/p231_p250_test_p231_p231_449.npy_test_p250_p250_450.npy_RFU_NOP.wav', sr=SAMPLING_RATE)
# print(type(vc2_trg_ref))


def wav2mcep_numpy(wavfile, target_directory, alpha=0.65, fft_size=512, mcep_size=34):
    # make relevant directories
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    # Derive an output name. Converted files follow a
    # '<...>.npy_<...>.npy_RFU_NOP.wav'-style convention, so the source and
    # target utterance ids are recovered from the name; plain wavs keep their
    # basename.
    wavfile_tmp = wavfile.split('RFU')[0]
    if len(wavfile.split('.')) > 3:
        source = wavfile_tmp.split('.')[1][-8:]
        target = wavfile_tmp.split('.')[-2][-8:]
        fname = source + '_' + target
    else:
        fname = os.path.basename(wavfile).split('.')[0]

    loaded_wav = load_wav(wavfile, sr=SAMPLING_RATE)

    # Use the WORLD vocoder to extract the spectral envelope
    _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=SAMPLING_RATE,
                                 frame_period=FRAME_PERIOD, fft_size=fft_size)

    # Extract MCEP features
    mgc = pysptk.sptk.mcep(sp, order=mcep_size, alpha=alpha, maxiter=0,
                           etype=1, eps=1.0E-8, min_det=0.0, itype=3)

    np.save(os.path.join(target_directory, fname + '.npy'),
            mgc,
            allow_pickle=False)


alpha = 0.65  # commonly used at 22050 Hz
fft_size = 512
mcep_size = 34

vc_trg_wavs = glob.glob('./target/*')
vc_trg_mcep_dir = './mel_tar/'
vc_conv_wavs = glob.glob('./syn/*')
vc_conv_mcep_dir = './mel_syn'


for wav in vc_trg_wavs:
    wav2mcep_numpy(wav, vc_trg_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)

for wav in vc_conv_wavs:
    wav2mcep_numpy(wav, vc_conv_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)


def average_mcd(ref_mcep_files, synth_mcep_files, cost_function):
    """
    Calculate the average MCD.

    :param ref_mcep_files: list of strings, paths to MCEP target reference files
    :param synth_mcep_files: list of strings, paths to MCEP converted synthesised files
    :param cost_function: distance metric used
    :returns: average MCD, total frames processed
    """
    min_cost_tot = 0.0
    frames_tot = 0

    for ref in ref_mcep_files:
        for synth in synth_mcep_files:
            # get the trg_ref and conv_synth speaker name and sample id
            # (the index positions depend on the filename convention above)
            ref_fsplit, synth_fsplit = os.path.basename(ref).split('_'), os.path.basename(synth).split('_')
            ref_spk, ref_id = ref_fsplit[0], ref_fsplit[-1][:3]
            synth_spk, synth_id = synth_fsplit[2], synth_fsplit[3][:3]

            # only compare pairs with the same speaker name and sample id
            if ref_spk == synth_spk and ref_id == synth_id:
                # load MCEP vectors
                ref_vec = np.load(ref)
                ref_frame_no = len(ref_vec)
                synth_vec = np.load(synth)

                # dynamic time warping using librosa; the 0th (energy)
                # coefficient is excluded from the distance
                min_cost, _ = librosa.sequence.dtw(ref_vec[:, 1:].T, synth_vec[:, 1:].T,
                                                   metric=cost_function)

                min_cost_tot += np.mean(min_cost)
                frames_tot += ref_frame_no

    mean_mcd = min_cost_tot / frames_tot

    return mean_mcd, frames_tot


vc_trg_refs = glob.glob('./mel_tar/*')
vc_conv_synths = glob.glob('./mel_syn/*')

cost_function = log_spec_dB_dist

vc_mcd, vc_tot_frames_used = average_mcd(vc_trg_refs, vc_conv_synths, cost_function)

print(f'MCD = {vc_mcd} dB, calculated over a total of {vc_tot_frames_used} frames')
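# Note (a sketch, not used above): this script averages the whole accumulated
# DTW cost matrix and divides by the reference frame count. Another common
# variant normalizes the optimal alignment cost by the length of the warping
# path that librosa returns:
#
#   cost_matrix, wp = librosa.sequence.dtw(ref_vec[:, 1:].T, synth_vec[:, 1:].T,
#                                          metric=cost_function)
#   mcd = cost_matrix[-1, -1] / len(wp)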
--------------------------------------------------------------------------------
/plot_tsne_visualization.py:
--------------------------------------------------------------------------------
import random
from sklearn import manifold
import matplotlib.pyplot as plt
import os
import numpy as np
import matplotlib

matplotlib.use('Agg')

# Expected layout of source_path:
# source_path
# ├── speaker_1 (dir storing embedding files, named sentence_id.npy, shape [1, embedding_size])
# └── speaker_2
# ....
# └── speaker_n


source_path = "data"

embedding_dict = {}
speakers = os.listdir(source_path)
speaker_list = []
# randomly select 3 speakers
random.shuffle(speakers)
sentence_label = []
speaker_label = []
for i, speaker in enumerate(speakers[:3]):
    speaker_list.append(speaker)
    speaker_path = os.path.join(source_path, speaker)
    sentences = os.listdir(speaker_path)
    embedding_dict[i] = []
    for sentence in sentences:
        sentence_path = os.path.join(speaker_path, sentence)
        sentence_embedding = np.load(sentence_path)[0]
        embedding_dict[i].append(sentence_embedding)
        speaker_label.append(i)
        sentence_label.append(int(sentence.replace('.npy', '')))
    # NOTE: each selected speaker must contribute at least two embeddings;
    # popping a speaker here would break the range(3) loop below
    if len(embedding_dict[i]) <= 1:
        embedding_dict.pop(i)
print("speaker list:", speaker_list)
tsne = manifold.TSNE(n_components=2, init='pca', perplexity=3, random_state=0, verbose=1)
data = []
for i in range(3):
    data = data + embedding_dict[i]
print("num data:", len(data))
data = np.array(data)
data = (data - data.mean(axis=0)) / data.std(axis=0)
X = tsne.fit_transform(data)
markers = ['x', '^', '.']
# rescale linearly to [0, 1]
x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
X = (X - x_min) / (x_max - x_min)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(X.shape[0]):
    # Set1 has 9 colours, so wrap the sentence id into its index range
    c = plt.cm.Set1(sentence_label[i] % 9)
    ax.scatter(X[i, 0], X[i, 1], color=c, marker=markers[speaker_label[i]])

plt.title('t-SNE 2D')
plt.savefig(os.path.join(source_path, 'tsne.png'), format='png', dpi=300)
plt.close()
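# Optional check (a sketch, assuming scikit-learn's metrics module, which ships
# with the sklearn dependency already imported above): a silhouette score on
# the normalized embeddings quantifies how well they cluster by speaker,
# complementing the visual t-SNE inspection.
# from sklearn.metrics import silhouette_score
# print("speaker silhouette:", silhouette_score(data, speaker_label))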
--------------------------------------------------------------------------------
/ppg_plot_from_npy.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

PPG_file = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_25_10ms/PPGs_VCTK105/p225/p225_002.npy'
PPG_value = np.load(PPG_file)
sns.heatmap(PPG_value, cmap='Reds')
plt.savefig('./ppg_0.png')  # [-0.2, 0.8]
print(PPG_value.shape)
# print(PPG_value)

# start a fresh figure so the two heatmaps do not draw over each other
plt.figure()

# txt_for_check = './spmel_p226_010.txt'
# for ppg in PPG_value:
#     f = open(txt_for_check, 'a')
#     f.write('\n' + str(ppg))
#     f.close()


PPG_downsample_file = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_64_16ms/PPGs_VCTK105_64_16_nonorm_notrim/p225/p225_002.npy'
PPG_downsample = np.load(PPG_downsample_file)
sns.heatmap(PPG_downsample, cmap='Reds')
plt.savefig('./ppg_1.png')  # [-0.2, 0.8]
print(PPG_downsample.shape)
# print(PPG_value)

--------------------------------------------------------------------------------
/ppg_resample_interpolate:
--------------------------------------------------------------------------------
import sys
import numpy as np
from scipy.interpolate import interp1d
import os


############################# single file #############################
# ppg_original = np.load('/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_25_10ms/PPGs_VCTK105/p225/p225_001.npy')
# print(ppg_original.shape)  # 206 frames, (206, 41)
# print(ppg_original)
# # np.savetxt()

# duration = ppg_original.shape[0] * shift_from
# # print(duration)  # 2.06 s; keep one decimal place?

# t = np.arange(0, duration, shift_from)
# # print(t.shape)
# t2 = np.arange(0, duration, shift_to)
# # print(t2)

# ppg_val = interp1d(t, ppg_original,
#                    kind='linear', axis=0, fill_value='extrapolate', copy=False,
#                    assume_sorted=True)

# ppg_val_to = ppg_val(t2)
# print(ppg_val_to.shape)  # (129, 41)
# print(ppg_val_to)
############################# single file #############################


shift_from = 0.01  # frame shift of the input PPGs, in seconds
shift_to = 0.016   # frame shift after resampling, in seconds

############################# multiple files #############################
ppg_dir = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_25_10ms/PPGs_VCTK105'
ppg_target_dir = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_64_16ms/PPGs_VCTK105_64_16ms'
for spk in os.listdir(ppg_dir):
    target_spk_dir = os.path.join(ppg_target_dir, spk)
    if not os.path.exists(target_spk_dir):
        os.makedirs(target_spk_dir)
    for uttid in os.listdir(os.path.join(ppg_dir, spk)):
        print(uttid)
        ppg_original = np.load(os.path.join(ppg_dir, spk, uttid))
        duration = ppg_original.shape[0] * shift_from
        t = np.arange(0, duration, shift_from)
        t2 = np.arange(0, duration, shift_to)
        # np.arange can be one element off due to floating point; trim
        # whichever side is longer, tolerating at most a one-frame mismatch
        if t.shape[0] < ppg_original.shape[0]:
            diffe = ppg_original.shape[0] - t.shape[0]
            if diffe <= 1:
                ppg_original = ppg_original[:t.shape[0], :]
        else:
            diffe = t.shape[0] - ppg_original.shape[0]
            if diffe <= 1:
                t = t[:ppg_original.shape[0]]
        assert t.shape[0] == ppg_original.shape[0]

        ppg_val = interp1d(t, ppg_original,
                           kind='linear', axis=0, fill_value='extrapolate', copy=False,
                           assume_sorted=True)
        ppg_val_to = ppg_val(t2)
        target_ppg_file_name = os.path.join(target_spk_dir, uttid)
        np.save(target_ppg_file_name, ppg_val_to)

############################# multiple files #############################


############################# original #############################
# if __name__ == '__main__':
#     if len(sys.argv) != 3:
#         print('Usage: python3 %s sampling_rate_from sampling_rate_to lf0_out.float32' % sys.argv[0])
#         exit(-1)

#     sr_from = float(sys.argv[1])
#     sr_to = float(sys.argv[2])
#     lf0 = np.frombuffer(sys.stdin.buffer.read(), dtype='float32').copy()
#     t = np.arange(0, lf0.size) / sr_from

#     voiced_mask = lf0 > 0
#     duration = lf0.size / sr_from
#     t2 = np.arange(0, duration, 1 / sr_to)

#     lf0[0] = lf0[voiced_mask][0]
#     lf0[-1] = lf0[voiced_mask][-1]
#     voiced_mask = lf0 > 0

#     lf0_val = interp1d(t[voiced_mask], lf0[voiced_mask],
#                        kind='linear', fill_value='extrapolate', copy=False,
#                        assume_sorted=True)

#     lf0_val_t2 = lf0_val(t2)
#     sys.stdout.buffer.write(lf0_val(t2).astype('float32').tobytes())
############################# original #############################
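# Sanity check (a worked example): the resampled PPG should have about
# ceil(duration / shift_to) frames. For the single-file example above,
# 206 frames * 0.01 s = 2.06 s, and np.arange(0, 2.06, 0.016) has 129 entries
# (128 * 0.016 = 2.048 < 2.06), which matches the printed (129, 41) shape.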
--------------------------------------------------------------------------------
/tsne_visualization.py:
--------------------------------------------------------------------------------
import os
import random
import numpy as np
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn import manifold


style_encoder_path = "/home/zhaoxt20/vae_tac_myself/exp_multi/Libritts_styles"

embedding_dict = {}
speakers = os.listdir(style_encoder_path)

# randomly select 10 speakers
random.shuffle(speakers)
for speaker in speakers[:10]:
    speaker_path = os.path.join(style_encoder_path, speaker)
    sentences = os.listdir(speaker_path)
    if speaker not in embedding_dict:
        embedding_dict[speaker] = []

    for sentence in sentences:
        sentence_path = os.path.join(speaker_path, sentence)
        embeddings = os.listdir(sentence_path)
        for embedding in embeddings:
            embedding_path = os.path.join(sentence_path, embedding)
            embedding_np = np.load(embedding_path)[0]
            embedding_dict[speaker].append(embedding_np)
    # drop speakers with fewer than two embeddings
    if len(embedding_dict[speaker]) <= 1:
        embedding_dict.pop(speaker)

tsne = manifold.TSNE(n_components=2, init='pca', random_state=0, verbose=1)

data = None
label = []
for x in embedding_dict:
    if data is None:
        data = embedding_dict[x]
    else:
        data = data + embedding_dict[x]
    label = label + [x] * len(embedding_dict[x])
print(len(data))
Y = tsne.fit_transform(data)
def plot_embedding_2d(X, title=None, save_path=None):
    # rescale linearly to [0, 1]
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)
    # draw each point as its speaker label at (x0, x1)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for i in range(X.shape[0]):
        c = plt.cm.Set1(speakers.index(label[i]) % 10 / 10.)
        ax.text(X[i, 0], X[i, 1], str(label[i]), color=c,
                fontdict={'weight': 'bold', 'size': 4})

    if title is not None:
        plt.title(title)
    if save_path is not None:
        plt.savefig(save_path, format='png', dpi=300)
    plt.close()


def plot_embedding_2d_focus(X, focus, title=None, save_path=None):
    # NOTE: currently unused; get_symbol() and get_color() are assumed to be
    # provided elsewhere before this function is called
    # rescale linearly to [0, 1]
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for i in range(X.shape[0]):
        ch = get_symbol(i)
        c = get_color(i)
        if ch in focus:
            # jitter the text position slightly so overlapping points stay readable
            ax.text(X[i, 0] + np.random.normal(0, 0.0075), X[i, 1] + np.random.normal(0, 0.0075), ch, color=c,
                    fontdict={'weight': 'bold', 'size': 9})

    if title is not None:
        plt.title(title)
    if save_path is not None:
        plt.savefig(save_path, format='png', dpi=300)
    plt.close()


plot_embedding_2d(Y, "t-SNE 2D", style_encoder_path + '/t-sne.png')

--------------------------------------------------------------------------------
/yin.py:
--------------------------------------------------------------------------------
# adapted from https://github.com/patriceguyot/Yin

import numpy as np


def differenceFunction(x, N, tau_max):
    """
    Compute the difference function of data x. This corresponds to equation (6) in [1].
    This solution is implemented directly with the NumPy FFT.

    :param x: audio data
    :param N: length of data
    :param tau_max: integration window size
    :return: difference function
    :rtype: list
    """

    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    x_cumsum = np.concatenate((np.array([0.]), (x * x).cumsum()))
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    # pad the FFT to the next 5-smooth size for speed
    size_pad = min(n * 2 ** p2 for n in nice_numbers if n * 2 ** p2 >= size)
    fc = np.fft.rfft(x, size_pad)
    conv = np.fft.irfft(fc * fc.conjugate())[:tau_max]
    return x_cumsum[w:w - tau_max:-1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv
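# For reference, the [1] cited in the docstrings is the YIN paper
# (de Cheveigné & Kawahara, 2002). Its two equations used here are:
#   eq. (6), difference function:
#       d_t(tau) = sum_{j=1}^{W} (x_j - x_{j+tau})^2
#   eq. (8), cumulative mean normalized difference:
#       d'_t(tau) = 1                                              if tau == 0
#                   d_t(tau) / ((1/tau) * sum_{j=1}^{tau} d_t(j))  otherwise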
def cumulativeMeanNormalizedDifferenceFunction(df, N):
    """
    Compute the cumulative mean normalized difference function (CMND).

    This corresponds to equation (8) in [1].

    :param df: difference function
    :param N: length of data
    :return: cumulative mean normalized difference function
    :rtype: list
    """

    cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float)  # vectorized with numpy
    return np.insert(cmndf, 0, 1)


def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Return the fundamental period of a frame based on the CMND function.

    :param cmdf: cumulative mean normalized difference function
    :param tau_min: minimum period for speech
    :param tau_max: maximum period for speech
    :param harmo_th: harmonicity threshold to determine if it is necessary to compute pitch frequency
    :return: fundamental period if there are values under the threshold, 0 otherwise
    :rtype: float
    """
    tau = tau_min
    while tau < tau_max:
        if cmdf[tau] < harmo_th:
            # descend to the local minimum of the dip
            while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]:
                tau += 1
            return tau
        tau += 1

    return 0  # if unvoiced


def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500,
                harmo_thresh=0.1):
    """
    Compute the YIN algorithm. Return fundamental frequency and harmonic rate.

    :param sig: audio signal (list of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: minimum fundamental frequency that can be detected (hertz)
    :param f0_max: maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: detection threshold; the algorithm returns the first minimum of the CMND function below this threshold

    :returns:

    * pitches: list of fundamental frequencies,
    * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value)
    * argmins: minimums of the cumulative mean normalized difference function
    * times: list of the time of each estimation
    :rtype: tuple
    """

    tau_min = int(sr / f0_max)
    tau_max = int(sr / f0_min)

    timeScale = range(0, len(sig) - w_len, w_step)  # time values for each analysis window
    times = [t / float(sr) for t in timeScale]
    frames = [sig[t:t + w_len] for t in timeScale]

    pitches = [0.0] * len(timeScale)
    harmonic_rates = [0.0] * len(timeScale)
    argmins = [0.0] * len(timeScale)

    for i, frame in enumerate(frames):
        # Compute YIN
        df = differenceFunction(frame, w_len, tau_max)
        cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max)
        p = getPitch(cmdf, tau_min, tau_max, harmo_thresh)

        # Get results
        if np.argmin(cmdf) > tau_min:
            argmins[i] = float(sr / np.argmin(cmdf))
        if p != 0:  # a pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cmdf[p]
        else:  # no pitch, but we still record a harmonic rate value
            harmonic_rates[i] = min(cmdf)

    return pitches, harmonic_rates, argmins, times
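
if __name__ == '__main__':
    # Minimal usage sketch (the wav path is a placeholder); librosa is assumed
    # to be available, as it already is for mcd.py.
    import librosa

    sig, sr = librosa.load('example.wav', sr=16000, mono=True)
    pitches, harmonic_rates, argmins, times = compute_yin(sig, sr)

    # frames with pitch 0 are unvoiced
    voiced = [p for p in pitches if p > 0]
    print('%d/%d voiced frames' % (len(voiced), len(pitches)))
    if voiced:
        print('mean F0 = %.1f Hz' % (sum(voiced) / len(voiced)))

--------------------------------------------------------------------------------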