├── README.md
├── mcd.py
├── plot_tsne_visualization.py
├── ppg_plot_from_npy.py
├── ppg_resample_interpolate
├── tsne_visualization.py
└── yin.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Evaluation methods in speech synthesis (including VC)


## Subjective
### CMOS: https://ecs.utdallas.edu/loizou/cimplants/quality_assessment_chapter.pdf


## Objective evaluation in speech synthesis
### Overview
![image](https://user-images.githubusercontent.com/38338826/137138608-f137d3be-be9b-4144-8c89-4dbee631a211.png)
👉 https://arxiv.org/pdf/2104.00355.pdf


### Definitions in papers
[MSD, F0 RMSE, F0 corr, GPE, FPE](https://arxiv.org/pdf/1904.02790.pdf)
![image](https://user-images.githubusercontent.com/38338826/137137367-62739b52-0d72-49da-9550-06efd37d842a.png)

[MCD, GPE, VDE, FFE](http://proceedings.mlr.press/v80/skerry-ryan18a/skerry-ryan18a.pdf)
[MCD](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/2016_paper_297.pdf)


## Training
### MCD
#### Definition
![image](https://user-images.githubusercontent.com/38338826/137128924-048a5bdb-bf64-4ed1-90b5-94f34f54ffaa.png)
http://www1.se.cuhk.edu.hk/~hccl/publications/pub/2016_paper_297.pdf section 4.2
![image](https://user-images.githubusercontent.com/38338826/137129723-8b4e670e-0c3f-42b2-a727-b8971b3de4b8.png)
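Written out (the standard form, which matches `log_spec_dB_dist` in `mcd.py` below):

MCD [dB] = (10 / ln 10) · √( 2 · Σ_{d=1}^{D} (mc_d^target − mc_d^converted)² )

where mc_d is the d-th mel-cepstral coefficient and the 0th (energy) coefficient is conventionally excluded.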
#### Code
[mcd (34-dim MCEP)](https://github.com/inconnu11/tsne-and-mcd/blob/main/mcd.py)
[another implementation](https://github.com/MattShannon/mcd)

#### Typical value range
around 4.4 dB


### F0 RMSE / corr / VDE / FFE
[GPE](https://github.com/bastibe/MAPS-Scripts)

#### Typical value range
F0 RMSE around 22.386


### VUV


## Test


## Common
### Content
PER, WER
### Speaker similarity
https://github.com/resemble-ai/Resemblyzer
[t-SNE](tsne_visualization.py)

In the paper [Building Bilingual and Code-Switched Voice Conversion with Limited Training Data Using Embedding Consistency Loss](https://arxiv.org/pdf/2104.10832.pdf):
![image](https://user-images.githubusercontent.com/38338826/141472983-dd92c5e9-f9de-4912-91ca-979a8b451107.png)
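A minimal speaker-similarity sketch with Resemblyzer (the wav paths are placeholders; Resemblyzer embeddings are L2-normalized, so the inner product is the cosine similarity):

```python
import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav

encoder = VoiceEncoder()
emb_target = encoder.embed_utterance(preprocess_wav("target.wav"))        # reference speaker
emb_converted = encoder.embed_utterance(preprocess_wav("converted.wav"))  # converted speech
print("speaker similarity (cosine):", np.inner(emb_target, emb_converted))
```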
--------------------------------------------------------------------------------
/mcd.py:
--------------------------------------------------------------------------------
import os
import math
import glob
import librosa
import pyworld
import pysptk
import numpy as np

# from binary_io import BinaryIOCollection


def load_wav(wav_file, sr):
    """
    Load a wav file with librosa.

    :param wav_file: path to wav file
    :param sr: sampling rate
    :return: audio time series numpy array
    """
    wav, _ = librosa.load(wav_file, sr=sr, mono=True)

    return wav


def log_spec_dB_dist(x, y):
    # 10 / ln(10) * sqrt(2) converts the Euclidean distance between two
    # cepstral vectors into a log-spectral distance in dB
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y

    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))


SAMPLING_RATE = 22050
FRAME_PERIOD = 5.0

# Paths to target reference and converted synthesised wavs are globbed
# below as vc_trg_wavs / vc_conv_wavs.

# Example of loading a single pair:
# vc2_trg_ref = load_wav('/datapool/home/zxt20/MYDATA/VCTK/wav16/p250/p250_450.wav', sr=SAMPLING_RATE)
# vc2_conv_synth = load_wav('/datapool/home/zxt20/JieWang2020ICASSP/orth_frame_level_5mask_trimdata/frame_5mask_1kRLoss_4FCsLayer/With_pretrained_ecoder/With_content_predictor_wordcount_EmbedLayer/results/p231_p250_test_p231_p231_449.npy_test_p250_p250_450.npy_RFU_NOP.wav', sr=SAMPLING_RATE)
# print(type(vc2_trg_ref))


def wav2mcep_numpy(wavfile, target_directory, alpha=0.65, fft_size=512, mcep_size=34):
    # make relevant directories
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    # Derive an output name. Converted files follow a
    # '<...>.npy_<...>.npy_RFU_NOP.wav'-style convention, so the source and
    # target utterance ids are recovered from the name; plain wavs keep their
    # basename.
    wavfile_tmp = wavfile.split('RFU')[0]
    if len(wavfile.split('.')) > 3:
        source = wavfile_tmp.split('.')[1][-8:]
        target = wavfile_tmp.split('.')[-2][-8:]
        fname = source + '_' + target
    else:
        fname = os.path.basename(wavfile).split('.')[0]

    loaded_wav = load_wav(wavfile, sr=SAMPLING_RATE)

    # Use the WORLD vocoder to extract the spectral envelope
    _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=SAMPLING_RATE,
                                 frame_period=FRAME_PERIOD, fft_size=fft_size)

    # Extract MCEP features
    mgc = pysptk.sptk.mcep(sp, order=mcep_size, alpha=alpha, maxiter=0,
                           etype=1, eps=1.0E-8, min_det=0.0, itype=3)

    np.save(os.path.join(target_directory, fname + '.npy'),
            mgc,
            allow_pickle=False)


alpha = 0.65  # commonly used at 22050 Hz
fft_size = 512
mcep_size = 34

vc_trg_wavs = glob.glob('./target/*')
vc_trg_mcep_dir = './mel_tar/'
vc_conv_wavs = glob.glob('./syn/*')
vc_conv_mcep_dir = './mel_syn'


for wav in vc_trg_wavs:
    wav2mcep_numpy(wav, vc_trg_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)

for wav in vc_conv_wavs:
    wav2mcep_numpy(wav, vc_conv_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)


def average_mcd(ref_mcep_files, synth_mcep_files, cost_function):
    """
    Calculate the average MCD.

    :param ref_mcep_files: list of strings, paths to MCEP target reference files
    :param synth_mcep_files: list of strings, paths to MCEP converted synthesised files
    :param cost_function: distance metric used
    :returns: average MCD, total frames processed
    """
    min_cost_tot = 0.0
    frames_tot = 0

    for ref in ref_mcep_files:
        for synth in synth_mcep_files:
            # get the trg_ref and conv_synth speaker name and sample id
            # (the index positions depend on the filename convention above)
            ref_fsplit, synth_fsplit = os.path.basename(ref).split('_'), os.path.basename(synth).split('_')
            ref_spk, ref_id = ref_fsplit[0], ref_fsplit[-1][:3]
            synth_spk, synth_id = synth_fsplit[2], synth_fsplit[3][:3]

            # only compare pairs with the same speaker name and sample id
            if ref_spk == synth_spk and ref_id == synth_id:
                # load MCEP vectors
                ref_vec = np.load(ref)
                ref_frame_no = len(ref_vec)
                synth_vec = np.load(synth)

                # dynamic time warping using librosa; the 0th (energy)
                # coefficient is excluded from the distance
                min_cost, _ = librosa.sequence.dtw(ref_vec[:, 1:].T, synth_vec[:, 1:].T,
                                                   metric=cost_function)

                min_cost_tot += np.mean(min_cost)
                frames_tot += ref_frame_no

    mean_mcd = min_cost_tot / frames_tot

    return mean_mcd, frames_tot


vc_trg_refs = glob.glob('./mel_tar/*')
vc_conv_synths = glob.glob('./mel_syn/*')

cost_function = log_spec_dB_dist

vc_mcd, vc_tot_frames_used = average_mcd(vc_trg_refs, vc_conv_synths, cost_function)

print(f'MCD = {vc_mcd} dB, calculated over a total of {vc_tot_frames_used} frames')
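# Note (a sketch, not used above): this script averages the whole accumulated
# DTW cost matrix and divides by the reference frame count. Another common
# variant normalizes the optimal alignment cost by the length of the warping
# path that librosa returns:
#
#   cost_matrix, wp = librosa.sequence.dtw(ref_vec[:, 1:].T, synth_vec[:, 1:].T,
#                                          metric=cost_function)
#   mcd = cost_matrix[-1, -1] / len(wp)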
--------------------------------------------------------------------------------
/plot_tsne_visualization.py:
--------------------------------------------------------------------------------
import random
from sklearn import manifold
import matplotlib.pyplot as plt
import os
import numpy as np
import matplotlib

matplotlib.use('Agg')

# Expected layout of source_path:
# source_path
# ├── speaker_1 (dir storing embedding files, named sentence_id.npy, shape [1, embedding_size])
# └── speaker_2
# ....
# └── speaker_n


source_path = "data"

embedding_dict = {}
speakers = os.listdir(source_path)
speaker_list = []
# randomly select 3 speakers
random.shuffle(speakers)
sentence_label = []
speaker_label = []
for i, speaker in enumerate(speakers[:3]):
    speaker_list.append(speaker)
    speaker_path = os.path.join(source_path, speaker)
    sentences = os.listdir(speaker_path)
    embedding_dict[i] = []
    for sentence in sentences:
        sentence_path = os.path.join(speaker_path, sentence)
        sentence_embedding = np.load(sentence_path)[0]
        embedding_dict[i].append(sentence_embedding)
        speaker_label.append(i)
        sentence_label.append(int(sentence.replace('.npy', '')))
    # NOTE: each selected speaker must contribute at least two embeddings;
    # popping a speaker here would break the range(3) loop below
    if len(embedding_dict[i]) <= 1:
        embedding_dict.pop(i)
print("speaker list:", speaker_list)
tsne = manifold.TSNE(n_components=2, init='pca', perplexity=3, random_state=0, verbose=1)
data = []
for i in range(3):
    data = data + embedding_dict[i]
print("num data:", len(data))
data = np.array(data)
data = (data - data.mean(axis=0)) / data.std(axis=0)
X = tsne.fit_transform(data)
markers = ['x', '^', '.']
# rescale linearly to [0, 1]
x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
X = (X - x_min) / (x_max - x_min)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(X.shape[0]):
    # Set1 has 9 colours, so wrap the sentence id into its index range
    c = plt.cm.Set1(sentence_label[i] % 9)
    ax.scatter(X[i, 0], X[i, 1], color=c, marker=markers[speaker_label[i]])

plt.title('t-SNE 2D')
plt.savefig(os.path.join(source_path, 'tsne.png'), format='png', dpi=300)
plt.close()
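# Optional check (a sketch, assuming scikit-learn's metrics module, which ships
# with the sklearn dependency already imported above): a silhouette score on
# the normalized embeddings quantifies how well they cluster by speaker,
# complementing the visual t-SNE inspection.
# from sklearn.metrics import silhouette_score
# print("speaker silhouette:", silhouette_score(data, speaker_label))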
--------------------------------------------------------------------------------
/ppg_plot_from_npy.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

PPG_file = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_25_10ms/PPGs_VCTK105/p225/p225_002.npy'
PPG_value = np.load(PPG_file)
sns.heatmap(PPG_value, cmap='Reds')
plt.savefig('./ppg_0.png')  # [-0.2, 0.8]
print(PPG_value.shape)
# print(PPG_value)

# start a fresh figure so the two heatmaps do not draw over each other
plt.figure()

# txt_for_check = './spmel_p226_010.txt'
# for ppg in PPG_value:
#     f = open(txt_for_check, 'a')
#     f.write('\n' + str(ppg))
#     f.close()


PPG_downsample_file = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_64_16ms/PPGs_VCTK105_64_16_nonorm_notrim/p225/p225_002.npy'
PPG_downsample = np.load(PPG_downsample_file)
sns.heatmap(PPG_downsample, cmap='Reds')
plt.savefig('./ppg_1.png')  # [-0.2, 0.8]
print(PPG_downsample.shape)
# print(PPG_value)

--------------------------------------------------------------------------------
/ppg_resample_interpolate:
--------------------------------------------------------------------------------
import sys
import numpy as np
from scipy.interpolate import interp1d
import os


############################# single file #############################
# ppg_original = np.load('/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_25_10ms/PPGs_VCTK105/p225/p225_001.npy')
# print(ppg_original.shape)  # 206 frames, (206, 41)
# print(ppg_original)
# # np.savetxt()

# duration = ppg_original.shape[0] * shift_from
# # print(duration)  # 2.06 s; keep one decimal place?

# t = np.arange(0, duration, shift_from)
# # print(t.shape)
# t2 = np.arange(0, duration, shift_to)
# # print(t2)

# ppg_val = interp1d(t, ppg_original,
#                    kind='linear', axis=0, fill_value='extrapolate', copy=False,
#                    assume_sorted=True)

# ppg_val_to = ppg_val(t2)
# print(ppg_val_to.shape)  # (129, 41)
# print(ppg_val_to)
############################# single file #############################


shift_from = 0.01  # frame shift of the input PPGs, in seconds
shift_to = 0.016   # frame shift after resampling, in seconds

############################# multiple files #############################
ppg_dir = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_25_10ms/PPGs_VCTK105'
ppg_target_dir = '/home/v-jiewang/ContentANDStyle_Disentangle/ppg_spk_norm/assets_64_16ms/PPGs_VCTK105_64_16ms'
for spk in os.listdir(ppg_dir):
    target_spk_dir = os.path.join(ppg_target_dir, spk)
    if not os.path.exists(target_spk_dir):
        os.makedirs(target_spk_dir)
    for uttid in os.listdir(os.path.join(ppg_dir, spk)):
        print(uttid)
        ppg_original = np.load(os.path.join(ppg_dir, spk, uttid))
        duration = ppg_original.shape[0] * shift_from
        t = np.arange(0, duration, shift_from)
        t2 = np.arange(0, duration, shift_to)
        # np.arange can be one element off due to floating point; trim
        # whichever side is longer, tolerating at most a one-frame mismatch
        if t.shape[0] < ppg_original.shape[0]:
            diffe = ppg_original.shape[0] - t.shape[0]
            if diffe <= 1:
                ppg_original = ppg_original[:t.shape[0], :]
        else:
            diffe = t.shape[0] - ppg_original.shape[0]
            if diffe <= 1:
                t = t[:ppg_original.shape[0]]
        assert t.shape[0] == ppg_original.shape[0]

        ppg_val = interp1d(t, ppg_original,
                           kind='linear', axis=0, fill_value='extrapolate', copy=False,
                           assume_sorted=True)
        ppg_val_to = ppg_val(t2)
        target_ppg_file_name = os.path.join(target_spk_dir, uttid)
        np.save(target_ppg_file_name, ppg_val_to)

############################# multiple files #############################


############################# original #############################
# if __name__ == '__main__':
#     if len(sys.argv) != 3:
#         print('Usage: python3 %s sampling_rate_from sampling_rate_to lf0_out.float32' % sys.argv[0])
#         exit(-1)

#     sr_from = float(sys.argv[1])
#     sr_to = float(sys.argv[2])
#     lf0 = np.frombuffer(sys.stdin.buffer.read(), dtype='float32').copy()
#     t = np.arange(0, lf0.size) / sr_from

#     voiced_mask = lf0 > 0
#     duration = lf0.size / sr_from
#     t2 = np.arange(0, duration, 1 / sr_to)

#     lf0[0] = lf0[voiced_mask][0]
#     lf0[-1] = lf0[voiced_mask][-1]
#     voiced_mask = lf0 > 0

#     lf0_val = interp1d(t[voiced_mask], lf0[voiced_mask],
#                        kind='linear', fill_value='extrapolate', copy=False,
#                        assume_sorted=True)

#     lf0_val_t2 = lf0_val(t2)
#     sys.stdout.buffer.write(lf0_val(t2).astype('float32').tobytes())
############################# original #############################
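# Sanity check (a worked example): the resampled PPG should have about
# ceil(duration / shift_to) frames. For the single-file example above,
# 206 frames * 0.01 s = 2.06 s, and np.arange(0, 2.06, 0.016) has 129 entries
# (128 * 0.016 = 2.048 < 2.06), which matches the printed (129, 41) shape.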
--------------------------------------------------------------------------------
/tsne_visualization.py:
--------------------------------------------------------------------------------
import os
import random
import numpy as np
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn import manifold


style_encoder_path = "/home/zhaoxt20/vae_tac_myself/exp_multi/Libritts_styles"

embedding_dict = {}
speakers = os.listdir(style_encoder_path)

# randomly select 10 speakers
random.shuffle(speakers)
for speaker in speakers[:10]:
    speaker_path = os.path.join(style_encoder_path, speaker)
    sentences = os.listdir(speaker_path)
    if speaker not in embedding_dict:
        embedding_dict[speaker] = []

    for sentence in sentences:
        sentence_path = os.path.join(speaker_path, sentence)
        embeddings = os.listdir(sentence_path)
        for embedding in embeddings:
            embedding_path = os.path.join(sentence_path, embedding)
            embedding_np = np.load(embedding_path)[0]
            embedding_dict[speaker].append(embedding_np)
    # drop speakers with fewer than two embeddings
    if len(embedding_dict[speaker]) <= 1:
        embedding_dict.pop(speaker)

tsne = manifold.TSNE(n_components=2, init='pca', random_state=0, verbose=1)

data = None
label = []
for x in embedding_dict:
    if data is None:
        data = embedding_dict[x]
    else:
        data = data + embedding_dict[x]
    label = label + [x] * len(embedding_dict[x])
print(len(data))
Y = tsne.fit_transform(data)
def plot_embedding_2d(X, title=None, save_path=None):
    # rescale linearly to [0, 1]
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)
    # draw each point as its speaker label at (x0, x1)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for i in range(X.shape[0]):
        c = plt.cm.Set1(speakers.index(label[i]) % 10 / 10.)
        ax.text(X[i, 0], X[i, 1], str(label[i]), color=c,
                fontdict={'weight': 'bold', 'size': 4})

    if title is not None:
        plt.title(title)
    if save_path is not None:
        plt.savefig(save_path, format='png', dpi=300)
    plt.close()


def plot_embedding_2d_focus(X, focus, title=None, save_path=None):
    # NOTE: currently unused; get_symbol() and get_color() are assumed to be
    # provided elsewhere before this function is called
    # rescale linearly to [0, 1]
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for i in range(X.shape[0]):
        ch = get_symbol(i)
        c = get_color(i)
        if ch in focus:
            # jitter the text position slightly so overlapping points stay readable
            ax.text(X[i, 0] + np.random.normal(0, 0.0075), X[i, 1] + np.random.normal(0, 0.0075), ch, color=c,
                    fontdict={'weight': 'bold', 'size': 9})

    if title is not None:
        plt.title(title)
    if save_path is not None:
        plt.savefig(save_path, format='png', dpi=300)
    plt.close()


plot_embedding_2d(Y, "t-SNE 2D", style_encoder_path + '/t-sne.png')

--------------------------------------------------------------------------------
/yin.py:
--------------------------------------------------------------------------------
# adapted from https://github.com/patriceguyot/Yin

import numpy as np


def differenceFunction(x, N, tau_max):
    """
    Compute the difference function of data x. This corresponds to equation (6) in [1].
    This solution is implemented directly with the NumPy FFT.

    :param x: audio data
    :param N: length of data
    :param tau_max: integration window size
    :return: difference function
    :rtype: list
    """

    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    x_cumsum = np.concatenate((np.array([0.]), (x * x).cumsum()))
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    # pad the FFT to the next 5-smooth size for speed
    size_pad = min(n * 2 ** p2 for n in nice_numbers if n * 2 ** p2 >= size)
    fc = np.fft.rfft(x, size_pad)
    conv = np.fft.irfft(fc * fc.conjugate())[:tau_max]
    return x_cumsum[w:w - tau_max:-1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv
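# For reference, the [1] cited in the docstrings is the YIN paper
# (de Cheveigné & Kawahara, 2002). Its two equations used here are:
#   eq. (6), difference function:
#       d_t(tau) = sum_{j=1}^{W} (x_j - x_{j+tau})^2
#   eq. (8), cumulative mean normalized difference:
#       d'_t(tau) = 1                                              if tau == 0
#                   d_t(tau) / ((1/tau) * sum_{j=1}^{tau} d_t(j))  otherwise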
def cumulativeMeanNormalizedDifferenceFunction(df, N):
    """
    Compute the cumulative mean normalized difference function (CMND).

    This corresponds to equation (8) in [1].

    :param df: difference function
    :param N: length of data
    :return: cumulative mean normalized difference function
    :rtype: list
    """

    cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float)  # vectorized with numpy
    return np.insert(cmndf, 0, 1)


def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Return the fundamental period of a frame based on the CMND function.

    :param cmdf: cumulative mean normalized difference function
    :param tau_min: minimum period for speech
    :param tau_max: maximum period for speech
    :param harmo_th: harmonicity threshold to determine if it is necessary to compute pitch frequency
    :return: fundamental period if there are values under the threshold, 0 otherwise
    :rtype: float
    """
    tau = tau_min
    while tau < tau_max:
        if cmdf[tau] < harmo_th:
            # descend to the local minimum of the dip
            while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]:
                tau += 1
            return tau
        tau += 1

    return 0  # if unvoiced


def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500,
                harmo_thresh=0.1):
    """
    Compute the YIN algorithm. Return fundamental frequency and harmonic rate.

    :param sig: audio signal (list of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: minimum fundamental frequency that can be detected (hertz)
    :param f0_max: maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: detection threshold; the algorithm returns the first minimum of the CMND function below this threshold

    :returns:

    * pitches: list of fundamental frequencies,
    * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value)
    * argmins: minimums of the cumulative mean normalized difference function
    * times: list of the time of each estimation
    :rtype: tuple
    """

    tau_min = int(sr / f0_max)
    tau_max = int(sr / f0_min)

    timeScale = range(0, len(sig) - w_len, w_step)  # time values for each analysis window
    times = [t / float(sr) for t in timeScale]
    frames = [sig[t:t + w_len] for t in timeScale]

    pitches = [0.0] * len(timeScale)
    harmonic_rates = [0.0] * len(timeScale)
    argmins = [0.0] * len(timeScale)

    for i, frame in enumerate(frames):
        # Compute YIN
        df = differenceFunction(frame, w_len, tau_max)
        cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max)
        p = getPitch(cmdf, tau_min, tau_max, harmo_thresh)

        # Get results
        if np.argmin(cmdf) > tau_min:
            argmins[i] = float(sr / np.argmin(cmdf))
        if p != 0:  # a pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cmdf[p]
        else:  # no pitch, but we still record a harmonic rate value
            harmonic_rates[i] = min(cmdf)

    return pitches, harmonic_rates, argmins, times
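
if __name__ == '__main__':
    # Minimal usage sketch (the wav path is a placeholder); librosa is assumed
    # to be available, as it already is for mcd.py.
    import librosa

    sig, sr = librosa.load('example.wav', sr=16000, mono=True)
    pitches, harmonic_rates, argmins, times = compute_yin(sig, sr)

    # frames with pitch 0 are unvoiced
    voiced = [p for p in pitches if p > 0]
    print('%d/%d voiced frames' % (len(voiced), len(pitches)))
    if voiced:
        print('mean F0 = %.1f Hz' % (sum(voiced) / len(voiced)))

--------------------------------------------------------------------------------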