├── part_five ├── part_seven ├── randomCNN-voice-transfer ├── input │ ├── boy.wav │ ├── boy18.wav │ ├── girl.wav │ ├── girl52.wav │ ├── nightcall.wav │ └── stairway.wav ├── picture │ ├── gen.png │ └── purpose.png ├── requirements.txt ├── __pycache__ │ ├── model.cpython-310.pyc │ └── utils.cpython-310.pyc ├── model.py ├── utils.py ├── vctk_identify.py └── train.py ├── requirements ├── part_nine ├── part_six ├── part_two ├── part_four ├── Related articles ├── Explanation about the code and its function ├── part_eight ├── part_one ├── README.md └── part_three /part_five: -------------------------------------------------------------------------------- 1 | # main project 2 | link of the main project : 3 | https://github.com/mazzzystar/randomCNN-voice-transfer 4 | -------------------------------------------------------------------------------- /part_seven: -------------------------------------------------------------------------------- 1 | 2 | # link for new article : 3 | 4 | https://drive.google.com/drive/folders/1TabofkAJbbhmgP8gKo7_izznBbjLz7tL?usp=share_link 5 | 6 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/boy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/boy.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/boy18.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/boy18.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/girl.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/girl.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/picture/gen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/picture/gen.png -------------------------------------------------------------------------------- /randomCNN-voice-transfer/requirements.txt: -------------------------------------------------------------------------------- 1 | pip 2 | scikit-image 3 | librosa 4 | packaging 5 | pandas 6 | soundfile 7 | matplotlib 8 | torch 9 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/girl52.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/girl52.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/nightcall.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/nightcall.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/stairway.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/stairway.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/picture/purpose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/picture/purpose.png -------------------------------------------------------------------------------- /requirements: -------------------------------------------------------------------------------- 1 | requirements for project: 2 | 3 | pip 4 | scikit-image 5 | librosa 6 | packaging 7 | pandas 8 | soundfile 9 | matplotlib 10 | torch 11 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /randomCNN-voice-transfer/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /part_nine: -------------------------------------------------------------------------------- 1 | 2 | ###################### proposal 3 | 4 | You can see my proposal in this section : 5 | 6 | https://drive.google.com/drive/folders/1vkGcnFcfwP9kvRXX2RyyK6kWTH6UHL3W?usp=share_link 7 | 8 | -------------------------------------------------------------------------------- /part_six: -------------------------------------------------------------------------------- 1 | # introducation 2 | 3 | I’m Alireza Ahmadi an engineer in major of biomedical engineering(bioelectric), 4 | I’m master student in south Tehran university. 5 | my research interests are neuroscience and medical signal processing 6 | and I’m so appreciate that in this semester I had a course called digital signal processing 7 | I’m so grateful of my professor dr.Mahdi Eslami that helped us to learn machine learning and digital signal processing 8 | and also helped us to learn more about GitHub and LinkedIn . 9 | https://www.linkedin.com/in/alireza-ahmadi-245214258 10 | 11 | -------------------------------------------------------------------------------- /part_two: -------------------------------------------------------------------------------- 1 | 2 | This is a many-to-one voice conversion system.The main significance of this Innovation work is that we could generate a target speaker's utterances without parallel... 3 | ...data like , or , but only waveforms of the target speaker .To make these parallel datasets needs a lot of effort. 4 | All we need is a number of waveforms of the target speaker's utterances and only a small set of pairs from a number of anonymous speakers. 5 | The model architecture consists of two modules: 6 | Net1 classify someone's utterances to one of phoneme classes at every timestep. 7 | Phonemes are speaker-independent while waveforms are speaker-dependent. 8 | Net2(speech synthesis) synthesize speeches of the target speaker from the phones. 
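A minimal sketch of how the two modules fit together at conversion time (function and variable names here are hypothetical, not taken from the project code): Net1 maps MFCC frames to speaker-independent phoneme posteriors (PPGs), and Net2 maps those posteriors to the target speaker's spectrogram.

    def convert(mfcc_frames, net1, net2):
        ppgs = net1(mfcc_frames)  # speaker-independent phoneme posteriors, one vector per timestep
        spec = net2(ppgs)         # target speaker's spectrogram synthesized from the phoneme sequence
        return spec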
9 | 10 | some information about our net1 : net1 classifies spectrogram to phonemes that consists of 60 English phonemes at every timestep 11 | some information about our net2 : net2 synthesizes the target speaker's speeches. 12 | -------------------------------------------------------------------------------- /part_four: -------------------------------------------------------------------------------- 1 | Subjective evaluation and conclusion 2 | 3 | 4 | This Innovation has the best results among all current works in voice style transfer . 5 | but the cost is that : 6 | 1. Heavy architecture. the architecture is to training 2 networks, 7 | Net one classifier and Net two synthesizer and combine them together 8 | 2. Delicate dataset. Except of using widely known dataset such as TIMIT, 9 | the author used the girls 2 hours audio dataset, and 1,000+ recording of pairs audio speaking the same sentence, 10 | that's maybe unacceptable in reality of training others voice. 11 | 3. Not general. The model was trained only for Kate Winslet's voice transfer. 12 | If we want to transfer to Obama's voice in our project RandomCNN, 13 | we need to gather Obama's voice data and train that network again. 14 | 15 | and here we have : 16 | It seems that to apply temperature to softmax in Net1 is not so meaningful. 17 | net2 can reach to near optimal when Net1 accuracy is correct to some extent. 18 | We have Over 70% test accuracy 19 | Obviously, sample rate, window length and hop length should be same in both Net1 and Net2. 20 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | cuda = True if torch.cuda.is_available() else False 5 | 6 | N_FFT = 512 7 | N_CHANNELS = round(1 + N_FFT/2) 8 | OUT_CHANNELS = 32 9 | 10 | 11 | class RandomCNN(nn.Module): 12 | def __init__(self): 13 | super(RandomCNN, self).__init__() 14 | 15 | # 2-D CNN 16 | self.conv1 = nn.Conv2d(1, OUT_CHANNELS, kernel_size=(3, 1), stride=1, padding=0) 17 | self.LeakyReLU = nn.LeakyReLU(0.2) 18 | 19 | # Set the random parameters to be constant. 20 | weight = torch.randn(self.conv1.weight.data.shape) 21 | self.conv1.weight = torch.nn.Parameter(weight, requires_grad=False) 22 | bias = torch.zeros(self.conv1.bias.data.shape) 23 | self.conv1.bias = torch.nn.Parameter(bias, requires_grad=False) 24 | 25 | def forward(self, x_delta): 26 | out = self.LeakyReLU(self.conv1(x_delta)) 27 | return out 28 | 29 | 30 | """ 31 | a_random = Variable(torch.randn(1, 1, 257, 430)).float() 32 | model = RandomCNN() 33 | a_O = model(a_random) 34 | print(a_O.shape) 35 | """ -------------------------------------------------------------------------------- /Related articles: -------------------------------------------------------------------------------- 1 | here we have some articles that you can download them with links below: 2 | 3 | 1. https://drive.google.com/drive/folders/1wduFYvdny9UDDkXS7Ev0Qyh_7qhr61s4?usp=share_link 4 | 2. https://drive.google.com/drive/folders/1afbiZB7az5DXDCGo9AmsKZ8giEzUIQ3q?usp=share_link 5 | 3. https://drive.google.com/drive/folders/1KOFdnqjnX7XysMcfGHZdCpaeeV2xb83s?usp=share_link 6 | 4. https://drive.google.com/drive/folders/1jdWhGgMbX2G3M_mshHotjCD5s5eiR6P6?usp=share_link 7 | 5. https://drive.google.com/drive/folders/1fYReHwhuA42KvUY3HrDtjHSI0VD6DLmQ?usp=share_link 8 | 6. 
https://drive.google.com/drive/folders/1vvD53GYmyJXV30nYj2gqoZ1W2QBDotG3?usp=share_link 9 | 7. https://drive.google.com/drive/folders/1XMP-dvuAzl82Ji_dZu_Nkya1aSqhw4Jq?usp=share_link 10 | 8. https://drive.google.com/drive/folders/1-EFosYJUeHyYPUz1zek2XO5r5gXVtTxk?usp=share_link 11 | 9. https://drive.google.com/drive/folders/1NWHRjZeSp8YlnIcMF7cdB0eebyViV9Tx?usp=share_link 12 | 10. https://drive.google.com/drive/folders/1-bHiBu1aaU7zGGGtTyEz8zHnBHqcQO8o?usp=share_link 13 | 11. https://drive.google.com/drive/folders/1YzxnbGQh-MsrDDwm5zAlHlWZ_QNE2uqI?usp=share_link 14 | 12. https://drive.google.com/drive/folders/1xJcD_utfkB_Yck4pM5-XIqqFTxu99FB3?usp=share_link 15 | -------------------------------------------------------------------------------- /Explanation about the code and its function: -------------------------------------------------------------------------------- 1 | clc 2 | clear all 3 | close all 4 | [road,fs]=wavread('road.wav'); % loads “the long and winding road” clip % 5 | left=road(:,1); right=road(:,2); 6 | time=(1/fs)*length(left); % Calculate the duration of audio playback % 7 | t=linspace(0,time,length(left)); 8 | plot(t,left) 9 | xlabel('time (sec)'); 10 | ylabel('relative signal strength') 11 | time=(1/44100)*2000; t=linspace(0,time,2000); plot(t,left(1:2000)) xlabel('time (sec)'); ylabel('relative signal strength'); 12 | soundsc(left,fs) % plays left channel as mono soundsc(right,fs) % plays right channel mono sound nearly the same) soundsc(road,fs) % plays stereo (ahhh…) 13 | load gong.mat; 14 | sound(y, Fs); 15 | load handel.mat; 16 | sound(y, 2*Fs); 17 | y=[1;2;3;4;5] y2=flipud(y) left2=flipud(left); soundsc(left2,fs) 18 | leftout=left; % Create a new array for left with the same size % 19 | N=10000; % delay amount N/44100 seconds for n=N+1:length(left) leftout(n)=left(n)+left(n-N); % approximately ¼ second echo End soundsc(left,fs) % original % Wait until the sound stops before moving to next sound command soundsc(leftout,fs) % signal with new echo 20 | out=road; % set up a new array, same size as old one N=10000; % delay amount N/44100 seconds for n=N+1:length(road) out(n,1)=road(n,1)+road(n-N,2); % echo ight-to-left! out(n,2)=road(n,2)+road(n-N,1); % echo left-to-ight! 
end soundsc(road,fs) % original soundsc(out,fs) % echo 21 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie soundsc(hootie,fs/1.5) soundsc(hootie,fs*1.5) 22 | make sound louder/quieter clear all; 23 | Fs = 44100; % sampling frequency 24 | dur = 1; % duration of sound (in sec) 25 | % time vector 26 | t = [0 : 1/Fs : dur-1/Fs]; % 1 second --> length(t) = 44100 27 | % frequency % Create frequency based on sinusoidal function % 28 | freq = 440; 29 | f = sin ( 2*pi * freq * t ); 30 | %%%%%%%%%%%%%%%%%%%%% 31 | % scale sound 32 | amp = .5; 33 | f_amp = amp * f; 34 | sound(f_amp,Fs) 35 | plot( t(1 : 2*round(1/freq*Fs)) , f(1 : 2* round(1/freq*Fs)) ) 36 | hold on 37 | plot( t(1 : 2*round(1/freq*Fs)) , f_amp(1 : 2* round(1/freq*Fs)) 38 | soundsc(left,fs); % Original left channel soundsc(left-right,fs); 39 | soundsc(hootie(:,1),fs); % Original left channel soundsc(hootie(:,1)-hootie(:,2),fs); 40 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie out=hootie; for n=2:length(hootie) out(n,1)=.9*out(n-1,1)+hootie(n,1); % left out(n,2)=.9*out(n-1,2)+hootie(n,2); % right end soundsc(hootie,fs) % original soundsc(out,fs) % 41 | out=hootie; for n=2:length(hootie) out(n,1)=hootie(n,1)-hootie(n-1,1); % left out(n,2)=hootie(n,2)-hootie(n-1,2); % right end soundsc(out,fs) % high pass filtered 42 | [perfectSound, freq] = wavread('road.wav'); N= randn(length(perfectSound), 2); noisySound = perfectSound + N; 43 | -------------------------------------------------------------------------------- /part_eight: -------------------------------------------------------------------------------- 1 | ##################################################################################### 2 | ######################################### 3 | 4 | 5 | links for project : 6 | 1. https://drive.google.com/drive/folders/1QU9YJ6I0IMAg8kIkFsGIqYnreI3-JWwg?usp=share_link 7 | 2. https://drive.google.com/drive/folders/1o5bZ28hl75eHwlEf-SfGKch6km8tYTZG 8 | 3. https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa-K?usp=sharing 9 | 4. 
https://drive.google.com/drive/folders/1xS9PHdGFSvJLAp6z8rSJ2Vps6Ok-7SKh?usp=share_link 10 | #################################################################################### 11 | ############################################## 12 | 13 | 14 | 15 | Explanation about the code and its function : 16 | 17 | clc 18 | clear all 19 | close all 20 | [road,fs]=wavread('road.wav'); % loads “the long and winding road” clip % 21 | left=road(:,1); right=road(:,2); 22 | time=(1/fs)*length(left); % Calculate the duration of audio playback % 23 | t=linspace(0,time,length(left)); 24 | plot(t,left) 25 | xlabel('time (sec)'); 26 | ylabel('relative signal strength') 27 | time=(1/44100)*2000; t=linspace(0,time,2000); plot(t,left(1:2000)) xlabel('time (sec)'); ylabel('relative signal strength'); 28 | soundsc(left,fs) % plays left channel as mono soundsc(right,fs) % plays right channel mono sound nearly the same) soundsc(road,fs) % plays stereo (ahhh…) 29 | load gong.mat; 30 | sound(y, Fs); 31 | load handel.mat; 32 | sound(y, 2*Fs); 33 | y=[1;2;3;4;5] y2=flipud(y) left2=flipud(left); soundsc(left2,fs) 34 | leftout=left; % Create a new array for left with the same size % 35 | N=10000; % delay amount N/44100 seconds for n=N+1:length(left) leftout(n)=left(n)+left(n-N); % approximately ¼ second echo End soundsc(left,fs) % original % Wait until the sound stops before moving to next sound command soundsc(leftout,fs) % signal with new echo 36 | out=road; % set up a new array, same size as old one N=10000; % delay amount N/44100 seconds for n=N+1:length(road) out(n,1)=road(n,1)+road(n-N,2); % echo ight-to-left! out(n,2)=road(n,2)+road(n-N,1); % echo left-to-ight! end soundsc(road,fs) % original soundsc(out,fs) % echo 37 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie soundsc(hootie,fs/1.5) soundsc(hootie,fs*1.5) 38 | make sound louder/quieter clear all; 39 | Fs = 44100; % sampling frequency 40 | dur = 1; % duration of sound (in sec) 41 | % time vector 42 | t = [0 : 1/Fs : dur-1/Fs]; % 1 second --> length(t) = 44100 43 | % frequency % Create frequency based on sinusoidal function % 44 | freq = 440; 45 | f = sin ( 2*pi * freq * t ); 46 | %%%%%%%%%%%%%%%%%%%%% 47 | % scale sound 48 | amp = .5; 49 | f_amp = amp * f; 50 | sound(f_amp,Fs) 51 | plot( t(1 : 2*round(1/freq*Fs)) , f(1 : 2* round(1/freq*Fs)) ) 52 | hold on 53 | plot( t(1 : 2*round(1/freq*Fs)) , f_amp(1 : 2* round(1/freq*Fs)) 54 | soundsc(left,fs); % Original left channel soundsc(left-right,fs); 55 | soundsc(hootie(:,1),fs); % Original left channel soundsc(hootie(:,1)-hootie(:,2),fs); 56 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie out=hootie; for n=2:length(hootie) out(n,1)=.9*out(n-1,1)+hootie(n,1); % left out(n,2)=.9*out(n-1,2)+hootie(n,2); % right end soundsc(hootie,fs) % original soundsc(out,fs) % 57 | out=hootie; for n=2:length(hootie) out(n,1)=hootie(n,1)-hootie(n-1,1); % left out(n,2)=hootie(n,2)-hootie(n-1,2); % right end soundsc(out,fs) % high pass filtered 58 | [perfectSound, freq] = wavread('road.wav'); N= randn(length(perfectSound), 2); noisySound = perfectSound + N; 59 | -------------------------------------------------------------------------------- /part_one: -------------------------------------------------------------------------------- 1 | 2 | Project summary 3 | 4 | Professor Dr. Mahde Eslami 5 | 6 | student Alireza Ahmadi 7 | 8 | South Tehran University 9 | 10 | student number 4011414111015 11 | 12 | This is a summary of the project and the goal of the project : 13 | 14 | Audio style transfer with shallow random parameters CNN . 
15 | Voice style transfer with random CNN : 16 | its maybe the fastest voice style transfer with reasonable result . 17 | the idea of Neural Voice Transfer aims at "using Obama's voice to sing songs of Beyoncé" or something related. 18 | Works that we did in this project : 19 | 1. Use 2-D CONV rather than 1-D for audio spectrogram. 20 | 2. Compute grams over time-axis. 21 | 3. Training fast. 5-10 minutes to train and transfer on 1 single GPU(Tesla P40). 22 | 4.Do not need dataset! You can transfer any 2 pieces of audio. 23 | You can also see the result here : 24 | https://soundcloud.com/mazzzystar/sets/speech-conversion-sample 25 | 26 | 27 | Explanation about the code and its function : 28 | 29 | clc 30 | clear all 31 | close all 32 | [road,fs]=wavread('road.wav'); % loads “the long and winding road” clip % 33 | left=road(:,1); right=road(:,2); 34 | time=(1/fs)*length(left); % Calculate the duration of audio playback % 35 | t=linspace(0,time,length(left)); 36 | plot(t,left) 37 | xlabel('time (sec)'); 38 | ylabel('relative signal strength') 39 | time=(1/44100)*2000; t=linspace(0,time,2000); plot(t,left(1:2000)) xlabel('time (sec)'); ylabel('relative signal strength'); 40 | soundsc(left,fs) % plays left channel as mono soundsc(right,fs) % plays right channel mono sound nearly the same) soundsc(road,fs) % plays stereo (ahhh…) 41 | load gong.mat; 42 | sound(y, Fs); 43 | load handel.mat; 44 | sound(y, 2*Fs); 45 | y=[1;2;3;4;5] y2=flipud(y) left2=flipud(left); soundsc(left2,fs) 46 | leftout=left; % Create a new array for left with the same size % 47 | N=10000; % delay amount N/44100 seconds for n=N+1:length(left) leftout(n)=left(n)+left(n-N); % approximately ¼ second echo End soundsc(left,fs) % original % Wait until the sound stops before moving to next sound command soundsc(leftout,fs) % signal with new echo 48 | out=road; % set up a new array, same size as old one N=10000; % delay amount N/44100 seconds for n=N+1:length(road) out(n,1)=road(n,1)+road(n-N,2); % echo ight-to-left! out(n,2)=road(n,2)+road(n-N,1); % echo left-to-ight! 
end soundsc(road,fs) % original soundsc(out,fs) % echo 49 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie soundsc(hootie,fs/1.5) soundsc(hootie,fs*1.5) 50 | make sound louder/quieter clear all; 51 | Fs = 44100; % sampling frequency 52 | dur = 1; % duration of sound (in sec) 53 | % time vector 54 | t = [0 : 1/Fs : dur-1/Fs]; % 1 second --> length(t) = 44100 55 | % frequency % Create frequency based on sinusoidal function % 56 | freq = 440; 57 | f = sin ( 2*pi * freq * t ); 58 | %%%%%%%%%%%%%%%%%%%%% 59 | % scale sound 60 | amp = .5; 61 | f_amp = amp * f; 62 | sound(f_amp,Fs) 63 | plot( t(1 : 2*round(1/freq*Fs)) , f(1 : 2* round(1/freq*Fs)) ) 64 | hold on 65 | plot( t(1 : 2*round(1/freq*Fs)) , f_amp(1 : 2* round(1/freq*Fs)) 66 | soundsc(left,fs); % Original left channel soundsc(left-right,fs); 67 | soundsc(hootie(:,1),fs); % Original left channel soundsc(hootie(:,1)-hootie(:,2),fs); 68 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie out=hootie; for n=2:length(hootie) out(n,1)=.9*out(n-1,1)+hootie(n,1); % left out(n,2)=.9*out(n-1,2)+hootie(n,2); % right end soundsc(hootie,fs) % original soundsc(out,fs) % 69 | out=hootie; for n=2:length(hootie) out(n,1)=hootie(n,1)-hootie(n-1,1); % left out(n,2)=hootie(n,2)-hootie(n-1,2); % right end soundsc(out,fs) % high pass filtered 70 | [perfectSound, freq] = wavread('road.wav'); N= randn(length(perfectSound), 2); noisySound = perfectSound + N; 71 | 72 | 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # randomCNN-voice-transfer 2 | # Alireza Ahmadi 40114140111015 3 | # digital signal processing 4 | # Voice style transfer with random CNN 5 | Maybe the fastest voice style transfer with reasonable result ? 6 | ## What is voice style transfer? 7 | Inspired by the paper [A Neural Algorithm of Artistic Style](https://arxiv.org/abs/1508.06576) , the idea of `Neural Voice Transfer` aims at "using Obama's voice to sing songs of Beyoncé" or something related. 8 | 9 | We aim to: 10 | 11 | 12 | ## Highlight of my work 13 | * Use **2-D CONV** rather than 1-D for audio spectrogram. 14 | * Compute **grams over time-axis**. 15 | * **Training fast**. 5-10 minutes to train and transfer on 1 single GPU(Tesla P40). 16 | * **Do not need dataset!** You can transfer any 2 pieces of audio.(But some format of audio may occur error, then you should `sudo apt-get install libav-tools`) 17 | 18 | ## Results 19 | **You can listen to my current result now !** It's on soundcloud, [link1](https://soundcloud.com/mazzzystar/sets/stairway2nightcall), [link2](https://soundcloud.com/mazzzystar/sets/speech-conversion-sample). 20 | 21 | The generated spectrogram compared with `content` and `style`. 22 | ![](picture/gen.png) 23 | 24 | Compare the spectrogram of `gen` with `content` and `style`(X axis represents `Time Domain`, Y axis represents `Frequency Domain`), we can find that: 25 | * The structure is almost the same as `content`, and the **gap along frequency axis**, which determines the `voice texture` to a great extent, is more alike to the style. 26 | * The base skeleton is **shifted upward a little bit** for being similar to the style(The style is girl's voice, which has higher frequency than boy's). 27 | 28 | ## Reproduce it yourself 29 | ``` 30 | pip install -r requirements.txt 31 | # remove `CUDA_VISIBLE_DEVICES` when use CPU, though it will be slow. 
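# for example, a CPU-only run on two of the bundled clips (same flags, just no env var;
# -epochs is optional and defaults to 20000 in train.py):
# python train.py -content input/boy.wav -style input/girl.wav -epochs 2000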
32 | CUDA_VISIBLE_DEVICES=0 python train.py -content input/boy18.wav -style input/girl52.wav 33 | ``` 34 | Tips: change `3x1` CONV to `3x3` CONV can get smoother generated spectrogram. 35 | 36 | ### But..does the `gram` of random CNN output really works ? 37 | Below is my experiments result of using `texture gram` after 1-layer RandomCNN to capture speaker identity by putting them as **the only feature** in a simple nearest neighbor speaker identification system. The table shows the result of speaker identification accuracy of this system over the first 15 utterances of 30 first speakers of the VCTK dataset, along with 100 utterances of 4 first speakers. 38 | 39 | | Speakers | Train/Test | Accuracy | 40 | | ------------- |:-------------:| -----:| 41 | | 30 | 270/180 | 45.6%| 42 | | 4 | 240/160 | 92.5% | 43 | 44 | It seems `texture gram along time-axis` really captured something, you can check it by: 45 | ``` 46 | python vctk_identify 47 | ``` 48 | 49 | # main source code = https://github.com/mazzzystar/randomCNN-voice-transfer/blob/master/README.md 50 | # my acount in github = https://github.com/alirezaahmadiii 51 | # linkedin = https://www.linkedin.com/in/alireza-ahmadi-245214258 52 | 53 | # videos that are going to help you to learn more about this project and more about colab and matrix and machine learning : 54 | https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa-K?usp=sharing .1 55 | https://colab.research.google.com/drive/10-8X59ey1gYBU2Uj3s-2fco1cLcKwGul?usp=sharing .2 56 | https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa-K?usp=sharing .3 57 | https://drive.google.com/drive/folders/1o5bZ28hl75eHwlEf-SfGKch6km8tYTZG .4 58 | https://www.aparat.com/v/wPWKh .5 59 | https://www.aparat.com/v/FCtZ4 .6 60 | https://aparat.com/v/rncBI .7 61 | https://aparat.com/v/CzVJn .8 62 | https://aparat.com/v/OZSFB .9 63 | https://www.aparat.com/v/wPWKh .10 64 | https://www.aparat.com/v/FCtZ4 .11 65 | https://www.aparat.com/v/2o9Hb .12 66 | https://www.aparat.com/v/jbf2z .13 67 | https://drive.google.com/drive/folders/1y8bNyDQwvbbm60ih1fMeuu1Apnv76zKC?usp=share_li .14 68 | nk 69 | https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa- .15 70 | K?usp=share_link 71 | https://drive.google.com/drive/folders/1o5bZ28hl75eHwlEf-SfGKch6km8tYTZG?usp=share_link .16 72 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import torch 4 | import soundfile 5 | from model import * 6 | from packaging import version 7 | 8 | def librosa_write(outfile, x, sr): 9 | if version.parse(librosa.__version__) < version.parse('0.8.0'): 10 | librosa.output.write_wav(outfile, x, sr) 11 | else: 12 | soundfile.write(outfile, x, sr) 13 | 14 | def wav2spectrum(filename): 15 | x, sr = librosa.load(filename) 16 | S = librosa.stft(x, N_FFT) 17 | p = np.angle(S) 18 | 19 | S = np.log1p(np.abs(S)) 20 | return S, sr 21 | 22 | 23 | def spectrum2wav(spectrum, sr, outfile): 24 | # Return the all-zero vector with the same shape of `a_content` 25 | a = np.exp(spectrum) - 1 26 | p = 2 * np.pi * np.random.random_sample(spectrum.shape) - np.pi 27 | for i in range(50): 28 | S = a * np.exp(1j * p) 29 | x = librosa.istft(S) 30 | p = np.angle(librosa.stft(x, N_FFT)) 31 | librosa_write(outfile, x, sr) 32 | 33 | 34 | def wav2spectrum_keep_phase(filename): 35 | x, sr = librosa.load(filename) 36 | S = 
librosa.stft(x, N_FFT) 37 | p = np.angle(S) 38 | 39 | S = np.log1p(np.abs(S)) 40 | return S, p, sr 41 | 42 | 43 | def spectrum2wav_keep_phase(spectrum, p, sr, outfile): 44 | # Return the all-zero vector with the same shape of `a_content` 45 | a = np.exp(spectrum) - 1 46 | for i in range(50): 47 | S = a * np.exp(1j * p) 48 | x = librosa.istft(S) 49 | p = np.angle(librosa.stft(x, N_FFT)) 50 | librosa_write(outfile, x, sr) 51 | 52 | 53 | def compute_content_loss(a_C, a_G): 54 | """ 55 | Compute the content cost 56 | 57 | Arguments: 58 | a_C -- tensor of dimension (1, n_C, n_H, n_W) 59 | a_G -- tensor of dimension (1, n_C, n_H, n_W) 60 | 61 | Returns: 62 | J_content -- scalar that you compute using equation 1 above 63 | """ 64 | m, n_C, n_H, n_W = a_G.shape 65 | 66 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 67 | a_C_unrolled = a_C.view(m * n_C, n_H * n_W) 68 | a_G_unrolled = a_G.view(m * n_C, n_H * n_W) 69 | 70 | # Compute the cost 71 | J_content = 1.0 / (4 * m * n_C * n_H * n_W) * torch.sum((a_C_unrolled - a_G_unrolled) ** 2) 72 | 73 | return J_content 74 | 75 | 76 | def gram(A): 77 | """ 78 | Argument: 79 | A -- matrix of shape (n_C, n_L) 80 | 81 | Returns: 82 | GA -- Gram matrix of shape (n_C, n_C) 83 | """ 84 | GA = torch.matmul(A, A.t()) 85 | 86 | return GA 87 | 88 | 89 | def gram_over_time_axis(A): 90 | """ 91 | Argument: 92 | A -- matrix of shape (1, n_C, n_H, n_W) 93 | 94 | Returns: 95 | GA -- Gram matrix of A along time axis, of shape (n_C, n_C) 96 | """ 97 | m, n_C, n_H, n_W = A.shape 98 | 99 | # Reshape the matrix to the shape of (n_C, n_L) 100 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 101 | A_unrolled = A.view(m * n_C * n_H, n_W) 102 | GA = torch.matmul(A_unrolled, A_unrolled.t()) 103 | 104 | return GA 105 | 106 | 107 | def compute_layer_style_loss(a_S, a_G): 108 | """ 109 | Arguments: 110 | a_S -- tensor of dimension (1, n_C, n_H, n_W) 111 | a_G -- tensor of dimension (1, n_C, n_H, n_W) 112 | 113 | Returns: 114 | J_style_layer -- tensor representing a scalar style cost. 115 | """ 116 | m, n_C, n_H, n_W = a_G.shape 117 | 118 | # Reshape the matrix to the shape of (n_C, n_L) 119 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 120 | 121 | # Calculate the gram 122 | # !!!!!! IMPORTANT !!!!! Here we compute the Gram along n_C, 123 | # not along n_H * n_W. But is the result the same? No. 
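    # gram_over_time_axis reshapes the activations from (1, n_C, n_H, n_W) to (n_C * n_H, n_W)
    # and multiplies them by their own transpose, so the correlations are accumulated over the
    # time axis (n_W) rather than over n_H * n_W as in image style transfer.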
124 | GS = gram_over_time_axis(a_S) 125 | GG = gram_over_time_axis(a_G) 126 | 127 | # Computing the loss 128 | J_style_layer = 1.0 / (4 * (n_C ** 2) * (n_H * n_W)) * torch.sum((GS - GG) ** 2) 129 | 130 | return J_style_layer 131 | 132 | 133 | """ 134 | # Test 135 | test_S = torch.randn(1, 6, 2, 2) 136 | test_G = torch.randn(1, 6, 2, 2) 137 | print(test_S) 138 | print(test_G) 139 | print(compute_layer_style_loss(test_S, test_G)) 140 | 141 | 142 | # Test 143 | test_C = torch.randn(1, 6, 2, 2) 144 | test_G = torch.randn(1, 6, 2, 2) 145 | print(test_C) 146 | print(test_G) 147 | print(compute_content_loss(test_C, test_G)) 148 | """ 149 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/vctk_identify.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import glob 3 | from utils import * 4 | from model import RandomCNN 5 | 6 | 7 | data_path = 'VCTK-Corpus1/' 8 | 9 | randomCNN = RandomCNN() 10 | randomCNN.eval() 11 | if cuda: 12 | randomCNN = randomCNN.cuda() 13 | 14 | 15 | def process_vctk(_data_path, speaker_num=30, each_audio_num=15): 16 | # read label-info 17 | df = pd.read_table(_data_path + 'speaker-info.txt', usecols=['ID'], 18 | index_col=False, delim_whitespace=True) 19 | 20 | # read file IDs 21 | file_ids = [] 22 | for d in [_data_path + 'txt/p%d/' % uid for uid in df.ID.values[:speaker_num]]: 23 | file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt')[:each_audio_num])]) 24 | 25 | audio_lst = [] 26 | for i, f in enumerate(file_ids): 27 | # wave file name 28 | wave_file = _data_path + 'wav48/%s/' % f[:4] + f + '.wav' 29 | fn = wave_file.split('/')[-1].split("_")[0] 30 | print(fn) 31 | # target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy' 32 | # if os.path.exists(target_filename): 33 | # continue 34 | # print info 35 | print("VCTK corpus preprocessing (%d / %d) - '%s']" % (i, len(file_ids), wave_file)) 36 | 37 | # load wave file 38 | spect, sr = wav2spectrum(wave_file) 39 | audio_lst.append((spect, fn)) 40 | 41 | del spect 42 | return audio_lst 43 | 44 | 45 | def compute_loss(a_C, a_G): 46 | """ 47 | Compute the content cost 48 | 49 | Arguments: 50 | a_C -- tensor of dimension (1, n_C, n_H, n_W) 51 | a_G -- tensor of dimension (1, n_C, n_H, n_W) 52 | 53 | Returns: 54 | J_content -- scalar that you compute using equation 1 above 55 | """ 56 | n_H, n_W = a_G.shape 57 | 58 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 59 | J_content = 1.0 / (n_H * n_W) * torch.sum((a_C - a_G) ** 2) 60 | 61 | return J_content 62 | 63 | GAP_LEN = 15 64 | TRAIN_LEN = 9 65 | audio_lst = process_vctk(data_path) 66 | print(len(audio_lst)) 67 | 68 | train_lst = [] 69 | test_lst = [] 70 | 71 | count = 0 72 | for item in audio_lst: 73 | if count % GAP_LEN < TRAIN_LEN: 74 | train_lst.append(item) 75 | else: 76 | test_lst.append(item) 77 | count += 1 78 | del audio_lst 79 | print("Train len={}".format(len(train_lst))) 80 | print("Test len={}".format(len(test_lst))) 81 | for item in train_lst[:100]: 82 | print(item[-1]) 83 | for item in test_lst[:100]: 84 | print(item[-1]) 85 | 86 | 87 | def spect2gram(spect_lst): 88 | grams_lst = [] 89 | for item in spect_lst: 90 | audio, no = item[0], item[1] 91 | audio = audio.T 92 | audio_delta = np.zeros(audio.shape) 93 | for i in range(audio.shape[0] - 1): 94 | audio_delta[i] = audio_delta[i+1] - audio_delta[i+1] 95 | 96 | audio = audio.T 97 | audio_delta = audio_delta.T 98 | audio_torch = torch.from_numpy(audio)[None, None, :, :] 99 | 
audio_delta_torch = torch.from_numpy(audio_delta)[None, None, :, :] 100 | audio_delta_var = Variable(audio_delta_torch, requires_grad=False).float() 101 | audio_var = Variable(audio_torch, requires_grad=False).float() 102 | if cuda: 103 | audio_var = audio_var.cuda() 104 | audio_delta_var = audio_delta_var.cuda() 105 | randomCNN_output = randomCNN(audio_var) 106 | gram = gram_over_time_axis(randomCNN_output) 107 | grams_lst.append((gram, no)) 108 | del gram 109 | del randomCNN_output 110 | del audio_torch 111 | del audio_var 112 | del audio 113 | return grams_lst 114 | 115 | 116 | train_grams = spect2gram(train_lst) 117 | print("Train audio nums={}".format(len(train_grams))) 118 | del train_lst 119 | 120 | test_grams = spect2gram(test_lst) 121 | print("Test audio nums={}".format(len(test_grams))) 122 | del test_lst 123 | 124 | 125 | def classifiy(new_gram, no): 126 | MIN_DIS = 100000 127 | MIN_NO = "" 128 | for item in train_grams: 129 | item_gram, item_no = item[0], item[1] 130 | dis = compute_loss(new_gram, item_gram) 131 | if dis.data[0] < MIN_DIS: 132 | MIN_DIS = dis.data[0] 133 | MIN_NO = item_no 134 | del item_gram 135 | return 1 if(MIN_NO == no) else 0 136 | 137 | correct_count = 0 138 | print("Begin to classify.") 139 | for item in test_grams: 140 | gram, no = item[0], item[1] 141 | correct_count += classifiy(gram, no) 142 | precise = float(correct_count) / len(test_grams) 143 | print("test: {}/{}, precise={}".format(correct_count, len(test_grams), precise)) 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/train.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('agg') 3 | import matplotlib.pyplot as plt 4 | from torch.autograd import Variable 5 | from utils import * 6 | from model import * 7 | import time 8 | import math 9 | import argparse 10 | cuda = True if torch.cuda.is_available() else False 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-content', help='Content input') 14 | parser.add_argument('-content_weight', help='Content weight. Default is 1e2', default = 1e2) 15 | parser.add_argument('-style', help='Style input') 16 | parser.add_argument('-style_weight', help='Style weight. Default is 1', default = 1) 17 | parser.add_argument('-epochs', type=int, help='Number of epoch iterations. Default is 20000', default = 20000) 18 | parser.add_argument('-print_interval', type=int, help='Number of epoch iterations between printing losses', default = 1000) 19 | parser.add_argument('-plot_interval', type=int, help='Number of epoch iterations between plot points', default = 1000) 20 | parser.add_argument('-learning_rate', type=float, default = 0.002) 21 | parser.add_argument('-output', help='Output file name. 
Default is "output"', default = 'output') 22 | args = parser.parse_args() 23 | 24 | 25 | CONTENT_FILENAME = args.content 26 | STYLE_FILENAME = args.style 27 | 28 | a_content, sr = wav2spectrum(CONTENT_FILENAME) 29 | a_style, sr = wav2spectrum(STYLE_FILENAME) 30 | 31 | a_content_torch = torch.from_numpy(a_content)[None, None, :, :] 32 | if cuda: 33 | a_content_torch = a_content_torch.cuda() 34 | print(a_content_torch.shape) 35 | a_style_torch = torch.from_numpy(a_style)[None, None, :, :] 36 | if cuda: 37 | a_style_torch = a_style_torch.cuda() 38 | print(a_style_torch.shape) 39 | 40 | model = RandomCNN() 41 | model.eval() 42 | 43 | a_C_var = Variable(a_content_torch, requires_grad=False).float() 44 | a_S_var = Variable(a_style_torch, requires_grad=False).float() 45 | if cuda: 46 | model = model.cuda() 47 | a_C_var = a_C_var.cuda() 48 | a_S_var = a_S_var.cuda() 49 | 50 | a_C = model(a_C_var) 51 | a_S = model(a_S_var) 52 | 53 | 54 | # Optimizer 55 | learning_rate = args.learning_rate 56 | a_G_var = Variable(torch.randn(a_content_torch.shape) * 1e-3) 57 | if cuda: 58 | a_G_var = a_G_var.cuda() 59 | a_G_var.requires_grad = True 60 | optimizer = torch.optim.Adam([a_G_var]) 61 | 62 | # coefficient of content and style 63 | style_param = args.style_weight 64 | content_param = args.content_weight 65 | 66 | num_epochs = args.epochs 67 | print_every = args.print_interval 68 | plot_every = args.plot_interval 69 | 70 | # Keep track of losses for plotting 71 | current_loss = 0 72 | all_losses = [] 73 | 74 | 75 | def timeSince(since): 76 | now = time.time() 77 | s = now - since 78 | m = math.floor(s / 60) 79 | s -= m * 60 80 | return '%dm %ds' % (m, s) 81 | 82 | 83 | start = time.time() 84 | # Train the Model 85 | for epoch in range(1, num_epochs + 1): 86 | optimizer.zero_grad() 87 | a_G = model(a_G_var) 88 | 89 | content_loss = content_param * compute_content_loss(a_C, a_G) 90 | style_loss = style_param * compute_layer_style_loss(a_S, a_G) 91 | loss = content_loss + style_loss 92 | loss.backward() 93 | optimizer.step() 94 | 95 | # print 96 | if epoch % print_every == 0: 97 | print("{} {}% {} content_loss:{:4f} style_loss:{:4f} total_loss:{:4f}".format(epoch, 98 | epoch / num_epochs * 100, 99 | timeSince(start), 100 | content_loss.item(), 101 | style_loss.item(), loss.item())) 102 | current_loss += loss.item() 103 | 104 | # Add current loss avg to list of losses 105 | if epoch % plot_every == 0: 106 | all_losses.append(current_loss / plot_every) 107 | current_loss = 0 108 | 109 | 110 | gen_spectrum = a_G_var.cpu().data.numpy().squeeze() 111 | gen_audio_C = args.output + ".wav" 112 | spectrum2wav(gen_spectrum, sr, gen_audio_C) 113 | 114 | plt.figure() 115 | plt.plot(all_losses) 116 | plt.savefig('loss_curve.png') 117 | 118 | plt.figure(figsize=(5, 5)) 119 | # we then use the 2nd column. 120 | plt.subplot(1, 1, 1) 121 | plt.title("Content Spectrum") 122 | plt.imsave('Content_Spectrum.png', a_content[:400, :]) 123 | 124 | plt.figure(figsize=(5, 5)) 125 | # we then use the 2nd column. 126 | plt.subplot(1, 1, 1) 127 | plt.title("Style Spectrum") 128 | plt.imsave('Style_Spectrum.png', a_style[:400, :]) 129 | 130 | plt.figure(figsize=(5, 5)) 131 | # we then use the 2nd column. 
132 | plt.subplot(1, 1, 1) 133 | plt.title("CNN Voice Transfer Result") 134 | plt.imsave('Gen_Spectrum.png', gen_spectrum[:400, :]) 135 | -------------------------------------------------------------------------------- /part_three: -------------------------------------------------------------------------------- 1 | 2 | ############################################################# 3 | requierment that we need in first step are: 4 | 5 | tensorflow-gpu >= 1.8 6 | numpy >= 1.11.1 7 | librosa == 0.5.1 8 | joblib == 0.11.0 9 | tensorpack >= 0.8.6 10 | pyyaml 11 | soundfile 12 | pydub 13 | tqdm 14 | 15 | 16 | ############################################################################# 17 | 18 | 19 | ######our firt model that we can write like this : 20 | #### start 21 | 22 | import tensorflow as tf 23 | from tensorpack.graph_builder.model_desc import ModelDesc, InputDesc 24 | from tensorpack.tfutils import ( 25 | get_current_tower_context, optimizer, gradproc) 26 | from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope 27 | 28 | import tensorpack_extension 29 | from data_load import phns 30 | from hparam import hparam as hp 31 | from modules import prenet, cbhg, normalize 32 | 33 | 34 | class Net1(ModelDesc): 35 | def __init__(self): 36 | pass 37 | 38 | def _get_inputs(self): 39 | return [InputDesc(tf.float32, (None, None, hp.default.n_mfcc), 'x_mfccs'), 40 | InputDesc(tf.int32, (None, None,), 'y_ppgs')] 41 | 42 | def _build_graph(self, inputs): 43 | self.x_mfccs, self.y_ppgs = inputs 44 | is_training = get_current_tower_context().is_training 45 | with tf.variable_scope('net1'): 46 | self.ppgs, self.preds, self.logits = self.network(self.x_mfccs, is_training) 47 | self.cost = self.loss() 48 | acc = self.acc() 49 | 50 | # summaries 51 | tf.summary.scalar('net1/train/loss', self.cost) 52 | tf.summary.scalar('net1/train/acc', acc) 53 | 54 | if not is_training: 55 | # summaries 56 | tf.summary.scalar('net1/eval/summ_loss', self.cost) 57 | tf.summary.scalar('net1/eval/summ_acc', acc) 58 | 59 | # for confusion matrix 60 | tf.reshape(self.y_ppgs, shape=(tf.size(self.y_ppgs),), name='net1/eval/y_ppg_1d') 61 | tf.reshape(self.preds, shape=(tf.size(self.preds),), name='net1/eval/pred_ppg_1d') 62 | 63 | def _get_optimizer(self): 64 | lr = tf.get_variable('learning_rate', initializer=hp.train1.lr, trainable=False) 65 | return tf.train.AdamOptimizer(lr) 66 | 67 | @auto_reuse_variable_scope 68 | def network(self, x_mfcc, is_training): 69 | # Pre-net 70 | prenet_out = prenet(x_mfcc, 71 | num_units=[hp.train1.hidden_units, hp.train1.hidden_units // 2], 72 | dropout_rate=hp.train1.dropout_rate, 73 | is_training=is_training) # (N, T, E/2) 74 | 75 | # CBHG 76 | out = cbhg(prenet_out, hp.train1.num_banks, hp.train1.hidden_units // 2, 77 | hp.train1.num_highway_blocks, hp.train1.norm_type, is_training) 78 | 79 | # Final linear projection 80 | logits = tf.layers.dense(out, len(phns)) # (N, T, V) 81 | ppgs = tf.nn.softmax(logits / hp.train1.t, name='ppgs') # (N, T, V) 82 | preds = tf.to_int32(tf.argmax(logits, axis=-1)) # (N, T) 83 | 84 | return ppgs, preds, logits 85 | 86 | def loss(self): 87 | istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfccs, -1))) # indicator: (N, T) 88 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits / hp.train1.t, 89 | labels=self.y_ppgs) 90 | loss *= istarget 91 | loss = tf.reduce_mean(loss) 92 | return loss 93 | 94 | def acc(self): 95 | istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfccs, -1))) # indicator: (N, T) 96 | num_hits = 
tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y_ppgs)) * istarget) 97 | num_targets = tf.reduce_sum(istarget) 98 | acc = num_hits / num_targets 99 | return acc 100 | 101 | 102 | class Net2(ModelDesc): 103 | 104 | def _get_inputs(self): 105 | n_timesteps = (hp.default.duration * hp.default.sr) // hp.default.hop_length + 1 106 | 107 | return [InputDesc(tf.float32, (None, n_timesteps, hp.default.n_mfcc), 'x_mfccs'), 108 | InputDesc(tf.float32, (None, n_timesteps, hp.default.n_fft // 2 + 1), 'y_spec'), 109 | InputDesc(tf.float32, (None, n_timesteps, hp.default.n_mels), 'y_mel'), ] 110 | 111 | def _build_graph(self, inputs): 112 | self.x_mfcc, self.y_spec, self.y_mel = inputs 113 | 114 | is_training = get_current_tower_context().is_training 115 | 116 | # build net1 117 | self.net1 = Net1() 118 | with tf.variable_scope('net1'): 119 | self.ppgs, _, _ = self.net1.network(self.x_mfcc, is_training) 120 | self.ppgs = tf.identity(self.ppgs, name='ppgs') 121 | 122 | # build net2 123 | with tf.variable_scope('net2'): 124 | self.pred_spec, self.pred_mel = self.network(self.ppgs, is_training) 125 | self.pred_spec = tf.identity(self.pred_spec, name='pred_spec') 126 | 127 | self.cost = self.loss() 128 | 129 | # summaries 130 | tf.summary.scalar('net2/train/loss', self.cost) 131 | 132 | if not is_training: 133 | tf.summary.scalar('net2/eval/summ_loss', self.cost) 134 | 135 | def _get_optimizer(self): 136 | gradprocs = [ 137 | tensorpack_extension.FilterGradientVariables('.*net2.*', verbose=False), 138 | gradproc.MapGradient( 139 | lambda grad: tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max)), 140 | gradproc.GlobalNormClip(hp.train2.clip_norm), 141 | # gradproc.PrintGradient(), 142 | # gradproc.CheckGradient(), 143 | ] 144 | lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False) 145 | opt = tf.train.AdamOptimizer(learning_rate=lr) 146 | return optimizer.apply_grad_processors(opt, gradprocs) 147 | 148 | @auto_reuse_variable_scope 149 | def network(self, ppgs, is_training): 150 | # Pre-net 151 | prenet_out = prenet(ppgs, 152 | num_units=[hp.train2.hidden_units, hp.train2.hidden_units // 2], 153 | dropout_rate=hp.train2.dropout_rate, 154 | is_training=is_training) # (N, T, E/2) 155 | 156 | # CBHG1: mel-scale 157 | pred_mel = cbhg(prenet_out, hp.train2.num_banks, hp.train2.hidden_units // 2, 158 | hp.train2.num_highway_blocks, hp.train2.norm_type, is_training, 159 | scope="cbhg_mel") 160 | pred_mel = tf.layers.dense(pred_mel, self.y_mel.shape[-1], name='pred_mel') # (N, T, n_mels) 161 | 162 | # CBHG2: linear-scale 163 | pred_spec = tf.layers.dense(pred_mel, hp.train2.hidden_units // 2) # (N, T, n_mels) 164 | pred_spec = cbhg(pred_spec, hp.train2.num_banks, hp.train2.hidden_units // 2, 165 | hp.train2.num_highway_blocks, hp.train2.norm_type, is_training, scope="cbhg_linear") 166 | pred_spec = tf.layers.dense(pred_spec, self.y_spec.shape[-1], name='pred_spec') # log magnitude: (N, T, 1+n_fft//2) 167 | 168 | return pred_spec, pred_mel 169 | 170 | def loss(self): 171 | loss_spec = tf.reduce_mean(tf.squared_difference(self.pred_spec, self.y_spec)) 172 | loss_mel = tf.reduce_mean(tf.squared_difference(self.pred_mel, self.y_mel)) 173 | loss = loss_spec + loss_mel 174 | return loss 175 | 176 | 177 | 178 | ####################################################################################################################### 179 | ################### on the other hand we can apply tensorpack gradproc############################################ 180 | 181 | 182 | 
second change that we can make : 183 | but its much harder and it needs more codes that we should write: 184 | ############################################################################ 185 | 186 | from __future__ import print_function 187 | 188 | import tensorflow as tf 189 | 190 | 191 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 192 | '''Embeds a given tensor. 193 | 194 | Args: 195 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 196 | to be looked up in `lookup table`. 197 | vocab_size: An int. Vocabulary size. 198 | num_units: An int. Number of embedding hidden units. 199 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 200 | should be constant zeros. 201 | scope: Optional scope for `variable_scope`. 202 | reuse: Boolean, whether to reuse the weights of a previous layer 203 | by the same name. 204 | 205 | Returns: 206 | A `Tensor` with one more rank than inputs's. The last dimesionality 207 | should be `num_units`. 208 | ''' 209 | with tf.variable_scope(scope, reuse=reuse): 210 | lookup_table = tf.get_variable('lookup_table', 211 | dtype=tf.float32, 212 | shape=[vocab_size, num_units], 213 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01)) 214 | if zero_pad: 215 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 216 | lookup_table[1:, :]), 0) 217 | return tf.nn.embedding_lookup(lookup_table, inputs) 218 | 219 | def normalize(inputs, 220 | type="bn", 221 | decay=.999, 222 | epsilon=1e-8, 223 | is_training=True, 224 | reuse=None, 225 | activation_fn=None, 226 | scope="normalize"): 227 | '''Applies {batch|layer} normalization. 228 | 229 | Args: 230 | inputs: A tensor with 2 or more dimensions, where the first dimension has 231 | `batch_size`. If type is `bn`, the normalization is over all but 232 | the last dimension. Or if type is `ln`, the normalization is over 233 | the last dimension. Note that this is different from the native 234 | `tf.contrib.layers.batch_norm`. For this I recommend you change 235 | a line in ``tensorflow/contrib/layers/python/layers/layer.py` 236 | as follows. 237 | Before: mean, variance = nn.moments(inputs, axis, keep_dims=True) 238 | After: mean, variance = nn.moments(inputs, [-1], keep_dims=True) 239 | type: A string. Either "bn" or "ln". 240 | decay: Decay for the moving average. Reasonable values for `decay` are close 241 | to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. 242 | Lower `decay` value (recommend trying `decay`=0.9) if model experiences 243 | reasonably good training performance but poor validation and/or test 244 | performance. 245 | is_training: Whether or not the layer is in training mode. W 246 | activation_fn: Activation function. 247 | scope: Optional scope for `variable_scope`. 248 | 249 | Returns: 250 | A tensor with the same shape and data dtype as `inputs`. 251 | ''' 252 | if type=="bn": 253 | inputs_shape = inputs.get_shape() 254 | inputs_rank = inputs_shape.ndims 255 | 256 | # use fused batch norm if inputs_rank in [2, 3, 4] as it is much faster. 257 | # pay attention to the fact that fused_batch_norm requires shape to be rank 4 of NHWC. 
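        # rank-2 and rank-3 inputs are therefore expanded to rank 4 below and squeezed back
        # to their original shape once the batch norm has been applied.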
258 | if inputs_rank in [2, 3, 4]: 259 | if inputs_rank==2: 260 | inputs = tf.expand_dims(inputs, axis=1) 261 | inputs = tf.expand_dims(inputs, axis=2) 262 | elif inputs_rank==3: 263 | inputs = tf.expand_dims(inputs, axis=1) 264 | 265 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 266 | decay=decay, 267 | center=True, 268 | scale=True, 269 | updates_collections=None, 270 | is_training=is_training, 271 | scope=scope, 272 | zero_debias_moving_mean=True, 273 | fused=True, 274 | reuse=reuse) 275 | # restore original shape 276 | if inputs_rank==2: 277 | outputs = tf.squeeze(outputs, axis=[1, 2]) 278 | elif inputs_rank==3: 279 | outputs = tf.squeeze(outputs, axis=1) 280 | else: # fallback to naive batch norm 281 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 282 | decay=decay, 283 | center=True, 284 | scale=True, 285 | updates_collections=None, 286 | is_training=is_training, 287 | scope=scope, 288 | reuse=reuse, 289 | fused=False) 290 | elif type in ("ln", "ins"): 291 | reduction_axis = -1 if type=="ln" else 1 292 | with tf.variable_scope(scope, reuse=reuse): 293 | inputs_shape = inputs.get_shape() 294 | params_shape = inputs_shape[-1:] 295 | 296 | mean, variance = tf.nn.moments(inputs, [reduction_axis], keep_dims=True) 297 | # beta = tf.Variable(tf.zeros(params_shape)) 298 | beta = tf.get_variable("beta", shape=params_shape, initializer=tf.zeros_initializer) 299 | # gamma = tf.Variable(tf.ones(params_shape)) 300 | gamma = tf.get_variable("gamma", shape=params_shape, initializer=tf.ones_initializer) 301 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 302 | outputs = gamma * normalized + beta 303 | else: 304 | outputs = inputs 305 | 306 | if activation_fn: 307 | outputs = activation_fn(outputs) 308 | 309 | return outputs 310 | 311 | 312 | 313 | def conv1d(inputs, 314 | filters=None, 315 | size=1, 316 | rate=1, 317 | padding="SAME", 318 | use_bias=False, 319 | activation_fn=None, 320 | scope="conv1d", 321 | reuse=None): 322 | ''' 323 | Args: 324 | inputs: A 3-D tensor with shape of [batch, time, depth]. 325 | filters: An int. Number of outputs (=activation maps) 326 | size: An int. Filter size. 327 | rate: An int. Dilation rate. 328 | padding: Either `same` or `valid` or `causal` (case-insensitive). 329 | use_bias: A boolean. 330 | scope: Optional scope for `variable_scope`. 331 | reuse: Boolean, whether to reuse the weights of a previous layer 332 | by the same name. 333 | 334 | Returns: 335 | A masked tensor of the same shape and dtypes as `inputs`. 336 | ''' 337 | with tf.variable_scope(scope): 338 | if padding.lower()=="causal": 339 | # pre-padding for causality 340 | pad_len = (size - 1) * rate # padding size 341 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 342 | padding = "valid" 343 | 344 | if filters is None: 345 | filters = inputs.get_shape().as_list[-1] 346 | 347 | params = {"inputs":inputs, "filters":filters, "kernel_size":size, 348 | "dilation_rate":rate, "padding":padding, "activation":activation_fn, 349 | "use_bias":use_bias, "reuse":reuse} 350 | 351 | outputs = tf.layers.conv1d(**params) 352 | return outputs 353 | 354 | 355 | def conv1d_banks(inputs, K=16, num_units=None, norm_type=None, is_training=True, scope="conv1d_banks", reuse=None): 356 | '''Applies a series of conv1d separately. 357 | 358 | Args: 359 | inputs: A 3d tensor with shape of [N, T, C] 360 | K: An int. The size of conv1d banks. That is, 361 | The `inputs` are convolved with K filters: 1, 2, ..., K. 362 | is_training: A boolean. 
This is passed to an argument of `batch_normalize`. 363 | 364 | Returns: 365 | A 3d tensor with shape of [N, T, K*Hp.embed_size//2]. 366 | ''' 367 | with tf.variable_scope(scope, reuse=reuse): 368 | outputs = [] 369 | for k in range(1, K+1): 370 | with tf.variable_scope("num_{}".format(k)): 371 | output = conv1d(inputs, num_units, k) 372 | output = normalize(output, type=norm_type, is_training=is_training, activation_fn=tf.nn.relu) 373 | outputs.append(output) 374 | outputs = tf.concat(outputs, -1) 375 | return outputs # (N, T, Hp.embed_size//2*K) 376 | 377 | 378 | def gru(inputs, num_units=None, bidirection=False, seqlens=None, scope="gru", reuse=None): 379 | '''Applies a GRU. 380 | 381 | Args: 382 | inputs: A 3d tensor with shape of [N, T, C]. 383 | num_units: An int. The number of hidden units. 384 | bidirection: A boolean. If True, bidirectional results 385 | are concatenated. 386 | scope: Optional scope for `variable_scope`. 387 | reuse: Boolean, whether to reuse the weights of a previous layer 388 | by the same name. 389 | 390 | Returns: 391 | If bidirection is True, a 3d tensor with shape of [N, T, 2*num_units], 392 | otherwise [N, T, num_units]. 393 | ''' 394 | with tf.variable_scope(scope, reuse=reuse): 395 | if num_units is None: 396 | num_units = inputs.get_shape().as_list[-1] 397 | 398 | cell = tf.contrib.rnn.GRUCell(num_units) 399 | if bidirection: 400 | cell_bw = tf.contrib.rnn.GRUCell(num_units) 401 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell_bw, inputs, 402 | sequence_length=seqlens, 403 | dtype=tf.float32) 404 | return tf.concat(outputs, 2) 405 | else: 406 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, 407 | sequence_length=seqlens, 408 | dtype=tf.float32) 409 | return outputs 410 | 411 | 412 | def attention_decoder(inputs, memory, seqlens=None, num_units=None, scope="attention_decoder", reuse=None): 413 | '''Applies a GRU to `inputs`, while attending `memory`. 414 | Args: 415 | inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs. 416 | memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network. 417 | seqlens: A 1d tensor with shape of [N,], dtype of int32. 418 | num_units: An int. Attention size. 419 | scope: Optional scope for `variable_scope`. 420 | reuse: Boolean, whether to reuse the weights of a previous layer 421 | by the same name. 422 | 423 | Returns: 424 | A 3d tensor with shape of [N, T, num_units]. 425 | ''' 426 | with tf.variable_scope(scope, reuse=reuse): 427 | if num_units is None: 428 | num_units = inputs.get_shape().as_list[-1] 429 | 430 | attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units, 431 | memory, 432 | memory_sequence_length=seqlens, 433 | normalize=True, 434 | probability_fn=tf.nn.softmax) 435 | decoder_cell = tf.contrib.rnn.GRUCell(num_units) 436 | cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, num_units) 437 | outputs, _ = tf.nn.dynamic_rnn(cell_with_attention, inputs, 438 | dtype=tf.float32) #( N, T', 16) 439 | return outputs 440 | 441 | def prenet(inputs, num_units=None, dropout_rate=0., is_training=True, scope="prenet", reuse=None): 442 | '''Prenet for Encoder and Decoder. 443 | Args: 444 | inputs: A 3D tensor of shape [N, T, hp.embed_size]. 445 | is_training: A boolean. 446 | scope: Optional scope for `variable_scope`. 447 | reuse: Boolean, whether to reuse the weights of a previous layer 448 | by the same name. 449 | 450 | Returns: 451 | A 3D tensor of shape [N, T, num_units/2]. 
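    Note: `num_units` is a list of two ints giving the widths of the two dense layers;
    the call sites in this project pass [hidden_units, hidden_units // 2].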
452 | ''' 453 | with tf.variable_scope(scope, reuse=reuse): 454 | outputs = tf.layers.dense(inputs, units=num_units[0], activation=tf.nn.relu, name="dense1") 455 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training, name="dropout1") 456 | outputs = tf.layers.dense(outputs, units=num_units[1], activation=tf.nn.relu, name="dense2") 457 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training, name="dropout2") 458 | 459 | return outputs # (N, T, num_units/2) 460 | 461 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 462 | '''Highway networks, see https://arxiv.org/abs/1505.00387 463 | Args: 464 | inputs: A 3D tensor of shape [N, T, W]. 465 | num_units: An int or `None`. Specifies the number of units in the highway layer 466 | or uses the input size if `None`. 467 | scope: Optional scope for `variable_scope`. 468 | reuse: Boolean, whether to reuse the weights of a previous layer 469 | by the same name. 470 | Returns: 471 | A 3D tensor of shape [N, T, W]. 472 | ''' 473 | if not num_units: 474 | num_units = inputs.get_shape()[-1] 475 | 476 | with tf.variable_scope(scope, reuse=reuse): 477 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 478 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, bias_initializer=tf.constant_initializer(-1.0), name="dense2") 479 | C = 1. - T 480 | outputs = H * T + inputs * C 481 | return outputs 482 | 483 | 484 | def cbhg(input, num_banks, hidden_units, num_highway_blocks, norm_type='bn', is_training=True, scope="cbhg"): 485 | with tf.variable_scope(scope): 486 | out = conv1d_banks(input, 487 | K=num_banks, 488 | num_units=hidden_units, 489 | norm_type=norm_type, 490 | is_training=is_training) # (N, T, K * E / 2) 491 | 492 | out = tf.layers.max_pooling1d(out, 2, 1, padding="same") # (N, T, K * E / 2) 493 | 494 | out = conv1d(out, hidden_units, 3, scope="conv1d_1") # (N, T, E/2) 495 | out = normalize(out, type=norm_type, is_training=is_training, activation_fn=tf.nn.relu) 496 | out = conv1d(out, hidden_units, 3, scope="conv1d_2") # (N, T, E/2) 497 | out += input # (N, T, E/2) # residual connections 498 | 499 | for i in range(num_highway_blocks): 500 | out = highwaynet(out, num_units=hidden_units, 501 | scope='highwaynet_{}'.format(i)) # (N, T, E/2) 502 | 503 | out = gru(out, hidden_units, True) # (N, T, E) 504 | return out 505 | --------------------------------------------------------------------------------