├── part_five ├── part_seven ├── randomCNN-voice-transfer ├── input │ ├── boy.wav │ ├── boy18.wav │ ├── girl.wav │ ├── girl52.wav │ ├── nightcall.wav │ └── stairway.wav ├── picture │ ├── gen.png │ └── purpose.png ├── requirements.txt ├── __pycache__ │ ├── model.cpython-310.pyc │ └── utils.cpython-310.pyc ├── model.py ├── utils.py ├── vctk_identify.py └── train.py ├── requirements ├── part_nine ├── part_six ├── part_two ├── part_four ├── Related articles ├── Explanation about the code and its function ├── part_eight ├── part_one ├── README.md └── part_three /part_five: -------------------------------------------------------------------------------- 1 | # main project 2 | link of the main project : 3 | https://github.com/mazzzystar/randomCNN-voice-transfer 4 | -------------------------------------------------------------------------------- /part_seven: -------------------------------------------------------------------------------- 1 | 2 | # link for new article : 3 | 4 | https://drive.google.com/drive/folders/1TabofkAJbbhmgP8gKo7_izznBbjLz7tL?usp=share_link 5 | 6 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/boy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/boy.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/boy18.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/boy18.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/girl.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/girl.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/picture/gen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/picture/gen.png -------------------------------------------------------------------------------- /randomCNN-voice-transfer/requirements.txt: -------------------------------------------------------------------------------- 1 | pip 2 | scikit-image 3 | librosa 4 | packaging 5 | pandas 6 | soundfile 7 | matplotlib 8 | torch 9 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/girl52.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/girl52.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/nightcall.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/nightcall.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/input/stairway.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/input/stairway.wav -------------------------------------------------------------------------------- /randomCNN-voice-transfer/picture/purpose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/picture/purpose.png -------------------------------------------------------------------------------- /requirements: -------------------------------------------------------------------------------- 1 | requirements for project: 2 | 3 | pip 4 | scikit-image 5 | librosa 6 | packaging 7 | pandas 8 | soundfile 9 | matplotlib 10 | torch 11 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /randomCNN-voice-transfer/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/randomCNN-voice-transfer/HEAD/randomCNN-voice-transfer/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /part_nine: -------------------------------------------------------------------------------- 1 | 2 | ###################### proposal 3 | 4 | You can see my proposal in this section : 5 | 6 | https://drive.google.com/drive/folders/1vkGcnFcfwP9kvRXX2RyyK6kWTH6UHL3W?usp=share_link 7 | 8 | -------------------------------------------------------------------------------- /part_six: -------------------------------------------------------------------------------- 1 | # introducation 2 | 3 | I’m Alireza Ahmadi an engineer in major of biomedical engineering(bioelectric), 4 | I’m master student in south Tehran university. 5 | my research interests are neuroscience and medical signal processing 6 | and I’m so appreciate that in this semester I had a course called digital signal processing 7 | I’m so grateful of my professor dr.Mahdi Eslami that helped us to learn machine learning and digital signal processing 8 | and also helped us to learn more about GitHub and LinkedIn . 9 | https://www.linkedin.com/in/alireza-ahmadi-245214258 10 | 11 | -------------------------------------------------------------------------------- /part_two: -------------------------------------------------------------------------------- 1 | 2 | This is a many-to-one voice conversion system.The main significance of this Innovation work is that we could generate a target speaker's utterances without parallel... 3 | ...data like , or , but only waveforms of the target speaker .To make these parallel datasets needs a lot of effort. 4 | All we need is a number of waveforms of the target speaker's utterances and only a small set of pairs from a number of anonymous speakers. 5 | The model architecture consists of two modules: 6 | Net1 classify someone's utterances to one of phoneme classes at every timestep. 7 | Phonemes are speaker-independent while waveforms are speaker-dependent. 8 | Net2(speech synthesis) synthesize speeches of the target speaker from the phones. 
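A minimal sketch of how the two modules fit together at conversion time (function and variable names here are hypothetical, not taken from the project code): Net1 maps MFCC frames to speaker-independent phoneme posteriors (PPGs), and Net2 maps those posteriors to the target speaker's spectrogram.

    def convert(mfcc_frames, net1, net2):
        ppgs = net1(mfcc_frames)  # speaker-independent phoneme posteriors, one vector per timestep
        spec = net2(ppgs)         # target speaker's spectrogram synthesized from the phoneme sequence
        return spec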
9 | 10 | some information about our net1 : net1 classifies spectrogram to phonemes that consists of 60 English phonemes at every timestep 11 | some information about our net2 : net2 synthesizes the target speaker's speeches. 12 | -------------------------------------------------------------------------------- /part_four: -------------------------------------------------------------------------------- 1 | Subjective evaluation and conclusion 2 | 3 | 4 | This Innovation has the best results among all current works in voice style transfer . 5 | but the cost is that : 6 | 1. Heavy architecture. the architecture is to training 2 networks, 7 | Net one classifier and Net two synthesizer and combine them together 8 | 2. Delicate dataset. Except of using widely known dataset such as TIMIT, 9 | the author used the girls 2 hours audio dataset, and 1,000+ recording of pairs audio speaking the same sentence, 10 | that's maybe unacceptable in reality of training others voice. 11 | 3. Not general. The model was trained only for Kate Winslet's voice transfer. 12 | If we want to transfer to Obama's voice in our project RandomCNN, 13 | we need to gather Obama's voice data and train that network again. 14 | 15 | and here we have : 16 | It seems that to apply temperature to softmax in Net1 is not so meaningful. 17 | net2 can reach to near optimal when Net1 accuracy is correct to some extent. 18 | We have Over 70% test accuracy 19 | Obviously, sample rate, window length and hop length should be same in both Net1 and Net2. 20 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | cuda = True if torch.cuda.is_available() else False 5 | 6 | N_FFT = 512 7 | N_CHANNELS = round(1 + N_FFT/2) 8 | OUT_CHANNELS = 32 9 | 10 | 11 | class RandomCNN(nn.Module): 12 | def __init__(self): 13 | super(RandomCNN, self).__init__() 14 | 15 | # 2-D CNN 16 | self.conv1 = nn.Conv2d(1, OUT_CHANNELS, kernel_size=(3, 1), stride=1, padding=0) 17 | self.LeakyReLU = nn.LeakyReLU(0.2) 18 | 19 | # Set the random parameters to be constant. 20 | weight = torch.randn(self.conv1.weight.data.shape) 21 | self.conv1.weight = torch.nn.Parameter(weight, requires_grad=False) 22 | bias = torch.zeros(self.conv1.bias.data.shape) 23 | self.conv1.bias = torch.nn.Parameter(bias, requires_grad=False) 24 | 25 | def forward(self, x_delta): 26 | out = self.LeakyReLU(self.conv1(x_delta)) 27 | return out 28 | 29 | 30 | """ 31 | a_random = Variable(torch.randn(1, 1, 257, 430)).float() 32 | model = RandomCNN() 33 | a_O = model(a_random) 34 | print(a_O.shape) 35 | """ -------------------------------------------------------------------------------- /Related articles: -------------------------------------------------------------------------------- 1 | here we have some articles that you can download them with links below: 2 | 3 | 1. https://drive.google.com/drive/folders/1wduFYvdny9UDDkXS7Ev0Qyh_7qhr61s4?usp=share_link 4 | 2. https://drive.google.com/drive/folders/1afbiZB7az5DXDCGo9AmsKZ8giEzUIQ3q?usp=share_link 5 | 3. https://drive.google.com/drive/folders/1KOFdnqjnX7XysMcfGHZdCpaeeV2xb83s?usp=share_link 6 | 4. https://drive.google.com/drive/folders/1jdWhGgMbX2G3M_mshHotjCD5s5eiR6P6?usp=share_link 7 | 5. https://drive.google.com/drive/folders/1fYReHwhuA42KvUY3HrDtjHSI0VD6DLmQ?usp=share_link 8 | 6. 
https://drive.google.com/drive/folders/1vvD53GYmyJXV30nYj2gqoZ1W2QBDotG3?usp=share_link 9 | 7. https://drive.google.com/drive/folders/1XMP-dvuAzl82Ji_dZu_Nkya1aSqhw4Jq?usp=share_link 10 | 8. https://drive.google.com/drive/folders/1-EFosYJUeHyYPUz1zek2XO5r5gXVtTxk?usp=share_link 11 | 9. https://drive.google.com/drive/folders/1NWHRjZeSp8YlnIcMF7cdB0eebyViV9Tx?usp=share_link 12 | 10. https://drive.google.com/drive/folders/1-bHiBu1aaU7zGGGtTyEz8zHnBHqcQO8o?usp=share_link 13 | 11. https://drive.google.com/drive/folders/1YzxnbGQh-MsrDDwm5zAlHlWZ_QNE2uqI?usp=share_link 14 | 12. https://drive.google.com/drive/folders/1xJcD_utfkB_Yck4pM5-XIqqFTxu99FB3?usp=share_link 15 | -------------------------------------------------------------------------------- /Explanation about the code and its function: -------------------------------------------------------------------------------- 1 | clc 2 | clear all 3 | close all 4 | [road,fs]=wavread('road.wav'); % loads “the long and winding road” clip % 5 | left=road(:,1); right=road(:,2); 6 | time=(1/fs)*length(left); % Calculate the duration of audio playback % 7 | t=linspace(0,time,length(left)); 8 | plot(t,left) 9 | xlabel('time (sec)'); 10 | ylabel('relative signal strength') 11 | time=(1/44100)*2000; t=linspace(0,time,2000); plot(t,left(1:2000)) xlabel('time (sec)'); ylabel('relative signal strength'); 12 | soundsc(left,fs) % plays left channel as mono soundsc(right,fs) % plays right channel mono sound nearly the same) soundsc(road,fs) % plays stereo (ahhh…) 13 | load gong.mat; 14 | sound(y, Fs); 15 | load handel.mat; 16 | sound(y, 2*Fs); 17 | y=[1;2;3;4;5] y2=flipud(y) left2=flipud(left); soundsc(left2,fs) 18 | leftout=left; % Create a new array for left with the same size % 19 | N=10000; % delay amount N/44100 seconds for n=N+1:length(left) leftout(n)=left(n)+left(n-N); % approximately ¼ second echo End soundsc(left,fs) % original % Wait until the sound stops before moving to next sound command soundsc(leftout,fs) % signal with new echo 20 | out=road; % set up a new array, same size as old one N=10000; % delay amount N/44100 seconds for n=N+1:length(road) out(n,1)=road(n,1)+road(n-N,2); % echo ight-to-left! out(n,2)=road(n,2)+road(n-N,1); % echo left-to-ight! 
end soundsc(road,fs) % original soundsc(out,fs) % echo 21 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie soundsc(hootie,fs/1.5) soundsc(hootie,fs*1.5) 22 | make sound louder/quieter clear all; 23 | Fs = 44100; % sampling frequency 24 | dur = 1; % duration of sound (in sec) 25 | % time vector 26 | t = [0 : 1/Fs : dur-1/Fs]; % 1 second --> length(t) = 44100 27 | % frequency % Create frequency based on sinusoidal function % 28 | freq = 440; 29 | f = sin ( 2*pi * freq * t ); 30 | %%%%%%%%%%%%%%%%%%%%% 31 | % scale sound 32 | amp = .5; 33 | f_amp = amp * f; 34 | sound(f_amp,Fs) 35 | plot( t(1 : 2*round(1/freq*Fs)) , f(1 : 2* round(1/freq*Fs)) ) 36 | hold on 37 | plot( t(1 : 2*round(1/freq*Fs)) , f_amp(1 : 2* round(1/freq*Fs)) 38 | soundsc(left,fs); % Original left channel soundsc(left-right,fs); 39 | soundsc(hootie(:,1),fs); % Original left channel soundsc(hootie(:,1)-hootie(:,2),fs); 40 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie out=hootie; for n=2:length(hootie) out(n,1)=.9*out(n-1,1)+hootie(n,1); % left out(n,2)=.9*out(n-1,2)+hootie(n,2); % right end soundsc(hootie,fs) % original soundsc(out,fs) % 41 | out=hootie; for n=2:length(hootie) out(n,1)=hootie(n,1)-hootie(n-1,1); % left out(n,2)=hootie(n,2)-hootie(n-1,2); % right end soundsc(out,fs) % high pass filtered 42 | [perfectSound, freq] = wavread('road.wav'); N= randn(length(perfectSound), 2); noisySound = perfectSound + N; 43 | -------------------------------------------------------------------------------- /part_eight: -------------------------------------------------------------------------------- 1 | ##################################################################################### 2 | ######################################### 3 | 4 | 5 | links for project : 6 | 1. https://drive.google.com/drive/folders/1QU9YJ6I0IMAg8kIkFsGIqYnreI3-JWwg?usp=share_link 7 | 2. https://drive.google.com/drive/folders/1o5bZ28hl75eHwlEf-SfGKch6km8tYTZG 8 | 3. https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa-K?usp=sharing 9 | 4. 
https://drive.google.com/drive/folders/1xS9PHdGFSvJLAp6z8rSJ2Vps6Ok-7SKh?usp=share_link 10 | #################################################################################### 11 | ############################################## 12 | 13 | 14 | 15 | Explanation about the code and its function : 16 | 17 | clc 18 | clear all 19 | close all 20 | [road,fs]=wavread('road.wav'); % loads “the long and winding road” clip % 21 | left=road(:,1); right=road(:,2); 22 | time=(1/fs)*length(left); % Calculate the duration of audio playback % 23 | t=linspace(0,time,length(left)); 24 | plot(t,left) 25 | xlabel('time (sec)'); 26 | ylabel('relative signal strength') 27 | time=(1/44100)*2000; t=linspace(0,time,2000); plot(t,left(1:2000)) xlabel('time (sec)'); ylabel('relative signal strength'); 28 | soundsc(left,fs) % plays left channel as mono soundsc(right,fs) % plays right channel mono sound nearly the same) soundsc(road,fs) % plays stereo (ahhh…) 29 | load gong.mat; 30 | sound(y, Fs); 31 | load handel.mat; 32 | sound(y, 2*Fs); 33 | y=[1;2;3;4;5] y2=flipud(y) left2=flipud(left); soundsc(left2,fs) 34 | leftout=left; % Create a new array for left with the same size % 35 | N=10000; % delay amount N/44100 seconds for n=N+1:length(left) leftout(n)=left(n)+left(n-N); % approximately ¼ second echo End soundsc(left,fs) % original % Wait until the sound stops before moving to next sound command soundsc(leftout,fs) % signal with new echo 36 | out=road; % set up a new array, same size as old one N=10000; % delay amount N/44100 seconds for n=N+1:length(road) out(n,1)=road(n,1)+road(n-N,2); % echo ight-to-left! out(n,2)=road(n,2)+road(n-N,1); % echo left-to-ight! end soundsc(road,fs) % original soundsc(out,fs) % echo 37 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie soundsc(hootie,fs/1.5) soundsc(hootie,fs*1.5) 38 | make sound louder/quieter clear all; 39 | Fs = 44100; % sampling frequency 40 | dur = 1; % duration of sound (in sec) 41 | % time vector 42 | t = [0 : 1/Fs : dur-1/Fs]; % 1 second --> length(t) = 44100 43 | % frequency % Create frequency based on sinusoidal function % 44 | freq = 440; 45 | f = sin ( 2*pi * freq * t ); 46 | %%%%%%%%%%%%%%%%%%%%% 47 | % scale sound 48 | amp = .5; 49 | f_amp = amp * f; 50 | sound(f_amp,Fs) 51 | plot( t(1 : 2*round(1/freq*Fs)) , f(1 : 2* round(1/freq*Fs)) ) 52 | hold on 53 | plot( t(1 : 2*round(1/freq*Fs)) , f_amp(1 : 2* round(1/freq*Fs)) 54 | soundsc(left,fs); % Original left channel soundsc(left-right,fs); 55 | soundsc(hootie(:,1),fs); % Original left channel soundsc(hootie(:,1)-hootie(:,2),fs); 56 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie out=hootie; for n=2:length(hootie) out(n,1)=.9*out(n-1,1)+hootie(n,1); % left out(n,2)=.9*out(n-1,2)+hootie(n,2); % right end soundsc(hootie,fs) % original soundsc(out,fs) % 57 | out=hootie; for n=2:length(hootie) out(n,1)=hootie(n,1)-hootie(n-1,1); % left out(n,2)=hootie(n,2)-hootie(n-1,2); % right end soundsc(out,fs) % high pass filtered 58 | [perfectSound, freq] = wavread('road.wav'); N= randn(length(perfectSound), 2); noisySound = perfectSound + N; 59 | -------------------------------------------------------------------------------- /part_one: -------------------------------------------------------------------------------- 1 | 2 | Project summary 3 | 4 | Professor Dr. Mahde Eslami 5 | 6 | student Alireza Ahmadi 7 | 8 | South Tehran University 9 | 10 | student number 4011414111015 11 | 12 | This is a summary of the project and the goal of the project : 13 | 14 | Audio style transfer with shallow random parameters CNN . 
15 | Voice style transfer with random CNN : 16 | its maybe the fastest voice style transfer with reasonable result . 17 | the idea of Neural Voice Transfer aims at "using Obama's voice to sing songs of Beyoncé" or something related. 18 | Works that we did in this project : 19 | 1. Use 2-D CONV rather than 1-D for audio spectrogram. 20 | 2. Compute grams over time-axis. 21 | 3. Training fast. 5-10 minutes to train and transfer on 1 single GPU(Tesla P40). 22 | 4.Do not need dataset! You can transfer any 2 pieces of audio. 23 | You can also see the result here : 24 | https://soundcloud.com/mazzzystar/sets/speech-conversion-sample 25 | 26 | 27 | Explanation about the code and its function : 28 | 29 | clc 30 | clear all 31 | close all 32 | [road,fs]=wavread('road.wav'); % loads “the long and winding road” clip % 33 | left=road(:,1); right=road(:,2); 34 | time=(1/fs)*length(left); % Calculate the duration of audio playback % 35 | t=linspace(0,time,length(left)); 36 | plot(t,left) 37 | xlabel('time (sec)'); 38 | ylabel('relative signal strength') 39 | time=(1/44100)*2000; t=linspace(0,time,2000); plot(t,left(1:2000)) xlabel('time (sec)'); ylabel('relative signal strength'); 40 | soundsc(left,fs) % plays left channel as mono soundsc(right,fs) % plays right channel mono sound nearly the same) soundsc(road,fs) % plays stereo (ahhh…) 41 | load gong.mat; 42 | sound(y, Fs); 43 | load handel.mat; 44 | sound(y, 2*Fs); 45 | y=[1;2;3;4;5] y2=flipud(y) left2=flipud(left); soundsc(left2,fs) 46 | leftout=left; % Create a new array for left with the same size % 47 | N=10000; % delay amount N/44100 seconds for n=N+1:length(left) leftout(n)=left(n)+left(n-N); % approximately ¼ second echo End soundsc(left,fs) % original % Wait until the sound stops before moving to next sound command soundsc(leftout,fs) % signal with new echo 48 | out=road; % set up a new array, same size as old one N=10000; % delay amount N/44100 seconds for n=N+1:length(road) out(n,1)=road(n,1)+road(n-N,2); % echo ight-to-left! out(n,2)=road(n,2)+road(n-N,1); % echo left-to-ight! 
end soundsc(road,fs) % original soundsc(out,fs) % echo 49 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie soundsc(hootie,fs/1.5) soundsc(hootie,fs*1.5) 50 | make sound louder/quieter clear all; 51 | Fs = 44100; % sampling frequency 52 | dur = 1; % duration of sound (in sec) 53 | % time vector 54 | t = [0 : 1/Fs : dur-1/Fs]; % 1 second --> length(t) = 44100 55 | % frequency % Create frequency based on sinusoidal function % 56 | freq = 440; 57 | f = sin ( 2*pi * freq * t ); 58 | %%%%%%%%%%%%%%%%%%%%% 59 | % scale sound 60 | amp = .5; 61 | f_amp = amp * f; 62 | sound(f_amp,Fs) 63 | plot( t(1 : 2*round(1/freq*Fs)) , f(1 : 2* round(1/freq*Fs)) ) 64 | hold on 65 | plot( t(1 : 2*round(1/freq*Fs)) , f_amp(1 : 2* round(1/freq*Fs)) 66 | soundsc(left,fs); % Original left channel soundsc(left-right,fs); 67 | soundsc(hootie(:,1),fs); % Original left channel soundsc(hootie(:,1)-hootie(:,2),fs); 68 | [hootie,fs]=wavread('hootie.wav'); % loads Hootie out=hootie; for n=2:length(hootie) out(n,1)=.9*out(n-1,1)+hootie(n,1); % left out(n,2)=.9*out(n-1,2)+hootie(n,2); % right end soundsc(hootie,fs) % original soundsc(out,fs) % 69 | out=hootie; for n=2:length(hootie) out(n,1)=hootie(n,1)-hootie(n-1,1); % left out(n,2)=hootie(n,2)-hootie(n-1,2); % right end soundsc(out,fs) % high pass filtered 70 | [perfectSound, freq] = wavread('road.wav'); N= randn(length(perfectSound), 2); noisySound = perfectSound + N; 71 | 72 | 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # randomCNN-voice-transfer 2 | # Alireza Ahmadi 40114140111015 3 | # digital signal processing 4 | # Voice style transfer with random CNN 5 | Maybe the fastest voice style transfer with reasonable result ? 6 | ## What is voice style transfer? 7 | Inspired by the paper [A Neural Algorithm of Artistic Style](https://arxiv.org/abs/1508.06576) , the idea of `Neural Voice Transfer` aims at "using Obama's voice to sing songs of Beyoncé" or something related. 8 | 9 | We aim to: 10 | 11 | 12 | ## Highlight of my work 13 | * Use **2-D CONV** rather than 1-D for audio spectrogram. 14 | * Compute **grams over time-axis**. 15 | * **Training fast**. 5-10 minutes to train and transfer on 1 single GPU(Tesla P40). 16 | * **Do not need dataset!** You can transfer any 2 pieces of audio.(But some format of audio may occur error, then you should `sudo apt-get install libav-tools`) 17 | 18 | ## Results 19 | **You can listen to my current result now !** It's on soundcloud, [link1](https://soundcloud.com/mazzzystar/sets/stairway2nightcall), [link2](https://soundcloud.com/mazzzystar/sets/speech-conversion-sample). 20 | 21 | The generated spectrogram compared with `content` and `style`. 22 | ![](picture/gen.png) 23 | 24 | Compare the spectrogram of `gen` with `content` and `style`(X axis represents `Time Domain`, Y axis represents `Frequency Domain`), we can find that: 25 | * The structure is almost the same as `content`, and the **gap along frequency axis**, which determines the `voice texture` to a great extent, is more alike to the style. 26 | * The base skeleton is **shifted upward a little bit** for being similar to the style(The style is girl's voice, which has higher frequency than boy's). 27 | 28 | ## Reproduce it yourself 29 | ``` 30 | pip install -r requirements.txt 31 | # remove `CUDA_VISIBLE_DEVICES` when use CPU, though it will be slow. 
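# for example, a CPU-only run on two of the bundled clips (same flags, just no env var;
# -epochs is optional and defaults to 20000 in train.py):
# python train.py -content input/boy.wav -style input/girl.wav -epochs 2000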
32 | CUDA_VISIBLE_DEVICES=0 python train.py -content input/boy18.wav -style input/girl52.wav 33 | ``` 34 | Tips: change `3x1` CONV to `3x3` CONV can get smoother generated spectrogram. 35 | 36 | ### But..does the `gram` of random CNN output really works ? 37 | Below is my experiments result of using `texture gram` after 1-layer RandomCNN to capture speaker identity by putting them as **the only feature** in a simple nearest neighbor speaker identification system. The table shows the result of speaker identification accuracy of this system over the first 15 utterances of 30 first speakers of the VCTK dataset, along with 100 utterances of 4 first speakers. 38 | 39 | | Speakers | Train/Test | Accuracy | 40 | | ------------- |:-------------:| -----:| 41 | | 30 | 270/180 | 45.6%| 42 | | 4 | 240/160 | 92.5% | 43 | 44 | It seems `texture gram along time-axis` really captured something, you can check it by: 45 | ``` 46 | python vctk_identify 47 | ``` 48 | 49 | # main source code = https://github.com/mazzzystar/randomCNN-voice-transfer/blob/master/README.md 50 | # my acount in github = https://github.com/alirezaahmadiii 51 | # linkedin = https://www.linkedin.com/in/alireza-ahmadi-245214258 52 | 53 | # videos that are going to help you to learn more about this project and more about colab and matrix and machine learning : 54 | https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa-K?usp=sharing .1 55 | https://colab.research.google.com/drive/10-8X59ey1gYBU2Uj3s-2fco1cLcKwGul?usp=sharing .2 56 | https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa-K?usp=sharing .3 57 | https://drive.google.com/drive/folders/1o5bZ28hl75eHwlEf-SfGKch6km8tYTZG .4 58 | https://www.aparat.com/v/wPWKh .5 59 | https://www.aparat.com/v/FCtZ4 .6 60 | https://aparat.com/v/rncBI .7 61 | https://aparat.com/v/CzVJn .8 62 | https://aparat.com/v/OZSFB .9 63 | https://www.aparat.com/v/wPWKh .10 64 | https://www.aparat.com/v/FCtZ4 .11 65 | https://www.aparat.com/v/2o9Hb .12 66 | https://www.aparat.com/v/jbf2z .13 67 | https://drive.google.com/drive/folders/1y8bNyDQwvbbm60ih1fMeuu1Apnv76zKC?usp=share_li .14 68 | nk 69 | https://drive.google.com/drive/folders/1Sd_onkfdwq63tsg20tfum7vBUTCaQa- .15 70 | K?usp=share_link 71 | https://drive.google.com/drive/folders/1o5bZ28hl75eHwlEf-SfGKch6km8tYTZG?usp=share_link .16 72 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import torch 4 | import soundfile 5 | from model import * 6 | from packaging import version 7 | 8 | def librosa_write(outfile, x, sr): 9 | if version.parse(librosa.__version__) < version.parse('0.8.0'): 10 | librosa.output.write_wav(outfile, x, sr) 11 | else: 12 | soundfile.write(outfile, x, sr) 13 | 14 | def wav2spectrum(filename): 15 | x, sr = librosa.load(filename) 16 | S = librosa.stft(x, N_FFT) 17 | p = np.angle(S) 18 | 19 | S = np.log1p(np.abs(S)) 20 | return S, sr 21 | 22 | 23 | def spectrum2wav(spectrum, sr, outfile): 24 | # Return the all-zero vector with the same shape of `a_content` 25 | a = np.exp(spectrum) - 1 26 | p = 2 * np.pi * np.random.random_sample(spectrum.shape) - np.pi 27 | for i in range(50): 28 | S = a * np.exp(1j * p) 29 | x = librosa.istft(S) 30 | p = np.angle(librosa.stft(x, N_FFT)) 31 | librosa_write(outfile, x, sr) 32 | 33 | 34 | def wav2spectrum_keep_phase(filename): 35 | x, sr = librosa.load(filename) 36 | S = 
librosa.stft(x, N_FFT) 37 | p = np.angle(S) 38 | 39 | S = np.log1p(np.abs(S)) 40 | return S, p, sr 41 | 42 | 43 | def spectrum2wav_keep_phase(spectrum, p, sr, outfile): 44 | # Return the all-zero vector with the same shape of `a_content` 45 | a = np.exp(spectrum) - 1 46 | for i in range(50): 47 | S = a * np.exp(1j * p) 48 | x = librosa.istft(S) 49 | p = np.angle(librosa.stft(x, N_FFT)) 50 | librosa_write(outfile, x, sr) 51 | 52 | 53 | def compute_content_loss(a_C, a_G): 54 | """ 55 | Compute the content cost 56 | 57 | Arguments: 58 | a_C -- tensor of dimension (1, n_C, n_H, n_W) 59 | a_G -- tensor of dimension (1, n_C, n_H, n_W) 60 | 61 | Returns: 62 | J_content -- scalar that you compute using equation 1 above 63 | """ 64 | m, n_C, n_H, n_W = a_G.shape 65 | 66 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 67 | a_C_unrolled = a_C.view(m * n_C, n_H * n_W) 68 | a_G_unrolled = a_G.view(m * n_C, n_H * n_W) 69 | 70 | # Compute the cost 71 | J_content = 1.0 / (4 * m * n_C * n_H * n_W) * torch.sum((a_C_unrolled - a_G_unrolled) ** 2) 72 | 73 | return J_content 74 | 75 | 76 | def gram(A): 77 | """ 78 | Argument: 79 | A -- matrix of shape (n_C, n_L) 80 | 81 | Returns: 82 | GA -- Gram matrix of shape (n_C, n_C) 83 | """ 84 | GA = torch.matmul(A, A.t()) 85 | 86 | return GA 87 | 88 | 89 | def gram_over_time_axis(A): 90 | """ 91 | Argument: 92 | A -- matrix of shape (1, n_C, n_H, n_W) 93 | 94 | Returns: 95 | GA -- Gram matrix of A along time axis, of shape (n_C, n_C) 96 | """ 97 | m, n_C, n_H, n_W = A.shape 98 | 99 | # Reshape the matrix to the shape of (n_C, n_L) 100 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 101 | A_unrolled = A.view(m * n_C * n_H, n_W) 102 | GA = torch.matmul(A_unrolled, A_unrolled.t()) 103 | 104 | return GA 105 | 106 | 107 | def compute_layer_style_loss(a_S, a_G): 108 | """ 109 | Arguments: 110 | a_S -- tensor of dimension (1, n_C, n_H, n_W) 111 | a_G -- tensor of dimension (1, n_C, n_H, n_W) 112 | 113 | Returns: 114 | J_style_layer -- tensor representing a scalar style cost. 115 | """ 116 | m, n_C, n_H, n_W = a_G.shape 117 | 118 | # Reshape the matrix to the shape of (n_C, n_L) 119 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 120 | 121 | # Calculate the gram 122 | # !!!!!! IMPORTANT !!!!! Here we compute the Gram along n_C, 123 | # not along n_H * n_W. But is the result the same? No. 
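    # gram_over_time_axis reshapes the activations from (1, n_C, n_H, n_W) to (n_C * n_H, n_W)
    # and multiplies them by their own transpose, so the correlations are accumulated over the
    # time axis (n_W) rather than over n_H * n_W as in image style transfer.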
124 | GS = gram_over_time_axis(a_S) 125 | GG = gram_over_time_axis(a_G) 126 | 127 | # Computing the loss 128 | J_style_layer = 1.0 / (4 * (n_C ** 2) * (n_H * n_W)) * torch.sum((GS - GG) ** 2) 129 | 130 | return J_style_layer 131 | 132 | 133 | """ 134 | # Test 135 | test_S = torch.randn(1, 6, 2, 2) 136 | test_G = torch.randn(1, 6, 2, 2) 137 | print(test_S) 138 | print(test_G) 139 | print(compute_layer_style_loss(test_S, test_G)) 140 | 141 | 142 | # Test 143 | test_C = torch.randn(1, 6, 2, 2) 144 | test_G = torch.randn(1, 6, 2, 2) 145 | print(test_C) 146 | print(test_G) 147 | print(compute_content_loss(test_C, test_G)) 148 | """ 149 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/vctk_identify.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import glob 3 | from utils import * 4 | from model import RandomCNN 5 | 6 | 7 | data_path = 'VCTK-Corpus1/' 8 | 9 | randomCNN = RandomCNN() 10 | randomCNN.eval() 11 | if cuda: 12 | randomCNN = randomCNN.cuda() 13 | 14 | 15 | def process_vctk(_data_path, speaker_num=30, each_audio_num=15): 16 | # read label-info 17 | df = pd.read_table(_data_path + 'speaker-info.txt', usecols=['ID'], 18 | index_col=False, delim_whitespace=True) 19 | 20 | # read file IDs 21 | file_ids = [] 22 | for d in [_data_path + 'txt/p%d/' % uid for uid in df.ID.values[:speaker_num]]: 23 | file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt')[:each_audio_num])]) 24 | 25 | audio_lst = [] 26 | for i, f in enumerate(file_ids): 27 | # wave file name 28 | wave_file = _data_path + 'wav48/%s/' % f[:4] + f + '.wav' 29 | fn = wave_file.split('/')[-1].split("_")[0] 30 | print(fn) 31 | # target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy' 32 | # if os.path.exists(target_filename): 33 | # continue 34 | # print info 35 | print("VCTK corpus preprocessing (%d / %d) - '%s']" % (i, len(file_ids), wave_file)) 36 | 37 | # load wave file 38 | spect, sr = wav2spectrum(wave_file) 39 | audio_lst.append((spect, fn)) 40 | 41 | del spect 42 | return audio_lst 43 | 44 | 45 | def compute_loss(a_C, a_G): 46 | """ 47 | Compute the content cost 48 | 49 | Arguments: 50 | a_C -- tensor of dimension (1, n_C, n_H, n_W) 51 | a_G -- tensor of dimension (1, n_C, n_H, n_W) 52 | 53 | Returns: 54 | J_content -- scalar that you compute using equation 1 above 55 | """ 56 | n_H, n_W = a_G.shape 57 | 58 | # Reshape a_C and a_G to the (m * n_C, n_H * n_W) 59 | J_content = 1.0 / (n_H * n_W) * torch.sum((a_C - a_G) ** 2) 60 | 61 | return J_content 62 | 63 | GAP_LEN = 15 64 | TRAIN_LEN = 9 65 | audio_lst = process_vctk(data_path) 66 | print(len(audio_lst)) 67 | 68 | train_lst = [] 69 | test_lst = [] 70 | 71 | count = 0 72 | for item in audio_lst: 73 | if count % GAP_LEN < TRAIN_LEN: 74 | train_lst.append(item) 75 | else: 76 | test_lst.append(item) 77 | count += 1 78 | del audio_lst 79 | print("Train len={}".format(len(train_lst))) 80 | print("Test len={}".format(len(test_lst))) 81 | for item in train_lst[:100]: 82 | print(item[-1]) 83 | for item in test_lst[:100]: 84 | print(item[-1]) 85 | 86 | 87 | def spect2gram(spect_lst): 88 | grams_lst = [] 89 | for item in spect_lst: 90 | audio, no = item[0], item[1] 91 | audio = audio.T 92 | audio_delta = np.zeros(audio.shape) 93 | for i in range(audio.shape[0] - 1): 94 | audio_delta[i] = audio_delta[i+1] - audio_delta[i+1] 95 | 96 | audio = audio.T 97 | audio_delta = audio_delta.T 98 | audio_torch = torch.from_numpy(audio)[None, None, :, :] 99 | 
audio_delta_torch = torch.from_numpy(audio_delta)[None, None, :, :] 100 | audio_delta_var = Variable(audio_delta_torch, requires_grad=False).float() 101 | audio_var = Variable(audio_torch, requires_grad=False).float() 102 | if cuda: 103 | audio_var = audio_var.cuda() 104 | audio_delta_var = audio_delta_var.cuda() 105 | randomCNN_output = randomCNN(audio_var) 106 | gram = gram_over_time_axis(randomCNN_output) 107 | grams_lst.append((gram, no)) 108 | del gram 109 | del randomCNN_output 110 | del audio_torch 111 | del audio_var 112 | del audio 113 | return grams_lst 114 | 115 | 116 | train_grams = spect2gram(train_lst) 117 | print("Train audio nums={}".format(len(train_grams))) 118 | del train_lst 119 | 120 | test_grams = spect2gram(test_lst) 121 | print("Test audio nums={}".format(len(test_grams))) 122 | del test_lst 123 | 124 | 125 | def classifiy(new_gram, no): 126 | MIN_DIS = 100000 127 | MIN_NO = "" 128 | for item in train_grams: 129 | item_gram, item_no = item[0], item[1] 130 | dis = compute_loss(new_gram, item_gram) 131 | if dis.data[0] < MIN_DIS: 132 | MIN_DIS = dis.data[0] 133 | MIN_NO = item_no 134 | del item_gram 135 | return 1 if(MIN_NO == no) else 0 136 | 137 | correct_count = 0 138 | print("Begin to classify.") 139 | for item in test_grams: 140 | gram, no = item[0], item[1] 141 | correct_count += classifiy(gram, no) 142 | precise = float(correct_count) / len(test_grams) 143 | print("test: {}/{}, precise={}".format(correct_count, len(test_grams), precise)) 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /randomCNN-voice-transfer/train.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('agg') 3 | import matplotlib.pyplot as plt 4 | from torch.autograd import Variable 5 | from utils import * 6 | from model import * 7 | import time 8 | import math 9 | import argparse 10 | cuda = True if torch.cuda.is_available() else False 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-content', help='Content input') 14 | parser.add_argument('-content_weight', help='Content weight. Default is 1e2', default = 1e2) 15 | parser.add_argument('-style', help='Style input') 16 | parser.add_argument('-style_weight', help='Style weight. Default is 1', default = 1) 17 | parser.add_argument('-epochs', type=int, help='Number of epoch iterations. Default is 20000', default = 20000) 18 | parser.add_argument('-print_interval', type=int, help='Number of epoch iterations between printing losses', default = 1000) 19 | parser.add_argument('-plot_interval', type=int, help='Number of epoch iterations between plot points', default = 1000) 20 | parser.add_argument('-learning_rate', type=float, default = 0.002) 21 | parser.add_argument('-output', help='Output file name. 
Default is "output"', default = 'output') 22 | args = parser.parse_args() 23 | 24 | 25 | CONTENT_FILENAME = args.content 26 | STYLE_FILENAME = args.style 27 | 28 | a_content, sr = wav2spectrum(CONTENT_FILENAME) 29 | a_style, sr = wav2spectrum(STYLE_FILENAME) 30 | 31 | a_content_torch = torch.from_numpy(a_content)[None, None, :, :] 32 | if cuda: 33 | a_content_torch = a_content_torch.cuda() 34 | print(a_content_torch.shape) 35 | a_style_torch = torch.from_numpy(a_style)[None, None, :, :] 36 | if cuda: 37 | a_style_torch = a_style_torch.cuda() 38 | print(a_style_torch.shape) 39 | 40 | model = RandomCNN() 41 | model.eval() 42 | 43 | a_C_var = Variable(a_content_torch, requires_grad=False).float() 44 | a_S_var = Variable(a_style_torch, requires_grad=False).float() 45 | if cuda: 46 | model = model.cuda() 47 | a_C_var = a_C_var.cuda() 48 | a_S_var = a_S_var.cuda() 49 | 50 | a_C = model(a_C_var) 51 | a_S = model(a_S_var) 52 | 53 | 54 | # Optimizer 55 | learning_rate = args.learning_rate 56 | a_G_var = Variable(torch.randn(a_content_torch.shape) * 1e-3) 57 | if cuda: 58 | a_G_var = a_G_var.cuda() 59 | a_G_var.requires_grad = True 60 | optimizer = torch.optim.Adam([a_G_var]) 61 | 62 | # coefficient of content and style 63 | style_param = args.style_weight 64 | content_param = args.content_weight 65 | 66 | num_epochs = args.epochs 67 | print_every = args.print_interval 68 | plot_every = args.plot_interval 69 | 70 | # Keep track of losses for plotting 71 | current_loss = 0 72 | all_losses = [] 73 | 74 | 75 | def timeSince(since): 76 | now = time.time() 77 | s = now - since 78 | m = math.floor(s / 60) 79 | s -= m * 60 80 | return '%dm %ds' % (m, s) 81 | 82 | 83 | start = time.time() 84 | # Train the Model 85 | for epoch in range(1, num_epochs + 1): 86 | optimizer.zero_grad() 87 | a_G = model(a_G_var) 88 | 89 | content_loss = content_param * compute_content_loss(a_C, a_G) 90 | style_loss = style_param * compute_layer_style_loss(a_S, a_G) 91 | loss = content_loss + style_loss 92 | loss.backward() 93 | optimizer.step() 94 | 95 | # print 96 | if epoch % print_every == 0: 97 | print("{} {}% {} content_loss:{:4f} style_loss:{:4f} total_loss:{:4f}".format(epoch, 98 | epoch / num_epochs * 100, 99 | timeSince(start), 100 | content_loss.item(), 101 | style_loss.item(), loss.item())) 102 | current_loss += loss.item() 103 | 104 | # Add current loss avg to list of losses 105 | if epoch % plot_every == 0: 106 | all_losses.append(current_loss / plot_every) 107 | current_loss = 0 108 | 109 | 110 | gen_spectrum = a_G_var.cpu().data.numpy().squeeze() 111 | gen_audio_C = args.output + ".wav" 112 | spectrum2wav(gen_spectrum, sr, gen_audio_C) 113 | 114 | plt.figure() 115 | plt.plot(all_losses) 116 | plt.savefig('loss_curve.png') 117 | 118 | plt.figure(figsize=(5, 5)) 119 | # we then use the 2nd column. 120 | plt.subplot(1, 1, 1) 121 | plt.title("Content Spectrum") 122 | plt.imsave('Content_Spectrum.png', a_content[:400, :]) 123 | 124 | plt.figure(figsize=(5, 5)) 125 | # we then use the 2nd column. 126 | plt.subplot(1, 1, 1) 127 | plt.title("Style Spectrum") 128 | plt.imsave('Style_Spectrum.png', a_style[:400, :]) 129 | 130 | plt.figure(figsize=(5, 5)) 131 | # we then use the 2nd column. 
132 | plt.subplot(1, 1, 1) 133 | plt.title("CNN Voice Transfer Result") 134 | plt.imsave('Gen_Spectrum.png', gen_spectrum[:400, :]) 135 | -------------------------------------------------------------------------------- /part_three: -------------------------------------------------------------------------------- 1 | 2 | ############################################################# 3 | requierment that we need in first step are: 4 | 5 | tensorflow-gpu >= 1.8 6 | numpy >= 1.11.1 7 | librosa == 0.5.1 8 | joblib == 0.11.0 9 | tensorpack >= 0.8.6 10 | pyyaml 11 | soundfile 12 | pydub 13 | tqdm 14 | 15 | 16 | ############################################################################# 17 | 18 | 19 | ######our firt model that we can write like this : 20 | #### start 21 | 22 | import tensorflow as tf 23 | from tensorpack.graph_builder.model_desc import ModelDesc, InputDesc 24 | from tensorpack.tfutils import ( 25 | get_current_tower_context, optimizer, gradproc) 26 | from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope 27 | 28 | import tensorpack_extension 29 | from data_load import phns 30 | from hparam import hparam as hp 31 | from modules import prenet, cbhg, normalize 32 | 33 | 34 | class Net1(ModelDesc): 35 | def __init__(self): 36 | pass 37 | 38 | def _get_inputs(self): 39 | return [InputDesc(tf.float32, (None, None, hp.default.n_mfcc), 'x_mfccs'), 40 | InputDesc(tf.int32, (None, None,), 'y_ppgs')] 41 | 42 | def _build_graph(self, inputs): 43 | self.x_mfccs, self.y_ppgs = inputs 44 | is_training = get_current_tower_context().is_training 45 | with tf.variable_scope('net1'): 46 | self.ppgs, self.preds, self.logits = self.network(self.x_mfccs, is_training) 47 | self.cost = self.loss() 48 | acc = self.acc() 49 | 50 | # summaries 51 | tf.summary.scalar('net1/train/loss', self.cost) 52 | tf.summary.scalar('net1/train/acc', acc) 53 | 54 | if not is_training: 55 | # summaries 56 | tf.summary.scalar('net1/eval/summ_loss', self.cost) 57 | tf.summary.scalar('net1/eval/summ_acc', acc) 58 | 59 | # for confusion matrix 60 | tf.reshape(self.y_ppgs, shape=(tf.size(self.y_ppgs),), name='net1/eval/y_ppg_1d') 61 | tf.reshape(self.preds, shape=(tf.size(self.preds),), name='net1/eval/pred_ppg_1d') 62 | 63 | def _get_optimizer(self): 64 | lr = tf.get_variable('learning_rate', initializer=hp.train1.lr, trainable=False) 65 | return tf.train.AdamOptimizer(lr) 66 | 67 | @auto_reuse_variable_scope 68 | def network(self, x_mfcc, is_training): 69 | # Pre-net 70 | prenet_out = prenet(x_mfcc, 71 | num_units=[hp.train1.hidden_units, hp.train1.hidden_units // 2], 72 | dropout_rate=hp.train1.dropout_rate, 73 | is_training=is_training) # (N, T, E/2) 74 | 75 | # CBHG 76 | out = cbhg(prenet_out, hp.train1.num_banks, hp.train1.hidden_units // 2, 77 | hp.train1.num_highway_blocks, hp.train1.norm_type, is_training) 78 | 79 | # Final linear projection 80 | logits = tf.layers.dense(out, len(phns)) # (N, T, V) 81 | ppgs = tf.nn.softmax(logits / hp.train1.t, name='ppgs') # (N, T, V) 82 | preds = tf.to_int32(tf.argmax(logits, axis=-1)) # (N, T) 83 | 84 | return ppgs, preds, logits 85 | 86 | def loss(self): 87 | istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfccs, -1))) # indicator: (N, T) 88 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits / hp.train1.t, 89 | labels=self.y_ppgs) 90 | loss *= istarget 91 | loss = tf.reduce_mean(loss) 92 | return loss 93 | 94 | def acc(self): 95 | istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfccs, -1))) # indicator: (N, T) 96 | num_hits = 
tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y_ppgs)) * istarget) 97 | num_targets = tf.reduce_sum(istarget) 98 | acc = num_hits / num_targets 99 | return acc 100 | 101 | 102 | class Net2(ModelDesc): 103 | 104 | def _get_inputs(self): 105 | n_timesteps = (hp.default.duration * hp.default.sr) // hp.default.hop_length + 1 106 | 107 | return [InputDesc(tf.float32, (None, n_timesteps, hp.default.n_mfcc), 'x_mfccs'), 108 | InputDesc(tf.float32, (None, n_timesteps, hp.default.n_fft // 2 + 1), 'y_spec'), 109 | InputDesc(tf.float32, (None, n_timesteps, hp.default.n_mels), 'y_mel'), ] 110 | 111 | def _build_graph(self, inputs): 112 | self.x_mfcc, self.y_spec, self.y_mel = inputs 113 | 114 | is_training = get_current_tower_context().is_training 115 | 116 | # build net1 117 | self.net1 = Net1() 118 | with tf.variable_scope('net1'): 119 | self.ppgs, _, _ = self.net1.network(self.x_mfcc, is_training) 120 | self.ppgs = tf.identity(self.ppgs, name='ppgs') 121 | 122 | # build net2 123 | with tf.variable_scope('net2'): 124 | self.pred_spec, self.pred_mel = self.network(self.ppgs, is_training) 125 | self.pred_spec = tf.identity(self.pred_spec, name='pred_spec') 126 | 127 | self.cost = self.loss() 128 | 129 | # summaries 130 | tf.summary.scalar('net2/train/loss', self.cost) 131 | 132 | if not is_training: 133 | tf.summary.scalar('net2/eval/summ_loss', self.cost) 134 | 135 | def _get_optimizer(self): 136 | gradprocs = [ 137 | tensorpack_extension.FilterGradientVariables('.*net2.*', verbose=False), 138 | gradproc.MapGradient( 139 | lambda grad: tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max)), 140 | gradproc.GlobalNormClip(hp.train2.clip_norm), 141 | # gradproc.PrintGradient(), 142 | # gradproc.CheckGradient(), 143 | ] 144 | lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False) 145 | opt = tf.train.AdamOptimizer(learning_rate=lr) 146 | return optimizer.apply_grad_processors(opt, gradprocs) 147 | 148 | @auto_reuse_variable_scope 149 | def network(self, ppgs, is_training): 150 | # Pre-net 151 | prenet_out = prenet(ppgs, 152 | num_units=[hp.train2.hidden_units, hp.train2.hidden_units // 2], 153 | dropout_rate=hp.train2.dropout_rate, 154 | is_training=is_training) # (N, T, E/2) 155 | 156 | # CBHG1: mel-scale 157 | pred_mel = cbhg(prenet_out, hp.train2.num_banks, hp.train2.hidden_units // 2, 158 | hp.train2.num_highway_blocks, hp.train2.norm_type, is_training, 159 | scope="cbhg_mel") 160 | pred_mel = tf.layers.dense(pred_mel, self.y_mel.shape[-1], name='pred_mel') # (N, T, n_mels) 161 | 162 | # CBHG2: linear-scale 163 | pred_spec = tf.layers.dense(pred_mel, hp.train2.hidden_units // 2) # (N, T, n_mels) 164 | pred_spec = cbhg(pred_spec, hp.train2.num_banks, hp.train2.hidden_units // 2, 165 | hp.train2.num_highway_blocks, hp.train2.norm_type, is_training, scope="cbhg_linear") 166 | pred_spec = tf.layers.dense(pred_spec, self.y_spec.shape[-1], name='pred_spec') # log magnitude: (N, T, 1+n_fft//2) 167 | 168 | return pred_spec, pred_mel 169 | 170 | def loss(self): 171 | loss_spec = tf.reduce_mean(tf.squared_difference(self.pred_spec, self.y_spec)) 172 | loss_mel = tf.reduce_mean(tf.squared_difference(self.pred_mel, self.y_mel)) 173 | loss = loss_spec + loss_mel 174 | return loss 175 | 176 | 177 | 178 | ####################################################################################################################### 179 | ################### on the other hand we can apply tensorpack gradproc############################################ 180 | 181 | 182 | 
second change that we can make : 183 | but its much harder and it needs more codes that we should write: 184 | ############################################################################ 185 | 186 | from __future__ import print_function 187 | 188 | import tensorflow as tf 189 | 190 | 191 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 192 | '''Embeds a given tensor. 193 | 194 | Args: 195 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 196 | to be looked up in `lookup table`. 197 | vocab_size: An int. Vocabulary size. 198 | num_units: An int. Number of embedding hidden units. 199 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 200 | should be constant zeros. 201 | scope: Optional scope for `variable_scope`. 202 | reuse: Boolean, whether to reuse the weights of a previous layer 203 | by the same name. 204 | 205 | Returns: 206 | A `Tensor` with one more rank than inputs's. The last dimesionality 207 | should be `num_units`. 208 | ''' 209 | with tf.variable_scope(scope, reuse=reuse): 210 | lookup_table = tf.get_variable('lookup_table', 211 | dtype=tf.float32, 212 | shape=[vocab_size, num_units], 213 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01)) 214 | if zero_pad: 215 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 216 | lookup_table[1:, :]), 0) 217 | return tf.nn.embedding_lookup(lookup_table, inputs) 218 | 219 | def normalize(inputs, 220 | type="bn", 221 | decay=.999, 222 | epsilon=1e-8, 223 | is_training=True, 224 | reuse=None, 225 | activation_fn=None, 226 | scope="normalize"): 227 | '''Applies {batch|layer} normalization. 228 | 229 | Args: 230 | inputs: A tensor with 2 or more dimensions, where the first dimension has 231 | `batch_size`. If type is `bn`, the normalization is over all but 232 | the last dimension. Or if type is `ln`, the normalization is over 233 | the last dimension. Note that this is different from the native 234 | `tf.contrib.layers.batch_norm`. For this I recommend you change 235 | a line in ``tensorflow/contrib/layers/python/layers/layer.py` 236 | as follows. 237 | Before: mean, variance = nn.moments(inputs, axis, keep_dims=True) 238 | After: mean, variance = nn.moments(inputs, [-1], keep_dims=True) 239 | type: A string. Either "bn" or "ln". 240 | decay: Decay for the moving average. Reasonable values for `decay` are close 241 | to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. 242 | Lower `decay` value (recommend trying `decay`=0.9) if model experiences 243 | reasonably good training performance but poor validation and/or test 244 | performance. 245 | is_training: Whether or not the layer is in training mode. W 246 | activation_fn: Activation function. 247 | scope: Optional scope for `variable_scope`. 248 | 249 | Returns: 250 | A tensor with the same shape and data dtype as `inputs`. 251 | ''' 252 | if type=="bn": 253 | inputs_shape = inputs.get_shape() 254 | inputs_rank = inputs_shape.ndims 255 | 256 | # use fused batch norm if inputs_rank in [2, 3, 4] as it is much faster. 257 | # pay attention to the fact that fused_batch_norm requires shape to be rank 4 of NHWC. 
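        # rank-2 and rank-3 inputs are therefore expanded to rank 4 below and squeezed back
        # to their original shape once the batch norm has been applied.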
258 | if inputs_rank in [2, 3, 4]: 259 | if inputs_rank==2: 260 | inputs = tf.expand_dims(inputs, axis=1) 261 | inputs = tf.expand_dims(inputs, axis=2) 262 | elif inputs_rank==3: 263 | inputs = tf.expand_dims(inputs, axis=1) 264 | 265 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 266 | decay=decay, 267 | center=True, 268 | scale=True, 269 | updates_collections=None, 270 | is_training=is_training, 271 | scope=scope, 272 | zero_debias_moving_mean=True, 273 | fused=True, 274 | reuse=reuse) 275 | # restore original shape 276 | if inputs_rank==2: 277 | outputs = tf.squeeze(outputs, axis=[1, 2]) 278 | elif inputs_rank==3: 279 | outputs = tf.squeeze(outputs, axis=1) 280 | else: # fallback to naive batch norm 281 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 282 | decay=decay, 283 | center=True, 284 | scale=True, 285 | updates_collections=None, 286 | is_training=is_training, 287 | scope=scope, 288 | reuse=reuse, 289 | fused=False) 290 | elif type in ("ln", "ins"): 291 | reduction_axis = -1 if type=="ln" else 1 292 | with tf.variable_scope(scope, reuse=reuse): 293 | inputs_shape = inputs.get_shape() 294 | params_shape = inputs_shape[-1:] 295 | 296 | mean, variance = tf.nn.moments(inputs, [reduction_axis], keep_dims=True) 297 | # beta = tf.Variable(tf.zeros(params_shape)) 298 | beta = tf.get_variable("beta", shape=params_shape, initializer=tf.zeros_initializer) 299 | # gamma = tf.Variable(tf.ones(params_shape)) 300 | gamma = tf.get_variable("gamma", shape=params_shape, initializer=tf.ones_initializer) 301 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 302 | outputs = gamma * normalized + beta 303 | else: 304 | outputs = inputs 305 | 306 | if activation_fn: 307 | outputs = activation_fn(outputs) 308 | 309 | return outputs 310 | 311 | 312 | 313 | def conv1d(inputs, 314 | filters=None, 315 | size=1, 316 | rate=1, 317 | padding="SAME", 318 | use_bias=False, 319 | activation_fn=None, 320 | scope="conv1d", 321 | reuse=None): 322 | ''' 323 | Args: 324 | inputs: A 3-D tensor with shape of [batch, time, depth]. 325 | filters: An int. Number of outputs (=activation maps) 326 | size: An int. Filter size. 327 | rate: An int. Dilation rate. 328 | padding: Either `same` or `valid` or `causal` (case-insensitive). 329 | use_bias: A boolean. 330 | scope: Optional scope for `variable_scope`. 331 | reuse: Boolean, whether to reuse the weights of a previous layer 332 | by the same name. 333 | 334 | Returns: 335 | A masked tensor of the same shape and dtypes as `inputs`. 336 | ''' 337 | with tf.variable_scope(scope): 338 | if padding.lower()=="causal": 339 | # pre-padding for causality 340 | pad_len = (size - 1) * rate # padding size 341 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 342 | padding = "valid" 343 | 344 | if filters is None: 345 | filters = inputs.get_shape().as_list[-1] 346 | 347 | params = {"inputs":inputs, "filters":filters, "kernel_size":size, 348 | "dilation_rate":rate, "padding":padding, "activation":activation_fn, 349 | "use_bias":use_bias, "reuse":reuse} 350 | 351 | outputs = tf.layers.conv1d(**params) 352 | return outputs 353 | 354 | 355 | def conv1d_banks(inputs, K=16, num_units=None, norm_type=None, is_training=True, scope="conv1d_banks", reuse=None): 356 | '''Applies a series of conv1d separately. 357 | 358 | Args: 359 | inputs: A 3d tensor with shape of [N, T, C] 360 | K: An int. The size of conv1d banks. That is, 361 | The `inputs` are convolved with K filters: 1, 2, ..., K. 362 | is_training: A boolean. 
This is passed to an argument of `batch_normalize`. 363 | 364 | Returns: 365 | A 3d tensor with shape of [N, T, K*Hp.embed_size//2]. 366 | ''' 367 | with tf.variable_scope(scope, reuse=reuse): 368 | outputs = [] 369 | for k in range(1, K+1): 370 | with tf.variable_scope("num_{}".format(k)): 371 | output = conv1d(inputs, num_units, k) 372 | output = normalize(output, type=norm_type, is_training=is_training, activation_fn=tf.nn.relu) 373 | outputs.append(output) 374 | outputs = tf.concat(outputs, -1) 375 | return outputs # (N, T, Hp.embed_size//2*K) 376 | 377 | 378 | def gru(inputs, num_units=None, bidirection=False, seqlens=None, scope="gru", reuse=None): 379 | '''Applies a GRU. 380 | 381 | Args: 382 | inputs: A 3d tensor with shape of [N, T, C]. 383 | num_units: An int. The number of hidden units. 384 | bidirection: A boolean. If True, bidirectional results 385 | are concatenated. 386 | scope: Optional scope for `variable_scope`. 387 | reuse: Boolean, whether to reuse the weights of a previous layer 388 | by the same name. 389 | 390 | Returns: 391 | If bidirection is True, a 3d tensor with shape of [N, T, 2*num_units], 392 | otherwise [N, T, num_units]. 393 | ''' 394 | with tf.variable_scope(scope, reuse=reuse): 395 | if num_units is None: 396 | num_units = inputs.get_shape().as_list[-1] 397 | 398 | cell = tf.contrib.rnn.GRUCell(num_units) 399 | if bidirection: 400 | cell_bw = tf.contrib.rnn.GRUCell(num_units) 401 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell_bw, inputs, 402 | sequence_length=seqlens, 403 | dtype=tf.float32) 404 | return tf.concat(outputs, 2) 405 | else: 406 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, 407 | sequence_length=seqlens, 408 | dtype=tf.float32) 409 | return outputs 410 | 411 | 412 | def attention_decoder(inputs, memory, seqlens=None, num_units=None, scope="attention_decoder", reuse=None): 413 | '''Applies a GRU to `inputs`, while attending `memory`. 414 | Args: 415 | inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs. 416 | memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network. 417 | seqlens: A 1d tensor with shape of [N,], dtype of int32. 418 | num_units: An int. Attention size. 419 | scope: Optional scope for `variable_scope`. 420 | reuse: Boolean, whether to reuse the weights of a previous layer 421 | by the same name. 422 | 423 | Returns: 424 | A 3d tensor with shape of [N, T, num_units]. 425 | ''' 426 | with tf.variable_scope(scope, reuse=reuse): 427 | if num_units is None: 428 | num_units = inputs.get_shape().as_list[-1] 429 | 430 | attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units, 431 | memory, 432 | memory_sequence_length=seqlens, 433 | normalize=True, 434 | probability_fn=tf.nn.softmax) 435 | decoder_cell = tf.contrib.rnn.GRUCell(num_units) 436 | cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, num_units) 437 | outputs, _ = tf.nn.dynamic_rnn(cell_with_attention, inputs, 438 | dtype=tf.float32) #( N, T', 16) 439 | return outputs 440 | 441 | def prenet(inputs, num_units=None, dropout_rate=0., is_training=True, scope="prenet", reuse=None): 442 | '''Prenet for Encoder and Decoder. 443 | Args: 444 | inputs: A 3D tensor of shape [N, T, hp.embed_size]. 445 | is_training: A boolean. 446 | scope: Optional scope for `variable_scope`. 447 | reuse: Boolean, whether to reuse the weights of a previous layer 448 | by the same name. 449 | 450 | Returns: 451 | A 3D tensor of shape [N, T, num_units/2]. 
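    Note: `num_units` is a list of two ints giving the widths of the two dense layers;
    the call sites in this project pass [hidden_units, hidden_units // 2].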
452 | ''' 453 | with tf.variable_scope(scope, reuse=reuse): 454 | outputs = tf.layers.dense(inputs, units=num_units[0], activation=tf.nn.relu, name="dense1") 455 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training, name="dropout1") 456 | outputs = tf.layers.dense(outputs, units=num_units[1], activation=tf.nn.relu, name="dense2") 457 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training, name="dropout2") 458 | 459 | return outputs # (N, T, num_units/2) 460 | 461 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 462 | '''Highway networks, see https://arxiv.org/abs/1505.00387 463 | Args: 464 | inputs: A 3D tensor of shape [N, T, W]. 465 | num_units: An int or `None`. Specifies the number of units in the highway layer 466 | or uses the input size if `None`. 467 | scope: Optional scope for `variable_scope`. 468 | reuse: Boolean, whether to reuse the weights of a previous layer 469 | by the same name. 470 | Returns: 471 | A 3D tensor of shape [N, T, W]. 472 | ''' 473 | if not num_units: 474 | num_units = inputs.get_shape()[-1] 475 | 476 | with tf.variable_scope(scope, reuse=reuse): 477 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 478 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, bias_initializer=tf.constant_initializer(-1.0), name="dense2") 479 | C = 1. - T 480 | outputs = H * T + inputs * C 481 | return outputs 482 | 483 | 484 | def cbhg(input, num_banks, hidden_units, num_highway_blocks, norm_type='bn', is_training=True, scope="cbhg"): 485 | with tf.variable_scope(scope): 486 | out = conv1d_banks(input, 487 | K=num_banks, 488 | num_units=hidden_units, 489 | norm_type=norm_type, 490 | is_training=is_training) # (N, T, K * E / 2) 491 | 492 | out = tf.layers.max_pooling1d(out, 2, 1, padding="same") # (N, T, K * E / 2) 493 | 494 | out = conv1d(out, hidden_units, 3, scope="conv1d_1") # (N, T, E/2) 495 | out = normalize(out, type=norm_type, is_training=is_training, activation_fn=tf.nn.relu) 496 | out = conv1d(out, hidden_units, 3, scope="conv1d_2") # (N, T, E/2) 497 | out += input # (N, T, E/2) # residual connections 498 | 499 | for i in range(num_highway_blocks): 500 | out = highwaynet(out, num_units=hidden_units, 501 | scope='highwaynet_{}'.format(i)) # (N, T, E/2) 502 | 503 | out = gru(out, hidden_units, True) # (N, T, E) 504 | return out 505 | --------------------------------------------------------------------------------