├── output1.jpg ├── output2.jpg ├── proposal-zahra karbalaei mohammadi.pdf ├── data ├── __init__.py ├── dataset.py ├── infinite_dataloader.py └── wav2mel.py ├── Comparison table of advantages and disadvantages of the method used in 10 articles with similar topics.pdf ├── config.yaml ├── inference.py ├── preprocess.py ├── train.py ├── README by zahra karbalaei mohammadi ├── README.md └── model.py /output1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/output1.jpg -------------------------------------------------------------------------------- /output2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/output2.jpg -------------------------------------------------------------------------------- /proposal-zahra karbalaei mohammadi.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/proposal-zahra karbalaei mohammadi.pdf -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import SpeakerDataset 2 | from .infinite_dataloader import InfiniteDataLoader, infinite_iterator 3 | from .wav2mel import Wav2Mel 4 | -------------------------------------------------------------------------------- /Comparison table of advantages and disadvantages of the method used in 10 articles with similar topics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/Comparison table of advantages and disadvantages of the method used in 10 articles with similar topics.pdf -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | Model: 2 | SpeakerEncoder: 3 | c_in: 80 4 | c_h: 128 5 | c_out: 128 6 | kernel_size: 5 7 | bank_size: 8 8 | bank_scale: 1 9 | c_bank: 128 10 | n_conv_blocks: 6 11 | n_dense_blocks: 6 12 | subsample: [1, 2, 1, 2, 1, 2] 13 | act: "relu" 14 | dropout_rate: 0.0 15 | ContentEncoder: 16 | c_in: 80 17 | c_h: 128 18 | c_out: 128 19 | kernel_size: 5 20 | bank_size: 8 21 | bank_scale: 1 22 | c_bank: 128 23 | n_conv_blocks: 6 24 | subsample: [1, 2, 1, 2, 1, 2] 25 | act: "relu" 26 | dropout_rate: 0.0 27 | Decoder: 28 | c_in: 128 29 | c_cond: 128 30 | c_h: 128 31 | c_out: 80 32 | kernel_size: 5 33 | n_conv_blocks: 6 34 | upsample: [2, 1, 2, 1, 2, 1] 35 | act: "relu" 36 | sn: False 37 | dropout_rate: 0.0 38 | Optimizer: 39 | lr: 0.0005 40 | beta1: 0.9 41 | beta2: 0.999 42 | amsgrad: True 43 | weight_decay: 0.0001 44 | grad_norm: 5 45 | Lambda: 46 | rec: 10 47 | kl: 1 48 | kl_annealing: 20000 49 | -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class SpeakerDataset(Dataset): 10 | def __init__(self, data_dir, segment=128, n_uttrs=4): 11 | self.data_dir = data_dir 12 | self.meta_data = json.load(open(os.path.join(data_dir, "metadata.json"), "r")) 13 | self.id2spk = list(self.meta_data.keys()) 14 | self.segment = segment 15 | 
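# `segment` is the number of mel frames cropped from each utterance and `n_uttrs` is the
# number of utterances sampled per speaker; __getitem__ therefore returns a tensor of
# shape (n_uttrs, n_mels, segment) for the speaker at the given index.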
self.n_uttrs = n_uttrs 16 | 17 | def __len__(self): 18 | return len(self.meta_data) # num_speakers 19 | 20 | def __getitem__(self, index): 21 | spk = self.id2spk[index] 22 | mel_files = random.sample(self.meta_data[spk], k=self.n_uttrs) 23 | mels = [torch.load(os.path.join(self.data_dir, file)) for file in mel_files] 24 | starts = [random.randint(0, m.shape[-1] - self.segment) for m in mels] 25 | mels = torch.stack( 26 | [m[:, start : (start + self.segment)] for (m, start) in zip(mels, starts)] 27 | ) 28 | return mels 29 | -------------------------------------------------------------------------------- /data/infinite_dataloader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class InfiniteDataLoader(torch.utils.data.DataLoader): 5 | def __init__(self, *args, **kwargs): 6 | super().__init__(*args, **kwargs) 7 | self._DataLoader__initialized = False 8 | self.batch_sampler = _RepeatSampler(self.batch_sampler) 9 | self._DataLoader__initialized = True 10 | self.iterator = super().__iter__() 11 | 12 | def __len__(self): 13 | return len(self.batch_sampler.sampler) 14 | 15 | def __iter__(self): 16 | for _ in range(len(self)): 17 | yield next(self.iterator) 18 | 19 | 20 | class _RepeatSampler(object): 21 | """Sampler that repeats forever. 22 | Args: 23 | sampler (Sampler) 24 | """ 25 | 26 | def __init__(self, sampler): 27 | self.sampler = sampler 28 | 29 | def __iter__(self): 30 | while True: 31 | yield from iter(self.sampler) 32 | 33 | 34 | def infinite_iterator(dataloader): 35 | """Infinitely yield a batch of data.""" 36 | while True: 37 | for batch in iter(dataloader): 38 | yield batch 39 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import soundfile as sf 4 | import torch 5 | import torchaudio 6 | 7 | from data import Wav2Mel 8 | 9 | 10 | def main( 11 | model_path: str, 12 | vocoder_path: str, 13 | source: str, 14 | target: str, 15 | output: str, 16 | ): 17 | device = "cuda" if torch.cuda.is_available() else "cpu" 18 | model = torch.jit.load(model_path).to(device) 19 | vocoder = torch.jit.load(vocoder_path).to(device) 20 | wav2mel = Wav2Mel() 21 | 22 | src, src_sr = torchaudio.load(source) 23 | tgt, tgt_sr = torchaudio.load(target) 24 | 25 | src = wav2mel(src, src_sr)[None, :].to(device) 26 | tgt = wav2mel(tgt, tgt_sr)[None, :].to(device) 27 | 28 | cvt = model.inference(src, tgt) 29 | 30 | with torch.no_grad(): 31 | wav = vocoder.generate([cvt.squeeze(0).data.T]) 32 | 33 | wav = wav[0].data.cpu().numpy() 34 | sf.write(output, wav, wav2mel.sample_rate) 35 | 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("model_path", type=str) 40 | parser.add_argument("vocoder_path", type=str) 41 | parser.add_argument("source", type=str) 42 | parser.add_argument("target", type=str) 43 | parser.add_argument("output", type=str) 44 | main(**vars(parser.parse_args())) 45 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from functools import partial 5 | from uuid import uuid4 6 | 7 | import librosa 8 | import torch 9 | import torch.multiprocessing as mp 10 | import torch.nn as nn 11 | import torchaudio 12 | from torch import Tensor 13 | from tqdm.auto import 
tqdm 14 | 15 | from data.wav2mel import Wav2Mel 16 | 17 | 18 | def process_files(audio_file: str, wav2mel: nn.Module) -> Tensor: 19 | speech_tensor, sample_rate = torchaudio.load(audio_file) 20 | mel_tensor = wav2mel(speech_tensor, sample_rate) 21 | 22 | return mel_tensor 23 | 24 | 25 | def main(data_dir: str, save_dir: str, segment: int): 26 | mp.set_sharing_strategy("file_system") 27 | os.makedirs(save_dir, exist_ok=True) 28 | wav2mel = Wav2Mel() 29 | file2mel = partial(process_files, wav2mel=wav2mel) 30 | 31 | meta_data = {} 32 | speakers = sorted(os.listdir(data_dir)) 33 | 34 | for spk in tqdm(speakers): 35 | spk_dir = os.path.join(data_dir, spk) 36 | wav_files = librosa.util.find_files(spk_dir) 37 | mels = [file2mel(wav_file) for wav_file in wav_files] 38 | mels = list(filter(lambda x: x is not None and x.shape[-1] > segment, mels)) 39 | rnd_paths = [f"{uuid4().hex}.pt" for _ in range(len(mels))] 40 | dummy = [ 41 | torch.save(mel, os.path.join(save_dir, path)) 42 | for (mel, path) in zip(mels, rnd_paths) 43 | ] 44 | meta_data[spk] = rnd_paths 45 | 46 | with open(os.path.join(save_dir, "metadata.json"), "w") as f: 47 | json.dump(meta_data, f, indent=4) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("data_dir", type=str) 53 | parser.add_argument("save_dir", type=str) 54 | parser.add_argument("--segment", type=int, default=128) 55 | main(**vars(parser.parse_args())) 56 | -------------------------------------------------------------------------------- /data/wav2mel.py: -------------------------------------------------------------------------------- 1 | """Wav2Mel for processing audio data.""" 2 | 3 | import torch 4 | from torchaudio.sox_effects import apply_effects_tensor 5 | from torchaudio.transforms import MelSpectrogram 6 | 7 | 8 | class Wav2Mel(torch.nn.Module): 9 | """Transform audio file into mel spectrogram tensors.""" 10 | 11 | def __init__( 12 | self, 13 | sample_rate: float = 16000, 14 | norm_db: float = -3.0, 15 | sil_threshold: float = 1.0, 16 | sil_duration: float = 0.1, 17 | fft_window_ms: float = 50.0, 18 | fft_hop_ms: float = 12.5, 19 | n_fft: int = 2048, 20 | f_min: float = 50.0, 21 | n_mels: int = 80, 22 | preemph: float = 0.97, 23 | ref_db: float = 20.0, 24 | dc_db: float = 100.0, 25 | ): 26 | super().__init__() 27 | 28 | self.sample_rate = sample_rate 29 | self.norm_db = norm_db 30 | self.sil_threshold = sil_threshold 31 | self.sil_duration = sil_duration 32 | self.fft_window_ms = fft_window_ms 33 | self.fft_hop_ms = fft_hop_ms 34 | self.n_fft = n_fft 35 | self.f_min = f_min 36 | self.n_mels = n_mels 37 | self.preemph = preemph 38 | self.ref_db = ref_db 39 | self.dc_db = dc_db 40 | 41 | self.sox_effects = SoxEffects(sample_rate, norm_db, sil_threshold, sil_duration) 42 | self.log_melspectrogram = LogMelspectrogram( 43 | sample_rate, 44 | fft_window_ms, 45 | fft_hop_ms, 46 | n_fft, 47 | f_min, 48 | n_mels, 49 | preemph, 50 | ref_db, 51 | dc_db, 52 | ) 53 | 54 | def forward(self, wav_tensor: torch.Tensor, sample_rate: int) -> torch.Tensor: 55 | wav_tensor = self.sox_effects(wav_tensor, sample_rate) 56 | if wav_tensor.numel() == 0: 57 | return None 58 | mel_tensor = self.log_melspectrogram(wav_tensor) 59 | return mel_tensor 60 | 61 | 62 | class SoxEffects(torch.nn.Module): 63 | """Transform waveform tensors.""" 64 | 65 | def __init__( 66 | self, 67 | sample_rate: int, 68 | norm_db: float, 69 | sil_threshold: float, 70 | sil_duration: float, 71 | ): 72 | super().__init__() 73 | self.effects = [ 74 | ["channels", 
"1"], # convert to mono 75 | ["rate", f"{sample_rate}"], # resample 76 | ["norm", f"{norm_db}"], # normalize to -3 dB 77 | [ 78 | "silence", 79 | "1", 80 | f"{sil_duration}", 81 | f"{sil_threshold}%", 82 | "-1", 83 | f"{sil_duration}", 84 | f"{sil_threshold}%", 85 | ], # remove silence throughout the file 86 | ] 87 | 88 | def forward(self, wav_tensor: torch.Tensor, sample_rate: int) -> torch.Tensor: 89 | wav_tensor, _ = apply_effects_tensor(wav_tensor, sample_rate, self.effects) 90 | return wav_tensor 91 | 92 | 93 | class LogMelspectrogram(torch.nn.Module): 94 | """Transform waveform tensors into log mel spectrogram tensors.""" 95 | 96 | def __init__( 97 | self, 98 | sample_rate: float, 99 | fft_window_ms: float, 100 | fft_hop_ms: float, 101 | n_fft: int, 102 | f_min: float, 103 | n_mels: int, 104 | preemph: float, 105 | ref_db: float, 106 | dc_db: float, 107 | ): 108 | super().__init__() 109 | self.melspectrogram = MelSpectrogram( 110 | sample_rate=sample_rate, 111 | win_length=int(sample_rate * fft_window_ms / 1000), 112 | hop_length=int(sample_rate * fft_hop_ms / 1000), 113 | n_fft=n_fft, 114 | f_min=f_min, 115 | n_mels=n_mels, 116 | ) 117 | self.preemph = preemph 118 | self.ref_db = ref_db 119 | self.dc_db = dc_db 120 | 121 | def forward(self, wav_tensor: torch.Tensor) -> torch.Tensor: 122 | # preemph 123 | wav_tensor = torch.cat( 124 | ( 125 | wav_tensor[:, 0].unsqueeze(-1), 126 | wav_tensor[:, 1:] - self.preemph * wav_tensor[:, :-1], 127 | ), 128 | dim=-1, 129 | ) 130 | mel_tensor = self.melspectrogram(wav_tensor).squeeze(0) # (n_mels, time) 131 | mel_tensor = 20 * mel_tensor.clamp(min=1e-9).log10() 132 | mel_tensor = (mel_tensor - self.ref_db + self.dc_db) / self.dc_db 133 | return mel_tensor 134 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | import yaml 7 | from torch.utils.data import random_split 8 | from torch.utils.tensorboard import SummaryWriter 9 | from tqdm.auto import trange 10 | 11 | from data import InfiniteDataLoader, SpeakerDataset, infinite_iterator 12 | from model import AdaINVC 13 | 14 | 15 | def main( 16 | config_file: str, 17 | data_dir: str, 18 | save_dir: str, 19 | n_steps: int, 20 | save_steps: int, 21 | log_steps: int, 22 | n_spks: int, 23 | n_uttrs: int, 24 | ): 25 | device = "cuda" if torch.cuda.is_available() else "cpu" 26 | torch.backends.cudnn.benchmark = True 27 | 28 | # Load config 29 | config = yaml.load(open(config_file, "r"), Loader=yaml.FullLoader) 30 | 31 | # Prepare data 32 | data = SpeakerDataset(data_dir, segment=128, n_uttrs=n_uttrs) 33 | 34 | # split train/valid sets 35 | train_set, valid_set = random_split( 36 | data, [int(len(data) * 0.8), len(data) - int(len(data) * 0.8)] 37 | ) 38 | 39 | # construct loader 40 | train_loader = InfiniteDataLoader( 41 | train_set, batch_size=n_spks, shuffle=True, num_workers=8 42 | ) 43 | valid_loader = InfiniteDataLoader( 44 | valid_set, batch_size=n_spks, shuffle=True, num_workers=8 45 | ) 46 | 47 | # construct iterator 48 | train_iter = infinite_iterator(train_loader) 49 | valid_iter = infinite_iterator(valid_loader) 50 | 51 | # Build model 52 | model = AdaINVC(config["Model"]).to(device) 53 | model = torch.jit.script(model) 54 | 55 | # Optimizer 56 | opt = torch.optim.Adam( 57 | model.parameters(), 58 | lr=config["Optimizer"]["lr"], 59 | betas=(config["Optimizer"]["beta1"], 
config["Optimizer"]["beta2"]), 60 | amsgrad=config["Optimizer"]["amsgrad"], 61 | weight_decay=config["Optimizer"]["weight_decay"], 62 | ) 63 | 64 | # Tensorboard logger 65 | writer = SummaryWriter(save_dir) 66 | criterion = nn.L1Loss() 67 | pbar = trange(n_steps, ncols=0) 68 | valid_steps = 32 69 | 70 | for step in pbar: 71 | # get features 72 | org_mels = next(train_iter) 73 | org_mels = org_mels.flatten(0, 1) 74 | org_mels = org_mels.to(device) 75 | 76 | # reconstruction 77 | mu, log_sigma, emb, rec_mels = model(org_mels) 78 | 79 | # compute loss 80 | rec_loss = criterion(rec_mels, org_mels) 81 | kl_loss = 0.5 * (log_sigma.exp() + mu ** 2 - 1 - log_sigma).mean() 82 | rec_lambda = config["Lambda"]["rec"] 83 | kl_lambda = min( 84 | config["Lambda"]["kl"] * step / config["Lambda"]["kl_annealing"], 85 | config["Lambda"]["kl"], 86 | ) 87 | loss = rec_lambda * rec_loss + kl_lambda * kl_loss 88 | 89 | # update parameters 90 | opt.zero_grad() 91 | loss.backward() 92 | grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=5) 93 | opt.step() 94 | 95 | # save model and optimizer 96 | if (step + 1) % save_steps == 0: 97 | model_path = os.path.join(save_dir, f"model-{step + 1}.ckpt") 98 | model.cpu() 99 | model.save(model_path) 100 | model.to(device) 101 | opt_path = os.path.join(save_dir, f"opt-{step + 1}.ckpt") 102 | torch.save(opt.state_dict(), opt_path) 103 | 104 | if (step + 1) % log_steps == 0: 105 | # validation 106 | model.eval() 107 | valid_loss = 0 108 | for _ in range(valid_steps): 109 | org_mels = next(valid_iter) 110 | org_mels = org_mels.flatten(0, 1) 111 | org_mels = org_mels.to(device) 112 | mu, log_sigma, emb, rec_mels = model(org_mels) 113 | loss = criterion(rec_mels, org_mels) 114 | valid_loss += loss.item() 115 | valid_loss /= valid_steps 116 | model.train() 117 | 118 | # record information 119 | writer.add_scalar("training/rec_loss", rec_loss, step + 1) 120 | writer.add_scalar("training/kl_loss", kl_loss, step + 1) 121 | writer.add_scalar("training/grad_norm", grad_norm, step + 1) 122 | writer.add_scalar("lambda/kl", kl_lambda, step + 1) 123 | writer.add_scalar("validation/rec_loss", valid_loss, step + 1) 124 | 125 | # update tqdm bar 126 | pbar.set_postfix({"rec_loss": rec_loss.item(), "kl_loss": kl_loss.item()}) 127 | 128 | 129 | if __name__ == "__main__": 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument("config_file", type=str) 132 | parser.add_argument("data_dir", type=str) 133 | parser.add_argument("save_dir", type=str) 134 | parser.add_argument("--n_steps", type=int, default=int(1e6)) 135 | parser.add_argument("--save_steps", type=int, default=5000) 136 | parser.add_argument("--log_steps", type=int, default=250) 137 | parser.add_argument("--n_spks", type=int, default=32) 138 | parser.add_argument("--n_uttrs", type=int, default=4) 139 | main(**vars(parser.parse_args())) 140 | -------------------------------------------------------------------------------- /README by zahra karbalaei mohammadi: -------------------------------------------------------------------------------- 1 | 1. A summary of the purpose as well as the function of the code 2 | 3 | In this research, an algorithm for changing the voice of people has been presented, which can easily take the voice of a person with a specific content and have the voice of another person with a different content next to it, and at the end, the first content with the voice of the second person as the output. 
4 | The code itself is purely algorithmic and does not come with any audio input or output files, so the algorithm can be reused in future voice conversion tasks. 5 | 6 | 7 | 2. The degree of innovation in improving the code 8 | 9 | Because this project does not include example inputs and outputs, I found a similar demo at https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=RmNTzr2Fds5l that shows the performance of this project well. 10 | Based on my reading, I also found that combining AdaIN-VC with AGAIN-VC can greatly reduce the embedding dimensionality and prevent speaker information from leaking into the content embeddings. 11 | 12 | 13 | 3. Things that have been changed and improved in the source code 14 | 15 | There were no specific bugs in the original source code; most of the issues were pycodestyle warnings, which I fixed as far as possible. 16 | New code that demonstrates the performance of the AdaIN-VC algorithm: 17 | 18 | # AdaIN-VC demo 19 | This is a demonstration of AdaIN-VC that should work out of the box and be fairly quick to set up. 20 | 21 | ## Code Setup 22 | !git clone https://github.com/yiftachbeer/AdaIN-VC 23 | %cd AdaIN-VC 24 | %%capture 25 | 26 | !python -m pip install -r requirements.txt 27 | ## Data Setup 28 | #We download a custom, smaller version of VCTK (all utterances of 5 speakers out of 110). 29 | %%capture 30 | 31 | !wget https://www.cs.huji.ac.il/~yiftach/VCTKmini.zip 32 | !unzip VCTKmini.zip && rm VCTKmini.zip 33 | 34 | %run adain-vc.py preprocess VCTKmini/wav48_silence_trimmed VCTKmini_mel 35 | 36 | ## Training 37 | #The `n_steps` parameter can be adjusted depending on how long you want to wait. 38 | 39 | %run adain-vc.py train config.yaml VCTKmini_mel saved_models --n_steps 1000 --save_steps 100 40 | 41 | ## Inference 42 | from IPython.display import Audio 43 | #We use the first sample for content, and the second for speaker: 44 | Audio('VCTKmini/p226/p226_002_mic2.flac') 45 | Audio('VCTKmini/p225/p225_003_mic2.flac') 46 | 47 | #We demonstrate the quality of the pretrained model along with the one we just trained: 48 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-trained.wav --model_path saved_models/model-1000.ckpt 49 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-pretrained.wav 50 | Audio('cvrt-trained.wav') 51 | Audio('cvrt-pretrained.wav') 52 | 53 | Reference to the project: https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=KVcBHvvfV9s- 54 | 55 | 56 | 4. The result of the changes and improvements when evaluating the output audio 57 | 58 | After fixing these issues, no further problems were reported in Visual Studio Code, but since the work is provided only as an algorithm, there was no audio input or output to evaluate. 59 | I was therefore only able to review the code and fix the errors and warnings flagged by Visual Studio Code to improve the code quality. 60 | 61 | 62 | 5. Reference to the main project link 63 | 64 | https://github.com/cyhuang-tw/AdaIN-VC 65 | 66 | 67 | 6. Student introduction 68 | 69 | Zahra Karbalaei Mohammadi is a master's student at South Tehran University. 70 | 71 | Student number: 40014140111030 72 | 73 | Course: Digital Signal Processing 74 | 75 | Supervisor: Dr. Mahdi Eslami 76 | 77 | 78 | 7. 
The article file has been updated 79 | 80 | Link to download the comparison table of advantages and disadvantages of the method used in 10 articles with similar topics: https://drive.google.com/file/d/1senPl-zaLvEdadwrADIY5Ibc84KInL_0/view?usp=share_link 81 | 82 | Download link of the introduction of the new article: https://drive.google.com/file/d/1DQQAOIRcSbO8AzGZWcIVnzyrnj3OjqAG/view?usp=share_link 83 | 84 | 85 | 8. Explanation videos about project code and articles 86 | 87 | Video file link for a general explanation about the article: https://drive.google.com/file/d/1Dj-tNs13g7Z3m3rBQ18mCPBXiyZSz6KJ/view?usp=sharing 88 | 89 | Video link for a detailed explanation of the article: https://drive.google.com/file/d/1UAlZrxqV7mTjHeamGqRVoJ2YErcp-zPM/view?usp=sharing 90 | 91 | Video file general explanation about the main parts of the source and code database and the environment and software required to run the code: https://drive.google.com/file/d/1wytASSQn8NkKPPeb_t8BlzIFMRxnuWVC/view?usp=sharing 92 | 93 | Video file link explaining about the code and matching it with the article: https://drive.google.com/file/d/18sz51-JWXAwIVSdhsA0BptDeGtr9HQTd/view?usp=sharing 94 | 95 | Link to the video file of the source code execution and explanation about the input and output of the final project: https://drive.google.com/file/d/1U7PZy5zF4mX2T4OMar-OgltCig1yvK4V/view?usp=drivesdk 96 | 97 | The link of the input and output file in another similar project that uses the algorithm of my final project: https://drive.google.com/drive/folders/1fY-dxzlGMZGa0sGDEVHKmRQQNKZ9tLFq 98 | 99 | 100 | All the videos related to my project to promote science have been uploaded to Aparat 101 | 102 | Aparat link: https://www.aparat.com/Zahrakarbalaeimohammadi 103 | 104 | 105 | 9. Completed proposal file for the project 106 | 107 | Download link: 108 | 109 | https://drive.google.com/file/d/10ofAt50bEqUU1LrLRz96W2oYHmpbMS94/view?usp=share_link 110 | 111 | 112 | 10. All the tasks done for the progress of the project 113 | 114 | Download link: 115 | 116 | https://drive.google.com/drive/folders/11PUIKewGp8iGzwEpmLxETaU9BjT11w9k?usp=share_link 117 | 118 | 11. The link to the final project presentation 119 | 120 | Download link: 121 | 122 | https://drive.google.com/file/d/18UjO3zHM1mht7l9qqmY0sgnDcDml0uoL/view?usp=share_link 123 | 124 | Download link from Aparat: 125 | 126 | https://aparat.com/v/HjU6B 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AdaIN-VC 2 | 3 | This is an unofficial implementation of the paper [One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization](https://arxiv.org/abs/1904.05742) modified from the official one. 4 | 5 | ## Dependencies 6 | 7 | - Python >= 3.6 8 | - torch >= 1.7.0 9 | - torchaudio >= 0.7.0 10 | - numpy >= 1.16.0 11 | - librosa >= 0.6.3 12 | 13 | ## Differences from the official implementation 14 | 15 | The main difference from the official implementation is the use of a neural vocoder, which greatly improves the audio quality. 16 | I adopted universal vocoder, whose code was from [yistLin/universal-vocoder](https://github.com/yistLin/universal-vocoder) and checkpoint will be available soon. 
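For reference, the vocoder checkpoint is also a `torch.jit` module, and `inference.py` above drives it as in this minimal sketch (the checkpoint path is a placeholder and the random tensor stands in for a real mel spectrogram):

```python
import torch

vocoder = torch.jit.load("vocoder.pt")  # placeholder path to a universal-vocoder checkpoint
mel = torch.randn(80, 200)              # stands in for a (n_mels, time) log-mel from Wav2Mel

with torch.no_grad():
    wav = vocoder.generate([mel.T])[0]  # same call pattern as in inference.py above
```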
17 | Besides, this implementation supports torch.jit, so the full model can be loaded with a single line: 18 | 19 | ```python 20 | model = torch.jit.load(model_path) 21 | ``` 22 | 23 | Pre-trained models are available [here](https://drive.google.com/drive/folders/1MacKgXGA4Ad0O_c6W5MlkZMG0B8IzaM-?usp=sharing). 24 | 25 | ## Preprocess 26 | 27 | The script `preprocess.py` extracts log-mel features from raw audio files. 28 | 29 | ```bash 30 | python preprocess.py <data_dir> <save_dir> [--segment seg_len] 31 | ``` 32 | 33 | - **data_dir**: The directory of speakers. 34 | - **save_dir**: The directory to save the processed files. 35 | - **seg_len**: The segment length used for training; utterances shorter than this are discarded. 36 | 37 | ## Training 38 | 39 | ```bash 40 | python train.py <config_file> <data_dir> <save_dir> [--n_steps steps] [--save_steps save] [--log_steps log] [--n_spks spks] [--n_uttrs uttrs] 41 | ``` 42 | 43 | - **config_file**: The config file for AdaIN-VC. 44 | - **data_dir**: The directory of processed files given by `preprocess.py`. 45 | - **save_dir**: The directory to save the model. 46 | - **steps**: The number of steps for training. 47 | - **save**: Save the model every `save` steps. 48 | - **log**: Record training information every `log` steps. 49 | - **spks**: The number of speakers in the batch. 50 | - **uttrs**: The number of utterances for each speaker in the batch. 51 | 52 | ## Inference 53 | 54 | You can use `inference.py` to perform one-shot voice conversion. 55 | Pre-trained models are available via the link above. 56 | 57 | ```bash 58 | python inference.py <model_path> <vocoder_path> <source> <target> <output> 59 | ``` 60 | 61 | - **model_path**: The path of the model file. 62 | - **vocoder_path**: The path of the vocoder file. 63 | - **source**: The utterance providing linguistic content. 64 | - **target**: The utterance providing target speaker timbre. 65 | - **output**: The converted utterance. 66 | 67 | ## Reference 68 | 69 | Please cite the paper if you find AdaIN-VC useful. 70 | 71 | ```bib 72 | @article{chou2019one, 73 | title={One-shot voice conversion by separating speaker and content representations with instance normalization}, 74 | author={Chou, Ju-chieh and Yeh, Cheng-chieh and Lee, Hung-yi}, 75 | journal={arXiv preprint arXiv:1904.05742}, 76 | year={2019} 77 | } 78 | ``` 79 | ## Sections added by Zahra Karbalaei Mohammadi 80 | ## 1. A summary of the purpose as well as the function of the code 81 | 82 | In this research, a voice conversion algorithm is presented: it takes an utterance from one person with specific content and an utterance from another person with different content, and outputs the first utterance's content spoken in the second person's voice. 83 | The code itself is purely algorithmic and does not come with any audio input or output files, so the algorithm can be reused in future voice conversion tasks. 84 | 85 | 86 | ## 2. The degree of innovation in improving the code 87 | 88 | Because this project does not include example inputs and outputs, I found a similar demo at https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=RmNTzr2Fds5l that shows the performance of this project well. 89 | Based on my reading, I also found that combining AdaIN-VC with AGAIN-VC can greatly reduce the embedding dimensionality and prevent speaker information from leaking into the content embeddings. 90 | 91 | 92 | ## 3. 
Things that have been changed and improved in the source code 93 | 94 | There were no specific bugs in the original source code; most of the issues were pycodestyle warnings, which I fixed as far as possible. 95 | New code that demonstrates the performance of the AdaIN-VC algorithm: 96 | 97 | # AdaIN-VC demo 98 | This is a demonstration of AdaIN-VC that should work out of the box and be fairly quick to set up. 99 | 100 | ## Code Setup 101 | !git clone https://github.com/yiftachbeer/AdaIN-VC 102 | %cd AdaIN-VC 103 | %%capture 104 | 105 | !python -m pip install -r requirements.txt 106 | ## Data Setup 107 | #We download a custom, smaller version of VCTK (all utterances of 5 speakers out of 110). 108 | %%capture 109 | 110 | !wget https://www.cs.huji.ac.il/~yiftach/VCTKmini.zip 111 | !unzip VCTKmini.zip && rm VCTKmini.zip 112 | 113 | %run adain-vc.py preprocess VCTKmini/wav48_silence_trimmed VCTKmini_mel 114 | 115 | ## Training 116 | #The `n_steps` parameter can be adjusted depending on how long you want to wait. 117 | 118 | %run adain-vc.py train config.yaml VCTKmini_mel saved_models --n_steps 1000 --save_steps 100 119 | 120 | ## Inference 121 | from IPython.display import Audio 122 | #We use the first sample for content, and the second for speaker: 123 | Audio('VCTKmini/p226/p226_002_mic2.flac') 124 | Audio('VCTKmini/p225/p225_003_mic2.flac') 125 | 126 | #We demonstrate the quality of the pretrained model along with the one we just trained: 127 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-trained.wav --model_path saved_models/model-1000.ckpt 128 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-pretrained.wav 129 | Audio('cvrt-trained.wav') 130 | Audio('cvrt-pretrained.wav') 131 | 132 | Reference to the project: https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=KVcBHvvfV9s- 133 | 134 | 135 | ## 4. The result of the changes and improvements when evaluating the output audio 136 | 137 | After fixing these issues, no further problems were reported in Visual Studio Code, but since the work is provided only as an algorithm, there was no audio input or output to evaluate. 138 | I was therefore only able to review the code and fix the errors and warnings flagged by Visual Studio Code to improve the code quality. 139 | 140 | 141 | ## 5. Reference to the main project link 142 | 143 | https://github.com/cyhuang-tw/AdaIN-VC 144 | 145 | 146 | ## 6. Student introduction 147 | 148 | Zahra Karbalaei Mohammadi is a master's student at South Tehran University. 149 | 150 | Student number: 40014140111030 151 | 152 | Course: Digital Signal Processing 153 | 154 | Supervisor: Dr. Mahdi Eslami 155 | 156 | 157 | ## 7. The article file has been updated 158 | 159 | Link to download the comparison table of advantages and disadvantages of the method used in 10 articles with similar topics: https://drive.google.com/file/d/1senPl-zaLvEdadwrADIY5Ibc84KInL_0/view?usp=share_link 160 | 161 | Download link of the introduction of the new article: https://drive.google.com/file/d/1DQQAOIRcSbO8AzGZWcIVnzyrnj3OjqAG/view?usp=share_link 162 | 163 | 164 | ## 8. 
Explanation videos about project code and articles 165 | 166 | Video file link for a general explanation about the article: https://drive.google.com/file/d/1Dj-tNs13g7Z3m3rBQ18mCPBXiyZSz6KJ/view?usp=sharing 167 | 168 | Video link for a detailed explanation of the article: https://drive.google.com/file/d/1UAlZrxqV7mTjHeamGqRVoJ2YErcp-zPM/view?usp=sharing 169 | 170 | Video file general explanation about the main parts of the source and code database and the environment and software required to run the code: https://drive.google.com/file/d/1wytASSQn8NkKPPeb_t8BlzIFMRxnuWVC/view?usp=sharing 171 | 172 | Video file link explaining about the code and matching it with the article: https://drive.google.com/file/d/18sz51-JWXAwIVSdhsA0BptDeGtr9HQTd/view?usp=sharing 173 | 174 | Link to the video file of the source code execution and explanation about the input and output of the final project: https://drive.google.com/file/d/1U7PZy5zF4mX2T4OMar-OgltCig1yvK4V/view?usp=drivesdk 175 | 176 | The link of the input and output file in another similar project that uses the algorithm of my final project: https://drive.google.com/drive/folders/1fY-dxzlGMZGa0sGDEVHKmRQQNKZ9tLFq 177 | 178 | 179 | All the videos related to my project to promote science have been uploaded to Aparat 180 | 181 | Aparat link: https://www.aparat.com/Zahrakarbalaeimohammadi 182 | 183 | 184 | ## 9. Completed proposal file for the project 185 | 186 | Download link: 187 | 188 | https://drive.google.com/file/d/10ofAt50bEqUU1LrLRz96W2oYHmpbMS94/view?usp=share_link 189 | 190 | ## All the tasks done for the progress of the project 191 | 192 | Download link: 193 | 194 | https://drive.google.com/drive/folders/11PUIKewGp8iGzwEpmLxETaU9BjT11w9k?usp=share_link 195 | 196 | ## The link to the final project presentation 197 | 198 | Download link: 199 | 200 | https://drive.google.com/file/d/18UjO3zHM1mht7l9qqmY0sgnDcDml0uoL/view?usp=share_link 201 | 202 | Download link from Aparat: 203 | 204 | https://aparat.com/v/HjU6B 205 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch import Tensor 7 | from torch.nn.utils import spectral_norm 8 | 9 | 10 | def get_act(act: str) -> nn.Module: 11 | if act == "lrelu": 12 | return nn.LeakyReLU() 13 | return nn.ReLU() 14 | 15 | 16 | class ConvBank(nn.Module): 17 | def __init__(self, c_in: int, c_out: int, n_bank: int, bank_scale: int, act: str): 18 | super(ConvBank, self).__init__() 19 | self.conv_bank = nn.ModuleList( 20 | [ 21 | nn.Sequential( 22 | nn.ReflectionPad1d((k // 2, k // 2 - 1 + k % 2)), 23 | nn.Conv1d(c_in, c_out, kernel_size=k), 24 | ) 25 | for k in range(bank_scale, n_bank + 1, bank_scale) 26 | ] 27 | ) 28 | self.act = get_act(act) 29 | 30 | def forward(self, x: Tensor) -> Tensor: 31 | outs = [self.act(layer(x)) for layer in self.conv_bank] 32 | out = torch.cat(outs + [x], dim=1) 33 | return out 34 | 35 | 36 | class PixelShuffle(nn.Module): 37 | def __init__(self, scale_factor: int): 38 | super(PixelShuffle, self).__init__() 39 | self.scale_factor = scale_factor 40 | 41 | def forward(self, x: Tensor) -> Tensor: 42 | batch_size, channels, in_width = x.size() 43 | channels = channels // self.scale_factor 44 | out_width = in_width * self.scale_factor 45 | x = x.contiguous().view(batch_size, channels, self.scale_factor, in_width) 46 | x = 
x.permute(0, 1, 3, 2).contiguous() 47 | x = x.view(batch_size, channels, out_width) 48 | return x 49 | 50 | 51 | class AffineLayer(nn.Module): 52 | def __init__(self, c_cond: int, c_h: int): 53 | super(AffineLayer, self).__init__() 54 | self.c_h = c_h 55 | self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) 56 | self.linear_layer = nn.Linear(c_cond, c_h * 2) 57 | 58 | def forward(self, x: Tensor, x_cond: Tensor) -> Tensor: 59 | x_cond = self.linear_layer(x_cond) 60 | mean, std = x_cond[:, : self.c_h], x_cond[:, self.c_h :] 61 | mean, std = mean.unsqueeze(-1), std.unsqueeze(-1) 62 | x = self.norm_layer(x) 63 | x = x * std + mean 64 | return x 65 | 66 | 67 | class SpeakerEncoder(nn.Module): 68 | def __init__( 69 | self, 70 | c_in: int, 71 | c_h: int, 72 | c_out: int, 73 | kernel_size: int, 74 | bank_size: int, 75 | bank_scale: int, 76 | c_bank: int, 77 | n_conv_blocks: int, 78 | n_dense_blocks: int, 79 | subsample: List[int], 80 | act: str, 81 | dropout_rate: float, 82 | ): 83 | super(SpeakerEncoder, self).__init__() 84 | self.c_in = c_in 85 | self.c_h = c_h 86 | self.c_out = c_out 87 | self.kernel_size = kernel_size 88 | self.n_conv_blocks = n_conv_blocks 89 | self.n_dense_blocks = n_dense_blocks 90 | self.subsample = subsample 91 | self.act = get_act(act) 92 | self.conv_bank = ConvBank(c_in, c_bank, bank_size, bank_scale, act) 93 | in_channels = c_bank * (bank_size // bank_scale) + c_in 94 | self.in_conv_layer = nn.Conv1d(in_channels, c_h, kernel_size=1) 95 | self.first_conv_layers = nn.ModuleList( 96 | [ 97 | nn.Sequential( 98 | nn.ReflectionPad1d( 99 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 100 | ), 101 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size), 102 | ) 103 | for _ in range(n_conv_blocks) 104 | ] 105 | ) 106 | self.second_conv_layers = nn.ModuleList( 107 | [ 108 | nn.Sequential( 109 | nn.ReflectionPad1d( 110 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 111 | ), 112 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=sub), 113 | ) 114 | for sub, _ in zip(subsample, range(n_conv_blocks)) 115 | ] 116 | ) 117 | self.pooling_layer = nn.AdaptiveAvgPool1d(1) 118 | self.first_dense_layers = nn.ModuleList( 119 | [nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)] 120 | ) 121 | self.second_dense_layers = nn.ModuleList( 122 | [nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)] 123 | ) 124 | self.output_layer = nn.Linear(c_h, c_out) 125 | self.dropout_layer = nn.Dropout(p=dropout_rate) 126 | 127 | def conv_blocks(self, inp: Tensor) -> Tensor: 128 | out = inp 129 | for idx, (first_layer, second_layer) in enumerate( 130 | zip(self.first_conv_layers, self.second_conv_layers) 131 | ): 132 | y = first_layer(out) 133 | y = self.act(y) 134 | y = self.dropout_layer(y) 135 | y = second_layer(y) 136 | y = self.act(y) 137 | y = self.dropout_layer(y) 138 | if self.subsample[idx] > 1: 139 | out = F.avg_pool1d(out, kernel_size=self.subsample[idx], ceil_mode=True) 140 | out = y + out 141 | return out 142 | 143 | def dense_blocks(self, inp: Tensor) -> Tensor: 144 | out = inp 145 | for first_layer, second_layer in zip( 146 | self.first_dense_layers, self.second_dense_layers 147 | ): 148 | y = first_layer(out) 149 | y = self.act(y) 150 | y = self.dropout_layer(y) 151 | y = second_layer(y) 152 | y = self.act(y) 153 | y = self.dropout_layer(y) 154 | out = y + out 155 | return out 156 | 157 | def forward(self, x: Tensor) -> Tensor: 158 | out = self.conv_bank(x) 159 | out = self.in_conv_layer(out) 160 | out = self.act(out) 161 | out = self.conv_blocks(out) 162 | out = 
self.pooling_layer(out).squeeze(-1) 163 | out = self.dense_blocks(out) 164 | out = self.output_layer(out) 165 | return out 166 | 167 | 168 | class ContentEncoder(nn.Module): 169 | def __init__( 170 | self, 171 | c_in: int, 172 | c_h: int, 173 | c_out: int, 174 | kernel_size: int, 175 | bank_size: int, 176 | bank_scale: int, 177 | c_bank: int, 178 | n_conv_blocks: int, 179 | subsample: List[int], 180 | act: str, 181 | dropout_rate: float, 182 | ): 183 | super(ContentEncoder, self).__init__() 184 | self.n_conv_blocks = n_conv_blocks 185 | self.subsample = subsample 186 | self.act = get_act(act) 187 | self.conv_bank = ConvBank(c_in, c_bank, bank_size, bank_scale, act) 188 | in_channels = c_bank * (bank_size // bank_scale) + c_in 189 | self.in_conv_layer = nn.Conv1d(in_channels, c_h, kernel_size=1) 190 | self.first_conv_layers = nn.ModuleList( 191 | [ 192 | nn.Sequential( 193 | nn.ReflectionPad1d( 194 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 195 | ), 196 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size), 197 | ) 198 | for _ in range(n_conv_blocks) 199 | ] 200 | ) 201 | self.second_conv_layers = nn.ModuleList( 202 | [ 203 | nn.Sequential( 204 | nn.ReflectionPad1d( 205 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 206 | ), 207 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=sub), 208 | ) 209 | for sub, _ in zip(subsample, range(n_conv_blocks)) 210 | ] 211 | ) 212 | self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) 213 | self.mean_layer = nn.Conv1d(c_h, c_out, kernel_size=1) 214 | self.std_layer = nn.Conv1d(c_h, c_out, kernel_size=1) 215 | self.dropout_layer = nn.Dropout(p=dropout_rate) 216 | 217 | def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]: 218 | out = self.conv_bank(x) 219 | out = self.in_conv_layer(out) 220 | out = self.norm_layer(out) 221 | out = self.act(out) 222 | out = self.dropout_layer(out) 223 | for idx, (first_layer, second_layer) in enumerate( 224 | zip(self.first_conv_layers, self.second_conv_layers) 225 | ): 226 | y = first_layer(out) 227 | y = self.norm_layer(y) 228 | y = self.act(y) 229 | y = self.dropout_layer(y) 230 | y = second_layer(y) 231 | y = self.norm_layer(y) 232 | y = self.act(y) 233 | y = self.dropout_layer(y) 234 | out = F.avg_pool1d(out, kernel_size=self.subsample[idx], ceil_mode=True) 235 | out = y + out 236 | mu = self.mean_layer(out) 237 | log_sigma = self.std_layer(out) 238 | return mu, log_sigma 239 | 240 | 241 | class Decoder(nn.Module): 242 | def __init__( 243 | self, 244 | c_in: int, 245 | c_cond: int, 246 | c_h: int, 247 | c_out: int, 248 | kernel_size: int, 249 | n_conv_blocks: int, 250 | upsample: List[int], 251 | act: str, 252 | sn: bool, 253 | dropout_rate: float, 254 | ): 255 | super(Decoder, self).__init__() 256 | self.n_conv_blocks = n_conv_blocks 257 | self.upsample = upsample 258 | self.act = get_act(act) 259 | f = spectral_norm if sn else lambda x: x 260 | self.in_conv_layer = f(nn.Conv1d(c_in, c_h, kernel_size=1)) 261 | self.first_conv_layers = nn.ModuleList( 262 | [ 263 | nn.Sequential( 264 | nn.ReflectionPad1d( 265 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 266 | ), 267 | f(nn.Conv1d(c_h, c_h, kernel_size=kernel_size)), 268 | ) 269 | for _ in range(n_conv_blocks) 270 | ] 271 | ) 272 | self.second_conv_layers = nn.ModuleList( 273 | [ 274 | nn.Sequential( 275 | nn.ReflectionPad1d( 276 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 277 | ), 278 | nn.Conv1d(c_h, c_h * up, kernel_size=kernel_size), 279 | ) 280 | for _, up in zip(range(n_conv_blocks), self.upsample) 
281 | ] 282 | ) 283 | self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) 284 | self.first_affine_layers = nn.ModuleList( 285 | [AffineLayer(c_cond, c_h) for _ in range(n_conv_blocks)] 286 | ) 287 | self.second_affine_layers = nn.ModuleList( 288 | [AffineLayer(c_cond, c_h) for _ in range(n_conv_blocks)] 289 | ) 290 | self.pixel_shuffle = nn.ModuleList( 291 | [PixelShuffle(scale_factor) for scale_factor in self.upsample] 292 | ) 293 | self.out_conv_layer = f(nn.Conv1d(c_h, c_out, kernel_size=1)) 294 | self.dropout_layer = nn.Dropout(p=dropout_rate) 295 | 296 | def forward(self, z: Tensor, cond: Tensor) -> Tensor: 297 | out = self.in_conv_layer(z) 298 | out = self.norm_layer(out) 299 | out = self.act(out) 300 | out = self.dropout_layer(out) 301 | for idx, ( 302 | first_conv_layer, 303 | second_conv_layer, 304 | first_affine_layer, 305 | second_affine_layer, 306 | pixel_shuffle, 307 | ) in enumerate( 308 | zip( 309 | self.first_conv_layers, 310 | self.second_conv_layers, 311 | self.first_affine_layers, 312 | self.second_affine_layers, 313 | self.pixel_shuffle, 314 | ) 315 | ): 316 | y = first_conv_layer(out) 317 | y = self.norm_layer(y) 318 | y = first_affine_layer(y, cond) 319 | y = self.act(y) 320 | y = self.dropout_layer(y) 321 | y = second_conv_layer(y) 322 | y = pixel_shuffle(y) 323 | y = self.norm_layer(y) 324 | y = second_affine_layer(y, cond) 325 | y = self.act(y) 326 | y = self.dropout_layer(y) 327 | out = y + F.interpolate( 328 | out, scale_factor=float(self.upsample[idx]), mode="nearest" 329 | ) 330 | out = self.out_conv_layer(out) 331 | return out 332 | 333 | 334 | class AdaINVC(nn.Module): 335 | def __init__(self, config: Dict): 336 | super(AdaINVC, self).__init__() 337 | self.speaker_encoder = SpeakerEncoder(**config["SpeakerEncoder"]) 338 | self.content_encoder = ContentEncoder(**config["ContentEncoder"]) 339 | self.decoder = Decoder(**config["Decoder"]) 340 | 341 | def forward( 342 | self, src: Tensor, tgt: Optional[Tensor] = None 343 | ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: 344 | if tgt is None: 345 | emb = self.speaker_encoder(src) 346 | else: 347 | emb = self.speaker_encoder(tgt) 348 | mu, log_sigma = self.content_encoder(src) 349 | eps = torch.empty_like(log_sigma).normal_(0.0, 1.0) 350 | dec = self.decoder(mu + torch.exp(log_sigma / 2.0) * eps, emb) 351 | return mu, log_sigma, emb, dec 352 | 353 | @torch.jit.export 354 | def inference(self, src: Tensor, tgt: Tensor) -> Tensor: 355 | emb = self.speaker_encoder(tgt) 356 | mu, _ = self.content_encoder(src) 357 | dec = self.decoder(mu, emb) 358 | return dec 359 | --------------------------------------------------------------------------------
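A closing note on the core operation: the `AffineLayer` in model.py implements adaptive instance normalization, i.e. the content features are instance-normalized per channel and then scaled and shifted with statistics predicted from the speaker embedding. A minimal self-contained sketch of this step (illustrative only; the helper name `adain1d` is hypothetical and not part of the repository):

```python
import torch


def adain1d(content: torch.Tensor, mean: torch.Tensor, std: torch.Tensor) -> torch.Tensor:
    """Instance-normalize `content` (B, C, T) over time, then apply the per-channel
    scale/shift that model.py predicts from the speaker embedding with a Linear layer."""
    mu = content.mean(dim=-1, keepdim=True)
    sigma = content.std(dim=-1, keepdim=True, unbiased=False) + 1e-5
    normalized = (content - mu) / sigma
    return normalized * std.unsqueeze(-1) + mean.unsqueeze(-1)


# Toy shapes: batch of 2 utterances, 128 channels, 64 frames.
content = torch.randn(2, 128, 64)
mean, std = torch.randn(2, 128), torch.randn(2, 128)
print(adain1d(content, mean, std).shape)  # torch.Size([2, 128, 64])
```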