├── output1.jpg ├── output2.jpg ├── proposal-zahra karbalaei mohammadi.pdf ├── data ├── __init__.py ├── dataset.py ├── infinite_dataloader.py └── wav2mel.py ├── Comparison table of advantages and disadvantages of the method used in 10 articles with similar topics.pdf ├── config.yaml ├── inference.py ├── preprocess.py ├── train.py ├── README by zahra karbalaei mohammadi ├── README.md └── model.py /output1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/output1.jpg -------------------------------------------------------------------------------- /output2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/output2.jpg -------------------------------------------------------------------------------- /proposal-zahra karbalaei mohammadi.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/proposal-zahra karbalaei mohammadi.pdf -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import SpeakerDataset 2 | from .infinite_dataloader import InfiniteDataLoader, infinite_iterator 3 | from .wav2mel import Wav2Mel 4 | -------------------------------------------------------------------------------- /Comparison table of advantages and disadvantages of the method used in 10 articles with similar topics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahdeslami11/AdaIN-VC/HEAD/Comparison table of advantages and disadvantages of the method used in 10 articles with similar topics.pdf -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | Model: 2 | SpeakerEncoder: 3 | c_in: 80 4 | c_h: 128 5 | c_out: 128 6 | kernel_size: 5 7 | bank_size: 8 8 | bank_scale: 1 9 | c_bank: 128 10 | n_conv_blocks: 6 11 | n_dense_blocks: 6 12 | subsample: [1, 2, 1, 2, 1, 2] 13 | act: "relu" 14 | dropout_rate: 0.0 15 | ContentEncoder: 16 | c_in: 80 17 | c_h: 128 18 | c_out: 128 19 | kernel_size: 5 20 | bank_size: 8 21 | bank_scale: 1 22 | c_bank: 128 23 | n_conv_blocks: 6 24 | subsample: [1, 2, 1, 2, 1, 2] 25 | act: "relu" 26 | dropout_rate: 0.0 27 | Decoder: 28 | c_in: 128 29 | c_cond: 128 30 | c_h: 128 31 | c_out: 80 32 | kernel_size: 5 33 | n_conv_blocks: 6 34 | upsample: [2, 1, 2, 1, 2, 1] 35 | act: "relu" 36 | sn: False 37 | dropout_rate: 0.0 38 | Optimizer: 39 | lr: 0.0005 40 | beta1: 0.9 41 | beta2: 0.999 42 | amsgrad: True 43 | weight_decay: 0.0001 44 | grad_norm: 5 45 | Lambda: 46 | rec: 10 47 | kl: 1 48 | kl_annealing: 20000 49 | -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class SpeakerDataset(Dataset): 10 | def __init__(self, data_dir, segment=128, n_uttrs=4): 11 | self.data_dir = data_dir 12 | self.meta_data = json.load(open(os.path.join(data_dir, "metadata.json"), "r")) 13 | self.id2spk = list(self.meta_data.keys()) 14 | self.segment = segment 15 | 
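# `segment` is the number of mel frames cropped from each utterance and `n_uttrs` is the
# number of utterances sampled per speaker; __getitem__ therefore returns a tensor of
# shape (n_uttrs, n_mels, segment) for the speaker at the given index.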
self.n_uttrs = n_uttrs 16 | 17 | def __len__(self): 18 | return len(self.meta_data) # num_speakers 19 | 20 | def __getitem__(self, index): 21 | spk = self.id2spk[index] 22 | mel_files = random.sample(self.meta_data[spk], k=self.n_uttrs) 23 | mels = [torch.load(os.path.join(self.data_dir, file)) for file in mel_files] 24 | starts = [random.randint(0, m.shape[-1] - self.segment) for m in mels] 25 | mels = torch.stack( 26 | [m[:, start : (start + self.segment)] for (m, start) in zip(mels, starts)] 27 | ) 28 | return mels 29 | -------------------------------------------------------------------------------- /data/infinite_dataloader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class InfiniteDataLoader(torch.utils.data.DataLoader): 5 | def __init__(self, *args, **kwargs): 6 | super().__init__(*args, **kwargs) 7 | self._DataLoader__initialized = False 8 | self.batch_sampler = _RepeatSampler(self.batch_sampler) 9 | self._DataLoader__initialized = True 10 | self.iterator = super().__iter__() 11 | 12 | def __len__(self): 13 | return len(self.batch_sampler.sampler) 14 | 15 | def __iter__(self): 16 | for _ in range(len(self)): 17 | yield next(self.iterator) 18 | 19 | 20 | class _RepeatSampler(object): 21 | """Sampler that repeats forever. 22 | Args: 23 | sampler (Sampler) 24 | """ 25 | 26 | def __init__(self, sampler): 27 | self.sampler = sampler 28 | 29 | def __iter__(self): 30 | while True: 31 | yield from iter(self.sampler) 32 | 33 | 34 | def infinite_iterator(dataloader): 35 | """Infinitely yield a batch of data.""" 36 | while True: 37 | for batch in iter(dataloader): 38 | yield batch 39 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import soundfile as sf 4 | import torch 5 | import torchaudio 6 | 7 | from data import Wav2Mel 8 | 9 | 10 | def main( 11 | model_path: str, 12 | vocoder_path: str, 13 | source: str, 14 | target: str, 15 | output: str, 16 | ): 17 | device = "cuda" if torch.cuda.is_available() else "cpu" 18 | model = torch.jit.load(model_path).to(device) 19 | vocoder = torch.jit.load(vocoder_path).to(device) 20 | wav2mel = Wav2Mel() 21 | 22 | src, src_sr = torchaudio.load(source) 23 | tgt, tgt_sr = torchaudio.load(target) 24 | 25 | src = wav2mel(src, src_sr)[None, :].to(device) 26 | tgt = wav2mel(tgt, tgt_sr)[None, :].to(device) 27 | 28 | cvt = model.inference(src, tgt) 29 | 30 | with torch.no_grad(): 31 | wav = vocoder.generate([cvt.squeeze(0).data.T]) 32 | 33 | wav = wav[0].data.cpu().numpy() 34 | sf.write(output, wav, wav2mel.sample_rate) 35 | 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("model_path", type=str) 40 | parser.add_argument("vocoder_path", type=str) 41 | parser.add_argument("source", type=str) 42 | parser.add_argument("target", type=str) 43 | parser.add_argument("output", type=str) 44 | main(**vars(parser.parse_args())) 45 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from functools import partial 5 | from uuid import uuid4 6 | 7 | import librosa 8 | import torch 9 | import torch.multiprocessing as mp 10 | import torch.nn as nn 11 | import torchaudio 12 | from torch import Tensor 13 | from tqdm.auto import 
tqdm 14 | 15 | from data.wav2mel import Wav2Mel 16 | 17 | 18 | def process_files(audio_file: str, wav2mel: nn.Module) -> Tensor: 19 | speech_tensor, sample_rate = torchaudio.load(audio_file) 20 | mel_tensor = wav2mel(speech_tensor, sample_rate) 21 | 22 | return mel_tensor 23 | 24 | 25 | def main(data_dir: str, save_dir: str, segment: int): 26 | mp.set_sharing_strategy("file_system") 27 | os.makedirs(save_dir, exist_ok=True) 28 | wav2mel = Wav2Mel() 29 | file2mel = partial(process_files, wav2mel=wav2mel) 30 | 31 | meta_data = {} 32 | speakers = sorted(os.listdir(data_dir)) 33 | 34 | for spk in tqdm(speakers): 35 | spk_dir = os.path.join(data_dir, spk) 36 | wav_files = librosa.util.find_files(spk_dir) 37 | mels = [file2mel(wav_file) for wav_file in wav_files] 38 | mels = list(filter(lambda x: x is not None and x.shape[-1] > segment, mels)) 39 | rnd_paths = [f"{uuid4().hex}.pt" for _ in range(len(mels))] 40 | dummy = [ 41 | torch.save(mel, os.path.join(save_dir, path)) 42 | for (mel, path) in zip(mels, rnd_paths) 43 | ] 44 | meta_data[spk] = rnd_paths 45 | 46 | with open(os.path.join(save_dir, "metadata.json"), "w") as f: 47 | json.dump(meta_data, f, indent=4) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("data_dir", type=str) 53 | parser.add_argument("save_dir", type=str) 54 | parser.add_argument("--segment", type=int, default=128) 55 | main(**vars(parser.parse_args())) 56 | -------------------------------------------------------------------------------- /data/wav2mel.py: -------------------------------------------------------------------------------- 1 | """Wav2Mel for processing audio data.""" 2 | 3 | import torch 4 | from torchaudio.sox_effects import apply_effects_tensor 5 | from torchaudio.transforms import MelSpectrogram 6 | 7 | 8 | class Wav2Mel(torch.nn.Module): 9 | """Transform audio file into mel spectrogram tensors.""" 10 | 11 | def __init__( 12 | self, 13 | sample_rate: float = 16000, 14 | norm_db: float = -3.0, 15 | sil_threshold: float = 1.0, 16 | sil_duration: float = 0.1, 17 | fft_window_ms: float = 50.0, 18 | fft_hop_ms: float = 12.5, 19 | n_fft: int = 2048, 20 | f_min: float = 50.0, 21 | n_mels: int = 80, 22 | preemph: float = 0.97, 23 | ref_db: float = 20.0, 24 | dc_db: float = 100.0, 25 | ): 26 | super().__init__() 27 | 28 | self.sample_rate = sample_rate 29 | self.norm_db = norm_db 30 | self.sil_threshold = sil_threshold 31 | self.sil_duration = sil_duration 32 | self.fft_window_ms = fft_window_ms 33 | self.fft_hop_ms = fft_hop_ms 34 | self.n_fft = n_fft 35 | self.f_min = f_min 36 | self.n_mels = n_mels 37 | self.preemph = preemph 38 | self.ref_db = ref_db 39 | self.dc_db = dc_db 40 | 41 | self.sox_effects = SoxEffects(sample_rate, norm_db, sil_threshold, sil_duration) 42 | self.log_melspectrogram = LogMelspectrogram( 43 | sample_rate, 44 | fft_window_ms, 45 | fft_hop_ms, 46 | n_fft, 47 | f_min, 48 | n_mels, 49 | preemph, 50 | ref_db, 51 | dc_db, 52 | ) 53 | 54 | def forward(self, wav_tensor: torch.Tensor, sample_rate: int) -> torch.Tensor: 55 | wav_tensor = self.sox_effects(wav_tensor, sample_rate) 56 | if wav_tensor.numel() == 0: 57 | return None 58 | mel_tensor = self.log_melspectrogram(wav_tensor) 59 | return mel_tensor 60 | 61 | 62 | class SoxEffects(torch.nn.Module): 63 | """Transform waveform tensors.""" 64 | 65 | def __init__( 66 | self, 67 | sample_rate: int, 68 | norm_db: float, 69 | sil_threshold: float, 70 | sil_duration: float, 71 | ): 72 | super().__init__() 73 | self.effects = [ 74 | ["channels", 
"1"], # convert to mono 75 | ["rate", f"{sample_rate}"], # resample 76 | ["norm", f"{norm_db}"], # normalize to -3 dB 77 | [ 78 | "silence", 79 | "1", 80 | f"{sil_duration}", 81 | f"{sil_threshold}%", 82 | "-1", 83 | f"{sil_duration}", 84 | f"{sil_threshold}%", 85 | ], # remove silence throughout the file 86 | ] 87 | 88 | def forward(self, wav_tensor: torch.Tensor, sample_rate: int) -> torch.Tensor: 89 | wav_tensor, _ = apply_effects_tensor(wav_tensor, sample_rate, self.effects) 90 | return wav_tensor 91 | 92 | 93 | class LogMelspectrogram(torch.nn.Module): 94 | """Transform waveform tensors into log mel spectrogram tensors.""" 95 | 96 | def __init__( 97 | self, 98 | sample_rate: float, 99 | fft_window_ms: float, 100 | fft_hop_ms: float, 101 | n_fft: int, 102 | f_min: float, 103 | n_mels: int, 104 | preemph: float, 105 | ref_db: float, 106 | dc_db: float, 107 | ): 108 | super().__init__() 109 | self.melspectrogram = MelSpectrogram( 110 | sample_rate=sample_rate, 111 | win_length=int(sample_rate * fft_window_ms / 1000), 112 | hop_length=int(sample_rate * fft_hop_ms / 1000), 113 | n_fft=n_fft, 114 | f_min=f_min, 115 | n_mels=n_mels, 116 | ) 117 | self.preemph = preemph 118 | self.ref_db = ref_db 119 | self.dc_db = dc_db 120 | 121 | def forward(self, wav_tensor: torch.Tensor) -> torch.Tensor: 122 | # preemph 123 | wav_tensor = torch.cat( 124 | ( 125 | wav_tensor[:, 0].unsqueeze(-1), 126 | wav_tensor[:, 1:] - self.preemph * wav_tensor[:, :-1], 127 | ), 128 | dim=-1, 129 | ) 130 | mel_tensor = self.melspectrogram(wav_tensor).squeeze(0) # (n_mels, time) 131 | mel_tensor = 20 * mel_tensor.clamp(min=1e-9).log10() 132 | mel_tensor = (mel_tensor - self.ref_db + self.dc_db) / self.dc_db 133 | return mel_tensor 134 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | import yaml 7 | from torch.utils.data import random_split 8 | from torch.utils.tensorboard import SummaryWriter 9 | from tqdm.auto import trange 10 | 11 | from data import InfiniteDataLoader, SpeakerDataset, infinite_iterator 12 | from model import AdaINVC 13 | 14 | 15 | def main( 16 | config_file: str, 17 | data_dir: str, 18 | save_dir: str, 19 | n_steps: int, 20 | save_steps: int, 21 | log_steps: int, 22 | n_spks: int, 23 | n_uttrs: int, 24 | ): 25 | device = "cuda" if torch.cuda.is_available() else "cpu" 26 | torch.backends.cudnn.benchmark = True 27 | 28 | # Load config 29 | config = yaml.load(open(config_file, "r"), Loader=yaml.FullLoader) 30 | 31 | # Prepare data 32 | data = SpeakerDataset(data_dir, segment=128, n_uttrs=n_uttrs) 33 | 34 | # split train/valid sets 35 | train_set, valid_set = random_split( 36 | data, [int(len(data) * 0.8), len(data) - int(len(data) * 0.8)] 37 | ) 38 | 39 | # construct loader 40 | train_loader = InfiniteDataLoader( 41 | train_set, batch_size=n_spks, shuffle=True, num_workers=8 42 | ) 43 | valid_loader = InfiniteDataLoader( 44 | valid_set, batch_size=n_spks, shuffle=True, num_workers=8 45 | ) 46 | 47 | # construct iterator 48 | train_iter = infinite_iterator(train_loader) 49 | valid_iter = infinite_iterator(valid_loader) 50 | 51 | # Build model 52 | model = AdaINVC(config["Model"]).to(device) 53 | model = torch.jit.script(model) 54 | 55 | # Optimizer 56 | opt = torch.optim.Adam( 57 | model.parameters(), 58 | lr=config["Optimizer"]["lr"], 59 | betas=(config["Optimizer"]["beta1"], 
config["Optimizer"]["beta2"]), 60 | amsgrad=config["Optimizer"]["amsgrad"], 61 | weight_decay=config["Optimizer"]["weight_decay"], 62 | ) 63 | 64 | # Tensorboard logger 65 | writer = SummaryWriter(save_dir) 66 | criterion = nn.L1Loss() 67 | pbar = trange(n_steps, ncols=0) 68 | valid_steps = 32 69 | 70 | for step in pbar: 71 | # get features 72 | org_mels = next(train_iter) 73 | org_mels = org_mels.flatten(0, 1) 74 | org_mels = org_mels.to(device) 75 | 76 | # reconstruction 77 | mu, log_sigma, emb, rec_mels = model(org_mels) 78 | 79 | # compute loss 80 | rec_loss = criterion(rec_mels, org_mels) 81 | kl_loss = 0.5 * (log_sigma.exp() + mu ** 2 - 1 - log_sigma).mean() 82 | rec_lambda = config["Lambda"]["rec"] 83 | kl_lambda = min( 84 | config["Lambda"]["kl"] * step / config["Lambda"]["kl_annealing"], 85 | config["Lambda"]["kl"], 86 | ) 87 | loss = rec_lambda * rec_loss + kl_lambda * kl_loss 88 | 89 | # update parameters 90 | opt.zero_grad() 91 | loss.backward() 92 | grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=5) 93 | opt.step() 94 | 95 | # save model and optimizer 96 | if (step + 1) % save_steps == 0: 97 | model_path = os.path.join(save_dir, f"model-{step + 1}.ckpt") 98 | model.cpu() 99 | model.save(model_path) 100 | model.to(device) 101 | opt_path = os.path.join(save_dir, f"opt-{step + 1}.ckpt") 102 | torch.save(opt.state_dict(), opt_path) 103 | 104 | if (step + 1) % log_steps == 0: 105 | # validation 106 | model.eval() 107 | valid_loss = 0 108 | for _ in range(valid_steps): 109 | org_mels = next(valid_iter) 110 | org_mels = org_mels.flatten(0, 1) 111 | org_mels = org_mels.to(device) 112 | mu, log_sigma, emb, rec_mels = model(org_mels) 113 | loss = criterion(rec_mels, org_mels) 114 | valid_loss += loss.item() 115 | valid_loss /= valid_steps 116 | model.train() 117 | 118 | # record information 119 | writer.add_scalar("training/rec_loss", rec_loss, step + 1) 120 | writer.add_scalar("training/kl_loss", kl_loss, step + 1) 121 | writer.add_scalar("training/grad_norm", grad_norm, step + 1) 122 | writer.add_scalar("lambda/kl", kl_lambda, step + 1) 123 | writer.add_scalar("validation/rec_loss", valid_loss, step + 1) 124 | 125 | # update tqdm bar 126 | pbar.set_postfix({"rec_loss": rec_loss.item(), "kl_loss": kl_loss.item()}) 127 | 128 | 129 | if __name__ == "__main__": 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument("config_file", type=str) 132 | parser.add_argument("data_dir", type=str) 133 | parser.add_argument("save_dir", type=str) 134 | parser.add_argument("--n_steps", type=int, default=int(1e6)) 135 | parser.add_argument("--save_steps", type=int, default=5000) 136 | parser.add_argument("--log_steps", type=int, default=250) 137 | parser.add_argument("--n_spks", type=int, default=32) 138 | parser.add_argument("--n_uttrs", type=int, default=4) 139 | main(**vars(parser.parse_args())) 140 | -------------------------------------------------------------------------------- /README by zahra karbalaei mohammadi: -------------------------------------------------------------------------------- 1 | 1. A summary of the purpose as well as the function of the code 2 | 3 | In this research, an algorithm for changing the voice of people has been presented, which can easily take the voice of a person with a specific content and have the voice of another person with a different content next to it, and at the end, the first content with the voice of the second person as the output. 
4 | The code itself is purely algorithmic and does not come with any audio input or output files, so the algorithm can be reused in future voice conversion tasks. 5 | 6 | 7 | 2. The degree of innovation in improving the code 8 | 9 | Because this project does not include example inputs and outputs, I found a similar demo at https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=RmNTzr2Fds5l that shows the performance of this project well. 10 | Based on my reading, I also found that combining AdaIN-VC with AGAIN-VC can greatly reduce the embedding dimensionality and prevent speaker information from leaking into the content embeddings. 11 | 12 | 13 | 3. Things that have been changed and improved in the source code 14 | 15 | There were no specific bugs in the original source code; most of the issues were pycodestyle warnings, which I fixed as far as possible. 16 | New code that demonstrates the performance of the AdaIN-VC algorithm: 17 | 18 | # AdaIN-VC demo 19 | This is a demonstration of AdaIN-VC that should work out of the box and be fairly quick to set up. 20 | 21 | ## Code Setup 22 | !git clone https://github.com/yiftachbeer/AdaIN-VC 23 | %cd AdaIN-VC 24 | %%capture 25 | 26 | !python -m pip install -r requirements.txt 27 | ## Data Setup 28 | #We download a custom, smaller version of VCTK (all utterances of 5 speakers out of 110). 29 | %%capture 30 | 31 | !wget https://www.cs.huji.ac.il/~yiftach/VCTKmini.zip 32 | !unzip VCTKmini.zip && rm VCTKmini.zip 33 | 34 | %run adain-vc.py preprocess VCTKmini/wav48_silence_trimmed VCTKmini_mel 35 | 36 | ## Training 37 | #The `n_steps` parameter can be adjusted depending on how long you want to wait. 38 | 39 | %run adain-vc.py train config.yaml VCTKmini_mel saved_models --n_steps 1000 --save_steps 100 40 | 41 | ## Inference 42 | from IPython.display import Audio 43 | #We use the first sample for content, and the second for speaker: 44 | Audio('VCTKmini/p226/p226_002_mic2.flac') 45 | Audio('VCTKmini/p225/p225_003_mic2.flac') 46 | 47 | #We demonstrate the quality of the pretrained model along with the one we just trained: 48 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-trained.wav --model_path saved_models/model-1000.ckpt 49 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-pretrained.wav 50 | Audio('cvrt-trained.wav') 51 | Audio('cvrt-pretrained.wav') 52 | 53 | Reference to the project: https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=KVcBHvvfV9s- 54 | 55 | 56 | 4. The result of the changes and improvements when evaluating the output audio 57 | 58 | After fixing these issues, no further problems were reported in Visual Studio Code, but since the work is provided only as an algorithm, there was no audio input or output to evaluate. 59 | I was therefore only able to review the code and fix the errors and warnings flagged by Visual Studio Code to improve the code quality. 60 | 61 | 62 | 5. Reference to the main project link 63 | 64 | https://github.com/cyhuang-tw/AdaIN-VC 65 | 66 | 67 | 6. Student introduction 68 | 69 | Zahra Karbalaei Mohammadi is a master's student at South Tehran University. 70 | 71 | Student number: 40014140111030 72 | 73 | Course: Digital Signal Processing 74 | 75 | Supervisor: Dr. Mahdi Eslami 76 | 77 | 78 | 7. 
The article file has been updated 79 | 80 | Link to download the comparison table of advantages and disadvantages of the method used in 10 articles with similar topics: https://drive.google.com/file/d/1senPl-zaLvEdadwrADIY5Ibc84KInL_0/view?usp=share_link 81 | 82 | Download link of the introduction of the new article: https://drive.google.com/file/d/1DQQAOIRcSbO8AzGZWcIVnzyrnj3OjqAG/view?usp=share_link 83 | 84 | 85 | 8. Explanation videos about project code and articles 86 | 87 | Video file link for a general explanation about the article: https://drive.google.com/file/d/1Dj-tNs13g7Z3m3rBQ18mCPBXiyZSz6KJ/view?usp=sharing 88 | 89 | Video link for a detailed explanation of the article: https://drive.google.com/file/d/1UAlZrxqV7mTjHeamGqRVoJ2YErcp-zPM/view?usp=sharing 90 | 91 | Video file general explanation about the main parts of the source and code database and the environment and software required to run the code: https://drive.google.com/file/d/1wytASSQn8NkKPPeb_t8BlzIFMRxnuWVC/view?usp=sharing 92 | 93 | Video file link explaining about the code and matching it with the article: https://drive.google.com/file/d/18sz51-JWXAwIVSdhsA0BptDeGtr9HQTd/view?usp=sharing 94 | 95 | Link to the video file of the source code execution and explanation about the input and output of the final project: https://drive.google.com/file/d/1U7PZy5zF4mX2T4OMar-OgltCig1yvK4V/view?usp=drivesdk 96 | 97 | The link of the input and output file in another similar project that uses the algorithm of my final project: https://drive.google.com/drive/folders/1fY-dxzlGMZGa0sGDEVHKmRQQNKZ9tLFq 98 | 99 | 100 | All the videos related to my project to promote science have been uploaded to Aparat 101 | 102 | Aparat link: https://www.aparat.com/Zahrakarbalaeimohammadi 103 | 104 | 105 | 9. Completed proposal file for the project 106 | 107 | Download link: 108 | 109 | https://drive.google.com/file/d/10ofAt50bEqUU1LrLRz96W2oYHmpbMS94/view?usp=share_link 110 | 111 | 112 | 10. All the tasks done for the progress of the project 113 | 114 | Download link: 115 | 116 | https://drive.google.com/drive/folders/11PUIKewGp8iGzwEpmLxETaU9BjT11w9k?usp=share_link 117 | 118 | 11. The link to the final project presentation 119 | 120 | Download link: 121 | 122 | https://drive.google.com/file/d/18UjO3zHM1mht7l9qqmY0sgnDcDml0uoL/view?usp=share_link 123 | 124 | Download link from Aparat: 125 | 126 | https://aparat.com/v/HjU6B 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AdaIN-VC 2 | 3 | This is an unofficial implementation of the paper [One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization](https://arxiv.org/abs/1904.05742) modified from the official one. 4 | 5 | ## Dependencies 6 | 7 | - Python >= 3.6 8 | - torch >= 1.7.0 9 | - torchaudio >= 0.7.0 10 | - numpy >= 1.16.0 11 | - librosa >= 0.6.3 12 | 13 | ## Differences from the official implementation 14 | 15 | The main difference from the official implementation is the use of a neural vocoder, which greatly improves the audio quality. 16 | I adopted universal vocoder, whose code was from [yistLin/universal-vocoder](https://github.com/yistLin/universal-vocoder) and checkpoint will be available soon. 
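For reference, the vocoder checkpoint is also a `torch.jit` module, and `inference.py` above drives it as in this minimal sketch (the checkpoint path is a placeholder and the random tensor stands in for a real mel spectrogram):

```python
import torch

vocoder = torch.jit.load("vocoder.pt")  # placeholder path to a universal-vocoder checkpoint
mel = torch.randn(80, 200)              # stands in for a (n_mels, time) log-mel from Wav2Mel

with torch.no_grad():
    wav = vocoder.generate([mel.T])[0]  # same call pattern as in inference.py above
```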
17 | Besides, this implementation supports torch.jit, so the full model can be loaded with a single line: 18 | 19 | ```python 20 | model = torch.jit.load(model_path) 21 | ``` 22 | 23 | Pre-trained models are available [here](https://drive.google.com/drive/folders/1MacKgXGA4Ad0O_c6W5MlkZMG0B8IzaM-?usp=sharing). 24 | 25 | ## Preprocess 26 | 27 | The script `preprocess.py` extracts log-mel features from raw audio files. 28 | 29 | ```bash 30 | python preprocess.py <data_dir> <save_dir> [--segment seg_len] 31 | ``` 32 | 33 | - **data_dir**: The directory of speakers. 34 | - **save_dir**: The directory to save the processed files. 35 | - **seg_len**: The segment length used for training; utterances shorter than this are discarded. 36 | 37 | ## Training 38 | 39 | ```bash 40 | python train.py <config_file> <data_dir> <save_dir> [--n_steps steps] [--save_steps save] [--log_steps log] [--n_spks spks] [--n_uttrs uttrs] 41 | ``` 42 | 43 | - **config_file**: The config file for AdaIN-VC. 44 | - **data_dir**: The directory of processed files given by `preprocess.py`. 45 | - **save_dir**: The directory to save the model. 46 | - **steps**: The number of steps for training. 47 | - **save**: Save the model every `save` steps. 48 | - **log**: Record training information every `log` steps. 49 | - **spks**: The number of speakers in the batch. 50 | - **uttrs**: The number of utterances for each speaker in the batch. 51 | 52 | ## Inference 53 | 54 | You can use `inference.py` to perform one-shot voice conversion. 55 | Pre-trained models are available via the link above. 56 | 57 | ```bash 58 | python inference.py <model_path> <vocoder_path> <source> <target> <output> 59 | ``` 60 | 61 | - **model_path**: The path of the model file. 62 | - **vocoder_path**: The path of the vocoder file. 63 | - **source**: The utterance providing linguistic content. 64 | - **target**: The utterance providing target speaker timbre. 65 | - **output**: The converted utterance. 66 | 67 | ## Reference 68 | 69 | Please cite the paper if you find AdaIN-VC useful. 70 | 71 | ```bib 72 | @article{chou2019one, 73 | title={One-shot voice conversion by separating speaker and content representations with instance normalization}, 74 | author={Chou, Ju-chieh and Yeh, Cheng-chieh and Lee, Hung-yi}, 75 | journal={arXiv preprint arXiv:1904.05742}, 76 | year={2019} 77 | } 78 | ``` 79 | ## Sections added by Zahra Karbalaei Mohammadi 80 | ## 1. A summary of the purpose as well as the function of the code 81 | 82 | In this research, a voice conversion algorithm is presented: it takes an utterance from one person with specific content and an utterance from another person with different content, and outputs the first utterance's content spoken in the second person's voice. 83 | The code itself is purely algorithmic and does not come with any audio input or output files, so the algorithm can be reused in future voice conversion tasks. 84 | 85 | 86 | ## 2. The degree of innovation in improving the code 87 | 88 | Because this project does not include example inputs and outputs, I found a similar demo at https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=RmNTzr2Fds5l that shows the performance of this project well. 89 | Based on my reading, I also found that combining AdaIN-VC with AGAIN-VC can greatly reduce the embedding dimensionality and prevent speaker information from leaking into the content embeddings. 90 | 91 | 92 | ## 3. 
Things that have been changed and improved in the source code 93 | 94 | There were no specific bugs in the original source code; most of the issues were pycodestyle warnings, which I fixed as far as possible. 95 | New code that demonstrates the performance of the AdaIN-VC algorithm: 96 | 97 | # AdaIN-VC demo 98 | This is a demonstration of AdaIN-VC that should work out of the box and be fairly quick to set up. 99 | 100 | ## Code Setup 101 | !git clone https://github.com/yiftachbeer/AdaIN-VC 102 | %cd AdaIN-VC 103 | %%capture 104 | 105 | !python -m pip install -r requirements.txt 106 | ## Data Setup 107 | #We download a custom, smaller version of VCTK (all utterances of 5 speakers out of 110). 108 | %%capture 109 | 110 | !wget https://www.cs.huji.ac.il/~yiftach/VCTKmini.zip 111 | !unzip VCTKmini.zip && rm VCTKmini.zip 112 | 113 | %run adain-vc.py preprocess VCTKmini/wav48_silence_trimmed VCTKmini_mel 114 | 115 | ## Training 116 | #The `n_steps` parameter can be adjusted depending on how long you want to wait. 117 | 118 | %run adain-vc.py train config.yaml VCTKmini_mel saved_models --n_steps 1000 --save_steps 100 119 | 120 | ## Inference 121 | from IPython.display import Audio 122 | #We use the first sample for content, and the second for speaker: 123 | Audio('VCTKmini/p226/p226_002_mic2.flac') 124 | Audio('VCTKmini/p225/p225_003_mic2.flac') 125 | 126 | #We demonstrate the quality of the pretrained model along with the one we just trained: 127 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-trained.wav --model_path saved_models/model-1000.ckpt 128 | %run adain-vc.py inference VCTKmini/p226/p226_002_mic2.flac VCTKmini/p225/p225_003_mic2.flac cvrt-pretrained.wav 129 | Audio('cvrt-trained.wav') 130 | Audio('cvrt-pretrained.wav') 131 | 132 | Reference to the project: https://colab.research.google.com/github/yiftachbeer/AdaIN-VC/blob/master/notebooks/demo.ipynb#scrollTo=KVcBHvvfV9s- 133 | 134 | 135 | ## 4. The result of the changes and improvements when evaluating the output audio 136 | 137 | After fixing these issues, no further problems were reported in Visual Studio Code, but since the work is provided only as an algorithm, there was no audio input or output to evaluate. 138 | I was therefore only able to review the code and fix the errors and warnings flagged by Visual Studio Code to improve the code quality. 139 | 140 | 141 | ## 5. Reference to the main project link 142 | 143 | https://github.com/cyhuang-tw/AdaIN-VC 144 | 145 | 146 | ## 6. Student introduction 147 | 148 | Zahra Karbalaei Mohammadi is a master's student at South Tehran University. 149 | 150 | Student number: 40014140111030 151 | 152 | Course: Digital Signal Processing 153 | 154 | Supervisor: Dr. Mahdi Eslami 155 | 156 | 157 | ## 7. The article file has been updated 158 | 159 | Link to download the comparison table of advantages and disadvantages of the method used in 10 articles with similar topics: https://drive.google.com/file/d/1senPl-zaLvEdadwrADIY5Ibc84KInL_0/view?usp=share_link 160 | 161 | Download link of the introduction of the new article: https://drive.google.com/file/d/1DQQAOIRcSbO8AzGZWcIVnzyrnj3OjqAG/view?usp=share_link 162 | 163 | 164 | ## 8. 
Explanation videos about project code and articles 165 | 166 | Video file link for a general explanation about the article: https://drive.google.com/file/d/1Dj-tNs13g7Z3m3rBQ18mCPBXiyZSz6KJ/view?usp=sharing 167 | 168 | Video link for a detailed explanation of the article: https://drive.google.com/file/d/1UAlZrxqV7mTjHeamGqRVoJ2YErcp-zPM/view?usp=sharing 169 | 170 | Video file general explanation about the main parts of the source and code database and the environment and software required to run the code: https://drive.google.com/file/d/1wytASSQn8NkKPPeb_t8BlzIFMRxnuWVC/view?usp=sharing 171 | 172 | Video file link explaining about the code and matching it with the article: https://drive.google.com/file/d/18sz51-JWXAwIVSdhsA0BptDeGtr9HQTd/view?usp=sharing 173 | 174 | Link to the video file of the source code execution and explanation about the input and output of the final project: https://drive.google.com/file/d/1U7PZy5zF4mX2T4OMar-OgltCig1yvK4V/view?usp=drivesdk 175 | 176 | The link of the input and output file in another similar project that uses the algorithm of my final project: https://drive.google.com/drive/folders/1fY-dxzlGMZGa0sGDEVHKmRQQNKZ9tLFq 177 | 178 | 179 | All the videos related to my project to promote science have been uploaded to Aparat 180 | 181 | Aparat link: https://www.aparat.com/Zahrakarbalaeimohammadi 182 | 183 | 184 | ## 9. Completed proposal file for the project 185 | 186 | Download link: 187 | 188 | https://drive.google.com/file/d/10ofAt50bEqUU1LrLRz96W2oYHmpbMS94/view?usp=share_link 189 | 190 | ## All the tasks done for the progress of the project 191 | 192 | Download link: 193 | 194 | https://drive.google.com/drive/folders/11PUIKewGp8iGzwEpmLxETaU9BjT11w9k?usp=share_link 195 | 196 | ## The link to the final project presentation 197 | 198 | Download link: 199 | 200 | https://drive.google.com/file/d/18UjO3zHM1mht7l9qqmY0sgnDcDml0uoL/view?usp=share_link 201 | 202 | Download link from Aparat: 203 | 204 | https://aparat.com/v/HjU6B 205 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch import Tensor 7 | from torch.nn.utils import spectral_norm 8 | 9 | 10 | def get_act(act: str) -> nn.Module: 11 | if act == "lrelu": 12 | return nn.LeakyReLU() 13 | return nn.ReLU() 14 | 15 | 16 | class ConvBank(nn.Module): 17 | def __init__(self, c_in: int, c_out: int, n_bank: int, bank_scale: int, act: str): 18 | super(ConvBank, self).__init__() 19 | self.conv_bank = nn.ModuleList( 20 | [ 21 | nn.Sequential( 22 | nn.ReflectionPad1d((k // 2, k // 2 - 1 + k % 2)), 23 | nn.Conv1d(c_in, c_out, kernel_size=k), 24 | ) 25 | for k in range(bank_scale, n_bank + 1, bank_scale) 26 | ] 27 | ) 28 | self.act = get_act(act) 29 | 30 | def forward(self, x: Tensor) -> Tensor: 31 | outs = [self.act(layer(x)) for layer in self.conv_bank] 32 | out = torch.cat(outs + [x], dim=1) 33 | return out 34 | 35 | 36 | class PixelShuffle(nn.Module): 37 | def __init__(self, scale_factor: int): 38 | super(PixelShuffle, self).__init__() 39 | self.scale_factor = scale_factor 40 | 41 | def forward(self, x: Tensor) -> Tensor: 42 | batch_size, channels, in_width = x.size() 43 | channels = channels // self.scale_factor 44 | out_width = in_width * self.scale_factor 45 | x = x.contiguous().view(batch_size, channels, self.scale_factor, in_width) 46 | x = 
x.permute(0, 1, 3, 2).contiguous() 47 | x = x.view(batch_size, channels, out_width) 48 | return x 49 | 50 | 51 | class AffineLayer(nn.Module): 52 | def __init__(self, c_cond: int, c_h: int): 53 | super(AffineLayer, self).__init__() 54 | self.c_h = c_h 55 | self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) 56 | self.linear_layer = nn.Linear(c_cond, c_h * 2) 57 | 58 | def forward(self, x: Tensor, x_cond: Tensor) -> Tensor: 59 | x_cond = self.linear_layer(x_cond) 60 | mean, std = x_cond[:, : self.c_h], x_cond[:, self.c_h :] 61 | mean, std = mean.unsqueeze(-1), std.unsqueeze(-1) 62 | x = self.norm_layer(x) 63 | x = x * std + mean 64 | return x 65 | 66 | 67 | class SpeakerEncoder(nn.Module): 68 | def __init__( 69 | self, 70 | c_in: int, 71 | c_h: int, 72 | c_out: int, 73 | kernel_size: int, 74 | bank_size: int, 75 | bank_scale: int, 76 | c_bank: int, 77 | n_conv_blocks: int, 78 | n_dense_blocks: int, 79 | subsample: List[int], 80 | act: str, 81 | dropout_rate: float, 82 | ): 83 | super(SpeakerEncoder, self).__init__() 84 | self.c_in = c_in 85 | self.c_h = c_h 86 | self.c_out = c_out 87 | self.kernel_size = kernel_size 88 | self.n_conv_blocks = n_conv_blocks 89 | self.n_dense_blocks = n_dense_blocks 90 | self.subsample = subsample 91 | self.act = get_act(act) 92 | self.conv_bank = ConvBank(c_in, c_bank, bank_size, bank_scale, act) 93 | in_channels = c_bank * (bank_size // bank_scale) + c_in 94 | self.in_conv_layer = nn.Conv1d(in_channels, c_h, kernel_size=1) 95 | self.first_conv_layers = nn.ModuleList( 96 | [ 97 | nn.Sequential( 98 | nn.ReflectionPad1d( 99 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 100 | ), 101 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size), 102 | ) 103 | for _ in range(n_conv_blocks) 104 | ] 105 | ) 106 | self.second_conv_layers = nn.ModuleList( 107 | [ 108 | nn.Sequential( 109 | nn.ReflectionPad1d( 110 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 111 | ), 112 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=sub), 113 | ) 114 | for sub, _ in zip(subsample, range(n_conv_blocks)) 115 | ] 116 | ) 117 | self.pooling_layer = nn.AdaptiveAvgPool1d(1) 118 | self.first_dense_layers = nn.ModuleList( 119 | [nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)] 120 | ) 121 | self.second_dense_layers = nn.ModuleList( 122 | [nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)] 123 | ) 124 | self.output_layer = nn.Linear(c_h, c_out) 125 | self.dropout_layer = nn.Dropout(p=dropout_rate) 126 | 127 | def conv_blocks(self, inp: Tensor) -> Tensor: 128 | out = inp 129 | for idx, (first_layer, second_layer) in enumerate( 130 | zip(self.first_conv_layers, self.second_conv_layers) 131 | ): 132 | y = first_layer(out) 133 | y = self.act(y) 134 | y = self.dropout_layer(y) 135 | y = second_layer(y) 136 | y = self.act(y) 137 | y = self.dropout_layer(y) 138 | if self.subsample[idx] > 1: 139 | out = F.avg_pool1d(out, kernel_size=self.subsample[idx], ceil_mode=True) 140 | out = y + out 141 | return out 142 | 143 | def dense_blocks(self, inp: Tensor) -> Tensor: 144 | out = inp 145 | for first_layer, second_layer in zip( 146 | self.first_dense_layers, self.second_dense_layers 147 | ): 148 | y = first_layer(out) 149 | y = self.act(y) 150 | y = self.dropout_layer(y) 151 | y = second_layer(y) 152 | y = self.act(y) 153 | y = self.dropout_layer(y) 154 | out = y + out 155 | return out 156 | 157 | def forward(self, x: Tensor) -> Tensor: 158 | out = self.conv_bank(x) 159 | out = self.in_conv_layer(out) 160 | out = self.act(out) 161 | out = self.conv_blocks(out) 162 | out = 
self.pooling_layer(out).squeeze(-1) 163 | out = self.dense_blocks(out) 164 | out = self.output_layer(out) 165 | return out 166 | 167 | 168 | class ContentEncoder(nn.Module): 169 | def __init__( 170 | self, 171 | c_in: int, 172 | c_h: int, 173 | c_out: int, 174 | kernel_size: int, 175 | bank_size: int, 176 | bank_scale: int, 177 | c_bank: int, 178 | n_conv_blocks: int, 179 | subsample: List[int], 180 | act: str, 181 | dropout_rate: float, 182 | ): 183 | super(ContentEncoder, self).__init__() 184 | self.n_conv_blocks = n_conv_blocks 185 | self.subsample = subsample 186 | self.act = get_act(act) 187 | self.conv_bank = ConvBank(c_in, c_bank, bank_size, bank_scale, act) 188 | in_channels = c_bank * (bank_size // bank_scale) + c_in 189 | self.in_conv_layer = nn.Conv1d(in_channels, c_h, kernel_size=1) 190 | self.first_conv_layers = nn.ModuleList( 191 | [ 192 | nn.Sequential( 193 | nn.ReflectionPad1d( 194 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 195 | ), 196 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size), 197 | ) 198 | for _ in range(n_conv_blocks) 199 | ] 200 | ) 201 | self.second_conv_layers = nn.ModuleList( 202 | [ 203 | nn.Sequential( 204 | nn.ReflectionPad1d( 205 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 206 | ), 207 | nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=sub), 208 | ) 209 | for sub, _ in zip(subsample, range(n_conv_blocks)) 210 | ] 211 | ) 212 | self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) 213 | self.mean_layer = nn.Conv1d(c_h, c_out, kernel_size=1) 214 | self.std_layer = nn.Conv1d(c_h, c_out, kernel_size=1) 215 | self.dropout_layer = nn.Dropout(p=dropout_rate) 216 | 217 | def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]: 218 | out = self.conv_bank(x) 219 | out = self.in_conv_layer(out) 220 | out = self.norm_layer(out) 221 | out = self.act(out) 222 | out = self.dropout_layer(out) 223 | for idx, (first_layer, second_layer) in enumerate( 224 | zip(self.first_conv_layers, self.second_conv_layers) 225 | ): 226 | y = first_layer(out) 227 | y = self.norm_layer(y) 228 | y = self.act(y) 229 | y = self.dropout_layer(y) 230 | y = second_layer(y) 231 | y = self.norm_layer(y) 232 | y = self.act(y) 233 | y = self.dropout_layer(y) 234 | out = F.avg_pool1d(out, kernel_size=self.subsample[idx], ceil_mode=True) 235 | out = y + out 236 | mu = self.mean_layer(out) 237 | log_sigma = self.std_layer(out) 238 | return mu, log_sigma 239 | 240 | 241 | class Decoder(nn.Module): 242 | def __init__( 243 | self, 244 | c_in: int, 245 | c_cond: int, 246 | c_h: int, 247 | c_out: int, 248 | kernel_size: int, 249 | n_conv_blocks: int, 250 | upsample: List[int], 251 | act: str, 252 | sn: bool, 253 | dropout_rate: float, 254 | ): 255 | super(Decoder, self).__init__() 256 | self.n_conv_blocks = n_conv_blocks 257 | self.upsample = upsample 258 | self.act = get_act(act) 259 | f = spectral_norm if sn else lambda x: x 260 | self.in_conv_layer = f(nn.Conv1d(c_in, c_h, kernel_size=1)) 261 | self.first_conv_layers = nn.ModuleList( 262 | [ 263 | nn.Sequential( 264 | nn.ReflectionPad1d( 265 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 266 | ), 267 | f(nn.Conv1d(c_h, c_h, kernel_size=kernel_size)), 268 | ) 269 | for _ in range(n_conv_blocks) 270 | ] 271 | ) 272 | self.second_conv_layers = nn.ModuleList( 273 | [ 274 | nn.Sequential( 275 | nn.ReflectionPad1d( 276 | (kernel_size // 2, kernel_size // 2 - 1 + kernel_size % 2) 277 | ), 278 | nn.Conv1d(c_h, c_h * up, kernel_size=kernel_size), 279 | ) 280 | for _, up in zip(range(n_conv_blocks), self.upsample) 
281 | ] 282 | ) 283 | self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) 284 | self.first_affine_layers = nn.ModuleList( 285 | [AffineLayer(c_cond, c_h) for _ in range(n_conv_blocks)] 286 | ) 287 | self.second_affine_layers = nn.ModuleList( 288 | [AffineLayer(c_cond, c_h) for _ in range(n_conv_blocks)] 289 | ) 290 | self.pixel_shuffle = nn.ModuleList( 291 | [PixelShuffle(scale_factor) for scale_factor in self.upsample] 292 | ) 293 | self.out_conv_layer = f(nn.Conv1d(c_h, c_out, kernel_size=1)) 294 | self.dropout_layer = nn.Dropout(p=dropout_rate) 295 | 296 | def forward(self, z: Tensor, cond: Tensor) -> Tensor: 297 | out = self.in_conv_layer(z) 298 | out = self.norm_layer(out) 299 | out = self.act(out) 300 | out = self.dropout_layer(out) 301 | for idx, ( 302 | first_conv_layer, 303 | second_conv_layer, 304 | first_affine_layer, 305 | second_affine_layer, 306 | pixel_shuffle, 307 | ) in enumerate( 308 | zip( 309 | self.first_conv_layers, 310 | self.second_conv_layers, 311 | self.first_affine_layers, 312 | self.second_affine_layers, 313 | self.pixel_shuffle, 314 | ) 315 | ): 316 | y = first_conv_layer(out) 317 | y = self.norm_layer(y) 318 | y = first_affine_layer(y, cond) 319 | y = self.act(y) 320 | y = self.dropout_layer(y) 321 | y = second_conv_layer(y) 322 | y = pixel_shuffle(y) 323 | y = self.norm_layer(y) 324 | y = second_affine_layer(y, cond) 325 | y = self.act(y) 326 | y = self.dropout_layer(y) 327 | out = y + F.interpolate( 328 | out, scale_factor=float(self.upsample[idx]), mode="nearest" 329 | ) 330 | out = self.out_conv_layer(out) 331 | return out 332 | 333 | 334 | class AdaINVC(nn.Module): 335 | def __init__(self, config: Dict): 336 | super(AdaINVC, self).__init__() 337 | self.speaker_encoder = SpeakerEncoder(**config["SpeakerEncoder"]) 338 | self.content_encoder = ContentEncoder(**config["ContentEncoder"]) 339 | self.decoder = Decoder(**config["Decoder"]) 340 | 341 | def forward( 342 | self, src: Tensor, tgt: Optional[Tensor] = None 343 | ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: 344 | if tgt is None: 345 | emb = self.speaker_encoder(src) 346 | else: 347 | emb = self.speaker_encoder(tgt) 348 | mu, log_sigma = self.content_encoder(src) 349 | eps = torch.empty_like(log_sigma).normal_(0.0, 1.0) 350 | dec = self.decoder(mu + torch.exp(log_sigma / 2.0) * eps, emb) 351 | return mu, log_sigma, emb, dec 352 | 353 | @torch.jit.export 354 | def inference(self, src: Tensor, tgt: Tensor) -> Tensor: 355 | emb = self.speaker_encoder(tgt) 356 | mu, _ = self.content_encoder(src) 357 | dec = self.decoder(mu, emb) 358 | return dec 359 | --------------------------------------------------------------------------------
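A closing note on the core operation: the `AffineLayer` in model.py implements adaptive instance normalization, i.e. the content features are instance-normalized per channel and then scaled and shifted with statistics predicted from the speaker embedding. A minimal self-contained sketch of this step (illustrative only; the helper name `adain1d` is hypothetical and not part of the repository):

```python
import torch


def adain1d(content: torch.Tensor, mean: torch.Tensor, std: torch.Tensor) -> torch.Tensor:
    """Instance-normalize `content` (B, C, T) over time, then apply the per-channel
    scale/shift that model.py predicts from the speaker embedding with a Linear layer."""
    mu = content.mean(dim=-1, keepdim=True)
    sigma = content.std(dim=-1, keepdim=True, unbiased=False) + 1e-5
    normalized = (content - mu) / sigma
    return normalized * std.unsqueeze(-1) + mean.unsqueeze(-1)


# Toy shapes: batch of 2 utterances, 128 channels, 64 frames.
content = torch.randn(2, 128, 64)
mean, std = torch.randn(2, 128), torch.randn(2, 128)
print(adain1d(content, mean, std).shape)  # torch.Size([2, 128, 64])
```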