├── README.md ├── acoustic_mgc_model.py ├── acoustic_model.py ├── datasets.py ├── decode.py ├── duration_model.py ├── hparams.json ├── model_utils.py ├── preprocess.py ├── sgdr.py ├── split_cmp.py ├── train.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # EMPHASIS-pytorch 2 | EMPHASIS: An Emotional Phoneme-based Acoustic Model for Speech Synthesis System(Work in progress) 3 | -------------------------------------------------------------------------------- /acoustic_mgc_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | from utils import Conv1d, MaxPool1d, HighwayNet 6 | 7 | with open('./hparams.json', 'r') as f: 8 | hparams = json.load(f) 9 | 10 | 11 | class EMPHASISAcousticMgcModel(nn.Module): 12 | def __init__(self, units, bank_widths, max_pooling_width, duration_highway_layers, gru_layer, 13 | mgc_hidden_size, bap_hidden_size, 14 | lf0_hidden_size, activation=[nn.ReLU(), nn.Sigmoid()]): 15 | super(EMPHASISAcousticMgcModel, self).__init__() 16 | 17 | self.bank_widths = bank_widths 18 | 19 | self.phoneme_convs_bank = nn.ModuleList([ 20 | nn.Conv1d(in_channels=hparams['phoneme_in_channels'], out_channels=units, kernel_size=k) 21 | for k in bank_widths]) 22 | 23 | self.emotional_prosodic_convs_bank = nn.ModuleList([ 24 | nn.Conv1d(in_channels=hparams['emotional_prosodic_in_channels'], out_channels=units, kernel_size=k) 25 | for k in bank_widths]) 26 | 27 | self.max_pool_width = max_pooling_width 28 | 29 | self.max_pool = nn.MaxPool1d(kernel_size=max_pooling_width, stride=1) 30 | 31 | self.conv_projection = nn.Conv1d(in_channels=units * len(bank_widths), out_channels=units, kernel_size=3, 32 | stride=1, padding=1) 33 | 34 | self.highway_net = HighwayNet(activation=activation) 35 | 36 | self.duration_highway_layers = duration_highway_layers 37 | 38 | self.batch_norm = nn.BatchNorm1d(self.conv_projection.out_channels) 39 | 40 | self.highway_linear = nn.Linear(self.conv_projection.out_channels * 2, 128) 41 | 42 | self.mgc_gru = nn.GRU(input_size=units, hidden_size=(mgc_hidden_size + bap_hidden_size + lf0_hidden_size), 43 | num_layers=gru_layer, 44 | batch_first=True, bidirectional=True) 45 | 46 | # self.bap_gru = nn.GRU(input_size=units, hidden_size=bap_hidden_size, num_layers=gru_layer, 47 | # batch_first=True, bidirectional=True) 48 | 49 | # self.lf0_gru = nn.GRU(input_size=units, hidden_size=lf0_hidden_size, num_layers=gru_layer, 50 | # batch_first=True, bidirectional=True) 51 | 52 | self.mgc_linear = nn.Linear((mgc_hidden_size + bap_hidden_size + lf0_hidden_size) * 2, 53 | hparams['mgc_units'] + hparams['bap_units'] + hparams['lf0_units']) 54 | 55 | # self.bap_linear = nn.Linear(bap_hidden_size * 2, hparams['bap_units']) 56 | 57 | # self.lf0_linear = nn.Linear(lf0_hidden_size * 2, hparams['lf0_units']) 58 | 59 | self.uv_linear = nn.Linear(units, hparams['uv_units']) 60 | 61 | self.activation = activation 62 | 63 | def forward(self, input): 64 | phoneme_input = input[:, :, :hparams['phoneme_in_channels']] 65 | emotional_prosodic_input = input[:, :, hparams['phoneme_in_channels']:] 66 | # Convolution bank: concatenate on the last axis to stack channels from all convolutions 67 | phoneme_conv_outputs = torch.cat([ 68 | Conv1d(phoneme_input, conv, self.training, None, activation=self.activation[0], 69 | padding=self.bank_widths[i] - 1) 70 | for i, conv in enumerate(self.phoneme_convs_bank)], dim=-1) 71 | 72 | emotional_prosodic_conv_outputs = torch.cat([ 73 | Conv1d(emotional_prosodic_input, conv, self.training, None, activation=self.activation[0], 74 | padding=self.bank_widths[i] - 1) 75 | for i, conv in enumerate(self.emotional_prosodic_convs_bank)], dim=-1) 76 | 77 | # Maxpooling: 78 | phoneme_maxpool_output = MaxPool1d(phoneme_conv_outputs, self.max_pool, self.max_pool_width - 1) 79 | emotional_prosodic_maxpool_outputs = MaxPool1d(emotional_prosodic_conv_outputs, self.max_pool, 80 | self.max_pool_width - 1) 81 | 82 | # Projection layer: 83 | phoneme_proj_output = Conv1d(phoneme_maxpool_output, self.conv_projection, self.training, 84 | self.batch_norm, 85 | activation=self.activation[0]) 86 | emotional_prosodic_proj_output = Conv1d(emotional_prosodic_maxpool_outputs, self.conv_projection, self.training, 87 | self.batch_norm, 88 | activation=self.activation[0]) 89 | 90 | highway_input = torch.cat([phoneme_proj_output, emotional_prosodic_proj_output], dim=-1) 91 | 92 | # Handle dimensionality mismatch 93 | if highway_input.shape[2] != 128: 94 | highway_input = self.highway_linear(highway_input) 95 | 96 | # HighwayNet: 97 | for i in range(self.duration_highway_layers): 98 | highway_input = self.highway_net(highway_input) 99 | rnn_input = highway_input 100 | 101 | # Bidirectional RNN 102 | # Flatten parameters 103 | self.mgc_gru.flatten_parameters() 104 | # self.bap_gru.flatten_parameters() 105 | # self.lf0_gru.flatten_parameters() 106 | 107 | mgc_rnn_output, _ = self.mgc_gru(rnn_input) 108 | # bap_rnn_output, _ = self.bap_gru(rnn_input) 109 | # lf0_rnn_output, _ = self.lf0_gru(rnn_input) 110 | 111 | mgc_output = self.mgc_linear(mgc_rnn_output) 112 | # bap_output = self.bap_linear(bap_rnn_output) 113 | # lf0_output = self.lf0_linear(lf0_rnn_output) 114 | uv_output = self.uv_linear(rnn_input) 115 | 116 | # outputs = torch.cat([mgc_output, bap_output, lf0_output], dim=-1), uv_output 117 | 118 | outputs = mgc_output, uv_output 119 | 120 | return outputs 121 | -------------------------------------------------------------------------------- /acoustic_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | from utils import Conv1d, MaxPool1d, HighwayNet 6 | 7 | with open('./hparams.json', 'r') as f: 8 | hparams = json.load(f) 9 | 10 | 11 | class EMPHASISAcousticModel(nn.Module): 12 | def __init__(self, units, bank_widths, max_pooling_width, duration_highway_layers, gru_layer, 13 | spec_hidden_size, energy_hidden_size, cap_hidden_size, 14 | lf0_hidden_size, activation=[nn.ReLU(), nn.Sigmoid()]): 15 | super(EMPHASISAcousticModel, self).__init__() 16 | 17 | self.bank_widths = bank_widths 18 | 19 | self.phoneme_convs_bank = nn.ModuleList([ 20 | nn.Conv1d(in_channels=hparams['phoneme_in_channels'], out_channels=units, kernel_size=k) 21 | for k in bank_widths]) 22 | 23 | self.emotional_prosodic_convs_bank = nn.ModuleList([ 24 | nn.Conv1d(in_channels=hparams['emotional_prosodic_in_channels'], out_channels=units, kernel_size=k) 25 | for k in bank_widths]) 26 | 27 | self.max_pool_width = max_pooling_width 28 | 29 | self.max_pool = nn.MaxPool1d(kernel_size=max_pooling_width, stride=1) 30 | 31 | self.conv_projection = nn.Conv1d(in_channels=units * len(bank_widths), out_channels=units, kernel_size=3, 32 | stride=1, padding=1) 33 | 34 | self.highway_net = HighwayNet(activation=activation) 35 | 36 | self.duration_highway_layers = duration_highway_layers 37 | 38 | self.batch_norm = nn.BatchNorm1d(self.conv_projection.out_channels) 39 | 40 | self.highway_linear = nn.Linear(self.conv_projection.out_channels * 2, 128) 41 | 42 | self.spec_gru = nn.GRU(input_size=units, hidden_size=spec_hidden_size, num_layers=gru_layer, 43 | batch_first=True, bidirectional=True) 44 | 45 | self.energy_gru = nn.GRU(input_size=units, hidden_size=energy_hidden_size, num_layers=gru_layer, 46 | batch_first=True, bidirectional=True) 47 | 48 | self.cap_gru = nn.GRU(input_size=units, hidden_size=cap_hidden_size, num_layers=gru_layer, 49 | batch_first=True, bidirectional=True) 50 | 51 | self.lf0_gru = nn.GRU(input_size=units, hidden_size=lf0_hidden_size, num_layers=gru_layer, 52 | batch_first=True, bidirectional=True) 53 | 54 | self.spec_linear = nn.Linear(spec_hidden_size * 2, hparams['spec_units']) 55 | 56 | self.cap_linear = nn.Linear(cap_hidden_size * 2, hparams['cap_units']) 57 | 58 | self.lf0_linear = nn.Linear(lf0_hidden_size * 2, hparams['lf0_units']) 59 | 60 | self.energy_linear = nn.Linear(energy_hidden_size * 2, hparams['energy_units']) 61 | 62 | self.uv_linear = nn.Linear(units, hparams['uv_units']) 63 | 64 | self.activation = activation 65 | 66 | def forward(self, input): 67 | phoneme_input = input[:, :, :hparams['phoneme_in_channels']] 68 | emotional_prosodic_input = input[:, :, hparams['phoneme_in_channels']:] 69 | # Convolution bank: concatenate on the last axis to stack channels from all convolutions 70 | phoneme_conv_outputs = torch.cat([ 71 | Conv1d(phoneme_input, conv, self.training, None, activation=self.activation[0], 72 | padding=self.bank_widths[i] - 1) 73 | for i, conv in enumerate(self.phoneme_convs_bank)], dim=-1) 74 | 75 | emotional_prosodic_conv_outputs = torch.cat([ 76 | Conv1d(emotional_prosodic_input, conv, self.training, None, activation=self.activation[0], 77 | padding=self.bank_widths[i] - 1) 78 | for i, conv in enumerate(self.emotional_prosodic_convs_bank)], dim=-1) 79 | 80 | # Maxpooling: 81 | phoneme_maxpool_output = MaxPool1d(phoneme_conv_outputs, self.max_pool, self.max_pool_width - 1) 82 | emotional_prosodic_maxpool_outputs = MaxPool1d(emotional_prosodic_conv_outputs, self.max_pool, 83 | self.max_pool_width - 1) 84 | 85 | # Projection layer: 86 | phoneme_proj_output = Conv1d(phoneme_maxpool_output, self.conv_projection, self.training, 87 | self.batch_norm, 88 | activation=self.activation[0]) 89 | emotional_prosodic_proj_output = Conv1d(emotional_prosodic_maxpool_outputs, self.conv_projection, self.training, 90 | self.batch_norm, 91 | activation=self.activation[0]) 92 | 93 | highway_input = torch.cat([phoneme_proj_output, emotional_prosodic_proj_output], dim=-1) 94 | 95 | # Handle dimensionality mismatch 96 | if highway_input.shape[2] != 128: 97 | highway_input = self.highway_linear(highway_input) 98 | 99 | # HighwayNet: 100 | for i in range(self.duration_highway_layers): 101 | highway_input = self.highway_net(highway_input) 102 | rnn_input = highway_input 103 | 104 | # Bidirectional RNN 105 | # Flatten parameters 106 | self.spec_gru.flatten_parameters() 107 | self.energy_gru.flatten_parameters() 108 | self.cap_gru.flatten_parameters() 109 | self.lf0_gru.flatten_parameters() 110 | 111 | spec_rnn_output, _ = self.spec_gru(rnn_input) 112 | energy_rnn_output, _ = self.energy_gru(rnn_input) 113 | cap_rnn_output, _ = self.cap_gru(rnn_input) 114 | lf0_rnn_output, _ = self.lf0_gru(rnn_input) 115 | 116 | spec_output = self.spec_linear(spec_rnn_output) 117 | energy_output = self.energy_linear(energy_rnn_output) 118 | cap_output = self.cap_linear(cap_rnn_output) 119 | lf0_output = self.lf0_linear(lf0_rnn_output) 120 | uv_output = self.uv_linear(rnn_input) 121 | 122 | outputs = torch.cat([spec_output, lf0_output, cap_output, energy_output], dim=-1), uv_output 123 | 124 | return outputs 125 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | import json 5 | import numpy as np 6 | import pandas as pd 7 | import math 8 | from utils import read_binary_file 9 | 10 | 11 | with open('./hparams.json', 'r') as f: 12 | hparams = json.load(f) 13 | 14 | class EMPHASISDataset(Dataset): 15 | def __init__(self, path, id_path, model_type, sort=True): 16 | super(EMPHASISDataset, self).__init__() 17 | self.path = path 18 | self.meta_data = pd.read_csv(f'{id_path}', sep=' ', 19 | names=['id', 'label_dir', 'cmp_dir'], 20 | usecols=['id'], 21 | dtype={'id':str, 'label_dir':str, 'cmp_dir':str}, 22 | index_col=False) 23 | 24 | self.meta_data.dropna(inplace=True) 25 | self.model_type = model_type 26 | 27 | def __getitem__(self, index): 28 | id = self.meta_data.iloc[index]['id'] 29 | input = read_binary_file(f'{self.path}/label/{id}.lab', dimension=hparams['in_channels']) 30 | target = read_binary_file(f'{self.path}/cmp/{id}.cmp', dimension=hparams['mgc_target_channels'] 31 | if self.model_type.find('mgc') != -1 else hparams['target_channels']) 32 | return input, target 33 | 34 | def __len__(self): 35 | return len(self.meta_data) 36 | 37 | def collate_fn(batch): 38 | inputs = [item[0] for item in batch] 39 | targets = [item[1] for item in batch] 40 | 41 | input_lens = [len(input) for input in inputs] 42 | target_lens = [len(target) for target in targets] 43 | 44 | max_input_len = max(input_lens) 45 | max_target_len = max(target_lens) 46 | 47 | channels = targets[0].shape[1] 48 | 49 | mask = np.stack(_pad_mask(input_len, max_input_len, channels) for input_len in input_lens) 50 | uv_mask = np.stack(_pad_uv_mask(input_len, max_input_len) for input_len in input_lens) 51 | input_batch = np.stack(_pad_input(input, max_input_len) for input in inputs) 52 | target_batch = np.stack(_pad_target(target, max_target_len, channels) for target in targets) 53 | return torch.FloatTensor(input_batch), torch.FloatTensor(target_batch), torch.FloatTensor(mask), torch.FloatTensor(uv_mask) 54 | 55 | def _pad_mask(len, max_len, channels): 56 | return np.concatenate([np.ones((len, channels-1)), np.zeros((max_len-len, channels-1))], axis=0) 57 | 58 | def _pad_uv_mask(len, max_len): 59 | return np.concatenate([np.ones((len, hparams['uv_units'])), np.zeros((max_len-len, hparams['uv_units']))], axis=0) 60 | 61 | def _pad_input(input, max_input_len): 62 | padded = np.zeros((max_input_len - len(input), hparams['in_channels'])) + hparams['acoustic_input_padded'] 63 | return np.concatenate([input, padded], axis=0).astype(np.float32) 64 | 65 | def _pad_target(target, max_target_len, channels): 66 | if hparams['model_type'].find('acoustic') != -1: 67 | padded = np.zeros((max_target_len - len(target), channels)) + \ 68 | hparams['acoustic_target_padded'] 69 | else: 70 | padded = np.zeros(max_target_len - len(target)) + \ 71 | hparams['duration_target_padded'] 72 | return np.concatenate([target, padded], axis=0) 73 | -------------------------------------------------------------------------------- /decode.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tqdm 4 | import json 5 | import logging 6 | import argparse 7 | import numpy as np 8 | from model_utils import create_train_model 9 | from datasets import EMPHASISDataset 10 | from utils import read_binary_file, write_binary_file 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 17 | 18 | with open('./hparams.json', 'r') as f: 19 | hparams = json.load(f) 20 | 21 | 22 | def decode(args, model, device): 23 | model.eval() 24 | data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data_' + args.name) 25 | config_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config_' + args.name) 26 | data_list = open(os.path.join(config_dir, 'test.lst'), 'r').readlines() 27 | cmvn = np.load(os.path.join(data_dir, "train_cmvn.npz")) 28 | if not os.path.exists(args.output): 29 | os.mkdir(args.output) 30 | 31 | if args.model_type == 'acoustic': 32 | for input_name in data_list: 33 | input_name = input_name.split(' ')[0] + '.lab' 34 | logging.info(f'decode {input_name} ...') 35 | input = read_binary_file(os.path.join(os.path.join(data_dir, 'test', 'label'), input_name), 36 | dimension=hparams['in_channels']) 37 | input = torch.from_numpy(input).to(device) 38 | input = input.unsqueeze(0) 39 | output, uv_output = model(input) 40 | output = output.squeeze() 41 | uv_output = F.softmax(uv_output, dim=-1)[:, :, 0] 42 | uv_output = uv_output.squeeze() 43 | uv = torch.ones(uv_output.shape).to(device) 44 | uv[uv_output > 0.5] = 0.0 45 | uv = uv.unsqueeze(-1) 46 | output = torch.cat((uv, output), -1) 47 | output = output.cpu().squeeze().detach().numpy() 48 | uv = uv.cpu().squeeze().detach().numpy() 49 | output = output * cmvn['stddev_labels'] + cmvn["mean_labels"] 50 | 51 | cap = output[:, 1:hparams['cap_units']] 52 | sp = np.concatenate((output[:, hparams['cap_units'] + hparams['energy_units'] + 1: 53 | hparams['cap_units'] + hparams['energy_units'] + hparams['spec_units'] + 1], 54 | output[:, 55 | hparams['cap_units'] + 1:hparams['cap_units'] + hparams['energy_units'] + 1]), axis=-1) 56 | lf0 = output[:, hparams['cap_units'] + hparams['energy_units'] + hparams['spec_units'] + 1: 57 | hparams['cap_units'] + hparams['energy_units'] + hparams['spec_units'] + hparams[ 58 | 'lf0_units'] + 1] 59 | lf0[uv == 0] = -1.0e+10 60 | write_binary_file(sp, os.path.join(args.output, os.path.splitext(input_name)[0] + '.sp'), dtype=np.float64) 61 | write_binary_file(lf0, os.path.join(args.output, os.path.splitext(input_name)[0] + '.lf0'), 62 | dtype=np.float32) 63 | write_binary_file(cap, os.path.join(args.output, os.path.splitext(input_name)[0] + '.ap'), dtype=np.float64) 64 | elif args.model_type == 'acoustic_mgc': 65 | for input_name in data_list: 66 | input_name = input_name.split(' ')[0] + '.lab' 67 | logging.info(f'decode {input_name} ...') 68 | input = read_binary_file(os.path.join(os.path.join(data_dir, 'test', 'label'), input_name), 69 | dimension=hparams['in_channels']) 70 | input = torch.from_numpy(input).to(device) 71 | input = input.unsqueeze(0) 72 | output, uv_output = model(input) 73 | output = output.squeeze() 74 | uv_output = F.softmax(uv_output, dim=-1)[:, :, 0] 75 | uv_output = uv_output.squeeze() 76 | uv = torch.ones(uv_output.shape).to(device) 77 | uv[uv_output > 0.5] = 0.0 78 | uv = uv.unsqueeze(-1) 79 | output = torch.cat((output[:, :hparams['mgc_units']], 80 | uv, output[:, -(hparams['bap_units'] + hparams['lf0_units']):]), -1) 81 | output = output.cpu().squeeze().detach().numpy() 82 | uv = uv.cpu().squeeze().detach().numpy() 83 | output = output * cmvn['stddev_labels'] + cmvn["mean_labels"] 84 | 85 | mgc = output[:, :hparams['mgc_units']] 86 | lf0 = output[:, hparams['mgc_units'] + 1:hparams['mgc_units'] + hparams['lf0_units'] + 1] 87 | bap = output[:, -(hparams['bap_units']):] 88 | write_binary_file(mgc, os.path.join(args.output, os.path.splitext(input_name)[0] + '.mgc')) 89 | write_binary_file(lf0, os.path.join(args.output, os.path.splitext(input_name)[0] + '.lf0')) 90 | write_binary_file(bap, os.path.join(args.output, os.path.splitext(input_name)[0] + '.bap')) 91 | 92 | 93 | def main(): 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument('--checkpoint', default='') 96 | parser.add_argument('--output', default='./test_cmp/', type=str, 97 | help='path to output cmp') 98 | parser.add_argument('--model_type', default='') 99 | parser.add_argument('--name', default='') 100 | parser.add_argument('--use_cuda', default=False) 101 | args = parser.parse_args() 102 | logging.basicConfig(format='%(asctime)s %(filename)s %(levelname)s %(message)s', 103 | datefmt='%a, %d %b %Y %H:%M:%S', level=logging.DEBUG, 104 | stream=sys.stdout) 105 | 106 | model = create_train_model(args.model_type) 107 | 108 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 109 | 110 | if torch.cuda.device_count() >= 1: 111 | model = nn.DataParallel(model) 112 | model.to(device) 113 | 114 | if args.use_cuda: 115 | checkpoint = torch.load(args.checkpoint) 116 | else: 117 | checkpoint = torch.load(args.checkpoint, map_location=lambda storage, loc: storage) 118 | model.load_state_dict(checkpoint['model']) 119 | 120 | decode(args, model, device) 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /duration_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from utils import Conv1d, highwaynet 7 | 8 | with open('./hparams.json', 'r') as f: 9 | hparams = json.load(f) 10 | 11 | class EMPHASISDurationModel(nn.Module): 12 | def __init__(self, in_channels, units, bank_widths, max_pooling_width, highway_layers, gru_layer, duration_hidden_size, activation=[nn.ReLU(), nn.Sigmoid()]): 13 | super(EMPHASISDurationModel, self).__init__() 14 | self.bank_widths = bank_widths 15 | 16 | self.phoneme_convs_bank = [ 17 | nn.Conv1d(in_channels=hparams['phoneme_in_channels'], out_channels=units, kernel_size=k).cuda() 18 | for k in bank_widths] 19 | 20 | self.emotional_prosodic_convs_bank = [ 21 | nn.Conv1d(in_channels=hparams['emotional_prosodic_in_channels'], out_channels=units, kernel_size=k).cuda() 22 | for k in bank_widths] 23 | 24 | self.max_pool_width = max_pooling_width 25 | 26 | self.max_pool = nn.MaxPool1d(kernel_size=max_pooling_width, stride=1) 27 | 28 | self.conv_projection = nn.Conv1d(in_channels=units*len(bank_widths), out_channels=units, kernel_size=3, stride=1, padding=1) 29 | 30 | self.highway_layers = highway_layers 31 | 32 | self.gru = nn.GRU(input_size=units, hidden_size=duration_hidden_size, num_layers=gru_layer, batch_first=True, bidirectional=True) 33 | 34 | self.linear = nn.Linear(duration_hidden_size*2, 1) 35 | 36 | self.activation = activation 37 | 38 | def forward(self, input): 39 | # Convolution bank: concatenate on the last axis to stack channels from all convolutions 40 | phoneme_input = input[:, :, :hparams['phoneme_input_channels']] 41 | emotional_prosodic_input = input[:, :, hparams['phoneme_input_channels']: -1] 42 | phoneme_conv_ouputs = torch.cat([ 43 | Conv1d(phoneme_input, conv, nn.BatchNorm1d(conv.out_channels), self.training, activation=self.activation[0]) 44 | for conv in self.convs_bank], dim=-1) 45 | 46 | emotional_prosodic_conv_outputs = torch.cat([ 47 | Conv1d(emotional_prosodic_input, conv, self.training, None, activation=self.activation[0]) 48 | for conv in self.convs_bank], dim=-1) 49 | 50 | # Maxpooling: 51 | phoneme_maxpool_output = self.max_pool(phoneme_conv_ouputs) 52 | emotional_prosodic_maxpool_outputs = self.max_pool(emotional_prosodic_conv_outputs) 53 | 54 | # Projection layer: 55 | phoneme_proj_output = Conv1d(phoneme_maxpool_output, self.conv_projection, self.training, 56 | nn.BatchNorm1d(self.conv_projection.out_channels), 57 | activation=self.activation[0]) 58 | emotional_prosodic_proj_output = Conv1d(emotional_prosodic_maxpool_outputs, self.conv_projection, self.training, 59 | nn.BatchNorm1d(self.conv_projection.out_channels), 60 | activation=self.activation[0]) 61 | 62 | highway_input = torch.cat([phoneme_proj_output, emotional_prosodic_proj_output], dim=-1) 63 | 64 | # Handle dimensionality mismatch: 65 | if highway_input.shape[2] != 128: 66 | highway_input = F.linear(highway_input, 67 | weight=torch.nn.init.normal_(torch.empty(128, highway_input.shape[2]))) 68 | 69 | # HighwayNet: 70 | for i in range(self.highway_layers): 71 | highway_input = highwaynet(highway_input, self.activation) 72 | rnn_input = highway_input 73 | 74 | # Bidirectional RNN 75 | outputs, _ = self.gru(rnn_input) 76 | 77 | # Outputs [batch_size, phoneme_num, hidden_size * directions] -> [batch_size, phoneme_num] 78 | # the value is frame nums of the phoneme 79 | outputs = self.linear(outputs).squeeze() 80 | 81 | return outputs 82 | 83 | -------------------------------------------------------------------------------- /hparams.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "acoustic", 3 | "in_channels": 297, 4 | "target_channels": 1028, 5 | "mgc_target_channels": 67, 6 | "phoneme_in_channels": 226, 7 | "emotional_prosodic_in_channels": 71, 8 | "duration_units": 256, 9 | "duration_convolutional_bank_widths": [1, 2, 3, 4, 5, 6, 7, 8], 10 | "duration_max_pooling_width": 2, 11 | "duration_highway_layers": 1, 12 | "duration_gru_layer": 2, 13 | "duration_hidden_size": 32, 14 | "duration_target_channels": 1, 15 | "duration_input_padded": 0, 16 | "duration_target_padded": 0, 17 | "acoustic_units": 128, 18 | "acoustic_convolutional_bank_widths": [5, 10, 15, 20, 25, 30, 35, 40], 19 | "acoustic_max_pooling_width": 10, 20 | "acoustic_highway_layers": 2, 21 | "acoustic_gru_layer": 2, 22 | "acoustic_input_padded": 0, 23 | "acoustic_target_padded": 0, 24 | "spec_hidden_size": 128, 25 | "mgc_hidden_size": 128, 26 | "energy_hidden_size": 16, 27 | "cap_hidden_size": 16, 28 | "bap_hidden_size": 16, 29 | "lf0_hidden_size": 32, 30 | "spec_units": 512, 31 | "mgc_units": 60, 32 | "energy_units": 1, 33 | "cap_units": 513, 34 | "bap_units": 5, 35 | "lf0_units": 1, 36 | "uv_units": 2, 37 | "batch_size": 16, 38 | "dropout_rate": 0.5, 39 | "initial_lr": 1e-3, 40 | "min_lr": 1e-5, 41 | "weight_decay": 5e-5, 42 | "max_epochs": 300, 43 | "gpu_ids": "0" 44 | } 45 | -------------------------------------------------------------------------------- /model_utils.py: -------------------------------------------------------------------------------- 1 | from acoustic_model import EMPHASISAcousticModel 2 | from duration_model import EMPHASISDurationModel 3 | from acoustic_mgc_model import EMPHASISAcousticMgcModel 4 | from acoustic_dcbhg_model import EMPHASISAcousticDcbhgMgcModel 5 | 6 | import json 7 | import logging 8 | with open('./hparams.json', 'r') as f: 9 | hparams = json.load(f) 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | def create_train_model(model_type): 14 | if model_type == "acoustic": 15 | model = build_acoustic_model() 16 | elif model_type == "acoustic_mgc": 17 | model = build_acoustic_mgc_model() 18 | elif model_type == 'acoustic_dcbhg_mgc': 19 | model = build_acoustic_dcbhg_mgc_model() 20 | elif model_type == 'duration': 21 | model = build_duration_model() 22 | else: 23 | logger.error(f'this {model_type} is not supported!') 24 | model.train() 25 | return model 26 | 27 | def build_duration_model(): 28 | model = EMPHASISDurationModel( 29 | in_channels=hparams['in_channels'], 30 | units=hparams['duration_units'], 31 | bank_widths=hparams['duration_convolutional_bank_widths'], 32 | max_pooling_width=hparams['duration_max_pooling_width'], 33 | highway_layers=hparams['duration_highway_layers'], 34 | gru_layer=hparams['duration_gru_layer'], 35 | duration_hidden_size=hparams['duration_hidden_size'] 36 | ) 37 | return model 38 | 39 | def build_acoustic_model(): 40 | model = EMPHASISAcousticModel( 41 | in_channels=hparams['in_channels'], 42 | units=hparams['acoustic_units'], 43 | bank_widths=hparams['acoustic_convolutional_bank_widths'], 44 | max_pooling_width=hparams['acoustic_max_pooling_width'], 45 | duration_highway_layers=hparams['acoustic_highway_layers'], 46 | gru_layer=hparams['acoustic_gru_layer'], 47 | spec_hidden_size=hparams['spec_hidden_size'], 48 | energy_hidden_size=hparams['energy_hidden_size'], 49 | cap_hidden_size=hparams['cap_hidden_size'], 50 | lf0_hidden_size=hparams['lf0_hidden_size'] 51 | ) 52 | return model 53 | 54 | def build_acoustic_mgc_model(): 55 | model = EMPHASISAcousticMgcModel( 56 | in_channels=hparams['in_channels'], 57 | units=hparams['acoustic_units'], 58 | bank_widths=hparams['acoustic_convolutional_bank_widths'], 59 | max_pooling_width=hparams['acoustic_max_pooling_width'], 60 | duration_highway_layers=hparams['acoustic_highway_layers'], 61 | gru_layer=hparams['acoustic_gru_layer'], 62 | mgc_hidden_size=hparams['mgc_hidden_size'], 63 | bap_hidden_size=hparams['bap_hidden_size'], 64 | lf0_hidden_size=hparams['lf0_hidden_size'] 65 | ) 66 | return model 67 | 68 | def build_acoustic_dcbhg_mgc_model(): 69 | model = EMPHASISAcousticDcbhgMgcModel( 70 | in_channels=hparams['in_channels'], 71 | units=hparams['acoustic_units'], 72 | bank_widths=hparams['acoustic_convolutional_bank_widths'], 73 | max_pooling_width=hparams['acoustic_max_pooling_width'], 74 | duration_highway_layers=hparams['acoustic_highway_layers'], 75 | gru_layer=hparams['acoustic_gru_layer'], 76 | mgc_hidden_size=hparams['mgc_hidden_size'], 77 | bap_hidden_size=hparams['bap_hidden_size'], 78 | lf0_hidden_size=hparams['lf0_hidden_size'] 79 | ) 80 | return model 81 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import argparse 5 | import json 6 | import random 7 | import numpy as np 8 | from utils import calculate_cmvn, convert_to, read_binary_file, write_binary_file 9 | 10 | with open('./hparams.json', 'r') as f: 11 | hparams = json.load(f) 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | train_ratio = 0.97 16 | valid_ratio = 0.02 17 | test_ratio = 0.01 18 | 19 | cur_file = os.path.dirname(os.path.realpath(__file__)) 20 | 21 | 22 | def get_random_scp(label_scp_dir, param_scp_dir, lst_dir): 23 | label_scp = open(label_scp_dir + 'all.scp') 24 | param_scp = open(param_scp_dir + 'all.scp') 25 | 26 | label_train = open(label_scp_dir + 'train.scp', 'w') 27 | label_valid = open(label_scp_dir + 'valid.scp', 'w') 28 | label_test = open(label_scp_dir + 'test.scp', 'w') 29 | param_train = open(param_scp_dir + 'train.scp', 'w') 30 | param_valid = open(param_scp_dir + 'valid.scp', 'w') 31 | param_test = open(param_scp_dir + 'test.scp', 'w') 32 | 33 | if not os.path.exists(lst_dir): 34 | os.mkdir(lst_dir) 35 | 36 | lst_train = open(os.path.join(lst_dir, 'train.lst'), 'w') 37 | lst_valid = open(os.path.join(lst_dir, 'valid.lst'), 'w') 38 | lst_test = open(os.path.join(lst_dir, 'test.lst'), 'w') 39 | 40 | lists_label = label_scp.readlines() 41 | lists_param = param_scp.readlines() 42 | 43 | if len(lists_label) != len(lists_param): 44 | print("scp files have unequal lengths") 45 | sys.exit(1) 46 | 47 | lists = list(range(len(lists_label))) 48 | random.seed(0) 49 | random.shuffle(lists) 50 | 51 | train_num = int(train_ratio * len(lists)) 52 | valid_num = int(valid_ratio * len(lists)) 53 | test_num = int(test_ratio * len(lists)) 54 | train_lists = sorted(lists[: train_num]) 55 | valid_lists = sorted(lists[train_num: (train_num + valid_num)]) 56 | test_lists = sorted(lists[(train_num + valid_num):]) 57 | 58 | for i in range(len(lists)): 59 | line_label = lists_label[i] 60 | line_param = lists_param[i] 61 | line_lst = line_label.strip() + ' ' + line_param.split()[1] + '\n' 62 | if i in valid_lists: 63 | label_valid.write(line_label) 64 | param_valid.write(line_param) 65 | lst_valid.write(line_lst) 66 | elif i in test_lists: 67 | label_test.write(line_label) 68 | param_test.write(line_param) 69 | lst_test.write(line_label) 70 | else: 71 | label_train.write(line_label) 72 | param_train.write(line_param) 73 | lst_train.write(line_lst) 74 | 75 | 76 | def create_scp(raw): 77 | label_dir = os.path.join(cur_file, raw, 'prepared_label') 78 | cmp_dir = os.path.join(cur_file, raw, 'prepared_cmp') 79 | 80 | if not os.path.exists(label_dir): 81 | os.mkdir(label_dir) 82 | if not os.path.exists(cmp_dir): 83 | os.mkdir(cmp_dir) 84 | 85 | label_files = os.listdir(label_dir) 86 | cmp_files = os.listdir(cmp_dir) 87 | 88 | if not os.path.exists(os.path.join(label_dir, 'label_scp')): 89 | os.mkdir(os.path.join(label_dir, 'label_scp')) 90 | if not os.path.exists(os.path.join(cmp_dir, 'param_scp')): 91 | os.mkdir(os.path.join(cmp_dir, 'param_scp')) 92 | 93 | label_all_scp = open(os.path.join(os.path.join(label_dir, 'label_scp'), 'all.scp'), 'w') 94 | param_all_scp = open(os.path.join(os.path.join(cmp_dir, 'param_scp'), 'all.scp'), 'w') 95 | 96 | for label_filename in label_files: 97 | if label_filename == 'label_scp': 98 | continue 99 | filename = os.path.splitext(label_filename)[0] 100 | cmp_filename = os.path.splitext(label_filename)[0] + '.cmp' 101 | label_file_path = os.path.join(label_dir, label_filename) 102 | label_all_scp.write(filename + " " + label_file_path + '\n') 103 | 104 | cmp_file_path = os.path.join(cmp_dir, cmp_filename) 105 | param_all_scp.write(filename + " " + cmp_file_path + "\n") 106 | 107 | 108 | def read_data(args, raw): 109 | label_dir = os.path.join(cur_file, raw, 'prepared_label') 110 | cmp_dir = os.path.join(cur_file, raw, 'prepared_cmp') 111 | 112 | if os.path.exists(label_dir) and os.path.exists(cmp_dir): 113 | logger.info('Raw data has been prepared.') 114 | return 115 | 116 | if not os.path.exists(label_dir): 117 | os.mkdir(label_dir) 118 | if not os.path.exists(cmp_dir): 119 | os.mkdir(cmp_dir) 120 | 121 | label_files = os.listdir(args.label_dir) 122 | cmp_files = os.listdir(args.cmp_dir) 123 | 124 | # Do frame alignment 125 | for line in label_files: 126 | filename, _ = os.path.splitext(line.strip()) 127 | logger.info('processing ' + filename) 128 | sys.stdout.flush() 129 | 130 | label_mat = np.loadtxt(os.path.join(args.label_dir, filename + '.lab')) 131 | if args.model_type == 'acoustic': 132 | cmp_mat = read_binary_file( 133 | os.path.join(args.cmp_dir, filename + ".cmp"), 134 | dimension=hparams['target_channels'], dtype=np.float64) 135 | elif args.model_type == 'acoustic_mgc': 136 | cmp_mat = read_binary_file( 137 | os.path.join(args.cmp_dir, filename + ".cmp"), 138 | dimension=hparams['mgc_target_channels'], dtype=np.float32) 139 | 140 | if label_mat.shape[0] <= cmp_mat.shape[0]: 141 | cmp_mat = cmp_mat[:label_mat.shape[0], :] 142 | else: 143 | frame_diff = label_mat.shape[0] - cmp_mat.shape[0] 144 | rep = np.repeat(cmp_mat[-1:, :], frame_diff, axis=0) 145 | cmp_mat = np.concatenate([cmp_mat, rep], axis=0) 146 | 147 | write_binary_file( 148 | label_mat, 149 | os.path.join(label_dir, filename + '.lab')) 150 | if args.model_type == 'acoustic': 151 | write_binary_file( 152 | cmp_mat, 153 | os.path.join(cmp_dir, filename + '.cmp'), dtype=np.float64) 154 | elif args.model_type == 'acoustic_mgc': 155 | write_binary_file( 156 | cmp_mat, 157 | os.path.join(cmp_dir, filename + '.cmp'), dtype=np.float32) 158 | 159 | 160 | def main(): 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument('--label_dir', type=str) 163 | parser.add_argument('--cmp_dir', type=str) 164 | parser.add_argument('--name', type=str) 165 | parser.add_argument('--model_type', type=str) 166 | args = parser.parse_args() 167 | 168 | logging.basicConfig(format='%(asctime)s %(filename)s %(levelname)s %(message)s', 169 | datefmt='%a, %d %b %Y %H:%M:%S', level=logging.INFO, 170 | stream=sys.stdout) 171 | 172 | raw = 'raw_' + args.name 173 | 174 | if not os.path.exists(raw): 175 | os.mkdir(raw) 176 | 177 | label_scp_dir = raw + '/prepared_label/label_scp/' 178 | param_scp_dir = raw + '/prepared_cmp/param_scp/' 179 | lst_dir = os.path.join(cur_file, 'config_' + args.name) 180 | data_dir = os.path.join(cur_file, 'data_' + args.name) 181 | 182 | read_data(args, raw) 183 | create_scp(raw) 184 | get_random_scp(label_scp_dir, param_scp_dir, lst_dir) 185 | # cal cmvn to the data 186 | calculate_cmvn('train', lst_dir, data_dir, args.model_type) 187 | convert_to('train', os.path.join(lst_dir, 'train'), data_dir, args.model_type) 188 | convert_to('valid', os.path.join(lst_dir, 'valid'), data_dir, args.model_type) 189 | convert_to('test', os.path.join(lst_dir, 'test'), data_dir, args.model_type) 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | import os 195 | import sys 196 | import logging 197 | import argparse 198 | import json 199 | import random 200 | import numpy as np 201 | from utils import calculate_cmvn, convert_to, read_binary_file, write_binary_file 202 | 203 | with open('./hparams.json', 'r') as f: 204 | hparams = json.load(f) 205 | 206 | logger = logging.getLogger(__name__) 207 | 208 | train_ratio = 0.97 209 | valid_ratio = 0.02 210 | test_ratio = 0.01 211 | 212 | cur_file = os.path.dirname(os.path.realpath(__file__)) 213 | 214 | 215 | def get_random_scp(label_scp_dir, param_scp_dir, lst_dir): 216 | label_scp = open(label_scp_dir + 'all.scp') 217 | param_scp = open(param_scp_dir + 'all.scp') 218 | 219 | label_train = open(label_scp_dir + 'train.scp', 'w') 220 | label_valid = open(label_scp_dir + 'valid.scp', 'w') 221 | label_test = open(label_scp_dir + 'test.scp', 'w') 222 | param_train = open(param_scp_dir + 'train.scp', 'w') 223 | param_valid = open(param_scp_dir + 'valid.scp', 'w') 224 | param_test = open(param_scp_dir + 'test.scp', 'w') 225 | 226 | if not os.path.exists(lst_dir): 227 | os.mkdir(lst_dir) 228 | 229 | lst_train = open(os.path.join(lst_dir, 'train.lst'), 'w') 230 | lst_valid = open(os.path.join(lst_dir, 'valid.lst'), 'w') 231 | lst_test = open(os.path.join(lst_dir, 'test.lst'), 'w') 232 | 233 | lists_label = label_scp.readlines() 234 | lists_param = param_scp.readlines() 235 | 236 | if len(lists_label) != len(lists_param): 237 | print("scp files have unequal lengths") 238 | sys.exit(1) 239 | 240 | lists = list(range(len(lists_label))) 241 | random.seed(0) 242 | random.shuffle(lists) 243 | 244 | train_num = int(train_ratio * len(lists)) 245 | valid_num = int(valid_ratio * len(lists)) 246 | test_num = int(test_ratio * len(lists)) 247 | train_lists = sorted(lists[: train_num]) 248 | valid_lists = sorted(lists[train_num: (train_num + valid_num)]) 249 | test_lists = sorted(lists[(train_num + valid_num):]) 250 | 251 | for i in range(len(lists)): 252 | line_label = lists_label[i] 253 | line_param = lists_param[i] 254 | line_lst = line_label.strip() + ' ' + line_param.split()[1] + '\n' 255 | if i in valid_lists: 256 | label_valid.write(line_label) 257 | param_valid.write(line_param) 258 | lst_valid.write(line_lst) 259 | elif i in test_lists: 260 | label_test.write(line_label) 261 | param_test.write(line_param) 262 | lst_test.write(line_label) 263 | else: 264 | label_train.write(line_label) 265 | param_train.write(line_param) 266 | lst_train.write(line_lst) 267 | 268 | 269 | def create_scp(raw): 270 | label_dir = os.path.join(cur_file, raw, 'prepared_label') 271 | cmp_dir = os.path.join(cur_file, raw, 'prepared_cmp') 272 | 273 | if not os.path.exists(label_dir): 274 | os.mkdir(label_dir) 275 | if not os.path.exists(cmp_dir): 276 | os.mkdir(cmp_dir) 277 | 278 | label_files = os.listdir(label_dir) 279 | cmp_files = os.listdir(cmp_dir) 280 | 281 | if not os.path.exists(os.path.join(label_dir, 'label_scp')): 282 | os.mkdir(os.path.join(label_dir, 'label_scp')) 283 | if not os.path.exists(os.path.join(cmp_dir, 'param_scp')): 284 | os.mkdir(os.path.join(cmp_dir, 'param_scp')) 285 | 286 | label_all_scp = open(os.path.join(os.path.join(label_dir, 'label_scp'), 'all.scp'), 'w') 287 | param_all_scp = open(os.path.join(os.path.join(cmp_dir, 'param_scp'), 'all.scp'), 'w') 288 | 289 | for label_filename in label_files: 290 | if label_filename == 'label_scp': 291 | continue 292 | filename = os.path.splitext(label_filename)[0] 293 | cmp_filename = os.path.splitext(label_filename)[0] + '.cmp' 294 | label_file_path = os.path.join(label_dir, label_filename) 295 | label_all_scp.write(filename + " " + label_file_path + '\n') 296 | 297 | cmp_file_path = os.path.join(cmp_dir, cmp_filename) 298 | param_all_scp.write(filename + " " + cmp_file_path + "\n") 299 | 300 | 301 | def read_data(args, raw): 302 | label_dir = os.path.join(cur_file, raw, 'prepared_label') 303 | cmp_dir = os.path.join(cur_file, raw, 'prepared_cmp') 304 | 305 | if os.path.exists(label_dir) and os.path.exists(cmp_dir): 306 | logger.info('Raw data has been prepared.') 307 | return 308 | 309 | if not os.path.exists(label_dir): 310 | os.mkdir(label_dir) 311 | if not os.path.exists(cmp_dir): 312 | os.mkdir(cmp_dir) 313 | 314 | label_files = os.listdir(args.label_dir) 315 | cmp_files = os.listdir(args.cmp_dir) 316 | 317 | # Do frame alignment 318 | for line in label_files: 319 | filename, _ = os.path.splitext(line.strip()) 320 | logger.info('processing ' + filename) 321 | sys.stdout.flush() 322 | 323 | label_mat = np.loadtxt(os.path.join(args.label_dir, filename + '.lab')) 324 | if args.model_type == 'acoustic': 325 | cmp_mat = read_binary_file( 326 | os.path.join(args.cmp_dir, filename + ".cmp"), 327 | dimension=hparams['target_channels'], dtype=np.float64) 328 | elif args.model_type == 'acoustic_mgc': 329 | cmp_mat = read_binary_file( 330 | os.path.join(args.cmp_dir, filename + ".cmp"), 331 | dimension=hparams['mgc_target_channels'], dtype=np.float32) 332 | 333 | if label_mat.shape[0] <= cmp_mat.shape[0]: 334 | cmp_mat = cmp_mat[:label_mat.shape[0], :] 335 | else: 336 | frame_diff = label_mat.shape[0] - cmp_mat.shape[0] 337 | rep = np.repeat(cmp_mat[-1:, :], frame_diff, axis=0) 338 | cmp_mat = np.concatenate([cmp_mat, rep], axis=0) 339 | 340 | write_binary_file( 341 | label_mat, 342 | os.path.join(label_dir, filename + '.lab')) 343 | if args.model_type == 'acoustic': 344 | write_binary_file( 345 | cmp_mat, 346 | os.path.join(cmp_dir, filename + '.cmp'), dtype=np.float64) 347 | elif args.model_type == 'acoustic_mgc': 348 | write_binary_file( 349 | cmp_mat, 350 | os.path.join(cmp_dir, filename + '.cmp'), dtype=np.float32) 351 | 352 | 353 | def main(): 354 | parser = argparse.ArgumentParser() 355 | parser.add_argument('--label_dir', type=str) 356 | parser.add_argument('--cmp_dir', type=str) 357 | parser.add_argument('--name', type=str) 358 | parser.add_argument('--model_type', type=str) 359 | args = parser.parse_args() 360 | 361 | logging.basicConfig(format='%(asctime)s %(filename)s %(levelname)s %(message)s', 362 | datefmt='%a, %d %b %Y %H:%M:%S', level=logging.INFO, 363 | stream=sys.stdout) 364 | 365 | raw = 'raw_' + args.name 366 | 367 | if not os.path.exists(raw): 368 | os.mkdir(raw) 369 | 370 | label_scp_dir = raw + '/prepared_label/label_scp/' 371 | param_scp_dir = raw + '/prepared_cmp/param_scp/' 372 | lst_dir = os.path.join(cur_file, 'config_' + args.name) 373 | data_dir = os.path.join(cur_file, 'data_' + args.name) 374 | 375 | read_data(args, raw) 376 | create_scp(raw) 377 | get_random_scp(label_scp_dir, param_scp_dir, lst_dir) 378 | # cal cmvn to the data 379 | calculate_cmvn('train', lst_dir, data_dir, args.model_type) 380 | convert_to('train', os.path.join(lst_dir, 'train'), data_dir, args.model_type) 381 | convert_to('valid', os.path.join(lst_dir, 'valid'), data_dir, args.model_type) 382 | convert_to('test', os.path.join(lst_dir, 'test'), data_dir, args.model_type) 383 | 384 | 385 | if __name__ == '__main__': 386 | main() 387 | -------------------------------------------------------------------------------- /sgdr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import logging 4 | import numpy as np 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler): # pylint: disable=protected-access 9 | """ 10 | Cosine annealing with restarts. 11 | This is decribed in the paper https://arxiv.org/abs/1608.03983. 12 | Parameters 13 | ---------- 14 | optimizer : ``torch.optim.Optimizer`` 15 | t_max : ``int`` 16 | The maximum number of iterations within the first cycle. 17 | eta_min : ``float``, optional (default=0) 18 | The minimum learning rate. 19 | last_epoch : ``int``, optional (default=-1) 20 | The index of the last epoch. This is used when restarting. 21 | factor : ``float``, optional (default=1) 22 | The factor by which the cycle length (``T_max``) increases after each restart. 23 | """ 24 | 25 | def __init__(self, 26 | optimizer: torch.optim.Optimizer, 27 | t_max: int, 28 | eta_min: float = 0., 29 | last_epoch: int = -1, 30 | factor: float = 1.) -> None: 31 | assert t_max > 0 32 | assert eta_min >= 0 33 | if t_max == 1 and factor == 1: 34 | logger.warning("Cosine annealing scheduler will have no effect on the learning " 35 | "rate since T_max = 1 and factor = 1.") 36 | self.t_max = t_max 37 | self.eta_min = eta_min 38 | self.factor = factor 39 | self._last_restart: int = 0 40 | self._cycle_counter: int = 0 41 | self._cycle_factor: float = 1. 42 | self._updated_cycle_len: int = t_max 43 | self._initialized: bool = False 44 | super(CosineWithRestarts, self).__init__(optimizer, last_epoch) 45 | 46 | def get_lr(self): 47 | """Get updated learning rate.""" 48 | # HACK: We need to check if this is the first time ``self.get_lr()`` was called, 49 | # since ``torch.optim.lr_scheduler._LRScheduler`` will call ``self.get_lr()`` 50 | # when first initialized, but the learning rate should remain unchanged 51 | # for the first epoch. 52 | if not self._initialized: 53 | self._initialized = True 54 | return self.base_lrs 55 | 56 | step = self.last_epoch + 1 57 | self._cycle_counter = step - self._last_restart 58 | 59 | lrs = [ 60 | self.eta_min + ((lr - self.eta_min) / 2) * ( 61 | np.cos( 62 | np.pi * 63 | (self._cycle_counter % self._updated_cycle_len) / 64 | self._updated_cycle_len 65 | ) + 1 66 | ) 67 | for lr in self.base_lrs 68 | ] 69 | 70 | if self._cycle_counter % self._updated_cycle_len == 0: 71 | # Adjust the cycle length. 72 | self._cycle_factor *= self.factor 73 | self._cycle_counter = 0 74 | self._updated_cycle_len = int(self._cycle_factor * self.t_max) 75 | self._last_restart = step 76 | 77 | return lrs -------------------------------------------------------------------------------- /split_cmp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import logging 5 | import argparse 6 | import numpy as np 7 | 8 | from utils import read_binary_file, write_binary_file 9 | 10 | with open('./hparams.json', 'r') as f: 11 | hparams = json.load(f) 12 | 13 | 14 | def main(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--cmp_dir', default='') 17 | parser.add_argument('--output', default='./splited_cmp/', type=str, 18 | help='path to output cmp') 19 | parser.add_argument('--model_type', default='') 20 | 21 | args = parser.parse_args() 22 | logging.basicConfig(format='%(asctime)s %(filename)s %(levelname)s %(message)s', 23 | datefmt='%a, %d %b %Y %H:%M:%S', level=logging.DEBUG, 24 | stream=sys.stdout) 25 | if not os.path.exists(args.output): 26 | os.mkdir(args.output) 27 | 28 | cmp_file = os.listdir(args.cmp_dir) 29 | if args.model_type == 'acoustic': 30 | for cmp_filename in cmp_file: 31 | cmp = read_binary_file(os.path.join(args.cmp_dir, cmp_filename), dimension=hparams['target_channels'], dtype=np.float64) 32 | sp = np.zeros(cmp.shape) 33 | sp[:, :hparams['spec_units']] = cmp[:, :hparams['spec_units']] 34 | sp[:, -hparams['energy_units']] = cmp[:, -hparams['energy_units']] 35 | lf0 = cmp[:, hparams['spec_units']:hparams['spec_units']+hparams['lf0_units']] 36 | uv = cmp[:, hparams['spec_units'] + hparams['lf0_units']:hparams['spec_units'] + hparams['lf0_units']+hparams['uv_units']] 37 | cap = cmp[:, hparams['spec_units'] + hparams['lf0_units'] + hparams['uv_units']: 38 | hparams['cap_units']+hparams['spec_units'] + hparams['lf0_units'] + hparams['uv_units']] 39 | lf0[uv == 0] = 0 40 | write_binary_file(sp, os.path.join(args.output, os.path.splitext(cmp_filename)[0] + '.sp'), dtype=np.float64) 41 | write_binary_file(sp, os.path.join(args.output, os.path.splitext(cmp_filename)[0] + '.lf0'), dtype=np.float64) 42 | write_binary_file(sp, os.path.join(args.output, os.path.splitext(cmp_filename)[0] + '.ap'), dtype=np.float64) 43 | if __name__ == '__main__': 44 | main() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import json 5 | import logging 6 | import argparse 7 | from tqdm import tqdm 8 | from model_utils import create_train_model 9 | from datasets import EMPHASISDataset, collate_fn 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | import torch.nn.functional as F 15 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 16 | 17 | with open('./hparams.json', 'r') as f: 18 | hparams = json.load(f) 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def train_one_acoustic_epoch(train_loader, model, device, optimizer): 24 | model.train() 25 | tr_loss = 0.0 26 | num_steps = 0 27 | 28 | pbar = tqdm(train_loader, total=(len(train_loader)), unit=' batches') 29 | for b, (input_batch, target_batch, mask, uv_mask) in enumerate( 30 | pbar): 31 | input = input_batch.to(device=device) 32 | target = target_batch.to(device=device) 33 | mask = mask.to(device=device) 34 | uv_mask = uv_mask.to(device=device) 35 | uv_target = target[:, :, 0] 36 | uv_target[uv_target > 0.5] = 1 37 | uv_target[uv_target <= 0.5] = 0 38 | uv_target = uv_target.long() 39 | target = target[:, :, 1:] 40 | 41 | output, uv_output = model(input) 42 | 43 | # mask the loss 44 | output *= mask 45 | uv_output *= uv_mask 46 | 47 | output_loss = F.mse_loss(output, target) 48 | uv_output = uv_output.view(-1, 2) 49 | uv_target = uv_target.view(-1, 1) 50 | uv_output_loss = F.cross_entropy(uv_output, uv_target.squeeze()) 51 | loss = output_loss + uv_output_loss 52 | tr_loss += loss 53 | 54 | optimizer.zero_grad() 55 | loss.backward() 56 | optimizer.step() 57 | 58 | num_steps += 1 59 | return tr_loss / num_steps 60 | 61 | 62 | def eval_one_acoustic_epoch(valid_loader, model, device): 63 | val_loss = 0.0 64 | num_steps = 0 65 | 66 | pbar = tqdm(valid_loader, total=(len(valid_loader)), unit=' batches') 67 | for b, (input_batch, target_batch, mask, uv_mask) in enumerate( 68 | pbar): 69 | input = input_batch.to(device=device) 70 | target = target_batch.to(device=device) 71 | mask = mask.to(device=device) 72 | uv_mask = uv_mask.to(device=device) 73 | uv_target = target[:, :, 0] 74 | uv_target[uv_target > 0.5] = 1 75 | uv_target[uv_target <= 0.5] = 0 76 | uv_target = uv_target.long() 77 | 78 | target = target[:, :, 1:] 79 | 80 | output, uv_output = model(input) 81 | 82 | # mask the loss 83 | output *= mask 84 | uv_output *= uv_mask 85 | 86 | output_loss = F.mse_loss(output, target) 87 | 88 | uv_output = uv_output.view(-1, 2) 89 | uv_target = uv_target.view(-1, 1) 90 | uv_output_loss = F.cross_entropy(uv_output, uv_target.squeeze()) 91 | loss = output_loss + uv_output_loss 92 | val_loss += loss.item() 93 | num_steps += 1 94 | return val_loss / num_steps 95 | 96 | 97 | def train_one_acoustic_mgc_epoch(train_loader, model, device, optimizer): 98 | model.train() 99 | tr_loss = 0.0 100 | num_steps = 0 101 | 102 | pbar = tqdm(train_loader, total=(len(train_loader)), unit=' batches') 103 | for b, (input_batch, target_batch, mask, uv_mask) in enumerate( 104 | pbar): 105 | input = input_batch.to(device=device) 106 | target = target_batch.to(device=device) 107 | mask = mask.to(device=device) 108 | uv_mask = uv_mask.to(device=device) 109 | 110 | uv_target = target[:, :, hparams['mgc_units']: 111 | hparams['mgc_units'] + 1] 112 | 113 | target = torch.cat((target[:, :, :hparams['mgc_units']], 114 | target[:, :, -(hparams['bap_units'] + hparams['lf0_units']):]), -1) 115 | uv_target[uv_target >= 0.5] = 1 116 | uv_target[uv_target < 0.5] = 0 117 | uv_target = uv_target.long() 118 | 119 | output, uv_output = model(input) 120 | # mask the loss 121 | output *= mask 122 | uv_output *= uv_mask 123 | 124 | output_loss = F.mse_loss(output, target) 125 | uv_output = uv_output.view(-1, 2) 126 | uv_target = uv_target.view(-1, 1) 127 | 128 | uv_output_loss = F.cross_entropy(uv_output, uv_target.squeeze()) 129 | loss = output_loss + uv_output_loss 130 | tr_loss += loss 131 | 132 | optimizer.zero_grad() 133 | loss.backward() 134 | optimizer.step() 135 | 136 | num_steps += 1 137 | return tr_loss / num_steps 138 | 139 | 140 | def eval_one_acoustic_mgc_epoch(valid_loader, model, device): 141 | val_loss = 0.0 142 | num_steps = 0 143 | 144 | pbar = tqdm(valid_loader, total=(len(valid_loader)), unit=' batches') 145 | for b, (input_batch, target_batch, mask, uv_mask) in enumerate( 146 | pbar): 147 | input = input_batch.to(device=device) 148 | target = target_batch.to(device=device) 149 | mask = mask.to(device=device) 150 | uv_mask = uv_mask.to(device=device) 151 | uv_mask = uv_mask.to(device=device) 152 | 153 | uv_target = target[:, :, hparams['mgc_units']: 154 | hparams['mgc_units'] + 1] 155 | target = torch.cat((target[:, :, :hparams['mgc_units']], 156 | target[:, :, -(hparams['bap_units'] + hparams['lf0_units']):]), -1) 157 | 158 | uv_target[uv_target >= 0.5] = 1 159 | uv_target[uv_target < 0.5] = 0 160 | uv_target = uv_target.long() 161 | 162 | output, uv_output = model(input) 163 | 164 | # mask the loss 165 | output *= mask 166 | uv_output *= uv_mask 167 | 168 | output_loss = F.mse_loss(output, target) 169 | 170 | uv_output = uv_output.view(-1, 2) 171 | uv_target = uv_target.view(-1, 1) 172 | uv_output_loss = F.cross_entropy(uv_output, uv_target.squeeze()) 173 | loss = output_loss + uv_output_loss 174 | val_loss += loss.item() 175 | num_steps += 1 176 | return val_loss / num_steps 177 | 178 | 179 | def train_one_acoustic_dcbhg_mgc_epoch(train_loader, model, device, optimizer): 180 | model.train() 181 | tr_loss = 0.0 182 | num_steps = 0 183 | 184 | pbar = tqdm(train_loader, total=(len(train_loader)), unit=' batches') 185 | for b, (input_batch, target_batch, mask, uv_mask) in enumerate( 186 | pbar): 187 | input = input_batch.to(device=device) 188 | target = target_batch.to(device=device) 189 | mask = mask.to(device=device) 190 | uv_mask = uv_mask.to(device=device) 191 | 192 | uv_target = target[:, :, hparams['mgc_units']: 193 | hparams['mgc_units'] + 1] 194 | 195 | lf0_target = target[:, :, -1].unsqueeze(-1) 196 | 197 | target = torch.cat((target[:, :, :hparams['mgc_units']], 198 | target[:, :, -(hparams['bap_units'] + hparams['lf0_units']):]), -1) 199 | uv_target[uv_target >= 0.5] = 1 200 | uv_target[uv_target < 0.5] = 0 201 | uv_target = uv_target.long() 202 | 203 | output, uv_output = model(input, lf0_target) 204 | # mask the loss 205 | output *= mask 206 | uv_output *= uv_mask 207 | 208 | output_loss = F.mse_loss(output, target) 209 | uv_output = uv_output.view(-1, 2) 210 | uv_target = uv_target.view(-1, 1) 211 | 212 | uv_output_loss = F.cross_entropy(uv_output, uv_target.squeeze()) 213 | loss = output_loss + uv_output_loss 214 | tr_loss += loss 215 | 216 | optimizer.zero_grad() 217 | loss.backward() 218 | optimizer.step() 219 | 220 | num_steps += 1 221 | return tr_loss / num_steps 222 | 223 | 224 | def eval_one_acoustic_dcbhg_mgc_epoch(valid_loader, model, device): 225 | val_loss = 0.0 226 | num_steps = 0 227 | 228 | pbar = tqdm(valid_loader, total=(len(valid_loader)), unit=' batches') 229 | for b, (input_batch, target_batch, mask, uv_mask) in enumerate( 230 | pbar): 231 | input = input_batch.to(device=device) 232 | target = target_batch.to(device=device) 233 | mask = mask.to(device=device) 234 | uv_mask = uv_mask.to(device=device) 235 | uv_mask = uv_mask.to(device=device) 236 | 237 | uv_target = target[:, :, hparams['mgc_units']: 238 | hparams['mgc_units'] + 1] 239 | lf0_target = target[:, :, -1].unsqueeze(-1) 240 | 241 | target = torch.cat((target[:, :, :hparams['mgc_units']], 242 | target[:, :, -(hparams['bap_units'] + hparams['lf0_units']):]), -1) 243 | 244 | uv_target[uv_target >= 0.5] = 1 245 | uv_target[uv_target < 0.5] = 0 246 | uv_target = uv_target.long() 247 | 248 | 249 | output, uv_output = model(input, lf0_target) 250 | 251 | # mask the loss 252 | output *= mask 253 | uv_output *= uv_mask 254 | 255 | output_loss = F.mse_loss(output, target) 256 | 257 | uv_output = uv_output.view(-1, 2) 258 | uv_target = uv_target.view(-1, 1) 259 | uv_output_loss = F.cross_entropy(uv_output, uv_target.squeeze()) 260 | loss = output_loss + uv_output_loss 261 | val_loss += loss.item() 262 | num_steps += 1 263 | return val_loss / num_steps 264 | 265 | 266 | def train_one_duration_epoch(train_loader, model, device, optimizer): 267 | model.train() 268 | tr_loss = 0.0 269 | num_steps = 0 270 | 271 | pbar = tqdm(train_loader, total=(len(train_loader)), unit=' batches') 272 | for b, (input_batch, target_batch, mask) in enumerate( 273 | pbar): 274 | input = input_batch.to(device=device) 275 | target = target_batch.to(device=device) 276 | 277 | output = model(input) 278 | # mask the loss 279 | output *= mask 280 | output_loss = F.mse_loss(output, target) 281 | loss = output_loss 282 | tr_loss += loss 283 | 284 | optimizer.zero_grad() 285 | loss.backward() 286 | optimizer.step() 287 | 288 | num_steps += 1 289 | return tr_loss / num_steps 290 | 291 | 292 | def eval_one_duration_epoch(valid_loader, model, device): 293 | model.eval() 294 | val_loss = 0.0 295 | num_steps = 0 296 | 297 | pbar = tqdm(valid_loader, total=(len(valid_loader)), unit=' batches') 298 | for b, (input_batch, target_batch, mask) in enumerate( 299 | pbar): 300 | input = input_batch.to(device=device) 301 | target = target_batch.to(device=device) 302 | 303 | output = model(input) 304 | # mask the loss 305 | output *= mask 306 | output_loss = F.mse_loss(output, target) 307 | loss = output_loss 308 | val_loss += loss 309 | num_steps += 1 310 | model.train() 311 | return val_loss / num_steps 312 | 313 | 314 | def get_lr(optimizer): 315 | for group in optimizer.param_groups: 316 | return group['lr'] 317 | 318 | def train_model(args, model_type, model, optimizer, lr_scheduler, exp_name, device, epoch, checkpoint_path): 319 | data_path = os.path.join(args.base_dir, args.data) 320 | train_dataset = EMPHASISDataset(f'{data_path}/train', f'./config_{exp_name}/train.lst', model_type) 321 | train_sampler = RandomSampler(train_dataset) 322 | train_loader = DataLoader(dataset=train_dataset, batch_size=hparams['batch_size'], sampler=train_sampler, 323 | num_workers=6, collate_fn=collate_fn, pin_memory=False) 324 | 325 | valid_dataset = EMPHASISDataset(f'{data_path}/valid', f'./config_{exp_name}/valid.lst', model_type) 326 | valid_sampler = SequentialSampler(valid_dataset) 327 | valid_loader = DataLoader(dataset=valid_dataset, batch_size=hparams['batch_size'], sampler=valid_sampler, 328 | num_workers=6, collate_fn=collate_fn, pin_memory=False) 329 | prev_val_loss = 1000.0 330 | prev_checkpoint_path = '.' 331 | 332 | for cur_epoch in tqdm(range(epoch, hparams['max_epochs'])): 333 | # train one epoch 334 | time_start = time.time() 335 | if model_type == 'acoustic': 336 | tr_loss = train_one_acoustic_epoch(train_loader, model, device, optimizer) 337 | elif model_type == 'acoustic_mgc': 338 | tr_loss = train_one_acoustic_mgc_epoch(train_loader, model, device, optimizer) 339 | elif model_type == 'acoustic_dcbhg_mgc': 340 | tr_loss = train_one_acoustic_dcbhg_mgc_epoch(train_loader, model, device, optimizer) 341 | else: 342 | tr_loss = train_one_duration_epoch(train_loader, model, device, optimizer) 343 | time_end = time.time() 344 | used_time = time_end - time_start 345 | 346 | # validate one epoch 347 | if model_type == 'acoustic': 348 | val_loss = eval_one_acoustic_epoch(valid_loader, model, device) 349 | elif model_type == 'acoustic_mgc': 350 | val_loss = eval_one_acoustic_mgc_epoch(valid_loader, model, device) 351 | elif model_type == 'acoustic_dcbhg_mgc': 352 | val_loss = eval_one_acoustic_dcbhg_mgc_epoch(valid_loader, model, device) 353 | else: 354 | val_loss = eval_one_duration_epoch(valid_loader, model, device) 355 | 356 | lr_scheduler.step(val_loss) 357 | lr = get_lr(optimizer) 358 | 359 | logger.info(f'EPOCH {cur_epoch}: TRAIN AVG.LOSS {tr_loss:.4f}, learning_rate: {lr:g} ' 360 | f'CROSSVAL AVG.LOSS {val_loss:.4f}, TIME USED {used_time:.2f}') 361 | 362 | if val_loss >= prev_val_loss: 363 | logger.info(f'The CROSSVAL AVG.LOSS does\'nt reduce, so we need to reload the last checkpoint') 364 | checkpoint = torch.load(prev_checkpoint_path) 365 | cur_epoch = checkpoint['epoch'] 366 | model.load_state_dict(checkpoint['model']) 367 | 368 | logger.info(f'Loaded checkpoint from {prev_checkpoint_path} succeed') 369 | else: 370 | state = { 371 | 'epoch': cur_epoch + 1, 372 | 'model': model.state_dict(), 373 | 'optimizer': optimizer.state_dict(), 374 | 'lr_scheduler': lr_scheduler.state_dict() 375 | } 376 | 377 | torch.save(state, 378 | f'{checkpoint_path}/{exp_name}_epoch{cur_epoch}_lrate{lr:g}_tr{tr_loss:.4f}_cv{val_loss:g}.tar') 379 | logger.info( 380 | f'Save state to {checkpoint_path}/{exp_name}_epoch{cur_epoch}_lrate{lr:g}_tr{tr_loss:.4f}_cv{val_loss:g}.tar succeed') 381 | prev_val_loss = val_loss 382 | prev_checkpoint_path = f'{checkpoint_path}/{exp_name}_epoch{cur_epoch}_lrate{lr:g}_tr{tr_loss:.4f}_cv{val_loss:g}.tar' 383 | 384 | # add a blank line for log readability 385 | print() 386 | sys.stdout.flush() 387 | 388 | 389 | def main(): 390 | parser = argparse.ArgumentParser() 391 | parser.add_argument('--base_dir', default='') 392 | parser.add_argument('--data', default='data', type=str, 393 | help='path to dataset contains inputs and targets') 394 | parser.add_argument('--log_dir', default='EMPHASIS', type=str, help='path to save checkpoint') 395 | parser.add_argument('--restore_from', default=None, type=str, 396 | help='the checkpoint such as xxx.tar restored from the log_dir you set') 397 | parser.add_argument('--model_type', default='acoustic', type=str, 398 | help='model type which is either acoustic or acoustic_mgc') 399 | parser.add_argument('--name', default='EMPHASIS', type=str, 400 | help='name of the experiment') 401 | 402 | args = parser.parse_args() 403 | logging.basicConfig(format='%(asctime)s %(filename)s %(levelname)s %(message)s', 404 | datefmt='%a, %d %b %Y %H:%M:%S', level=logging.INFO, 405 | stream=sys.stdout) 406 | 407 | epoch = 0 408 | model_type = args.model_type 409 | exp_name = args.name 410 | model = create_train_model(model_type) 411 | device = torch.device('cuda' if torch.cuda.is_available() else "cpu") 412 | 413 | if torch.cuda.device_count() >= 1: 414 | model = nn.DataParallel(model) 415 | model.to(device) 416 | 417 | if not os.path.exists(args.log_dir): 418 | os.mkdir(args.log_dir) 419 | 420 | checkpoint_path = os.path.join(args.log_dir, "checkpoint") 421 | if not os.path.exists(checkpoint_path): 422 | os.mkdir(checkpoint_path) 423 | optimizer = optim.Adam(model.parameters(), lr=hparams['initial_lr'], weight_decay=hparams['weight_decay']) 424 | lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True, 425 | min_lr=hparams['min_lr']) 426 | 427 | if args.restore_from is not None: 428 | # load the checkpoint ... 429 | cpkt_path = os.path.join(checkpoint_path, args.restore_from) 430 | if os.path.exists(cpkt_path): 431 | logger.info(f'Loading checkpoint from {cpkt_path} ...') 432 | 433 | checkpoint = torch.load(cpkt_path) 434 | epoch = checkpoint['epoch'] 435 | model.load_state_dict(checkpoint['model']) 436 | exp_name = cpkt_path.split('/')[-1].split("_epoch")[0] 437 | optimizer.load_state_dict(checkpoint['optimizer']) 438 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 439 | 440 | logger.info(f'Loaded checkpoint from {cpkt_path} succeed') 441 | else: 442 | logger.error(f'Checkpoint path:{checkpoint_path} does\'t exist!') 443 | 444 | train_model(args, model_type, model, optimizer, 445 | lr_scheduler, exp_name, device, epoch, checkpoint_path) 446 | 447 | 448 | if __name__ == '__main__': 449 | main() 450 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import struct 5 | import logging 6 | import numpy as np 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | with open('./hparams.json', 'r') as f: 13 | hparams = json.load(f) 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def pad(inputs, padding): 19 | return F.pad(inputs, (padding // 2, padding // 2 if padding % 2 == 0 else int(padding / 2 + 1))) 20 | 21 | 22 | def Conv1d(inputs, conv, is_training, batch_norm=None, activation=None, padding=None): 23 | # the Conv1d of pytroch chanages the channels at the 1 dim 24 | # [batch_size, max_time, feature_dims] -> [batch_size, feature_dims, max_time] 25 | inputs = torch.transpose(inputs, 1, 2) 26 | if padding is not None: 27 | inputs = pad(inputs, padding) 28 | 29 | conv1d_output = conv(inputs) 30 | if batch_norm is not None: 31 | batch_norm_output = batch_norm(conv1d_output) 32 | batch_norm_output = torch.transpose(batch_norm_output, 1, 2) 33 | else: 34 | batch_norm_output = torch.transpose(conv1d_output, 1, 2) 35 | if activation is not None: 36 | batch_norm_output = activation(batch_norm_output) 37 | return F.dropout(batch_norm_output, p=hparams["dropout_rate"], training=is_training) 38 | 39 | 40 | def MaxPool1d(inputs, maxpool, padding=None): 41 | if padding is not None: 42 | inputs = pad(inputs, padding) 43 | outputs = maxpool(inputs) 44 | return outputs 45 | 46 | 47 | def highwaynet(inputs, activation, units=128): 48 | H = F.linear(inputs, weight=torch.nn.init.normal_(torch.empty(units, inputs.size(2))).cuda()) 49 | H = activation[0](H) 50 | T = F.linear(inputs, weight=torch.nn.init.normal_(torch.empty(units, inputs.size(2)).cuda()), 51 | bias=nn.init.constant_(torch.empty(1, 1, units), -0.1).cuda()) 52 | T = activation[1](T) 53 | return H * T + inputs * (1.0 - T) 54 | 55 | 56 | class HighwayNet(nn.Module): 57 | def __init__(self, activation=None, units=128): 58 | super(HighwayNet, self).__init__() 59 | 60 | self.activation = activation 61 | self.H = nn.Linear(units, units) 62 | self.T = nn.Linear(units, units) 63 | torch.nn.init.constant_(self.T.bias, val=-1.0) 64 | 65 | def forward(self, input): 66 | H_output = self.H(input) 67 | if self.activation[0] is not None: 68 | H_output = self.activation[0](H_output) 69 | 70 | T_output = self.T(H_output) 71 | if self.activation[1] is not None: 72 | T_output = self.activation[1](T_output) 73 | 74 | return H_output * T_output + input * (1.0 - T_output) 75 | 76 | 77 | def calculate_cmvn(name, config_dir, output_dir, model_type): 78 | """Calculate mean and var.""" 79 | logger.info("Calculating mean and var of %s" % name) 80 | config_filename = open(os.path.join(config_dir, name + '.lst')) 81 | 82 | inputs_frame_count, labels_frame_count = 0, 0 83 | for line in config_filename: 84 | utt_id, inputs_path, labels_path = line.strip().split() 85 | logger.info("Reading utterance %s" % utt_id) 86 | inputs = read_binary_file(inputs_path, hparams['in_channels']) 87 | labels = read_binary_file(labels_path, hparams['target_channels'] if model_type == 'acoustic' else 88 | hparams['mgc_target_channels'], dtype=np.float64 if model_type == 'acoustic' 89 | else np.float32) 90 | if inputs_frame_count == 0: # create numpy array for accumulating 91 | ex_inputs = np.sum(inputs, axis=0) 92 | ex2_inputs = np.sum(inputs ** 2, axis=0) 93 | ex_labels = np.sum(labels, axis=0) 94 | ex2_labels = np.sum(labels ** 2, axis=0) 95 | else: 96 | ex_inputs += np.sum(inputs, axis=0) 97 | ex2_inputs += np.sum(inputs ** 2, axis=0) 98 | ex_labels += np.sum(labels, axis=0) 99 | ex2_labels += np.sum(labels ** 2, axis=0) 100 | inputs_frame_count += len(inputs) 101 | labels_frame_count += len(labels) 102 | 103 | mean_inputs = ex_inputs / inputs_frame_count 104 | stddev_inputs = np.sqrt(np.abs(ex2_inputs / inputs_frame_count - mean_inputs ** 2)) 105 | stddev_inputs[stddev_inputs < 1e-20] = 1e-20 106 | 107 | mean_labels = ex_labels / labels_frame_count 108 | stddev_labels = np.sqrt(np.abs(ex2_labels / labels_frame_count - mean_labels ** 2)) 109 | stddev_labels[stddev_labels < 1e-20] = 1e-20 110 | 111 | if model_type == 'acoustic': 112 | mean_labels[0] = 0.0 113 | stddev_labels[0] = 1.0 114 | elif model_type == 'acoustic_mgc': 115 | mean_labels[60] = 0.0 116 | stddev_labels[60] = 1.0 117 | 118 | if not os.path.exists(output_dir): 119 | os.mkdir(output_dir) 120 | 121 | cmvn_name = os.path.join(output_dir, name + "_cmvn.npz") 122 | np.savez(cmvn_name, 123 | mean_inputs=mean_inputs, 124 | stddev_inputs=stddev_inputs, 125 | mean_labels=mean_labels, 126 | stddev_labels=stddev_labels) 127 | config_filename.close() 128 | logger.info("Wrote to %s" % cmvn_name) 129 | 130 | 131 | def convert_to(name, config_dir, output_dir, model_type, apply_cmvn=True): 132 | if not os.path.exists(output_dir): 133 | os.mkdir(output_dir) 134 | if not os.path.exists(os.path.join(output_dir, name)): 135 | os.mkdir(os.path.join(output_dir, name)) 136 | if not os.path.exists(os.path.join(output_dir, name, 'label')): 137 | os.mkdir(os.path.join(output_dir, name, 'label')) 138 | if not os.path.exists(os.path.join(output_dir, name, 'cmp')): 139 | os.mkdir(os.path.join(output_dir, name, 'cmp')) 140 | cmvn = np.load(os.path.join(output_dir, "train_cmvn.npz")) 141 | config_file = open(config_dir + ".lst") 142 | for line in config_file: 143 | if name != 'test': 144 | utt_id, inputs_path, labels_path = line.strip().split() 145 | inputs_outdir = os.path.join(output_dir, name, 'label', f'{utt_id}.lab') 146 | labels_outdir = os.path.join(output_dir, name, 'cmp', f'{utt_id}.cmp') 147 | else: 148 | utt_id, inputs_path = line.strip().split() 149 | inputs_outdir = os.path.join(output_dir, name, 'label', f'{utt_id}.lab') 150 | 151 | logger.info(f'Writing utterance {utt_id} ...') 152 | inputs = read_binary_file(inputs_path, hparams['in_channels']).astype(np.float32) 153 | if name != 'test': 154 | labels = read_binary_file(labels_path, hparams['target_channels'] if model_type == 'acoustic' else 155 | hparams['mgc_target_channels'], dtype=np.float64 if model_type == 'acoustic' 156 | else np.float32).astype(np.float64 if model_type == 'acoustic' else np.float32) 157 | else: 158 | labels = None 159 | if apply_cmvn: 160 | inputs = (inputs - cmvn["mean_inputs"]) / cmvn["stddev_inputs"] 161 | write_binary_file(inputs, inputs_outdir) 162 | if labels is not None: 163 | labels = (labels - cmvn["mean_labels"]) / cmvn["stddev_labels"] 164 | write_binary_file(labels, labels_outdir) 165 | 166 | config_file.close() 167 | 168 | 169 | def read_binary_file(filename, dimension=None, dtype=np.float32): 170 | """Read data from matlab binary file (row, col and matrix). 171 | Returns: 172 | A numpy matrix containing data of the given binary file. 173 | """ 174 | if dimension is None: 175 | read_buffer = open(filename, 'rb') 176 | 177 | rows = 0; 178 | cols = 0 179 | rows = struct.unpack('