├── version.txt ├── deepaudio ├── speaker │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ ├── eer.py │ │ └── train.py │ ├── configs │ │ ├── __init__.py │ │ └── train.yaml │ ├── data │ │ ├── __init__.py │ │ ├── audio_io │ │ │ └── __init__.py │ │ ├── augmentation │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── spec_augment.py │ │ │ ├── noise.py │ │ │ └── configurations.py │ │ ├── feature │ │ │ ├── fbank │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration.py │ │ │ │ └── fbank.py │ │ │ ├── but_fbank │ │ │ │ ├── __init__.py │ │ │ │ └── configuration.py │ │ │ ├── utils.py │ │ │ └── __init__.py │ │ ├── dataloader.py │ │ ├── samplers.py │ │ └── dataset.py │ ├── metrics │ │ ├── __init__.py │ │ ├── utils.py │ │ └── eer.py │ ├── modules │ │ ├── __init__.py │ │ └── backbones │ │ │ ├── __init__.py │ │ │ ├── mmcl │ │ │ ├── __init__.py │ │ │ └── STP.py │ │ │ ├── clovaai │ │ │ ├── __init__.py │ │ │ ├── ResNetBlocks.py │ │ │ ├── ResNetSE34V2.py │ │ │ └── ResNetSE34L.py │ │ │ ├── wespeaker │ │ │ ├── __init__.py │ │ │ ├── speaker_model.py │ │ │ └── tdnn.py │ │ │ ├── ecapa.py │ │ │ └── resnet.py │ ├── dataclass │ │ ├── __init__.py │ │ └── initialize.py │ ├── models │ │ ├── ecapa │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── resnet │ │ │ ├── __init__.py │ │ │ ├── configurations.py │ │ │ └── model.py │ │ ├── clovaai_ecapa │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── mmcl_seresnet34 │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── wespeaker_model │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── clovaai_resnetse34l │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── clovaai_resnetse34v2 │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── inference.py │ │ ├── __init__.py │ │ ├── speaker_embedding_model.py │ │ └── speaker_model.py │ ├── datasets │ │ ├── dataframe │ │ │ ├── __init__.py │ │ │ ├── configurations.py │ │ │ ├── lit_data_module.py │ │ │ └── utils.py │ │ ├── voxceleb2 │ │ │ ├── __init__.py │ │ │ ├── preprocess.py │ │ │ ├── configurations.py │ │ │ └── lit_data_module.py │ │ ├── utils.py │ │ └── __init__.py │ ├── criterion │ │ ├── aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── aamsoftmax.py │ │ ├── adaptive_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── aamsoftmax.py │ │ ├── pyannote_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── aamsoftmax.py │ │ ├── subcenter_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── subcenter_aamsoftmax.py │ │ ├── adaptive_subcenter_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── subcenter_aamsoftmax.py │ │ ├── .DS_Store │ │ └── __init__.py │ ├── version.py │ ├── .DS_Store │ └── optim │ │ ├── scheduler │ │ ├── fix_lr_scheduler.py │ │ ├── lr_scheduler.py │ │ ├── step_lr_scheduler.py │ │ ├── __init__.py │ │ ├── warmup_scheduler.py │ │ ├── reduce_lr_on_plateau_scheduler.py │ │ ├── warmup_reduce_lr_on_plateau_scheduler.py │ │ ├── transformer_lr_scheduler.py │ │ ├── warmup_steplr_scheduler.py │ │ ├── warmup_adaptive_loss_reduce_lr_on_plateau_scheduler.py │ │ └── tri_stage_lr_scheduler.py │ │ ├── __init__.py │ │ ├── optimizer.py │ │ ├── adamp.py │ │ ├── radam.py │ │ └── novograd.py ├── __init__.py └── .DS_Store ├── requirements.txt ├── run.sh ├── run3.sh ├── run2.sh ├── setup.py ├── .gitignore ├── setup.cfg └── Readme.md /version.txt: 
-------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /deepaudio/speaker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/audio_io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/dataclass/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/ecapa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/resnet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/fbank/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/deepaudio/speaker/data/feature/but_fbank/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_ecapa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/mmcl_seresnet34/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/wespeaker_model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/mmcl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34l/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34v2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/wespeaker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' 2 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/pyannote_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/subcenter_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_subcenter_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /deepaudio/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepaudio/deepaudio-speaker/HEAD/deepaudio/.DS_Store -------------------------------------------------------------------------------- /deepaudio/speaker/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-speaker/HEAD/deepaudio/speaker/.DS_Store -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-speaker/HEAD/deepaudio/speaker/criterion/.DS_Store -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | 4 | def get_all_wavs(parent_dir): 5 | return glob.glob(f'{parent_dir}/**/*.wav', recursive=True) 6 | -------------------------------------------------------------------------------- /deepaudio/speaker/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - feature: fbank 5 | - augment: default 6 | - dataset: voxceleb2 7 | - criterion: pyannote_aamsoftmax 8 | - lr_scheduler: steplr 9 | - model: clovaai_ecapa 10 | - trainer: cpu 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hydra-core 2 | librosa 3 | pyannote.core >=4.1,<5.0 4 | pytorch-lightning >= 1.4,<1.5 5 | pytorch_metric_learning >=0.9.98 6 | soundfile >=0.10.2,<0.11 7 | torch >=1.8.1,<1.9 8 | torch-audiomentations >=0.6.0 9 | torchaudio >=0.8.1,<0.9 10 | typing_extensions >=3.7.4.3 11 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | def get_subdirs(directory): 5 | directory = Path(directory) 6 | return directory.glob('*/') 7 | 8 | 9 | def get_all_wavs(directory): 10 | directory = Path(directory) 11 | return list(directory.glob('**/*.wav')) 12 | 13 | -------------------------------------------------------------------------------- /deepaudio/speaker/metrics/utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | def get_all_wavs(trials): 3 | uris = set() 4 | for uri_enroll, uri_test, _ in trials: 5 | uris.add(uri_enroll) 6 | uris.add(uri_test) 7 | return set(uris) 8 | 9 | 10 | def get_all_embeddings(model, wav_trials): 11 | embedding = dict() 12 | for uri in tqdm(wav_trials): 13 | embedding[uri] = model.make_embedding(uri) 14 | return embedding 15 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | EPSILON = 1e-6 5 | 6 | 7 | class CMVN(nn.Module): 8 | def __init__(self, var_norm=False): 9 | super(CMVN, self).__init__() 10 | self.var_norm = var_norm 11 | 12 | def forward(self, x): 13 | mean = x.mean(dim=1, keepdims=True) 14 | if self.var_norm: 15 | std = torch.sqrt(x.var(dim=1, keepdims=True) + EPSILON) 16 | x = x - mean 17 | if self.var_norm: 18 | x /= 
std 19 | return x 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class AAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | 15 | scale: float = field( 16 | default=32, metadata={"help": "The scale for loss."} 17 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/pyannote_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class PyannoteAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="pyannote_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | 15 | scale: float = field( 16 | default=32, metadata={"help": "The scale for loss."} 17 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/preprocess.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from ..utils import get_subdirs, get_all_wavs 4 | 5 | 6 | def get_speaker_list(configs): 7 | data_dir = configs.dataset.dataset_path 8 | data_dir = configs.dataset.dataset_path 9 | speaker_dirs = get_subdirs(data_dir) 10 | speakers = [d.stem for d in speaker_dirs] 11 | spk2id = {k: v for v, k in enumerate(speakers)} 12 | return speakers, spk2id 13 | 14 | 15 | def get_speaker_wavs(data_dir, speakers): 16 | speaker2wav = {} 17 | for spk in speakers: 18 | spk_dir = Path(data_dir) / spk 19 | speaker2wav[spk] = get_all_wavs(spk_dir) 20 | return speaker2wav 21 | 22 | -------------------------------------------------------------------------------- /deepaudio/speaker/cli/eer.py: -------------------------------------------------------------------------------- 1 | from deepaudio.speaker.datasets.dataframe.utils import load_trial_dataframe, get_dataset_items 2 | from deepaudio.speaker.models.inference import Inference 3 | from deepaudio.speaker.metrics.eer import model_eer 4 | 5 | trial_meta = get_dataset_items('/home/amax/audio/deepaudio-database/database.yml', 6 | 'voxceleb1_o', 'trial') 7 | print(trial_meta[0]) 8 | wav_dir, trial_path = trial_meta[0] 9 | trials = load_trial_dataframe(wav_dir, trial_path) 10 | inference = Inference('/home/amax/audio/deepaudio-speaker/outputs/2021-10-26/00-37-08/logs/default/version_0/checkpoints/deepaudio-epoch=19-val_loss=2.33.ckpt') 11 | print(model_eer(inference, trials)) -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/subcenter_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class 
SubcenterAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="subcenter_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | K: int = field( 15 | default=3, metadata={"help": "The number of subcenter."} 16 | ) 17 | scale: float = field( 18 | default=32, metadata={"help": "The scale for loss."} 19 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class AdaptiveAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="adaptive_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.3, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | 15 | scale: float = field( 16 | default=32, metadata={"help": "The scale for loss."} 17 | ) 18 | increase_steps: int = field( 19 | default=50000, metadata={"help": "The increase step for margin."} 20 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_ecapa/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.clovaai.ECAPA_TDNN import MainModel 7 | 8 | from .configurations import ClovaaiECAPAConfigs 9 | 10 | 11 | @register_model('clovaai_ecapa', dataclass=ClovaaiECAPAConfigs) 12 | class ClovaaiECAPAModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ClovaaiECAPAModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/wespeaker_model/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.wespeaker.speaker_model import MainModel 7 | 8 | from .configurations import WespeakerModelConfigs 9 | 10 | 11 | @register_model('wespeaker_model', dataclass=WespeakerModelConfigs) 12 | class WespeakerModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(WespeakerModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/mmcl_seresnet34/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from 
deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.mmcl.seresnet_asv import MainModel 7 | 8 | from .configurations import MMCLSeResnet34Configs 9 | 10 | 11 | @register_model('mmcl_seresnet34', dataclass=MMCLSeResnet34Configs) 12 | class MMCLSeResnet34Model(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(MMCLSeResnet34Model, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/configurations.py: -------------------------------------------------------------------------------- 1 | from omegaconf import MISSING 2 | from dataclasses import dataclass, field 3 | 4 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 5 | @dataclass 6 | class Voxceleb2Configs(DeepMMDataclass): 7 | """ Configuration dataclass that common used """ 8 | name: str = field( 9 | default="voxceleb2", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} 10 | ) 11 | dataset_path: str = field( 12 | default="/Users/yin/project/data/aac4", metadata={"help": "Path of dataset"} 13 | ) 14 | sampler: str = field( 15 | default="clovaai", metadata={"help": "Sampler name."} 16 | ) 17 | per_speaker: int = field( 18 | default=3, metadata={"help": "Number of utterances per speaker."} 19 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34l/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.clovaai.ResNetSE34L import MainModel 7 | 8 | from .configurations import ClovaaiResnetse34lConfigs 9 | 10 | 11 | @register_model('clovaai_resnetse34l', dataclass=ClovaaiResnetse34lConfigs) 12 | class ClovaaiResnetse34lModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ClovaaiResnetse34lModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34v2/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.clovaai.ResNetSE34V2 import MainModel 7 | 8 | from .configurations import ClovaaiResnetse34V2Configs 9 | 10 | 11 | @register_model('clovaai_resnetse34v2', dataclass=ClovaaiResnetse34V2Configs) 12 | class ClovaaiResnetSE34V2Model(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ClovaaiResnetSE34V2Model, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | --------------------------------------------------------------------------------
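Note: the model files above all follow the same pattern: a thin subclass of SpeakerEmbeddingModel is registered under a string key together with its configuration dataclass, and the training entry point later resolves that key from the Hydra config. A minimal sketch of how such a registered model is instantiated, mirroring the lookup done in deepaudio/speaker/cli/train.py shown further below (this snippet is illustrative and not a file from the repository):

    from deepaudio.speaker.models import MODEL_REGISTRY

    # `configs` is the Hydra DictConfig; passing e.g. model=clovaai_resnetse34v2
    # on the command line makes configs.model.name the registry key.
    model_cls = MODEL_REGISTRY[configs.model.name]
    model = model_cls(configs=configs, num_classes=data_module.num_classes)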
/deepaudio/speaker/criterion/adaptive_subcenter_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class AdaptiveSubcenterAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="adaptive_subcenter_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | K: int = field( 15 | default=3, metadata={"help": "The number of subcenter."} 16 | ) 17 | scale: float = field( 18 | default=32, metadata={"help": "The scale for loss."} 19 | ) 20 | increase_steps: int = field( 21 | default=50000, metadata={"help": "The increase step for margin."} 22 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/ecapa/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.ecapa import ECAPA_TDNN 7 | 8 | from .configurations import ECAPAConfigs 9 | 10 | 11 | @register_model('ecapa', dataclass=ECAPAConfigs) 12 | class ECAPAModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ECAPAModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = ECAPA_TDNN( 18 | in_channels=self.configs.feature.n_mels, 19 | channels=self.configs.model.channels, 20 | embed_dim=self.configs.model.embed_dim 21 | ) 22 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/ecapa/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ECAPAConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="ecapa", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=192, metadata={"help": "Dimension of embedding."} 12 | ) 13 | channels: int = field( 14 | default=1024, metadata={"help": "Number of channels."} 15 | ) 16 | optimizer: str = field( 17 | default="adam", metadata={"help": "Optimizer for training."} 18 | ) 19 | min_num_frames: int = field( 20 | default=300, metadata={"help": "Min num frames."} 21 | ) 22 | max_num_frames: int = field( 23 | default=400, metadata={"help": "Max num frames."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/resnet/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class Resnet101Configs(DeepMMDataclass): 7 | name: str = field( 8 | default="resnet101", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | optimizer: str = field( 14 | default="adam", metadata={"help": "Optimizer for training."} 15 | ) 16 | min_num_frames: 
int = field( 17 | default=300, metadata={"help": "Min num frames."} 18 | ) 19 | max_num_frames: int = field( 20 | default=400, metadata={"help": "Max num frames."} 21 | ) 22 | squeeze_excitation: bool = field( 23 | default=False, metadata={"help": "Flag for squeeze-and-excitation blocks."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34l/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ClovaaiResnetse34lConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="clovaai_resnetse34l", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | encoder_type: str = field( 14 | default="SAP", metadata={"help": "Encoder type."} 15 | ) 16 | optimizer: str = field( 17 | default="adam", metadata={"help": "Optimizer for training."} 18 | ) 19 | min_num_frames: int = field( 20 | default=300, metadata={"help": "Min num frames."} 21 | ) 22 | max_num_frames: int = field( 23 | default=400, metadata={"help": "Max num frames."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34v2/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ClovaaiResnetse34V2Configs(DeepMMDataclass): 7 | name: str = field( 8 | default="clovaai_resnetse34v2", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | encoder_type: str = field( 14 | default="SAP", metadata={"help": "Encoder type."} 15 | ) 16 | optimizer: str = field( 17 | default="adam", metadata={"help": "Optimizer for training."} 18 | ) 19 | min_num_frames: int = field( 20 | default=300, metadata={"help": "Min num frames."} 21 | ) 22 | max_num_frames: int = field( 23 | default=400, metadata={"help": "Max num frames."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/fbank/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class FBankConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="fbank", metadata={"help": "Name of feature transform."} 9 | ) 10 | sample_rate: int = field( 11 | default=16000, metadata={"help": "Sampling rate of audio"} 12 | ) 13 | frame_duration: float = field( 14 | default=0.025, metadata={"help": "Frame length for spectrogram"} 15 | ) 16 | frame_shift: float = field( 17 | default=0.01, metadata={"help": "Length of hop between STFT"} 18 | ) 19 | n_mels: int = field( 20 | default=80, metadata={"help": "Number of mel filterbanks."} 21 | ) 22 | var_norm: bool = field( 23 | default=False, metadata={"help": "Flag for cmvn"} 24 | ) 25 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/but_fbank/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses 
import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ButFBankConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="fbank", metadata={"help": "Name of feature transform."} 9 | ) 10 | sample_rate: int = field( 11 | default=16000, metadata={"help": "Sampling rate of audio"} 12 | ) 13 | frame_duration: float = field( 14 | default=0.02, metadata={"help": "Frame length for spectrogram"} 15 | ) 16 | frame_shift: float = field( 17 | default=0.01, metadata={"help": "Length of hop between STFT"} 18 | ) 19 | n_mels: int = field( 20 | default=80, metadata={"help": "Number of mel filterbanks."} 21 | ) 22 | var_norm: bool = field( 23 | default=False, metadata={"help": "Flag for cmvn"} 24 | ) 25 | -------------------------------------------------------------------------------- /deepaudio/speaker/metrics/eer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import roc_curve 3 | from scipy.spatial import distance 4 | 5 | from .utils import get_all_wavs, get_all_embeddings 6 | 7 | 8 | def compute_eer(y, y_pred, pos_label=1): 9 | fpr, tpr, threshold = roc_curve(y, y_pred, pos_label=pos_label) 10 | fnr = 1 - tpr 11 | eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))] 12 | eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))] 13 | return eer, eer_threshold 14 | 15 | 16 | def model_eer(model, trials): 17 | wav_trials = get_all_wavs(trials) 18 | embeddings = get_all_embeddings(model, wav_trials) 19 | ys = [] 20 | y_preds = [] 21 | for uri_enroll, uri_test, y in trials: 22 | y_pred = 1 - distance.cosine(embeddings[uri_enroll], embeddings[uri_test]) 23 | y_preds.append(y_pred) 24 | ys.append(y) 25 | return compute_eer(ys, y_preds) 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/resnet/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.resnet import ResNet, Bottleneck 7 | 8 | from .configurations import Resnet101Configs 9 | 10 | 11 | @register_model('resnet101', dataclass=Resnet101Configs) 12 | class Resnet101Model(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(Resnet101Model, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = ResNet( 18 | Bottleneck, 19 | [3, 4, 23, 3], 20 | feat_dim=self.configs.feature.n_mels, 21 | embed_dim=self.configs.model.embed_dim, 22 | squeeze_excitation=self.configs.model.squeeze_excitation 23 | ) 24 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/aamsoftmax/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | from torch import Tensor 4 | from omegaconf import DictConfig 5 | 6 | from pytorch_metric_learning.losses import ArcFaceLoss 7 | 8 | from .. 
import register_criterion 9 | from ..aamsoftmax.configuration import AAMSoftmaxConfigs 10 | 11 | 12 | def radian2degree(radian): 13 | return math.degrees(radian) 14 | 15 | 16 | @register_criterion("aamsoftmax", dataclass=AAMSoftmaxConfigs) 17 | class AAMSoftmax(nn.Module): 18 | def __init__(self, 19 | configs: DictConfig, 20 | num_classes: int, 21 | embedding_size: int 22 | ) -> None: 23 | super(AAMSoftmax, self).__init__() 24 | self.arcface_loss = ArcFaceLoss( 25 | num_classes, 26 | embedding_size, 27 | margin=radian2degree(configs.criterion.margin), 28 | scale=configs.criterion.scale 29 | ) 30 | 31 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 32 | return self.arcface_loss(embeddings, targets) 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/wespeaker_model/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class WespeakerModelConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="ResNet34", metadata={"help": "Model name"} 10 | ) 11 | embed_dim: int = field( 12 | default=256, metadata={"help": "Dimension of embedding."} 13 | ) 14 | pooling_func: str = field( 15 | default="TSTP", metadata={"help": "Pooling function for model."} 16 | ) 17 | optimizer: str = field( 18 | default="adam", metadata={"help": "Optimizer for training."} 19 | ) 20 | min_num_frames: int = field( 21 | default=200, metadata={"help": "Min num frames."} 22 | ) 23 | max_num_frames: int = field( 24 | default=300, metadata={"help": "Max num frames."} 25 | ) 26 | pretrained: bool = field( 27 | default=False, metadata={"help": "Use pretrained model or not."} 28 | ) 29 | checkpoint: str = field( 30 | default="None", metadata={"help": "Checkpoint path."} 31 | ) 32 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 deepaudio-speaker-train \ 2 | dataset=dataframe \ 3 | dataset.database_yml=/home/amax/audio/deepaudio-database/database.yml \ 4 | dataset.dataset_name=voxceleb2_dev \ 5 | model=clovaai_ecapa \ 6 | model.channels=1024 \ 7 | model.embed_dim=256 \ 8 | model.min_num_frames=200 \ 9 | model.max_num_frames=300 \ 10 | feature=fbank \ 11 | lr_scheduler=warmup_adaptive_reduce_lr_on_plateau \ 12 | lr_scheduler.warmup_steps=30000 \ 13 | lr_scheduler.lr_factor=0.8 \ 14 | trainer=gpu \ 15 | trainer.batch_size=128 \ 16 | trainer.max_epochs=30 \ 17 | trainer.num_workers=8 \ 18 | trainer.num_checkpoints=30 \ 19 | criterion=adaptive_aamsoftmax \ 20 | criterion.increase_steps=300000 \ 21 | augment.apply_spec_augment=True\ 22 | augment.time_mask_num=1 \ 23 | augment.apply_noise_augment=True \ 24 | augment.apply_reverb_augment=True \ 25 | augment.apply_noise_reverb_augment=True \ 26 | augment.noise_augment_weight=2 \ 27 | augment.noise_dataset_dir=/data/share/data/musan \ 28 | augment.rir_dataset_dir=/data/share/data/RIRS_NOISES/simulated_rirs/ \ 29 | -------------------------------------------------------------------------------- /run3.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 deepaudio-speaker-train \ 2 | dataset=dataframe \ 3 | dataset.database_yml=/home/amax/audio/deepaudio-database/database.yml \ 4 | 
dataset.dataset_name=voxceleb2_dev \ 5 | model=clovaai_ecapa \ 6 | model.channels=1024 \ 7 | model.embed_dim=256 \ 8 | model.min_num_frames=200 \ 9 | model.max_num_frames=300 \ 10 | feature=fbank \ 11 | lr_scheduler=warmup_step_lr \ 12 | lr_scheduler.warmup_steps=30000 \ 13 | lr_scheduler.step_size=60000 \ 14 | lr_scheduler.freeze_steps=500000 \ 15 | lr_scheduler.lr_factor=0.8 \ 16 | trainer=gpu \ 17 | trainer.batch_size=128 \ 18 | trainer.max_epochs=30 \ 19 | trainer.num_workers=8 \ 20 | trainer.num_checkpoints=30 \ 21 | criterion=adaptive_aamsoftmax \ 22 | criterion.increase_steps=300000 \ 23 | augment.apply_spec_augment=True\ 24 | augment.time_mask_num=1 \ 25 | augment.apply_noise_augment=True \ 26 | augment.apply_reverb_augment=True \ 27 | augment.apply_noise_reverb_augment=True \ 28 | augment.noise_augment_weight=2 \ 29 | augment.noise_dataset_dir=/data/share/data/musan \ 30 | augment.rir_dataset_dir=/data/share/data/RIRS_NOISES/simulated_rirs/ \ 31 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/configurations.py: -------------------------------------------------------------------------------- 1 | from omegaconf import MISSING 2 | from dataclasses import dataclass, field 3 | 4 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 5 | @dataclass 6 | class DataframeConfigs(DeepMMDataclass): 7 | """ Configuration dataclass that common used """ 8 | name: str = field( 9 | default="dataframe", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} 10 | ) 11 | database_yml: str = field( 12 | default="/Users/yin/project/deepaudio-database/database.yml", metadata={"help": "Path of database.yml"} 13 | ) 14 | dataset_name: str = field( 15 | default="debug", metadata={"help": "Database name. 
If you want use multiple dataset, please use ',' to split"} 16 | ) 17 | sampler: str = field( 18 | default="clovaai", metadata={"help": "Sampler name."} 19 | ) 20 | duration: float = field( 21 | default=4, metadata={"help": "Sliding window duration."} 22 | ) 23 | step: float = field( 24 | default=2, metadata={"help": "Sliding window step."} 25 | ) 26 | exhaustive: bool = field( 27 | default=True, metadata={"help": "exhaustive mode."} 28 | ) -------------------------------------------------------------------------------- /run2.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 deepaudio-speaker-train \ 2 | dataset=dataframe \ 3 | dataset.database_yml=/home/amax/audio/deepaudio-database/database.yml \ 4 | dataset.dataset_name=voxceleb2_dev \ 5 | model=clovaai_ecapa \ 6 | model.channels=1024 \ 7 | model.embed_dim=256 \ 8 | model.min_num_frames=300 \ 9 | model.max_num_frames=500 \ 10 | model.pretrained=True \ 11 | model.checkpoint=/home/amax/audio/deepaudio-speaker/ckpts/epoch_20.ckpt \ 12 | feature=fbank \ 13 | lr_scheduler=steplr \ 14 | lr_scheduler.peak_lr=0.00001 \ 15 | lr_scheduler.init_lr=0.00001 \ 16 | lr_scheduler.step_size=30000 \ 17 | lr_scheduler.lr_factor=0.6 \ 18 | trainer=gpu \ 19 | trainer.batch_size=256 \ 20 | trainer.max_epochs=10 \ 21 | trainer.num_workers=8 \ 22 | trainer.num_checkpoints=30 \ 23 | criterion=pyannote_aamsoftmax \ 24 | criterion.margin=0.35 \ 25 | augment.apply_spec_augment=False\ 26 | augment.time_mask_num=1 \ 27 | augment.apply_noise_augment=True \ 28 | augment.apply_reverb_augment=True \ 29 | augment.apply_noise_reverb_augment=True \ 30 | augment.noise_augment_weight=2 \ 31 | augment.noise_dataset_dir=/data/share/data/musan \ 32 | augment.rir_dataset_dir=/data/share/data/RIRS_NOISES/simulated_rirs/ \ 33 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/inference.py: -------------------------------------------------------------------------------- 1 | from typing import Text, Union 2 | from pathlib import Path 3 | 4 | import torch 5 | 6 | from pytorch_lightning.utilities.cloud_io import load as pl_load 7 | 8 | from deepaudio.speaker.data.audio_io.with_torchaudio import Audio 9 | from deepaudio.speaker.data.feature import AUDIO_FEATURE_TRANSFORM_REGISTRY 10 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 11 | 12 | 13 | class Inference: 14 | def __init__( 15 | self, 16 | path_for_pl: Union[Text, Path], 17 | device: torch.device = None, 18 | strict: bool = False 19 | ): 20 | loaded_ckpt = pl_load(str(path_for_pl)) 21 | configs = loaded_ckpt["configs"] 22 | self.model = SpeakerEmbeddingModel.from_pretrained(str(path_for_pl), device, strict).eval().cuda() 23 | self.audio = Audio() 24 | self.feature_extractor = AUDIO_FEATURE_TRANSFORM_REGISTRY[configs.feature.name](configs).cuda() 25 | 26 | def make_embedding(self, wav, seg=None): 27 | if seg is None: 28 | waveform, _ = self.audio(wav) 29 | else: 30 | waveform, _ = self.audio.crop(wav, seg) 31 | feature = self.feature_extractor(waveform.cuda()) 32 | return self.model.make_embedding(feature) 33 | -------------------------------------------------------------------------------- /deepaudio/speaker/cli/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hydra 3 | import pytorch_lightning as pl 4 | from omegaconf import DictConfig, OmegaConf 5 | from pytorch_lightning.utilities import rank_zero_info 6 
| 7 | from deepaudio.speaker.datasets import DATA_MODULE_REGISTRY 8 | from deepaudio.speaker.dataclass.initialize import hydra_train_init 9 | from deepaudio.speaker.models import MODEL_REGISTRY 10 | from deepaudio.speaker.utils import parse_configs, get_pl_trainer 11 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 12 | 13 | 14 | 15 | @hydra.main(config_path=os.path.join("..", "configs"), config_name="train") 16 | def hydra_main(configs: DictConfig) -> None: 17 | rank_zero_info(OmegaConf.to_yaml(configs)) 18 | pl.seed_everything(configs.trainer.seed) 19 | logger, num_devices = parse_configs(configs) 20 | 21 | data_module = DATA_MODULE_REGISTRY[configs.dataset.name](configs) 22 | data_module.prepare_data() 23 | if configs.model.pretrained is True: 24 | model = SpeakerEmbeddingModel.from_pretrained(configs.model.checkpoint, configs=configs) 25 | else: 26 | model = MODEL_REGISTRY[configs.model.name](configs=configs, num_classes=data_module.num_classes) 27 | trainer = get_pl_trainer(configs, num_devices, logger) 28 | trainer.fit(model, data_module) 29 | 30 | 31 | def main(): 32 | hydra_train_init() 33 | hydra_main() 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | DATA_MODULE_REGISTRY = dict() 5 | 6 | 7 | def register_data_module(name: str): 8 | """ 9 | New data module types can be added to OpenSpeech with the :func:`register_data_module` function decorator. 10 | 11 | For example:: 12 | @register_data_module('ksponspeech') 13 | class LightningKsponSpeechDataModule: 14 | (...) 15 | 16 | .. note:: All vocabs must implement the :class:`cls.__name__` interface. 17 | 18 | Args: 19 | name (str): the name of the vocab 20 | """ 21 | 22 | def register_data_module_cls(cls): 23 | if name in DATA_MODULE_REGISTRY: 24 | raise ValueError(f"Cannot register duplicate data module ({name})") 25 | DATA_MODULE_REGISTRY[name] = cls 26 | return cls 27 | 28 | return register_data_module_cls 29 | 30 | 31 | data_module_dir = os.path.dirname(__file__) 32 | for file in os.listdir(data_module_dir): 33 | if os.path.isdir(os.path.join(data_module_dir, file)) and file != '__pycache__': 34 | for subfile in os.listdir(os.path.join(data_module_dir, file)): 35 | path = os.path.join(data_module_dir, file, subfile) 36 | if subfile.endswith(".py"): 37 | data_module_name = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 38 | module = importlib.import_module(f"deepaudio.speaker.datasets.{file}.{data_module_name}") 39 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/fbank/fbank.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torchaudio.transforms import MelSpectrogram 4 | 5 | from ..utils import CMVN 6 | from .configuration import FBankConfigs 7 | from .. 
import register_audio_feature_transform 8 | 9 | EPSILON = 1e-6 10 | 11 | 12 | @register_audio_feature_transform("fbank", dataclass=FBankConfigs) 13 | class Fbank(nn.Module): 14 | def __init__(self, configs): 15 | super(Fbank, self).__init__() 16 | win_length = int(configs.feature.sample_rate * configs.feature.frame_duration) 17 | hop_length = int(configs.feature.sample_rate * configs.feature.frame_shift) 18 | self.melSpectrogram = MelSpectrogram(sample_rate=configs.feature.sample_rate, 19 | n_mels=configs.feature.n_mels, 20 | n_fft=512, 21 | win_length=win_length, 22 | hop_length=hop_length, 23 | window_fn=torch.hann_window) 24 | self.cmvn = CMVN(var_norm=configs.feature.var_norm) 25 | self.input_dim = configs.feature.n_mels 26 | 27 | def forward(self, waveform): 28 | mel_spectrogram = self.melSpectrogram(waveform) 29 | mel_spectrogram = torch.log(mel_spectrogram + EPSILON) 30 | mel_spectrogram = mel_spectrogram.transpose(1, 2) 31 | return self.cmvn(mel_spectrogram) 32 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/fix_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dataclasses import dataclass, field 3 | from typing import Optional 4 | from omegaconf import DictConfig 5 | from torch.optim import Optimizer 6 | 7 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 8 | from deepaudio.speaker.optim.scheduler import register_scheduler 9 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 10 | 11 | 12 | @dataclass 13 | class FixLRSchedulerConfigs(LearningRateSchedulerConfigs): 14 | scheduler_name: str = field( 15 | default="fix", metadata={"help": "Name of learning rate scheduler."} 16 | ) 17 | peak_lr: float = field( 18 | default=1e-04, metadata={"help": "Maximum learning rate."} 19 | ) 20 | 21 | 22 | @register_scheduler("fix", dataclass=FixLRSchedulerConfigs) 23 | class FixLRScheduler(LearningRateScheduler): 24 | """ 25 | Warmup learning rate until `total_steps` 26 | 27 | Args: 28 | optimizer (Optimizer): wrapped optimizer. 29 | configs (DictConfig): configuration set. 
30 | """ 31 | def __init__( 32 | self, 33 | optimizer: Optimizer, 34 | configs: DictConfig, 35 | ) -> None: 36 | super(FixLRScheduler, self).__init__(optimizer, configs.lr_scheduler.peak_lr) 37 | self.lr = configs.lr_scheduler.peak_lr 38 | 39 | def step(self): 40 | self.set_lr(self.optimizer, self.lr) 41 | return self.lr 42 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/mmcl_seresnet34/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class MMCLSeResnet34Configs(DeepMMDataclass): 7 | name: str = field( 8 | default="mmcl_seresnet34", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | in_channels: int = field( 14 | default=1, metadata={"help": "In channel."} 15 | ) 16 | stem_channels: int = field( 17 | default=32, metadata={"help": "Stem channel."} 18 | ) 19 | base_channels: int = field( 20 | default=32, metadata={"help": "Base channel."} 21 | ) 22 | depth: int = field( 23 | default=34, metadata={"help": "Depth."} 24 | ) 25 | out_bn: bool = field( 26 | default=True, metadata={"help": "Flag for batch normalization in embedding layer."} 27 | ) 28 | num_stages: int = field( 29 | default=4, metadata={"help": "Number of stages"} 30 | ) 31 | out_indices: int = field( 32 | default=3, metadata={"help": "Out indices"} 33 | ) 34 | norm_cfg_type: str = field( 35 | default='BN', metadata={"help": "Norm type"} 36 | ) 37 | optimizer: str = field( 38 | default="adam", metadata={"help": "Optimizer for training."} 39 | ) 40 | min_num_frames: int = field( 41 | default=300, metadata={"help": "Min num frames."} 42 | ) 43 | max_num_frames: int = field( 44 | default=400, metadata={"help": "Max num frames."} 45 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_ecapa/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ClovaaiECAPAConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="clovaai_ecapa", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=192, metadata={"help": "Dimension of embedding."} 12 | ) 13 | channels: int = field( 14 | default=512, metadata={"help": "Dimension of embedding."} 15 | ) 16 | model_scale: int = field( 17 | default=8, metadata={"help": "Model scale."} 18 | ) 19 | context: bool = field( 20 | default=True, metadata={"help": "Context."} 21 | ) 22 | summed: bool = field( 23 | default=True, metadata={"help": "Summed."} 24 | ) 25 | out_bn: bool = field( 26 | default=True, metadata={"help": "Flag for batch normalization in embedding layer."} 27 | ) 28 | encoder_type: str = field( 29 | default="ECA", metadata={"help": "Encoder type."} 30 | ) 31 | optimizer: str = field( 32 | default="adam", metadata={"help": "Optimizer for training."} 33 | ) 34 | min_num_frames: int = field( 35 | default=200, metadata={"help": "Min num frames."} 36 | ) 37 | max_num_frames: int = field( 38 | default=400, metadata={"help": "Max num frames."} 39 | ) 40 | pretrained: bool = field( 41 | default=False, metadata={"help": "Use pretrained model or not."} 42 | ) 43 | 
checkpoint: str = field( 44 | default="None", metadata={"help": "Checkpoint path."} 45 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/wespeaker/speaker_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import deepaudio.speaker.modules.backbones.wespeaker.tdnn as tdnn 16 | import deepaudio.speaker.modules.backbones.wespeaker.ecapa_tdnn as ecapa_tdnn 17 | import deepaudio.speaker.modules.backbones.wespeaker.resnet as resnet 18 | import deepaudio.speaker.modules.backbones.wespeaker.repvgg as repvgg 19 | 20 | 21 | def get_speaker_model(model_name: str): 22 | if model_name.startswith("XVEC"): 23 | return getattr(tdnn, model_name) 24 | elif model_name.startswith("ECAPA_TDNN"): 25 | return getattr(ecapa_tdnn, model_name) 26 | elif model_name.startswith("ResNet"): 27 | return getattr(resnet, model_name) 28 | elif model_name.startswith("REPVGG"): 29 | return getattr(repvgg, model_name) 30 | else: # model_name error !!! 31 | print(model_name + " not found !!!") 32 | exit(1) 33 | 34 | 35 | def MainModel(configs): 36 | model_class = get_speaker_model(configs.model.name) 37 | model = model_class(feat_dim=configs.feature.n_mels, 38 | embed_dim=configs.model.embed_dim, 39 | pooling_func=configs.model.pooling_func) 40 | return model 41 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/spec_augment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from torch import Tensor 4 | 5 | 6 | class SpecAugment(object): 7 | """ 8 | Provides Spec Augment. A simple data augmentation method for speech recognition. 9 | This concept proposed in https://arxiv.org/abs/1904.08779 10 | 11 | Args: 12 | freq_mask_para (int): maximum frequency masking length 13 | time_mask_num (int): how many times to apply time masking 14 | freq_mask_num (int): how many times to apply frequency masking 15 | 16 | Inputs: feature_vector 17 | - **feature_vector** (torch.FloatTensor): feature vector from audio file. 18 | 19 | Returns: feature_vector: 20 | - **feature_vector**: masked feature vector. 
21 | """ 22 | 23 | def __init__(self, configs) -> None: 24 | self.freq_mask_para = configs.augment.freq_mask_para 25 | self.time_mask_num = configs.augment.time_mask_num 26 | self.freq_mask_num = configs.augment.freq_mask_num 27 | 28 | def __call__(self, feature: Tensor) -> Tensor: 29 | """ Provides SpecAugmentation for audio """ 30 | time_axis_length = feature.size(0) 31 | freq_axis_length = feature.size(1) 32 | time_mask_para = time_axis_length / 20 # Refer to "Specaugment on large scale dataset" paper 33 | 34 | # time mask 35 | for _ in range(self.time_mask_num): 36 | t = int(np.random.uniform(low=0.0, high=time_mask_para)) 37 | t0 = random.randint(0, time_axis_length - t) 38 | feature[t0: t0 + t, :] = 0 39 | 40 | # freq mask 41 | for _ in range(self.freq_mask_num): 42 | f = int(np.random.uniform(low=0.0, high=self.freq_mask_para)) 43 | f0 = random.randint(0, freq_axis_length - f) 44 | feature[:, f0: f0 + f] = 0 45 | 46 | return feature 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | 5 | from pkg_resources import VersionConflict, require 6 | from setuptools import find_packages, setup 7 | 8 | with open("Readme.md") as f: 9 | long_description = f.read() 10 | 11 | with open("requirements.txt") as f: 12 | requirements = f.read().splitlines() 13 | 14 | try: 15 | require("setuptools>=38.3") 16 | except VersionConflict: 17 | print("Error: version of setuptools is too old (<38.3)!") 18 | sys.exit(1) 19 | 20 | 21 | ROOT_DIR = Path(__file__).parent.resolve() 22 | # Creating the version file 23 | 24 | with open("version.txt") as f: 25 | version = f.read() 26 | 27 | version = version.strip() 28 | sha = "Unknown" 29 | 30 | if os.getenv("BUILD_VERSION"): 31 | version = os.getenv("BUILD_VERSION") 32 | elif sha != "Unknown": 33 | version += "+" + sha[:7] 34 | print("-- Building version " + version) 35 | 36 | version_path = ROOT_DIR / "deepaudio" / "speaker" / "version.py" 37 | 38 | with open(version_path, "w") as f: 39 | f.write("__version__ = '{}'\n".format(version)) 40 | 41 | if __name__ == "__main__": 42 | setup( 43 | name="deepaudio.speaker", 44 | namespace_packages=["deepaudio"], 45 | version=version, 46 | packages=find_packages(), 47 | install_requires=requirements, 48 | description="Speaker embedding", 49 | long_description=long_description, 50 | long_description_content_type="text/markdown", 51 | author="Ruiqing Yin", 52 | # author_email="yinruiqing", 53 | url="https://github.com/deepaudio/deepaudio-speaker", 54 | classifiers=[ 55 | "Development Status :: 4 - Beta", 56 | "Intended Audience :: Science/Research", 57 | "License :: OSI Approved :: MIT License", 58 | "Natural Language :: English", 59 | "Programming Language :: Python :: 3.8", 60 | "Topic :: Scientific/Engineering", 61 | ], 62 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from torch.optim.lr_scheduler import _LRScheduler 24 | 25 | 26 | class LearningRateScheduler(_LRScheduler): 27 | r""" 28 | Provides inteface of learning rate scheduler. 29 | 30 | Note: 31 | Do not use this class directly, use one of the sub classes. 32 | """ 33 | def __init__(self, optimizer, init_lr): 34 | self.optimizer = optimizer 35 | self.init_lr = init_lr 36 | 37 | def step(self, *args, **kwargs): 38 | raise NotImplementedError 39 | 40 | @staticmethod 41 | def set_lr(optimizer, lr): 42 | for g in optimizer.param_groups: 43 | g['lr'] = lr 44 | 45 | def get_lr(self): 46 | for g in self.optimizer.param_groups: 47 | return g['lr'] 48 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | AUDIO_FEATURE_TRANSFORM_REGISTRY = dict() 5 | AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY = dict() 6 | 7 | 8 | def register_audio_feature_transform(name: str, dataclass=None): 9 | r""" 10 | New dataset types can be added to OpenSpeech with the :func:`register_dataset` function decorator. 11 | 12 | For example:: 13 | @register_audio_feature_transform("fbank", dataclass=FilterBankConfigs) 14 | class FilterBankFeatureTransform(object): 15 | (...) 16 | 17 | .. note:: All dataset must implement the :class:`cls.__name__` interface. 
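    Registered transforms are later instantiated by name from the run
    configuration, as done in ``SpeakerAudioDataset``::

        feature_extractor = AUDIO_FEATURE_TRANSFORM_REGISTRY[configs.feature.name](configs)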
18 | 19 | Args: 20 | name (str): the name of the dataset 21 | dataclass (Optional, str): the dataclass of the dataset (default: None) 22 | """ 23 | 24 | def register_audio_feature_transform_cls(cls): 25 | if name in AUDIO_FEATURE_TRANSFORM_REGISTRY: 26 | raise ValueError(f"Cannot register duplicate audio ({name})") 27 | 28 | AUDIO_FEATURE_TRANSFORM_REGISTRY[name] = cls 29 | 30 | if dataclass is not None: 31 | if name in AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY: 32 | raise ValueError(f"Cannot register duplicate dataclass ({name})") 33 | AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY[name] = dataclass 34 | 35 | return cls 36 | 37 | return register_audio_feature_transform_cls 38 | 39 | 40 | data_dir = os.path.dirname(__file__) 41 | for file in os.listdir(f"{data_dir}"): 42 | if os.path.isdir(f"{data_dir}/{file}") and not file.startswith('__'): 43 | path = f"{data_dir}/{file}" 44 | for module_file in os.listdir(path): 45 | path = os.path.join(path, module_file) 46 | if module_file.endswith(".py"): 47 | module_name = module_file[: module_file.find(".py")] if module_file.endswith(".py") else module_file 48 | module = importlib.import_module(f"deepaudio.speaker.data.feature.{file}.{module_name}") -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/noise.py: -------------------------------------------------------------------------------- 1 | from torch_audiomentations import AddBackgroundNoise, ApplyImpulseResponse, Compose 2 | 3 | from .utils import get_all_wavs 4 | 5 | 6 | class Noise: 7 | def __init__(self, configs): 8 | self.configs = configs 9 | background_paths = get_all_wavs(configs.augment.noise_dataset_dir) 10 | self.noise = AddBackgroundNoise(background_paths=background_paths, 11 | min_snr_in_db=configs.augment.min_snr_in_db, 12 | max_snr_in_db=configs.augment.max_snr_in_db, 13 | p=1) 14 | 15 | def __call__(self, waveform): 16 | waveform = waveform.unsqueeze(0) 17 | return self.noise(waveform, sample_rate=self.configs.feature.sample_rate).squeeze(0) 18 | 19 | 20 | class Reverb: 21 | def __init__(self, configs): 22 | self.configs = configs 23 | ir_paths = get_all_wavs(configs.augment.rir_dataset_dir) 24 | self.reverb = ApplyImpulseResponse(ir_paths=ir_paths, p=1) 25 | 26 | def __call__(self, waveform): 27 | waveform = waveform.unsqueeze(0) 28 | return self.reverb(waveform, sample_rate=self.configs.feature.sample_rate).squeeze(0) 29 | 30 | 31 | class NoiseReverb: 32 | def __init__(self, configs): 33 | self.configs = configs 34 | background_paths = get_all_wavs(configs.augment.noise_dataset_dir) 35 | ir_paths = get_all_wavs(configs.augment.rir_dataset_dir) 36 | self.noise = AddBackgroundNoise(background_paths=background_paths, 37 | min_snr_in_db=configs.augment.min_snr_in_db, 38 | max_snr_in_db=configs.augment.max_snr_in_db, 39 | p=1) 40 | self.reverb = ApplyImpulseResponse(ir_paths=ir_paths, p=1) 41 | self.compose = Compose([self.noise, self.reverb], p=1) 42 | 43 | def __call__(self, waveform): 44 | waveform = waveform.unsqueeze(0) 45 | return self.compose(waveform, sample_rate=self.configs.feature.sample_rate).squeeze(0) 46 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | CRITERION_REGISTRY = dict() 5 | CRITERION_DATACLASS_REGISTRY = dict() 6 | 7 | 8 | def register_criterion(name: str, dataclass=None): 9 | r""" 10 | New criterion 
types can be added to OpenSpeech with the :func:`register_criterion` function decorator. 11 | 12 | For example:: 13 | @register_criterion('label_smoothed_cross_entropy') 14 | class LabelSmoothedCrossEntropyLoss(nn.Module): 15 | (...) 16 | 17 | .. note:: All criterion must implement the :class:`cls.__name__` interface. 18 | 19 | Args: 20 | name (str): the name of the criterion 21 | dataclass (Optional, str): the dataclass of the criterion (default: None) 22 | """ 23 | 24 | def register_criterion_cls(cls): 25 | if name in CRITERION_REGISTRY: 26 | raise ValueError(f"Cannot register duplicate criterion ({name})") 27 | 28 | CRITERION_REGISTRY[name] = cls 29 | 30 | cls.__dataclass = dataclass 31 | if dataclass is not None: 32 | if name in CRITERION_DATACLASS_REGISTRY: 33 | raise ValueError(f"Cannot register duplicate criterion ({name})") 34 | CRITERION_DATACLASS_REGISTRY[name] = dataclass 35 | 36 | return cls 37 | 38 | return register_criterion_cls 39 | 40 | 41 | criterion_dir = os.path.dirname(__file__) 42 | for file in os.listdir(criterion_dir): 43 | if os.path.isdir(os.path.join(criterion_dir, file)) and not file.startswith('__'): 44 | for subfile in os.listdir(os.path.join(criterion_dir, file)): 45 | path = os.path.join(criterion_dir, file, subfile) 46 | if subfile.endswith(".py"): 47 | python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 48 | module = importlib.import_module(f"deepaudio.speaker.criterion.{file}.{python_file}") 49 | continue 50 | 51 | path = os.path.join(criterion_dir, file) 52 | if file.endswith(".py"): 53 | criterion_name = file[: file.find(".py")] if file.endswith(".py") else file 54 | module = importlib.import_module(f"deepaudio.speaker.criterion.{criterion_name}") 55 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_subcenter_aamsoftmax/subcenter_aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | from .. 
import register_criterion 9 | from ..subcenter_aamsoftmax.subcenter_aamsoftmax import SubcenterArcMarginProduct 10 | from .configuration import AdaptiveSubcenterAAMSoftmaxConfigs 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torch.nn import Parameter 16 | import math 17 | 18 | 19 | @register_criterion("adaptive_subcenter_aamsoftmax", dataclass=AdaptiveSubcenterAAMSoftmaxConfigs) 20 | class PyannoteAAMSoftmax(nn.Module): 21 | def __init__(self, 22 | configs: DictConfig, 23 | num_classes: int, 24 | embedding_size: int 25 | ) -> None: 26 | super(PyannoteAAMSoftmax, self).__init__() 27 | self.configs = configs 28 | self.classifier_ = SubcenterArcMarginProduct( 29 | in_features=self.configs.model.embed_dim, 30 | out_features=num_classes, 31 | K=configs.model.criterion.K, 32 | m=configs.criterion.margin, 33 | s=configs.criterion.scale 34 | ) 35 | self.loss_ = nn.CrossEntropyLoss() 36 | self.margin = configs.criterion.margin 37 | self.warmup_steps = configs.lr_scheduler.warmup_steps if configs.lr_scheduler.scheduler_name.startswith( 38 | 'warmup') else 0 39 | self.increase_steps = configs.criterion.increase_steps 40 | self.increase_rate = self.margin / (self.increase_steps - self.warmup_steps) 41 | 42 | def step(self, global_steps): 43 | if global_steps < self.warmup_steps: 44 | self.classifier_.margin = 0 45 | elif global_steps < self.increase_steps: 46 | self.classifier_.margin = (global_steps - self.warmup_steps) * self.increase_rate 47 | else: 48 | self.classifier_.margin = self.margin 49 | 50 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 51 | logits = self.classifier_(embeddings, target=targets) 52 | return self.loss_(logits, targets) 53 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | from .speaker_model import SpeakerModel 5 | 6 | 7 | MODEL_REGISTRY = dict() 8 | MODEL_DATACLASS_REGISTRY = dict() 9 | 10 | 11 | def register_model(name: str, dataclass=None): 12 | r""" 13 | New model types can be added to OpenSpeech with the :func:`register_model` function decorator. 14 | 15 | For example:: 16 | @register_model('conformer_lstm') 17 | class ConformerLSTMModel(OpenspeechModel): 18 | (...) 19 | 20 | .. note:: All models must implement the :class:`cls.__name__` interface. 
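    Registered classes can then be retrieved by name from ``MODEL_REGISTRY`` (illustrative
    sketch; assumes ``configs`` and ``num_classes`` are already available)::

        model_cls = MODEL_REGISTRY[configs.model.name]
        model = model_cls(configs=configs, num_classes=num_classes)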
21 | 22 | Args: 23 | name (str): the name of the model 24 | """ 25 | 26 | def register_model_cls(cls): 27 | if name in MODEL_REGISTRY: 28 | raise ValueError(f"Cannot register duplicate model ({name})") 29 | if not issubclass(cls, SpeakerModel): 30 | raise ValueError(f"Model ({name}: {cls.__name__}) must extend SpeakerModel") 31 | 32 | MODEL_REGISTRY[name] = cls 33 | 34 | cls.__dataclass = dataclass 35 | if dataclass is not None: 36 | if name in MODEL_DATACLASS_REGISTRY: 37 | raise ValueError(f"Cannot register duplicate model ({name})") 38 | MODEL_DATACLASS_REGISTRY[name] = dataclass 39 | 40 | return cls 41 | 42 | return register_model_cls 43 | 44 | 45 | # automatically import any Python files in the models/ directory 46 | models_dir = os.path.dirname(__file__) 47 | for file in os.listdir(models_dir): 48 | if os.path.isdir(os.path.join(models_dir, file)) and not file.startswith('__'): 49 | for subfile in os.listdir(os.path.join(models_dir, file)): 50 | path = os.path.join(models_dir, file, subfile) 51 | if subfile.endswith(".py"): 52 | python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 53 | module = importlib.import_module(f"deepaudio.speaker.models.{file}.{python_file}") 54 | continue 55 | 56 | path = os.path.join(models_dir, file) 57 | if file.endswith(".py"): 58 | model_name = file[: file.find(".py")] if file.endswith(".py") else file 59 | module = importlib.import_module(f"deepaudio.speaker.models.{model_name}") -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/step_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dataclasses import dataclass, field 3 | from typing import Optional 4 | from omegaconf import DictConfig 5 | from torch.optim import Optimizer 6 | 7 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 8 | from deepaudio.speaker.optim.scheduler import register_scheduler 9 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 10 | 11 | 12 | @dataclass 13 | class StepLRSchedulerConfigs(LearningRateSchedulerConfigs): 14 | scheduler_name: str = field( 15 | default="steplr", metadata={"help": "Name of learning rate scheduler."} 16 | ) 17 | peak_lr: float = field( 18 | default=1e-04, metadata={"help": "Maximum learning rate."} 19 | ) 20 | min_lr: float = field( 21 | default=1e-7, metadata={"help": "Min learning rate."} 22 | ) 23 | step_size: int = field( 24 | default=50, metadata={"help": "Step size to decay"} 25 | ) 26 | lr_factor: float = field( 27 | default=0.8, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 28 | ) 29 | 30 | 31 | @register_scheduler("steplr", dataclass=StepLRSchedulerConfigs) 32 | class StepLRScheduler(LearningRateScheduler): 33 | """ 34 | Step decay learning rate scheduler: multiplies the learning rate by `lr_factor` every `step_size` steps. 35 | 36 | Args: 37 | optimizer (Optimizer): wrapped optimizer. 38 | configs (DictConfig): configuration set.
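    Example::
        With the defaults above (peak_lr=1e-04, step_size=50, lr_factor=0.8), the learning
        rate is multiplied by 0.8 once every 50 scheduler steps: 1e-04 -> 8e-05 -> 6.4e-05 -> ...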
39 | """ 40 | def __init__( 41 | self, 42 | optimizer: Optimizer, 43 | configs: DictConfig, 44 | ) -> None: 45 | super(StepLRScheduler, self).__init__(optimizer, configs.lr_scheduler.peak_lr) 46 | self.update_steps = 1 47 | self.lr = configs.lr_scheduler.peak_lr 48 | self.step_size = configs.lr_scheduler.step_size 49 | self.min_lr = configs.lr_scheduler.min_lr 50 | self.lr_factor = configs.lr_scheduler.lr_factor 51 | 52 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 53 | if self.update_steps % self.step_size == 0: 54 | lr = self.lr * self.lr_factor 55 | self.set_lr(self.optimizer, lr) 56 | self.lr = lr 57 | self.update_steps += 1 58 | return self.lr 59 | -------------------------------------------------------------------------------- /deepaudio/speaker/dataclass/initialize.py: -------------------------------------------------------------------------------- 1 | from hydra.core.config_store import ConfigStore 2 | from deepaudio.speaker.data.augmentation.configurations import AugmentConfigs 3 | from deepaudio.speaker.datasets.voxceleb2.configurations import Voxceleb2Configs 4 | from deepaudio.speaker.datasets.dataframe.configurations import DataframeConfigs 5 | 6 | from .configurations import ( 7 | CPUTrainerConfigs, 8 | GPUTrainerConfigs, 9 | TPUTrainerConfigs, 10 | Fp16GPUTrainerConfigs, 11 | Fp16TPUTrainerConfigs, 12 | Fp64CPUTrainerConfigs, 13 | ) 14 | 15 | 16 | SPEAKER_TRAIN_CONFIGS = [ 17 | "feature", 18 | "augment", 19 | "dataset", 20 | "model", 21 | "criterion", 22 | "lr_scheduler", 23 | "trainer", 24 | ] 25 | 26 | 27 | DATASET_DATACLASS_REGISTRY = { 28 | "voxceleb2": Voxceleb2Configs, 29 | 'dataframe': DataframeConfigs, 30 | } 31 | TRAINER_DATACLASS_REGISTRY = { 32 | "cpu": CPUTrainerConfigs, 33 | "gpu": GPUTrainerConfigs, 34 | "tpu": TPUTrainerConfigs, 35 | "gpu-fp16": Fp16GPUTrainerConfigs, 36 | "tpu-fp16": Fp16TPUTrainerConfigs, 37 | "cpu-fp64": Fp64CPUTrainerConfigs, 38 | } 39 | AUGMENT_DATACLASS_REGISTRY = { 40 | "default": AugmentConfigs, 41 | } 42 | 43 | def hydra_train_init() -> None: 44 | r""" initialize ConfigStore for hydra-train """ 45 | from deepaudio.speaker.models import MODEL_DATACLASS_REGISTRY 46 | from deepaudio.speaker.criterion import CRITERION_DATACLASS_REGISTRY 47 | from deepaudio.speaker.data.feature import AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY 48 | from deepaudio.speaker.optim.scheduler import SCHEDULER_DATACLASS_REGISTRY 49 | 50 | registries = { 51 | "feature": AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY, 52 | "augment": AUGMENT_DATACLASS_REGISTRY, 53 | "dataset": DATASET_DATACLASS_REGISTRY, 54 | "trainer": TRAINER_DATACLASS_REGISTRY, 55 | "model": MODEL_DATACLASS_REGISTRY, 56 | "criterion": CRITERION_DATACLASS_REGISTRY, 57 | "lr_scheduler": SCHEDULER_DATACLASS_REGISTRY, 58 | } 59 | 60 | cs = ConfigStore.instance() 61 | 62 | for group in SPEAKER_TRAIN_CONFIGS: 63 | dataclass_registry = registries[group] 64 | 65 | for k, v in dataclass_registry.items(): 66 | cs.store(group=group, name=k, node=v, provider="deepaudio") 67 | 68 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including 
without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import os 24 | import importlib 25 | 26 | from deepaudio.speaker.optim.adamp import AdamP 27 | from deepaudio.speaker.optim.radam import RAdam 28 | from deepaudio.speaker.optim.novograd import Novograd 29 | 30 | # automatically import any Python files in the models/ directory 31 | scheduler_dir = os.path.dirname(__file__) 32 | for file in os.listdir(scheduler_dir): 33 | if os.path.isdir(os.path.join(scheduler_dir, file)) and file != '__pycache__': 34 | for subfile in os.listdir(os.path.join(scheduler_dir, file)): 35 | path = os.path.join(scheduler_dir, file, subfile) 36 | if subfile.endswith(".py"): 37 | scheduler_name = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 38 | module = importlib.import_module(f"deepaudio.speaker.optim.scheduler.{scheduler_name}") 39 | continue 40 | 41 | path = os.path.join(scheduler_dir, file) 42 | if file.endswith(".py"): 43 | scheduler_name = file[: file.find(".py")] if file.endswith(".py") else file 44 | module = importlib.import_module(f"deepaudio.speaker.optim.{scheduler_name}") 45 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
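# Scheduler registry: classes decorated with @register_scheduler(name, dataclass=...) below
# are collected into SCHEDULER_REGISTRY and SCHEDULER_DATACLASS_REGISTRY. The auto-import
# loop in optim/__init__.py imports every scheduler module, so the decorators run (and the
# registries are filled) as soon as deepaudio.speaker.optim is imported.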
22 | 23 | import os 24 | import importlib 25 | 26 | SCHEDULER_REGISTRY = {} 27 | SCHEDULER_DATACLASS_REGISTRY = {} 28 | 29 | 30 | def register_scheduler(name: str, dataclass=None): 31 | """ 32 | New scheduler types can be added to OpenSpeech with the :func:`register_scheduler` function decorator. 33 | 34 | For example:: 35 | @register_scheduler('reduce_lr_on_plateau') 36 | class ReduceLROnPlateau: 37 | (...) 38 | 39 | .. note:: All scheduler must implement the :class:`cls.__name__` interface. 40 | 41 | Args: 42 | name (str): the name of the scheduler 43 | """ 44 | 45 | def register_scheduler_cls(cls): 46 | if name in SCHEDULER_REGISTRY: 47 | raise ValueError(f"Cannot register duplicate scheduler ({name})") 48 | 49 | SCHEDULER_REGISTRY[name] = cls 50 | 51 | cls.__dataclass = dataclass 52 | if dataclass is not None: 53 | if name in SCHEDULER_DATACLASS_REGISTRY: 54 | raise ValueError(f"Cannot register duplicate scheduler ({name})") 55 | SCHEDULER_DATACLASS_REGISTRY[name] = dataclass 56 | 57 | return cls 58 | 59 | return register_scheduler_cls 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | outputs/ 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, _MISSING_TYPE, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class AugmentConfigs(DeepMMDataclass): 7 | apply_spec_augment: bool = field( 8 | default=False, metadata={"help": "Flag indication whether to apply spec augment or not"} 9 | ) 10 | apply_noise_augment: bool = field( 11 | default=False, metadata={"help": "Flag indication whether to apply noise augment or not " 12 | "Noise augment requires `noise_dataset_path`. " 13 | "`noise_dataset_dir` should be contain audio files."} 14 | ) 15 | apply_reverb_augment: bool = field( 16 | default=False, metadata={"help": "Flag indication whether to apply joining augment or not " 17 | "If true, create a new audio file by connecting two audio randomly"} 18 | ) 19 | apply_noise_reverb_augment: bool = field( 20 | default=False, metadata={"help": "Flag indication whether to apply spec augment or not"} 21 | ) 22 | min_snr_in_db: float = field( 23 | default=3.0, metadata={"help": "Flag indication whether to apply spec augment or not"} 24 | ) 25 | max_snr_in_db: float = field( 26 | default=30.0, metadata={"help": "Flag indication whether to apply spec augment or not"} 27 | ) 28 | freq_mask_para: int = field( 29 | default=27, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 30 | ) 31 | freq_mask_num: int = field( 32 | default=2, metadata={"help": "How many freq-masked area to make"} 33 | ) 34 | time_mask_num: int = field( 35 | default=4, metadata={"help": "How many time-masked area to make"} 36 | ) 37 | noise_dataset_dir: str = field( 38 | default='None', metadata={"help": "Noise Directory"} 39 | ) 40 | rir_dataset_dir: str = field( 41 | default='None', metadata={"help": "Rirs Directory"} 42 | ) 43 | noise_augment_weight: float = field( 44 | default=2.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 45 | ) 46 | reverb_augment_weight: float = field( 47 | default=1.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 48 | ) 49 | noise_reverb_augment_weight: float = field( 50 | default=2.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 51 | ) 52 | specaugment_weight: float = field( 53 | default=1.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 54 | ) 55 | 56 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/dataloader.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | import torch 5 | from torch.utils.data import DataLoader 6 | 7 | 8 | 
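# Collate strategy: each dataset item holds several (feature, label) pairs; the batch is
# flattened, a crop length is sampled uniformly from [min_num_frames, max_num_frames) and
# capped by the shortest feature in the batch, and every feature is randomly cropped to
# that length before being stacked into the 'X' tensor.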
def _collate_fn(batch, min_num_frames, max_num_frames): 9 | r""" 10 | Functions that pad to the maximum sequence length 11 | 12 | Args: 13 | batch (tuple): tuple contains input and target tensors 14 | 15 | Returns: 16 | inputs (torch.FloatTensor): tensor contains input tensor and target tensor. 17 | """ 18 | def get_min_num_frames(batch): 19 | return min([sample[0].size(0) for sample in batch]) 20 | def flatten(batch): 21 | batch_flatten = [] 22 | for items in batch: 23 | for X, y in zip(items[0], items[1]): 24 | batch_flatten.append((X,y)) 25 | return batch_flatten 26 | 27 | def get_subsample(feature, num_frames): 28 | length = feature.size(0) 29 | if length < num_frames: 30 | msg = 'Sample is too short' 31 | raise ValueError(msg) 32 | elif length == num_frames: 33 | return feature 34 | else: 35 | start = np.random.randint(0, length - num_frames) 36 | return feature[start:start + num_frames] 37 | batch = flatten(batch) 38 | min_num_frames_batch = get_min_num_frames(batch) 39 | num_frames = np.random.randint(min_num_frames, max_num_frames) 40 | num_frames = min(num_frames, min_num_frames_batch) 41 | 42 | X = [] 43 | y = [] 44 | for item in batch: 45 | feature = item[0] 46 | X.append(get_subsample(feature, num_frames).unsqueeze(0)) 47 | y.append(item[1]) 48 | return { 49 | 'X': torch.cat(X), 50 | 'y': torch.tensor(y, dtype=torch.int64) 51 | } 52 | 53 | 54 | class SpeakerUttDataLoader(DataLoader): 55 | r""" 56 | Text Data Loader 57 | 58 | Args: 59 | dataset (torch.utils.data.Dataset): dataset from which to load the data. 60 | num_workers (int): how many subprocesses to use for data loading. 61 | """ 62 | def __init__( 63 | self, 64 | dataset: torch.utils.data.Dataset, 65 | num_workers: int, 66 | min_num_frames: int, 67 | max_num_frames: int, 68 | batch_size: int, 69 | **kwargs, 70 | ) -> None: 71 | super(SpeakerUttDataLoader, self).__init__( 72 | dataset=dataset, 73 | num_workers=num_workers, 74 | batch_size=batch_size, 75 | **kwargs, 76 | ) 77 | self.min_num_frames = min_num_frames 78 | self.max_num_frames = max_num_frames 79 | self.collate_fn = partial(_collate_fn, 80 | min_num_frames=min_num_frames, 81 | max_num_frames=max_num_frames) 82 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/lit_data_module.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import random 4 | from omegaconf import DictConfig 5 | import pytorch_lightning as pl 6 | from torch.utils.data import DataLoader 7 | 8 | from deepaudio.speaker.data.dataset import SpeakerAudioDataset 9 | from deepaudio.speaker.data.dataloader import SpeakerUttDataLoader 10 | from deepaudio.speaker.data.samplers import ClovaaiSampler 11 | 12 | 13 | from .preprocess import get_speaker_list, get_speaker_wavs 14 | from .. 
import register_data_module 15 | 16 | 17 | @register_data_module('voxceleb2') 18 | class LightningVoxceleb2DataModule(pl.LightningDataModule): 19 | def __init__(self, configs: DictConfig): 20 | super(LightningVoxceleb2DataModule, self).__init__() 21 | self.configs = configs 22 | 23 | def prepare_data(self): 24 | speakers, spk2id = get_speaker_list(self.configs) 25 | speaker2wav = get_speaker_wavs(self.configs.dataset.dataset_path, speakers) 26 | self.num_classes = len(speakers) 27 | self.train_utts, self.valid_utts = self._split_train_valid(speaker2wav, spk2id) 28 | 29 | def setup(self, stage: Optional[str] = None) -> None: 30 | self.train_dataset = SpeakerAudioDataset(self.configs, self.train_utts) 31 | self.valid_dataset = SpeakerAudioDataset(self.configs, self.valid_utts) 32 | 33 | def train_dataloader(self) -> DataLoader: 34 | if self.configs.dataset.sampler == 'clovaai': 35 | sampler = ClovaaiSampler(self.train_dataset.labels, self.configs) 36 | shuffle = False 37 | else: 38 | sampler = None 39 | shuffle = True 40 | return SpeakerUttDataLoader( 41 | dataset=self.train_dataset, 42 | num_workers=self.configs.trainer.num_workers, 43 | min_num_frames=self.configs.model.min_num_frames, 44 | max_num_frames=self.configs.model.max_num_frames, 45 | batch_size=self.configs.trainer.batch_size, 46 | shuffle=shuffle, 47 | sampler=sampler 48 | ) 49 | 50 | def val_dataloader(self) -> DataLoader: 51 | return SpeakerUttDataLoader( 52 | dataset=self.valid_dataset, 53 | num_workers=self.configs.trainer.num_workers, 54 | min_num_frames=self.configs.model.min_num_frames, 55 | max_num_frames=self.configs.model.max_num_frames, 56 | batch_size=self.configs.trainer.batch_size 57 | ) 58 | 59 | def _spk2wav_utts(self, spk2wav, spk2id): 60 | utts = [] 61 | for spk in spk2wav: 62 | for wav in spk2wav[spk]: 63 | utts.append((str(wav), spk2id[spk], None)) 64 | random.shuffle(utts) 65 | return utts 66 | 67 | def _split_train_valid(self, speaker2wav, spk2id): 68 | valid_spk2wav = {} 69 | for spk in speaker2wav: 70 | random.shuffle(speaker2wav[spk]) 71 | valid_spk2wav[spk] = [speaker2wav[spk].pop(0)] 72 | train_utts = self._spk2wav_utts(speaker2wav, spk2id) 73 | valid_utts = self._spk2wav_utts(valid_spk2wav, spk2id) 74 | return train_utts, valid_utts 75 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/ResNetBlocks.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | 5 | class SEBasicBlock(nn.Module): 6 | expansion = 1 7 | 8 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 9 | super(SEBasicBlock, self).__init__() 10 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 11 | self.bn1 = nn.BatchNorm2d(planes) 12 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) 13 | self.bn2 = nn.BatchNorm2d(planes) 14 | self.relu = nn.ReLU(inplace=True) 15 | self.se = SELayer(planes, reduction) 16 | self.downsample = downsample 17 | self.stride = stride 18 | 19 | def forward(self, x): 20 | residual = x 21 | 22 | out = self.conv1(x) 23 | out = self.relu(out) 24 | out = self.bn1(out) 25 | 26 | out = self.conv2(out) 27 | out = self.bn2(out) 28 | out = self.se(out) 29 | 30 | if self.downsample is not None: 31 | residual = self.downsample(x) 32 | 33 | out += residual 34 | out = self.relu(out) 35 | return out 36 | 37 | 38 | class SEBottleneck(nn.Module): 39 | expansion = 4 40 | 41 | 
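    # Bottleneck variant: 1x1 reduce -> 3x3 (with stride) -> 1x1 expand to 4*planes, each
    # followed by BatchNorm, with a squeeze-and-excitation gate on the expanded output
    # before the residual addition.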
def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 42 | super(SEBottleneck, self).__init__() 43 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 44 | self.bn1 = nn.BatchNorm2d(planes) 45 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 46 | padding=1, bias=False) 47 | self.bn2 = nn.BatchNorm2d(planes) 48 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 49 | self.bn3 = nn.BatchNorm2d(planes * 4) 50 | self.relu = nn.ReLU(inplace=True) 51 | self.se = SELayer(planes * 4, reduction) 52 | self.downsample = downsample 53 | self.stride = stride 54 | 55 | def forward(self, x): 56 | residual = x 57 | 58 | out = self.conv1(x) 59 | out = self.bn1(out) 60 | out = self.relu(out) 61 | 62 | out = self.conv2(out) 63 | out = self.bn2(out) 64 | out = self.relu(out) 65 | 66 | out = self.conv3(out) 67 | out = self.bn3(out) 68 | out = self.se(out) 69 | 70 | if self.downsample is not None: 71 | residual = self.downsample(x) 72 | 73 | out += residual 74 | out = self.relu(out) 75 | 76 | return out 77 | 78 | 79 | class SELayer(nn.Module): 80 | def __init__(self, channel, reduction=8): 81 | super(SELayer, self).__init__() 82 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 83 | self.fc = nn.Sequential( 84 | nn.Linear(channel, channel // reduction), 85 | nn.ReLU(inplace=True), 86 | nn.Linear(channel // reduction, channel), 87 | nn.Sigmoid() 88 | ) 89 | 90 | def forward(self, x): 91 | b, c, _, _ = x.size() 92 | y = self.avg_pool(x).view(b, c) 93 | y = self.fc(y).view(b, c, 1, 1) 94 | return x * y -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = deepaudio-speaker 7 | description = Speaker embedding with neural networks 8 | author = Ruiqing Yin 9 | ;author-email = 10 | ;license = mit 11 | long-description = file: README.md 12 | long-description-content-type = text/markdown; charset=UTF-8; variant=GFM 13 | # Change if running only on Windows, Mac or Linux (comma-separated) 14 | platforms = Linux, Mac 15 | # Add here all kinds of additional classifiers as defined under 16 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 17 | classifiers = 18 | Development Status :: 4 - Beta 19 | Programming Language :: Python 20 | 21 | [options] 22 | zip_safe = False 23 | packages = find: 24 | include_package_data = True 25 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 26 | setup_requires = pyscaffold>=3.2a0,<3.3a0 27 | # Add here dependencies of your project (semicolon/line-separated), e.g. 28 | # install_requires = numpy; scipy 29 | # Require a specific Python version, e.g. Python 2.7 or >= 3.4 30 | python_requires = >=3.7 31 | 32 | [options.packages.find] 33 | where = . 
34 | exclude = 35 | tests 36 | 37 | [options.extras_require] 38 | # Add here additional requirements for extra features, to install with: 39 | # PDF = ReportLab; RXP 40 | # Add here test requirements (semicolon/line-separated) 41 | testing = 42 | pytest>=6.0 43 | pytest-cov>=2.10 44 | jupyter 45 | papermill 46 | dev = 47 | pre_commit>=2.7 48 | recommonmark>=0.6 49 | black>=19.10b0 50 | 51 | [options.entry_points] 52 | 53 | console_scripts = 54 | deepaudio-speaker-train=deepaudio.speaker.cli.train:main 55 | 56 | 57 | [test] 58 | # py.test options when running `python setup.py test` 59 | # addopts = --verbose 60 | extras = True 61 | 62 | [tool:pytest] 63 | # Options for py.test: 64 | # Specify command line options as you would do when invoking py.test directly. 65 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 66 | # in order to write a coverage file that can be read by Jenkins. 67 | addopts = 68 | --cov deepaudio --cov-report term-missing 69 | --verbose 70 | norecursedirs = 71 | dist 72 | build 73 | .tox 74 | testpaths = tests 75 | 76 | [aliases] 77 | dists = bdist_wheel 78 | 79 | [bdist_wheel] 80 | # Use this option if your package is pure-python 81 | universal = 1 82 | 83 | [build_sphinx] 84 | source_dir = doc 85 | build_dir = build/sphinx 86 | 87 | [devpi:upload] 88 | # Options for the devpi: PyPI server and packaging tool 89 | # VCS export must be deactivated since we are using setuptools-scm 90 | no-vcs = 1 91 | formats = bdist_wheel 92 | 93 | [flake8] 94 | # Some sane defaults for the code style checker flake8 95 | exclude = 96 | .tox 97 | build 98 | dist 99 | .eggs 100 | docs/conf.py 101 | 102 | [pyscaffold] 103 | # PyScaffold's parameters when the project was created. 104 | # This will be used when updating. Do not change! 
105 | version = 3.2.3 106 | package = deepaudio-speaker 107 | extensions = 108 | markdown 109 | no_skeleton 110 | pre_commit 111 | dsproject 112 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/samplers.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | from omegaconf import DictConfig 3 | 4 | import numpy as np 5 | from collections import defaultdict 6 | import torch 7 | import torch.distributed as dist 8 | 9 | 10 | def round_down(num, divisor): 11 | return num - (num % divisor) 12 | 13 | 14 | def worker_init_fn(worker_id): 15 | np.random.seed(np.random.get_state()[1][0] + worker_id) 16 | 17 | 18 | class ClovaaiSampler(torch.utils.data.Sampler): 19 | def __init__(self, labels, configs): 20 | 21 | self.data_label = labels 22 | self.nPerSpeaker = configs.dataset.per_speaker 23 | self.batch_size = configs.trainer.batch_size 24 | self.epoch = 0 25 | self.seed = 42 26 | self.distributed = False 27 | if configs.trainer.accelerator == 'ddp': 28 | self.distributed = True 29 | self.__iter__() 30 | 31 | def __iter__(self): 32 | 33 | g = torch.Generator() 34 | g.manual_seed(self.seed + self.epoch) 35 | indices = torch.randperm(len(self.data_label), generator=g).tolist() 36 | 37 | data_dict = defaultdict(list) 38 | 39 | # Sort into dictionary of file indices for each ID 40 | for index in indices: 41 | speaker_label = self.data_label[index] 42 | data_dict[speaker_label].append(index) 43 | 44 | dictkeys = list(data_dict.keys()) 45 | dictkeys.sort() 46 | 47 | lol = lambda lst, sz: [lst[i:i + sz] for i in range(0, len(lst), sz)] 48 | 49 | flattened_list = [] 50 | flattened_label = [] 51 | 52 | for findex, key in enumerate(dictkeys): 53 | data = data_dict[key] 54 | numSeg = round_down(len(data), self.nPerSpeaker) 55 | 56 | rp = lol(np.arange(numSeg), self.nPerSpeaker) 57 | flattened_label.extend([findex] * (len(rp))) 58 | for indices in rp: 59 | flattened_list.append([data[i] for i in indices]) 60 | 61 | ## Mix data in random order 62 | mixid = torch.randperm(len(flattened_label), generator=g).tolist() 63 | mixlabel = [] 64 | mixmap = [] 65 | 66 | ## Prevent two pairs of the same speaker in the same batch 67 | for ii in mixid: 68 | startbatch = round_down(len(mixlabel), self.batch_size) 69 | if flattened_label[ii] not in mixlabel[startbatch:]: 70 | mixlabel.append(flattened_label[ii]) 71 | mixmap.append(ii) 72 | 73 | mixed_list = [flattened_list[i] for i in mixmap] 74 | 75 | ## Divide data to each GPU 76 | if self.distributed: 77 | total_size = round_down(len(mixed_list), self.batch_size * dist.get_world_size()) 78 | start_index = int((dist.get_rank()) / dist.get_world_size() * total_size) 79 | end_index = int((dist.get_rank() + 1) / dist.get_world_size() * total_size) 80 | self.num_samples = end_index - start_index 81 | return iter(mixed_list[start_index:end_index]) 82 | else: 83 | total_size = round_down(len(mixed_list), self.batch_size) 84 | self.num_samples = total_size 85 | return iter(mixed_list[:total_size]) 86 | 87 | def __len__(self) -> int: 88 | return self.num_samples 89 | 90 | def set_epoch(self, epoch: int) -> None: 91 | self.epoch = epoch 92 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/speaker_embedding_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from pytorch_lightning.utilities.cloud_io import load 
as pl_load 5 | 6 | from .speaker_model import SpeakerModel 7 | from . import MODEL_REGISTRY 8 | 9 | 10 | class SpeakerEmbeddingModel(SpeakerModel): 11 | def __init__(self, configs, num_classes): 12 | super(SpeakerEmbeddingModel, self).__init__(configs, num_classes) 13 | 14 | def forward(self, inputs: torch.FloatTensor) -> Tensor: 15 | return self.model(inputs) 16 | 17 | def training_step(self, batch: tuple, batch_idx: int): 18 | if self.configs.criterion.name in ['adaptive_aamsoftmax', 19 | 'adaptive_subcenter_aamsoftmax'] and self.global_step == 0: 20 | self.log( 21 | "val_loss", 22 | 15, 23 | on_step=True, 24 | on_epoch=False, 25 | prog_bar=True, 26 | logger=True, 27 | ) 28 | if self.configs.criterion.name in ['adaptive_aamsoftmax', 'adaptive_subcenter_aamsoftmax']: 29 | self.criterion.step(self.global_step) 30 | self.log( 31 | "margin", 32 | self.criterion.classifier_.margin, 33 | on_step=True, 34 | on_epoch=False, 35 | prog_bar=True, 36 | logger=True, 37 | ) 38 | X = batch['X'] 39 | y = batch['y'] 40 | embeddings = self.forward(X) 41 | loss = self.criterion(embeddings, y) 42 | return { 43 | 'loss': loss 44 | } 45 | 46 | def validation_step(self, batch: tuple, batch_idx: int): 47 | X = batch['X'] 48 | y = batch['y'] 49 | embeddings = self.forward(X) 50 | loss = self.criterion(embeddings, y) 51 | self.log( 52 | "val_loss", 53 | loss, 54 | on_step=False, 55 | on_epoch=True, 56 | prog_bar=True, 57 | logger=True, 58 | ) 59 | return { 60 | 'val_loss': loss 61 | } 62 | 63 | def on_save_checkpoint(self, checkpoint): 64 | checkpoint["configs"] = self.configs 65 | checkpoint["num_classes"] = self.num_classes 66 | 67 | @classmethod 68 | def from_pretrained(cls, path_for_pl, 69 | map_location=None, 70 | strict=False, configs=None): 71 | loaded_checkpoint = pl_load(path_for_pl, map_location=map_location) 72 | model_name: str = loaded_checkpoint["configs"].model.name 73 | num_classes = loaded_checkpoint["num_classes"] 74 | if configs is not None: 75 | new_configs = configs 76 | else: 77 | new_configs = loaded_checkpoint["configs"] 78 | Klass = MODEL_REGISTRY[model_name] 79 | return Klass.load_from_checkpoint( 80 | path_for_pl, 81 | map_location=map_location, 82 | strict=strict, 83 | configs=new_configs, 84 | num_classes=num_classes 85 | ) 86 | 87 | def to_torchscript(self, filepath): 88 | script = torch.jit.script(self.model) 89 | torch.jit.save(script, filepath) 90 | 91 | def make_embedding(self, feature): 92 | if self.model.training: 93 | self.model = self.model.eval() 94 | return self.model(feature).cpu().detach().numpy() 95 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/pyannote_aamsoftmax/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | 9 | from .. 
import register_criterion 10 | from .configuration import PyannoteAAMSoftmaxConfigs 11 | 12 | 13 | class ArcLinear(nn.Module): 14 | """Additive Angular Margin classification module 15 | Parameters 16 | ---------- 17 | nfeat : int 18 | Embedding dimension 19 | nclass : int 20 | Number of classes 21 | margin : float 22 | Angular margin to penalize distances between embeddings and centers 23 | scale : float 24 | Scaling factor for the logits 25 | """ 26 | 27 | def __init__(self, nfeat, nclass, margin, scale): 28 | super(ArcLinear, self).__init__() 29 | eps = 1e-4 30 | self.min_cos = eps - 1 31 | self.max_cos = 1 - eps 32 | self.nclass = nclass 33 | self.margin = margin 34 | self.scale = scale 35 | self.W = nn.Parameter(Tensor(nclass, nfeat)) 36 | nn.init.xavier_uniform_(self.W) 37 | 38 | def forward(self, x, target=None): 39 | """Apply the angular margin transformation 40 | Parameters 41 | ---------- 42 | x : `torch.Tensor` 43 | an embedding batch 44 | target : `torch.Tensor` 45 | a non one-hot label batch 46 | Returns 47 | ------- 48 | fX : `torch.Tensor` 49 | logits after the angular margin transformation 50 | """ 51 | # normalize the feature vectors and W 52 | xnorm = F.normalize(x) 53 | Wnorm = F.normalize(self.W) 54 | target = target.long().view(-1, 1) 55 | # calculate cosθj (the logits) 56 | cos_theta_j = torch.matmul(xnorm, torch.transpose(Wnorm, 0, 1)) 57 | # get the cosθ corresponding to the classes 58 | cos_theta_yi = cos_theta_j.gather(1, target) 59 | # for numerical stability 60 | cos_theta_yi = cos_theta_yi.clamp(min=self.min_cos, max=self.max_cos) 61 | # get the angle separating xi and Wyi 62 | theta_yi = torch.acos(cos_theta_yi) 63 | # apply the margin to the angle 64 | cos_theta_yi_margin = torch.cos(theta_yi + self.margin) 65 | # one hot encode y 66 | one_hot = torch.zeros_like(cos_theta_j) 67 | one_hot.scatter_(1, target, 1.0) 68 | # project margin differences into cosθj 69 | return self.scale * (cos_theta_j + one_hot * (cos_theta_yi_margin - cos_theta_yi)) 70 | 71 | @register_criterion("pyannote_aamsoftmax", dataclass=PyannoteAAMSoftmaxConfigs) 72 | class PyannoteAAMSoftmax(nn.Module): 73 | def __init__(self, 74 | configs: DictConfig, 75 | num_classes: int, 76 | embedding_size: int 77 | ) -> None: 78 | super(PyannoteAAMSoftmax, self).__init__() 79 | self.configs=configs 80 | self.classifier_ = ArcLinear( 81 | nfeat=self.configs.model.embed_dim, 82 | nclass=num_classes, 83 | margin=configs.criterion.margin, 84 | scale=configs.criterion.scale 85 | ) 86 | self.logsoftmax_ = nn.LogSoftmax(dim=1) 87 | self.loss_ = nn.NLLLoss() 88 | 89 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 90 | logits = self.logsoftmax_(self.classifier_(embeddings, target=targets)) 91 | return self.loss_(logits, targets) 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/lit_data_module.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import random 4 | from omegaconf import DictConfig 5 | import pytorch_lightning as pl 6 | from torch.utils.data import DataLoader 7 | 8 | from deepaudio.speaker.data.dataset import SpeakerAudioDataset 9 | from deepaudio.speaker.data.dataloader import SpeakerUttDataLoader 10 | from deepaudio.speaker.data.samplers import ClovaaiSampler 11 | 12 | from .utils import get_dataset_items, SpeakerDataframe, split_segment 13 | from .. 
import register_data_module 14 | 15 | 16 | @register_data_module('dataframe') 17 | class LightningDataframeDataModule(pl.LightningDataModule): 18 | def __init__(self, configs: DictConfig): 19 | super(LightningDataframeDataModule, self).__init__() 20 | self.configs = configs 21 | 22 | def prepare_data(self): 23 | dataset_items = get_dataset_items(self.configs.dataset.database_yml, 24 | self.configs.dataset.dataset_name) 25 | dataset = SpeakerDataframe(dataset_items) 26 | speaker2items = dataset.speaker2items 27 | spk2ids = dataset.spk2ids 28 | self.num_classes = len(spk2ids) 29 | self.train_utts, self.valid_utts = self._split_train_valid(speaker2items, spk2ids) 30 | 31 | def setup(self, stage: Optional[str] = None) -> None: 32 | self.train_dataset = SpeakerAudioDataset(self.configs, self.train_utts) 33 | self.valid_dataset = SpeakerAudioDataset(self.configs, self.valid_utts) 34 | 35 | def train_dataloader(self) -> DataLoader: 36 | if self.configs.dataset.sampler == 'clovaai': 37 | sampler = ClovaaiSampler(self.train_dataset.labels) 38 | else: 39 | sampler = None 40 | return SpeakerUttDataLoader( 41 | dataset=self.train_dataset, 42 | num_workers=self.configs.trainer.num_workers, 43 | min_num_frames=self.configs.model.min_num_frames, 44 | max_num_frames=self.configs.model.max_num_frames, 45 | batch_size=self.configs.trainer.batch_size, 46 | shuffle=True, 47 | sampler=sampler 48 | ) 49 | 50 | def val_dataloader(self) -> DataLoader: 51 | return SpeakerUttDataLoader( 52 | dataset=self.valid_dataset, 53 | num_workers=self.configs.trainer.num_workers, 54 | min_num_frames=self.configs.model.min_num_frames, 55 | max_num_frames=self.configs.model.max_num_frames, 56 | batch_size=self.configs.trainer.batch_size 57 | ) 58 | 59 | def _spk2wav_utts(self, speaker2items, spk2ids): 60 | utts = [] 61 | for spk in speaker2items: 62 | for item in speaker2items[spk]: 63 | wav, spk, seg = item 64 | utts.append((str(wav), spk2ids[spk], seg)) 65 | if self.configs.dataset.exhaustive: 66 | for subseg in split_segment(seg, 67 | self.configs.dataset.duration, 68 | self.configs.dataset.step): 69 | utts.append((str(wav), spk2ids[spk], subseg)) 70 | random.shuffle(utts) 71 | return utts 72 | 73 | def _split_train_valid(self, speaker2items, spk2ids): 74 | valid_spk2item = {} 75 | for spk in speaker2items: 76 | random.shuffle(speaker2items[spk]) 77 | valid_spk2item[spk] = [speaker2items[spk].pop(0)] 78 | train_utts = self._spk2wav_utts(speaker2items, spk2ids) 79 | valid_utts = self._spk2wav_utts(valid_spk2item, spk2ids) 80 | return train_utts, valid_utts 81 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/subcenter_aamsoftmax/subcenter_aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | from .. 
import register_criterion 9 | from .configuration import SubcenterAAMSoftmaxConfigs 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch.nn import Parameter 15 | import math 16 | 17 | 18 | class SubcenterArcMarginProduct(nn.Module): 19 | r"""Modified implementation from https://github.com/ronghuaiyang/arcface-pytorch/blob/47ace80b128042cd8d2efd408f55c5a3e156b032/models/metrics.py#L10 20 | """ 21 | 22 | def __init__(self, in_features, out_features, K=3, s=30.0, m=0.50, easy_margin=False): 23 | super(SubcenterArcMarginProduct, self).__init__() 24 | self.in_features = in_features 25 | self.out_features = out_features 26 | self.scale = s 27 | self.margin = m 28 | self.K = K 29 | self.weight = Parameter(torch.FloatTensor(out_features * self.K, in_features)) 30 | nn.init.xavier_uniform_(self.weight) 31 | 32 | self.easy_margin = easy_margin 33 | 34 | 35 | def forward(self, input, target): 36 | self.cos_m = math.cos(self.margin) 37 | self.sin_m = math.sin(self.margin) 38 | self.th = math.cos(math.pi - self.margin) 39 | self.mm = math.sin(math.pi - self.margin) * self.margin 40 | # --------------------------- cos(theta) & phi(theta) --------------------------- 41 | cosine = F.linear(F.normalize(input), F.normalize(self.weight)) 42 | 43 | if self.K > 1: 44 | cosine = torch.reshape(cosine, (-1, self.out_features, self.K)) 45 | cosine, _ = torch.max(cosine, axis=2) 46 | 47 | sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1)) 48 | # cos(phi+m) 49 | phi = cosine * self.cos_m - sine * self.sin_m 50 | 51 | if self.easy_margin: 52 | phi = torch.where(cosine > 0, phi, cosine) 53 | else: 54 | phi = torch.where(cosine > self.th, phi, cosine - self.mm) 55 | 56 | # --------------------------- convert label to one-hot --------------------------- 57 | # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda') 58 | one_hot = torch.zeros(cosine.size(), device=input.device) 59 | one_hot.scatter_(1, target.view(-1, 1).long(), 1) 60 | # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------- 61 | output = (one_hot * phi) + ( 62 | (1.0 - one_hot) * cosine) # you can use torch.where if your torch.__version__ is 0.4 63 | output *= self.scale 64 | 65 | return output 66 | 67 | 68 | @register_criterion("subcenter_aamsoftmax", dataclass=SubcenterAAMSoftmaxConfigs) 69 | class PyannoteAAMSoftmax(nn.Module): 70 | def __init__(self, 71 | configs: DictConfig, 72 | num_classes: int, 73 | embedding_size: int 74 | ) -> None: 75 | super(PyannoteAAMSoftmax, self).__init__() 76 | self.configs = configs 77 | self.classifier_ = SubcenterArcMarginProduct( 78 | in_features=self.configs.model.embed_dim, 79 | out_features=num_classes, 80 | K=configs.model.criterion.K, 81 | m=configs.criterion.margin, 82 | s=configs.criterion.scale 83 | ) 84 | self.loss_ = nn.CrossEntropyLoss() 85 | 86 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 87 | logits = self.classifier_(embeddings, target=targets) 88 | return self.loss_(logits, targets) 89 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software
without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import torch 24 | from dataclasses import dataclass, field 25 | from typing import Optional 26 | from omegaconf import DictConfig 27 | from torch.optim import Optimizer 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | 33 | 34 | @dataclass 35 | class WarmupLRSchedulerConfigs(LearningRateSchedulerConfigs): 36 | scheduler_name: str = field( 37 | default="warmup", metadata={"help": "Name of learning rate scheduler."} 38 | ) 39 | peak_lr: float = field( 40 | default=1e-04, metadata={"help": "Maximum learning rate."} 41 | ) 42 | init_lr: float = field( 43 | default=1e-7, metadata={"help": "Initial learning rate."} 44 | ) 45 | warmup_steps: int = field( 46 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 47 | ) 48 | total_steps: int = field( 49 | default=200000, metadata={"help": "Total training steps."} 50 | ) 51 | 52 | 53 | @register_scheduler("warmup", dataclass=WarmupLRSchedulerConfigs) 54 | class WarmupLRScheduler(LearningRateScheduler): 55 | """ 56 | Warmup learning rate until `total_steps` 57 | 58 | Args: 59 | optimizer (Optimizer): wrapped optimizer. 60 | configs (DictConfig): configuration set. 
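    Example::
        With the defaults above (init_lr=1e-07, peak_lr=1e-04, warmup_steps=4000), the
        learning rate grows linearly from 1e-07 towards 1e-04 over the first 4000
        scheduler steps and then holds the last value reached.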
61 | """ 62 | def __init__( 63 | self, 64 | optimizer: Optimizer, 65 | configs: DictConfig, 66 | ) -> None: 67 | super(WarmupLRScheduler, self).__init__(optimizer, configs.lr_scheduler.init_lr) 68 | if configs.lr_scheduler.warmup_steps != 0: 69 | warmup_rate = configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr 70 | self.warmup_rate = warmup_rate / configs.lr_scheduler.warmup_steps 71 | else: 72 | self.warmup_rate = 0 73 | self.update_steps = 1 74 | self.lr = configs.lr_scheduler.init_lr 75 | self.warmup_steps = configs.lr_scheduler.warmup_steps 76 | 77 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 78 | if self.update_steps < self.warmup_steps: 79 | lr = self.init_lr + self.warmup_rate * self.update_steps 80 | self.set_lr(self.optimizer, lr) 81 | self.lr = lr 82 | self.update_steps += 1 83 | return self.lr 84 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/optimizer.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import torch 24 | 25 | from deepaudio.speaker.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler 26 | from deepaudio.speaker.optim.scheduler.warmup_reduce_lr_on_plateau_scheduler import WarmupReduceLROnPlateauScheduler 27 | 28 | 29 | class Optimizer(object): 30 | """ 31 | This is wrapper classs of torch.optim.Optimizer. 32 | This class provides functionalities for learning rate scheduling and gradient norm clipping. 33 | 34 | Args: 35 | optim (torch.optim.Optimizer): optimizer object, the parameters to be optimized 36 | should be given when instantiating the object, e.g. 
torch.optim.Adam, torch.optim.SGD 37 | scheduler (openspeech.optim.scheduler, optional): learning rate scheduler 38 | scheduler_period (int, optional): timestep with learning rate scheduler 39 | max_grad_norm (int, optional): value used for gradient norm clipping 40 | """ 41 | def __init__(self, optim, scheduler=None, scheduler_period=None, max_grad_norm=0): 42 | self.optimizer = optim 43 | self.scheduler = scheduler 44 | self.scheduler_period = scheduler_period 45 | self.max_grad_norm = max_grad_norm 46 | self.count = 0 47 | 48 | def step(self, model): 49 | if self.max_grad_norm > 0: 50 | torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) 51 | self.optimizer.step() 52 | 53 | if self.scheduler is not None: 54 | self.update() 55 | self.count += 1 56 | 57 | if self.scheduler_period == self.count: 58 | self.scheduler = None 59 | self.scheduler_period = 0 60 | self.count = 0 61 | 62 | def set_scheduler(self, scheduler, scheduler_period): 63 | self.scheduler = scheduler 64 | self.scheduler_period = scheduler_period 65 | self.count = 0 66 | 67 | def update(self, val_loss=None): 68 | if isinstance(self.scheduler, ReduceLROnPlateauScheduler) \ 69 | or isinstance(self.scheduler, WarmupReduceLROnPlateauScheduler): 70 | self.scheduler.step(val_loss) 71 | else: 72 | self.scheduler.step() 73 | 74 | def zero_grad(self): 75 | self.optimizer.zero_grad() 76 | 77 | def get_lr(self): 78 | for g in self.optimizer.param_groups: 79 | return g['lr'] 80 | 81 | def set_lr(self, lr): 82 | for g in self.optimizer.param_groups: 83 | g['lr'] = lr 84 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | from omegaconf import DictConfig 3 | 4 | import numpy as np 5 | 6 | from torch import Tensor 7 | from torch.utils.data import Dataset 8 | 9 | from pyannote.core import Segment, Timeline 10 | 11 | from deepaudio.speaker.data.audio_io.with_torchaudio import Audio 12 | from deepaudio.speaker.data.augmentation.noise import Noise, NoiseReverb, Reverb 13 | from deepaudio.speaker.data.augmentation.spec_augment import SpecAugment 14 | from deepaudio.speaker.data.feature import AUDIO_FEATURE_TRANSFORM_REGISTRY 15 | 16 | 17 | class SpeakerAudioDataset(Dataset): 18 | NONE_AUGMENT = 0 19 | NOISE_AUGMENT = 1 20 | REVERB_AUGMENT = 2 21 | NOISE_REVERB_AUGMENT = 3 22 | SPEC_AUGMENT = 4 23 | 24 | def __init__( 25 | self, 26 | configs: DictConfig, 27 | utts: List, 28 | ) -> None: 29 | super(SpeakerAudioDataset, self).__init__() 30 | self.configs = configs 31 | self.utts = utts 32 | self.labels = [utt[1] for utt in utts] 33 | self.audio = Audio() 34 | self.feature_extractor = AUDIO_FEATURE_TRANSFORM_REGISTRY[configs.feature.name](configs) 35 | self.augmentations = [self.NONE_AUGMENT] 36 | weights = [1] 37 | if self.configs.augment.apply_noise_augment: 38 | self._noise_augmentor = Noise(configs) 39 | self.augmentations.append(self.NOISE_AUGMENT) 40 | weights.append(self.configs.augment.noise_augment_weight) 41 | 42 | if self.configs.augment.apply_reverb_augment: 43 | self._reverb_augmentor = Reverb(configs) 44 | self.augmentations.append(self.REVERB_AUGMENT) 45 | weights.append(self.configs.augment.reverb_augment_weight) 46 | if self.configs.augment.apply_noise_reverb_augment: 47 | self._noise_reverb_augmentor = NoiseReverb(configs) 48 | self.augmentations.append(self.NOISE_REVERB_AUGMENT) 49 | 
weights.append(self.configs.augment.noise_reverb_augment_weight) 50 | if self.configs.augment.apply_spec_augment: 51 | self._spec_augmentor = SpecAugment(configs) 52 | self.augmentations.append(self.SPEC_AUGMENT) 53 | weights.append(self.configs.augment.specaugment_weight) 54 | self.augmentations_prob = [float(i) / sum(weights) for i in weights] 55 | 56 | def _parse_audio(self, audio_path: str, augment: int = None, vad: Union[Segment, Timeline] = None) -> Tensor: 57 | if vad is not None: 58 | waveform, _ = self.audio.crop(audio_path, vad) 59 | else: 60 | waveform, _ = self.audio(audio_path) 61 | if augment == self.NOISE_AUGMENT: 62 | waveform = self._noise_augmentor(waveform) 63 | if augment == self.REVERB_AUGMENT: 64 | waveform = self._reverb_augmentor(waveform) 65 | if augment == self.NOISE_REVERB_AUGMENT: 66 | waveform = self._noise_reverb_augmentor(waveform) 67 | feature = self.feature_extractor(waveform) 68 | if augment == self.SPEC_AUGMENT: 69 | feature = self._spec_augmentor(feature) 70 | return feature.squeeze(0) 71 | 72 | def __getitem__(self, idxs): 73 | if isinstance(idxs, int): 74 | idxs = [idxs] 75 | features = [] 76 | speaker_ids = [] 77 | for idx in idxs: 78 | wav, speaker_id, vad = self.utts[idx] 79 | augment = np.random.choice(self.augmentations, p=self.augmentations_prob) 80 | feature = self._parse_audio(wav, augment, vad) 81 | features.append(feature) 82 | speaker_ids.append(speaker_id) 83 | return features, speaker_ids 84 | 85 | def __len__(self): 86 | return len(self.utts) 87 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/reduce_lr_on_plateau_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
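# Note (illustrative worked example for the scheduler below, with assumed values
# lr=1e-3, lr_factor=0.9, lr_patience=1, tolr=0.01): an epoch whose validation
# loss improves by at least tolr resets the counter and leaves the rate at 1e-3,
# while an epoch that fails to improve by at least tolr increments the counter;
# once the counter reaches lr_patience the rate is multiplied by lr_factor,
# i.e. 1e-3 * 0.9 = 9e-4.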
22 | 23 | from dataclasses import dataclass, field 24 | from omegaconf import DictConfig 25 | from torch.optim import Optimizer 26 | from torch.optim.lr_scheduler import ReduceLROnPlateau 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | 33 | 34 | @dataclass 35 | class ReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): 36 | scheduler_name: str = field( 37 | default="reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} 38 | ) 39 | lr_patience: int = field( 40 | default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} 41 | ) 42 | lr_factor: float = field( 43 | default=0.9, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 44 | ) 45 | tolr: float = field( 46 | default=0.01, metadata={"help": "Tolr for loss."} 47 | ) 48 | 49 | 50 | @register_scheduler("reduce_lr_on_plateau", dataclass=ReduceLROnPlateauConfigs) 51 | class ReduceLROnPlateauScheduler(LearningRateScheduler, ReduceLROnPlateau): 52 | r""" 53 | Reduce learning rate when a metric has stopped improving. Models often benefit from reducing the learning rate by 54 | a factor of 2-10 once learning stagnates. This scheduler reads a metrics quantity and if no improvement is seen 55 | for a ‘patience’ number of epochs, the learning rate is reduced. 56 | 57 | Args: 58 | optimizer (Optimizer): wrapped optimizer. 59 | configs (DictConfig): configuration set. 60 | """ 61 | def __init__( 62 | self, 63 | optimizer: Optimizer, 64 | configs: DictConfig, 65 | ) -> None: 66 | super(ReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 67 | self.lr = configs.lr_scheduler.lr 68 | self.lr_patience = configs.lr_scheduler.lr_patience 69 | self.lr_factor = configs.lr_scheduler.lr_factor 70 | self.tolr = configs.lr_scheduler.tolr 71 | self.val_loss = 100.0 72 | self.count = 0 73 | 74 | def step(self, val_loss: Optional[float] = None): 75 | if val_loss is not None: 76 | if self.val_loss < val_loss+self.tolr: 77 | self.count += 1 78 | self.val_loss = val_loss 79 | else: 80 | self.count = 0 81 | self.val_loss = val_loss 82 | 83 | if self.lr_patience == self.count: 84 | self.count = 0 85 | self.lr *= self.lr_factor 86 | self.set_lr(self.optimizer, self.lr) 87 | 88 | return self.lr 89 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/adamp.py: -------------------------------------------------------------------------------- 1 | # AdamP 2 | # Copyright (c) 2020-present NAVER Corp. 3 | # MIT license 4 | 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | import math 8 | 9 | 10 | class AdamP(Optimizer): 11 | """ 12 | Paper: "AdamP: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights" 13 | 14 | Copied from https://github.com/clovaai/AdamP/ 15 | Copyright (c) 2020 Naver Corp. 
16 | MIT License 17 | """ 18 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 19 | weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False): 20 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 21 | delta=delta, wd_ratio=wd_ratio, nesterov=nesterov) 22 | super(AdamP, self).__init__(params, defaults) 23 | 24 | def _channel_view(self, x): 25 | return x.view(x.size(0), -1) 26 | 27 | def _layer_view(self, x): 28 | return x.view(1, -1) 29 | 30 | def _cosine_similarity(self, x, y, eps, view_func): 31 | x = view_func(x) 32 | y = view_func(y) 33 | 34 | x_norm = x.norm(dim=1).add_(eps) 35 | y_norm = y.norm(dim=1).add_(eps) 36 | dot = (x * y).sum(dim=1) 37 | 38 | return dot.abs() / x_norm / y_norm 39 | 40 | def _projection(self, p, grad, perturb, delta, wd_ratio, eps): 41 | wd = 1 42 | expand_size = [-1] + [1] * (len(p.shape) - 1) 43 | for view_func in [self._channel_view, self._layer_view]: 44 | 45 | cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) 46 | 47 | if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): 48 | p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) 49 | perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) 50 | wd = wd_ratio 51 | 52 | return perturb, wd 53 | 54 | return perturb, wd 55 | 56 | def step(self, closure=None): 57 | loss = None 58 | if closure is not None: 59 | loss = closure() 60 | 61 | for group in self.param_groups: 62 | for p in group['params']: 63 | if p.grad is None: 64 | continue 65 | 66 | grad = p.grad.data 67 | beta1, beta2 = group['betas'] 68 | nesterov = group['nesterov'] 69 | 70 | state = self.state[p] 71 | 72 | # State initialization 73 | if len(state) == 0: 74 | state['step'] = 0 75 | state['exp_avg'] = torch.zeros_like(p.data) 76 | state['exp_avg_sq'] = torch.zeros_like(p.data) 77 | 78 | # Adam 79 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 80 | 81 | state['step'] += 1 82 | bias_correction1 = 1 - beta1 ** state['step'] 83 | bias_correction2 = 1 - beta2 ** state['step'] 84 | 85 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 86 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 87 | 88 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 89 | step_size = group['lr'] / bias_correction1 90 | 91 | if nesterov: 92 | perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom 93 | else: 94 | perturb = exp_avg / denom 95 | 96 | # Projection 97 | wd_ratio = 1 98 | if len(p.shape) > 1: 99 | perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], 100 | group['eps']) 101 | 102 | # Weight decay 103 | if group['weight_decay'] > 0: 104 | p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio) 105 | 106 | # Step 107 | p.data.add_(-step_size, perturb) 108 | 109 | return loss 110 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_aamsoftmax/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | from .. 
import register_criterion 9 | from .configuration import AdaptiveAAMSoftmaxConfigs 10 | 11 | 12 | class ArcLinear(nn.Module): 13 | """Additive Angular Margin classification module 14 | Parameters 15 | ---------- 16 | nfeat : int 17 | Embedding dimension 18 | nclass : int 19 | Number of classes 20 | margin : float 21 | Angular margin to penalize distances between embeddings and centers 22 | scale : float 23 | Scaling factor for the logits 24 | """ 25 | 26 | def __init__(self, nfeat, nclass, margin, scale): 27 | super(ArcLinear, self).__init__() 28 | eps = 1e-4 29 | self.min_cos = eps - 1 30 | self.max_cos = 1 - eps 31 | self.nclass = nclass 32 | self.margin = margin 33 | self.scale = scale 34 | self.W = nn.Parameter(Tensor(nclass, nfeat)) 35 | nn.init.xavier_uniform_(self.W) 36 | 37 | def forward(self, x, target=None): 38 | """Apply the angular margin transformation 39 | Parameters 40 | ---------- 41 | x : `torch.Tensor` 42 | an embedding batch 43 | target : `torch.Tensor` 44 | a non one-hot label batch 45 | Returns 46 | ------- 47 | fX : `torch.Tensor` 48 | logits after the angular margin transformation 49 | """ 50 | # normalize the feature vectors and W 51 | xnorm = F.normalize(x) 52 | Wnorm = F.normalize(self.W) 53 | target = target.long().view(-1, 1) 54 | # calculate cosθj (the logits) 55 | cos_theta_j = torch.matmul(xnorm, torch.transpose(Wnorm, 0, 1)) 56 | # get the cosθ corresponding to the classes 57 | cos_theta_yi = cos_theta_j.gather(1, target) 58 | # for numerical stability 59 | cos_theta_yi = cos_theta_yi.clamp(min=self.min_cos, max=self.max_cos) 60 | # get the angle separating xi and Wyi 61 | theta_yi = torch.acos(cos_theta_yi) 62 | # apply the margin to the angle 63 | cos_theta_yi_margin = torch.cos(theta_yi + self.margin) 64 | # one hot encode y 65 | one_hot = torch.zeros_like(cos_theta_j) 66 | one_hot.scatter_(1, target, 1.0) 67 | # project margin differences into cosθj 68 | return self.scale * (cos_theta_j + one_hot * (cos_theta_yi_margin - cos_theta_yi)) 69 | 70 | 71 | @register_criterion("adaptive_aamsoftmax", dataclass=AdaptiveAAMSoftmaxConfigs) 72 | class AdaptiveAAMSoftmax(nn.Module): 73 | def __init__(self, 74 | configs: DictConfig, 75 | num_classes: int, 76 | embedding_size: int 77 | ) -> None: 78 | super(AdaptiveAAMSoftmax, self).__init__() 79 | self.configs = configs 80 | self.classifier_ = ArcLinear( 81 | nfeat=self.configs.model.embed_dim, 82 | nclass=num_classes, 83 | margin=configs.criterion.margin, 84 | scale=configs.criterion.scale 85 | ) 86 | self.margin = configs.criterion.margin 87 | self.logsoftmax_ = nn.LogSoftmax(dim=1) 88 | self.loss_ = nn.NLLLoss() 89 | self.warmup_steps = configs.lr_scheduler.warmup_steps if configs.lr_scheduler.scheduler_name.startswith('warmup') else 0 90 | self.increase_steps = configs.criterion.increase_steps 91 | self.increase_rate = self.margin / (self.increase_steps - self.warmup_steps) 92 | 93 | def step(self, global_steps): 94 | if global_steps < self.warmup_steps: 95 | self.classifier_.margin = 0 96 | elif global_steps < self.increase_steps: 97 | self.classifier_.margin = (global_steps - self.warmup_steps) * self.increase_rate 98 | else: 99 | self.classifier_.margin = self.margin 100 | 101 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 102 | logits = self.logsoftmax_(self.classifier_(embeddings, target=targets)) 103 | return self.loss_(logits, targets) 104 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/utils.py: 
-------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import pandas as pd 6 | import yaml 7 | 8 | from pyannote.core import Segment, Timeline, SlidingWindow 9 | 10 | 11 | def load_dataframe(wav_dir, table_path): 12 | df = pd.read_csv(table_path, header=None, delimiter=' ') 13 | df[0] = df[0].apply(lambda x: Path(wav_dir) / f'{x}.wav') 14 | return df 15 | 16 | 17 | def load_trial_dataframe(wav_dir, table_path): 18 | df = pd.read_csv(table_path, header=None, delimiter=' ') 19 | df[1] = df[1].apply(lambda x: Path(wav_dir) / f'{x}') 20 | df[2] = df[2].apply(lambda x: Path(wav_dir) / f'{x}') 21 | trials = [] 22 | for row in df.iterrows(): 23 | y, enroll, test = row[1] 24 | trials.append((enroll, test, y)) 25 | return trials 26 | 27 | 28 | def get_speaker_from_dataframe(dataframe): 29 | return set(dataframe[3]) 30 | 31 | 32 | def get_spk_id(speakers): 33 | sorted_speakers = sorted(list(speakers)) 34 | return {spk: i for i, spk in enumerate(sorted_speakers)} 35 | 36 | 37 | def split_segment(segment, duration, step): 38 | if segment.duration < duration + step: 39 | return Timeline([segment]) 40 | else: 41 | segs = [] 42 | sw = SlidingWindow(start=segment.start, duration=duration, step=step) 43 | for s in sw: 44 | if s in segment: 45 | segs.append(s) 46 | else: 47 | break 48 | if s.start < segment.end < s.end: 49 | segs.append(Segment(segment.end - duration, segment.end)) 50 | return Timeline(segs) 51 | 52 | 53 | def get_dataset_items(database_yml, dataset_names, category='train'): 54 | dataset_items = [] 55 | dataset_names = dataset_names.split(',') 56 | dataset_names = [n.strip() for n in dataset_names] 57 | with open(database_yml) as fp: 58 | dataset = yaml.load(fp, Loader=yaml.FullLoader) 59 | for name in dataset_names: 60 | dataset_items.append(get_dataset_item(dataset, name, category)) 61 | return dataset_items 62 | 63 | 64 | def get_dataset_item(dataset, name, category='train'): 65 | dataset_item = dataset['Datasets']['SpeakerDataset'][category].get(name, None) 66 | if dataset_item is None: 67 | msg = f'{name} does not exist' 68 | raise ValueError(msg) 69 | return dataset_item['wav_dir'], dataset_item['list_path'] 70 | 71 | 72 | class SpeakerDataframe: 73 | def __init__(self, dataset_items, 74 | strict=False, 75 | segment_min_duration=0, 76 | speaker_min_duration=0): 77 | self.strict = strict 78 | self.segment_min_duration = segment_min_duration 79 | self.speaker_min_duration = speaker_min_duration 80 | dfs = [load_dataframe(*item) for item in dataset_items] 81 | self.check_speakers(dfs) 82 | self.load_speaker2items(dfs) 83 | 84 | def check_speakers(self, dataframes): 85 | all_spks = [get_speaker_from_dataframe(df) for df in dataframes] 86 | if len(all_spks) > 1 and len(set.intersection(*all_spks)) > 0: 87 | msg = 'Different datasets contain same speakers' 88 | if self.strict: 89 | raise ValueError(msg) 90 | else: 91 | warnings.warn(msg) 92 | 93 | def load_speaker2items(self, dataframes): 94 | self._speaker2items = defaultdict(list) 95 | self.spk2duration = defaultdict(int) 96 | for df in dataframes: 97 | for _, row in df.iterrows(): 98 | wav, start, end, spk = row 99 | if (end - start) < self.segment_min_duration: 100 | continue 101 | self._speaker2items[spk].append((wav, spk, Segment(start, end))) 102 | self.spk2duration[spk] += end - start 103 | 104 | for spk in self.spk2duration: 105 | if self.spk2duration[spk] < self.speaker_min_duration: 106 | 
self._speaker2items.pop(spk) 107 | 108 | self._spk_ids = get_spk_id(self._speaker2items.keys()) 109 | 110 | @property 111 | def spk2ids(self): 112 | return self._spk_ids 113 | 114 | @property 115 | def speaker2items(self): 116 | return self._speaker2items 117 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/ResNetSE34V2.py: -------------------------------------------------------------------------------- 1 | from deepaudio.speaker.modules.backbones.clovaai.ResNetBlocks import * 2 | 3 | 4 | class ResNetSE(nn.Module): 5 | def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs): 6 | super(ResNetSE, self).__init__() 7 | 8 | print('Embedding size is %d, encoder %s.' % (nOut, encoder_type)) 9 | 10 | self.inplanes = num_filters[0] 11 | self.encoder_type = encoder_type 12 | self.n_mels = n_mels 13 | self.log_input = log_input 14 | 15 | self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 18 | 19 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 20 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 21 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 22 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(2, 2)) 23 | 24 | outmap_size = int(self.n_mels / 8) 25 | 26 | self.attention = nn.Sequential( 27 | nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), 28 | nn.ReLU(), 29 | nn.BatchNorm1d(128), 30 | nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), 31 | nn.Softmax(dim=2), 32 | ) 33 | 34 | if self.encoder_type == "SAP": 35 | out_dim = num_filters[3] * outmap_size 36 | elif self.encoder_type == "ASP": 37 | out_dim = num_filters[3] * outmap_size * 2 38 | else: 39 | raise ValueError('Undefined encoder') 40 | 41 | self.fc = nn.Linear(out_dim, nOut) 42 | 43 | for m in self.modules(): 44 | if isinstance(m, nn.Conv2d): 45 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 46 | elif isinstance(m, nn.BatchNorm2d): 47 | nn.init.constant_(m.weight, 1) 48 | nn.init.constant_(m.bias, 0) 49 | 50 | def _make_layer(self, block, planes, blocks, stride=1): 51 | downsample = None 52 | if stride != 1 or self.inplanes != planes * block.expansion: 53 | downsample = nn.Sequential( 54 | nn.Conv2d(self.inplanes, planes * block.expansion, 55 | kernel_size=1, stride=stride, bias=False), 56 | nn.BatchNorm2d(planes * block.expansion), 57 | ) 58 | 59 | layers = [] 60 | layers.append(block(self.inplanes, planes, stride, downsample)) 61 | self.inplanes = planes * block.expansion 62 | for i in range(1, blocks): 63 | layers.append(block(self.inplanes, planes)) 64 | 65 | return nn.Sequential(*layers) 66 | 67 | def new_parameter(self, *size): 68 | out = nn.Parameter(torch.FloatTensor(*size)) 69 | nn.init.xavier_normal_(out) 70 | return out 71 | 72 | def forward(self, x): 73 | x = x.unsqueeze(1) 74 | x = x.transpose(-1, -2) 75 | x = self.conv1(x) 76 | x = self.relu(x) 77 | x = self.bn1(x) 78 | 79 | x = self.layer1(x) 80 | x = self.layer2(x) 81 | x = self.layer3(x) 82 | x = self.layer4(x) 83 | 84 | x = x.reshape(x.size()[0], -1, x.size()[-1]) 85 | 86 | w = self.attention(x) 87 | 88 | if self.encoder_type == "SAP": 89 | x = torch.sum(x * w, dim=2) 90 | elif self.encoder_type == "ASP": 91 | mu = torch.sum(x * w, dim=2) 92 | sg = torch.sqrt((torch.sum((x ** 2) 
* w, dim=2) - mu ** 2).clamp(min=1e-5)) 93 | x = torch.cat((mu, sg), 1) 94 | 95 | x = x.view(x.size()[0], -1) 96 | x = self.fc(x) 97 | 98 | return x 99 | 100 | 101 | def MainModel(configs): 102 | # Number of filters 103 | num_filters = [32, 64, 128, 256] 104 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], 105 | num_filters, 106 | nOut=configs.model.embed_dim, 107 | encoder_type=configs.model.encoder_type, 108 | n_mels=configs.feature.n_mels 109 | ) 110 | return model 111 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/wespeaker/tdnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """TDNN model for x-vector learning""" 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | import deepaudio.speaker.modules.backbones.wespeaker.pooling_layers as pooling_layers 21 | 22 | 23 | class TdnnLayer(nn.Module): 24 | def __init__(self, in_dim, out_dim, context_size, dilation=1, padding=0): 25 | """Define the TDNN layer, essentially 1-D convolution 26 | 27 | Args: 28 | in_dim (int): input dimension 29 | out_dim (int): output channels 30 | context_size (int): context size, essentially the filter size 31 | dilation (int, optional): Defaults to 1. 32 | padding (int, optional): Defaults to 0. 
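        Receptive field (illustrative note): a single layer spans
        (context_size - 1) * dilation + 1 input frames, so the default XVEC
        stack below with (context_size, dilation) pairs (5, 1), (3, 2), (3, 3),
        (1, 1), (1, 1) gives every output frame a total context of
        1 + 2 * (2 * 1 + 1 * 2 + 1 * 3) = 15 frames, i.e. the usual +/-7 frame
        context of the original x-vector recipe.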
33 | """ 34 | super(TdnnLayer, self).__init__() 35 | self.in_dim = in_dim 36 | self.out_dim = out_dim 37 | self.context_size = context_size 38 | self.dilation = dilation 39 | self.padding = padding 40 | self.conv_1d = nn.Conv1d(self.in_dim, 41 | self.out_dim, 42 | self.context_size, 43 | dilation=self.dilation, 44 | padding=self.padding) 45 | 46 | # Set Affine=false to be compatible with the original kaldi version 47 | self.bn = nn.BatchNorm1d(out_dim, affine=False) 48 | 49 | def forward(self, x): 50 | out = self.conv_1d(x) 51 | out = F.relu(out) 52 | out = self.bn(out) 53 | return out 54 | 55 | 56 | class XVEC(nn.Module): 57 | def __init__(self, 58 | feat_dim=40, 59 | hid_dim=512, 60 | stats_dim=1500, 61 | embed_dim=512, 62 | pooling_func='TSTP'): 63 | """ 64 | Implementation of Kaldi style xvec, as described in 65 | X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION 66 | """ 67 | super(XVEC, self).__init__() 68 | self.feat_dim = feat_dim 69 | self.stats_dim = stats_dim 70 | self.embed_dim = embed_dim 71 | 72 | self.frame_1 = TdnnLayer(feat_dim, hid_dim, context_size=5, dilation=1) 73 | self.frame_2 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=2) 74 | self.frame_3 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=3) 75 | self.frame_4 = TdnnLayer(hid_dim, hid_dim, context_size=1, dilation=1) 76 | self.frame_5 = TdnnLayer(hid_dim, 77 | stats_dim, 78 | context_size=1, 79 | dilation=1) 80 | 81 | self.pool = getattr(pooling_layers, pooling_func)(in_dim=stats_dim) 82 | self.pool_out_dim = self.pool.get_out_dim() 83 | self.seg_1 = nn.Linear(self.pool_out_dim, embed_dim) 84 | self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False) 85 | self.seg_2 = nn.Linear(embed_dim, embed_dim) 86 | 87 | def forward(self, x): 88 | x = x.permute(0, 2, 1) # (B,T,F) -> (B,F,T) 89 | 90 | out = self.frame_1(x) 91 | out = self.frame_2(out) 92 | out = self.frame_3(out) 93 | out = self.frame_4(out) 94 | out = self.frame_5(out) 95 | 96 | stats = self.pool(out) 97 | embed_a = self.seg_1(stats) 98 | out = F.relu(embed_a) 99 | out = self.seg_bn_1(out) 100 | embed_b = self.seg_2(out) 101 | 102 | return embed_a, embed_b 103 | 104 | 105 | if __name__ == '__main__': 106 | model = XVEC(feat_dim=80, embed_dim=512, pooling_func='TSTP') 107 | model.eval() 108 | y = model(torch.rand(10, 200, 80)) 109 | print(y[-1].size()) 110 | 111 | num_params = sum(p.numel() for p in model.parameters()) 112 | print("{} M".format(num_params / 1e6)) 113 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_reduce_lr_on_plateau_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from omegaconf import DictConfig 24 | from torch.optim import Optimizer 25 | from torch.optim.lr_scheduler import ReduceLROnPlateau 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | from deepaudio.speaker.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler 33 | from deepaudio.speaker.optim.scheduler.warmup_scheduler import WarmupLRScheduler 34 | 35 | 36 | @dataclass 37 | class WarmupReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): 38 | scheduler_name: str = field( 39 | default="warmup_reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} 40 | ) 41 | lr_patience: int = field( 42 | default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} 43 | ) 44 | lr_factor: float = field( 45 | default=0.3, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 46 | ) 47 | peak_lr: float = field( 48 | default=1e-04, metadata={"help": "Maximum learning rate."} 49 | ) 50 | init_lr: float = field( 51 | default=1e-10, metadata={"help": "Initial learning rate."} 52 | ) 53 | warmup_steps: int = field( 54 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 55 | ) 56 | 57 | 58 | @register_scheduler("warmup_reduce_lr_on_plateau", dataclass=WarmupReduceLROnPlateauConfigs) 59 | class WarmupReduceLROnPlateauScheduler(LearningRateScheduler, ReduceLROnPlateau): 60 | r""" 61 | Warmup learning rate until `warmup_steps` and reduce learning rate on plateau after. 62 | 63 | Args: 64 | optimizer (Optimizer): wrapped optimizer. 65 | configs (DictConfig): configuration set. 
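        With the defaults above, for example, the first 4000 updates ramp the
        learning rate linearly from ``init_lr`` (1e-10) toward ``peak_lr``
        (1e-04); after warmup, ``step(val_loss)`` defers to the plateau
        scheduler, which multiplies the rate by ``lr_factor`` (0.3) whenever the
        validation loss stops improving for ``lr_patience`` (1) epoch.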
66 | """ 67 | def __init__( 68 | self, 69 | optimizer: Optimizer, 70 | configs: DictConfig, 71 | ) -> None: 72 | super(WarmupReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 73 | self.warmup_steps = configs.lr_scheduler.warmup_steps 74 | self.update_steps = 0 75 | self.warmup_rate = (configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr) / self.warmup_steps \ 76 | if self.warmup_steps != 0 else 0 77 | self.schedulers = [ 78 | WarmupLRScheduler( 79 | optimizer, 80 | configs, 81 | ), 82 | ReduceLROnPlateauScheduler( 83 | optimizer, 84 | configs, 85 | ), 86 | ] 87 | 88 | def _decide_stage(self): 89 | if self.update_steps < self.warmup_steps: 90 | return 0, self.update_steps 91 | else: 92 | return 1, None 93 | 94 | def step(self, val_loss: Optional[float] = None): 95 | stage, steps_in_stage = self._decide_stage() 96 | 97 | if stage == 0: 98 | self.schedulers[0].step() 99 | elif stage == 1: 100 | self.schedulers[1].step(val_loss) 101 | 102 | self.update_steps += 1 103 | 104 | return self.get_lr() 105 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/transformer_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
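# Note (illustrative summary of the schedule implemented below): the learning
# rate is piecewise,
#   warmup (step < warmup_steps):      lr = step * peak_lr / warmup_steps
#   decay  (next decay_steps updates): lr = peak_lr * exp(-decay_factor * t),
#                                      decay_factor = -ln(final_lr_scale) / decay_steps
#   floor  (afterwards):               lr = final_lr
# With the defaults (peak_lr=1e-04, final_lr_scale=0.05, decay_steps=150000) the
# rate decays from 1e-04 toward 1e-04 * 0.05 = 5e-06 before settling at final_lr.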
22 | 23 | import math 24 | import torch 25 | from typing import Optional 26 | from dataclasses import dataclass, field 27 | from omegaconf import DictConfig 28 | from torch.optim import Optimizer 29 | 30 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 31 | from deepaudio.speaker.optim.scheduler import register_scheduler 32 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 33 | 34 | 35 | @dataclass 36 | class TransformerLRSchedulerConfigs(LearningRateSchedulerConfigs): 37 | scheduler_name: str = field( 38 | default="transformer", metadata={"help": "Name of learning rate scheduler."} 39 | ) 40 | peak_lr: float = field( 41 | default=1e-04, metadata={"help": "Maximum learning rate."} 42 | ) 43 | final_lr: float = field( 44 | default=1e-07, metadata={"help": "Final learning rate."} 45 | ) 46 | final_lr_scale: float = field( 47 | default=0.05, metadata={"help": "Final learning rate scale"} 48 | ) 49 | warmup_steps: int = field( 50 | default=10000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 51 | ) 52 | decay_steps: int = field( 53 | default=150000, metadata={"help": "Steps in decay stages"} 54 | ) 55 | 56 | 57 | @register_scheduler("transformer", dataclass=TransformerLRSchedulerConfigs) 58 | class TransformerLRScheduler(LearningRateScheduler): 59 | r""" 60 | Transformer Learning Rate Scheduler proposed in "Attention Is All You Need" 61 | 62 | Args: 63 | optimizer (Optimizer): wrapped optimizer. 64 | configs (DictConfig): configuration set. 65 | """ 66 | def __init__( 67 | self, 68 | optimizer: Optimizer, 69 | configs: DictConfig, 70 | ) -> None: 71 | assert isinstance(configs.lr_scheduler.warmup_steps, int), "warmup_steps should be inteager type" 72 | assert isinstance(configs.lr_scheduler.decay_steps, int), "total_steps should be inteager type" 73 | 74 | super(TransformerLRScheduler, self).__init__(optimizer, 0.0) 75 | self.final_lr = configs.lr_scheduler.final_lr 76 | self.peak_lr = configs.lr_scheduler.peak_lr 77 | self.warmup_steps = configs.lr_scheduler.warmup_steps 78 | self.decay_steps = configs.lr_scheduler.decay_steps 79 | 80 | self.warmup_rate = self.peak_lr / self.warmup_steps 81 | self.decay_factor = -math.log(configs.lr_scheduler.final_lr_scale) / self.decay_steps 82 | 83 | self.lr = self.init_lr 84 | self.update_step = 0 85 | 86 | def _decide_stage(self): 87 | if self.update_step < self.warmup_steps: 88 | return 0, self.update_step 89 | 90 | if self.warmup_steps <= self.update_step < self.warmup_steps + self.decay_steps: 91 | return 1, self.update_step - self.warmup_steps 92 | 93 | return 2, None 94 | 95 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 96 | self.update_step += 1 97 | stage, steps_in_stage = self._decide_stage() 98 | 99 | if stage == 0: 100 | self.lr = self.update_step * self.warmup_rate 101 | elif stage == 1: 102 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 103 | elif stage == 2: 104 | self.lr = self.final_lr 105 | else: 106 | raise ValueError("Undefined stage") 107 | 108 | self.set_lr(self.optimizer, self.lr) 109 | 110 | return self.lr 111 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/ResNetSE34L.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from deepaudio.speaker.modules.backbones.clovaai.ResNetBlocks import * 3 | 4 | 5 | class ResNetSE(nn.Module): 6 | def 
__init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, **kwargs): 7 | super(ResNetSE, self).__init__() 8 | 9 | print('Embedding size is %d, encoder %s.' % (nOut, encoder_type)) 10 | 11 | self.inplanes = num_filters[0] 12 | self.encoder_type = encoder_type 13 | self.n_mels = n_mels 14 | 15 | self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3, 16 | bias=False) 17 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 18 | self.relu = nn.ReLU(inplace=True) 19 | 20 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 21 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 22 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 23 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1)) 24 | 25 | if self.encoder_type == "SAP": 26 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 27 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 28 | out_dim = num_filters[3] * block.expansion 29 | elif self.encoder_type == "ASP": 30 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 31 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 32 | out_dim = num_filters[3] * block.expansion * 2 33 | else: 34 | raise ValueError('Undefined encoder') 35 | 36 | self.fc = nn.Linear(out_dim, nOut) 37 | 38 | for m in self.modules(): 39 | if isinstance(m, nn.Conv2d): 40 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 41 | elif isinstance(m, nn.BatchNorm2d): 42 | nn.init.constant_(m.weight, 1) 43 | nn.init.constant_(m.bias, 0) 44 | 45 | def _make_layer(self, block, planes, blocks, stride=1): 46 | downsample = None 47 | if stride != 1 or self.inplanes != planes * block.expansion: 48 | downsample = nn.Sequential( 49 | nn.Conv2d(self.inplanes, planes * block.expansion, 50 | kernel_size=1, stride=stride, bias=False), 51 | nn.BatchNorm2d(planes * block.expansion), 52 | ) 53 | 54 | layers = [] 55 | layers.append(block(self.inplanes, planes, stride, downsample)) 56 | self.inplanes = planes * block.expansion 57 | for i in range(1, blocks): 58 | layers.append(block(self.inplanes, planes)) 59 | 60 | return nn.Sequential(*layers) 61 | 62 | def new_parameter(self, *size): 63 | out = nn.Parameter(torch.FloatTensor(*size)) 64 | nn.init.xavier_normal_(out) 65 | return out 66 | 67 | def forward(self, x): 68 | x = x.unsqueeze(1) 69 | x = x.transpose(-1, -2) 70 | x = self.conv1(x) 71 | x = self.bn1(x) 72 | x = self.relu(x) 73 | 74 | x = self.layer1(x) 75 | x = self.layer2(x) 76 | x = self.layer3(x) 77 | x = self.layer4(x) 78 | 79 | x = torch.mean(x, dim=2, keepdim=True) 80 | 81 | if self.encoder_type == "SAP": 82 | x = x.permute(0, 3, 1, 2).squeeze(-1) 83 | h = torch.tanh(self.sap_linear(x)) 84 | w = torch.matmul(h, self.attention).squeeze(dim=2) 85 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 86 | x = torch.sum(x * w, dim=1) 87 | elif self.encoder_type == "ASP": 88 | x = x.permute(0, 3, 1, 2).squeeze(-1) 89 | h = torch.tanh(self.sap_linear(x)) 90 | w = torch.matmul(h, self.attention).squeeze(dim=2) 91 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 92 | mu = torch.sum(x * w, dim=1) 93 | rh = torch.sqrt((torch.sum((x ** 2) * w, dim=1) - mu ** 2).clamp(min=1e-5)) 94 | x = torch.cat((mu, rh), 1) 95 | 96 | x = x.view(x.size()[0], -1) 97 | x = self.fc(x) 98 | 99 | return x 100 | 101 | 102 | def MainModel(configs): 103 | # 
Number of filters 104 | num_filters = [16, 32, 64, 128] 105 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], 106 | num_filters, 107 | nOut=configs.model.embed_dim, 108 | encoder_type=configs.model.encoder_type 109 | ) 110 | return model 111 | 112 | if __name__ == '__main__': 113 | # Input size: batch_size * seq_len * feat_dim 114 | from omegaconf import OmegaConf 115 | configs = OmegaConf.create({'model': {'embed_dim': 192, 'encoder_type': 'SAP'}})  # illustrative minimal config for a standalone smoke test 116 | x = torch.zeros(2, 200, 80) 117 | model = MainModel(configs) 118 | out = model(x) 119 | print(out.shape) # should be [2, 192] -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_steplr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from omegaconf import DictConfig 24 | from torch.optim import Optimizer 25 | from torch.optim.lr_scheduler import ReduceLROnPlateau 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | from deepaudio.speaker.optim.scheduler.warmup_scheduler import WarmupLRScheduler 33 | from deepaudio.speaker.optim.scheduler.fix_lr_scheduler import FixLRScheduler 34 | from deepaudio.speaker.optim.scheduler.step_lr_scheduler import StepLRScheduler 35 | 36 | 37 | @dataclass 38 | class WarmupStepLRConfigs(LearningRateSchedulerConfigs): 39 | scheduler_name: str = field( 40 | default="warmup_step_lr", metadata={"help": "Name of learning rate scheduler."} 41 | ) 42 | lr_factor: float = field( 43 | default=0.3, metadata={"help": "Factor by which the learning rate will be reduced.
new_lr = lr * factor."} 44 | ) 45 | peak_lr: float = field( 46 | default=1e-04, metadata={"help": "Maximum learning rate."} 47 | ) 48 | init_lr: float = field( 49 | default=1e-10, metadata={"help": "Initial learning rate."} 50 | ) 51 | warmup_steps: int = field( 52 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 53 | ) 54 | min_lr: float = field( 55 | default=1e-7, metadata={"help": "Min learning rate."} 56 | ) 57 | step_size: int = field( 58 | default=70000, metadata={"help": "Step size to decay"} 59 | ) 60 | freeze_steps: int = field( 61 | default=400000, metadata={"help": "Step size to decay"} 62 | ) 63 | 64 | 65 | @register_scheduler("warmup_step_lr", dataclass=WarmupStepLRConfigs) 66 | class WarmupStepLRScheduler(LearningRateScheduler): 67 | r""" 68 | Warmup learning rate until `warmup_steps` and reduce learning rate on plateau after. 69 | 70 | Args: 71 | optimizer (Optimizer): wrapped optimizer. 72 | configs (DictConfig): configuration set. 73 | """ 74 | def __init__( 75 | self, 76 | optimizer: Optimizer, 77 | configs: DictConfig, 78 | ) -> None: 79 | super(WarmupStepLRScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 80 | self.warmup_steps = configs.lr_scheduler.warmup_steps 81 | self.update_steps = 0 82 | self.warmup_rate = (configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr) / self.warmup_steps \ 83 | if self.warmup_steps != 0 else 0 84 | self.freeze_steps = configs.lr_scheduler.freeze_steps 85 | self.schedulers = [ 86 | WarmupLRScheduler( 87 | optimizer, 88 | configs, 89 | ), 90 | FixLRScheduler( 91 | optimizer, 92 | configs, 93 | ), 94 | StepLRScheduler( 95 | optimizer, 96 | configs, 97 | ), 98 | ] 99 | 100 | def _decide_stage(self): 101 | if self.update_steps < self.warmup_steps: 102 | return 0, self.update_steps 103 | elif self.update_steps < self.freeze_steps: 104 | return 1, self.update_steps 105 | else: 106 | return 2, None 107 | 108 | def step(self, val_loss: Optional[float] = None): 109 | stage, steps_in_stage = self._decide_stage() 110 | 111 | if stage == 0: 112 | self.schedulers[0].step() 113 | elif stage == 1: 114 | self.schedulers[1].step() 115 | elif stage == 2: 116 | self.schedulers[2].step() 117 | 118 | self.update_steps += 1 119 | 120 | return self.get_lr() 121 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_adaptive_loss_reduce_lr_on_plateau_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from omegaconf import DictConfig 24 | from torch.optim import Optimizer 25 | from torch.optim.lr_scheduler import ReduceLROnPlateau 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | from deepaudio.speaker.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler 33 | from deepaudio.speaker.optim.scheduler.warmup_scheduler import WarmupLRScheduler 34 | from deepaudio.speaker.optim.scheduler.fix_lr_scheduler import FixLRScheduler 35 | 36 | 37 | 38 | @dataclass 39 | class WarmupAdaptiveReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): 40 | scheduler_name: str = field( 41 | default="warmup_adaptive_reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} 42 | ) 43 | lr_patience: int = field( 44 | default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} 45 | ) 46 | lr_factor: float = field( 47 | default=0.3, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 48 | ) 49 | tolr: float = field( 50 | default=0.01, metadata={"help": "Tolr for loss."} 51 | ) 52 | peak_lr: float = field( 53 | default=1e-04, metadata={"help": "Maximum learning rate."} 54 | ) 55 | init_lr: float = field( 56 | default=1e-10, metadata={"help": "Initial learning rate."} 57 | ) 58 | warmup_steps: int = field( 59 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 60 | ) 61 | 62 | 63 | @register_scheduler("warmup_adaptive_reduce_lr_on_plateau", dataclass=WarmupAdaptiveReduceLROnPlateauConfigs) 64 | class WarmupAdaptiveReduceLROnPlateauScheduler(LearningRateScheduler, ReduceLROnPlateau): 65 | r""" 66 | Warmup learning rate until `warmup_steps` and reduce learning rate on plateau after. 67 | 68 | Args: 69 | optimizer (Optimizer): wrapped optimizer. 70 | configs (DictConfig): configuration set. 
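        Staging note: ``_decide_stage`` below selects between three phases by
        update count: linear warmup until ``warmup_steps``, then a fixed
        learning rate until ``criterion.increase_steps`` (while the
        adaptive-margin criterion ramps its margin), and reduce-on-plateau
        driven by ``val_loss`` afterwards.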
71 | """ 72 | def __init__( 73 | self, 74 | optimizer: Optimizer, 75 | configs: DictConfig, 76 | ) -> None: 77 | super(WarmupAdaptiveReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 78 | self.warmup_steps = configs.lr_scheduler.warmup_steps 79 | self.update_steps = 0 80 | self.warmup_rate = (configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr) / self.warmup_steps \ 81 | if self.warmup_steps != 0 else 0 82 | self.increase_steps = configs.criterion.increase_steps 83 | self.schedulers = [ 84 | WarmupLRScheduler( 85 | optimizer, 86 | configs, 87 | ), 88 | ReduceLROnPlateauScheduler( 89 | optimizer, 90 | configs, 91 | ), 92 | FixLRScheduler( 93 | optimizer, 94 | configs, 95 | ), 96 | ] 97 | 98 | def _decide_stage(self): 99 | if self.update_steps < self.warmup_steps: 100 | return 0, self.update_steps 101 | elif self.update_steps < self.increase_steps: 102 | return 2, self.update_steps 103 | else: 104 | return 1, None 105 | 106 | def step(self, val_loss: Optional[float] = None): 107 | stage, steps_in_stage = self._decide_stage() 108 | 109 | if stage == 0: 110 | self.schedulers[0].step() 111 | elif stage == 1: 112 | self.schedulers[1].step(val_loss) 113 | elif stage == 2: 114 | self.schedulers[2].step() 115 | 116 | self.update_steps += 1 117 | 118 | return self.get_lr() 119 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/speaker_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pytorch_lightning as pl 4 | from typing import Dict 5 | from omegaconf import DictConfig 6 | from torch import Tensor 7 | from torch.optim import Adam, Adagrad, Adadelta, Adamax, AdamW, SGD, ASGD 8 | 9 | from deepaudio.speaker.optim import AdamP, RAdam, Novograd 10 | from deepaudio.speaker.criterion import CRITERION_REGISTRY 11 | from deepaudio.speaker.optim.scheduler import SCHEDULER_REGISTRY 12 | 13 | 14 | class SpeakerModel(pl.LightningModule): 15 | def __init__(self, configs: DictConfig, num_classes: int) -> None: 16 | super(SpeakerModel, self).__init__() 17 | self.configs = configs 18 | self.num_classes = num_classes 19 | self.gradient_clip_val = configs.trainer.gradient_clip_val 20 | self.current_val_loss = 100.0 21 | self.build_model() 22 | self.criterion = self.configure_criterion(configs.criterion.name) 23 | 24 | def build_model(self): 25 | raise NotImplementedError 26 | 27 | def forward(self, inputs: torch.FloatTensor) -> Tensor: 28 | raise NotImplementedError 29 | 30 | def training_step(self, batch: tuple, batch_idx: int): 31 | r""" 32 | Forward propagate a `inputs` and `targets` pair for training. 33 | 34 | Inputs: 35 | batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` 36 | batch_idx (int): The index of batch 37 | 38 | Returns: 39 | loss (torch.Tensor): loss for training 40 | """ 41 | raise NotImplementedError 42 | 43 | def validation_step(self, batch: tuple, batch_idx: int): 44 | r""" 45 | Forward propagate a `inputs` and `targets` pair for validation. 46 | 47 | Inputs: 48 | batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` 49 | batch_idx (int): The index of batch 50 | 51 | Returns: 52 | loss (torch.Tensor): loss for training 53 | """ 54 | raise NotImplementedError 55 | 56 | def configure_optimizers(self): 57 | r""" 58 | Choose what optimizers and learning-rate schedulers to use in your optimization. 
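        For example, with ``model.optimizer: adam`` and
        ``lr_scheduler.scheduler_name: reduce_lr_on_plateau`` in the config this
        returns ``[optimizer], [{'scheduler': scheduler, 'monitor': 'val_loss',
        'interval': 'epoch'}]``, the dictionary format PyTorch Lightning expects.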
59 | 60 | 61 | Returns: 62 | - **Dictionary** - The first item has multiple optimizers, and the second has multiple LR schedulers 63 | (or multiple ``lr_dict``). 64 | """ 65 | SUPPORTED_OPTIMIZERS = { 66 | "adam": Adam, 67 | "adamp": AdamP, 68 | "radam": RAdam, 69 | "adagrad": Adagrad, 70 | "adadelta": Adadelta, 71 | "adamax": Adamax, 72 | "adamw": AdamW, 73 | "sgd": SGD, 74 | "asgd": ASGD, 75 | "novograd": Novograd, 76 | } 77 | 78 | assert self.configs.model.optimizer in SUPPORTED_OPTIMIZERS.keys(), \ 79 | f"Unsupported Optimizer: {self.configs.model.optimizer}\n" \ 80 | f"Supported Optimizers: {SUPPORTED_OPTIMIZERS.keys()}" 81 | 82 | self.optimizer = SUPPORTED_OPTIMIZERS[self.configs.model.optimizer]( 83 | self.parameters(), 84 | lr=self.configs.lr_scheduler.lr, 85 | weight_decay=1e-5, 86 | ) 87 | scheduler = SCHEDULER_REGISTRY[self.configs.lr_scheduler.scheduler_name](self.optimizer, self.configs) 88 | 89 | if self.configs.lr_scheduler.scheduler_name == "reduce_lr_on_plateau": 90 | lr_scheduler = { 91 | 'scheduler': scheduler, 92 | 'monitor': 'val_loss', 93 | 'interval': 'epoch', 94 | } 95 | elif self.configs.lr_scheduler.scheduler_name == "warmup_reduce_lr_on_plateau": 96 | lr_scheduler = { 97 | 'scheduler': scheduler, 98 | 'monitor': 'val_loss', 99 | 'interval': 'step', 100 | } 101 | elif self.configs.lr_scheduler.scheduler_name == "warmup_adaptive_reduce_lr_on_plateau": 102 | lr_scheduler = { 103 | 'scheduler': scheduler, 104 | 'monitor': 'val_loss', 105 | 'interval': 'step', 106 | } 107 | else: 108 | print('by step') 109 | lr_scheduler = { 110 | 'scheduler': scheduler, 111 | 'interval': 'step', 112 | } 113 | 114 | return [self.optimizer], [lr_scheduler] 115 | 116 | def configure_criterion(self, criterion_name: str) -> nn.Module: 117 | r""" 118 | Configure criterion for training. 119 | 120 | Args: 121 | criterion_name (str): name of criterion 122 | 123 | Returns: 124 | criterion (nn.Module): criterion for training 125 | """ 126 | 127 | return CRITERION_REGISTRY[criterion_name]( 128 | configs=self.configs, 129 | num_classes=self.num_classes, 130 | embedding_size=self.configs.model.embed_dim 131 | ) 132 | 133 | def get_lr(self): 134 | for g in self.optimizer.param_groups: 135 | return g['lr'] 136 | 137 | def set_lr(self, lr): 138 | for g in self.optimizer.param_groups: 139 | g['lr'] = lr 140 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/radam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, LiyuanLucasLiu. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
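# Note (illustrative usage sketch; within this project the optimizer is normally
# built by SpeakerModel.configure_optimizers):
#
#     from deepaudio.speaker.optim import RAdam
#     optimizer = RAdam(model.parameters(), lr=1e-3, weight_decay=1e-5)
#
# During the first few updates the rectification term N_sma computed below is
# less than 5, so the update degenerates to SGD with momentum when
# degenerated_to_sgd=True; once N_sma >= 5 the Adam-style adaptive step with a
# rectified step size is used.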
14 | 15 | import math 16 | import torch 17 | from torch.optim.optimizer import Optimizer 18 | 19 | 20 | class RAdam(Optimizer): 21 | """ 22 | Paper: "On the Variance of the Adaptive Learning Rate and Beyond" 23 | 24 | Refer to https://github.com/LiyuanLucasLiu/RAdam 25 | Copyright (c) LiyuanLucasLiu 26 | Apache 2.0 License 27 | """ 28 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 29 | if lr < 0.0: 30 | raise ValueError("Invalid learning rate: {}".format(lr)) 31 | if eps < 0.0: 32 | raise ValueError("Invalid epsilon value: {}".format(eps)) 33 | if not 0.0 <= betas[0] < 1.0: 34 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 35 | if not 0.0 <= betas[1] < 1.0: 36 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 37 | 38 | self.degenerated_to_sgd = degenerated_to_sgd 39 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): 40 | for param in params: 41 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): 42 | param['buffer'] = [[None, None, None] for _ in range(10)] 43 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 44 | buffer=[[None, None, None] for _ in range(10)]) 45 | super(RAdam, self).__init__(params, defaults) 46 | 47 | def __setstate__(self, state): 48 | super(RAdam, self).__setstate__(state) 49 | 50 | def step(self, closure=None): 51 | 52 | loss = None 53 | if closure is not None: 54 | loss = closure() 55 | 56 | for group in self.param_groups: 57 | 58 | for p in group['params']: 59 | if p.grad is None: 60 | continue 61 | grad = p.grad.data.float() 62 | if grad.is_sparse: 63 | raise RuntimeError('RAdam does not support sparse gradients') 64 | 65 | p_data_fp32 = p.data.float() 66 | 67 | state = self.state[p] 68 | 69 | if len(state) == 0: 70 | state['step'] = 0 71 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 72 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 73 | else: 74 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 75 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 76 | 77 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 78 | beta1, beta2 = group['betas'] 79 | 80 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 81 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 82 | 83 | state['step'] += 1 84 | buffered = group['buffer'][int(state['step'] % 10)] 85 | if state['step'] == buffered[0]: 86 | N_sma, step_size = buffered[1], buffered[2] 87 | else: 88 | buffered[0] = state['step'] 89 | beta2_t = beta2 ** state['step'] 90 | N_sma_max = 2 / (1 - beta2) - 1 91 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 92 | buffered[1] = N_sma 93 | 94 | # more conservative since it's an approximated value 95 | if N_sma >= 5: 96 | step_size = math.sqrt( 97 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( 98 | N_sma_max - 2)) / (1 - beta1 ** state['step']) 99 | elif self.degenerated_to_sgd: 100 | step_size = 1.0 / (1 - beta1 ** state['step']) 101 | else: 102 | step_size = -1 103 | buffered[2] = step_size 104 | 105 | # more conservative since it's an approximated value 106 | if N_sma >= 5: 107 | if group['weight_decay'] != 0: 108 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 109 | denom = exp_avg_sq.sqrt().add_(group['eps']) 110 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 111 | p.data.copy_(p_data_fp32) 112 | elif step_size > 0: 113 | 
if group['weight_decay'] != 0: 114 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 115 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 116 | p.data.copy_(p_data_fp32) 117 | 118 | return loss 119 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/novograd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from torch.optim.optimizer import Optimizer 17 | 18 | 19 | class Novograd(Optimizer): 20 | """ 21 | Novograd algorithm. 22 | 23 | Copied from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/optimizers.py 24 | Copyright (c) 2019 NVIDIA Corp. 25 | Apache-2.0 License 26 | 27 | Args: 28 | params (iterable): iterable of parameters to optimize or dicts defining 29 | parameter groups 30 | lr (float, optional): learning rate (default: 1e-3) 31 | betas (Tuple[float, float], optional): coefficients used for computing 32 | running averages of gradient and its square (default: (0.95, 0)) 33 | eps (float, optional): term added to the denominator to improve 34 | numerical stability (default: 1e-8) 35 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 36 | grad_averaging: gradient averaging 37 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 38 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 39 | (default: False) 40 | """ 41 | 42 | def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, 43 | weight_decay=0, grad_averaging=False, amsgrad=False): 44 | if 0.0 > lr: 45 | raise ValueError("Invalid learning rate: {}".format(lr)) 46 | if 0.0 > eps: 47 | raise ValueError("Invalid epsilon value: {}".format(eps)) 48 | if not 0.0 <= betas[0] < 1.0: 49 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 50 | if not 0.0 <= betas[1] < 1.0: 51 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 52 | defaults = dict(lr=lr, betas=betas, eps=eps, 53 | weight_decay=weight_decay, 54 | grad_averaging=grad_averaging, 55 | amsgrad=amsgrad) 56 | 57 | super(Novograd, self).__init__(params, defaults) 58 | 59 | def __setstate__(self, state): 60 | super(Novograd, self).__setstate__(state) 61 | for group in self.param_groups: 62 | group.setdefault('amsgrad', False) 63 | 64 | def step(self, closure=None): 65 | """Performs a single optimization step. 66 | Arguments: 67 | closure (callable, optional): A closure that reevaluates the model 68 | and returns the loss. 
69 | """ 70 | loss = None 71 | if closure is not None: 72 | loss = closure() 73 | 74 | for group in self.param_groups: 75 | for p in group['params']: 76 | if p.grad is None: 77 | continue 78 | grad = p.grad.data 79 | if grad.is_sparse: 80 | raise RuntimeError('Sparse gradients are not supported.') 81 | amsgrad = group['amsgrad'] 82 | 83 | state = self.state[p] 84 | 85 | # State initialization 86 | if len(state) == 0: 87 | state['step'] = 0 88 | # Exponential moving average of gradient values 89 | state['exp_avg'] = torch.zeros_like(p.data) 90 | # Exponential moving average of squared gradient values 91 | state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 92 | if amsgrad: 93 | # Maintains max of all exp. moving avg. of sq. grad. values 94 | state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 95 | 96 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 97 | if amsgrad: 98 | max_exp_avg_sq = state['max_exp_avg_sq'] 99 | beta1, beta2 = group['betas'] 100 | 101 | state['step'] += 1 102 | 103 | norm = torch.sum(torch.pow(grad, 2)) 104 | 105 | if exp_avg_sq == 0: 106 | exp_avg_sq.copy_(norm) 107 | else: 108 | exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2) 109 | 110 | if amsgrad: 111 | # Maintains the maximum of all 2nd moment running avg. till now 112 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 113 | # Use the max. for normalizing running avg. of gradient 114 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 115 | else: 116 | denom = exp_avg_sq.sqrt().add_(group['eps']) 117 | 118 | grad.div_(denom) 119 | if group['weight_decay'] != 0: 120 | grad.add_(p.data, alpha=group['weight_decay']) 121 | if group['grad_averaging']: 122 | grad.mul_(1 - beta1) 123 | exp_avg.mul_(beta1).add_(grad) 124 | 125 | p.data.add_(exp_avg, alpha=-group['lr']) 126 | 127 | return loss 128 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/ecapa.py: -------------------------------------------------------------------------------- 1 | #code from https://github.com/lawlict/ECAPA-TDNN/blob/master/ecapa_tdnn.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | 8 | ''' Res2Conv1d + BatchNorm1d + ReLU 9 | ''' 10 | class Res2Conv1dReluBn(nn.Module): 11 | ''' 12 | in_channels == out_channels == channels 13 | ''' 14 | def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4): 15 | super().__init__() 16 | assert channels % scale == 0, "{} % {} != 0".format(channels, scale) 17 | self.scale = scale 18 | self.width = channels // scale 19 | self.nums = scale if scale == 1 else scale - 1 20 | 21 | self.convs = [] 22 | self.bns = [] 23 | for i in range(self.nums): 24 | self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias)) 25 | self.bns.append(nn.BatchNorm1d(self.width)) 26 | self.convs = nn.ModuleList(self.convs) 27 | self.bns = nn.ModuleList(self.bns) 28 | 29 | def forward(self, x): 30 | out = [] 31 | spx = torch.split(x, self.width, 1) 32 | for i in range(self.nums): 33 | if i == 0: 34 | sp = spx[i] 35 | else: 36 | sp = sp + spx[i] 37 | # Order: conv -> relu -> bn 38 | sp = self.convs[i](sp) 39 | sp = self.bns[i](F.relu(sp)) 40 | out.append(sp) 41 | if self.scale != 1: 42 | out.append(spx[self.nums]) 43 | out = torch.cat(out, dim=1) 44 | return out 45 | 46 | 47 | 48 | ''' Conv1d + BatchNorm1d + ReLU 49 | ''' 50 | class Conv1dReluBn(nn.Module): 51 | def __init__(self, in_channels, 
out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False): 52 | super().__init__() 53 | self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias) 54 | self.bn = nn.BatchNorm1d(out_channels) 55 | 56 | def forward(self, x): 57 | return self.bn(F.relu(self.conv(x))) 58 | 59 | 60 | 61 | ''' The SE connection of 1D case. 62 | ''' 63 | class SE_Connect(nn.Module): 64 | def __init__(self, channels, s=2): 65 | super().__init__() 66 | assert channels % s == 0, "{} % {} != 0".format(channels, s) 67 | self.linear1 = nn.Linear(channels, channels // s) 68 | self.linear2 = nn.Linear(channels // s, channels) 69 | 70 | def forward(self, x): 71 | out = x.mean(dim=2) 72 | out = F.relu(self.linear1(out)) 73 | out = torch.sigmoid(self.linear2(out)) 74 | out = x * out.unsqueeze(2) 75 | return out 76 | 77 | 78 | 79 | ''' SE-Res2Block. 80 | Note: residual connection is implemented in the ECAPA_TDNN model, not here. 81 | ''' 82 | def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale): 83 | return nn.Sequential( 84 | Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0), 85 | Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale), 86 | Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0), 87 | SE_Connect(channels) 88 | ) 89 | 90 | 91 | 92 | ''' Attentive weighted mean and standard deviation pooling. 93 | ''' 94 | class AttentiveStatsPool(nn.Module): 95 | def __init__(self, in_dim, bottleneck_dim): 96 | super().__init__() 97 | # Use Conv1d with stride == 1 rather than Linear, so we don't need to transpose the inputs. 98 | self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper 99 | self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper 100 | 101 | def forward(self, x): 102 | # DON'T use ReLU here! In experiments, convergence was hard to reach with ReLU. 103 | alpha = torch.tanh(self.linear1(x)) 104 | alpha = torch.softmax(self.linear2(alpha), dim=2) 105 | mean = torch.sum(alpha * x, dim=2) 106 | residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2 107 | std = torch.sqrt(residuals.clamp(min=1e-9)) 108 | return torch.cat([mean, std], dim=1) 109 | 110 | 111 | 112 | ''' Implementation of 113 | "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification". 114 | Note that we DON'T concatenate the last frame-wise layer with non-weighted mean and standard deviation, 115 | because it brings little improvement but significantly increases model parameters. 116 | As a result, this implementation basically corresponds to A.2 of Table 2 in the paper.
117 | ''' 118 | class ECAPA_TDNN(nn.Module): 119 | def __init__(self, in_channels=80, channels=1024, embed_dim=192): 120 | super().__init__() 121 | self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=5, padding=2) 122 | self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8) 123 | self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8) 124 | self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8) 125 | 126 | cat_channels = channels * 3 127 | self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1) 128 | self.pooling = AttentiveStatsPool(cat_channels, 128) 129 | self.bn1 = nn.BatchNorm1d(cat_channels * 2) 130 | self.linear = nn.Linear(cat_channels * 2, embed_dim) 131 | self.bn2 = nn.BatchNorm1d(embed_dim) 132 | 133 | def forward(self, x): 134 | x = x.transpose(1, 2) 135 | out1 = self.layer1(x) 136 | out2 = self.layer2(out1) + out1 137 | out3 = self.layer3(out1 + out2) + out1 + out2 138 | out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3 139 | 140 | out = torch.cat([out2, out3, out4], dim=1) 141 | out = F.relu(self.conv(out)) 142 | out = self.bn1(self.pooling(out)) 143 | out = self.bn2(self.linear(out)) 144 | return out -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/mmcl/STP.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from mmcv.cnn import build_norm_layer 5 | 6 | 7 | def select_activation(activation_type): 8 | if activation_type == "leaky_relu": 9 | return nn.LeakyReLU(inplace=True) 10 | elif activation_type == "relu": 11 | return nn.ReLU(inplace=True) 12 | elif activation_type == "prelu": 13 | return nn.PReLU() 14 | elif activation_type == "none": 15 | return nn.Identity() 16 | else: 17 | print("activation type {} is not supported".format(activation_type)) 18 | raise NotImplementedError 19 | 20 | 21 | def std_pooling(batch, batch_mean, dim=-1, unbiased=False, eps=1e-8): 22 | # adding epsilon in sqrt function to make more numerically stable results (yufeng) 23 | r2 = torch.sum((batch - batch_mean.unsqueeze(-1))**2, dim) 24 | if unbiased: 25 | length = batch.shape[dim] - 1 26 | else: 27 | length = batch.shape[dim] 28 | return torch.sqrt(r2/length + eps) 29 | 30 | 31 | class Stats_pooling(nn.Module): 32 | def __init__(self, input_dim=1500): 33 | super(Stats_pooling, self).__init__() 34 | self.out_dim = 2 * input_dim 35 | 36 | def forward(self, x): 37 | """ 38 | x.size() = [batch_size, feature_dim, seq_length] 39 | """ 40 | mean_frame = torch.mean(x, -1, False) 41 | if self.training: 42 | std_frame = std_pooling(x, mean_frame, -1, False) 43 | else: 44 | std_frame = torch.std(x, -1, False) 45 | output = torch.cat([mean_frame, std_frame], dim=-1) 46 | # print(output.shape) 47 | output = output.view(-1, self.out_dim) 48 | return output 49 | 50 | 51 | class StatsPooling(nn.Module): 52 | """Stats Pooling neck. 
53 | """ 54 | def __init__(self, in_plane, emb_dim, emb_bn=True, emb_affine=True, 55 | activation_type="relu", norm_type="BN1d", output_stage=(0,)): 56 | super(StatsPooling, self).__init__() 57 | self.avgpool = Stats_pooling(in_plane) 58 | embedding = [] 59 | initial_dim = self.avgpool.out_dim 60 | self.output_stage = output_stage 61 | if isinstance(emb_dim, list): 62 | self.stages = len(emb_dim) 63 | for e_dim, do_bn, do_affine, act_type in zip(emb_dim, emb_bn, emb_affine, activation_type): 64 | fc = [nn.Linear(initial_dim, e_dim)] 65 | initial_dim = e_dim 66 | fc.append(select_activation(act_type)) 67 | if do_bn: 68 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=do_affine) 69 | fc.append(build_norm_layer(cfg, e_dim)[1]) 70 | embedding.append(nn.Sequential(*fc)) 71 | else: 72 | self.stages = 1 73 | embedding.append(nn.Linear(initial_dim, emb_dim)) 74 | embedding.append(select_activation(activation_type)) 75 | if emb_bn: 76 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=emb_affine) 77 | embedding.append(build_norm_layer(cfg, emb_dim)[1]) 78 | self.embedding = nn.Sequential(*embedding) 79 | 80 | def init_weights(self): 81 | pass 82 | 83 | def forward(self, inputs): 84 | out = self.avgpool(inputs) 85 | if self.stages > 1 and len(self.output_stage) > 1 and self.training: 86 | # contains more than one fc layers and needs to output more than one vector and training mode 87 | embs = [] 88 | for fc in self.embedding: 89 | out = fc(out) 90 | embs.append(out) 91 | results = [] 92 | for stage in self.output_stage: 93 | results.append(embs[stage]) 94 | return tuple(results) 95 | else: 96 | return self.embedding(out) 97 | 98 | 99 | class StatsPoolingMSEA(nn.Module): 100 | """Stats Pooling neck. 101 | """ 102 | def __init__(self, in_plane, emb_dim, emb_bn=True, emb_affine=True, 103 | activation_type="relu", norm_type="BN1d", output_stage=(0,)): 104 | super(StatsPoolingMSEA, self).__init__() 105 | assert isinstance(in_plane, tuple) 106 | self.avgpool = [Stats_pooling(plane) for plane in in_plane] 107 | embedding = [] 108 | initial_dim = sum([pool.out_dim for pool in self.avgpool]) 109 | self.output_stage = output_stage 110 | if isinstance(emb_dim, list): 111 | self.stages = len(emb_dim) 112 | for e_dim, do_bn, do_affine, act_type in zip(emb_dim, emb_bn, emb_affine, activation_type): 113 | fc = [nn.Linear(initial_dim, e_dim)] 114 | initial_dim = e_dim 115 | fc.append(select_activation(act_type)) 116 | if do_bn: 117 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=do_affine) 118 | fc.append(build_norm_layer(cfg, e_dim)[1]) 119 | embedding.append(nn.Sequential(*fc)) 120 | else: 121 | self.stages = 1 122 | embedding.append(nn.Linear(initial_dim, emb_dim)) 123 | embedding.append(select_activation(activation_type)) 124 | if emb_bn: 125 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=emb_affine) 126 | embedding.append(build_norm_layer(cfg, emb_dim)[1]) 127 | self.embedding = nn.Sequential(*embedding) 128 | 129 | def init_weights(self): 130 | pass 131 | 132 | def forward(self, inputs): 133 | out = [pool(inp) for pool, inp in zip(self.avgpool, inputs)] 134 | out = torch.cat(out, dim=-1) 135 | if self.stages > 1 and len(self.output_stage) > 1 and self.training: 136 | # contains more than one fc layers and needs to output more than one vector and training mode 137 | embs = [] 138 | for fc in self.embedding: 139 | out = fc(out) 140 | embs.append(out) 141 | results = [] 142 | for stage in self.output_stage: 143 | 
results.append(embs[stage]) 144 | return tuple(results) 145 | else: 146 | return self.embedding(out) -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | # code from https://github.com/BUTSpeechFIT/VBx/blob/master/VBx/models/resnet.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class BasicBlock(nn.Module): 8 | expansion = 1 9 | 10 | def __init__(self, in_planes, planes, stride=1, reduction=16): 11 | super(BasicBlock, self).__init__() 12 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(planes) 14 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 15 | self.bn2 = nn.BatchNorm2d(planes) 16 | # self.se = SELayer(planes, reduction) 17 | 18 | self.shortcut = nn.Sequential() 19 | if stride != 1 or in_planes != self.expansion * planes: 20 | self.shortcut = nn.Sequential( 21 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 22 | nn.BatchNorm2d(self.expansion * planes) 23 | ) 24 | 25 | def forward(self, x): 26 | out = F.relu(self.bn1(self.conv1(x))) 27 | out = self.bn2(self.conv2(out)) 28 | # out = self.se(out) 29 | out += self.shortcut(x) 30 | out = F.relu(out) 31 | return out 32 | 33 | 34 | class Bottleneck(nn.Module): 35 | expansion = 4 36 | 37 | def __init__(self, in_planes, planes, stride=1, reduction=16): 38 | super(Bottleneck, self).__init__() 39 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 40 | self.bn1 = nn.BatchNorm2d(planes) 41 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 42 | self.bn2 = nn.BatchNorm2d(planes) 43 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 44 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 45 | # self.se = SELayer(planes * 4, reduction) 46 | 47 | self.shortcut = nn.Sequential() 48 | if stride != 1 or in_planes != self.expansion * planes: 49 | self.shortcut = nn.Sequential( 50 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 51 | nn.BatchNorm2d(self.expansion * planes) 52 | ) 53 | 54 | def forward(self, x): 55 | out = F.relu(self.bn1(self.conv1(x))) 56 | out = F.relu(self.bn2(self.conv2(out))) 57 | out = self.bn3(self.conv3(out)) 58 | # out = self.se(out) 59 | out += self.shortcut(x) 60 | out = F.relu(out) 61 | return out 62 | 63 | 64 | class SELayer(nn.Module): 65 | def __init__(self, channel, reduction=16): 66 | super(SELayer, self).__init__() 67 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 68 | self.fc = nn.Sequential( 69 | nn.Linear(channel, channel // reduction, bias=False), 70 | nn.ReLU(inplace=True), 71 | nn.Linear(channel // reduction, channel, bias=False), 72 | nn.Sigmoid() 73 | ) 74 | 75 | def forward(self, x): 76 | b, c, _, _ = x.size() 77 | y = self.avg_pool(x).view(b, c) 78 | y = self.fc(y).view(b, c, 1, 1) 79 | return x * y.expand_as(x) 80 | 81 | 82 | class ResNet(nn.Module): 83 | def __init__(self, block, num_blocks, m_channels=32, feat_dim=40, embed_dim=128, squeeze_excitation=False): 84 | super(ResNet, self).__init__() 85 | self.in_planes = m_channels 86 | self.feat_dim = feat_dim 87 | self.embed_dim = embed_dim 88 | self.squeeze_excitation = squeeze_excitation 89 | if block is BasicBlock: 90 | self.conv1 = nn.Conv2d(1, m_channels, 
kernel_size=3, stride=1, padding=1, bias=False) 91 | self.bn1 = nn.BatchNorm2d(m_channels) 92 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1) 93 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2) 94 | current_freq_dim = int((feat_dim - 1) / 2) + 1 95 | self.layer3 = self._make_layer(block, m_channels * 4, num_blocks[2], stride=2) 96 | current_freq_dim = int((current_freq_dim - 1) / 2) + 1 97 | self.layer4 = self._make_layer(block, m_channels * 8, num_blocks[3], stride=2) 98 | current_freq_dim = int((current_freq_dim - 1) / 2) + 1 99 | self.embedding = nn.Linear(m_channels * 8 * 2 * current_freq_dim, embed_dim) 100 | elif block is Bottleneck: 101 | self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) 102 | self.bn1 = nn.BatchNorm2d(m_channels) 103 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1) 104 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2) 105 | self.layer3 = self._make_layer(block, m_channels * 4, num_blocks[2], stride=2) 106 | self.layer4 = self._make_layer(block, m_channels * 8, num_blocks[3], stride=2) 107 | self.embedding = nn.Linear(int(feat_dim / 8) * m_channels * 16 * block.expansion, embed_dim) 108 | else: 109 | raise ValueError(f'Unexpected class {type(block)}.') 110 | 111 | def _make_layer(self, block, planes, num_blocks, stride): 112 | strides = [stride] + [1] * (num_blocks - 1) 113 | layers = [] 114 | for stride in strides: 115 | layers.append(block(self.in_planes, planes, stride)) 116 | self.in_planes = planes * block.expansion 117 | return nn.Sequential(*layers) 118 | 119 | def forward(self, x): 120 | x = x.transpose(1, 2) 121 | x = x.unsqueeze_(1) 122 | out = F.relu(self.bn1(self.conv1(x))) 123 | out = self.layer1(out) 124 | out = self.layer2(out) 125 | out = self.layer3(out) 126 | out = self.layer4(out) 127 | 128 | pooling_mean = torch.mean(out, dim=-1) 129 | meansq = torch.mean(out * out, dim=-1) 130 | pooling_std = torch.sqrt(meansq - pooling_mean ** 2 + 1e-10) 131 | out = torch.cat((torch.flatten(pooling_mean, start_dim=1), 132 | torch.flatten(pooling_std, start_dim=1)), 1) 133 | 134 | embedding = self.embedding(out) 135 | return embedding -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | ## Content 2 | - [**What is deepaudio-speaker?**](https://github.com/deepaudio/deepaudio-speaker#what-is-deepaudio) 3 | - [**Installation**](https://github.com/deepaudio/deepaudio-speaker#installation) 4 | - [**Get Started**](https://github.com/deepaudio/deepaudio-speaker#get-started) 5 | - [**Model Architecture**](https://github.com/deepaudio/deepaudio-speaker#model-architectures) 6 | - [**How to contribute to deepaudio-speaker?**](https://github.com/deepaudio/deepaudio-speaker#How-to-contribute-to-deepaudio-speaker) 7 | - [**Acknowledge**](https://github.com/deepaudio/deepaudio-speaker#Acknowledge) 8 | 9 | ## What is deepaudio-speaker? 10 | 11 | Deepaudio-speaker is a framework for training neural network based speaker embedders. It supports online audio augmentation thanks to torch-audiomentations. It includes, or will include, popular neural network architectures and losses used for speaker embedders.
12 | 13 | To make it easy to use features such as mixed precision, multi-node training, and TPU training, I introduced PyTorch-Lightning and Hydra in this framework (just like what [pyannote-audio](https://github.com/pyannote/pyannote-audio) and [openspeech](https://github.com/openspeech-team/openspeech) do). 14 | 15 | Deepaudio-tts is coming soon. 16 | 17 | ## Installation 18 | ``` 19 | conda create -n deepaudio python=3.8.5 20 | conda activate deepaudio 21 | conda install numpy cffi 22 | conda install libsndfile=1.0.28 -c conda-forge 23 | git clone https://github.com/deepaudio/deepaudio-speaker.git 24 | cd deepaudio-speaker 25 | pip install -e . 26 | ``` 27 | 28 | ## Get Started 29 | 30 | ### Supported Datasets 31 | 32 | #### Voxceleb2 33 | * [Download VoxCeleb dataset](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/) and follow [this script](https://github.com/pyannote/pyannote-db-voxceleb/issues/10#issuecomment-702638328) to obtain the following directory structure: 34 | 35 | ``` 36 | /path/to/voxceleb/voxceleb1/dev/wav/id10001/1zcIwhmdeo4/00001.wav 37 | /path/to/voxceleb/voxceleb1/test/wav/id10270/5r0dWxy17C8/00001.wav 38 | /path/to/voxceleb/voxceleb2/dev/aac/id00012/21Uxsk56VDQ/00001.m4a 39 | /path/to/voxceleb/voxceleb2/test/aac/id00017/01dfn2spqyE/00001.m4a 40 | ``` 41 | 42 | ### Training examples 43 | - Example1: Train the `ecapa-tdnn` model with `fbank` features on GPU. 44 | 45 | ``` 46 | $ deepaudio-speaker-train \ 47 | dataset=voxceleb2 \ 48 | dataset.dataset_path=/your/path/to/voxceleb2/dev/wav/ \ 49 | model=clovaai_ecapa \ 50 | model.channels=1024 \ 51 | feature=fbank \ 52 | lr_scheduler=reduce_lr_on_plateau \ 53 | trainer=gpu \ 54 | criterion=pyannote_aamsoftmax 55 | ``` 56 | - Example2: Train an ecapa model to reach an EER of around 1.13% on the voxceleb 1 trials (original version, without score normalization).
57 | 58 | ``` 59 | $ git clone https://github.com/deepaudio/deepaudio-database.git 60 | $ cd deepaudio-database 61 | $ vim database.yml # edit the list path and wav path 62 | $ deepaudio-speaker-train \ 63 | dataset=dataframe \ 64 | dataset.database_yml=/your/path/to/deepaudio-database/database.yml \ 65 | dataset.dataset_name=voxceleb2_dev \ 66 | model=clovaai_ecapa \ 67 | model.channels=1024 \ 68 | model.embed_dim=256 \ 69 | model.min_num_frames=200 \ 70 | model.max_num_frames=300 \ 71 | feature=fbank \ 72 | lr_scheduler=warmup_adaptive_reduce_lr_on_plateau \ 73 | lr_scheduler.warmup_steps=30000 \ 74 | lr_scheduler.lr_factor=0.8 \ 75 | trainer=gpu \ 76 | trainer.batch_size=128 \ 77 | trainer.max_epochs=30 \ 78 | trainer.num_checkpoints=30 \ 79 | criterion=adaptive_aamsoftmax \ 80 | criterion.increase_steps=300000 \ 81 | augment.apply_spec_augment=True \ 82 | augment.time_mask_num=1 \ 83 | augment.apply_noise_augment=True \ 84 | augment.apply_reverb_augment=True \ 85 | augment.apply_noise_reverb_augment=True \ 86 | augment.noise_augment_weight=2 \ 87 | augment.noise_dataset_dir=/your/path/to/musan \ 88 | augment.rir_dataset_dir=/your/path/to/RIRS_NOISES/simulated_rirs/ 89 | ``` 90 | 91 | - Example3: Compute the equal error rate (EER) 92 | ```python 93 | from deepaudio.speaker.datasets.dataframe.utils import load_trial_dataframe, get_dataset_items 94 | from deepaudio.speaker.models.inference import Inference 95 | from deepaudio.speaker.metrics.eer import model_eer 96 | 97 | trial_meta = get_dataset_items('/your/path/to/deepaudio-database/database.yml', 98 | 'voxceleb1_o', 'trial') 99 | wav_dir, trial_path = trial_meta[0] 100 | trials = load_trial_dataframe(wav_dir, trial_path) 101 | inference = Inference('/your/path/to/checkpoint.ckpt') 102 | eer, thresh = model_eer(inference, trials) 103 | ``` 104 | - Example4: Export a TorchScript model 105 | ```python 106 | from deepaudio.speaker.models.inference import Inference 107 | model = Inference('/your/path/to/checkpoint.ckpt').model 108 | model.to_torchscript('filepath/to/model') 109 | ``` 110 | 111 | 112 | ## Model Architecture 113 | [**Wespeaker**](https://github.com/wenet-e2e/wespeaker/tree/master/wespeaker/models) Models from wespeaker. 114 | 115 | [**ECAPA-TDNN**](https://arxiv.org/pdf/2005.07143.pdf) This is an unofficial implementation from @lawlict. Please find more details in this [link](https://github.com/lawlict/ECAPA-TDNN). 116 | 117 | [**ECAPA-TDNN**](https://arxiv.org/pdf/2005.07143.pdf) This is implemented by @joonson. Please find more details in this [link](https://github.com/clovaai/voxceleb_trainer/issues/86#issuecomment-739991154). 118 | 119 | [**ResNetSE34L**](https://arxiv.org/pdf/2003.11982.pdf) This is borrowed from [voxceleb trainer](https://github.com/clovaai/voxceleb_trainer). 120 | 121 | [**ResNetSE34V2**](https://arxiv.org/pdf/2003.11982.pdf) This is borrowed from [voxceleb trainer](https://github.com/clovaai/voxceleb_trainer). 122 | 123 | [**Resnet101**](https://arxiv.org/abs/2012.14952) This was proposed by BUT for speaker diarization. Please note that the features used in this framework are different from those used in [VB-HMM](https://github.com/BUTSpeechFIT/VBx). 124 | 125 | ## How to contribute to deepaudio-speaker 126 | 127 | This is a personal project, so I don't have enough GPU resources to run a lot of experiments. I appreciate any kind of feedback or contribution. Please feel free to make a pull request for small issues like bug fixes or experiment results.
If you have any questions, please [open an issue](https://github.com/deepaudio/deepaudio-speaker/issues). 128 | 129 | ## Acknowledge 130 | I borrow a lot of codes from [openspeech](https://github.com/openspeech-team/openspeech) and [pyannote-audio](https://github.com/pyannote/pyannote-audio) -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/tri_stage_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import math 24 | import torch 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | from omegaconf import DictConfig 28 | from torch.optim import Optimizer 29 | 30 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 31 | from deepaudio.speaker.optim.scheduler import register_scheduler 32 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 33 | 34 | 35 | @dataclass 36 | class TriStageLRSchedulerConfigs(LearningRateSchedulerConfigs): 37 | scheduler_name: str = field( 38 | default="tri_stage", metadata={"help": "Name of learning rate scheduler."} 39 | ) 40 | init_lr: float = field( 41 | default=1e-7, metadata={"help": "Initial learning rate."} 42 | ) 43 | init_lr_scale: float = field( 44 | default=0.01, metadata={"help": "Initial learning rate scale."} 45 | ) 46 | final_lr_scale: float = field( 47 | default=0.01, metadata={"help": "Final learning rate scale"} 48 | ) 49 | phase_ratio: str = field( 50 | default="(0.1, 0.4, 0.5)", metadata={"help": "Automatically sets warmup/hold/decay steps to the ratio " 51 | "specified here from max_updates. the ratios must add up to 1.0"} 52 | ) 53 | total_steps: int = field( 54 | default=400000, metadata={"help": "Total training steps."} 55 | ) 56 | 57 | 58 | @register_scheduler("tri_stage", dataclass=TriStageLRSchedulerConfigs) 59 | class TriStageLRScheduler(LearningRateScheduler): 60 | r""" 61 | Tri-Stage Learning Rate Scheduler. 
Implements the learning rate scheduler described in "SpecAugment" 62 | 63 | Similar to the inverse_square_root scheduler, but the tri_stage scheduler employs 64 | three-stage LR scheduling: 65 | 66 | - warmup stage, starting from `lr` * `init_lr_scale`, linearly 67 | increased to `lr` in `warmup_steps` iterations 68 | - hold stage, after `warmup_steps`, keep the LR at `lr` for `hold_steps` 69 | iterations 70 | - decay stage, after the hold stage, decay the LR exponentially to 71 | `lr` * `final_lr_scale` in `decay_steps`; 72 | after that, the LR is kept at `final_lr_scale` * `lr` 73 | 74 | During warmup:: 75 | init_lr = cfg.init_lr_scale * cfg.lr 76 | lrs = torch.linspace(init_lr, cfg.lr, cfg.warmup_steps) 77 | lr = lrs[update_num] 78 | 79 | During hold:: 80 | lr = cfg.lr 81 | 82 | During decay:: 83 | decay_factor = - math.log(cfg.final_lr_scale) / cfg.decay_steps 84 | lr = cfg.lr * exp(- (update_num - warmup_steps - hold_steps) * decay_factor) 85 | 86 | After that:: 87 | lr = cfg.lr * cfg.final_lr_scale 88 | 89 | Args: 90 | optimizer (Optimizer): wrapped optimizer. 91 | configs (DictConfig): configuration set. 92 | """ 93 | def __init__( 94 | self, 95 | optimizer: Optimizer, 96 | configs: DictConfig, 97 | ): 98 | super(TriStageLRScheduler, self).__init__(optimizer, configs.lr_scheduler.init_lr) 99 | 100 | self.phase_ratio = eval(configs.lr_scheduler.phase_ratio) 101 | 102 | self.warmup_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[0]) 103 | self.hold_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[1]) 104 | self.decay_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[2]) 105 | 106 | self.peak_lr = configs.lr_scheduler.lr 107 | self.init_lr = configs.lr_scheduler.init_lr_scale * configs.lr_scheduler.lr 108 | self.final_lr = configs.lr_scheduler.final_lr_scale * configs.lr_scheduler.lr 109 | 110 | self.warmup_rate = ( 111 | (self.peak_lr - self.init_lr) / self.warmup_steps 112 | if self.warmup_steps != 0 113 | else 0 114 | ) 115 | self.decay_factor = -math.log(configs.lr_scheduler.final_lr_scale) / self.decay_steps 116 | self.update_step = 0 117 | self.lr = self.init_lr 118 | 119 | def _decide_stage(self): 120 | if self.update_step < self.warmup_steps: 121 | return 0, self.update_step 122 | 123 | offset = self.warmup_steps 124 | 125 | if self.update_step < offset + self.hold_steps: 126 | return 1, self.update_step - offset 127 | 128 | offset += self.hold_steps 129 | 130 | if self.update_step <= offset + self.decay_steps: 131 | # decay stage 132 | return 2, self.update_step - offset 133 | 134 | offset += self.decay_steps 135 | 136 | return 3, self.update_step - offset 137 | 138 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 139 | stage, steps_in_stage = self._decide_stage() 140 | 141 | if stage == 0: 142 | self.lr = self.init_lr + self.warmup_rate * steps_in_stage 143 | elif stage == 1: 144 | self.lr = self.peak_lr 145 | elif stage == 2: 146 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 147 | elif stage == 3: 148 | self.lr = self.final_lr 149 | else: 150 | raise ValueError("Undefined stage") 151 | 152 | self.set_lr(self.optimizer, self.lr) 153 | self.update_step += 1 154 | 155 | return self.lr 156 | --------------------------------------------------------------------------------
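The tri-stage schedule above can also be exercised on its own. The following is a minimal sketch (not part of the repository) that assumes the package is installed; it drives `TriStageLRScheduler` with a toy optimizer to inspect the warmup/hold/decay behaviour. The config keys mirror `TriStageLRSchedulerConfigs`, and all values are illustrative only. In normal use the scheduler is presumably selected through the Hydra override `lr_scheduler=tri_stage` and stepped by the training loop.

```python
import torch
from omegaconf import OmegaConf

from deepaudio.speaker.optim.scheduler.tri_stage_lr_scheduler import TriStageLRScheduler

# Illustrative values; in real training these come from the Hydra config.
configs = OmegaConf.create({
    "lr_scheduler": {
        "lr": 1e-3,                         # peak learning rate
        "init_lr": 1e-7,
        "init_lr_scale": 0.01,
        "final_lr_scale": 0.01,
        "phase_ratio": "(0.1, 0.4, 0.5)",   # warmup / hold / decay fractions of total_steps
        "total_steps": 1000,
    }
})

model = torch.nn.Linear(80, 192)            # stand-in for a speaker embedder
optimizer = torch.optim.SGD(model.parameters(), lr=configs.lr_scheduler.init_lr)
scheduler = TriStageLRScheduler(optimizer, configs)

for step in range(configs.lr_scheduler.total_steps):
    # ... forward, backward and optimizer.step() would happen here ...
    lr = scheduler.step()                   # updates the optimizer's lr and returns it
    if step % 200 == 0:
        print(f"step {step}: lr={lr:.2e}")
```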