├── version.txt ├── deepaudio ├── speaker │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ ├── eer.py │ │ └── train.py │ ├── configs │ │ ├── __init__.py │ │ └── train.yaml │ ├── data │ │ ├── __init__.py │ │ ├── audio_io │ │ │ └── __init__.py │ │ ├── augmentation │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── spec_augment.py │ │ │ ├── noise.py │ │ │ └── configurations.py │ │ ├── feature │ │ │ ├── fbank │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration.py │ │ │ │ └── fbank.py │ │ │ ├── but_fbank │ │ │ │ ├── __init__.py │ │ │ │ └── configuration.py │ │ │ ├── utils.py │ │ │ └── __init__.py │ │ ├── dataloader.py │ │ ├── samplers.py │ │ └── dataset.py │ ├── metrics │ │ ├── __init__.py │ │ ├── utils.py │ │ └── eer.py │ ├── modules │ │ ├── __init__.py │ │ └── backbones │ │ │ ├── __init__.py │ │ │ ├── mmcl │ │ │ ├── __init__.py │ │ │ └── STP.py │ │ │ ├── clovaai │ │ │ ├── __init__.py │ │ │ ├── ResNetBlocks.py │ │ │ ├── ResNetSE34V2.py │ │ │ └── ResNetSE34L.py │ │ │ ├── wespeaker │ │ │ ├── __init__.py │ │ │ ├── speaker_model.py │ │ │ └── tdnn.py │ │ │ ├── ecapa.py │ │ │ └── resnet.py │ ├── dataclass │ │ ├── __init__.py │ │ └── initialize.py │ ├── models │ │ ├── ecapa │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── resnet │ │ │ ├── __init__.py │ │ │ ├── configurations.py │ │ │ └── model.py │ │ ├── clovaai_ecapa │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── mmcl_seresnet34 │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── wespeaker_model │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── clovaai_resnetse34l │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── clovaai_resnetse34v2 │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── configurations.py │ │ ├── inference.py │ │ ├── __init__.py │ │ ├── speaker_embedding_model.py │ │ └── speaker_model.py │ ├── datasets │ │ ├── dataframe │ │ │ ├── __init__.py │ │ │ ├── configurations.py │ │ │ ├── lit_data_module.py │ │ │ └── utils.py │ │ ├── voxceleb2 │ │ │ ├── __init__.py │ │ │ ├── preprocess.py │ │ │ ├── configurations.py │ │ │ └── lit_data_module.py │ │ ├── utils.py │ │ └── __init__.py │ ├── criterion │ │ ├── aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── aamsoftmax.py │ │ ├── adaptive_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── aamsoftmax.py │ │ ├── pyannote_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── aamsoftmax.py │ │ ├── subcenter_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── subcenter_aamsoftmax.py │ │ ├── adaptive_subcenter_aamsoftmax │ │ │ ├── __init__.py │ │ │ ├── configuration.py │ │ │ └── subcenter_aamsoftmax.py │ │ ├── .DS_Store │ │ └── __init__.py │ ├── version.py │ ├── .DS_Store │ └── optim │ │ ├── scheduler │ │ ├── fix_lr_scheduler.py │ │ ├── lr_scheduler.py │ │ ├── step_lr_scheduler.py │ │ ├── __init__.py │ │ ├── warmup_scheduler.py │ │ ├── reduce_lr_on_plateau_scheduler.py │ │ ├── warmup_reduce_lr_on_plateau_scheduler.py │ │ ├── transformer_lr_scheduler.py │ │ ├── warmup_steplr_scheduler.py │ │ ├── warmup_adaptive_loss_reduce_lr_on_plateau_scheduler.py │ │ └── tri_stage_lr_scheduler.py │ │ ├── __init__.py │ │ ├── optimizer.py │ │ ├── adamp.py │ │ ├── radam.py │ │ └── novograd.py ├── __init__.py └── .DS_Store ├── requirements.txt ├── run.sh ├── run3.sh ├── run2.sh ├── setup.py ├── .gitignore ├── setup.cfg └── Readme.md /version.txt: 
-------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /deepaudio/speaker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/audio_io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/dataclass/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/ecapa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/resnet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/fbank/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/deepaudio/speaker/data/feature/but_fbank/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_ecapa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/mmcl_seresnet34/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/wespeaker_model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/mmcl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34l/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34v2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/wespeaker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' 2 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/pyannote_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/subcenter_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_subcenter_aamsoftmax/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepaudio/__init__.py: -------------------------------------------------------------------------------- 1 | __import__("pkg_resources").declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /deepaudio/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepaudio/deepaudio-speaker/HEAD/deepaudio/.DS_Store -------------------------------------------------------------------------------- /deepaudio/speaker/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-speaker/HEAD/deepaudio/speaker/.DS_Store -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-speaker/HEAD/deepaudio/speaker/criterion/.DS_Store -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | 4 | def get_all_wavs(parent_dir): 5 | return glob.glob(f'{parent_dir}/**/*.wav', recursive=True) 6 | -------------------------------------------------------------------------------- /deepaudio/speaker/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - feature: fbank 5 | - augment: default 6 | - dataset: voxceleb2 7 | - criterion: pyannote_aamsoftmax 8 | - lr_scheduler: steplr 9 | - model: clovaai_ecapa 10 | - trainer: cpu 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hydra-core 2 | librosa 3 | pyannote.core >=4.1,<5.0 4 | pytorch-lightning >= 1.4,<1.5 5 | pytorch_metric_learning >=0.9.98 6 | soundfile >=0.10.2,<0.11 7 | torch >=1.8.1,<1.9 8 | torch-audiomentations >=0.6.0 9 | torchaudio >=0.8.1,<0.9 10 | typing_extensions >=3.7.4.3 11 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | def get_subdirs(directory): 5 | directory = Path(directory) 6 | return directory.glob('*/') 7 | 8 | 9 | def get_all_wavs(directory): 10 | directory = Path(directory) 11 | return list(directory.glob('**/*.wav')) 12 | 13 | -------------------------------------------------------------------------------- /deepaudio/speaker/metrics/utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | def get_all_wavs(trials): 3 | uris = set() 4 | for uri_enroll, uri_test, _ in trials: 5 | uris.add(uri_enroll) 6 | uris.add(uri_test) 7 | return set(uris) 8 | 9 | 10 | def get_all_embeddings(model, wav_trials): 11 | embedding = dict() 12 | for uri in tqdm(wav_trials): 13 | embedding[uri] = model.make_embedding(uri) 14 | return embedding 15 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | EPSILON = 1e-6 5 | 6 | 7 | class CMVN(nn.Module): 8 | def __init__(self, var_norm=False): 9 | super(CMVN, self).__init__() 10 | self.var_norm = var_norm 11 | 12 | def forward(self, x): 13 | mean = x.mean(dim=1, keepdims=True) 14 | if self.var_norm: 15 | std = torch.sqrt(x.var(dim=1, keepdims=True) + EPSILON) 16 | x = x - mean 17 | if self.var_norm: 18 | x /= 
std 19 | return x 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class AAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | 15 | scale: float = field( 16 | default=32, metadata={"help": "The scale for loss."} 17 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/pyannote_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class PyannoteAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="pyannote_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | 15 | scale: float = field( 16 | default=32, metadata={"help": "The scale for loss."} 17 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/preprocess.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from ..utils import get_subdirs, get_all_wavs 4 | 5 | 6 | def get_speaker_list(configs): 7 | data_dir = configs.dataset.dataset_path 8 | data_dir = configs.dataset.dataset_path 9 | speaker_dirs = get_subdirs(data_dir) 10 | speakers = [d.stem for d in speaker_dirs] 11 | spk2id = {k: v for v, k in enumerate(speakers)} 12 | return speakers, spk2id 13 | 14 | 15 | def get_speaker_wavs(data_dir, speakers): 16 | speaker2wav = {} 17 | for spk in speakers: 18 | spk_dir = Path(data_dir) / spk 19 | speaker2wav[spk] = get_all_wavs(spk_dir) 20 | return speaker2wav 21 | 22 | -------------------------------------------------------------------------------- /deepaudio/speaker/cli/eer.py: -------------------------------------------------------------------------------- 1 | from deepaudio.speaker.datasets.dataframe.utils import load_trial_dataframe, get_dataset_items 2 | from deepaudio.speaker.models.inference import Inference 3 | from deepaudio.speaker.metrics.eer import model_eer 4 | 5 | trial_meta = get_dataset_items('/home/amax/audio/deepaudio-database/database.yml', 6 | 'voxceleb1_o', 'trial') 7 | print(trial_meta[0]) 8 | wav_dir, trial_path = trial_meta[0] 9 | trials = load_trial_dataframe(wav_dir, trial_path) 10 | inference = Inference('/home/amax/audio/deepaudio-speaker/outputs/2021-10-26/00-37-08/logs/default/version_0/checkpoints/deepaudio-epoch=19-val_loss=2.33.ckpt') 11 | print(model_eer(inference, trials)) -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/subcenter_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class 
SubcenterAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="subcenter_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | K: int = field( 15 | default=3, metadata={"help": "The number of subcenter."} 16 | ) 17 | scale: float = field( 18 | default=32, metadata={"help": "The scale for loss."} 19 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class AdaptiveAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="adaptive_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.3, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | 15 | scale: float = field( 16 | default=32, metadata={"help": "The scale for loss."} 17 | ) 18 | increase_steps: int = field( 19 | default=50000, metadata={"help": "The increase step for margin."} 20 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_ecapa/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.clovaai.ECAPA_TDNN import MainModel 7 | 8 | from .configurations import ClovaaiECAPAConfigs 9 | 10 | 11 | @register_model('clovaai_ecapa', dataclass=ClovaaiECAPAConfigs) 12 | class ClovaaiECAPAModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ClovaaiECAPAModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/wespeaker_model/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.wespeaker.speaker_model import MainModel 7 | 8 | from .configurations import WespeakerModelConfigs 9 | 10 | 11 | @register_model('wespeaker_model', dataclass=WespeakerModelConfigs) 12 | class WespeakerModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(WespeakerModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/mmcl_seresnet34/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from 
deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.mmcl.seresnet_asv import MainModel 7 | 8 | from .configurations import MMCLSeResnet34Configs 9 | 10 | 11 | @register_model('mmcl_seresnet34', dataclass=MMCLSeResnet34Configs) 12 | class MMCLSeResnet34Model(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(MMCLSeResnet34Model, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/configurations.py: -------------------------------------------------------------------------------- 1 | from omegaconf import MISSING 2 | from dataclasses import dataclass, field 3 | 4 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 5 | @dataclass 6 | class Voxceleb2Configs(DeepMMDataclass): 7 | """ Configuration dataclass that common used """ 8 | name: str = field( 9 | default="voxceleb2", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} 10 | ) 11 | dataset_path: str = field( 12 | default="/Users/yin/project/data/aac4", metadata={"help": "Path of dataset"} 13 | ) 14 | sampler: str = field( 15 | default="clovaai", metadata={"help": "Sampler name."} 16 | ) 17 | per_speaker: int = field( 18 | default=3, metadata={"help": "Number of utterances per speaker."} 19 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34l/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.clovaai.ResNetSE34L import MainModel 7 | 8 | from .configurations import ClovaaiResnetse34lConfigs 9 | 10 | 11 | @register_model('clovaai_resnetse34l', dataclass=ClovaaiResnetse34lConfigs) 12 | class ClovaaiResnetse34lModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ClovaaiResnetse34lModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34v2/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.clovaai.ResNetSE34V2 import MainModel 7 | 8 | from .configurations import ClovaaiResnetse34V2Configs 9 | 10 | 11 | @register_model('clovaai_resnetse34v2', dataclass=ClovaaiResnetse34V2Configs) 12 | class ClovaaiResnetSE34V2Model(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ClovaaiResnetSE34V2Model, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = MainModel( 18 | configs=self.configs 19 | ) 20 | --------------------------------------------------------------------------------
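Note: the model files above all follow the same pattern: a thin subclass of SpeakerEmbeddingModel is registered under a string key together with its configuration dataclass, and the training entry point later resolves that key from the Hydra config. A minimal sketch of how such a registered model is instantiated, mirroring the lookup done in deepaudio/speaker/cli/train.py shown further below (this snippet is illustrative and not a file from the repository):

    from deepaudio.speaker.models import MODEL_REGISTRY

    # `configs` is the Hydra DictConfig; passing e.g. model=clovaai_resnetse34v2
    # on the command line makes configs.model.name the registry key.
    model_cls = MODEL_REGISTRY[configs.model.name]
    model = model_cls(configs=configs, num_classes=data_module.num_classes)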
/deepaudio/speaker/criterion/adaptive_subcenter_aamsoftmax/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from ...dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class AdaptiveSubcenterAAMSoftmaxConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="adaptive_subcenter_aamsoftmax", metadata={"help": "Criterion name for training"} 10 | ) 11 | margin: float = field( 12 | default=0.2, metadata={"help": "The angular margin penalty in radians."} 13 | ) 14 | K: int = field( 15 | default=3, metadata={"help": "The number of subcenter."} 16 | ) 17 | scale: float = field( 18 | default=32, metadata={"help": "The scale for loss."} 19 | ) 20 | increase_steps: int = field( 21 | default=50000, metadata={"help": "The increase step for margin."} 22 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/ecapa/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.ecapa import ECAPA_TDNN 7 | 8 | from .configurations import ECAPAConfigs 9 | 10 | 11 | @register_model('ecapa', dataclass=ECAPAConfigs) 12 | class ECAPAModel(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(ECAPAModel, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = ECAPA_TDNN( 18 | in_channels=self.configs.feature.n_mels, 19 | channels=self.configs.model.channels, 20 | embed_dim=self.configs.model.embed_dim 21 | ) 22 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/ecapa/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ECAPAConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="ecapa", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=192, metadata={"help": "Dimension of embedding."} 12 | ) 13 | channels: int = field( 14 | default=1024, metadata={"help": "Number of channels."} 15 | ) 16 | optimizer: str = field( 17 | default="adam", metadata={"help": "Optimizer for training."} 18 | ) 19 | min_num_frames: int = field( 20 | default=300, metadata={"help": "Min num frames."} 21 | ) 22 | max_num_frames: int = field( 23 | default=400, metadata={"help": "Max num frames."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/resnet/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class Resnet101Configs(DeepMMDataclass): 7 | name: str = field( 8 | default="resnet101", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | optimizer: str = field( 14 | default="adam", metadata={"help": "Optimizer for training."} 15 | ) 16 | min_num_frames: 
int = field( 17 | default=300, metadata={"help": "Min num frames."} 18 | ) 19 | max_num_frames: int = field( 20 | default=400, metadata={"help": "Max num frames."} 21 | ) 22 | squeeze_excitation: bool = field( 23 | default=False, metadata={"help": "Flag for squeeze-and-excitation blocks."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34l/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ClovaaiResnetse34lConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="clovaai_resnetse34l", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | encoder_type: str = field( 14 | default="SAP", metadata={"help": "Encoder type."} 15 | ) 16 | optimizer: str = field( 17 | default="adam", metadata={"help": "Optimizer for training."} 18 | ) 19 | min_num_frames: int = field( 20 | default=300, metadata={"help": "Min num frames."} 21 | ) 22 | max_num_frames: int = field( 23 | default=400, metadata={"help": "Max num frames."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_resnetse34v2/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ClovaaiResnetse34V2Configs(DeepMMDataclass): 7 | name: str = field( 8 | default="clovaai_resnetse34v2", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | encoder_type: str = field( 14 | default="SAP", metadata={"help": "Encoder type."} 15 | ) 16 | optimizer: str = field( 17 | default="adam", metadata={"help": "Optimizer for training."} 18 | ) 19 | min_num_frames: int = field( 20 | default=300, metadata={"help": "Min num frames."} 21 | ) 22 | max_num_frames: int = field( 23 | default=400, metadata={"help": "Max num frames."} 24 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/fbank/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class FBankConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="fbank", metadata={"help": "Name of feature transform."} 9 | ) 10 | sample_rate: int = field( 11 | default=16000, metadata={"help": "Sampling rate of audio"} 12 | ) 13 | frame_duration: float = field( 14 | default=0.025, metadata={"help": "Frame length for spectrogram"} 15 | ) 16 | frame_shift: float = field( 17 | default=0.01, metadata={"help": "Length of hop between STFT"} 18 | ) 19 | n_mels: int = field( 20 | default=80, metadata={"help": "Number of mel filterbanks."} 21 | ) 22 | var_norm: bool = field( 23 | default=False, metadata={"help": "Flag for cmvn"} 24 | ) 25 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/but_fbank/configuration.py: -------------------------------------------------------------------------------- 1 | from dataclasses 
import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ButFBankConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="fbank", metadata={"help": "Name of feature transform."} 9 | ) 10 | sample_rate: int = field( 11 | default=16000, metadata={"help": "Sampling rate of audio"} 12 | ) 13 | frame_duration: float = field( 14 | default=0.02, metadata={"help": "Frame length for spectrogram"} 15 | ) 16 | frame_shift: float = field( 17 | default=0.01, metadata={"help": "Length of hop between STFT"} 18 | ) 19 | n_mels: int = field( 20 | default=80, metadata={"help": "Number of mel filterbanks."} 21 | ) 22 | var_norm: bool = field( 23 | default=False, metadata={"help": "Flag for cmvn"} 24 | ) 25 | -------------------------------------------------------------------------------- /deepaudio/speaker/metrics/eer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import roc_curve 3 | from scipy.spatial import distance 4 | 5 | from .utils import get_all_wavs, get_all_embeddings 6 | 7 | 8 | def compute_eer(y, y_pred, pos_label=1): 9 | fpr, tpr, threshold = roc_curve(y, y_pred, pos_label=pos_label) 10 | fnr = 1 - tpr 11 | eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))] 12 | eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))] 13 | return eer, eer_threshold 14 | 15 | 16 | def model_eer(model, trials): 17 | wav_trials = get_all_wavs(trials) 18 | embeddings = get_all_embeddings(model, wav_trials) 19 | ys = [] 20 | y_preds = [] 21 | for uri_enroll, uri_test, y in trials: 22 | y_pred = 1 - distance.cosine(embeddings[uri_enroll], embeddings[uri_test]) 23 | y_preds.append(y_pred) 24 | ys.append(y) 25 | return compute_eer(ys, y_preds) 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/resnet/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from torch import Tensor 3 | 4 | from deepaudio.speaker.models import register_model 5 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 6 | from deepaudio.speaker.modules.backbones.resnet import ResNet, Bottleneck 7 | 8 | from .configurations import Resnet101Configs 9 | 10 | 11 | @register_model('resnet101', dataclass=Resnet101Configs) 12 | class Resnet101Model(SpeakerEmbeddingModel): 13 | def __init__(self, configs: DictConfig, num_classes: int): 14 | super(Resnet101Model, self).__init__(configs, num_classes) 15 | 16 | def build_model(self): 17 | self.model = ResNet( 18 | Bottleneck, 19 | [3, 4, 23, 3], 20 | feat_dim=self.configs.feature.n_mels, 21 | embed_dim=self.configs.model.embed_dim, 22 | squeeze_excitation=self.configs.model.squeeze_excitation 23 | ) 24 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/aamsoftmax/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | from torch import Tensor 4 | from omegaconf import DictConfig 5 | 6 | from pytorch_metric_learning.losses import ArcFaceLoss 7 | 8 | from .. 
import register_criterion 9 | from ..aamsoftmax.configuration import AAMSoftmaxConfigs 10 | 11 | 12 | def radian2degree(radian): 13 | return math.degrees(radian) 14 | 15 | 16 | @register_criterion("aamsoftmax", dataclass=AAMSoftmaxConfigs) 17 | class AAMSoftmax(nn.Module): 18 | def __init__(self, 19 | configs: DictConfig, 20 | num_classes: int, 21 | embedding_size: int 22 | ) -> None: 23 | super(AAMSoftmax, self).__init__() 24 | self.arcface_loss = ArcFaceLoss( 25 | num_classes, 26 | embedding_size, 27 | margin=radian2degree(configs.criterion.margin), 28 | scale=configs.criterion.scale 29 | ) 30 | 31 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 32 | return self.arcface_loss(embeddings, targets) 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/wespeaker_model/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | 6 | @dataclass 7 | class WespeakerModelConfigs(DeepMMDataclass): 8 | name: str = field( 9 | default="ResNet34", metadata={"help": "Model name"} 10 | ) 11 | embed_dim: int = field( 12 | default=256, metadata={"help": "Dimension of embedding."} 13 | ) 14 | pooling_func: str = field( 15 | default="TSTP", metadata={"help": "Pooling function for model."} 16 | ) 17 | optimizer: str = field( 18 | default="adam", metadata={"help": "Optimizer for training."} 19 | ) 20 | min_num_frames: int = field( 21 | default=200, metadata={"help": "Min num frames."} 22 | ) 23 | max_num_frames: int = field( 24 | default=300, metadata={"help": "Max num frames."} 25 | ) 26 | pretrained: bool = field( 27 | default=False, metadata={"help": "Use pretrained model or not."} 28 | ) 29 | checkpoint: str = field( 30 | default="None", metadata={"help": "Checkpoint path."} 31 | ) 32 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 deepaudio-speaker-train \ 2 | dataset=dataframe \ 3 | dataset.database_yml=/home/amax/audio/deepaudio-database/database.yml \ 4 | dataset.dataset_name=voxceleb2_dev \ 5 | model=clovaai_ecapa \ 6 | model.channels=1024 \ 7 | model.embed_dim=256 \ 8 | model.min_num_frames=200 \ 9 | model.max_num_frames=300 \ 10 | feature=fbank \ 11 | lr_scheduler=warmup_adaptive_reduce_lr_on_plateau \ 12 | lr_scheduler.warmup_steps=30000 \ 13 | lr_scheduler.lr_factor=0.8 \ 14 | trainer=gpu \ 15 | trainer.batch_size=128 \ 16 | trainer.max_epochs=30 \ 17 | trainer.num_workers=8 \ 18 | trainer.num_checkpoints=30 \ 19 | criterion=adaptive_aamsoftmax \ 20 | criterion.increase_steps=300000 \ 21 | augment.apply_spec_augment=True\ 22 | augment.time_mask_num=1 \ 23 | augment.apply_noise_augment=True \ 24 | augment.apply_reverb_augment=True \ 25 | augment.apply_noise_reverb_augment=True \ 26 | augment.noise_augment_weight=2 \ 27 | augment.noise_dataset_dir=/data/share/data/musan \ 28 | augment.rir_dataset_dir=/data/share/data/RIRS_NOISES/simulated_rirs/ \ 29 | -------------------------------------------------------------------------------- /run3.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 deepaudio-speaker-train \ 2 | dataset=dataframe \ 3 | dataset.database_yml=/home/amax/audio/deepaudio-database/database.yml \ 4 | 
dataset.dataset_name=voxceleb2_dev \ 5 | model=clovaai_ecapa \ 6 | model.channels=1024 \ 7 | model.embed_dim=256 \ 8 | model.min_num_frames=200 \ 9 | model.max_num_frames=300 \ 10 | feature=fbank \ 11 | lr_scheduler=warmup_step_lr \ 12 | lr_scheduler.warmup_steps=30000 \ 13 | lr_scheduler.step_size=60000 \ 14 | lr_scheduler.freeze_steps=500000 \ 15 | lr_scheduler.lr_factor=0.8 \ 16 | trainer=gpu \ 17 | trainer.batch_size=128 \ 18 | trainer.max_epochs=30 \ 19 | trainer.num_workers=8 \ 20 | trainer.num_checkpoints=30 \ 21 | criterion=adaptive_aamsoftmax \ 22 | criterion.increase_steps=300000 \ 23 | augment.apply_spec_augment=True\ 24 | augment.time_mask_num=1 \ 25 | augment.apply_noise_augment=True \ 26 | augment.apply_reverb_augment=True \ 27 | augment.apply_noise_reverb_augment=True \ 28 | augment.noise_augment_weight=2 \ 29 | augment.noise_dataset_dir=/data/share/data/musan \ 30 | augment.rir_dataset_dir=/data/share/data/RIRS_NOISES/simulated_rirs/ \ 31 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/configurations.py: -------------------------------------------------------------------------------- 1 | from omegaconf import MISSING 2 | from dataclasses import dataclass, field 3 | 4 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 5 | @dataclass 6 | class DataframeConfigs(DeepMMDataclass): 7 | """ Configuration dataclass that common used """ 8 | name: str = field( 9 | default="dataframe", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} 10 | ) 11 | database_yml: str = field( 12 | default="/Users/yin/project/deepaudio-database/database.yml", metadata={"help": "Path of database.yml"} 13 | ) 14 | dataset_name: str = field( 15 | default="debug", metadata={"help": "Database name. 
If you want use multiple dataset, please use ',' to split"} 16 | ) 17 | sampler: str = field( 18 | default="clovaai", metadata={"help": "Sampler name."} 19 | ) 20 | duration: float = field( 21 | default=4, metadata={"help": "Sliding window duration."} 22 | ) 23 | step: float = field( 24 | default=2, metadata={"help": "Sliding window step."} 25 | ) 26 | exhaustive: bool = field( 27 | default=True, metadata={"help": "exhaustive mode."} 28 | ) -------------------------------------------------------------------------------- /run2.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 deepaudio-speaker-train \ 2 | dataset=dataframe \ 3 | dataset.database_yml=/home/amax/audio/deepaudio-database/database.yml \ 4 | dataset.dataset_name=voxceleb2_dev \ 5 | model=clovaai_ecapa \ 6 | model.channels=1024 \ 7 | model.embed_dim=256 \ 8 | model.min_num_frames=300 \ 9 | model.max_num_frames=500 \ 10 | model.pretrained=True \ 11 | model.checkpoint=/home/amax/audio/deepaudio-speaker/ckpts/epoch_20.ckpt \ 12 | feature=fbank \ 13 | lr_scheduler=steplr \ 14 | lr_scheduler.peak_lr=0.00001 \ 15 | lr_scheduler.init_lr=0.00001 \ 16 | lr_scheduler.step_size=30000 \ 17 | lr_scheduler.lr_factor=0.6 \ 18 | trainer=gpu \ 19 | trainer.batch_size=256 \ 20 | trainer.max_epochs=10 \ 21 | trainer.num_workers=8 \ 22 | trainer.num_checkpoints=30 \ 23 | criterion=pyannote_aamsoftmax \ 24 | criterion.margin=0.35 \ 25 | augment.apply_spec_augment=False\ 26 | augment.time_mask_num=1 \ 27 | augment.apply_noise_augment=True \ 28 | augment.apply_reverb_augment=True \ 29 | augment.apply_noise_reverb_augment=True \ 30 | augment.noise_augment_weight=2 \ 31 | augment.noise_dataset_dir=/data/share/data/musan \ 32 | augment.rir_dataset_dir=/data/share/data/RIRS_NOISES/simulated_rirs/ \ 33 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/inference.py: -------------------------------------------------------------------------------- 1 | from typing import Text, Union 2 | from pathlib import Path 3 | 4 | import torch 5 | 6 | from pytorch_lightning.utilities.cloud_io import load as pl_load 7 | 8 | from deepaudio.speaker.data.audio_io.with_torchaudio import Audio 9 | from deepaudio.speaker.data.feature import AUDIO_FEATURE_TRANSFORM_REGISTRY 10 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 11 | 12 | 13 | class Inference: 14 | def __init__( 15 | self, 16 | path_for_pl: Union[Text, Path], 17 | device: torch.device = None, 18 | strict: bool = False 19 | ): 20 | loaded_ckpt = pl_load(str(path_for_pl)) 21 | configs = loaded_ckpt["configs"] 22 | self.model = SpeakerEmbeddingModel.from_pretrained(str(path_for_pl), device, strict).eval().cuda() 23 | self.audio = Audio() 24 | self.feature_extractor = AUDIO_FEATURE_TRANSFORM_REGISTRY[configs.feature.name](configs).cuda() 25 | 26 | def make_embedding(self, wav, seg=None): 27 | if seg is None: 28 | waveform, _ = self.audio(wav) 29 | else: 30 | waveform, _ = self.audio.crop(wav, seg) 31 | feature = self.feature_extractor(waveform.cuda()) 32 | return self.model.make_embedding(feature) 33 | -------------------------------------------------------------------------------- /deepaudio/speaker/cli/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hydra 3 | import pytorch_lightning as pl 4 | from omegaconf import DictConfig, OmegaConf 5 | from pytorch_lightning.utilities import rank_zero_info 6 
| 7 | from deepaudio.speaker.datasets import DATA_MODULE_REGISTRY 8 | from deepaudio.speaker.dataclass.initialize import hydra_train_init 9 | from deepaudio.speaker.models import MODEL_REGISTRY 10 | from deepaudio.speaker.utils import parse_configs, get_pl_trainer 11 | from deepaudio.speaker.models.speaker_embedding_model import SpeakerEmbeddingModel 12 | 13 | 14 | 15 | @hydra.main(config_path=os.path.join("..", "configs"), config_name="train") 16 | def hydra_main(configs: DictConfig) -> None: 17 | rank_zero_info(OmegaConf.to_yaml(configs)) 18 | pl.seed_everything(configs.trainer.seed) 19 | logger, num_devices = parse_configs(configs) 20 | 21 | data_module = DATA_MODULE_REGISTRY[configs.dataset.name](configs) 22 | data_module.prepare_data() 23 | if configs.model.pretrained is True: 24 | model = SpeakerEmbeddingModel.from_pretrained(configs.model.checkpoint, configs=configs) 25 | else: 26 | model = MODEL_REGISTRY[configs.model.name](configs=configs, num_classes=data_module.num_classes) 27 | trainer = get_pl_trainer(configs, num_devices, logger) 28 | trainer.fit(model, data_module) 29 | 30 | 31 | def main(): 32 | hydra_train_init() 33 | hydra_main() 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | DATA_MODULE_REGISTRY = dict() 5 | 6 | 7 | def register_data_module(name: str): 8 | """ 9 | New data module types can be added to OpenSpeech with the :func:`register_data_module` function decorator. 10 | 11 | For example:: 12 | @register_data_module('ksponspeech') 13 | class LightningKsponSpeechDataModule: 14 | (...) 15 | 16 | .. note:: All vocabs must implement the :class:`cls.__name__` interface. 17 | 18 | Args: 19 | name (str): the name of the vocab 20 | """ 21 | 22 | def register_data_module_cls(cls): 23 | if name in DATA_MODULE_REGISTRY: 24 | raise ValueError(f"Cannot register duplicate data module ({name})") 25 | DATA_MODULE_REGISTRY[name] = cls 26 | return cls 27 | 28 | return register_data_module_cls 29 | 30 | 31 | data_module_dir = os.path.dirname(__file__) 32 | for file in os.listdir(data_module_dir): 33 | if os.path.isdir(os.path.join(data_module_dir, file)) and file != '__pycache__': 34 | for subfile in os.listdir(os.path.join(data_module_dir, file)): 35 | path = os.path.join(data_module_dir, file, subfile) 36 | if subfile.endswith(".py"): 37 | data_module_name = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 38 | module = importlib.import_module(f"deepaudio.speaker.datasets.{file}.{data_module_name}") 39 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/fbank/fbank.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torchaudio.transforms import MelSpectrogram 4 | 5 | from ..utils import CMVN 6 | from .configuration import FBankConfigs 7 | from .. 
import register_audio_feature_transform 8 | 9 | EPSILON = 1e-6 10 | 11 | 12 | @register_audio_feature_transform("fbank", dataclass=FBankConfigs) 13 | class Fbank(nn.Module): 14 | def __init__(self, configs): 15 | super(Fbank, self).__init__() 16 | win_length = int(configs.feature.sample_rate * configs.feature.frame_duration) 17 | hop_length = int(configs.feature.sample_rate * configs.feature.frame_shift) 18 | self.melSpectrogram = MelSpectrogram(sample_rate=configs.feature.sample_rate, 19 | n_mels=configs.feature.n_mels, 20 | n_fft=512, 21 | win_length=win_length, 22 | hop_length=hop_length, 23 | window_fn=torch.hann_window) 24 | self.cmvn = CMVN(var_norm=configs.feature.var_norm) 25 | self.input_dim = configs.feature.n_mels 26 | 27 | def forward(self, waveform): 28 | mel_spectrogram = self.melSpectrogram(waveform) 29 | mel_spectrogram = torch.log(mel_spectrogram + EPSILON) 30 | mel_spectrogram = mel_spectrogram.transpose(1, 2) 31 | return self.cmvn(mel_spectrogram) 32 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/fix_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dataclasses import dataclass, field 3 | from typing import Optional 4 | from omegaconf import DictConfig 5 | from torch.optim import Optimizer 6 | 7 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 8 | from deepaudio.speaker.optim.scheduler import register_scheduler 9 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 10 | 11 | 12 | @dataclass 13 | class FixLRSchedulerConfigs(LearningRateSchedulerConfigs): 14 | scheduler_name: str = field( 15 | default="fix", metadata={"help": "Name of learning rate scheduler."} 16 | ) 17 | peak_lr: float = field( 18 | default=1e-04, metadata={"help": "Maximum learning rate."} 19 | ) 20 | 21 | 22 | @register_scheduler("fix", dataclass=FixLRSchedulerConfigs) 23 | class FixLRScheduler(LearningRateScheduler): 24 | """ 25 | Warmup learning rate until `total_steps` 26 | 27 | Args: 28 | optimizer (Optimizer): wrapped optimizer. 29 | configs (DictConfig): configuration set. 
30 | """ 31 | def __init__( 32 | self, 33 | optimizer: Optimizer, 34 | configs: DictConfig, 35 | ) -> None: 36 | super(FixLRScheduler, self).__init__(optimizer, configs.lr_scheduler.peak_lr) 37 | self.lr = configs.lr_scheduler.peak_lr 38 | 39 | def step(self): 40 | self.set_lr(self.optimizer, self.lr) 41 | return self.lr 42 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/mmcl_seresnet34/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class MMCLSeResnet34Configs(DeepMMDataclass): 7 | name: str = field( 8 | default="mmcl_seresnet34", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=256, metadata={"help": "Dimension of embedding."} 12 | ) 13 | in_channels: int = field( 14 | default=1, metadata={"help": "In channel."} 15 | ) 16 | stem_channels: int = field( 17 | default=32, metadata={"help": "Stem channel."} 18 | ) 19 | base_channels: int = field( 20 | default=32, metadata={"help": "Base channel."} 21 | ) 22 | depth: int = field( 23 | default=34, metadata={"help": "Depth."} 24 | ) 25 | out_bn: bool = field( 26 | default=True, metadata={"help": "Flag for batch normalization in embedding layer."} 27 | ) 28 | num_stages: int = field( 29 | default=4, metadata={"help": "Number of stages"} 30 | ) 31 | out_indices: int = field( 32 | default=3, metadata={"help": "Out indices"} 33 | ) 34 | norm_cfg_type: str = field( 35 | default='BN', metadata={"help": "Norm type"} 36 | ) 37 | optimizer: str = field( 38 | default="adam", metadata={"help": "Optimizer for training."} 39 | ) 40 | min_num_frames: int = field( 41 | default=300, metadata={"help": "Min num frames."} 42 | ) 43 | max_num_frames: int = field( 44 | default=400, metadata={"help": "Max num frames."} 45 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/models/clovaai_ecapa/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class ClovaaiECAPAConfigs(DeepMMDataclass): 7 | name: str = field( 8 | default="clovaai_ecapa", metadata={"help": "Model name"} 9 | ) 10 | embed_dim: int = field( 11 | default=192, metadata={"help": "Dimension of embedding."} 12 | ) 13 | channels: int = field( 14 | default=512, metadata={"help": "Dimension of embedding."} 15 | ) 16 | model_scale: int = field( 17 | default=8, metadata={"help": "Model scale."} 18 | ) 19 | context: bool = field( 20 | default=True, metadata={"help": "Context."} 21 | ) 22 | summed: bool = field( 23 | default=True, metadata={"help": "Summed."} 24 | ) 25 | out_bn: bool = field( 26 | default=True, metadata={"help": "Flag for batch normalization in embedding layer."} 27 | ) 28 | encoder_type: str = field( 29 | default="ECA", metadata={"help": "Encoder type."} 30 | ) 31 | optimizer: str = field( 32 | default="adam", metadata={"help": "Optimizer for training."} 33 | ) 34 | min_num_frames: int = field( 35 | default=200, metadata={"help": "Min num frames."} 36 | ) 37 | max_num_frames: int = field( 38 | default=400, metadata={"help": "Max num frames."} 39 | ) 40 | pretrained: bool = field( 41 | default=False, metadata={"help": "Use pretrained model or not."} 42 | ) 43 | 
checkpoint: str = field( 44 | default="None", metadata={"help": "Checkpoint path."} 45 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/wespeaker/speaker_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import deepaudio.speaker.modules.backbones.wespeaker.tdnn as tdnn 16 | import deepaudio.speaker.modules.backbones.wespeaker.ecapa_tdnn as ecapa_tdnn 17 | import deepaudio.speaker.modules.backbones.wespeaker.resnet as resnet 18 | import deepaudio.speaker.modules.backbones.wespeaker.repvgg as repvgg 19 | 20 | 21 | def get_speaker_model(model_name: str): 22 | if model_name.startswith("XVEC"): 23 | return getattr(tdnn, model_name) 24 | elif model_name.startswith("ECAPA_TDNN"): 25 | return getattr(ecapa_tdnn, model_name) 26 | elif model_name.startswith("ResNet"): 27 | return getattr(resnet, model_name) 28 | elif model_name.startswith("REPVGG"): 29 | return getattr(repvgg, model_name) 30 | else: # model_name error !!! 31 | print(model_name + " not found !!!") 32 | exit(1) 33 | 34 | 35 | def MainModel(configs): 36 | model_class = get_speaker_model(configs.model.name) 37 | model = model_class(feat_dim=configs.feature.n_mels, 38 | embed_dim=configs.model.embed_dim, 39 | pooling_func=configs.model.pooling_func) 40 | return model 41 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/spec_augment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from torch import Tensor 4 | 5 | 6 | class SpecAugment(object): 7 | """ 8 | Provides Spec Augment. A simple data augmentation method for speech recognition. 9 | This concept proposed in https://arxiv.org/abs/1904.08779 10 | 11 | Args: 12 | freq_mask_para (int): maximum frequency masking length 13 | time_mask_num (int): how many times to apply time masking 14 | freq_mask_num (int): how many times to apply frequency masking 15 | 16 | Inputs: feature_vector 17 | - **feature_vector** (torch.FloatTensor): feature vector from audio file. 18 | 19 | Returns: feature_vector: 20 | - **feature_vector**: masked feature vector. 
21 | """ 22 | 23 | def __init__(self, configs) -> None: 24 | self.freq_mask_para = configs.augment.freq_mask_para 25 | self.time_mask_num = configs.augment.time_mask_num 26 | self.freq_mask_num = configs.augment.freq_mask_num 27 | 28 | def __call__(self, feature: Tensor) -> Tensor: 29 | """ Provides SpecAugmentation for audio """ 30 | time_axis_length = feature.size(0) 31 | freq_axis_length = feature.size(1) 32 | time_mask_para = time_axis_length / 20 # Refer to "Specaugment on large scale dataset" paper 33 | 34 | # time mask 35 | for _ in range(self.time_mask_num): 36 | t = int(np.random.uniform(low=0.0, high=time_mask_para)) 37 | t0 = random.randint(0, time_axis_length - t) 38 | feature[t0: t0 + t, :] = 0 39 | 40 | # freq mask 41 | for _ in range(self.freq_mask_num): 42 | f = int(np.random.uniform(low=0.0, high=self.freq_mask_para)) 43 | f0 = random.randint(0, freq_axis_length - f) 44 | feature[:, f0: f0 + f] = 0 45 | 46 | return feature 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | 5 | from pkg_resources import VersionConflict, require 6 | from setuptools import find_packages, setup 7 | 8 | with open("Readme.md") as f: 9 | long_description = f.read() 10 | 11 | with open("requirements.txt") as f: 12 | requirements = f.read().splitlines() 13 | 14 | try: 15 | require("setuptools>=38.3") 16 | except VersionConflict: 17 | print("Error: version of setuptools is too old (<38.3)!") 18 | sys.exit(1) 19 | 20 | 21 | ROOT_DIR = Path(__file__).parent.resolve() 22 | # Creating the version file 23 | 24 | with open("version.txt") as f: 25 | version = f.read() 26 | 27 | version = version.strip() 28 | sha = "Unknown" 29 | 30 | if os.getenv("BUILD_VERSION"): 31 | version = os.getenv("BUILD_VERSION") 32 | elif sha != "Unknown": 33 | version += "+" + sha[:7] 34 | print("-- Building version " + version) 35 | 36 | version_path = ROOT_DIR / "deepaudio" / "speaker" / "version.py" 37 | 38 | with open(version_path, "w") as f: 39 | f.write("__version__ = '{}'\n".format(version)) 40 | 41 | if __name__ == "__main__": 42 | setup( 43 | name="deepaudio.speaker", 44 | namespace_packages=["deepaudio"], 45 | version=version, 46 | packages=find_packages(), 47 | install_requires=requirements, 48 | description="Speaker embedding", 49 | long_description=long_description, 50 | long_description_content_type="text/markdown", 51 | author="Ruiqing Yin", 52 | # author_email="yinruiqing", 53 | url="https://github.com/deepaudio/deepaudio-speaker", 54 | classifiers=[ 55 | "Development Status :: 4 - Beta", 56 | "Intended Audience :: Science/Research", 57 | "License :: OSI Approved :: MIT License", 58 | "Natural Language :: English", 59 | "Programming Language :: Python :: 3.8", 60 | "Topic :: Scientific/Engineering", 61 | ], 62 | ) -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from torch.optim.lr_scheduler import _LRScheduler 24 | 25 | 26 | class LearningRateScheduler(_LRScheduler): 27 | r""" 28 | Provides inteface of learning rate scheduler. 29 | 30 | Note: 31 | Do not use this class directly, use one of the sub classes. 32 | """ 33 | def __init__(self, optimizer, init_lr): 34 | self.optimizer = optimizer 35 | self.init_lr = init_lr 36 | 37 | def step(self, *args, **kwargs): 38 | raise NotImplementedError 39 | 40 | @staticmethod 41 | def set_lr(optimizer, lr): 42 | for g in optimizer.param_groups: 43 | g['lr'] = lr 44 | 45 | def get_lr(self): 46 | for g in self.optimizer.param_groups: 47 | return g['lr'] 48 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/feature/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | AUDIO_FEATURE_TRANSFORM_REGISTRY = dict() 5 | AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY = dict() 6 | 7 | 8 | def register_audio_feature_transform(name: str, dataclass=None): 9 | r""" 10 | New dataset types can be added to OpenSpeech with the :func:`register_dataset` function decorator. 11 | 12 | For example:: 13 | @register_audio_feature_transform("fbank", dataclass=FilterBankConfigs) 14 | class FilterBankFeatureTransform(object): 15 | (...) 16 | 17 | .. note:: All dataset must implement the :class:`cls.__name__` interface. 
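    Registered transforms are later instantiated by name from the run
    configuration, as done in ``SpeakerAudioDataset``::

        feature_extractor = AUDIO_FEATURE_TRANSFORM_REGISTRY[configs.feature.name](configs)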
18 | 19 | Args: 20 | name (str): the name of the dataset 21 | dataclass (Optional, str): the dataclass of the dataset (default: None) 22 | """ 23 | 24 | def register_audio_feature_transform_cls(cls): 25 | if name in AUDIO_FEATURE_TRANSFORM_REGISTRY: 26 | raise ValueError(f"Cannot register duplicate audio ({name})") 27 | 28 | AUDIO_FEATURE_TRANSFORM_REGISTRY[name] = cls 29 | 30 | if dataclass is not None: 31 | if name in AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY: 32 | raise ValueError(f"Cannot register duplicate dataclass ({name})") 33 | AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY[name] = dataclass 34 | 35 | return cls 36 | 37 | return register_audio_feature_transform_cls 38 | 39 | 40 | data_dir = os.path.dirname(__file__) 41 | for file in os.listdir(f"{data_dir}"): 42 | if os.path.isdir(f"{data_dir}/{file}") and not file.startswith('__'): 43 | path = f"{data_dir}/{file}" 44 | for module_file in os.listdir(path): 45 | path = os.path.join(path, module_file) 46 | if module_file.endswith(".py"): 47 | module_name = module_file[: module_file.find(".py")] if module_file.endswith(".py") else module_file 48 | module = importlib.import_module(f"deepaudio.speaker.data.feature.{file}.{module_name}") -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/noise.py: -------------------------------------------------------------------------------- 1 | from torch_audiomentations import AddBackgroundNoise, ApplyImpulseResponse, Compose 2 | 3 | from .utils import get_all_wavs 4 | 5 | 6 | class Noise: 7 | def __init__(self, configs): 8 | self.configs = configs 9 | background_paths = get_all_wavs(configs.augment.noise_dataset_dir) 10 | self.noise = AddBackgroundNoise(background_paths=background_paths, 11 | min_snr_in_db=configs.augment.min_snr_in_db, 12 | max_snr_in_db=configs.augment.max_snr_in_db, 13 | p=1) 14 | 15 | def __call__(self, waveform): 16 | waveform = waveform.unsqueeze(0) 17 | return self.noise(waveform, sample_rate=self.configs.feature.sample_rate).squeeze(0) 18 | 19 | 20 | class Reverb: 21 | def __init__(self, configs): 22 | self.configs = configs 23 | ir_paths = get_all_wavs(configs.augment.rir_dataset_dir) 24 | self.reverb = ApplyImpulseResponse(ir_paths=ir_paths, p=1) 25 | 26 | def __call__(self, waveform): 27 | waveform = waveform.unsqueeze(0) 28 | return self.reverb(waveform, sample_rate=self.configs.feature.sample_rate).squeeze(0) 29 | 30 | 31 | class NoiseReverb: 32 | def __init__(self, configs): 33 | self.configs = configs 34 | background_paths = get_all_wavs(configs.augment.noise_dataset_dir) 35 | ir_paths = get_all_wavs(configs.augment.rir_dataset_dir) 36 | self.noise = AddBackgroundNoise(background_paths=background_paths, 37 | min_snr_in_db=configs.augment.min_snr_in_db, 38 | max_snr_in_db=configs.augment.max_snr_in_db, 39 | p=1) 40 | self.reverb = ApplyImpulseResponse(ir_paths=ir_paths, p=1) 41 | self.compose = Compose([self.noise, self.reverb], p=1) 42 | 43 | def __call__(self, waveform): 44 | waveform = waveform.unsqueeze(0) 45 | return self.compose(waveform, sample_rate=self.configs.feature.sample_rate).squeeze(0) 46 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | CRITERION_REGISTRY = dict() 5 | CRITERION_DATACLASS_REGISTRY = dict() 6 | 7 | 8 | def register_criterion(name: str, dataclass=None): 9 | r""" 10 | New criterion 
types can be added to OpenSpeech with the :func:`register_criterion` function decorator. 11 | 12 | For example:: 13 | @register_criterion('label_smoothed_cross_entropy') 14 | class LabelSmoothedCrossEntropyLoss(nn.Module): 15 | (...) 16 | 17 | .. note:: All criterion must implement the :class:`cls.__name__` interface. 18 | 19 | Args: 20 | name (str): the name of the criterion 21 | dataclass (Optional, str): the dataclass of the criterion (default: None) 22 | """ 23 | 24 | def register_criterion_cls(cls): 25 | if name in CRITERION_REGISTRY: 26 | raise ValueError(f"Cannot register duplicate criterion ({name})") 27 | 28 | CRITERION_REGISTRY[name] = cls 29 | 30 | cls.__dataclass = dataclass 31 | if dataclass is not None: 32 | if name in CRITERION_DATACLASS_REGISTRY: 33 | raise ValueError(f"Cannot register duplicate criterion ({name})") 34 | CRITERION_DATACLASS_REGISTRY[name] = dataclass 35 | 36 | return cls 37 | 38 | return register_criterion_cls 39 | 40 | 41 | criterion_dir = os.path.dirname(__file__) 42 | for file in os.listdir(criterion_dir): 43 | if os.path.isdir(os.path.join(criterion_dir, file)) and not file.startswith('__'): 44 | for subfile in os.listdir(os.path.join(criterion_dir, file)): 45 | path = os.path.join(criterion_dir, file, subfile) 46 | if subfile.endswith(".py"): 47 | python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 48 | module = importlib.import_module(f"deepaudio.speaker.criterion.{file}.{python_file}") 49 | continue 50 | 51 | path = os.path.join(criterion_dir, file) 52 | if file.endswith(".py"): 53 | criterion_name = file[: file.find(".py")] if file.endswith(".py") else file 54 | module = importlib.import_module(f"deepaudio.speaker.criterion.{criterion_name}") 55 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_subcenter_aamsoftmax/subcenter_aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | from .. 
import register_criterion 9 | from ..subcenter_aamsoftmax.subcenter_aamsoftmax import SubcenterArcMarginProduct 10 | from .configuration import AdaptiveSubcenterAAMSoftmaxConfigs 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torch.nn import Parameter 16 | import math 17 | 18 | 19 | @register_criterion("adaptive_subcenter_aamsoftmax", dataclass=AdaptiveSubcenterAAMSoftmaxConfigs) 20 | class PyannoteAAMSoftmax(nn.Module): 21 | def __init__(self, 22 | configs: DictConfig, 23 | num_classes: int, 24 | embedding_size: int 25 | ) -> None: 26 | super(PyannoteAAMSoftmax, self).__init__() 27 | self.configs = configs 28 | self.classifier_ = SubcenterArcMarginProduct( 29 | in_features=self.configs.model.embed_dim, 30 | out_features=num_classes, 31 | K=configs.model.criterion.K, 32 | m=configs.criterion.margin, 33 | s=configs.criterion.scale 34 | ) 35 | self.loss_ = nn.CrossEntropyLoss() 36 | self.margin = configs.criterion.margin 37 | self.warmup_steps = configs.lr_scheduler.warmup_steps if configs.lr_scheduler.scheduler_name.startswith( 38 | 'warmup') else 0 39 | self.increase_steps = configs.criterion.increase_steps 40 | self.increase_rate = self.margin / (self.increase_steps - self.warmup_steps) 41 | 42 | def step(self, global_steps): 43 | if global_steps < self.warmup_steps: 44 | self.classifier_.margin = 0 45 | elif global_steps < self.increase_steps: 46 | self.classifier_.margin = (global_steps - self.warmup_steps) * self.increase_rate 47 | else: 48 | self.classifier_.margin = self.margin 49 | 50 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 51 | logits = self.classifier_(embeddings, target=targets) 52 | return self.loss_(logits, targets) 53 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | from .speaker_model import SpeakerModel 5 | 6 | 7 | MODEL_REGISTRY = dict() 8 | MODEL_DATACLASS_REGISTRY = dict() 9 | 10 | 11 | def register_model(name: str, dataclass=None): 12 | r""" 13 | New model types can be added to OpenSpeech with the :func:`register_model` function decorator. 14 | 15 | For example:: 16 | @register_model('conformer_lstm') 17 | class ConformerLSTMModel(OpenspeechModel): 18 | (...) 19 | 20 | .. note:: All models must implement the :class:`cls.__name__` interface. 
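    Registered classes can then be retrieved by name from ``MODEL_REGISTRY`` (illustrative
    sketch; assumes ``configs`` and ``num_classes`` are already available)::

        model_cls = MODEL_REGISTRY[configs.model.name]
        model = model_cls(configs=configs, num_classes=num_classes)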
21 | 22 | Args: 23 | name (str): the name of the model 24 | """ 25 | 26 | def register_model_cls(cls): 27 | if name in MODEL_REGISTRY: 28 | raise ValueError(f"Cannot register duplicate model ({name})") 29 | if not issubclass(cls, SpeakerModel): 30 | raise ValueError(f"Model ({name}: {cls.__name__}) must extend SpeakerModel") 31 | 32 | MODEL_REGISTRY[name] = cls 33 | 34 | cls.__dataclass = dataclass 35 | if dataclass is not None: 36 | if name in MODEL_DATACLASS_REGISTRY: 37 | raise ValueError(f"Cannot register duplicate model ({name})") 38 | MODEL_DATACLASS_REGISTRY[name] = dataclass 39 | 40 | return cls 41 | 42 | return register_model_cls 43 | 44 | 45 | # automatically import any Python files in the models/ directory 46 | models_dir = os.path.dirname(__file__) 47 | for file in os.listdir(models_dir): 48 | if os.path.isdir(os.path.join(models_dir, file)) and not file.startswith('__'): 49 | for subfile in os.listdir(os.path.join(models_dir, file)): 50 | path = os.path.join(models_dir, file, subfile) 51 | if subfile.endswith(".py"): 52 | python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 53 | module = importlib.import_module(f"deepaudio.speaker.models.{file}.{python_file}") 54 | continue 55 | 56 | path = os.path.join(models_dir, file) 57 | if file.endswith(".py"): 58 | model_name = file[: file.find(".py")] if file.endswith(".py") else file 59 | module = importlib.import_module(f"deepaudio.speaker.models.{model_name}") -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/step_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dataclasses import dataclass, field 3 | from typing import Optional 4 | from omegaconf import DictConfig 5 | from torch.optim import Optimizer 6 | 7 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 8 | from deepaudio.speaker.optim.scheduler import register_scheduler 9 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 10 | 11 | 12 | @dataclass 13 | class StepLRSchedulerConfigs(LearningRateSchedulerConfigs): 14 | scheduler_name: str = field( 15 | default="steplr", metadata={"help": "Name of learning rate scheduler."} 16 | ) 17 | peak_lr: float = field( 18 | default=1e-04, metadata={"help": "Maximum learning rate."} 19 | ) 20 | min_lr: float = field( 21 | default=1e-7, metadata={"help": "Min learning rate."} 22 | ) 23 | step_size: int = field( 24 | default=50, metadata={"help": "Step size to decay"} 25 | ) 26 | lr_factor: float = field( 27 | default=0.8, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 28 | ) 29 | 30 | 31 | @register_scheduler("steplr", dataclass=StepLRSchedulerConfigs) 32 | class StepLRScheduler(LearningRateScheduler): 33 | """ 34 | Step decay learning rate scheduler: multiplies the learning rate by `lr_factor` every `step_size` steps. 35 | 36 | Args: 37 | optimizer (Optimizer): wrapped optimizer. 38 | configs (DictConfig): configuration set.
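    Example::
        With the defaults above (peak_lr=1e-04, step_size=50, lr_factor=0.8), the learning
        rate is multiplied by 0.8 once every 50 scheduler steps: 1e-04 -> 8e-05 -> 6.4e-05 -> ...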
39 | """ 40 | def __init__( 41 | self, 42 | optimizer: Optimizer, 43 | configs: DictConfig, 44 | ) -> None: 45 | super(StepLRScheduler, self).__init__(optimizer, configs.lr_scheduler.peak_lr) 46 | self.update_steps = 1 47 | self.lr = configs.lr_scheduler.peak_lr 48 | self.step_size = configs.lr_scheduler.step_size 49 | self.min_lr = configs.lr_scheduler.min_lr 50 | self.lr_factor = configs.lr_scheduler.lr_factor 51 | 52 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 53 | if self.update_steps % self.step_size == 0: 54 | lr = self.lr * self.lr_factor 55 | self.set_lr(self.optimizer, lr) 56 | self.lr = lr 57 | self.update_steps += 1 58 | return self.lr 59 | -------------------------------------------------------------------------------- /deepaudio/speaker/dataclass/initialize.py: -------------------------------------------------------------------------------- 1 | from hydra.core.config_store import ConfigStore 2 | from deepaudio.speaker.data.augmentation.configurations import AugmentConfigs 3 | from deepaudio.speaker.datasets.voxceleb2.configurations import Voxceleb2Configs 4 | from deepaudio.speaker.datasets.dataframe.configurations import DataframeConfigs 5 | 6 | from .configurations import ( 7 | CPUTrainerConfigs, 8 | GPUTrainerConfigs, 9 | TPUTrainerConfigs, 10 | Fp16GPUTrainerConfigs, 11 | Fp16TPUTrainerConfigs, 12 | Fp64CPUTrainerConfigs, 13 | ) 14 | 15 | 16 | SPEAKER_TRAIN_CONFIGS = [ 17 | "feature", 18 | "augment", 19 | "dataset", 20 | "model", 21 | "criterion", 22 | "lr_scheduler", 23 | "trainer", 24 | ] 25 | 26 | 27 | DATASET_DATACLASS_REGISTRY = { 28 | "voxceleb2": Voxceleb2Configs, 29 | 'dataframe': DataframeConfigs, 30 | } 31 | TRAINER_DATACLASS_REGISTRY = { 32 | "cpu": CPUTrainerConfigs, 33 | "gpu": GPUTrainerConfigs, 34 | "tpu": TPUTrainerConfigs, 35 | "gpu-fp16": Fp16GPUTrainerConfigs, 36 | "tpu-fp16": Fp16TPUTrainerConfigs, 37 | "cpu-fp64": Fp64CPUTrainerConfigs, 38 | } 39 | AUGMENT_DATACLASS_REGISTRY = { 40 | "default": AugmentConfigs, 41 | } 42 | 43 | def hydra_train_init() -> None: 44 | r""" initialize ConfigStore for hydra-train """ 45 | from deepaudio.speaker.models import MODEL_DATACLASS_REGISTRY 46 | from deepaudio.speaker.criterion import CRITERION_DATACLASS_REGISTRY 47 | from deepaudio.speaker.data.feature import AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY 48 | from deepaudio.speaker.optim.scheduler import SCHEDULER_DATACLASS_REGISTRY 49 | 50 | registries = { 51 | "feature": AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY, 52 | "augment": AUGMENT_DATACLASS_REGISTRY, 53 | "dataset": DATASET_DATACLASS_REGISTRY, 54 | "trainer": TRAINER_DATACLASS_REGISTRY, 55 | "model": MODEL_DATACLASS_REGISTRY, 56 | "criterion": CRITERION_DATACLASS_REGISTRY, 57 | "lr_scheduler": SCHEDULER_DATACLASS_REGISTRY, 58 | } 59 | 60 | cs = ConfigStore.instance() 61 | 62 | for group in SPEAKER_TRAIN_CONFIGS: 63 | dataclass_registry = registries[group] 64 | 65 | for k, v in dataclass_registry.items(): 66 | cs.store(group=group, name=k, node=v, provider="deepaudio") 67 | 68 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including 
without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import os 24 | import importlib 25 | 26 | from deepaudio.speaker.optim.adamp import AdamP 27 | from deepaudio.speaker.optim.radam import RAdam 28 | from deepaudio.speaker.optim.novograd import Novograd 29 | 30 | # automatically import any Python files in the models/ directory 31 | scheduler_dir = os.path.dirname(__file__) 32 | for file in os.listdir(scheduler_dir): 33 | if os.path.isdir(os.path.join(scheduler_dir, file)) and file != '__pycache__': 34 | for subfile in os.listdir(os.path.join(scheduler_dir, file)): 35 | path = os.path.join(scheduler_dir, file, subfile) 36 | if subfile.endswith(".py"): 37 | scheduler_name = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 38 | module = importlib.import_module(f"deepaudio.speaker.optim.scheduler.{scheduler_name}") 39 | continue 40 | 41 | path = os.path.join(scheduler_dir, file) 42 | if file.endswith(".py"): 43 | scheduler_name = file[: file.find(".py")] if file.endswith(".py") else file 44 | module = importlib.import_module(f"deepaudio.speaker.optim.{scheduler_name}") 45 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
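# Scheduler registry: classes decorated with @register_scheduler(name, dataclass=...) below
# are collected into SCHEDULER_REGISTRY and SCHEDULER_DATACLASS_REGISTRY. The auto-import
# loop in optim/__init__.py imports every scheduler module, so the decorators run (and the
# registries are filled) as soon as deepaudio.speaker.optim is imported.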
22 | 23 | import os 24 | import importlib 25 | 26 | SCHEDULER_REGISTRY = {} 27 | SCHEDULER_DATACLASS_REGISTRY = {} 28 | 29 | 30 | def register_scheduler(name: str, dataclass=None): 31 | """ 32 | New scheduler types can be added to OpenSpeech with the :func:`register_scheduler` function decorator. 33 | 34 | For example:: 35 | @register_scheduler('reduce_lr_on_plateau') 36 | class ReduceLROnPlateau: 37 | (...) 38 | 39 | .. note:: All scheduler must implement the :class:`cls.__name__` interface. 40 | 41 | Args: 42 | name (str): the name of the scheduler 43 | """ 44 | 45 | def register_scheduler_cls(cls): 46 | if name in SCHEDULER_REGISTRY: 47 | raise ValueError(f"Cannot register duplicate scheduler ({name})") 48 | 49 | SCHEDULER_REGISTRY[name] = cls 50 | 51 | cls.__dataclass = dataclass 52 | if dataclass is not None: 53 | if name in SCHEDULER_DATACLASS_REGISTRY: 54 | raise ValueError(f"Cannot register duplicate scheduler ({name})") 55 | SCHEDULER_DATACLASS_REGISTRY[name] = dataclass 56 | 57 | return cls 58 | 59 | return register_scheduler_cls 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | outputs/ 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/augmentation/configurations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, _MISSING_TYPE, field 2 | 3 | from deepaudio.speaker.dataclass.configurations import DeepMMDataclass 4 | 5 | @dataclass 6 | class AugmentConfigs(DeepMMDataclass): 7 | apply_spec_augment: bool = field( 8 | default=False, metadata={"help": "Flag indication whether to apply spec augment or not"} 9 | ) 10 | apply_noise_augment: bool = field( 11 | default=False, metadata={"help": "Flag indication whether to apply noise augment or not " 12 | "Noise augment requires `noise_dataset_path`. " 13 | "`noise_dataset_dir` should be contain audio files."} 14 | ) 15 | apply_reverb_augment: bool = field( 16 | default=False, metadata={"help": "Flag indication whether to apply joining augment or not " 17 | "If true, create a new audio file by connecting two audio randomly"} 18 | ) 19 | apply_noise_reverb_augment: bool = field( 20 | default=False, metadata={"help": "Flag indication whether to apply spec augment or not"} 21 | ) 22 | min_snr_in_db: float = field( 23 | default=3.0, metadata={"help": "Flag indication whether to apply spec augment or not"} 24 | ) 25 | max_snr_in_db: float = field( 26 | default=30.0, metadata={"help": "Flag indication whether to apply spec augment or not"} 27 | ) 28 | freq_mask_para: int = field( 29 | default=27, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 30 | ) 31 | freq_mask_num: int = field( 32 | default=2, metadata={"help": "How many freq-masked area to make"} 33 | ) 34 | time_mask_num: int = field( 35 | default=4, metadata={"help": "How many time-masked area to make"} 36 | ) 37 | noise_dataset_dir: str = field( 38 | default='None', metadata={"help": "Noise Directory"} 39 | ) 40 | rir_dataset_dir: str = field( 41 | default='None', metadata={"help": "Rirs Directory"} 42 | ) 43 | noise_augment_weight: float = field( 44 | default=2.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 45 | ) 46 | reverb_augment_weight: float = field( 47 | default=1.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 48 | ) 49 | noise_reverb_augment_weight: float = field( 50 | default=2.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 51 | ) 52 | specaugment_weight: float = field( 53 | default=1.0, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} 54 | ) 55 | 56 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/dataloader.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | import torch 5 | from torch.utils.data import DataLoader 6 | 7 | 8 | 
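# Collate strategy: each dataset item holds several (feature, label) pairs; the batch is
# flattened, a crop length is sampled uniformly from [min_num_frames, max_num_frames) and
# capped by the shortest feature in the batch, and every feature is randomly cropped to
# that length before being stacked into the 'X' tensor.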
def _collate_fn(batch, min_num_frames, max_num_frames): 9 | r""" 10 | Functions that pad to the maximum sequence length 11 | 12 | Args: 13 | batch (tuple): tuple contains input and target tensors 14 | 15 | Returns: 16 | inputs (torch.FloatTensor): tensor contains input tensor and target tensor. 17 | """ 18 | def get_min_num_frames(batch): 19 | return min([sample[0].size(0) for sample in batch]) 20 | def flatten(batch): 21 | batch_flatten = [] 22 | for items in batch: 23 | for X, y in zip(items[0], items[1]): 24 | batch_flatten.append((X,y)) 25 | return batch_flatten 26 | 27 | def get_subsample(feature, num_frames): 28 | length = feature.size(0) 29 | if length < num_frames: 30 | msg = 'Sample is too short' 31 | raise ValueError(msg) 32 | elif length == num_frames: 33 | return feature 34 | else: 35 | start = np.random.randint(0, length - num_frames) 36 | return feature[start:start + num_frames] 37 | batch = flatten(batch) 38 | min_num_frames_batch = get_min_num_frames(batch) 39 | num_frames = np.random.randint(min_num_frames, max_num_frames) 40 | num_frames = min(num_frames, min_num_frames_batch) 41 | 42 | X = [] 43 | y = [] 44 | for item in batch: 45 | feature = item[0] 46 | X.append(get_subsample(feature, num_frames).unsqueeze(0)) 47 | y.append(item[1]) 48 | return { 49 | 'X': torch.cat(X), 50 | 'y': torch.tensor(y, dtype=torch.int64) 51 | } 52 | 53 | 54 | class SpeakerUttDataLoader(DataLoader): 55 | r""" 56 | Text Data Loader 57 | 58 | Args: 59 | dataset (torch.utils.data.Dataset): dataset from which to load the data. 60 | num_workers (int): how many subprocesses to use for data loading. 61 | """ 62 | def __init__( 63 | self, 64 | dataset: torch.utils.data.Dataset, 65 | num_workers: int, 66 | min_num_frames: int, 67 | max_num_frames: int, 68 | batch_size: int, 69 | **kwargs, 70 | ) -> None: 71 | super(SpeakerUttDataLoader, self).__init__( 72 | dataset=dataset, 73 | num_workers=num_workers, 74 | batch_size=batch_size, 75 | **kwargs, 76 | ) 77 | self.min_num_frames = min_num_frames 78 | self.max_num_frames = max_num_frames 79 | self.collate_fn = partial(_collate_fn, 80 | min_num_frames=min_num_frames, 81 | max_num_frames=max_num_frames) 82 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/voxceleb2/lit_data_module.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import random 4 | from omegaconf import DictConfig 5 | import pytorch_lightning as pl 6 | from torch.utils.data import DataLoader 7 | 8 | from deepaudio.speaker.data.dataset import SpeakerAudioDataset 9 | from deepaudio.speaker.data.dataloader import SpeakerUttDataLoader 10 | from deepaudio.speaker.data.samplers import ClovaaiSampler 11 | 12 | 13 | from .preprocess import get_speaker_list, get_speaker_wavs 14 | from .. 
import register_data_module 15 | 16 | 17 | @register_data_module('voxceleb2') 18 | class LightningVoxceleb2DataModule(pl.LightningDataModule): 19 | def __init__(self, configs: DictConfig): 20 | super(LightningVoxceleb2DataModule, self).__init__() 21 | self.configs = configs 22 | 23 | def prepare_data(self): 24 | speakers, spk2id = get_speaker_list(self.configs) 25 | speaker2wav = get_speaker_wavs(self.configs.dataset.dataset_path, speakers) 26 | self.num_classes = len(speakers) 27 | self.train_utts, self.valid_utts = self._split_train_valid(speaker2wav, spk2id) 28 | 29 | def setup(self, stage: Optional[str] = None) -> None: 30 | self.train_dataset = SpeakerAudioDataset(self.configs, self.train_utts) 31 | self.valid_dataset = SpeakerAudioDataset(self.configs, self.valid_utts) 32 | 33 | def train_dataloader(self) -> DataLoader: 34 | if self.configs.dataset.sampler == 'clovaai': 35 | sampler = ClovaaiSampler(self.train_dataset.labels, self.configs) 36 | shuffle = False 37 | else: 38 | sampler = None 39 | shuffle = True 40 | return SpeakerUttDataLoader( 41 | dataset=self.train_dataset, 42 | num_workers=self.configs.trainer.num_workers, 43 | min_num_frames=self.configs.model.min_num_frames, 44 | max_num_frames=self.configs.model.max_num_frames, 45 | batch_size=self.configs.trainer.batch_size, 46 | shuffle=shuffle, 47 | sampler=sampler 48 | ) 49 | 50 | def val_dataloader(self) -> DataLoader: 51 | return SpeakerUttDataLoader( 52 | dataset=self.valid_dataset, 53 | num_workers=self.configs.trainer.num_workers, 54 | min_num_frames=self.configs.model.min_num_frames, 55 | max_num_frames=self.configs.model.max_num_frames, 56 | batch_size=self.configs.trainer.batch_size 57 | ) 58 | 59 | def _spk2wav_utts(self, spk2wav, spk2id): 60 | utts = [] 61 | for spk in spk2wav: 62 | for wav in spk2wav[spk]: 63 | utts.append((str(wav), spk2id[spk], None)) 64 | random.shuffle(utts) 65 | return utts 66 | 67 | def _split_train_valid(self, speaker2wav, spk2id): 68 | valid_spk2wav = {} 69 | for spk in speaker2wav: 70 | random.shuffle(speaker2wav[spk]) 71 | valid_spk2wav[spk] = [speaker2wav[spk].pop(0)] 72 | train_utts = self._spk2wav_utts(speaker2wav, spk2id) 73 | valid_utts = self._spk2wav_utts(valid_spk2wav, spk2id) 74 | return train_utts, valid_utts 75 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/ResNetBlocks.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | 5 | class SEBasicBlock(nn.Module): 6 | expansion = 1 7 | 8 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 9 | super(SEBasicBlock, self).__init__() 10 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 11 | self.bn1 = nn.BatchNorm2d(planes) 12 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) 13 | self.bn2 = nn.BatchNorm2d(planes) 14 | self.relu = nn.ReLU(inplace=True) 15 | self.se = SELayer(planes, reduction) 16 | self.downsample = downsample 17 | self.stride = stride 18 | 19 | def forward(self, x): 20 | residual = x 21 | 22 | out = self.conv1(x) 23 | out = self.relu(out) 24 | out = self.bn1(out) 25 | 26 | out = self.conv2(out) 27 | out = self.bn2(out) 28 | out = self.se(out) 29 | 30 | if self.downsample is not None: 31 | residual = self.downsample(x) 32 | 33 | out += residual 34 | out = self.relu(out) 35 | return out 36 | 37 | 38 | class SEBottleneck(nn.Module): 39 | expansion = 4 40 | 41 | 
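    # Bottleneck variant: 1x1 reduce -> 3x3 (with stride) -> 1x1 expand to 4*planes, each
    # followed by BatchNorm, with a squeeze-and-excitation gate on the expanded output
    # before the residual addition.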
def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 42 | super(SEBottleneck, self).__init__() 43 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 44 | self.bn1 = nn.BatchNorm2d(planes) 45 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 46 | padding=1, bias=False) 47 | self.bn2 = nn.BatchNorm2d(planes) 48 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 49 | self.bn3 = nn.BatchNorm2d(planes * 4) 50 | self.relu = nn.ReLU(inplace=True) 51 | self.se = SELayer(planes * 4, reduction) 52 | self.downsample = downsample 53 | self.stride = stride 54 | 55 | def forward(self, x): 56 | residual = x 57 | 58 | out = self.conv1(x) 59 | out = self.bn1(out) 60 | out = self.relu(out) 61 | 62 | out = self.conv2(out) 63 | out = self.bn2(out) 64 | out = self.relu(out) 65 | 66 | out = self.conv3(out) 67 | out = self.bn3(out) 68 | out = self.se(out) 69 | 70 | if self.downsample is not None: 71 | residual = self.downsample(x) 72 | 73 | out += residual 74 | out = self.relu(out) 75 | 76 | return out 77 | 78 | 79 | class SELayer(nn.Module): 80 | def __init__(self, channel, reduction=8): 81 | super(SELayer, self).__init__() 82 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 83 | self.fc = nn.Sequential( 84 | nn.Linear(channel, channel // reduction), 85 | nn.ReLU(inplace=True), 86 | nn.Linear(channel // reduction, channel), 87 | nn.Sigmoid() 88 | ) 89 | 90 | def forward(self, x): 91 | b, c, _, _ = x.size() 92 | y = self.avg_pool(x).view(b, c) 93 | y = self.fc(y).view(b, c, 1, 1) 94 | return x * y -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = deepaudio-speaker 7 | description = Speaker embedding with neural networks 8 | author = Ruiqing Yin 9 | ;author-email = 10 | ;license = mit 11 | long-description = file: README.md 12 | long-description-content-type = text/markdown; charset=UTF-8; variant=GFM 13 | # Change if running only on Windows, Mac or Linux (comma-separated) 14 | platforms = Linux, Mac 15 | # Add here all kinds of additional classifiers as defined under 16 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 17 | classifiers = 18 | Development Status :: 4 - Beta 19 | Programming Language :: Python 20 | 21 | [options] 22 | zip_safe = False 23 | packages = find: 24 | include_package_data = True 25 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 26 | setup_requires = pyscaffold>=3.2a0,<3.3a0 27 | # Add here dependencies of your project (semicolon/line-separated), e.g. 28 | # install_requires = numpy; scipy 29 | # Require a specific Python version, e.g. Python 2.7 or >= 3.4 30 | python_requires = >=3.7 31 | 32 | [options.packages.find] 33 | where = . 
34 | exclude = 35 | tests 36 | 37 | [options.extras_require] 38 | # Add here additional requirements for extra features, to install with: 39 | # PDF = ReportLab; RXP 40 | # Add here test requirements (semicolon/line-separated) 41 | testing = 42 | pytest>=6.0 43 | pytest-cov>=2.10 44 | jupyter 45 | papermill 46 | dev = 47 | pre_commit>=2.7 48 | recommonmark>=0.6 49 | black>=19.10b0 50 | 51 | [options.entry_points] 52 | 53 | console_scripts = 54 | deepaudio-speaker-train=deepaudio.speaker.cli.train:main 55 | 56 | 57 | [test] 58 | # py.test options when running `python setup.py test` 59 | # addopts = --verbose 60 | extras = True 61 | 62 | [tool:pytest] 63 | # Options for py.test: 64 | # Specify command line options as you would do when invoking py.test directly. 65 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 66 | # in order to write a coverage file that can be read by Jenkins. 67 | addopts = 68 | --cov deepaudio --cov-report term-missing 69 | --verbose 70 | norecursedirs = 71 | dist 72 | build 73 | .tox 74 | testpaths = tests 75 | 76 | [aliases] 77 | dists = bdist_wheel 78 | 79 | [bdist_wheel] 80 | # Use this option if your package is pure-python 81 | universal = 1 82 | 83 | [build_sphinx] 84 | source_dir = doc 85 | build_dir = build/sphinx 86 | 87 | [devpi:upload] 88 | # Options for the devpi: PyPI server and packaging tool 89 | # VCS export must be deactivated since we are using setuptools-scm 90 | no-vcs = 1 91 | formats = bdist_wheel 92 | 93 | [flake8] 94 | # Some sane defaults for the code style checker flake8 95 | exclude = 96 | .tox 97 | build 98 | dist 99 | .eggs 100 | docs/conf.py 101 | 102 | [pyscaffold] 103 | # PyScaffold's parameters when the project was created. 104 | # This will be used when updating. Do not change! 
105 | version = 3.2.3 106 | package = deepaudio-speaker 107 | extensions = 108 | markdown 109 | no_skeleton 110 | pre_commit 111 | dsproject 112 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/samplers.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | from omegaconf import DictConfig 3 | 4 | import numpy as np 5 | from collections import defaultdict 6 | import torch 7 | import torch.distributed as dist 8 | 9 | 10 | def round_down(num, divisor): 11 | return num - (num % divisor) 12 | 13 | 14 | def worker_init_fn(worker_id): 15 | np.random.seed(np.random.get_state()[1][0] + worker_id) 16 | 17 | 18 | class ClovaaiSampler(torch.utils.data.Sampler): 19 | def __init__(self, labels, configs): 20 | 21 | self.data_label = labels 22 | self.nPerSpeaker = configs.dataset.per_speaker 23 | self.batch_size = configs.trainer.batch_size 24 | self.epoch = 0 25 | self.seed = 42 26 | self.distributed = False 27 | if configs.trainer.accelerator == 'ddp': 28 | self.distributed = True 29 | self.__iter__() 30 | 31 | def __iter__(self): 32 | 33 | g = torch.Generator() 34 | g.manual_seed(self.seed + self.epoch) 35 | indices = torch.randperm(len(self.data_label), generator=g).tolist() 36 | 37 | data_dict = defaultdict(list) 38 | 39 | # Sort into dictionary of file indices for each ID 40 | for index in indices: 41 | speaker_label = self.data_label[index] 42 | data_dict[speaker_label].append(index) 43 | 44 | dictkeys = list(data_dict.keys()) 45 | dictkeys.sort() 46 | 47 | lol = lambda lst, sz: [lst[i:i + sz] for i in range(0, len(lst), sz)] 48 | 49 | flattened_list = [] 50 | flattened_label = [] 51 | 52 | for findex, key in enumerate(dictkeys): 53 | data = data_dict[key] 54 | numSeg = round_down(len(data), self.nPerSpeaker) 55 | 56 | rp = lol(np.arange(numSeg), self.nPerSpeaker) 57 | flattened_label.extend([findex] * (len(rp))) 58 | for indices in rp: 59 | flattened_list.append([data[i] for i in indices]) 60 | 61 | ## Mix data in random order 62 | mixid = torch.randperm(len(flattened_label), generator=g).tolist() 63 | mixlabel = [] 64 | mixmap = [] 65 | 66 | ## Prevent two pairs of the same speaker in the same batch 67 | for ii in mixid: 68 | startbatch = round_down(len(mixlabel), self.batch_size) 69 | if flattened_label[ii] not in mixlabel[startbatch:]: 70 | mixlabel.append(flattened_label[ii]) 71 | mixmap.append(ii) 72 | 73 | mixed_list = [flattened_list[i] for i in mixmap] 74 | 75 | ## Divide data to each GPU 76 | if self.distributed: 77 | total_size = round_down(len(mixed_list), self.batch_size * dist.get_world_size()) 78 | start_index = int((dist.get_rank()) / dist.get_world_size() * total_size) 79 | end_index = int((dist.get_rank() + 1) / dist.get_world_size() * total_size) 80 | self.num_samples = end_index - start_index 81 | return iter(mixed_list[start_index:end_index]) 82 | else: 83 | total_size = round_down(len(mixed_list), self.batch_size) 84 | self.num_samples = total_size 85 | return iter(mixed_list[:total_size]) 86 | 87 | def __len__(self) -> int: 88 | return self.num_samples 89 | 90 | def set_epoch(self, epoch: int) -> None: 91 | self.epoch = epoch 92 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/speaker_embedding_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from pytorch_lightning.utilities.cloud_io import load 
as pl_load 5 | 6 | from .speaker_model import SpeakerModel 7 | from . import MODEL_REGISTRY 8 | 9 | 10 | class SpeakerEmbeddingModel(SpeakerModel): 11 | def __init__(self, configs, num_classes): 12 | super(SpeakerEmbeddingModel, self).__init__(configs, num_classes) 13 | 14 | def forward(self, inputs: torch.FloatTensor) -> Tensor: 15 | return self.model(inputs) 16 | 17 | def training_step(self, batch: tuple, batch_idx: int): 18 | if self.configs.criterion.name in ['adaptive_aamsoftmax', 19 | 'adaptive_subcenter_aamsoftmax'] and self.global_step == 0: 20 | self.log( 21 | "val_loss", 22 | 15, 23 | on_step=True, 24 | on_epoch=False, 25 | prog_bar=True, 26 | logger=True, 27 | ) 28 | if self.configs.criterion.name in ['adaptive_aamsoftmax', 'adaptive_subcenter_aamsoftmax']: 29 | self.criterion.step(self.global_step) 30 | self.log( 31 | "margin", 32 | self.criterion.classifier_.margin, 33 | on_step=True, 34 | on_epoch=False, 35 | prog_bar=True, 36 | logger=True, 37 | ) 38 | X = batch['X'] 39 | y = batch['y'] 40 | embeddings = self.forward(X) 41 | loss = self.criterion(embeddings, y) 42 | return { 43 | 'loss': loss 44 | } 45 | 46 | def validation_step(self, batch: tuple, batch_idx: int): 47 | X = batch['X'] 48 | y = batch['y'] 49 | embeddings = self.forward(X) 50 | loss = self.criterion(embeddings, y) 51 | self.log( 52 | "val_loss", 53 | loss, 54 | on_step=False, 55 | on_epoch=True, 56 | prog_bar=True, 57 | logger=True, 58 | ) 59 | return { 60 | 'val_loss': loss 61 | } 62 | 63 | def on_save_checkpoint(self, checkpoint): 64 | checkpoint["configs"] = self.configs 65 | checkpoint["num_classes"] = self.num_classes 66 | 67 | @classmethod 68 | def from_pretrained(cls, path_for_pl, 69 | map_location=None, 70 | strict=False, configs=None): 71 | loaded_checkpoint = pl_load(path_for_pl, map_location=map_location) 72 | model_name: str = loaded_checkpoint["configs"].model.name 73 | num_classes = loaded_checkpoint["num_classes"] 74 | if configs is not None: 75 | new_configs = configs 76 | else: 77 | new_configs = loaded_checkpoint["configs"] 78 | Klass = MODEL_REGISTRY[model_name] 79 | return Klass.load_from_checkpoint( 80 | path_for_pl, 81 | map_location=map_location, 82 | strict=strict, 83 | configs=new_configs, 84 | num_classes=num_classes 85 | ) 86 | 87 | def to_torchscript(self, filepath): 88 | script = torch.jit.script(self.model) 89 | torch.jit.save(script, filepath) 90 | 91 | def make_embedding(self, feature): 92 | if self.model.training: 93 | self.model = self.model.eval() 94 | return self.model(feature).cpu().detach().numpy() 95 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/pyannote_aamsoftmax/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | 9 | from .. 
import register_criterion 10 | from .configuration import PyannoteAAMSoftmaxConfigs 11 | 12 | 13 | class ArcLinear(nn.Module): 14 | """Additive Angular Margin classification module 15 | Parameters 16 | ---------- 17 | nfeat : int 18 | Embedding dimension 19 | nclass : int 20 | Number of classes 21 | margin : float 22 | Angular margin to penalize distances between embeddings and centers 23 | scale : float 24 | Scaling factor for the logits 25 | """ 26 | 27 | def __init__(self, nfeat, nclass, margin, scale): 28 | super(ArcLinear, self).__init__() 29 | eps = 1e-4 30 | self.min_cos = eps - 1 31 | self.max_cos = 1 - eps 32 | self.nclass = nclass 33 | self.margin = margin 34 | self.scale = scale 35 | self.W = nn.Parameter(Tensor(nclass, nfeat)) 36 | nn.init.xavier_uniform_(self.W) 37 | 38 | def forward(self, x, target=None): 39 | """Apply the angular margin transformation 40 | Parameters 41 | ---------- 42 | x : `torch.Tensor` 43 | an embedding batch 44 | target : `torch.Tensor` 45 | a non one-hot label batch 46 | Returns 47 | ------- 48 | fX : `torch.Tensor` 49 | logits after the angular margin transformation 50 | """ 51 | # normalize the feature vectors and W 52 | xnorm = F.normalize(x) 53 | Wnorm = F.normalize(self.W) 54 | target = target.long().view(-1, 1) 55 | # calculate cosθj (the logits) 56 | cos_theta_j = torch.matmul(xnorm, torch.transpose(Wnorm, 0, 1)) 57 | # get the cosθ corresponding to the classes 58 | cos_theta_yi = cos_theta_j.gather(1, target) 59 | # for numerical stability 60 | cos_theta_yi = cos_theta_yi.clamp(min=self.min_cos, max=self.max_cos) 61 | # get the angle separating xi and Wyi 62 | theta_yi = torch.acos(cos_theta_yi) 63 | # apply the margin to the angle 64 | cos_theta_yi_margin = torch.cos(theta_yi + self.margin) 65 | # one hot encode y 66 | one_hot = torch.zeros_like(cos_theta_j) 67 | one_hot.scatter_(1, target, 1.0) 68 | # project margin differences into cosθj 69 | return self.scale * (cos_theta_j + one_hot * (cos_theta_yi_margin - cos_theta_yi)) 70 | 71 | @register_criterion("pyannote_aamsoftmax", dataclass=PyannoteAAMSoftmaxConfigs) 72 | class PyannoteAAMSoftmax(nn.Module): 73 | def __init__(self, 74 | configs: DictConfig, 75 | num_classes: int, 76 | embedding_size: int 77 | ) -> None: 78 | super(PyannoteAAMSoftmax, self).__init__() 79 | self.configs=configs 80 | self.classifier_ = ArcLinear( 81 | nfeat=self.configs.model.embed_dim, 82 | nclass=num_classes, 83 | margin=configs.criterion.margin, 84 | scale=configs.criterion.scale 85 | ) 86 | self.logsoftmax_ = nn.LogSoftmax(dim=1) 87 | self.loss_ = nn.NLLLoss() 88 | 89 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 90 | logits = self.logsoftmax_(self.classifier_(embeddings, target=targets)) 91 | return self.loss_(logits, targets) 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/lit_data_module.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import random 4 | from omegaconf import DictConfig 5 | import pytorch_lightning as pl 6 | from torch.utils.data import DataLoader 7 | 8 | from deepaudio.speaker.data.dataset import SpeakerAudioDataset 9 | from deepaudio.speaker.data.dataloader import SpeakerUttDataLoader 10 | from deepaudio.speaker.data.samplers import ClovaaiSampler 11 | 12 | from .utils import get_dataset_items, SpeakerDataframe, split_segment 13 | from .. 
import register_data_module 14 | 15 | 16 | @register_data_module('dataframe') 17 | class LightningDataframeDataModule(pl.LightningDataModule): 18 | def __init__(self, configs: DictConfig): 19 | super(LightningDataframeDataModule, self).__init__() 20 | self.configs = configs 21 | 22 | def prepare_data(self): 23 | dataset_items = get_dataset_items(self.configs.dataset.database_yml, 24 | self.configs.dataset.dataset_name) 25 | dataset = SpeakerDataframe(dataset_items) 26 | speaker2items = dataset.speaker2items 27 | spk2ids = dataset.spk2ids 28 | self.num_classes = len(spk2ids) 29 | self.train_utts, self.valid_utts = self._split_train_valid(speaker2items, spk2ids) 30 | 31 | def setup(self, stage: Optional[str] = None) -> None: 32 | self.train_dataset = SpeakerAudioDataset(self.configs, self.train_utts) 33 | self.valid_dataset = SpeakerAudioDataset(self.configs, self.valid_utts) 34 | 35 | def train_dataloader(self) -> DataLoader: 36 | if self.configs.dataset.sampler == 'clovaai': 37 | sampler = ClovaaiSampler(self.train_dataset.labels) 38 | else: 39 | sampler = None 40 | return SpeakerUttDataLoader( 41 | dataset=self.train_dataset, 42 | num_workers=self.configs.trainer.num_workers, 43 | min_num_frames=self.configs.model.min_num_frames, 44 | max_num_frames=self.configs.model.max_num_frames, 45 | batch_size=self.configs.trainer.batch_size, 46 | shuffle=True, 47 | sampler=sampler 48 | ) 49 | 50 | def val_dataloader(self) -> DataLoader: 51 | return SpeakerUttDataLoader( 52 | dataset=self.valid_dataset, 53 | num_workers=self.configs.trainer.num_workers, 54 | min_num_frames=self.configs.model.min_num_frames, 55 | max_num_frames=self.configs.model.max_num_frames, 56 | batch_size=self.configs.trainer.batch_size 57 | ) 58 | 59 | def _spk2wav_utts(self, speaker2items, spk2ids): 60 | utts = [] 61 | for spk in speaker2items: 62 | for item in speaker2items[spk]: 63 | wav, spk, seg = item 64 | utts.append((str(wav), spk2ids[spk], seg)) 65 | if self.configs.dataset.exhaustive: 66 | for subseg in split_segment(seg, 67 | self.configs.dataset.duration, 68 | self.configs.dataset.step): 69 | utts.append((str(wav), spk2ids[spk], subseg)) 70 | random.shuffle(utts) 71 | return utts 72 | 73 | def _split_train_valid(self, speaker2items, spk2ids): 74 | valid_spk2item = {} 75 | for spk in speaker2items: 76 | random.shuffle(speaker2items[spk]) 77 | valid_spk2item[spk] = [speaker2items[spk].pop(0)] 78 | train_utts = self._spk2wav_utts(speaker2items, spk2ids) 79 | valid_utts = self._spk2wav_utts(valid_spk2item, spk2ids) 80 | return train_utts, valid_utts 81 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/subcenter_aamsoftmax/subcenter_aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | from .. 
import register_criterion 9 | from .configuration import SubcenterAAMSoftmaxConfigs 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch.nn import Parameter 15 | import math 16 | 17 | 18 | class SubcenterArcMarginProduct(nn.Module): 19 | r"""Modified implementation from https://github.com/ronghuaiyang/arcface-pytorch/blob/47ace80b128042cd8d2efd408f55c5a3e156b032/models/metrics.py#L10 20 | """ 21 | 22 | def __init__(self, in_features, out_features, K=3, s=30.0, m=0.50, easy_margin=False): 23 | super(SubcenterArcMarginProduct, self).__init__() 24 | self.in_features = in_features 25 | self.out_features = out_features 26 | self.scale = s 27 | self.margin = m 28 | self.K = K 29 | self.weight = Parameter(torch.FloatTensor(out_features * self.K, in_features)) 30 | nn.init.xavier_uniform_(self.weight) 31 | 32 | self.easy_margin = easy_margin 33 | 34 | 35 | def forward(self, input, target): 36 | self.cos_m = math.cos(self.margin) 37 | self.sin_m = math.sin(self.margin) 38 | self.th = math.cos(math.pi - self.margin) 39 | self.mm = math.sin(math.pi - self.margin) * self.margin 40 | # --------------------------- cos(theta) & phi(theta) --------------------------- 41 | cosine = F.linear(F.normalize(input), F.normalize(self.weight)) 42 | 43 | if self.K > 1: 44 | cosine = torch.reshape(cosine, (-1, self.out_features, self.K)) 45 | cosine, _ = torch.max(cosine, axis=2) 46 | 47 | sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1)) 48 | # cos(phi+m) 49 | phi = cosine * self.cos_m - sine * self.sin_m 50 | 51 | if self.easy_margin: 52 | phi = torch.where(cosine > 0, phi, cosine) 53 | else: 54 | phi = torch.where(cosine > self.th, phi, cosine - self.mm) 55 | 56 | # --------------------------- convert label to one-hot --------------------------- 57 | # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda') 58 | one_hot = torch.zeros(cosine.size(), device=input.device) 59 | one_hot.scatter_(1, target.view(-1, 1).long(), 1) 60 | # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------- 61 | output = (one_hot * phi) + ( 62 | (1.0 - one_hot) * cosine) # you can use torch.where if your torch.__version__ is 0.4 63 | output *= self.scale 64 | 65 | return output 66 | 67 | 68 | @register_criterion("subcenter_aamsoftmax", dataclass=SubcenterAAMSoftmaxConfigs) 69 | class PyannoteAAMSoftmax(nn.Module): 70 | def __init__(self, 71 | configs: DictConfig, 72 | num_classes: int, 73 | embedding_size: int 74 | ) -> None: 75 | super(PyannoteAAMSoftmax, self).__init__() 76 | self.configs = configs 77 | self.classifier_ = SubcenterArcMarginProduct( 78 | in_features=self.configs.model.embed_dim, 79 | out_features=num_classes, 80 | K=configs.model.criterion.K, 81 | m=configs.criterion.margin, 82 | s=configs.criterion.scale 83 | ) 84 | self.loss_ = nn.CrossEntropyLoss() 85 | 86 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 87 | logits = self.classifier_(embeddings, target=targets) 88 | return self.loss_(logits, targets) 89 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software
without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import torch 24 | from dataclasses import dataclass, field 25 | from typing import Optional 26 | from omegaconf import DictConfig 27 | from torch.optim import Optimizer 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | 33 | 34 | @dataclass 35 | class WarmupLRSchedulerConfigs(LearningRateSchedulerConfigs): 36 | scheduler_name: str = field( 37 | default="warmup", metadata={"help": "Name of learning rate scheduler."} 38 | ) 39 | peak_lr: float = field( 40 | default=1e-04, metadata={"help": "Maximum learning rate."} 41 | ) 42 | init_lr: float = field( 43 | default=1e-7, metadata={"help": "Initial learning rate."} 44 | ) 45 | warmup_steps: int = field( 46 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 47 | ) 48 | total_steps: int = field( 49 | default=200000, metadata={"help": "Total training steps."} 50 | ) 51 | 52 | 53 | @register_scheduler("warmup", dataclass=WarmupLRSchedulerConfigs) 54 | class WarmupLRScheduler(LearningRateScheduler): 55 | """ 56 | Warmup learning rate until `total_steps` 57 | 58 | Args: 59 | optimizer (Optimizer): wrapped optimizer. 60 | configs (DictConfig): configuration set. 
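    Example::
        With the defaults above (init_lr=1e-07, peak_lr=1e-04, warmup_steps=4000), the
        learning rate grows linearly from 1e-07 towards 1e-04 over the first 4000
        scheduler steps and then holds the last value reached.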
61 | """ 62 | def __init__( 63 | self, 64 | optimizer: Optimizer, 65 | configs: DictConfig, 66 | ) -> None: 67 | super(WarmupLRScheduler, self).__init__(optimizer, configs.lr_scheduler.init_lr) 68 | if configs.lr_scheduler.warmup_steps != 0: 69 | warmup_rate = configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr 70 | self.warmup_rate = warmup_rate / configs.lr_scheduler.warmup_steps 71 | else: 72 | self.warmup_rate = 0 73 | self.update_steps = 1 74 | self.lr = configs.lr_scheduler.init_lr 75 | self.warmup_steps = configs.lr_scheduler.warmup_steps 76 | 77 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 78 | if self.update_steps < self.warmup_steps: 79 | lr = self.init_lr + self.warmup_rate * self.update_steps 80 | self.set_lr(self.optimizer, lr) 81 | self.lr = lr 82 | self.update_steps += 1 83 | return self.lr 84 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/optimizer.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import torch 24 | 25 | from deepaudio.speaker.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler 26 | from deepaudio.speaker.optim.scheduler.warmup_reduce_lr_on_plateau_scheduler import WarmupReduceLROnPlateauScheduler 27 | 28 | 29 | class Optimizer(object): 30 | """ 31 | This is wrapper classs of torch.optim.Optimizer. 32 | This class provides functionalities for learning rate scheduling and gradient norm clipping. 33 | 34 | Args: 35 | optim (torch.optim.Optimizer): optimizer object, the parameters to be optimized 36 | should be given when instantiating the object, e.g. 
torch.optim.Adam, torch.optim.SGD 37 | scheduler (openspeech.optim.scheduler, optional): learning rate scheduler 38 | scheduler_period (int, optional): timestep with learning rate scheduler 39 | max_grad_norm (int, optional): value used for gradient norm clipping 40 | """ 41 | def __init__(self, optim, scheduler=None, scheduler_period=None, max_grad_norm=0): 42 | self.optimizer = optim 43 | self.scheduler = scheduler 44 | self.scheduler_period = scheduler_period 45 | self.max_grad_norm = max_grad_norm 46 | self.count = 0 47 | 48 | def step(self, model): 49 | if self.max_grad_norm > 0: 50 | torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) 51 | self.optimizer.step() 52 | 53 | if self.scheduler is not None: 54 | self.update() 55 | self.count += 1 56 | 57 | if self.scheduler_period == self.count: 58 | self.scheduler = None 59 | self.scheduler_period = 0 60 | self.count = 0 61 | 62 | def set_scheduler(self, scheduler, scheduler_period): 63 | self.scheduler = scheduler 64 | self.scheduler_period = scheduler_period 65 | self.count = 0 66 | 67 | def update(self, val_loss=None): 68 | if isinstance(self.scheduler, ReduceLROnPlateauScheduler) \ 69 | or isinstance(self.scheduler, WarmupReduceLROnPlateauScheduler): 70 | self.scheduler.step(val_loss) 71 | else: 72 | self.scheduler.step() 73 | 74 | def zero_grad(self): 75 | self.optimizer.zero_grad() 76 | 77 | def get_lr(self): 78 | for g in self.optimizer.param_groups: 79 | return g['lr'] 80 | 81 | def set_lr(self, lr): 82 | for g in self.optimizer.param_groups: 83 | g['lr'] = lr 84 | -------------------------------------------------------------------------------- /deepaudio/speaker/data/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | from omegaconf import DictConfig 3 | 4 | import numpy as np 5 | 6 | from torch import Tensor 7 | from torch.utils.data import Dataset 8 | 9 | from pyannote.core import Segment, Timeline 10 | 11 | from deepaudio.speaker.data.audio_io.with_torchaudio import Audio 12 | from deepaudio.speaker.data.augmentation.noise import Noise, NoiseReverb, Reverb 13 | from deepaudio.speaker.data.augmentation.spec_augment import SpecAugment 14 | from deepaudio.speaker.data.feature import AUDIO_FEATURE_TRANSFORM_REGISTRY 15 | 16 | 17 | class SpeakerAudioDataset(Dataset): 18 | NONE_AUGMENT = 0 19 | NOISE_AUGMENT = 1 20 | REVERB_AUGMENT = 2 21 | NOISE_REVERB_AUGMENT = 3 22 | SPEC_AUGMENT = 4 23 | 24 | def __init__( 25 | self, 26 | configs: DictConfig, 27 | utts: List, 28 | ) -> None: 29 | super(SpeakerAudioDataset, self).__init__() 30 | self.configs = configs 31 | self.utts = utts 32 | self.labels = [utt[1] for utt in utts] 33 | self.audio = Audio() 34 | self.feature_extractor = AUDIO_FEATURE_TRANSFORM_REGISTRY[configs.feature.name](configs) 35 | self.augmentations = [self.NONE_AUGMENT] 36 | weights = [1] 37 | if self.configs.augment.apply_noise_augment: 38 | self._noise_augmentor = Noise(configs) 39 | self.augmentations.append(self.NOISE_AUGMENT) 40 | weights.append(self.configs.augment.noise_augment_weight) 41 | 42 | if self.configs.augment.apply_reverb_augment: 43 | self._reverb_augmentor = Reverb(configs) 44 | self.augmentations.append(self.REVERB_AUGMENT) 45 | weights.append(self.configs.augment.reverb_augment_weight) 46 | if self.configs.augment.apply_noise_reverb_augment: 47 | self._noise_reverb_augmentor = NoiseReverb(configs) 48 | self.augmentations.append(self.NOISE_REVERB_AUGMENT) 49 | 
weights.append(self.configs.augment.noise_reverb_augment_weight) 50 | if self.configs.augment.apply_spec_augment: 51 | self._spec_augmentor = SpecAugment(configs) 52 | self.augmentations.append(self.SPEC_AUGMENT) 53 | weights.append(self.configs.augment.specaugment_weight) 54 | self.augmentations_prob = [float(i) / sum(weights) for i in weights] 55 | 56 | def _parse_audio(self, audio_path: str, augment: int = None, vad: Union[Segment, Timeline] = None) -> Tensor: 57 | if vad is not None: 58 | waveform, _ = self.audio.crop(audio_path, vad) 59 | else: 60 | waveform, _ = self.audio(audio_path) 61 | if augment == self.NOISE_AUGMENT: 62 | waveform = self._noise_augmentor(waveform) 63 | if augment == self.REVERB_AUGMENT: 64 | waveform = self._reverb_augmentor(waveform) 65 | if augment == self.NOISE_REVERB_AUGMENT: 66 | waveform = self._noise_reverb_augmentor(waveform) 67 | feature = self.feature_extractor(waveform) 68 | if augment == self.SPEC_AUGMENT: 69 | feature = self._spec_augmentor(feature) 70 | return feature.squeeze(0) 71 | 72 | def __getitem__(self, idxs): 73 | if isinstance(idxs, int): 74 | idxs = [idxs] 75 | features = [] 76 | speaker_ids = [] 77 | for idx in idxs: 78 | wav, speaker_id, vad = self.utts[idx] 79 | augment = np.random.choice(self.augmentations, p=self.augmentations_prob) 80 | feature = self._parse_audio(wav, augment, vad) 81 | features.append(feature) 82 | speaker_ids.append(speaker_id) 83 | return features, speaker_ids 84 | 85 | def __len__(self): 86 | return len(self.utts) 87 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/reduce_lr_on_plateau_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
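# Note (illustrative worked example for the scheduler below, with assumed values
# lr=1e-3, lr_factor=0.9, lr_patience=1, tolr=0.01): an epoch whose validation
# loss improves by at least tolr resets the counter and leaves the rate at 1e-3,
# while an epoch that fails to improve by at least tolr increments the counter;
# once the counter reaches lr_patience the rate is multiplied by lr_factor,
# i.e. 1e-3 * 0.9 = 9e-4.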
22 | 23 | from dataclasses import dataclass, field 24 | from omegaconf import DictConfig 25 | from torch.optim import Optimizer 26 | from torch.optim.lr_scheduler import ReduceLROnPlateau 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | 33 | 34 | @dataclass 35 | class ReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): 36 | scheduler_name: str = field( 37 | default="reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} 38 | ) 39 | lr_patience: int = field( 40 | default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} 41 | ) 42 | lr_factor: float = field( 43 | default=0.9, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 44 | ) 45 | tolr: float = field( 46 | default=0.01, metadata={"help": "Tolr for loss."} 47 | ) 48 | 49 | 50 | @register_scheduler("reduce_lr_on_plateau", dataclass=ReduceLROnPlateauConfigs) 51 | class ReduceLROnPlateauScheduler(LearningRateScheduler, ReduceLROnPlateau): 52 | r""" 53 | Reduce learning rate when a metric has stopped improving. Models often benefit from reducing the learning rate by 54 | a factor of 2-10 once learning stagnates. This scheduler reads a metrics quantity and if no improvement is seen 55 | for a ‘patience’ number of epochs, the learning rate is reduced. 56 | 57 | Args: 58 | optimizer (Optimizer): wrapped optimizer. 59 | configs (DictConfig): configuration set. 60 | """ 61 | def __init__( 62 | self, 63 | optimizer: Optimizer, 64 | configs: DictConfig, 65 | ) -> None: 66 | super(ReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 67 | self.lr = configs.lr_scheduler.lr 68 | self.lr_patience = configs.lr_scheduler.lr_patience 69 | self.lr_factor = configs.lr_scheduler.lr_factor 70 | self.tolr = configs.lr_scheduler.tolr 71 | self.val_loss = 100.0 72 | self.count = 0 73 | 74 | def step(self, val_loss: Optional[float] = None): 75 | if val_loss is not None: 76 | if self.val_loss < val_loss+self.tolr: 77 | self.count += 1 78 | self.val_loss = val_loss 79 | else: 80 | self.count = 0 81 | self.val_loss = val_loss 82 | 83 | if self.lr_patience == self.count: 84 | self.count = 0 85 | self.lr *= self.lr_factor 86 | self.set_lr(self.optimizer, self.lr) 87 | 88 | return self.lr 89 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/adamp.py: -------------------------------------------------------------------------------- 1 | # AdamP 2 | # Copyright (c) 2020-present NAVER Corp. 3 | # MIT license 4 | 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | import math 8 | 9 | 10 | class AdamP(Optimizer): 11 | """ 12 | Paper: "AdamP: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights" 13 | 14 | Copied from https://github.com/clovaai/AdamP/ 15 | Copyright (c) 2020 Naver Corp. 
16 | MIT License 17 | """ 18 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 19 | weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False): 20 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 21 | delta=delta, wd_ratio=wd_ratio, nesterov=nesterov) 22 | super(AdamP, self).__init__(params, defaults) 23 | 24 | def _channel_view(self, x): 25 | return x.view(x.size(0), -1) 26 | 27 | def _layer_view(self, x): 28 | return x.view(1, -1) 29 | 30 | def _cosine_similarity(self, x, y, eps, view_func): 31 | x = view_func(x) 32 | y = view_func(y) 33 | 34 | x_norm = x.norm(dim=1).add_(eps) 35 | y_norm = y.norm(dim=1).add_(eps) 36 | dot = (x * y).sum(dim=1) 37 | 38 | return dot.abs() / x_norm / y_norm 39 | 40 | def _projection(self, p, grad, perturb, delta, wd_ratio, eps): 41 | wd = 1 42 | expand_size = [-1] + [1] * (len(p.shape) - 1) 43 | for view_func in [self._channel_view, self._layer_view]: 44 | 45 | cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) 46 | 47 | if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): 48 | p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) 49 | perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) 50 | wd = wd_ratio 51 | 52 | return perturb, wd 53 | 54 | return perturb, wd 55 | 56 | def step(self, closure=None): 57 | loss = None 58 | if closure is not None: 59 | loss = closure() 60 | 61 | for group in self.param_groups: 62 | for p in group['params']: 63 | if p.grad is None: 64 | continue 65 | 66 | grad = p.grad.data 67 | beta1, beta2 = group['betas'] 68 | nesterov = group['nesterov'] 69 | 70 | state = self.state[p] 71 | 72 | # State initialization 73 | if len(state) == 0: 74 | state['step'] = 0 75 | state['exp_avg'] = torch.zeros_like(p.data) 76 | state['exp_avg_sq'] = torch.zeros_like(p.data) 77 | 78 | # Adam 79 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 80 | 81 | state['step'] += 1 82 | bias_correction1 = 1 - beta1 ** state['step'] 83 | bias_correction2 = 1 - beta2 ** state['step'] 84 | 85 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 86 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 87 | 88 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 89 | step_size = group['lr'] / bias_correction1 90 | 91 | if nesterov: 92 | perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom 93 | else: 94 | perturb = exp_avg / denom 95 | 96 | # Projection 97 | wd_ratio = 1 98 | if len(p.shape) > 1: 99 | perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], 100 | group['eps']) 101 | 102 | # Weight decay 103 | if group['weight_decay'] > 0: 104 | p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio) 105 | 106 | # Step 107 | p.data.add_(-step_size, perturb) 108 | 109 | return loss 110 | -------------------------------------------------------------------------------- /deepaudio/speaker/criterion/adaptive_aamsoftmax/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from omegaconf import DictConfig 7 | 8 | from .. 
import register_criterion 9 | from .configuration import AdaptiveAAMSoftmaxConfigs 10 | 11 | 12 | class ArcLinear(nn.Module): 13 | """Additive Angular Margin classification module 14 | Parameters 15 | ---------- 16 | nfeat : int 17 | Embedding dimension 18 | nclass : int 19 | Number of classes 20 | margin : float 21 | Angular margin to penalize distances between embeddings and centers 22 | scale : float 23 | Scaling factor for the logits 24 | """ 25 | 26 | def __init__(self, nfeat, nclass, margin, scale): 27 | super(ArcLinear, self).__init__() 28 | eps = 1e-4 29 | self.min_cos = eps - 1 30 | self.max_cos = 1 - eps 31 | self.nclass = nclass 32 | self.margin = margin 33 | self.scale = scale 34 | self.W = nn.Parameter(Tensor(nclass, nfeat)) 35 | nn.init.xavier_uniform_(self.W) 36 | 37 | def forward(self, x, target=None): 38 | """Apply the angular margin transformation 39 | Parameters 40 | ---------- 41 | x : `torch.Tensor` 42 | an embedding batch 43 | target : `torch.Tensor` 44 | a non one-hot label batch 45 | Returns 46 | ------- 47 | fX : `torch.Tensor` 48 | logits after the angular margin transformation 49 | """ 50 | # normalize the feature vectors and W 51 | xnorm = F.normalize(x) 52 | Wnorm = F.normalize(self.W) 53 | target = target.long().view(-1, 1) 54 | # calculate cosθj (the logits) 55 | cos_theta_j = torch.matmul(xnorm, torch.transpose(Wnorm, 0, 1)) 56 | # get the cosθ corresponding to the classes 57 | cos_theta_yi = cos_theta_j.gather(1, target) 58 | # for numerical stability 59 | cos_theta_yi = cos_theta_yi.clamp(min=self.min_cos, max=self.max_cos) 60 | # get the angle separating xi and Wyi 61 | theta_yi = torch.acos(cos_theta_yi) 62 | # apply the margin to the angle 63 | cos_theta_yi_margin = torch.cos(theta_yi + self.margin) 64 | # one hot encode y 65 | one_hot = torch.zeros_like(cos_theta_j) 66 | one_hot.scatter_(1, target, 1.0) 67 | # project margin differences into cosθj 68 | return self.scale * (cos_theta_j + one_hot * (cos_theta_yi_margin - cos_theta_yi)) 69 | 70 | 71 | @register_criterion("adaptive_aamsoftmax", dataclass=AdaptiveAAMSoftmaxConfigs) 72 | class AdaptiveAAMSoftmax(nn.Module): 73 | def __init__(self, 74 | configs: DictConfig, 75 | num_classes: int, 76 | embedding_size: int 77 | ) -> None: 78 | super(AdaptiveAAMSoftmax, self).__init__() 79 | self.configs = configs 80 | self.classifier_ = ArcLinear( 81 | nfeat=self.configs.model.embed_dim, 82 | nclass=num_classes, 83 | margin=configs.criterion.margin, 84 | scale=configs.criterion.scale 85 | ) 86 | self.margin = configs.criterion.margin 87 | self.logsoftmax_ = nn.LogSoftmax(dim=1) 88 | self.loss_ = nn.NLLLoss() 89 | self.warmup_steps = configs.lr_scheduler.warmup_steps if configs.lr_scheduler.scheduler_name.startswith('warmup') else 0 90 | self.increase_steps = configs.criterion.increase_steps 91 | self.increase_rate = self.margin / (self.increase_steps - self.warmup_steps) 92 | 93 | def step(self, global_steps): 94 | if global_steps < self.warmup_steps: 95 | self.classifier_.margin = 0 96 | elif global_steps < self.increase_steps: 97 | self.classifier_.margin = (global_steps - self.warmup_steps) * self.increase_rate 98 | else: 99 | self.classifier_.margin = self.margin 100 | 101 | def forward(self, embeddings: Tensor, targets: Tensor) -> Tensor: 102 | logits = self.logsoftmax_(self.classifier_(embeddings, target=targets)) 103 | return self.loss_(logits, targets) 104 | -------------------------------------------------------------------------------- /deepaudio/speaker/datasets/dataframe/utils.py: 
-------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import pandas as pd 6 | import yaml 7 | 8 | from pyannote.core import Segment, Timeline, SlidingWindow 9 | 10 | 11 | def load_dataframe(wav_dir, table_path): 12 | df = pd.read_csv(table_path, header=None, delimiter=' ') 13 | df[0] = df[0].apply(lambda x: Path(wav_dir) / f'{x}.wav') 14 | return df 15 | 16 | 17 | def load_trial_dataframe(wav_dir, table_path): 18 | df = pd.read_csv(table_path, header=None, delimiter=' ') 19 | df[1] = df[1].apply(lambda x: Path(wav_dir) / f'{x}') 20 | df[2] = df[2].apply(lambda x: Path(wav_dir) / f'{x}') 21 | trials = [] 22 | for row in df.iterrows(): 23 | y, enroll, test = row[1] 24 | trials.append((enroll, test, y)) 25 | return trials 26 | 27 | 28 | def get_speaker_from_dataframe(dataframe): 29 | return set(dataframe[3]) 30 | 31 | 32 | def get_spk_id(speakers): 33 | sorted_speakers = sorted(list(speakers)) 34 | return {spk: i for i, spk in enumerate(sorted_speakers)} 35 | 36 | 37 | def split_segment(segment, duration, step): 38 | if segment.duration < duration + step: 39 | return Timeline([segment]) 40 | else: 41 | segs = [] 42 | sw = SlidingWindow(start=segment.start, duration=duration, step=step) 43 | for s in sw: 44 | if s in segment: 45 | segs.append(s) 46 | else: 47 | break 48 | if s.start < segment.end < s.end: 49 | segs.append(Segment(segment.end - duration, segment.end)) 50 | return Timeline(segs) 51 | 52 | 53 | def get_dataset_items(database_yml, dataset_names, category='train'): 54 | dataset_items = [] 55 | dataset_names = dataset_names.split(',') 56 | dataset_names = [n.strip() for n in dataset_names] 57 | with open(database_yml) as fp: 58 | dataset = yaml.load(fp, Loader=yaml.FullLoader) 59 | for name in dataset_names: 60 | dataset_items.append(get_dataset_item(dataset, name, category)) 61 | return dataset_items 62 | 63 | 64 | def get_dataset_item(dataset, name, category='train'): 65 | dataset_item = dataset['Datasets']['SpeakerDataset'][category].get(name, None) 66 | if dataset_item is None: 67 | msg = f'{name} does not exist' 68 | raise ValueError(msg) 69 | return dataset_item['wav_dir'], dataset_item['list_path'] 70 | 71 | 72 | class SpeakerDataframe: 73 | def __init__(self, dataset_items, 74 | strict=False, 75 | segment_min_duration=0, 76 | speaker_min_duration=0): 77 | self.strict = strict 78 | self.segment_min_duration = segment_min_duration 79 | self.speaker_min_duration = speaker_min_duration 80 | dfs = [load_dataframe(*item) for item in dataset_items] 81 | self.check_speakers(dfs) 82 | self.load_speaker2items(dfs) 83 | 84 | def check_speakers(self, dataframes): 85 | all_spks = [get_speaker_from_dataframe(df) for df in dataframes] 86 | if len(all_spks) > 1 and len(set.intersection(*all_spks)) > 0: 87 | msg = 'Different datasets contain same speakers' 88 | if self.strict: 89 | raise ValueError(msg) 90 | else: 91 | warnings.warn(msg) 92 | 93 | def load_speaker2items(self, dataframes): 94 | self._speaker2items = defaultdict(list) 95 | self.spk2duration = defaultdict(int) 96 | for df in dataframes: 97 | for _, row in df.iterrows(): 98 | wav, start, end, spk = row 99 | if (end - start) < self.segment_min_duration: 100 | continue 101 | self._speaker2items[spk].append((wav, spk, Segment(start, end))) 102 | self.spk2duration[spk] += end - start 103 | 104 | for spk in self.spk2duration: 105 | if self.spk2duration[spk] < self.speaker_min_duration: 106 | 
self._speaker2items.pop(spk) 107 | 108 | self._spk_ids = get_spk_id(self._speaker2items.keys()) 109 | 110 | @property 111 | def spk2ids(self): 112 | return self._spk_ids 113 | 114 | @property 115 | def speaker2items(self): 116 | return self._speaker2items 117 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/ResNetSE34V2.py: -------------------------------------------------------------------------------- 1 | from deepaudio.speaker.modules.backbones.clovaai.ResNetBlocks import * 2 | 3 | 4 | class ResNetSE(nn.Module): 5 | def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs): 6 | super(ResNetSE, self).__init__() 7 | 8 | print('Embedding size is %d, encoder %s.' % (nOut, encoder_type)) 9 | 10 | self.inplanes = num_filters[0] 11 | self.encoder_type = encoder_type 12 | self.n_mels = n_mels 13 | self.log_input = log_input 14 | 15 | self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 18 | 19 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 20 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 21 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 22 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(2, 2)) 23 | 24 | outmap_size = int(self.n_mels / 8) 25 | 26 | self.attention = nn.Sequential( 27 | nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), 28 | nn.ReLU(), 29 | nn.BatchNorm1d(128), 30 | nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), 31 | nn.Softmax(dim=2), 32 | ) 33 | 34 | if self.encoder_type == "SAP": 35 | out_dim = num_filters[3] * outmap_size 36 | elif self.encoder_type == "ASP": 37 | out_dim = num_filters[3] * outmap_size * 2 38 | else: 39 | raise ValueError('Undefined encoder') 40 | 41 | self.fc = nn.Linear(out_dim, nOut) 42 | 43 | for m in self.modules(): 44 | if isinstance(m, nn.Conv2d): 45 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 46 | elif isinstance(m, nn.BatchNorm2d): 47 | nn.init.constant_(m.weight, 1) 48 | nn.init.constant_(m.bias, 0) 49 | 50 | def _make_layer(self, block, planes, blocks, stride=1): 51 | downsample = None 52 | if stride != 1 or self.inplanes != planes * block.expansion: 53 | downsample = nn.Sequential( 54 | nn.Conv2d(self.inplanes, planes * block.expansion, 55 | kernel_size=1, stride=stride, bias=False), 56 | nn.BatchNorm2d(planes * block.expansion), 57 | ) 58 | 59 | layers = [] 60 | layers.append(block(self.inplanes, planes, stride, downsample)) 61 | self.inplanes = planes * block.expansion 62 | for i in range(1, blocks): 63 | layers.append(block(self.inplanes, planes)) 64 | 65 | return nn.Sequential(*layers) 66 | 67 | def new_parameter(self, *size): 68 | out = nn.Parameter(torch.FloatTensor(*size)) 69 | nn.init.xavier_normal_(out) 70 | return out 71 | 72 | def forward(self, x): 73 | x = x.unsqueeze(1) 74 | x = x.transpose(-1, -2) 75 | x = self.conv1(x) 76 | x = self.relu(x) 77 | x = self.bn1(x) 78 | 79 | x = self.layer1(x) 80 | x = self.layer2(x) 81 | x = self.layer3(x) 82 | x = self.layer4(x) 83 | 84 | x = x.reshape(x.size()[0], -1, x.size()[-1]) 85 | 86 | w = self.attention(x) 87 | 88 | if self.encoder_type == "SAP": 89 | x = torch.sum(x * w, dim=2) 90 | elif self.encoder_type == "ASP": 91 | mu = torch.sum(x * w, dim=2) 92 | sg = torch.sqrt((torch.sum((x ** 2) 
* w, dim=2) - mu ** 2).clamp(min=1e-5)) 93 | x = torch.cat((mu, sg), 1) 94 | 95 | x = x.view(x.size()[0], -1) 96 | x = self.fc(x) 97 | 98 | return x 99 | 100 | 101 | def MainModel(configs): 102 | # Number of filters 103 | num_filters = [32, 64, 128, 256] 104 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], 105 | num_filters, 106 | nOut=configs.model.embed_dim, 107 | encoder_type=configs.model.encoder_type, 108 | n_mels=configs.feature.n_mels 109 | ) 110 | return model 111 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/wespeaker/tdnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """TDNN model for x-vector learning""" 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | import deepaudio.speaker.modules.backbones.wespeaker.pooling_layers as pooling_layers 21 | 22 | 23 | class TdnnLayer(nn.Module): 24 | def __init__(self, in_dim, out_dim, context_size, dilation=1, padding=0): 25 | """Define the TDNN layer, essentially 1-D convolution 26 | 27 | Args: 28 | in_dim (int): input dimension 29 | out_dim (int): output channels 30 | context_size (int): context size, essentially the filter size 31 | dilation (int, optional): Defaults to 1. 32 | padding (int, optional): Defaults to 0. 
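        Receptive field (illustrative note): a single layer spans
        (context_size - 1) * dilation + 1 input frames, so the default XVEC
        stack below with (context_size, dilation) pairs (5, 1), (3, 2), (3, 3),
        (1, 1), (1, 1) gives every output frame a total context of
        1 + 2 * (2 * 1 + 1 * 2 + 1 * 3) = 15 frames, i.e. the usual +/-7 frame
        context of the original x-vector recipe.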
33 | """ 34 | super(TdnnLayer, self).__init__() 35 | self.in_dim = in_dim 36 | self.out_dim = out_dim 37 | self.context_size = context_size 38 | self.dilation = dilation 39 | self.padding = padding 40 | self.conv_1d = nn.Conv1d(self.in_dim, 41 | self.out_dim, 42 | self.context_size, 43 | dilation=self.dilation, 44 | padding=self.padding) 45 | 46 | # Set Affine=false to be compatible with the original kaldi version 47 | self.bn = nn.BatchNorm1d(out_dim, affine=False) 48 | 49 | def forward(self, x): 50 | out = self.conv_1d(x) 51 | out = F.relu(out) 52 | out = self.bn(out) 53 | return out 54 | 55 | 56 | class XVEC(nn.Module): 57 | def __init__(self, 58 | feat_dim=40, 59 | hid_dim=512, 60 | stats_dim=1500, 61 | embed_dim=512, 62 | pooling_func='TSTP'): 63 | """ 64 | Implementation of Kaldi style xvec, as described in 65 | X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION 66 | """ 67 | super(XVEC, self).__init__() 68 | self.feat_dim = feat_dim 69 | self.stats_dim = stats_dim 70 | self.embed_dim = embed_dim 71 | 72 | self.frame_1 = TdnnLayer(feat_dim, hid_dim, context_size=5, dilation=1) 73 | self.frame_2 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=2) 74 | self.frame_3 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=3) 75 | self.frame_4 = TdnnLayer(hid_dim, hid_dim, context_size=1, dilation=1) 76 | self.frame_5 = TdnnLayer(hid_dim, 77 | stats_dim, 78 | context_size=1, 79 | dilation=1) 80 | 81 | self.pool = getattr(pooling_layers, pooling_func)(in_dim=stats_dim) 82 | self.pool_out_dim = self.pool.get_out_dim() 83 | self.seg_1 = nn.Linear(self.pool_out_dim, embed_dim) 84 | self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False) 85 | self.seg_2 = nn.Linear(embed_dim, embed_dim) 86 | 87 | def forward(self, x): 88 | x = x.permute(0, 2, 1) # (B,T,F) -> (B,F,T) 89 | 90 | out = self.frame_1(x) 91 | out = self.frame_2(out) 92 | out = self.frame_3(out) 93 | out = self.frame_4(out) 94 | out = self.frame_5(out) 95 | 96 | stats = self.pool(out) 97 | embed_a = self.seg_1(stats) 98 | out = F.relu(embed_a) 99 | out = self.seg_bn_1(out) 100 | embed_b = self.seg_2(out) 101 | 102 | return embed_a, embed_b 103 | 104 | 105 | if __name__ == '__main__': 106 | model = XVEC(feat_dim=80, embed_dim=512, pooling_func='TSTP') 107 | model.eval() 108 | y = model(torch.rand(10, 200, 80)) 109 | print(y[-1].size()) 110 | 111 | num_params = sum(p.numel() for p in model.parameters()) 112 | print("{} M".format(num_params / 1e6)) 113 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_reduce_lr_on_plateau_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from omegaconf import DictConfig 24 | from torch.optim import Optimizer 25 | from torch.optim.lr_scheduler import ReduceLROnPlateau 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | from deepaudio.speaker.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler 33 | from deepaudio.speaker.optim.scheduler.warmup_scheduler import WarmupLRScheduler 34 | 35 | 36 | @dataclass 37 | class WarmupReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): 38 | scheduler_name: str = field( 39 | default="warmup_reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} 40 | ) 41 | lr_patience: int = field( 42 | default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} 43 | ) 44 | lr_factor: float = field( 45 | default=0.3, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 46 | ) 47 | peak_lr: float = field( 48 | default=1e-04, metadata={"help": "Maximum learning rate."} 49 | ) 50 | init_lr: float = field( 51 | default=1e-10, metadata={"help": "Initial learning rate."} 52 | ) 53 | warmup_steps: int = field( 54 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 55 | ) 56 | 57 | 58 | @register_scheduler("warmup_reduce_lr_on_plateau", dataclass=WarmupReduceLROnPlateauConfigs) 59 | class WarmupReduceLROnPlateauScheduler(LearningRateScheduler, ReduceLROnPlateau): 60 | r""" 61 | Warmup learning rate until `warmup_steps` and reduce learning rate on plateau after. 62 | 63 | Args: 64 | optimizer (Optimizer): wrapped optimizer. 65 | configs (DictConfig): configuration set. 
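        With the defaults above, for example, the first 4000 updates ramp the
        learning rate linearly from ``init_lr`` (1e-10) toward ``peak_lr``
        (1e-04); after warmup, ``step(val_loss)`` defers to the plateau
        scheduler, which multiplies the rate by ``lr_factor`` (0.3) whenever the
        validation loss stops improving for ``lr_patience`` (1) epoch.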
66 | """ 67 | def __init__( 68 | self, 69 | optimizer: Optimizer, 70 | configs: DictConfig, 71 | ) -> None: 72 | super(WarmupReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 73 | self.warmup_steps = configs.lr_scheduler.warmup_steps 74 | self.update_steps = 0 75 | self.warmup_rate = (configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr) / self.warmup_steps \ 76 | if self.warmup_steps != 0 else 0 77 | self.schedulers = [ 78 | WarmupLRScheduler( 79 | optimizer, 80 | configs, 81 | ), 82 | ReduceLROnPlateauScheduler( 83 | optimizer, 84 | configs, 85 | ), 86 | ] 87 | 88 | def _decide_stage(self): 89 | if self.update_steps < self.warmup_steps: 90 | return 0, self.update_steps 91 | else: 92 | return 1, None 93 | 94 | def step(self, val_loss: Optional[float] = None): 95 | stage, steps_in_stage = self._decide_stage() 96 | 97 | if stage == 0: 98 | self.schedulers[0].step() 99 | elif stage == 1: 100 | self.schedulers[1].step(val_loss) 101 | 102 | self.update_steps += 1 103 | 104 | return self.get_lr() 105 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/transformer_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
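# Note (illustrative summary of the schedule implemented below): the learning
# rate is piecewise,
#   warmup (step < warmup_steps):      lr = step * peak_lr / warmup_steps
#   decay  (next decay_steps updates): lr = peak_lr * exp(-decay_factor * t),
#                                      decay_factor = -ln(final_lr_scale) / decay_steps
#   floor  (afterwards):               lr = final_lr
# With the defaults (peak_lr=1e-04, final_lr_scale=0.05, decay_steps=150000) the
# rate decays from 1e-04 toward 1e-04 * 0.05 = 5e-06 before settling at final_lr.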
22 | 23 | import math 24 | import torch 25 | from typing import Optional 26 | from dataclasses import dataclass, field 27 | from omegaconf import DictConfig 28 | from torch.optim import Optimizer 29 | 30 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 31 | from deepaudio.speaker.optim.scheduler import register_scheduler 32 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 33 | 34 | 35 | @dataclass 36 | class TransformerLRSchedulerConfigs(LearningRateSchedulerConfigs): 37 | scheduler_name: str = field( 38 | default="transformer", metadata={"help": "Name of learning rate scheduler."} 39 | ) 40 | peak_lr: float = field( 41 | default=1e-04, metadata={"help": "Maximum learning rate."} 42 | ) 43 | final_lr: float = field( 44 | default=1e-07, metadata={"help": "Final learning rate."} 45 | ) 46 | final_lr_scale: float = field( 47 | default=0.05, metadata={"help": "Final learning rate scale"} 48 | ) 49 | warmup_steps: int = field( 50 | default=10000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 51 | ) 52 | decay_steps: int = field( 53 | default=150000, metadata={"help": "Steps in decay stages"} 54 | ) 55 | 56 | 57 | @register_scheduler("transformer", dataclass=TransformerLRSchedulerConfigs) 58 | class TransformerLRScheduler(LearningRateScheduler): 59 | r""" 60 | Transformer Learning Rate Scheduler proposed in "Attention Is All You Need" 61 | 62 | Args: 63 | optimizer (Optimizer): wrapped optimizer. 64 | configs (DictConfig): configuration set. 65 | """ 66 | def __init__( 67 | self, 68 | optimizer: Optimizer, 69 | configs: DictConfig, 70 | ) -> None: 71 | assert isinstance(configs.lr_scheduler.warmup_steps, int), "warmup_steps should be inteager type" 72 | assert isinstance(configs.lr_scheduler.decay_steps, int), "total_steps should be inteager type" 73 | 74 | super(TransformerLRScheduler, self).__init__(optimizer, 0.0) 75 | self.final_lr = configs.lr_scheduler.final_lr 76 | self.peak_lr = configs.lr_scheduler.peak_lr 77 | self.warmup_steps = configs.lr_scheduler.warmup_steps 78 | self.decay_steps = configs.lr_scheduler.decay_steps 79 | 80 | self.warmup_rate = self.peak_lr / self.warmup_steps 81 | self.decay_factor = -math.log(configs.lr_scheduler.final_lr_scale) / self.decay_steps 82 | 83 | self.lr = self.init_lr 84 | self.update_step = 0 85 | 86 | def _decide_stage(self): 87 | if self.update_step < self.warmup_steps: 88 | return 0, self.update_step 89 | 90 | if self.warmup_steps <= self.update_step < self.warmup_steps + self.decay_steps: 91 | return 1, self.update_step - self.warmup_steps 92 | 93 | return 2, None 94 | 95 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 96 | self.update_step += 1 97 | stage, steps_in_stage = self._decide_stage() 98 | 99 | if stage == 0: 100 | self.lr = self.update_step * self.warmup_rate 101 | elif stage == 1: 102 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 103 | elif stage == 2: 104 | self.lr = self.final_lr 105 | else: 106 | raise ValueError("Undefined stage") 107 | 108 | self.set_lr(self.optimizer, self.lr) 109 | 110 | return self.lr 111 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/clovaai/ResNetSE34L.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from deepaudio.speaker.modules.backbones.clovaai.ResNetBlocks import * 3 | 4 | 5 | class ResNetSE(nn.Module): 6 | def 
__init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, **kwargs): 7 | super(ResNetSE, self).__init__() 8 | 9 | print('Embedding size is %d, encoder %s.' % (nOut, encoder_type)) 10 | 11 | self.inplanes = num_filters[0] 12 | self.encoder_type = encoder_type 13 | self.n_mels = n_mels 14 | 15 | self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3, 16 | bias=False) 17 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 18 | self.relu = nn.ReLU(inplace=True) 19 | 20 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 21 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 22 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 23 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1)) 24 | 25 | if self.encoder_type == "SAP": 26 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 27 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 28 | out_dim = num_filters[3] * block.expansion 29 | elif self.encoder_type == "ASP": 30 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 31 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 32 | out_dim = num_filters[3] * block.expansion * 2 33 | else: 34 | raise ValueError('Undefined encoder') 35 | 36 | self.fc = nn.Linear(out_dim, nOut) 37 | 38 | for m in self.modules(): 39 | if isinstance(m, nn.Conv2d): 40 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 41 | elif isinstance(m, nn.BatchNorm2d): 42 | nn.init.constant_(m.weight, 1) 43 | nn.init.constant_(m.bias, 0) 44 | 45 | def _make_layer(self, block, planes, blocks, stride=1): 46 | downsample = None 47 | if stride != 1 or self.inplanes != planes * block.expansion: 48 | downsample = nn.Sequential( 49 | nn.Conv2d(self.inplanes, planes * block.expansion, 50 | kernel_size=1, stride=stride, bias=False), 51 | nn.BatchNorm2d(planes * block.expansion), 52 | ) 53 | 54 | layers = [] 55 | layers.append(block(self.inplanes, planes, stride, downsample)) 56 | self.inplanes = planes * block.expansion 57 | for i in range(1, blocks): 58 | layers.append(block(self.inplanes, planes)) 59 | 60 | return nn.Sequential(*layers) 61 | 62 | def new_parameter(self, *size): 63 | out = nn.Parameter(torch.FloatTensor(*size)) 64 | nn.init.xavier_normal_(out) 65 | return out 66 | 67 | def forward(self, x): 68 | x = x.unsqueeze(1) 69 | x = x.transpose(-1, -2) 70 | x = self.conv1(x) 71 | x = self.bn1(x) 72 | x = self.relu(x) 73 | 74 | x = self.layer1(x) 75 | x = self.layer2(x) 76 | x = self.layer3(x) 77 | x = self.layer4(x) 78 | 79 | x = torch.mean(x, dim=2, keepdim=True) 80 | 81 | if self.encoder_type == "SAP": 82 | x = x.permute(0, 3, 1, 2).squeeze(-1) 83 | h = torch.tanh(self.sap_linear(x)) 84 | w = torch.matmul(h, self.attention).squeeze(dim=2) 85 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 86 | x = torch.sum(x * w, dim=1) 87 | elif self.encoder_type == "ASP": 88 | x = x.permute(0, 3, 1, 2).squeeze(-1) 89 | h = torch.tanh(self.sap_linear(x)) 90 | w = torch.matmul(h, self.attention).squeeze(dim=2) 91 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 92 | mu = torch.sum(x * w, dim=1) 93 | rh = torch.sqrt((torch.sum((x ** 2) * w, dim=1) - mu ** 2).clamp(min=1e-5)) 94 | x = torch.cat((mu, rh), 1) 95 | 96 | x = x.view(x.size()[0], -1) 97 | x = self.fc(x) 98 | 99 | return x 100 | 101 | 102 | def MainModel(configs): 103 | # 
Number of filters 104 | num_filters = [16, 32, 64, 128] 105 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], 106 | num_filters, 107 | nOut=configs.model.embed_dim, 108 | encoder_type=configs.model.encoder_type 109 | ) 110 | return model 111 | 112 | if __name__ == '__main__': 113 | # Input size: batch_size * seq_len * feat_dim 114 | from omegaconf import OmegaConf 115 | configs = OmegaConf.create({'model': {'embed_dim': 192, 'encoder_type': 'SAP'}})  # illustrative minimal config for a standalone smoke test 116 | x = torch.zeros(2, 200, 80) 117 | model = MainModel(configs) 118 | out = model(x) 119 | print(out.shape) # should be [2, 192] -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_steplr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from omegaconf import DictConfig 24 | from torch.optim import Optimizer 25 | from torch.optim.lr_scheduler import ReduceLROnPlateau 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | from deepaudio.speaker.optim.scheduler.warmup_scheduler import WarmupLRScheduler 33 | from deepaudio.speaker.optim.scheduler.fix_lr_scheduler import FixLRScheduler 34 | from deepaudio.speaker.optim.scheduler.step_lr_scheduler import StepLRScheduler 35 | 36 | 37 | @dataclass 38 | class WarmupStepLRConfigs(LearningRateSchedulerConfigs): 39 | scheduler_name: str = field( 40 | default="warmup_step_lr", metadata={"help": "Name of learning rate scheduler."} 41 | ) 42 | lr_factor: float = field( 43 | default=0.3, metadata={"help": "Factor by which the learning rate will be reduced.
new_lr = lr * factor."} 44 | ) 45 | peak_lr: float = field( 46 | default=1e-04, metadata={"help": "Maximum learning rate."} 47 | ) 48 | init_lr: float = field( 49 | default=1e-10, metadata={"help": "Initial learning rate."} 50 | ) 51 | warmup_steps: int = field( 52 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 53 | ) 54 | min_lr: float = field( 55 | default=1e-7, metadata={"help": "Min learning rate."} 56 | ) 57 | step_size: int = field( 58 | default=70000, metadata={"help": "Step size to decay"} 59 | ) 60 | freeze_steps: int = field( 61 | default=400000, metadata={"help": "Step size to decay"} 62 | ) 63 | 64 | 65 | @register_scheduler("warmup_step_lr", dataclass=WarmupStepLRConfigs) 66 | class WarmupStepLRScheduler(LearningRateScheduler): 67 | r""" 68 | Warmup learning rate until `warmup_steps` and reduce learning rate on plateau after. 69 | 70 | Args: 71 | optimizer (Optimizer): wrapped optimizer. 72 | configs (DictConfig): configuration set. 73 | """ 74 | def __init__( 75 | self, 76 | optimizer: Optimizer, 77 | configs: DictConfig, 78 | ) -> None: 79 | super(WarmupStepLRScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 80 | self.warmup_steps = configs.lr_scheduler.warmup_steps 81 | self.update_steps = 0 82 | self.warmup_rate = (configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr) / self.warmup_steps \ 83 | if self.warmup_steps != 0 else 0 84 | self.freeze_steps = configs.lr_scheduler.freeze_steps 85 | self.schedulers = [ 86 | WarmupLRScheduler( 87 | optimizer, 88 | configs, 89 | ), 90 | FixLRScheduler( 91 | optimizer, 92 | configs, 93 | ), 94 | StepLRScheduler( 95 | optimizer, 96 | configs, 97 | ), 98 | ] 99 | 100 | def _decide_stage(self): 101 | if self.update_steps < self.warmup_steps: 102 | return 0, self.update_steps 103 | elif self.update_steps < self.freeze_steps: 104 | return 1, self.update_steps 105 | else: 106 | return 2, None 107 | 108 | def step(self, val_loss: Optional[float] = None): 109 | stage, steps_in_stage = self._decide_stage() 110 | 111 | if stage == 0: 112 | self.schedulers[0].step() 113 | elif stage == 1: 114 | self.schedulers[1].step() 115 | elif stage == 2: 116 | self.schedulers[2].step() 117 | 118 | self.update_steps += 1 119 | 120 | return self.get_lr() 121 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/warmup_adaptive_loss_reduce_lr_on_plateau_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from omegaconf import DictConfig 24 | from torch.optim import Optimizer 25 | from torch.optim.lr_scheduler import ReduceLROnPlateau 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 30 | from deepaudio.speaker.optim.scheduler import register_scheduler 31 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 32 | from deepaudio.speaker.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler 33 | from deepaudio.speaker.optim.scheduler.warmup_scheduler import WarmupLRScheduler 34 | from deepaudio.speaker.optim.scheduler.fix_lr_scheduler import FixLRScheduler 35 | 36 | 37 | 38 | @dataclass 39 | class WarmupAdaptiveReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): 40 | scheduler_name: str = field( 41 | default="warmup_adaptive_reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} 42 | ) 43 | lr_patience: int = field( 44 | default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} 45 | ) 46 | lr_factor: float = field( 47 | default=0.3, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} 48 | ) 49 | tolr: float = field( 50 | default=0.01, metadata={"help": "Tolr for loss."} 51 | ) 52 | peak_lr: float = field( 53 | default=1e-04, metadata={"help": "Maximum learning rate."} 54 | ) 55 | init_lr: float = field( 56 | default=1e-10, metadata={"help": "Initial learning rate."} 57 | ) 58 | warmup_steps: int = field( 59 | default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} 60 | ) 61 | 62 | 63 | @register_scheduler("warmup_adaptive_reduce_lr_on_plateau", dataclass=WarmupAdaptiveReduceLROnPlateauConfigs) 64 | class WarmupAdaptiveReduceLROnPlateauScheduler(LearningRateScheduler, ReduceLROnPlateau): 65 | r""" 66 | Warmup learning rate until `warmup_steps` and reduce learning rate on plateau after. 67 | 68 | Args: 69 | optimizer (Optimizer): wrapped optimizer. 70 | configs (DictConfig): configuration set. 
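        Staging note: ``_decide_stage`` below selects between three phases by
        update count: linear warmup until ``warmup_steps``, then a fixed
        learning rate until ``criterion.increase_steps`` (while the
        adaptive-margin criterion ramps its margin), and reduce-on-plateau
        driven by ``val_loss`` afterwards.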
71 | """ 72 | def __init__( 73 | self, 74 | optimizer: Optimizer, 75 | configs: DictConfig, 76 | ) -> None: 77 | super(WarmupAdaptiveReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) 78 | self.warmup_steps = configs.lr_scheduler.warmup_steps 79 | self.update_steps = 0 80 | self.warmup_rate = (configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr) / self.warmup_steps \ 81 | if self.warmup_steps != 0 else 0 82 | self.increase_steps = configs.criterion.increase_steps 83 | self.schedulers = [ 84 | WarmupLRScheduler( 85 | optimizer, 86 | configs, 87 | ), 88 | ReduceLROnPlateauScheduler( 89 | optimizer, 90 | configs, 91 | ), 92 | FixLRScheduler( 93 | optimizer, 94 | configs, 95 | ), 96 | ] 97 | 98 | def _decide_stage(self): 99 | if self.update_steps < self.warmup_steps: 100 | return 0, self.update_steps 101 | elif self.update_steps < self.increase_steps: 102 | return 2, self.update_steps 103 | else: 104 | return 1, None 105 | 106 | def step(self, val_loss: Optional[float] = None): 107 | stage, steps_in_stage = self._decide_stage() 108 | 109 | if stage == 0: 110 | self.schedulers[0].step() 111 | elif stage == 1: 112 | self.schedulers[1].step(val_loss) 113 | elif stage == 2: 114 | self.schedulers[2].step() 115 | 116 | self.update_steps += 1 117 | 118 | return self.get_lr() 119 | -------------------------------------------------------------------------------- /deepaudio/speaker/models/speaker_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pytorch_lightning as pl 4 | from typing import Dict 5 | from omegaconf import DictConfig 6 | from torch import Tensor 7 | from torch.optim import Adam, Adagrad, Adadelta, Adamax, AdamW, SGD, ASGD 8 | 9 | from deepaudio.speaker.optim import AdamP, RAdam, Novograd 10 | from deepaudio.speaker.criterion import CRITERION_REGISTRY 11 | from deepaudio.speaker.optim.scheduler import SCHEDULER_REGISTRY 12 | 13 | 14 | class SpeakerModel(pl.LightningModule): 15 | def __init__(self, configs: DictConfig, num_classes: int) -> None: 16 | super(SpeakerModel, self).__init__() 17 | self.configs = configs 18 | self.num_classes = num_classes 19 | self.gradient_clip_val = configs.trainer.gradient_clip_val 20 | self.current_val_loss = 100.0 21 | self.build_model() 22 | self.criterion = self.configure_criterion(configs.criterion.name) 23 | 24 | def build_model(self): 25 | raise NotImplementedError 26 | 27 | def forward(self, inputs: torch.FloatTensor) -> Tensor: 28 | raise NotImplementedError 29 | 30 | def training_step(self, batch: tuple, batch_idx: int): 31 | r""" 32 | Forward propagate a `inputs` and `targets` pair for training. 33 | 34 | Inputs: 35 | batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` 36 | batch_idx (int): The index of batch 37 | 38 | Returns: 39 | loss (torch.Tensor): loss for training 40 | """ 41 | raise NotImplementedError 42 | 43 | def validation_step(self, batch: tuple, batch_idx: int): 44 | r""" 45 | Forward propagate a `inputs` and `targets` pair for validation. 46 | 47 | Inputs: 48 | batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` 49 | batch_idx (int): The index of batch 50 | 51 | Returns: 52 | loss (torch.Tensor): loss for training 53 | """ 54 | raise NotImplementedError 55 | 56 | def configure_optimizers(self): 57 | r""" 58 | Choose what optimizers and learning-rate schedulers to use in your optimization. 
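        For example, with ``model.optimizer: adam`` and
        ``lr_scheduler.scheduler_name: reduce_lr_on_plateau`` in the config this
        returns ``[optimizer], [{'scheduler': scheduler, 'monitor': 'val_loss',
        'interval': 'epoch'}]``, the dictionary format PyTorch Lightning expects.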
59 | 60 | 61 | Returns: 62 | - **Dictionary** - The first item has multiple optimizers, and the second has multiple LR schedulers 63 | (or multiple ``lr_dict``). 64 | """ 65 | SUPPORTED_OPTIMIZERS = { 66 | "adam": Adam, 67 | "adamp": AdamP, 68 | "radam": RAdam, 69 | "adagrad": Adagrad, 70 | "adadelta": Adadelta, 71 | "adamax": Adamax, 72 | "adamw": AdamW, 73 | "sgd": SGD, 74 | "asgd": ASGD, 75 | "novograd": Novograd, 76 | } 77 | 78 | assert self.configs.model.optimizer in SUPPORTED_OPTIMIZERS.keys(), \ 79 | f"Unsupported Optimizer: {self.configs.model.optimizer}\n" \ 80 | f"Supported Optimizers: {SUPPORTED_OPTIMIZERS.keys()}" 81 | 82 | self.optimizer = SUPPORTED_OPTIMIZERS[self.configs.model.optimizer]( 83 | self.parameters(), 84 | lr=self.configs.lr_scheduler.lr, 85 | weight_decay=1e-5, 86 | ) 87 | scheduler = SCHEDULER_REGISTRY[self.configs.lr_scheduler.scheduler_name](self.optimizer, self.configs) 88 | 89 | if self.configs.lr_scheduler.scheduler_name == "reduce_lr_on_plateau": 90 | lr_scheduler = { 91 | 'scheduler': scheduler, 92 | 'monitor': 'val_loss', 93 | 'interval': 'epoch', 94 | } 95 | elif self.configs.lr_scheduler.scheduler_name == "warmup_reduce_lr_on_plateau": 96 | lr_scheduler = { 97 | 'scheduler': scheduler, 98 | 'monitor': 'val_loss', 99 | 'interval': 'step', 100 | } 101 | elif self.configs.lr_scheduler.scheduler_name == "warmup_adaptive_reduce_lr_on_plateau": 102 | lr_scheduler = { 103 | 'scheduler': scheduler, 104 | 'monitor': 'val_loss', 105 | 'interval': 'step', 106 | } 107 | else: 108 | print('by step') 109 | lr_scheduler = { 110 | 'scheduler': scheduler, 111 | 'interval': 'step', 112 | } 113 | 114 | return [self.optimizer], [lr_scheduler] 115 | 116 | def configure_criterion(self, criterion_name: str) -> nn.Module: 117 | r""" 118 | Configure criterion for training. 119 | 120 | Args: 121 | criterion_name (str): name of criterion 122 | 123 | Returns: 124 | criterion (nn.Module): criterion for training 125 | """ 126 | 127 | return CRITERION_REGISTRY[criterion_name]( 128 | configs=self.configs, 129 | num_classes=self.num_classes, 130 | embedding_size=self.configs.model.embed_dim 131 | ) 132 | 133 | def get_lr(self): 134 | for g in self.optimizer.param_groups: 135 | return g['lr'] 136 | 137 | def set_lr(self, lr): 138 | for g in self.optimizer.param_groups: 139 | g['lr'] = lr 140 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/radam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, LiyuanLucasLiu. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
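# Note (illustrative usage sketch; within this project the optimizer is normally
# built by SpeakerModel.configure_optimizers):
#
#     from deepaudio.speaker.optim import RAdam
#     optimizer = RAdam(model.parameters(), lr=1e-3, weight_decay=1e-5)
#
# During the first few updates the rectification term N_sma computed below is
# less than 5, so the update degenerates to SGD with momentum when
# degenerated_to_sgd=True; once N_sma >= 5 the Adam-style adaptive step with a
# rectified step size is used.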
14 | 15 | import math 16 | import torch 17 | from torch.optim.optimizer import Optimizer 18 | 19 | 20 | class RAdam(Optimizer): 21 | """ 22 | Paper: "On the Variance of the Adaptive Learning Rate and Beyond" 23 | 24 | Refer to https://github.com/LiyuanLucasLiu/RAdam 25 | Copyright (c) LiyuanLucasLiu 26 | Apache 2.0 License 27 | """ 28 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 29 | if lr < 0.0: 30 | raise ValueError("Invalid learning rate: {}".format(lr)) 31 | if eps < 0.0: 32 | raise ValueError("Invalid epsilon value: {}".format(eps)) 33 | if not 0.0 <= betas[0] < 1.0: 34 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 35 | if not 0.0 <= betas[1] < 1.0: 36 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 37 | 38 | self.degenerated_to_sgd = degenerated_to_sgd 39 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): 40 | for param in params: 41 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): 42 | param['buffer'] = [[None, None, None] for _ in range(10)] 43 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 44 | buffer=[[None, None, None] for _ in range(10)]) 45 | super(RAdam, self).__init__(params, defaults) 46 | 47 | def __setstate__(self, state): 48 | super(RAdam, self).__setstate__(state) 49 | 50 | def step(self, closure=None): 51 | 52 | loss = None 53 | if closure is not None: 54 | loss = closure() 55 | 56 | for group in self.param_groups: 57 | 58 | for p in group['params']: 59 | if p.grad is None: 60 | continue 61 | grad = p.grad.data.float() 62 | if grad.is_sparse: 63 | raise RuntimeError('RAdam does not support sparse gradients') 64 | 65 | p_data_fp32 = p.data.float() 66 | 67 | state = self.state[p] 68 | 69 | if len(state) == 0: 70 | state['step'] = 0 71 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 72 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 73 | else: 74 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 75 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 76 | 77 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 78 | beta1, beta2 = group['betas'] 79 | 80 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 81 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 82 | 83 | state['step'] += 1 84 | buffered = group['buffer'][int(state['step'] % 10)] 85 | if state['step'] == buffered[0]: 86 | N_sma, step_size = buffered[1], buffered[2] 87 | else: 88 | buffered[0] = state['step'] 89 | beta2_t = beta2 ** state['step'] 90 | N_sma_max = 2 / (1 - beta2) - 1 91 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 92 | buffered[1] = N_sma 93 | 94 | # more conservative since it's an approximated value 95 | if N_sma >= 5: 96 | step_size = math.sqrt( 97 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( 98 | N_sma_max - 2)) / (1 - beta1 ** state['step']) 99 | elif self.degenerated_to_sgd: 100 | step_size = 1.0 / (1 - beta1 ** state['step']) 101 | else: 102 | step_size = -1 103 | buffered[2] = step_size 104 | 105 | # more conservative since it's an approximated value 106 | if N_sma >= 5: 107 | if group['weight_decay'] != 0: 108 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 109 | denom = exp_avg_sq.sqrt().add_(group['eps']) 110 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 111 | p.data.copy_(p_data_fp32) 112 | elif step_size > 0: 113 | 
if group['weight_decay'] != 0: 114 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 115 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 116 | p.data.copy_(p_data_fp32) 117 | 118 | return loss 119 | -------------------------------------------------------------------------------- /deepaudio/speaker/optim/novograd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from torch.optim.optimizer import Optimizer 17 | 18 | 19 | class Novograd(Optimizer): 20 | """ 21 | Novograd algorithm. 22 | 23 | Copied from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/optimizers.py 24 | Copyright (c) 2019 NVIDIA Corp. 25 | Apache-2.0 License 26 | 27 | Args: 28 | params (iterable): iterable of parameters to optimize or dicts defining 29 | parameter groups 30 | lr (float, optional): learning rate (default: 1e-3) 31 | betas (Tuple[float, float], optional): coefficients used for computing 32 | running averages of gradient and its square (default: (0.95, 0)) 33 | eps (float, optional): term added to the denominator to improve 34 | numerical stability (default: 1e-8) 35 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 36 | grad_averaging: gradient averaging 37 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 38 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 39 | (default: False) 40 | """ 41 | 42 | def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, 43 | weight_decay=0, grad_averaging=False, amsgrad=False): 44 | if 0.0 > lr: 45 | raise ValueError("Invalid learning rate: {}".format(lr)) 46 | if 0.0 > eps: 47 | raise ValueError("Invalid epsilon value: {}".format(eps)) 48 | if not 0.0 <= betas[0] < 1.0: 49 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 50 | if not 0.0 <= betas[1] < 1.0: 51 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 52 | defaults = dict(lr=lr, betas=betas, eps=eps, 53 | weight_decay=weight_decay, 54 | grad_averaging=grad_averaging, 55 | amsgrad=amsgrad) 56 | 57 | super(Novograd, self).__init__(params, defaults) 58 | 59 | def __setstate__(self, state): 60 | super(Novograd, self).__setstate__(state) 61 | for group in self.param_groups: 62 | group.setdefault('amsgrad', False) 63 | 64 | def step(self, closure=None): 65 | """Performs a single optimization step. 66 | Arguments: 67 | closure (callable, optional): A closure that reevaluates the model 68 | and returns the loss. 
69 | """ 70 | loss = None 71 | if closure is not None: 72 | loss = closure() 73 | 74 | for group in self.param_groups: 75 | for p in group['params']: 76 | if p.grad is None: 77 | continue 78 | grad = p.grad.data 79 | if grad.is_sparse: 80 | raise RuntimeError('Sparse gradients are not supported.') 81 | amsgrad = group['amsgrad'] 82 | 83 | state = self.state[p] 84 | 85 | # State initialization 86 | if len(state) == 0: 87 | state['step'] = 0 88 | # Exponential moving average of gradient values 89 | state['exp_avg'] = torch.zeros_like(p.data) 90 | # Exponential moving average of squared gradient values 91 | state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 92 | if amsgrad: 93 | # Maintains max of all exp. moving avg. of sq. grad. values 94 | state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 95 | 96 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 97 | if amsgrad: 98 | max_exp_avg_sq = state['max_exp_avg_sq'] 99 | beta1, beta2 = group['betas'] 100 | 101 | state['step'] += 1 102 | 103 | norm = torch.sum(torch.pow(grad, 2)) 104 | 105 | if exp_avg_sq == 0: 106 | exp_avg_sq.copy_(norm) 107 | else: 108 | exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2) 109 | 110 | if amsgrad: 111 | # Maintains the maximum of all 2nd moment running avg. till now 112 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 113 | # Use the max. for normalizing running avg. of gradient 114 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 115 | else: 116 | denom = exp_avg_sq.sqrt().add_(group['eps']) 117 | 118 | grad.div_(denom) 119 | if group['weight_decay'] != 0: 120 | grad.add_(p.data, alpha=group['weight_decay']) 121 | if group['grad_averaging']: 122 | grad.mul_(1 - beta1) 123 | exp_avg.mul_(beta1).add_(grad) 124 | 125 | p.data.add_(exp_avg, alpha=-group['lr']) 126 | 127 | return loss 128 | -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/ecapa.py: -------------------------------------------------------------------------------- 1 | #code from https://github.com/lawlict/ECAPA-TDNN/blob/master/ecapa_tdnn.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | 8 | ''' Res2Conv1d + BatchNorm1d + ReLU 9 | ''' 10 | class Res2Conv1dReluBn(nn.Module): 11 | ''' 12 | in_channels == out_channels == channels 13 | ''' 14 | def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4): 15 | super().__init__() 16 | assert channels % scale == 0, "{} % {} != 0".format(channels, scale) 17 | self.scale = scale 18 | self.width = channels // scale 19 | self.nums = scale if scale == 1 else scale - 1 20 | 21 | self.convs = [] 22 | self.bns = [] 23 | for i in range(self.nums): 24 | self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias)) 25 | self.bns.append(nn.BatchNorm1d(self.width)) 26 | self.convs = nn.ModuleList(self.convs) 27 | self.bns = nn.ModuleList(self.bns) 28 | 29 | def forward(self, x): 30 | out = [] 31 | spx = torch.split(x, self.width, 1) 32 | for i in range(self.nums): 33 | if i == 0: 34 | sp = spx[i] 35 | else: 36 | sp = sp + spx[i] 37 | # Order: conv -> relu -> bn 38 | sp = self.convs[i](sp) 39 | sp = self.bns[i](F.relu(sp)) 40 | out.append(sp) 41 | if self.scale != 1: 42 | out.append(spx[self.nums]) 43 | out = torch.cat(out, dim=1) 44 | return out 45 | 46 | 47 | 48 | ''' Conv1d + BatchNorm1d + ReLU 49 | ''' 50 | class Conv1dReluBn(nn.Module): 51 | def __init__(self, in_channels, 
out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False): 52 | super().__init__() 53 | self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias) 54 | self.bn = nn.BatchNorm1d(out_channels) 55 | 56 | def forward(self, x): 57 | return self.bn(F.relu(self.conv(x))) 58 | 59 | 60 | 61 | ''' The SE connection of 1D case. 62 | ''' 63 | class SE_Connect(nn.Module): 64 | def __init__(self, channels, s=2): 65 | super().__init__() 66 | assert channels % s == 0, "{} % {} != 0".format(channels, s) 67 | self.linear1 = nn.Linear(channels, channels // s) 68 | self.linear2 = nn.Linear(channels // s, channels) 69 | 70 | def forward(self, x): 71 | out = x.mean(dim=2) 72 | out = F.relu(self.linear1(out)) 73 | out = torch.sigmoid(self.linear2(out)) 74 | out = x * out.unsqueeze(2) 75 | return out 76 | 77 | 78 | 79 | ''' SE-Res2Block. 80 | Note: residual connection is implemented in the ECAPA_TDNN model, not here. 81 | ''' 82 | def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale): 83 | return nn.Sequential( 84 | Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0), 85 | Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale), 86 | Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0), 87 | SE_Connect(channels) 88 | ) 89 | 90 | 91 | 92 | ''' Attentive weighted mean and standard deviation pooling. 93 | ''' 94 | class AttentiveStatsPool(nn.Module): 95 | def __init__(self, in_dim, bottleneck_dim): 96 | super().__init__() 97 | # Use Conv1d with stride == 1 rather than Linear, so we don't need to transpose the inputs. 98 | self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper 99 | self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper 100 | 101 | def forward(self, x): 102 | # DON'T use ReLU here! In experiments, convergence was hard to reach with ReLU. 103 | alpha = torch.tanh(self.linear1(x)) 104 | alpha = torch.softmax(self.linear2(alpha), dim=2) 105 | mean = torch.sum(alpha * x, dim=2) 106 | residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2 107 | std = torch.sqrt(residuals.clamp(min=1e-9)) 108 | return torch.cat([mean, std], dim=1) 109 | 110 | 111 | 112 | ''' Implementation of 113 | "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification". 114 | Note that we DON'T concatenate the last frame-wise layer with non-weighted mean and standard deviation, 115 | because it brings little improvement but significantly increases model parameters. 116 | As a result, this implementation basically corresponds to A.2 of Table 2 in the paper.
117 | ''' 118 | class ECAPA_TDNN(nn.Module): 119 | def __init__(self, in_channels=80, channels=1024, embed_dim=192): 120 | super().__init__() 121 | self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=5, padding=2) 122 | self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8) 123 | self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8) 124 | self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8) 125 | 126 | cat_channels = channels * 3 127 | self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1) 128 | self.pooling = AttentiveStatsPool(cat_channels, 128) 129 | self.bn1 = nn.BatchNorm1d(cat_channels * 2) 130 | self.linear = nn.Linear(cat_channels * 2, embed_dim) 131 | self.bn2 = nn.BatchNorm1d(embed_dim) 132 | 133 | def forward(self, x): 134 | x = x.transpose(1, 2) 135 | out1 = self.layer1(x) 136 | out2 = self.layer2(out1) + out1 137 | out3 = self.layer3(out1 + out2) + out1 + out2 138 | out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3 139 | 140 | out = torch.cat([out2, out3, out4], dim=1) 141 | out = F.relu(self.conv(out)) 142 | out = self.bn1(self.pooling(out)) 143 | out = self.bn2(self.linear(out)) 144 | return out -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/mmcl/STP.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from mmcv.cnn import build_norm_layer 5 | 6 | 7 | def select_activation(activation_type): 8 | if activation_type == "leaky_relu": 9 | return nn.LeakyReLU(inplace=True) 10 | elif activation_type == "relu": 11 | return nn.ReLU(inplace=True) 12 | elif activation_type == "prelu": 13 | return nn.PReLU() 14 | elif activation_type == "none": 15 | return nn.Identity() 16 | else: 17 | print("activation type {} is not supported".format(activation_type)) 18 | raise NotImplementedError 19 | 20 | 21 | def std_pooling(batch, batch_mean, dim=-1, unbiased=False, eps=1e-8): 22 | # adding epsilon in sqrt function to make more numerically stable results (yufeng) 23 | r2 = torch.sum((batch - batch_mean.unsqueeze(-1))**2, dim) 24 | if unbiased: 25 | length = batch.shape[dim] - 1 26 | else: 27 | length = batch.shape[dim] 28 | return torch.sqrt(r2/length + eps) 29 | 30 | 31 | class Stats_pooling(nn.Module): 32 | def __init__(self, input_dim=1500): 33 | super(Stats_pooling, self).__init__() 34 | self.out_dim = 2 * input_dim 35 | 36 | def forward(self, x): 37 | """ 38 | x.size() = [batch_size, feature_dim, seq_length] 39 | """ 40 | mean_frame = torch.mean(x, -1, False) 41 | if self.training: 42 | std_frame = std_pooling(x, mean_frame, -1, False) 43 | else: 44 | std_frame = torch.std(x, -1, False) 45 | output = torch.cat([mean_frame, std_frame], dim=-1) 46 | # print(output.shape) 47 | output = output.view(-1, self.out_dim) 48 | return output 49 | 50 | 51 | class StatsPooling(nn.Module): 52 | """Stats Pooling neck. 
53 | """ 54 | def __init__(self, in_plane, emb_dim, emb_bn=True, emb_affine=True, 55 | activation_type="relu", norm_type="BN1d", output_stage=(0,)): 56 | super(StatsPooling, self).__init__() 57 | self.avgpool = Stats_pooling(in_plane) 58 | embedding = [] 59 | initial_dim = self.avgpool.out_dim 60 | self.output_stage = output_stage 61 | if isinstance(emb_dim, list): 62 | self.stages = len(emb_dim) 63 | for e_dim, do_bn, do_affine, act_type in zip(emb_dim, emb_bn, emb_affine, activation_type): 64 | fc = [nn.Linear(initial_dim, e_dim)] 65 | initial_dim = e_dim 66 | fc.append(select_activation(act_type)) 67 | if do_bn: 68 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=do_affine) 69 | fc.append(build_norm_layer(cfg, e_dim)[1]) 70 | embedding.append(nn.Sequential(*fc)) 71 | else: 72 | self.stages = 1 73 | embedding.append(nn.Linear(initial_dim, emb_dim)) 74 | embedding.append(select_activation(activation_type)) 75 | if emb_bn: 76 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=emb_affine) 77 | embedding.append(build_norm_layer(cfg, emb_dim)[1]) 78 | self.embedding = nn.Sequential(*embedding) 79 | 80 | def init_weights(self): 81 | pass 82 | 83 | def forward(self, inputs): 84 | out = self.avgpool(inputs) 85 | if self.stages > 1 and len(self.output_stage) > 1 and self.training: 86 | # contains more than one fc layers and needs to output more than one vector and training mode 87 | embs = [] 88 | for fc in self.embedding: 89 | out = fc(out) 90 | embs.append(out) 91 | results = [] 92 | for stage in self.output_stage: 93 | results.append(embs[stage]) 94 | return tuple(results) 95 | else: 96 | return self.embedding(out) 97 | 98 | 99 | class StatsPoolingMSEA(nn.Module): 100 | """Stats Pooling neck. 101 | """ 102 | def __init__(self, in_plane, emb_dim, emb_bn=True, emb_affine=True, 103 | activation_type="relu", norm_type="BN1d", output_stage=(0,)): 104 | super(StatsPoolingMSEA, self).__init__() 105 | assert isinstance(in_plane, tuple) 106 | self.avgpool = [Stats_pooling(plane) for plane in in_plane] 107 | embedding = [] 108 | initial_dim = sum([pool.out_dim for pool in self.avgpool]) 109 | self.output_stage = output_stage 110 | if isinstance(emb_dim, list): 111 | self.stages = len(emb_dim) 112 | for e_dim, do_bn, do_affine, act_type in zip(emb_dim, emb_bn, emb_affine, activation_type): 113 | fc = [nn.Linear(initial_dim, e_dim)] 114 | initial_dim = e_dim 115 | fc.append(select_activation(act_type)) 116 | if do_bn: 117 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=do_affine) 118 | fc.append(build_norm_layer(cfg, e_dim)[1]) 119 | embedding.append(nn.Sequential(*fc)) 120 | else: 121 | self.stages = 1 122 | embedding.append(nn.Linear(initial_dim, emb_dim)) 123 | embedding.append(select_activation(activation_type)) 124 | if emb_bn: 125 | cfg = dict(type=norm_type, requires_grad=True, momentum=0.5, affine=emb_affine) 126 | embedding.append(build_norm_layer(cfg, emb_dim)[1]) 127 | self.embedding = nn.Sequential(*embedding) 128 | 129 | def init_weights(self): 130 | pass 131 | 132 | def forward(self, inputs): 133 | out = [pool(inp) for pool, inp in zip(self.avgpool, inputs)] 134 | out = torch.cat(out, dim=-1) 135 | if self.stages > 1 and len(self.output_stage) > 1 and self.training: 136 | # contains more than one fc layers and needs to output more than one vector and training mode 137 | embs = [] 138 | for fc in self.embedding: 139 | out = fc(out) 140 | embs.append(out) 141 | results = [] 142 | for stage in self.output_stage: 143 | 
results.append(embs[stage]) 144 | return tuple(results) 145 | else: 146 | return self.embedding(out) -------------------------------------------------------------------------------- /deepaudio/speaker/modules/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | # code from https://github.com/BUTSpeechFIT/VBx/blob/master/VBx/models/resnet.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class BasicBlock(nn.Module): 8 | expansion = 1 9 | 10 | def __init__(self, in_planes, planes, stride=1, reduction=16): 11 | super(BasicBlock, self).__init__() 12 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(planes) 14 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 15 | self.bn2 = nn.BatchNorm2d(planes) 16 | # self.se = SELayer(planes, reduction) 17 | 18 | self.shortcut = nn.Sequential() 19 | if stride != 1 or in_planes != self.expansion * planes: 20 | self.shortcut = nn.Sequential( 21 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 22 | nn.BatchNorm2d(self.expansion * planes) 23 | ) 24 | 25 | def forward(self, x): 26 | out = F.relu(self.bn1(self.conv1(x))) 27 | out = self.bn2(self.conv2(out)) 28 | # out = self.se(out) 29 | out += self.shortcut(x) 30 | out = F.relu(out) 31 | return out 32 | 33 | 34 | class Bottleneck(nn.Module): 35 | expansion = 4 36 | 37 | def __init__(self, in_planes, planes, stride=1, reduction=16): 38 | super(Bottleneck, self).__init__() 39 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 40 | self.bn1 = nn.BatchNorm2d(planes) 41 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 42 | self.bn2 = nn.BatchNorm2d(planes) 43 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 44 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 45 | # self.se = SELayer(planes * 4, reduction) 46 | 47 | self.shortcut = nn.Sequential() 48 | if stride != 1 or in_planes != self.expansion * planes: 49 | self.shortcut = nn.Sequential( 50 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 51 | nn.BatchNorm2d(self.expansion * planes) 52 | ) 53 | 54 | def forward(self, x): 55 | out = F.relu(self.bn1(self.conv1(x))) 56 | out = F.relu(self.bn2(self.conv2(out))) 57 | out = self.bn3(self.conv3(out)) 58 | # out = self.se(out) 59 | out += self.shortcut(x) 60 | out = F.relu(out) 61 | return out 62 | 63 | 64 | class SELayer(nn.Module): 65 | def __init__(self, channel, reduction=16): 66 | super(SELayer, self).__init__() 67 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 68 | self.fc = nn.Sequential( 69 | nn.Linear(channel, channel // reduction, bias=False), 70 | nn.ReLU(inplace=True), 71 | nn.Linear(channel // reduction, channel, bias=False), 72 | nn.Sigmoid() 73 | ) 74 | 75 | def forward(self, x): 76 | b, c, _, _ = x.size() 77 | y = self.avg_pool(x).view(b, c) 78 | y = self.fc(y).view(b, c, 1, 1) 79 | return x * y.expand_as(x) 80 | 81 | 82 | class ResNet(nn.Module): 83 | def __init__(self, block, num_blocks, m_channels=32, feat_dim=40, embed_dim=128, squeeze_excitation=False): 84 | super(ResNet, self).__init__() 85 | self.in_planes = m_channels 86 | self.feat_dim = feat_dim 87 | self.embed_dim = embed_dim 88 | self.squeeze_excitation = squeeze_excitation 89 | if block is BasicBlock: 90 | self.conv1 = nn.Conv2d(1, m_channels, 
kernel_size=3, stride=1, padding=1, bias=False) 91 | self.bn1 = nn.BatchNorm2d(m_channels) 92 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1) 93 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2) 94 | current_freq_dim = int((feat_dim - 1) / 2) + 1 95 | self.layer3 = self._make_layer(block, m_channels * 4, num_blocks[2], stride=2) 96 | current_freq_dim = int((current_freq_dim - 1) / 2) + 1 97 | self.layer4 = self._make_layer(block, m_channels * 8, num_blocks[3], stride=2) 98 | current_freq_dim = int((current_freq_dim - 1) / 2) + 1 99 | self.embedding = nn.Linear(m_channels * 8 * 2 * current_freq_dim, embed_dim) 100 | elif block is Bottleneck: 101 | self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) 102 | self.bn1 = nn.BatchNorm2d(m_channels) 103 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1) 104 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2) 105 | self.layer3 = self._make_layer(block, m_channels * 4, num_blocks[2], stride=2) 106 | self.layer4 = self._make_layer(block, m_channels * 8, num_blocks[3], stride=2) 107 | self.embedding = nn.Linear(int(feat_dim / 8) * m_channels * 16 * block.expansion, embed_dim) 108 | else: 109 | raise ValueError(f'Unexpected class {type(block)}.') 110 | 111 | def _make_layer(self, block, planes, num_blocks, stride): 112 | strides = [stride] + [1] * (num_blocks - 1) 113 | layers = [] 114 | for stride in strides: 115 | layers.append(block(self.in_planes, planes, stride)) 116 | self.in_planes = planes * block.expansion 117 | return nn.Sequential(*layers) 118 | 119 | def forward(self, x): 120 | x = x.transpose(1, 2) 121 | x = x.unsqueeze_(1) 122 | out = F.relu(self.bn1(self.conv1(x))) 123 | out = self.layer1(out) 124 | out = self.layer2(out) 125 | out = self.layer3(out) 126 | out = self.layer4(out) 127 | 128 | pooling_mean = torch.mean(out, dim=-1) 129 | meansq = torch.mean(out * out, dim=-1) 130 | pooling_std = torch.sqrt(meansq - pooling_mean ** 2 + 1e-10) 131 | out = torch.cat((torch.flatten(pooling_mean, start_dim=1), 132 | torch.flatten(pooling_std, start_dim=1)), 1) 133 | 134 | embedding = self.embedding(out) 135 | return embedding -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | ## Content 2 | - [**What is deepaudio-speaker?**](https://github.com/deepaudio/deepaudio-speaker#what-is-deepaudio) 3 | - [**Installation**](https://github.com/deepaudio/deepaudio-speaker#installation) 4 | - [**Get Started**](https://github.com/deepaudio/deepaudio-speaker#get-started) 5 | - [**Model Architecture**](https://github.com/deepaudio/deepaudio-speaker#model-architectures) 6 | - [**How to contribute to deepaudio-speaker?**](https://github.com/deepaudio/deepaudio-speaker#How-to-contribute-to-deepaudio-speaker) 7 | - [**Acknowledge**](https://github.com/deepaudio/deepaudio-speaker#Acknowledge) 8 | 9 | ## What is deepaudio-speaker? 10 | 11 | Deepaudio-speaker is a framework for training neural network based speaker embedders. It supports online audio augmentation thanks to torch-audiomentations. It includes, or will include, popular neural network architectures and losses used for speaker embedders.
12 | 13 | To make it easy to use features such as mixed precision, multi-node training, and TPU training, I introduced PyTorch-Lightning and Hydra in this framework (just like what [pyannote-audio](https://github.com/pyannote/pyannote-audio) and [openspeech](https://github.com/openspeech-team/openspeech) do). 14 | 15 | Deepaudio-tts is coming soon. 16 | 17 | ## Installation 18 | ``` 19 | conda create -n deepaudio python=3.8.5 20 | conda activate deepaudio 21 | conda install numpy cffi 22 | conda install libsndfile=1.0.28 -c conda-forge 23 | git clone https://github.com/deepaudio/deepaudio-speaker.git 24 | cd deepaudio-speaker 25 | pip install -e . 26 | ``` 27 | 28 | ## Get Started 29 | 30 | ### Supported Datasets 31 | 32 | #### Voxceleb2 33 | * [Download VoxCeleb dataset](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/) and follow [this script](https://github.com/pyannote/pyannote-db-voxceleb/issues/10#issuecomment-702638328) to obtain the following directory structure: 34 | 35 | ``` 36 | /path/to/voxceleb/voxceleb1/dev/wav/id10001/1zcIwhmdeo4/00001.wav 37 | /path/to/voxceleb/voxceleb1/test/wav/id10270/5r0dWxy17C8/00001.wav 38 | /path/to/voxceleb/voxceleb2/dev/aac/id00012/21Uxsk56VDQ/00001.m4a 39 | /path/to/voxceleb/voxceleb2/test/aac/id00017/01dfn2spqyE/00001.m4a 40 | ``` 41 | 42 | ### Training examples 43 | - Example1: Train the `ecapa-tdnn` model with `fbank` features on GPU. 44 | 45 | ``` 46 | $ deepaudio-speaker-train \ 47 | dataset=voxceleb2 \ 48 | dataset.dataset_path=/your/path/to/voxceleb2/dev/wav/ \ 49 | model=clovaai_ecapa \ 50 | model.channels=1024 \ 51 | feature=fbank \ 52 | lr_scheduler=reduce_lr_on_plateau \ 53 | trainer=gpu \ 54 | criterion=pyannote_aamsoftmax 55 | ``` 56 | - Example2: Train an ecapa model to reach an EER of around 1.13% on the voxceleb 1 trials (original version, without score normalization).
57 | 58 | ``` 59 | $ git clone https://github.com/deepaudio/deepaudio-database.git 60 | $ cd deepaudio-database 61 | $ vim database.yml # edit the list path and wav path 62 | $ deepaudio-speaker-train \ 63 | dataset=dataframe \ 64 | dataset.database_yml=/your/path/to/deepaudio-database/database.yml \ 65 | dataset.dataset_name=voxceleb2_dev \ 66 | model=clovaai_ecapa \ 67 | model.channels=1024 \ 68 | model.embed_dim=256 \ 69 | model.min_num_frames=200 \ 70 | model.max_num_frames=300 \ 71 | feature=fbank \ 72 | lr_scheduler=warmup_adaptive_reduce_lr_on_plateau \ 73 | lr_scheduler.warmup_steps=30000 \ 74 | lr_scheduler.lr_factor=0.8 \ 75 | trainer=gpu \ 76 | trainer.batch_size=128 \ 77 | trainer.max_epochs=30 \ 78 | trainer.num_checkpoints=30 \ 79 | criterion=adaptive_aamsoftmax \ 80 | criterion.increase_steps=300000 \ 81 | augment.apply_spec_augment=True \ 82 | augment.time_mask_num=1 \ 83 | augment.apply_noise_augment=True \ 84 | augment.apply_reverb_augment=True \ 85 | augment.apply_noise_reverb_augment=True \ 86 | augment.noise_augment_weight=2 \ 87 | augment.noise_dataset_dir=/your/path/to/musan \ 88 | augment.rir_dataset_dir=/your/path/to/RIRS_NOISES/simulated_rirs/ 89 | ``` 90 | 91 | - Example3: Compute the equal error rate (EER) 92 | ```python 93 | from deepaudio.speaker.datasets.dataframe.utils import load_trial_dataframe, get_dataset_items 94 | from deepaudio.speaker.models.inference import Inference 95 | from deepaudio.speaker.metrics.eer import model_eer 96 | 97 | trial_meta = get_dataset_items('/your/path/to/deepaudio-database/database.yml', 98 | 'voxceleb1_o', 'trial') 99 | wav_dir, trial_path = trial_meta[0] 100 | trials = load_trial_dataframe(wav_dir, trial_path) 101 | inference = Inference('/your/path/to/checkpoint.ckpt') 102 | eer, thresh = model_eer(inference, trials) 103 | ``` 104 | - Example4: Export a TorchScript model 105 | ```python 106 | from deepaudio.speaker.models.inference import Inference 107 | model = Inference('/your/path/to/checkpoint.ckpt').model 108 | model.to_torchscript('filepath/to/model') 109 | ``` 110 | 111 | 112 | ## Model Architecture 113 | [**Wespeaker**](https://github.com/wenet-e2e/wespeaker/tree/master/wespeaker/models) Models from wespeaker. 114 | 115 | [**ECAPA-TDNN**](https://arxiv.org/pdf/2005.07143.pdf) This is an unofficial implementation from @lawlict. Please find more details in this [link](https://github.com/lawlict/ECAPA-TDNN). 116 | 117 | [**ECAPA-TDNN**](https://arxiv.org/pdf/2005.07143.pdf) This is implemented by @joonson. Please find more details in this [link](https://github.com/clovaai/voxceleb_trainer/issues/86#issuecomment-739991154). 118 | 119 | [**ResNetSE34L**](https://arxiv.org/pdf/2003.11982.pdf) This is borrowed from [voxceleb trainer](https://github.com/clovaai/voxceleb_trainer). 120 | 121 | [**ResNetSE34V2**](https://arxiv.org/pdf/2003.11982.pdf) This is borrowed from [voxceleb trainer](https://github.com/clovaai/voxceleb_trainer). 122 | 123 | [**Resnet101**](https://arxiv.org/abs/2012.14952) This was proposed by BUT for speaker diarization. Please note that the features used in this framework are different from those used in [VB-HMM](https://github.com/BUTSpeechFIT/VBx). 124 | 125 | ## How to contribute to deepaudio-speaker 126 | 127 | This is a personal project, so I don't have enough GPU resources to run a lot of experiments. I appreciate any kind of feedback or contribution. Please feel free to make a pull request for small issues like bug fixes or experiment results.
If you have any questions, please [open an issue](https://github.com/deepaudio/deepaudio-speaker/issues). 128 | 129 | ## Acknowledge 130 | I borrow a lot of codes from [openspeech](https://github.com/openspeech-team/openspeech) and [pyannote-audio](https://github.com/pyannote/pyannote-audio) -------------------------------------------------------------------------------- /deepaudio/speaker/optim/scheduler/tri_stage_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import math 24 | import torch 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | from omegaconf import DictConfig 28 | from torch.optim import Optimizer 29 | 30 | from deepaudio.speaker.dataclass.configurations import LearningRateSchedulerConfigs 31 | from deepaudio.speaker.optim.scheduler import register_scheduler 32 | from deepaudio.speaker.optim.scheduler.lr_scheduler import LearningRateScheduler 33 | 34 | 35 | @dataclass 36 | class TriStageLRSchedulerConfigs(LearningRateSchedulerConfigs): 37 | scheduler_name: str = field( 38 | default="tri_stage", metadata={"help": "Name of learning rate scheduler."} 39 | ) 40 | init_lr: float = field( 41 | default=1e-7, metadata={"help": "Initial learning rate."} 42 | ) 43 | init_lr_scale: float = field( 44 | default=0.01, metadata={"help": "Initial learning rate scale."} 45 | ) 46 | final_lr_scale: float = field( 47 | default=0.01, metadata={"help": "Final learning rate scale"} 48 | ) 49 | phase_ratio: str = field( 50 | default="(0.1, 0.4, 0.5)", metadata={"help": "Automatically sets warmup/hold/decay steps to the ratio " 51 | "specified here from max_updates. the ratios must add up to 1.0"} 52 | ) 53 | total_steps: int = field( 54 | default=400000, metadata={"help": "Total training steps."} 55 | ) 56 | 57 | 58 | @register_scheduler("tri_stage", dataclass=TriStageLRSchedulerConfigs) 59 | class TriStageLRScheduler(LearningRateScheduler): 60 | r""" 61 | Tri-Stage Learning Rate Scheduler. 
Implements the learning rate scheduler described in "SpecAugment" 62 | 63 | Similar to the inverse_square_root scheduler, but the tri_stage scheduler employs 64 | three-stage LR scheduling: 65 | 66 | - warmup stage, starting from `lr` * `init_lr_scale`, linearly 67 | increased to `lr` in `warmup_steps` iterations 68 | - hold stage, after `warmup_steps`, keep the LR at `lr` for `hold_steps` 69 | iterations 70 | - decay stage, after the hold stage, decay the LR exponentially to 71 | `lr` * `final_lr_scale` in `decay_steps`; 72 | after that, the LR is kept at `final_lr_scale` * `lr` 73 | 74 | During warmup:: 75 | init_lr = cfg.init_lr_scale * cfg.lr 76 | lrs = torch.linspace(init_lr, cfg.lr, cfg.warmup_steps) 77 | lr = lrs[update_num] 78 | 79 | During hold:: 80 | lr = cfg.lr 81 | 82 | During decay:: 83 | decay_factor = - math.log(cfg.final_lr_scale) / cfg.decay_steps 84 | lr = cfg.lr * exp(- (update_num - warmup_steps - hold_steps) * decay_factor) 85 | 86 | After that:: 87 | lr = cfg.lr * cfg.final_lr_scale 88 | 89 | Args: 90 | optimizer (Optimizer): wrapped optimizer. 91 | configs (DictConfig): configuration set. 92 | """ 93 | def __init__( 94 | self, 95 | optimizer: Optimizer, 96 | configs: DictConfig, 97 | ): 98 | super(TriStageLRScheduler, self).__init__(optimizer, configs.lr_scheduler.init_lr) 99 | 100 | self.phase_ratio = eval(configs.lr_scheduler.phase_ratio) 101 | 102 | self.warmup_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[0]) 103 | self.hold_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[1]) 104 | self.decay_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[2]) 105 | 106 | self.peak_lr = configs.lr_scheduler.lr 107 | self.init_lr = configs.lr_scheduler.init_lr_scale * configs.lr_scheduler.lr 108 | self.final_lr = configs.lr_scheduler.final_lr_scale * configs.lr_scheduler.lr 109 | 110 | self.warmup_rate = ( 111 | (self.peak_lr - self.init_lr) / self.warmup_steps 112 | if self.warmup_steps != 0 113 | else 0 114 | ) 115 | self.decay_factor = -math.log(configs.lr_scheduler.final_lr_scale) / self.decay_steps 116 | self.update_step = 0 117 | self.lr = self.init_lr 118 | 119 | def _decide_stage(self): 120 | if self.update_step < self.warmup_steps: 121 | return 0, self.update_step 122 | 123 | offset = self.warmup_steps 124 | 125 | if self.update_step < offset + self.hold_steps: 126 | return 1, self.update_step - offset 127 | 128 | offset += self.hold_steps 129 | 130 | if self.update_step <= offset + self.decay_steps: 131 | # decay stage 132 | return 2, self.update_step - offset 133 | 134 | offset += self.decay_steps 135 | 136 | return 3, self.update_step - offset 137 | 138 | def step(self, val_loss: Optional[torch.FloatTensor] = None): 139 | stage, steps_in_stage = self._decide_stage() 140 | 141 | if stage == 0: 142 | self.lr = self.init_lr + self.warmup_rate * steps_in_stage 143 | elif stage == 1: 144 | self.lr = self.peak_lr 145 | elif stage == 2: 146 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 147 | elif stage == 3: 148 | self.lr = self.final_lr 149 | else: 150 | raise ValueError("Undefined stage") 151 | 152 | self.set_lr(self.optimizer, self.lr) 153 | self.update_step += 1 154 | 155 | return self.lr 156 | --------------------------------------------------------------------------------
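The tri-stage schedule above can also be exercised on its own. The following is a minimal sketch (not part of the repository) that assumes the package is installed; it drives `TriStageLRScheduler` with a toy optimizer to inspect the warmup/hold/decay behaviour. The config keys mirror `TriStageLRSchedulerConfigs`, and all values are illustrative only. In normal use the scheduler is presumably selected through the Hydra override `lr_scheduler=tri_stage` and stepped by the training loop.

```python
import torch
from omegaconf import OmegaConf

from deepaudio.speaker.optim.scheduler.tri_stage_lr_scheduler import TriStageLRScheduler

# Illustrative values; in real training these come from the Hydra config.
configs = OmegaConf.create({
    "lr_scheduler": {
        "lr": 1e-3,                         # peak learning rate
        "init_lr": 1e-7,
        "init_lr_scale": 0.01,
        "final_lr_scale": 0.01,
        "phase_ratio": "(0.1, 0.4, 0.5)",   # warmup / hold / decay fractions of total_steps
        "total_steps": 1000,
    }
})

model = torch.nn.Linear(80, 192)            # stand-in for a speaker embedder
optimizer = torch.optim.SGD(model.parameters(), lr=configs.lr_scheduler.init_lr)
scheduler = TriStageLRScheduler(optimizer, configs)

for step in range(configs.lr_scheduler.total_steps):
    # ... forward, backward and optimizer.step() would happen here ...
    lr = scheduler.step()                   # updates the optimizer's lr and returns it
    if step % 200 == 0:
        print(f"step {step}: lr={lr:.2e}")
```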