├── egs ├── __init__.py ├── voxceleb │ ├── v1 │ │ ├── __init__.py │ │ ├── nnet │ │ │ ├── __init__.py │ │ │ ├── lib │ │ │ │ ├── __init__.py │ │ │ │ ├── make_checkpoint.py │ │ │ │ ├── train_insight.py │ │ │ │ ├── train_mt_lr_learning.py │ │ │ │ ├── train_vae_lr_learning.py │ │ │ │ ├── train_lr_learning.py │ │ │ │ └── finetune_lr_learning.py │ │ │ ├── wrap │ │ │ │ ├── extract_wrapper.sh │ │ │ │ ├── extract_mt_wrapper.sh │ │ │ │ ├── extract_mi_wrapper.sh │ │ │ │ └── extract_mt_phone_wrapper.sh │ │ │ ├── run_finetune_lr_learning.sh │ │ │ ├── run_train_lr_learning.sh │ │ │ ├── run_finetune_nnet.sh │ │ │ ├── run_train_mi_nnet.sh │ │ │ ├── run_train_nnet.sh │ │ │ ├── run_train_mt_nnet.sh │ │ │ ├── run_extract_mt_phone_embeddings.sh │ │ │ └── run_extract_embeddings_no_vad.sh │ │ ├── slurm_conf │ │ │ └── slurm.conf │ │ ├── path.sh │ │ ├── nnet_conf │ │ │ ├── tdnn_softmax_1e-2.json │ │ │ ├── tdnn_asoftmax_m1_linear_bn_1e-2.json │ │ │ ├── tdnn_asoftmax_m2_linear_bn_1e-2.json │ │ │ ├── tdnn_asoftmax_m4_linear_bn_1e-2.json │ │ │ ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json │ │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json │ │ │ ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json │ │ │ ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json │ │ │ ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json │ │ │ ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json │ │ │ ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json │ │ │ ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json │ │ │ ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json │ │ │ ├── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json │ │ │ ├── tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json │ │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json │ │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json │ │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json │ │ │ └── tdnn_amsoftmax_m0.20_linear_bn_1e-2_tdnn4_att.json │ │ └── cmd.sh │ └── v2_unfinished │ │ ├── path.sh │ │ ├── nnet_conf │ │ ├── tdnn_softmax_1e-2.json │ │ ├── tdnn_asoftmax_m1_linear_bn_1e-2.json │ │ ├── 
tdnn_asoftmax_m2_linear_bn_1e-2.json │ │ ├── tdnn_asoftmax_m4_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json │ │ └── tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json │ │ └── cmd.sh ├── sre │ └── v1 │ │ ├── cmd.sh │ │ ├── path.sh │ │ └── nnet_conf │ │ ├── test.json │ │ ├── tdnn_softmax_1e-4.json │ │ ├── tdnn_softmax_1e-6.json │ │ ├── tdnn_softmax_1e-2.json │ │ ├── tdnn_asoftmax_m1_linear_bn.json │ │ ├── tdnn_asoftmax_m2_linear_bn_1e-2.json │ │ ├── tdnn_asoftmax_m4_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.10_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json │ │ └── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json └── fisher │ ├── v1 │ ├── cmd.sh │ ├── path.sh │ ├── nnet_conf │ │ ├── learning_rate_decay_45 │ │ ├── test.json │ │ ├── tdnn_softmax_1e-2.json │ │ ├── tdnn_softmax.json │ │ ├── tdnn_asoftmax_m1_1e-2.json │ │ ├── tdnn_asoftmax_m2_linear_bn_1e-2.json │ │ ├── 
tdnn_asoftmax_m4_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.10_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json │ │ ├── tdnn_amsoftmax_m0.45_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json │ │ ├── tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json │ │ ├── tdnn_softmax_tdnn4_att.json │ │ ├── tdnn_softmax_tdnn4_att_2.json │ │ ├── tdnn_softmax_tdnn4_att_3.json │ │ └── tdnn_softmax_tdnn4_att_4.json │ ├── eval_cos.sh │ └── eval_plda.sh │ └── v3 │ ├── cmd.sh │ ├── path.sh │ ├── eval_cos.sh │ ├── nnet_conf │ ├── tdnn_softmax_1e-2.json │ ├── mt_softmax.json │ ├── mt_softmax_2.json │ ├── mt_softmax_3.json │ ├── mt_softmax_4.json │ ├── mt_softmax_5.json │ ├── mt_softmax_6.json │ ├── mt_softmax_7.json │ ├── mt_softmax_8.2.json │ ├── mt_softmax_8.json │ └── mt_softmax_8.3.json │ └── eval_plda.sh ├── misc ├── __init__.py ├── ._.DS_Store ├── tuning │ ├── target_logit_curve │ ├── target_logit_curve.pdf │ ├── asoftmax_lambda_tuning.m │ ├── target_logit_curve.py │ └── tune_lr.m ├── DETware_v2.1 │ ├── compute_det.sh │ ├── thick.m │ ├── readme.txt │ ├── Set_DCF.m │ ├── Comp_Det.m │ ├── Eval_Spkr_Det.m │ ├── Get_DCF.m │ ├── Min_DCF.m │ └── Set_DET_limits.m └── tools │ ├── score_distribution.m │ └── sample_validset_spk2utt.py ├── model ├── __init__.py ├── multitask_v1 │ ├── __init__.py │ ├── common.py │ └── pooling.py └── ._.DS_Store ├── dataset ├── __init__.py ├── multitask │ └── __init__.py └── ._.DS_Store ├── scripts ├── diagnostic │ ├── wer_hyp_filter │ ├── wer_ref_filter │ └── wer_output_filter ├── 
prepare_pdf_for_multitask_egs.sh ├── prepare_bnfeats_for_egs.sh ├── prepare_feats_for_multitask_egs.sh ├── lmrescore_const_arpa.sh └── extract_bnf.sh ├── CHANGELOG.md └── .gitignore /egs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset/multitask/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/multitask_v1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egs/sre/v1/cmd.sh: 
-------------------------------------------------------------------------------- 1 | export train_cmd="run.pl" 2 | export cuda_cmd="run.pl" -------------------------------------------------------------------------------- /egs/fisher/v1/cmd.sh: -------------------------------------------------------------------------------- 1 | export train_cmd="run.pl" 2 | export cuda_cmd="run.pl" -------------------------------------------------------------------------------- /egs/fisher/v3/cmd.sh: -------------------------------------------------------------------------------- 1 | export train_cmd="run.pl" 2 | export cuda_cmd="run.pl" -------------------------------------------------------------------------------- /misc/._.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/misc/._.DS_Store -------------------------------------------------------------------------------- /dataset/._.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/dataset/._.DS_Store -------------------------------------------------------------------------------- /model/._.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/model/._.DS_Store -------------------------------------------------------------------------------- /misc/tuning/target_logit_curve: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/misc/tuning/target_logit_curve -------------------------------------------------------------------------------- /misc/tuning/target_logit_curve.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/misc/tuning/target_logit_curve.pdf -------------------------------------------------------------------------------- /scripts/diagnostic/wer_hyp_filter: -------------------------------------------------------------------------------- 1 | #!/bin/sed -f 2 | s:::g 3 | s:::g 4 | s:::g 5 | s/://g 6 | s/\*//g 7 | s/-HOLDER/HOLDER/g 8 | s/COMPAIGN/CAMPAIGN/g 9 | s/APPROACHES-/APPROACHES/g 10 | s/RESEACHERS/RESEARCHERS/g 11 | 12 | -------------------------------------------------------------------------------- /scripts/diagnostic/wer_ref_filter: -------------------------------------------------------------------------------- 1 | #!/bin/sed -f 2 | s:::g 3 | s:::g 4 | s:::g 5 | s/://g 6 | s/\*//g 7 | s/-HOLDER/HOLDER/g 8 | s/COMPAIGN/CAMPAIGN/g 9 | s/APPROACHES-/APPROACHES/g 10 | s/RESEACHERS/RESEARCHERS/g 11 | 12 | -------------------------------------------------------------------------------- /scripts/diagnostic/wer_output_filter: -------------------------------------------------------------------------------- 1 | #!/bin/sed -f 2 | s:::g 3 | s:::g 4 | s:::g 5 | s/://g 6 | s/\*//g 7 | s/-HOLDER/HOLDER/g 8 | s/COMPAIGN/CAMPAIGN/g 9 | s/APPROACHES-/APPROACHES/g 10 | s/RESEACHERS/RESEARCHERS/g 11 | 12 | -------------------------------------------------------------------------------- /misc/DETware_v2.1/compute_det.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# != 1 ]; then 4 | echo "Usage: $0 " 5 | echo "" 6 | exit 100 7 | fi 8 | 9 | score=$1 10 | 11 | grep ' target' $score > ${score}.tar 12 | grep ' nontarget' $score > ${score}.imp 13 | 14 | -------------------------------------------------------------------------------- /misc/DETware_v2.1/thick.m: -------------------------------------------------------------------------------- 1 | function [lh] = thick(w,lh) 2 | % THICK chages the width of the lines references by habdles 3 | % lh, the line 
handles 4 | % w, new width (default is 0.5) 5 | % Example usage: thick(2,plot([1:5],[1,0,1,0,1],'b')) 6 | 7 | for i=1:length(lh) 8 | set (lh(i),'LineWidth',w); 9 | end 10 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [TODO] 4 | - Train on VoxCeleb 2 dev set and test on VoxCeleb 1. 5 | - Test attention component. 6 | - Other pooling strategy. (Utterance-level Aggregation For Speaker Recognition In The Wild) 7 | - Add multitask_v1 learning. 8 | 9 | 10 | ## [First version] 11 | - Basic x-vector pipeline. 12 | - Large margin softmax loss. 13 | -------------------------------------------------------------------------------- /misc/tools/score_distribution.m: -------------------------------------------------------------------------------- 1 | 2 | tar = load('score.target.amsoftmax'); 3 | nontar = load('score.nontarget.amsoftmax'); 4 | 5 | [n_tar, c_tar] = hist(tar, 30); 6 | n_tar = n_tar / sum(n_tar); 7 | [n_nontar, c_nontar] = hist(nontar, 30); 8 | n_nontar = n_nontar / sum(n_nontar); 9 | 10 | plot(c_tar, n_tar, 'r--'); 11 | hold on; 12 | plot(c_nontar, n_nontar, 'b--'); -------------------------------------------------------------------------------- /misc/tuning/asoftmax_lambda_tuning.m: -------------------------------------------------------------------------------- 1 | clear 2 | 3 | step = 1:1000000; 4 | 5 | lambda_min = 10; 6 | lambda_base = 1000; 7 | gamma = 0.00001; 8 | lambda_power = 5; 9 | 10 | lambda = max(lambda_min, lambda_base * (1 + gamma * step).^(-lambda_power)); 11 | fa = 1.0 ./ (1.0 + lambda); 12 | figure 13 | plot(step, lambda); 14 | xlim([0 800000]) 15 | ylim([0 100]) 16 | figure(); 17 | plot(step, fa); -------------------------------------------------------------------------------- /egs/voxceleb/v1/slurm_conf/slurm.conf: 
-------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts 7 | default gpu=0 8 | option gpu=0 -p r830all.q 9 | option gpu=* -p c4130all.q --gres=gpu:$0 -------------------------------------------------------------------------------- /egs/sre/v1/path.sh: -------------------------------------------------------------------------------- 1 | # The virtualenv path 2 | export TF_ENV=/home/heliang05/liuyi/venv 3 | 4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker 5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu 6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH 7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 8 | . $KALDI_ROOT/tools/config/common_path.sh 9 | export LC_ALL=C -------------------------------------------------------------------------------- /egs/fisher/v1/path.sh: -------------------------------------------------------------------------------- 1 | # The virtualenv path 2 | export TF_ENV=/home/heliang05/liuyi/venv 3 | 4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker 5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu 6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH 7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 8 | . 
$KALDI_ROOT/tools/config/common_path.sh 9 | export LC_ALL=C -------------------------------------------------------------------------------- /egs/fisher/v3/path.sh: -------------------------------------------------------------------------------- 1 | # The virtualenv path 2 | export TF_ENV=/home/heliang05/liuyi/venv 3 | 4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker 5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu 6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH 7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 8 | . $KALDI_ROOT/tools/config/common_path.sh 9 | export LC_ALL=C -------------------------------------------------------------------------------- /egs/voxceleb/v1/path.sh: -------------------------------------------------------------------------------- 1 | # The virtualenv path 2 | export TF_ENV=/home/heliang05/liuyi/venv 3 | 4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker 5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu 6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH 7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 8 | . 
$KALDI_ROOT/tools/config/common_path.sh 9 | export LC_ALL=C 10 | -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/path.sh: -------------------------------------------------------------------------------- 1 | # The virtualenv path 2 | export TF_ENV=/home/heliang05/liuyi/venv 3 | 4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker 5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu 6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH 7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 8 | . $KALDI_ROOT/tools/config/common_path.sh 9 | export LC_ALL=C 10 | -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/learning_rate_decay_45: -------------------------------------------------------------------------------- 1 | 0.005 2 | 0.005 3 | 0.005 4 | 0.005 5 | 0.005 6 | 0.005 7 | 0.0025 8 | 0.0025 9 | 0.0025 10 | 0.0025 11 | 0.00125 12 | 0.00125 13 | 0.00125 14 | 0.00125 15 | 0.000625 16 | 0.000625 17 | 0.000625 18 | 0.000625 19 | 0.0003125 20 | 0.0003125 21 | 0.0003125 22 | 0.00015625 23 | 0.00015625 24 | 0.00015625 25 | 0.00007813 26 | 0.00007813 27 | 0.00007813 28 | 0.00003906 29 | 0.00003906 30 | 0.00003906 31 | 0.00001953 32 | 0.00001953 33 | 0.00001953 34 | 0.00000977 35 | 0.00000977 36 | 0.00000977 37 | 0.00000488 38 | 0.00000488 39 | 0.00000488 40 | 0.00000244 41 | 0.00000244 42 | 0.00000244 43 | 0.00000122 44 | 0.00000122 45 | 0.00000122 46 | 0 -------------------------------------------------------------------------------- /misc/DETware_v2.1/readme.txt: -------------------------------------------------------------------------------- 1 | 2 | NIST is making available these matlab files to be used to produce 3 | Detection Error Trade-off curves with the matlab software package. 
4 | 5 | For a basic test of these matlab scripts, start matlab and type the 6 | command: "Eval_Spkr_Det" 7 | 8 | This script reads the files: 9 | A) true_speaker_scores 10 | B) impostor_scores 11 | 12 | and produces a sample DET curve. 13 | 14 | By making the appropriate changes to the files true_speaker_scores and 15 | impostor_scores, you can produce your own DET-curves with: 16 | "Eval_Spkr_Det" 17 | 18 | Run "DET_usage" for a more detailed demonstration of the capabilities 19 | of these scripts. 20 | 21 | -------------------------------------------------------------------------------- /misc/DETware_v2.1/Set_DCF.m: -------------------------------------------------------------------------------- 1 | function Set_DCF (Cmiss, Cfa, Ptrue) 2 | %function Set_DCF (Cmiss, Cfa, Ptrue) initializes the detection 3 | %cost function (DCF) parameters. The detection cost function is 4 | %defined as: 5 | % 6 | % DCF = Cmiss * Pmiss * Ptrue + Cfa * Pfa * Pfalse 7 | % 8 | % DCF is a function of Pmiss and Pfa, the miss and false alarm 9 | % probabilities. The DCF parameters are: 10 | % 11 | % Cmiss, the cost of a miss, 12 | % Cfa, the cost of a false alarm, 13 | % Ptrue, the a priori probability of the target, and 14 | % Pfalse, = 1 - Ptrue. 15 | % 16 | % See DET_usage for an example of how to use Set_DCF. 17 | 18 | global DCF_parameters 19 | DCF_parameters = [Cmiss, Cfa, Ptrue, 1-Ptrue]; 20 | 21 | -------------------------------------------------------------------------------- /scripts/prepare_pdf_for_multitask_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f path.sh ]; then . ./path.sh; fi 4 | . parse_options.sh || exit 1; 5 | 6 | if [ $# != 1 ]; then 7 | echo "Usage: $0 " 8 | echo "e.g.: $0 data/train exp/tri5a_ali" 9 | exit 1; 10 | fi 11 | 12 | dir=$1 13 | 14 | for f in $dir/ali.1.gz $dir/final.mdl ; do 15 | [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; 16 | done 17 | 18 | num_ali_jobs=$(cat $dir/num_jobs) || exit 1; 19 | for id in $(seq $num_ali_jobs); do gunzip -c $dir/ali.$id.gz; done | \ 20 | ali-to-pdf $dir/final.mdl ark:- ark,scp:$dir/pdf.ark,$dir/pdf.scp || exit 1; 21 | 22 | # TODO: pdf to phones? pdf to phone classes? pdf to ali? We may need to get other types of alignments. 23 | 24 | exit 0 -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/lib/make_checkpoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from misc.utils import get_checkpoint 5 | from misc.utils import Params 6 | import tensorflow as tf 7 | 8 | if __name__ == '__main__': 9 | tf.logging.set_verbosity(tf.logging.INFO) 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("-c", "--checkpoint", type=str, default='-1', 12 | help="The checkpoint to load. The default is to load the BEST checkpoint (according to valid_loss).") 13 | parser.add_argument("model_dir", type=str, help="The model directory.") 14 | args = parser.parse_args() 15 | checkpoint = get_checkpoint(os.path.join(args.model_dir, "nnet"), args.checkpoint) 16 | print("Set the checkpoint to %s" % checkpoint) 17 | -------------------------------------------------------------------------------- /egs/fisher/v1/eval_cos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nnetdir=$1 4 | 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null) 6 | echo "EER: ${eer}%" 7 | 8 | paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_cos.new 9 | grep ' target$' $nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.target 10 | grep ' nontarget$' 
$nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.nontarget 11 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_cos.target', '$nnetdir/xvector_scores_hires/test_cos.nontarget', '$nnetdir/xvector_scores_hires/test_cos.result')"` 12 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null 13 | tail -n 1 $nnetdir/xvector_scores_hires/test_cos.result -------------------------------------------------------------------------------- /egs/fisher/v3/eval_cos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nnetdir=$1 4 | 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null) 6 | echo "EER: ${eer}%" 7 | 8 | paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_cos.new 9 | grep ' target$' $nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.target 10 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.nontarget 11 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_cos.target', '$nnetdir/xvector_scores_hires/test_cos.nontarget', '$nnetdir/xvector_scores_hires/test_cos.result')"` 12 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null 13 | tail -n 1 $nnetdir/xvector_scores_hires/test_cos.result -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/wrap/extract_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env= 4 | gpuid=-1 5 | min_chunk_size=25 6 | chunk_size=10000 7 | normalize=false 8 | node="output" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . 
parse_options.sh || exit 1; 12 | 13 | if [ $# != 3 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --gpuid <-1>" 17 | echo " --min-chunk-size <25>" 18 | echo " --chunk-size <10000>" 19 | echo " --normalize " 20 | echo " --node " 21 | echo "" 22 | exit 100 23 | fi 24 | 25 | nnetdir=$1 26 | feat=$2 27 | dir=$3 28 | 29 | if [ ! -z $env ]; then 30 | source $TF_ENV/$env/bin/activate 31 | fi 32 | 33 | if $normalize; then 34 | cmdopt_norm="--normalize" 35 | fi 36 | 37 | export PYTHONPATH=`pwd`/../../:$PYTHONPATH 38 | 39 | python nnet/lib/extract.py --gpu $gpuid --node $node --min-chunk-size $min_chunk_size --chunk-size $chunk_size $cmdopt_norm\ 40 | "$nnetdir" "$feat" "$dir" 41 | deactivate -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | "learning_rate": 0.02, 11 | "optimizer": "momentum", 12 | "momentum": 0.9, 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-4, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 16000, 21 | "reduce_lr_epochs": 1, 22 | "show_training_progress": 200, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 8000, 25 | "save_checkpoints_steps": 16000, 26 | "valid_max_iterations": 5000, 27 | 28 | "num_parallel_datasets": 4, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 100, 33 | "max_segment_len": 300, 34 | 35 | "early_stop_epochs": 6, 36 | "min_learning_rate": 1e-6 37 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/test.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | "learning_rate": 0.001, 11 | "optimizer": "momentum", 12 | "momentum": 0.9, 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-2, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 60000, 21 | "reduce_lr_epochs": 4, 22 | "show_training_progress": 200, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 20000, 25 | "save_checkpoints_steps": 30000, 26 | "valid_max_iterations": 5000, 27 | 28 | "num_parallel_datasets": 8, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 200, 33 | "max_segment_len": 400, 34 | 35 | "early_stop_epochs": 6, 36 | "min_learning_rate": 1e-6 37 | } -------------------------------------------------------------------------------- /misc/DETware_v2.1/Comp_Det.m: -------------------------------------------------------------------------------- 1 | %------------------------------ 2 | %load speaker detection output scores 3 | load true_speaker_scores 4 | load impostor_scores 5 | 6 | %------------------------------ 7 | %initialize the DCF parameters 8 | Set_DCF (10, 1, 0.01); 9 | 10 | %------------------------------ 11 | %compute Pmiss and Pfa from experimental detection output scores 12 | [P_miss,P_fa] = Compute_DET (true_speaker_scores, impostor_scores); 13 | 14 | %------------------------------ 15 | %plot results 16 | 17 | % Set tic marks 18 | Pmiss_min = 0.01; 19 | Pmiss_max = 0.45; 20 | Pfa_min = 0.01; 21 | Pfa_max = 0.45; 22 | Set_DET_limits(Pmiss_min,Pmiss_max,Pfa_min,Pfa_max); 23 | 24 | %call figure, plot DET-curve 25 | figure; 26 | Plot_DET (P_miss, P_fa,'r'); 27 | title ('Speaker 
Detection Performance'); 28 | hold on; 29 | 30 | %find lowest cost point and plot 31 | C_miss = 1; 32 | C_fa = 1; 33 | P_target = 0.5; 34 | Set_DCF(C_miss,C_fa,P_target); 35 | [DCF_opt Popt_miss Popt_fa] = Min_DCF(P_miss,P_fa); 36 | Plot_DET (Popt_miss,Popt_fa,'ko'); -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/wrap/extract_mt_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env= 4 | gpuid=-1 5 | min_chunk_size=25 6 | chunk_size=10000 7 | normalize=false 8 | node="output" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . parse_options.sh || exit 1; 12 | 13 | if [ $# != 4 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --gpuid <-1>" 17 | echo " --min-chunk-size <25>" 18 | echo " --chunk-size <10000>" 19 | echo " --normalize " 20 | echo " --node " 21 | echo "" 22 | exit 100 23 | fi 24 | 25 | nnetdir=$1 26 | feat=$2 27 | ali=$3 28 | dir=$4 29 | 30 | if [ ! 
-z $env ]; then 31 | source $TF_ENV/$env/bin/activate 32 | fi 33 | 34 | if $normalize; then 35 | cmdopt_norm="--normalize" 36 | fi 37 | 38 | export PYTHONPATH=`pwd`/../../:$PYTHONPATH 39 | 40 | python nnet/lib/extract_mt.py --gpu $gpuid --node $node --min-chunk-size $min_chunk_size --chunk-size $chunk_size $cmdopt_norm\ 41 | "$nnetdir" "$feat" "$ali" "$dir" 42 | deactivate 43 | -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_softmax_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | "learning_rate": 0.005, 11 | "optimizer": "momentum", 12 | "momentum": 0.9, 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-2, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 16000, 21 | "reduce_lr_epochs": 3, 22 | "show_training_progress": 200, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 8000, 25 | "save_checkpoints_steps": 16000, 26 | "valid_max_iterations": 5000, 27 | 28 | "num_parallel_datasets": 4, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 100, 33 | "max_segment_len": 300, 34 | 35 | "early_stop_epochs": 8, 36 | "min_learning_rate": 1e-6 37 | } -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/tdnn_softmax_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | 
"learning_rate": 0.005, 11 | "optimizer": "momentum", 12 | "momentum": 0.9, 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-2, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 7000, 21 | "reduce_lr_epochs": 3, 22 | "show_training_progress": 200, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 3500, 25 | "save_checkpoints_steps": 7000, 26 | "valid_max_iterations": 1000, 27 | 28 | "num_parallel_datasets": 4, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 100, 33 | "max_segment_len": 300, 34 | 35 | "early_stop_epochs": 8, 36 | "min_learning_rate": 1e-6 37 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_softmax_1e-4.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | "learning_rate": 0.001, 11 | "optimizer": "momentum", 12 | "momentum": 0.9, 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-4, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 60000, 21 | "reduce_lr_epochs": 4, 22 | "show_training_progress": 200, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 20000, 25 | "save_checkpoints_steps": 30000, 26 | "valid_max_iterations": 5000, 27 | 28 | "num_parallel_datasets": 16, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 200, 33 | "max_segment_len": 400, 34 | 35 | "early_stop_epochs": 6, 36 | "min_learning_rate": 1e-6 37 | } -------------------------------------------------------------------------------- 
/egs/sre/v1/nnet_conf/tdnn_softmax_1e-6.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | "learning_rate": 0.001, 11 | "optimizer": "momentum", 12 | "momentum": 0.9, 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-6, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 60000, 21 | "reduce_lr_epochs": 4, 22 | "show_training_progress": 200, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 20000, 25 | "save_checkpoints_steps": 30000, 26 | "valid_max_iterations": 5000, 27 | 28 | "num_parallel_datasets": 16, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 200, 33 | "max_segment_len": 400, 34 | 35 | "early_stop_epochs": 6, 36 | "min_learning_rate": 1e-6 37 | } -------------------------------------------------------------------------------- /misc/DETware_v2.1/Eval_Spkr_Det.m: -------------------------------------------------------------------------------- 1 | %------------------------------ 2 | %load speaker detection output scores 3 | load true_speaker_scores 4 | load impostor_scores 5 | 6 | %------------------------------ 7 | %initialize the DCF parameters 8 | Set_DCF (10, 1, 0.01); 9 | 10 | %------------------------------ 11 | %compute Pmiss and Pfa from experimental detection output scores 12 | [P_miss,P_fa] = Compute_DET (true_speaker_scores, impostor_scores); 13 | 14 | %------------------------------ 15 | %plot results 16 | 17 | % Set tic marks 18 | Pmiss_min = 0.01; 19 | Pmiss_max = 0.45; 20 | Pfa_min = 0.01; 21 | Pfa_max = 0.45; 22 | Set_DET_limits(Pmiss_min,Pmiss_max,Pfa_min,Pfa_max); 23 | 24 | %call figure, plot DET-curve 25 | figure; 26 
| Plot_DET (P_miss, P_fa,'r'); 27 | title ('Speaker Detection Performance'); 28 | hold on; 29 | 30 | %find lowest cost point and plot 31 | C_miss = 1; 32 | C_fa = 1; 33 | P_target = 0.5; 34 | Set_DCF(C_miss,C_fa,P_target); 35 | [DCF_opt Popt_miss Popt_fa] = Min_DCF(P_miss,P_fa); 36 | Plot_DET (Popt_miss,Popt_fa,'ko'); 37 | 38 | -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_softmax_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | "learning_rate": 0.001, 11 | "optimizer": "momentum", 12 | "momentum": 0.9, 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-2, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 60000, 21 | "reduce_lr_epochs": 1, 22 | "show_training_progress": 200, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 20000, 25 | "save_checkpoints_steps": 30000, 26 | "valid_max_iterations": 5000, 27 | 28 | "num_parallel_datasets": 8, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 200, 33 | "max_segment_len": 400, 34 | 35 | "early_stop_epochs": 6, 36 | "min_learning_rate": 1e-6 37 | } 38 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/wrap/extract_mi_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env= 4 | gpuid=-1 5 | min_chunk_size=25 6 | chunk_size=10000 7 | normalize=false 8 | node="output" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . 
parse_options.sh || exit 1; 12 | 13 | if [ $# != 4 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --gpuid <-1>" 17 | echo " --min-chunk-size <25>" 18 | echo " --chunk-size <10000>" 19 | echo " --normalize " 20 | echo " --node " 21 | echo "" 22 | exit 100 23 | fi 24 | 25 | nnetdir=$1 26 | feat=$2 27 | feat_aux=$3 28 | dir=$4 29 | 30 | if [ ! -z $env ]; then 31 | source $TF_ENV/$env/bin/activate 32 | fi 33 | 34 | if $normalize; then 35 | cmdopt_norm="--normalize" 36 | fi 37 | 38 | export PYTHONPATH=`pwd`/../../:$PYTHONPATH 39 | 40 | python nnet/lib/extract_mi.py --gpu $gpuid --node $node --min-chunk-size $min_chunk_size --chunk-size $chunk_size $cmdopt_norm \ 41 | "$nnetdir" "$feat" "$feat_aux" "$dir" 42 | deactivate -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/wrap/extract_mt_phone_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env= 4 | gpuid=-1 5 | min_chunk_size=25 6 | chunk_size=10000 7 | normalize=false 8 | node="output" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . parse_options.sh || exit 1; 12 | 13 | if [ $# != 4 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --gpuid <-1>" 17 | echo " --min-chunk-size <25>" 18 | echo " --chunk-size <10000>" 19 | echo " --normalize " 20 | echo " --node " 21 | echo "" 22 | exit 100 23 | fi 24 | 25 | nnetdir=$1 26 | feat=$2 27 | ali=$3 28 | dir=$4 29 | 30 | if [ ! 
-z $env ]; then 31 | source $TF_ENV/$env/bin/activate 32 | fi 33 | 34 | if $normalize; then 35 | cmdopt_norm="--normalize" 36 | fi 37 | 38 | export PYTHONPATH=`pwd`/../../:$PYTHONPATH 39 | 40 | python nnet/lib/extract_mt_phone.py --gpu $gpuid --node $node --min-chunk-size $min_chunk_size --chunk-size $chunk_size $cmdopt_norm\ 41 | "$nnetdir" "$feat" "$ali" "$dir" 42 | deactivate 43 | -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_softmax.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | "pooling_type": "statistics_pooling", 8 | "embedding_node": "tdnn6_dense", 9 | 10 | "learning_rate": 0.01, 11 | "Another option": "learning_rate=0.001, optimizer=momentum, momentum=0.9", 12 | 13 | "use_nesterov": false, 14 | "clip_gradient": false, 15 | 16 | "weight_l2_regularizer": 1e-4, 17 | "batchnorm_momentum": 0.99, 18 | 19 | "num_epochs": 100, 20 | "num_steps_per_epoch": 50000, 21 | "reduce_lr_epochs": 4, 22 | "show_training_progress": 100, 23 | "keep_checkpoint_max": 100, 24 | "save_summary_steps": 10000, 25 | "save_checkpoints_steps": 50000, 26 | "valid_max_iterations": 5000, 27 | 28 | "num_parallel_datasets": 8, 29 | "max_queue_size": 10, 30 | "num_speakers_per_batch": 64, 31 | "num_segments_per_speaker": 1, 32 | "min_segment_len": 200, 33 | "max_segment_len": 400, 34 | 35 | "early_stop_epochs": 8, 36 | "min_learning_rate": 1e-6 37 | } -------------------------------------------------------------------------------- /misc/DETware_v2.1/Get_DCF.m: -------------------------------------------------------------------------------- 1 | function [eer, dcf08, dcf10, dcf12] = Get_DCF(target, imposter, output) 2 | 3 | tar = load(target); 4 | non = load(imposter); 5 | 6 | lim = [0.0001 0.95]; 7 | Set_DET_limits(lim(1), lim(2), lim(1), lim(2)); 8 | 
9 | % EER 10 | [Pmiss, Pfa, eer] = Compute_DET(tar, non); 11 | 12 | % DCF08 for DCF12 13 | Set_DCF(1, 1, 0.01); 14 | [DCF_opt, Popt_miss, Popt_fa] = Min_DCF(Pmiss, Pfa); 15 | dcf08 = DCF_opt * 100; 16 | 17 | % DCF10 for DCF12 18 | Set_DCF(1, 1, 0.001); 19 | [DCF_opt, Popt_miss, Popt_fa] = Min_DCF(Pmiss, Pfa); 20 | Plot_DET(Popt_miss, max(Popt_fa, lim(1)), 'ro', 2); 21 | dcf10 = DCF_opt * 1000; 22 | 23 | % DCF12 24 | dcf12 = (dcf08 + dcf10) / 2; 25 | 26 | % DCF08 27 | Set_DCF(10, 1, 0.01); 28 | [DCF_opt, Popt_miss, Popt_fa] = Min_DCF(Pmiss, Pfa); 29 | dcf08 = DCF_opt; 30 | 31 | % DCF10 32 | Set_DCF(1, 1, 0.001); 33 | [DCF_opt, Popt_miss, Popt_fa] = Min_DCF(Pmiss, Pfa); 34 | dcf10 = DCF_opt * 1000; 35 | 36 | fid = fopen(output, 'a'); 37 | fprintf(fid, 'eer: %5.4f%%; mindcf08: %5.4f%%; mindcf10: %5.4f%%; mindcf12: %5.4f%%\n', eer*100, dcf08, dcf10, dcf12); 38 | fclose(fid); -------------------------------------------------------------------------------- /misc/DETware_v2.1/Min_DCF.m: -------------------------------------------------------------------------------- 1 | function [min_cost, Pmiss_opt, Pfa_opt] = Min_DCF(Pmiss, Pfa) 2 | %function [min_cost, Pmiss_opt, Pfa_opt] = Min_DCF(Pmiss, Pfa) 3 | % 4 | % Min_DCF finds and returns the minimum value of the detection 5 | % cost function for a given detection error trade-off curve. 6 | % 7 | % Pmiss and Pfa are the correcponding miss and false alarm 8 | % trade-off probabilities. 9 | % 10 | % 11 | % See DET_usage for an example of how to use Min_DCF. 
12 | 13 | global DCF_parameters 14 | 15 | if isempty(DCF_parameters) 16 | error ('call Set_DCF to define DCF parameters before calling Min_DCF'); 17 | end 18 | 19 | Cmiss = DCF_parameters(1); 20 | Cfa = DCF_parameters(2); 21 | Ptrue = DCF_parameters(3); 22 | Pfalse = DCF_parameters(4); 23 | 24 | npts = max(size(Pmiss)); 25 | if npts ~= max(size(Pfa)) 26 | error ('vector size of Pmiss and Pfa not equal in call to Plot_DET'); 27 | end 28 | 29 | %------------------------- 30 | %Find DCF_best: 31 | 32 | DCF_vector = Cmiss * Pmiss * Ptrue + Cfa * Pfa * Pfalse; 33 | [min_cost min_ptr] = min (DCF_vector); 34 | Pmiss_opt = Pmiss(min_ptr(1)) ; 35 | Pfa_opt = Pfa(min_ptr(1)); 36 | 37 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_softmax_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "Note": "If the weight decay is 1e-2, vanilla SGD (0.01) is used. If momentum=0.9, reducing the learning rate to 0.001 may be better", 3 | 4 | "seed": 0, 5 | "network_type": "tdnn", 6 | "last_layer_linear": false, 7 | "loss_func": "softmax", 8 | "batch_type": "softmax", 9 | "pooling_type": "statistics_pooling", 10 | "embedding_node": "tdnn6_dense", 11 | 12 | "learning_rate": 0.01, 13 | "Another option": "learning_rate=0.001, optimizer=momentum, momentum=0.9", 14 | 15 | "use_nesterov": false, 16 | "clip_gradient": false, 17 | 18 | "weight_l2_regularizer": 1e-2, 19 | "batchnorm_momentum": 0.99, 20 | 21 | "num_epochs": 100, 22 | "num_steps_per_epoch": 30000, 23 | "reduce_lr_epochs": 4, 24 | "show_training_progress": 100, 25 | "keep_checkpoint_max": 100, 26 | "save_summary_steps": 10000, 27 | "save_checkpoints_steps": 30000, 28 | "valid_max_iterations": 1000, 29 | 30 | "num_parallel_datasets": 16, 31 | "max_queue_size": 10, 32 | "num_speakers_per_batch": 64, 33 | "num_segments_per_speaker": 1, 34 | "min_segment_len": 200, 35 | "max_segment_len": 400, 36 | 37 |
"early_stop_epochs": 10, 38 | "min_learning_rate": 1e-6 39 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_softmax_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "Note": "If the weight decay is 1e-2, vanilla SGD (0.01) is used. If momentum=0.9, reducing the learning rate to 0.001 may be better", 3 | 4 | "seed": 0, 5 | "network_type": "tdnn", 6 | "last_layer_linear": false, 7 | "loss_func": "softmax", 8 | "batch_type": "softmax", 9 | "pooling_type": "statistics_pooling", 10 | "embedding_node": "tdnn6_dense", 11 | 12 | "learning_rate": 0.01, 13 | "Another option": "learning_rate=0.001, optimizer=momentum, momentum=0.9", 14 | 15 | "use_nesterov": false, 16 | "clip_gradient": false, 17 | 18 | "weight_l2_regularizer": 1e-2, 19 | "batchnorm_momentum": 0.99, 20 | 21 | "num_epochs": 100, 22 | "num_steps_per_epoch": 30000, 23 | "reduce_lr_epochs": 4, 24 | "show_training_progress": 100, 25 | "keep_checkpoint_max": 100, 26 | "save_summary_steps": 10000, 27 | "save_checkpoints_steps": 30000, 28 | "valid_max_iterations": 1000, 29 | 30 | "num_parallel_datasets": 16, 31 | "max_queue_size": 10, 32 | "num_speakers_per_batch": 64, 33 | "num_segments_per_speaker": 1, 34 | "min_segment_len": 200, 35 | "max_segment_len": 400, 36 | 37 | "early_stop_epochs": 10, 38 | "min_learning_rate": 1e-6 39 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_asoftmax_m1_linear_bn.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "asoftmax", 8 | "asoftmax_m": 1, 9 | "asoftmax_lambda_min": 0, 10 | "asoftmax_lambda_base": 0, 11 | "asoftmax_lambda_gamma": 1, 12 | "asoftmax_lambda_power": 1, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type":
"statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 20000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "asoftmax", 8 | "asoftmax_m": 2, 9 | "asoftmax_lambda_min": 10, 10 | "asoftmax_lambda_base": 1000, 11 | "asoftmax_lambda_gamma": 0.00001, 12 | "asoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 
38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "asoftmax", 8 | "asoftmax_m": 4, 9 | "asoftmax_lambda_min": 10, 10 | "asoftmax_lambda_base": 1000, 11 | "asoftmax_lambda_gamma": 0.00001, 12 | "asoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_asoftmax_m1_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | 
"asoftmax_m": 1, 10 | "asoftmax_lambda_min": 0, 11 | "asoftmax_lambda_base": 0, 12 | "asoftmax_lambda_gamma": 1, 13 | "asoftmax_lambda_power": 1, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_asoftmax_m1_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": false, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 1, 10 | "asoftmax_lambda_min": 0, 11 | "asoftmax_lambda_base": 0, 12 | "asoftmax_lambda_gamma": 1, 13 | "asoftmax_lambda_power": 1, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 100, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | 
"keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 2, 10 | "asoftmax_lambda_min": 10, 11 | "asoftmax_lambda_base": 1000, 12 | "asoftmax_lambda_gamma": 0.00001, 13 | "asoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 4, 10 | "asoftmax_lambda_min": 10, 11 | "asoftmax_lambda_base": 1000, 12 | "asoftmax_lambda_gamma": 0.00001, 13 | "asoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_asoftmax_m1_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 1, 10 | "asoftmax_lambda_min": 0, 11 | "asoftmax_lambda_base": 0, 12 | "asoftmax_lambda_gamma": 1, 13 | "asoftmax_lambda_power": 1, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | 
"clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.10_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_margin_softmax", 8 | "amsoftmax_m": 0.10, 9 | "amsoftmax_lambda_min": 0, 10 | "amsoftmax_lambda_base": 1000, 11 | "amsoftmax_lambda_gamma": 0.0001, 12 | "amsoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 
400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_margin_softmax", 8 | "amsoftmax_m": 0.15, 9 | "amsoftmax_lambda_min": 0, 10 | "amsoftmax_lambda_base": 1000, 11 | "amsoftmax_lambda_gamma": 0.0001, 12 | "amsoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_margin_softmax", 8 | "amsoftmax_m": 0.20, 9 | "amsoftmax_lambda_min": 0, 10 | "amsoftmax_lambda_base": 1000, 11 | "amsoftmax_lambda_gamma": 
0.0001, 12 | "amsoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_margin_softmax", 8 | "amsoftmax_m": 0.25, 9 | "amsoftmax_lambda_min": 0, 10 | "amsoftmax_lambda_base": 1000, 11 | "amsoftmax_lambda_gamma": 0.0001, 12 | "amsoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | 
"save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_margin_softmax", 8 | "amsoftmax_m": 0.30, 9 | "amsoftmax_lambda_min": 0, 10 | "amsoftmax_lambda_base": 1000, 11 | "amsoftmax_lambda_gamma": 0.0001, 12 | "amsoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 
"network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_margin_softmax", 8 | "amsoftmax_m": 0.35, 9 | "amsoftmax_lambda_min": 0, 10 | "amsoftmax_lambda_base": 1000, 11 | "amsoftmax_lambda_gamma": 0.0001, 12 | "amsoftmax_lambda_power": 5, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 2, 10 | "asoftmax_lambda_min": 10, 11 | "asoftmax_lambda_base": 1000, 12 | "asoftmax_lambda_gamma": 0.00001, 13 | "asoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | 
"weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 4, 10 | "asoftmax_lambda_min": 10, 11 | "asoftmax_lambda_base": 1000, 12 | "asoftmax_lambda_gamma": 0.00001, 13 | "asoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | 
"min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 2, 10 | "asoftmax_lambda_min": 10, 11 | "asoftmax_lambda_base": 1000, 12 | "asoftmax_lambda_gamma": 0.00008, 13 | "asoftmax_lambda_power": 2, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "asoftmax", 9 | "asoftmax_m": 4, 10 | "asoftmax_lambda_min": 10, 11 | "asoftmax_lambda_base": 1000, 12 | "asoftmax_lambda_gamma": 0.00008, 13 | 
"asoftmax_lambda_power": 2, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_angular_margin_softmax", 8 | "arcsoftmax_m": 0.10, 9 | "arcsoftmax_lambda_min": 0, 10 | "arcsoftmax_lambda_base": 1000, 11 | "arcsoftmax_lambda_gamma": 0.00001, 12 | "arcsoftmax_lambda_power": 6, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | 
"save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_angular_margin_softmax", 8 | "arcsoftmax_m": 0.15, 9 | "arcsoftmax_lambda_min": 0, 10 | "arcsoftmax_lambda_base": 1000, 11 | "arcsoftmax_lambda_gamma": 0.00001, 12 | "arcsoftmax_lambda_power": 6, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 
| "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_angular_margin_softmax", 8 | "arcsoftmax_m": 0.20, 9 | "arcsoftmax_lambda_min": 0, 10 | "arcsoftmax_lambda_base": 1000, 11 | "arcsoftmax_lambda_gamma": 0.00001, 12 | "arcsoftmax_lambda_power": 6, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_angular_margin_softmax", 8 | "arcsoftmax_m": 0.25, 9 | "arcsoftmax_lambda_min": 0, 10 | "arcsoftmax_lambda_base": 1000, 11 | "arcsoftmax_lambda_gamma": 0.00001, 12 | "arcsoftmax_lambda_power": 6, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | 
"clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_angular_margin_softmax", 8 | "arcsoftmax_m": 0.30, 9 | "arcsoftmax_lambda_min": 0, 10 | "arcsoftmax_lambda_base": 1000, 11 | "arcsoftmax_lambda_gamma": 0.00001, 12 | "arcsoftmax_lambda_power": 6, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | 
"early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": true, 5 | "feature_norm": false, 6 | 7 | "loss_func": "additive_angular_margin_softmax", 8 | "arcsoftmax_m": 0.35, 9 | "arcsoftmax_lambda_min": 0, 10 | "arcsoftmax_lambda_base": 1000, 11 | "arcsoftmax_lambda_gamma": 0.00001, 12 | "arcsoftmax_lambda_power": 6, 13 | 14 | "batch_type": "softmax", 15 | "pooling_type": "statistics_pooling", 16 | "embedding_node": "tdnn6_dense", 17 | 18 | "learning_rate": 0.001, 19 | "optimizer": "momentum", 20 | "momentum": 0.9, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 200, 28 | "num_steps_per_epoch": 60000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 200, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 30000, 33 | "save_checkpoints_steps": 60000, 34 | "valid_max_iterations": 5000, 35 | 36 | "num_parallel_datasets": 8, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.15, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 
1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | 
"save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.25, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.30, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.35, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | 
"learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.15, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | 
"num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | 
"loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.25, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.30, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | 
"num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.35, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } 
-------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.35, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 0, 12 | "amsoftmax_lambda_gamma": 1, 13 | "amsoftmax_lambda_power": 1, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.15, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 
0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.20, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | 
"save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.25, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.30, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.35, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": 
"tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.40, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | 
"num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.10_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.10, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | 
"feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.15, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | 
"batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.25, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 
1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.30, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.45_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.45, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 
0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.15, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | 
"save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.20, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.25, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.30, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | 
"embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.35, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | 
"max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.40, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00001, 13 | "arcsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.01, 20 | "use_nesterov": false, 21 | "clip_gradient": false, 22 | "clip_gradient_norm": 3, 23 | 24 | "weight_l2_regularizer": 1e-2, 25 | "batchnorm_momentum": 0.99, 26 | 27 | "num_epochs": 100, 28 | "num_steps_per_epoch": 30000, 29 | "reduce_lr_epochs": 4, 30 | "show_training_progress": 100, 31 | "keep_checkpoint_max": 100, 32 | "save_summary_steps": 10000, 33 | "save_checkpoints_steps": 30000, 34 | "valid_max_iterations": 1000, 35 | 36 | "num_parallel_datasets": 16, 37 | "max_queue_size": 10, 38 | "num_speakers_per_batch": 64, 39 | "num_segments_per_speaker": 1, 40 | "min_segment_len": 200, 41 | "max_segment_len": 400, 42 | 43 | "early_stop_epochs": 10, 44 | "min_learning_rate": 1e-6 45 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | 
"last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.10, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00002, 13 | "arcsoftmax_lambda_power": 8, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.15, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00002, 13 | "arcsoftmax_lambda_power": 8, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": 
false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.20, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00002, 13 | "arcsoftmax_lambda_power": 8, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 
300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.25, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00002, 13 | "arcsoftmax_lambda_power": 8, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.30, 10 | 
"arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00002, 13 | "arcsoftmax_lambda_power": 8, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.35, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00002, 13 | "arcsoftmax_lambda_power": 8, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 
30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_angular_margin_softmax", 9 | "arcsoftmax_m": 0.40, 10 | "arcsoftmax_lambda_min": 0, 11 | "arcsoftmax_lambda_base": 1000, 12 | "arcsoftmax_lambda_gamma": 0.00002, 13 | "arcsoftmax_lambda_power": 8, 14 | 15 | "batch_type": "softmax", 16 | "pooling_type": "statistics_pooling", 17 | "embedding_node": "tdnn6_dense", 18 | 19 | "learning_rate": 0.005, 20 | "optimizer": "momentum", 21 | "momentum": 0.9, 22 | "use_nesterov": false, 23 | "clip_gradient": false, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 200, 29 | "num_steps_per_epoch": 16000, 30 | "reduce_lr_epochs": 3, 31 | "show_training_progress": 200, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 8000, 34 | "save_checkpoints_steps": 16000, 35 | "valid_max_iterations": 5000, 36 | 37 | "num_parallel_datasets": 4, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 100, 42 | "max_segment_len": 300, 43 | 44 | "early_stop_epochs": 8, 45 | "min_learning_rate": 1e-6 46 | } 
-------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": true, 7 | "feature_scaling_factor": 30, 8 | 9 | "loss_func": "additive_margin_softmax", 10 | "amsoftmax_m": 0.20, 11 | "amsoftmax_lambda_min": 0, 12 | "amsoftmax_lambda_base": 1000, 13 | "amsoftmax_lambda_gamma": 0.0001, 14 | "amsoftmax_lambda_power": 5, 15 | 16 | "batch_type": "softmax", 17 | "pooling_type": "statistics_pooling", 18 | "embedding_node": "tdnn6_dense", 19 | 20 | "learning_rate": 0.01, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | "clip_gradient_norm": 3, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 100, 29 | "num_steps_per_epoch": 30000, 30 | "reduce_lr_epochs": 4, 31 | "show_training_progress": 100, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 10000, 34 | "save_checkpoints_steps": 30000, 35 | "valid_max_iterations": 1000, 36 | 37 | "num_parallel_datasets": 16, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 200, 42 | "max_segment_len": 400, 43 | 44 | "early_stop_epochs": 10, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": true, 7 | "feature_scaling_factor": 30, 8 | 9 | "loss_func": "additive_margin_softmax", 10 | "amsoftmax_m": 0.20, 11 | "amsoftmax_lambda_min": 0, 12 | 
"amsoftmax_lambda_base": 1000, 13 | "amsoftmax_lambda_gamma": 0.0001, 14 | "amsoftmax_lambda_power": 5, 15 | 16 | "batch_type": "softmax", 17 | "pooling_type": "statistics_pooling", 18 | "embedding_node": "tdnn6_dense", 19 | 20 | "learning_rate": 0.01, 21 | "use_nesterov": false, 22 | "clip_gradient": false, 23 | "clip_gradient_norm": 3, 24 | 25 | "weight_l2_regularizer": 1e-2, 26 | "batchnorm_momentum": 0.99, 27 | 28 | "num_epochs": 100, 29 | "num_steps_per_epoch": 30000, 30 | "reduce_lr_epochs": 4, 31 | "show_training_progress": 100, 32 | "keep_checkpoint_max": 100, 33 | "save_summary_steps": 10000, 34 | "save_checkpoints_steps": 30000, 35 | "valid_max_iterations": 1000, 36 | 37 | "num_parallel_datasets": 16, 38 | "max_queue_size": 10, 39 | "num_speakers_per_batch": 64, 40 | "num_segments_per_speaker": 1, 41 | "min_segment_len": 200, 42 | "max_segment_len": 400, 43 | 44 | "early_stop_epochs": 10, 45 | "min_learning_rate": 1e-6 46 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | 8 | "pooling_type": "self_attention", 9 | "att_key_input": "tdnn4_relu", 10 | "att_key_num_nodes": [1500, 1500], 11 | "att_key_network_type": 3, 12 | "att_value_input": "tdnn5_relu", 13 | "att_value_num_nodes": [], 14 | "att_value_network_type": 0, 15 | "att_apply_nonlinear": false, 16 | "att_use_scale": true, 17 | "att_num_heads": 1, 18 | "att_split_key": false, 19 | "att_penalty_term": 0, 20 | 21 | "learning_rate": 0.005, 22 | "optimizer": "momentum", 23 | "momentum": 0.9, 24 | "use_nesterov": false, 25 | "clip_gradient": false, 26 | 27 | "weight_l2_regularizer": 1e-2, 28 | "batchnorm_momentum": 0.99, 29 | 30 | "num_epochs": 200, 31 | "num_steps_per_epoch": 16000, 32 | 
"reduce_lr_epochs": 3, 33 | "show_training_progress": 200, 34 | "keep_checkpoint_max": 100, 35 | "save_summary_steps": 8000, 36 | "save_checkpoints_steps": 16000, 37 | "valid_max_iterations": 5000, 38 | 39 | "num_parallel_datasets": 4, 40 | "max_queue_size": 10, 41 | "num_speakers_per_batch": 64, 42 | "num_segments_per_speaker": 1, 43 | "min_segment_len": 100, 44 | "max_segment_len": 300, 45 | 46 | "early_stop_epochs": 8, 47 | "min_learning_rate": 1e-6 48 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | 8 | "pooling_type": "self_attention", 9 | "att_key_input": "tdnn4_relu", 10 | "att_key_num_nodes": [1500, 1500], 11 | "att_key_network_type": 1, 12 | "att_value_input": "tdnn5_relu", 13 | "att_value_num_nodes": [], 14 | "att_value_network_type": 0, 15 | "att_apply_nonlinear": false, 16 | "att_use_scale": true, 17 | "att_num_heads": 1, 18 | "att_split_key": false, 19 | "att_penalty_term": 0, 20 | 21 | "learning_rate": 0.005, 22 | "optimizer": "momentum", 23 | "momentum": 0.9, 24 | "use_nesterov": false, 25 | "clip_gradient": false, 26 | 27 | "weight_l2_regularizer": 1e-2, 28 | "batchnorm_momentum": 0.99, 29 | 30 | "num_epochs": 200, 31 | "num_steps_per_epoch": 16000, 32 | "reduce_lr_epochs": 3, 33 | "show_training_progress": 200, 34 | "keep_checkpoint_max": 100, 35 | "save_summary_steps": 8000, 36 | "save_checkpoints_steps": 16000, 37 | "valid_max_iterations": 5000, 38 | 39 | "num_parallel_datasets": 4, 40 | "max_queue_size": 10, 41 | "num_speakers_per_batch": 64, 42 | "num_segments_per_speaker": 1, 43 | "min_segment_len": 100, 44 | "max_segment_len": 300, 45 | 46 | "early_stop_epochs": 8, 47 | "min_learning_rate": 1e-6 48 | } 
-------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | 8 | "pooling_type": "self_attention", 9 | "att_key_input": "tdnn4_relu", 10 | "att_key_num_nodes": [1500, 1500], 11 | "att_key_network_type": 2, 12 | "att_value_input": "tdnn5_relu", 13 | "att_value_num_nodes": [], 14 | "att_value_network_type": 0, 15 | "att_apply_nonlinear": false, 16 | "att_use_scale": true, 17 | "att_num_heads": 1, 18 | "att_split_key": false, 19 | "att_penalty_term": 0, 20 | 21 | "learning_rate": 0.005, 22 | "optimizer": "momentum", 23 | "momentum": 0.9, 24 | "use_nesterov": false, 25 | "clip_gradient": false, 26 | 27 | "weight_l2_regularizer": 1e-2, 28 | "batchnorm_momentum": 0.99, 29 | 30 | "num_epochs": 200, 31 | "num_steps_per_epoch": 16000, 32 | "reduce_lr_epochs": 3, 33 | "show_training_progress": 200, 34 | "keep_checkpoint_max": 100, 35 | "save_summary_steps": 8000, 36 | "save_checkpoints_steps": 16000, 37 | "valid_max_iterations": 5000, 38 | 39 | "num_parallel_datasets": 4, 40 | "max_queue_size": 10, 41 | "num_speakers_per_batch": 64, 42 | "num_segments_per_speaker": 1, 43 | "min_segment_len": 100, 44 | "max_segment_len": 300, 45 | 46 | "early_stop_epochs": 8, 47 | "min_learning_rate": 1e-6 48 | } -------------------------------------------------------------------------------- /egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att_4.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_linear": false, 5 | "loss_func": "softmax", 6 | "batch_type": "softmax", 7 | 8 | "pooling_type": "self_attention", 9 | "att_key_input": "tdnn4_relu", 10 | "att_key_num_nodes": [1500, 1500], 11 | 
"att_key_network_type": 0, 12 | "att_value_input": "tdnn5_relu", 13 | "att_value_num_nodes": [], 14 | "att_value_network_type": 0, 15 | "att_apply_nonlinear": false, 16 | "att_use_scale": true, 17 | "att_num_heads": 1, 18 | "att_split_key": false, 19 | "att_penalty_term": 0, 20 | 21 | "learning_rate": 0.005, 22 | "optimizer": "momentum", 23 | "momentum": 0.9, 24 | "use_nesterov": false, 25 | "clip_gradient": false, 26 | 27 | "weight_l2_regularizer": 1e-2, 28 | "batchnorm_momentum": 0.99, 29 | 30 | "num_epochs": 200, 31 | "num_steps_per_epoch": 16000, 32 | "reduce_lr_epochs": 3, 33 | "show_training_progress": 200, 34 | "keep_checkpoint_max": 100, 35 | "save_summary_steps": 8000, 36 | "save_checkpoints_steps": 16000, 37 | "valid_max_iterations": 5000, 38 | 39 | "num_parallel_datasets": 4, 40 | "max_queue_size": 10, 41 | "num_speakers_per_batch": 64, 42 | "num_segments_per_speaker": 1, 43 | "min_segment_len": 100, 44 | "max_segment_len": 300, 45 | 46 | "early_stop_epochs": 8, 47 | "min_learning_rate": 1e-6 48 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "aux_loss_func": ["ring_loss"], 16 | "ring_loss_init": 20, 17 | "ring_loss_lambda": 0.01, 18 | 19 | "batch_type": "softmax", 20 | "pooling_type": "statistics_pooling", 21 | "embedding_node": "tdnn6_dense", 22 | 23 | "learning_rate": 0.01, 24 | "use_nesterov": false, 25 | "clip_gradient": false, 26 | "clip_gradient_norm": 3, 27 | 28 | "weight_l2_regularizer": 1e-2, 29 | 
"batchnorm_momentum": 0.99, 30 | 31 | "num_epochs": 100, 32 | "num_steps_per_epoch": 30000, 33 | "reduce_lr_epochs": 4, 34 | "show_training_progress": 100, 35 | "keep_checkpoint_max": 100, 36 | "save_summary_steps": 10000, 37 | "save_checkpoints_steps": 30000, 38 | "valid_max_iterations": 1000, 39 | 40 | "num_parallel_datasets": 8, 41 | "max_queue_size": 10, 42 | "num_speakers_per_batch": 64, 43 | "num_segments_per_speaker": 1, 44 | "min_segment_len": 200, 45 | "max_segment_len": 400, 46 | 47 | "early_stop_epochs": 10, 48 | "min_learning_rate": 1e-6 49 | } -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "aux_loss_func": ["ring_loss"], 16 | "ring_loss_init": 20, 17 | "ring_loss_lambda": 0.01, 18 | 19 | "batch_type": "softmax", 20 | "pooling_type": "statistics_pooling", 21 | "embedding_node": "tdnn6_dense", 22 | 23 | "learning_rate": 0.01, 24 | "use_nesterov": false, 25 | "clip_gradient": false, 26 | "clip_gradient_norm": 3, 27 | 28 | "weight_l2_regularizer": 1e-2, 29 | "batchnorm_momentum": 0.99, 30 | 31 | "num_epochs": 100, 32 | "num_steps_per_epoch": 30000, 33 | "reduce_lr_epochs": 4, 34 | "show_training_progress": 100, 35 | "keep_checkpoint_max": 100, 36 | "save_summary_steps": 10000, 37 | "save_checkpoints_steps": 30000, 38 | "valid_max_iterations": 1000, 39 | 40 | "num_parallel_datasets": 8, 41 | "max_queue_size": 10, 42 | "num_speakers_per_batch": 64, 43 | "num_segments_per_speaker": 1, 44 | 
"min_segment_len": 200, 45 | "max_segment_len": 400, 46 | 47 | "early_stop_epochs": 10, 48 | "min_learning_rate": 1e-6 49 | } -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "aux_loss_func": ["mhe_loss"], 16 | "mhe_lambda": 0.01, 17 | 18 | "noupdate_var_list": [], 19 | "noload_var_list": [], 20 | 21 | "batch_type": "softmax", 22 | "pooling_type": "statistics_pooling", 23 | "embedding_node": "tdnn6_dense", 24 | 25 | "learning_rate": 0.01, 26 | "use_nesterov": false, 27 | "clip_gradient": false, 28 | "clip_gradient_norm": 3, 29 | 30 | "weight_l2_regularizer": 1e-2, 31 | "batchnorm_momentum": 0.99, 32 | 33 | "num_epochs": 100, 34 | "num_steps_per_epoch": 30000, 35 | "reduce_lr_epochs": 4, 36 | "show_training_progress": 100, 37 | "keep_checkpoint_max": 100, 38 | "save_summary_steps": 10000, 39 | "save_checkpoints_steps": 30000, 40 | "valid_max_iterations": 1000, 41 | 42 | "num_parallel_datasets": 8, 43 | "max_queue_size": 10, 44 | "num_speakers_per_batch": 64, 45 | "num_segments_per_speaker": 1, 46 | "min_segment_len": 200, 47 | "max_segment_len": 400, 48 | 49 | "early_stop_epochs": 10, 50 | "min_learning_rate": 1e-6 51 | } -------------------------------------------------------------------------------- /misc/tools/sample_validset_spk2utt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | 4 | if __name__ == "__main__": 5 | if len(sys.argv) != 4: 6 | 
print('usage: %s num_heldout_spk num_heldout_utts_per_spk input_spk2utt' % sys.argv[0]) 7 | quit() 8 | 9 | num_spks = int(sys.argv[1]) 10 | num_utts_per_spk = int(sys.argv[2]) 11 | 12 | satisfy_spks = [] 13 | not_satisfy_spks = [] 14 | with open(sys.argv[3], 'r') as f: 15 | for line in f.readlines(): 16 | spk, utts = line.strip().split(' ', 1) 17 | utts = utts.split(' ') 18 | if len(utts) >= num_utts_per_spk + 2: 19 | satisfy_spks.append([spk, utts]) 20 | else: 21 | not_satisfy_spks.append([spk, utts]) 22 | 23 | if len(satisfy_spks) < num_spks: 24 | satisfy_spks += random.sample(not_satisfy_spks, num_spks - len(satisfy_spks)) 25 | 26 | sampled_spks = random.sample(satisfy_spks, num_spks) 27 | for spk in sampled_spks: 28 | sys.stdout.write('%s' % spk[0]) 29 | 30 | # We should ensure at lease one utterance of each speaker is left in the training set. 31 | if len(spk[1]) > num_utts_per_spk: 32 | spk[1] = random.sample(spk[1], num_utts_per_spk) 33 | else: 34 | spk[1] = random.sample(spk[1], len(spk[1]) - 1) 35 | 36 | for utt in spk[1]: 37 | sys.stdout.write(' %s' % utt) 38 | sys.stdout.write('\n') 39 | -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "aux_loss_func": ["mhe_loss"], 16 | "mhe_lambda": 0.01, 17 | 18 | "noupdate_var_list": [], 19 | "noload_var_list": [], 20 | 21 | "batch_type": "softmax", 22 | "pooling_type": "statistics_pooling", 23 | "embedding_node": "tdnn6_dense", 24 | 25 | "learning_rate": 0.01, 26 
| "use_nesterov": false, 27 | "clip_gradient": false, 28 | "clip_gradient_norm": 3, 29 | 30 | "weight_l2_regularizer": 1e-2, 31 | "batchnorm_momentum": 0.99, 32 | 33 | "num_epochs": 100, 34 | "num_steps_per_epoch": 30000, 35 | "reduce_lr_epochs": 4, 36 | "show_training_progress": 100, 37 | "keep_checkpoint_max": 100, 38 | "save_summary_steps": 10000, 39 | "save_checkpoints_steps": 30000, 40 | "valid_max_iterations": 1000, 41 | 42 | "num_parallel_datasets": 8, 43 | "max_queue_size": 10, 44 | "num_speakers_per_batch": 64, 45 | "num_segments_per_speaker": 1, 46 | "min_segment_len": 200, 47 | "max_segment_len": 400, 48 | 49 | "early_stop_epochs": 10, 50 | "min_learning_rate": 1e-6 51 | } -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.005, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment", 16 | "context_size": 7, 17 | "num_shared_layers": 0, 18 | "pooling_type": "statistics_pooling", 19 | 20 | "spk_loss_weight": 1.0, 21 | "speaker_dim": 512, 22 | "spk_last_layer_no_bn": false, 23 | "spk_last_layer_linear": false, 24 | "spk_loss_type": "softmax", 25 | 26 | "phn_loss_weight": 0.0, 27 | "phone_dim": 512, 28 | "phn_loss_type": "softmax", 29 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. 
If -1, use all.", 30 | "num_frames_per_utt": 4, 31 | 32 | "spk_embedding_node": "zs_mu_relu", 33 | "phn_embedding_node": "zp_mu_relu", 34 | 35 | "num_parallel_datasets": 8, 36 | "max_queue_size": 10, 37 | "num_speakers_per_batch": 64, 38 | "num_segments_per_speaker": 1, 39 | "min_segment_len": 100, 40 | "max_segment_len": 300, 41 | 42 | "num_epochs": 200, 43 | "num_steps_per_epoch": 7000, 44 | "show_training_progress": 200, 45 | "keep_checkpoint_max": 100, 46 | "save_summary_steps": 3500, 47 | "save_checkpoints_steps": 7000, 48 | "valid_max_iterations": 1000, 49 | 50 | "reduce_lr_epochs": 3, 51 | "early_stop_epochs": 8, 52 | "min_learning_rate": 1e-6 53 | } 54 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.005, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment", 16 | "context_size": 7, 17 | "num_shared_layers": 0, 18 | "pooling_type": "statistics_pooling", 19 | 20 | "spk_loss_weight": 1.0, 21 | "speaker_dim": 512, 22 | "spk_last_layer_no_bn": false, 23 | "spk_last_layer_linear": false, 24 | "spk_loss_type": "softmax", 25 | 26 | "phn_loss_weight": 1.0, 27 | "phone_dim": 512, 28 | "phn_loss_type": "softmax", 29 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. 
If -1, use all.", 30 | "num_frames_per_utt": 4, 31 | 32 | "spk_embedding_node": "zs_mu_relu", 33 | "phn_embedding_node": "zp_mu_relu", 34 | 35 | "num_parallel_datasets": 8, 36 | "max_queue_size": 10, 37 | "num_speakers_per_batch": 64, 38 | "num_segments_per_speaker": 1, 39 | "min_segment_len": 100, 40 | "max_segment_len": 300, 41 | 42 | "num_epochs": 200, 43 | "num_steps_per_epoch": 7000, 44 | "show_training_progress": 200, 45 | "keep_checkpoint_max": 100, 46 | "save_summary_steps": 3500, 47 | "save_checkpoints_steps": 7000, 48 | "valid_max_iterations": 1000, 49 | 50 | "reduce_lr_epochs": 3, 51 | "early_stop_epochs": 8, 52 | "min_learning_rate": 1e-6 53 | } 54 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.001, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment", 16 | "context_size": 7, 17 | "num_shared_layers": 0, 18 | "pooling_type": "statistics_pooling", 19 | 20 | "spk_loss_weight": 0, 21 | "speaker_dim": 512, 22 | "spk_last_layer_no_bn": false, 23 | "spk_last_layer_linear": false, 24 | "spk_loss_type": "softmax", 25 | 26 | "phn_loss_weight": 1.0, 27 | "phone_dim": 512, 28 | "phn_loss_type": "softmax", 29 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. 
If -1, use all.", 30 | "num_frames_per_utt": 4, 31 | 32 | "spk_embedding_node": "zs_mu_relu", 33 | "phn_embedding_node": "zp_mu_relu", 34 | 35 | "num_parallel_datasets": 8, 36 | "max_queue_size": 10, 37 | "num_speakers_per_batch": 64, 38 | "num_segments_per_speaker": 1, 39 | "min_segment_len": 100, 40 | "max_segment_len": 300, 41 | 42 | "num_epochs": 200, 43 | "num_steps_per_epoch": 7000, 44 | "show_training_progress": 200, 45 | "keep_checkpoint_max": 100, 46 | "save_summary_steps": 3500, 47 | "save_checkpoints_steps": 7000, 48 | "valid_max_iterations": 1000, 49 | 50 | "reduce_lr_epochs": 3, 51 | "early_stop_epochs": 8, 52 | "min_learning_rate": 1e-6 53 | } 54 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_4.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.001, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-4, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment", 16 | "context_size": 7, 17 | "num_shared_layers": 0, 18 | "pooling_type": "statistics_pooling", 19 | 20 | "spk_loss_weight": 0, 21 | "speaker_dim": 512, 22 | "spk_last_layer_no_bn": false, 23 | "spk_last_layer_linear": false, 24 | "spk_loss_type": "softmax", 25 | 26 | "phn_loss_weight": 1.0, 27 | "phone_dim": 512, 28 | "phn_loss_type": "softmax", 29 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. 
If -1, use all.", 30 | "num_frames_per_utt": 4, 31 | 32 | "spk_embedding_node": "zs_mu_relu", 33 | "phn_embedding_node": "zp_mu_relu", 34 | 35 | "num_parallel_datasets": 8, 36 | "max_queue_size": 10, 37 | "num_speakers_per_batch": 64, 38 | "num_segments_per_speaker": 1, 39 | "min_segment_len": 100, 40 | "max_segment_len": 300, 41 | 42 | "num_epochs": 200, 43 | "num_steps_per_epoch": 7000, 44 | "show_training_progress": 200, 45 | "keep_checkpoint_max": 100, 46 | "save_summary_steps": 3500, 47 | "save_checkpoints_steps": 7000, 48 | "valid_max_iterations": 1000, 49 | 50 | "reduce_lr_epochs": 3, 51 | "early_stop_epochs": 8, 52 | "min_learning_rate": 1e-6 53 | } 54 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.001, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-4, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment", 16 | "context_size": 11, 17 | "num_shared_layers": 0, 18 | "pooling_type": "statistics_pooling", 19 | 20 | "spk_loss_weight": 0, 21 | "speaker_dim": 512, 22 | "spk_last_layer_no_bn": false, 23 | "spk_last_layer_linear": false, 24 | "spk_loss_type": "softmax", 25 | 26 | "phn_loss_weight": 1.0, 27 | "phone_dim": 512, 28 | "phn_loss_type": "softmax", 29 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. 
If -1, use all.", 30 | "num_frames_per_utt": 4, 31 | 32 | "spk_embedding_node": "zs_mu_relu", 33 | "phn_embedding_node": "zp_mu_relu", 34 | 35 | "num_parallel_datasets": 8, 36 | "max_queue_size": 10, 37 | "num_speakers_per_batch": 64, 38 | "num_segments_per_speaker": 1, 39 | "min_segment_len": 100, 40 | "max_segment_len": 300, 41 | 42 | "num_epochs": 100, 43 | "num_steps_per_epoch": 7000, 44 | "show_training_progress": 200, 45 | "keep_checkpoint_max": 100, 46 | "save_summary_steps": 3500, 47 | "save_checkpoints_steps": 7000, 48 | "valid_max_iterations": 1000, 49 | 50 | "reduce_lr_epochs": 1, 51 | "early_stop_epochs": 8, 52 | "min_learning_rate": 1e-6 53 | } 54 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_tdnn4_att.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | "network_type": "tdnn", 4 | "last_layer_no_bn": false, 5 | "last_layer_linear": true, 6 | "feature_norm": false, 7 | 8 | "loss_func": "additive_margin_softmax", 9 | "amsoftmax_m": 0.20, 10 | "amsoftmax_lambda_min": 0, 11 | "amsoftmax_lambda_base": 1000, 12 | "amsoftmax_lambda_gamma": 0.0001, 13 | "amsoftmax_lambda_power": 5, 14 | 15 | "batch_type": "softmax", 16 | 17 | "pooling_type": "self_attention", 18 | "att_key_input": "tdnn4_relu", 19 | "att_key_num_nodes": [1500, 1500], 20 | "att_key_network_type": 3, 21 | "att_value_input": "tdnn5_relu", 22 | "att_value_num_nodes": [], 23 | "att_value_network_type": 0, 24 | "att_apply_nonlinear": false, 25 | "att_use_scale": true, 26 | "att_num_heads": 1, 27 | "att_split_key": false, 28 | "att_penalty_term": 0, 29 | 30 | "learning_rate": 0.01, 31 | "use_nesterov": false, 32 | "clip_gradient": false, 33 | "clip_gradient_norm": 3, 34 | 35 | "weight_l2_regularizer": 1e-2, 36 | "batchnorm_momentum": 0.99, 37 | 38 | "num_epochs": 100, 39 | "num_steps_per_epoch": 30000, 40 | "reduce_lr_epochs": 4, 41 | 
"show_training_progress": 100, 42 | "keep_checkpoint_max": 100, 43 | "save_summary_steps": 10000, 44 | "save_checkpoints_steps": 30000, 45 | "valid_max_iterations": 1000, 46 | 47 | "num_parallel_datasets": 16, 48 | "max_queue_size": 10, 49 | "num_speakers_per_batch": 64, 50 | "num_segments_per_speaker": 1, 51 | "min_segment_len": 200, 52 | "max_segment_len": 400, 53 | 54 | "early_stop_epochs": 10, 55 | "min_learning_rate": 1e-6 56 | } -------------------------------------------------------------------------------- /misc/DETware_v2.1/Set_DET_limits.m: -------------------------------------------------------------------------------- 1 | function Set_DET_limits(Pmiss_min, Pmiss_max, Pfa_min, Pfa_max) 2 | % function Set_DET_limits(Pmiss_min, Pmiss_max, Pfa_min, Pfa_max) 3 | % 4 | % Set_DET_limits initializes the min/max plotting limits for P_miss and P_fa. 5 | % 6 | % See DET_usage for an example of how to use Set_DET_limits. 7 | 8 | Pmiss_min_default = 0.0005+eps; 9 | Pmiss_max_default = 0.5-eps; 10 | Pfa_min_default = 0.0005+eps; 11 | Pfa_max_default = 0.5-eps; 12 | 13 | global DET_limits; 14 | 15 | %------------------------- 16 | % If value not supplied as argument, then use previous value 17 | % or use default value if DET_limits hasn't been initialized. 
18 | 19 | if (~isempty(DET_limits)) 20 | Pmiss_min_default = DET_limits(1); 21 | Pmiss_max_default = DET_limits(2); 22 | Pfa_min_default = DET_limits(3); 23 | Pfa_max_default = DET_limits(4); 24 | end 25 | 26 | if ~(exist('Pmiss_min')); Pmiss_min = Pmiss_min_default; end; 27 | if ~(exist('Pmiss_max')); Pmiss_max = Pmiss_max_default; end; 28 | if ~(exist('Pfa_min')); Pfa_min = Pfa_min_default; end; 29 | if ~(exist('Pfa_max')); Pfa_max = Pfa_max_default; end; 30 | 31 | %------------------------- 32 | % Limit bounds to reasonable values 33 | 34 | Pmiss_min = max(Pmiss_min,eps); 35 | Pmiss_max = min(Pmiss_max,1-eps); 36 | if Pmiss_max <= Pmiss_min 37 | Pmiss_min = eps; 38 | Pmiss_max = 1-eps; 39 | end 40 | 41 | Pfa_min = max(Pfa_min,eps); 42 | Pfa_max = min(Pfa_max,1-eps); 43 | if Pfa_max <= Pfa_min 44 | Pfa_min = eps; 45 | Pfa_max = 1-eps; 46 | end 47 | 48 | %-------------------------- 49 | % Load DET_limits with bounds to use 50 | 51 | DET_limits = [Pmiss_min Pmiss_max Pfa_min Pfa_max]; 52 | 53 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. 
Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | #a) Old options 14 | #export cuda_cmd="queue.pl -l qp=cuda-low -l osrel='*' -l gpuclass='*'" 15 | #export decode_cmd="queue.pl -l qp=low -l osrel='*' -l osrel='*'" 16 | #export mkgraph_cmd="queue.pl -l qp=low -l osrel='*'" 17 | 18 | #b) THU Tianjin Cluster 19 | queue_conf=$PWD/slurm_conf/slurm.conf 20 | #export train_cmd="slurm.pl --config $queue_conf" 21 | export train_cmd="run.pl" 22 | export cuda_cmd="run.pl" 23 | 24 | if [[ "$(hostname -f)" == *.fit.vutbr.cz ]]; then # pattern is left unquoted so [[ ]] glob-matches the FQDN; a quoted pattern is compared literally and never matches 25 | queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, 26 | export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" 27 | export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" 28 | export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" 29 | fi -------------------------------------------------------------------------------- /egs/voxceleb/v2_unfinished/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. 
Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | #a) Old options 14 | #export cuda_cmd="queue.pl -l qp=cuda-low -l osrel='*' -l gpuclass='*'" 15 | #export decode_cmd="queue.pl -l qp=low -l osrel='*' -l osrel='*'" 16 | #export mkgraph_cmd="queue.pl -l qp=low -l osrel='*'" 17 | 18 | #b) THU Tianjin Cluster 19 | queue_conf=$PWD/slurm_conf/slurm.conf 20 | #export train_cmd="slurm.pl --config $queue_conf" 21 | export train_cmd="run.pl" 22 | export cuda_cmd="run.pl" 23 | 24 | if [[ "$(hostname -f)" == *.fit.vutbr.cz ]]; then # pattern is left unquoted so [[ ]] glob-matches the FQDN; a quoted pattern is compared literally and never matches 25 | queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, 26 | export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" 27 | export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" 28 | export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" 29 | fi -------------------------------------------------------------------------------- /egs/fisher/v1/eval_plda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nnetdir=$1 4 | 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' | compute-eer - 2>/dev/null) 6 | echo "EER: ${eer}%" 7 | paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test.new 8 | grep ' target$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.target 9 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.nontarget 10 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test.target', '$nnetdir/xvector_scores_hires/test.nontarget', '$nnetdir/xvector_scores_hires/test_lda_plda.result')"` 11 | echo "$comm"| matlab -nodesktop 
-noFigureWindows > /dev/null 12 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_plda.result 13 | 14 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null) 15 | echo "EER: ${eer}%" 16 | paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_lda_cos.new 17 | grep ' target$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.target 18 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.nontarget 19 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_lda_cos.target', '$nnetdir/xvector_scores_hires/test_lda_cos.nontarget', '$nnetdir/xvector_scores_hires/test_lda_cos.result')"` 20 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null 21 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_cos.result 22 | -------------------------------------------------------------------------------- /egs/fisher/v3/eval_plda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nnetdir=$1 4 | 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' | compute-eer - 2>/dev/null) 6 | echo "EER: ${eer}%" 7 | paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test.new 8 | grep ' target$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.target 9 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.nontarget 10 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test.target', '$nnetdir/xvector_scores_hires/test.nontarget', '$nnetdir/xvector_scores_hires/test_lda_plda.result')"` 11 | echo "$comm"| matlab 
-nodesktop -noFigureWindows > /dev/null 12 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_plda.result 13 | 14 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null) 15 | echo "EER: ${eer}%" 16 | paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_lda_cos.new 17 | grep ' target$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.target 18 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.nontarget 19 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_lda_cos.target', '$nnetdir/xvector_scores_hires/test_lda_cos.nontarget', '$nnetdir/xvector_scores_hires/test_lda_cos.result')"` 20 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null 21 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_cos.result 22 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_6.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.005, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "phone_layer_size": [512, 512, 512, 512, 512], 16 | "phone_kernel_size": [5, 5, 7, 1, 3], 17 | "phone_dilation_size": [1, 1, 1, 1, 4], 18 | 19 | "?left_and_right_context": "The context is used in feature expansion", 20 | "speaker_left_context": 7, 21 | "speaker_right_context": 7, 22 | "phone_left_context": 11, 23 | "phone_right_context": 11, 24 | "num_shared_layers": 0, 25 | 26 | "pooling_type": "statistics_pooling", 27 | "spk_loss_weight": 0, 28 | "speaker_dim": 512, 29 | 
"spk_last_layer_no_bn": false, 30 | "spk_last_layer_linear": false, 31 | "spk_loss_type": "softmax", 32 | 33 | "phn_loss_weight": 1.0, 34 | "phone_dim": 512, 35 | "phn_loss_type": "softmax", 36 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.", 37 | "num_frames_per_utt": 4, 38 | 39 | "spk_embedding_node": "zs_mu_relu", 40 | "phn_embedding_node": "zp_mu_relu", 41 | 42 | "num_parallel_datasets": 4, 43 | "max_queue_size": 10, 44 | "num_speakers_per_batch": 64, 45 | "num_segments_per_speaker": 1, 46 | "min_segment_len": 100, 47 | "max_segment_len": 300, 48 | 49 | "num_epochs": 100, 50 | "num_steps_per_epoch": 7000, 51 | "show_training_progress": 200, 52 | "keep_checkpoint_max": 100, 53 | "save_summary_steps": 3500, 54 | "save_checkpoints_steps": 7000, 55 | "valid_max_iterations": 1000, 56 | 57 | "reduce_lr_epochs": 3, 58 | "early_stop_epochs": 8, 59 | "min_learning_rate": 1e-6 60 | } 61 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_7.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.005, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "phone_layer_size": [512, 512, 512, 512, 512], 16 | "phone_kernel_size": [5, 5, 7, 1, 3], 17 | "phone_dilation_size": [1, 1, 1, 1, 4], 18 | 19 | "?left_and_right_context": "The context is used in feature expansion", 20 | "speaker_left_context": 7, 21 | "speaker_right_context": 7, 22 | "phone_left_context": 11, 23 | "phone_right_context": 11, 24 | "num_shared_layers": 0, 25 | 26 | "pooling_type": "statistics_pooling", 27 | "spk_loss_weight": 0, 28 | "speaker_dim": 512, 29 | "spk_last_layer_no_bn": false, 30 | "spk_last_layer_linear": false, 31 | 
"spk_loss_type": "softmax", 32 | 33 | "phn_loss_weight": 1.0, 34 | "phone_dim": 512, 35 | "phn_loss_type": "softmax", 36 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.", 37 | "num_frames_per_utt": 4, 38 | 39 | "spk_embedding_node": "zs_mu_relu", 40 | "phn_embedding_node": "zp_mu_relu", 41 | 42 | "num_parallel_datasets": 4, 43 | "max_queue_size": 10, 44 | "num_speakers_per_batch": 64, 45 | "num_segments_per_speaker": 1, 46 | "min_segment_len": 30, 47 | "max_segment_len": 30, 48 | 49 | "num_epochs": 100, 50 | "num_steps_per_epoch": 7000, 51 | "show_training_progress": 200, 52 | "keep_checkpoint_max": 100, 53 | "save_summary_steps": 3500, 54 | "save_checkpoints_steps": 7000, 55 | "valid_max_iterations": 1000, 56 | 57 | "reduce_lr_epochs": 3, 58 | "early_stop_epochs": 8, 59 | "min_learning_rate": 1e-6 60 | } 61 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_8.2.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.005, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "phone_layer_size": [512, 512, 512, 512, 512], 16 | "phone_kernel_size": [5, 5, 7, 1, 3], 17 | "phone_dilation_size": [1, 1, 1, 1, 1], 18 | 19 | "?left_and_right_context": "The context is used in feature expansion", 20 | "speaker_left_context": 7, 21 | "speaker_right_context": 7, 22 | "phone_left_context": 8, 23 | "phone_right_context": 8, 24 | "num_shared_layers": 0, 25 | 26 | "pooling_type": "statistics_pooling", 27 | "spk_loss_weight": 0, 28 | "speaker_dim": 512, 29 | "spk_last_layer_no_bn": false, 30 | "spk_last_layer_linear": false, 31 | "spk_loss_type": "softmax", 32 | 33 | "phn_loss_weight": 1.0, 34 | 
"phone_dim": 512, 35 | "phn_loss_type": "softmax", 36 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.", 37 | "num_frames_per_utt": 4, 38 | 39 | "spk_embedding_node": "zs_mu_relu", 40 | "phn_embedding_node": "zp_mu_relu", 41 | 42 | "num_parallel_datasets": 4, 43 | "max_queue_size": 10, 44 | "num_speakers_per_batch": 64, 45 | "num_segments_per_speaker": 1, 46 | "min_segment_len": 100, 47 | "max_segment_len": 300, 48 | 49 | "num_epochs": 100, 50 | "num_steps_per_epoch": 7000, 51 | "show_training_progress": 200, 52 | "keep_checkpoint_max": 100, 53 | "save_summary_steps": 3500, 54 | "save_checkpoints_steps": 7000, 55 | "valid_max_iterations": 1000, 56 | 57 | "reduce_lr_epochs": 3, 58 | "early_stop_epochs": 8, 59 | "min_learning_rate": 1e-6 60 | } 61 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_8.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.005, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "phone_layer_size": [512, 512, 512, 512, 512], 16 | "phone_kernel_size": [5, 5, 7, 1, 3], 17 | "phone_dilation_size": [1, 1, 1, 1, 1], 18 | 19 | "?left_and_right_context": "The context is used in feature expansion", 20 | "speaker_left_context": 7, 21 | "speaker_right_context": 7, 22 | "phone_left_context": 8, 23 | "phone_right_context": 8, 24 | "num_shared_layers": 0, 25 | 26 | "pooling_type": "statistics_pooling", 27 | "spk_loss_weight": 0, 28 | "speaker_dim": 512, 29 | "spk_last_layer_no_bn": false, 30 | "spk_last_layer_linear": false, 31 | "spk_loss_type": "softmax", 32 | 33 | "phn_loss_weight": 1.0, 34 | "phone_dim": 512, 35 | "phn_loss_type": "softmax", 36 | "?num_frames_per_utt": 
"How many frames in a segment should be used to train the phone network. If -1, use all.", 37 | "num_frames_per_utt": 1, 38 | 39 | "spk_embedding_node": "zs_mu_relu", 40 | "phn_embedding_node": "zp_mu_relu", 41 | 42 | "num_parallel_datasets": 4, 43 | "max_queue_size": 10, 44 | "num_speakers_per_batch": 64, 45 | "num_segments_per_speaker": 1, 46 | "min_segment_len": 100, 47 | "max_segment_len": 300, 48 | 49 | "num_epochs": 100, 50 | "num_steps_per_epoch": 7000, 51 | "show_training_progress": 200, 52 | "keep_checkpoint_max": 100, 53 | "save_summary_steps": 3500, 54 | "save_checkpoints_steps": 7000, 55 | "valid_max_iterations": 1000, 56 | 57 | "reduce_lr_epochs": 3, 58 | "early_stop_epochs": 8, 59 | "min_learning_rate": 1e-6 60 | } 61 | -------------------------------------------------------------------------------- /egs/fisher/v3/nnet_conf/mt_softmax_8.3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 0, 3 | 4 | "learning_rate": 0.005, 5 | "optimizer": "momentum", 6 | "momentum": 0.9, 7 | "use_nesterov": false, 8 | "clip_gradient": false, 9 | 10 | "weight_l2_regularizer": 1e-2, 11 | "batchnorm_momentum": 0.99, 12 | 13 | "batch_type": "softmax", 14 | 15 | "phone_layer_size": [512, 512, 512, 512, 512], 16 | "phone_kernel_size": [5, 5, 7, 1, 3], 17 | "phone_dilation_size": [1, 1, 1, 1, 1], 18 | 19 | "?left_and_right_context": "The context is used in feature expansion", 20 | "speaker_left_context": 7, 21 | "speaker_right_context": 7, 22 | "phone_left_context": 8, 23 | "phone_right_context": 8, 24 | "num_shared_layers": 0, 25 | 26 | "pooling_type": "statistics_pooling", 27 | "spk_loss_weight": 0, 28 | "speaker_dim": 512, 29 | "spk_last_layer_no_bn": false, 30 | "spk_last_layer_linear": false, 31 | "spk_loss_type": "softmax", 32 | 33 | "phn_loss_weight": 1.0, 34 | "phone_dim": 512, 35 | "phn_loss_type": "softmax", 36 | "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. 
If -1, use all.", 37 | "num_frames_per_utt": -1, 38 | 39 | "spk_embedding_node": "zs_mu_relu", 40 | "phn_embedding_node": "zp_mu_relu", 41 | 42 | "num_parallel_datasets": 4, 43 | "max_queue_size": 10, 44 | "num_speakers_per_batch": 64, 45 | "num_segments_per_speaker": 1, 46 | "min_segment_len": 100, 47 | "max_segment_len": 300, 48 | 49 | "num_epochs": 100, 50 | "num_steps_per_epoch": 7000, 51 | "show_training_progress": 200, 52 | "keep_checkpoint_max": 100, 53 | "save_summary_steps": 3500, 54 | "save_checkpoints_steps": 7000, 55 | "valid_max_iterations": 1000, 56 | 57 | "reduce_lr_epochs": 3, 58 | "early_stop_epochs": 8, 59 | "min_learning_rate": 1e-6 60 | } 61 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_finetune_lr_learning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd="run.pl" 4 | env=tf_gpu 5 | num_gpus=1 6 | checkpoint=-1 7 | tune_period=100 8 | 9 | echo "$0 $@" 10 | 11 | if [ -f path.sh ]; then . ./path.sh; fi 12 | . parse_options.sh || exit 1; 13 | 14 | if [ $# != 7 ]; then 15 | echo "Usage: $0 [options] " 16 | echo "Options:" 17 | echo " --tune-period <100>" 18 | echo " --checkpoint <-1>" 19 | echo " --env " 20 | echo " --num-gpus " 21 | exit 100 22 | fi 23 | 24 | config=$1 25 | train=$2 26 | train_spklist=$3 27 | valid=$4 28 | valid_spklist=$5 29 | pretrain_nnetdir=$6 30 | nnetdir=$7 31 | 32 | # add the library to the python path. 33 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 34 | 35 | mkdir -p $nnetdir/log 36 | 37 | # Get available GPUs before we can train the network. 
38 | num_total_gpus=`nvidia-smi -L | wc -l` 39 | num_gpus_assigned=0 40 | while [ $num_gpus_assigned -ne $num_gpus ]; do 41 | num_gpus_assigned=0 42 | for i in `seq 0 $[$num_total_gpus-1]`; do 43 | # going over all GPUs and check if it is idle, and add to the list if yes 44 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 45 | num_gpus_assigned=$[$num_gpus_assigned+1] 46 | fi 47 | # once we have enough GPUs, break out of the loop 48 | [ $num_gpus_assigned -eq $num_gpus ] && break 49 | done 50 | [ $num_gpus_assigned -eq $num_gpus ] && break 51 | sleep 300 52 | done 53 | 54 | source $TF_ENV/$env/bin/activate 55 | $cmd $nnetdir/log/finetune_lr_learning.log utils/parallel/limit_num_gpus.sh --num-gpus $num_gpus \ 56 | python nnet/lib/finetune_lr_learning.py --tune_period $tune_period --checkpoint $checkpoint --config $config $train $train_spklist $valid $valid_spklist $pretrain_nnetdir $nnetdir 57 | deactivate 58 | 59 | exit 0 -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_train_lr_learning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd="run.pl" 4 | env=tf_gpu 5 | num_gpus=1 6 | tune_period=100 7 | 8 | echo "$0 $@" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . parse_options.sh || exit 1; 12 | 13 | if [ $# != 6 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --tune-period <100>" 17 | echo " --env " 18 | echo " --num-gpus " 19 | exit 100 20 | fi 21 | 22 | config=$1 23 | train=$2 24 | train_spklist=$3 25 | valid=$4 26 | valid_spklist=$5 27 | nnetdir=$6 28 | 29 | # add the library to the python path. 30 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 31 | 32 | mkdir -p $nnetdir/log 33 | 34 | 35 | # Get available GPUs before we can train the network. 
36 | num_total_gpus=`nvidia-smi -L | wc -l` 37 | num_gpus_assigned=0 38 | while [ $num_gpus_assigned -ne $num_gpus ]; do 39 | num_gpus_assigned=0 40 | for i in `seq 0 $[$num_total_gpus-1]`; do 41 | # going over all GPUs and check if it is idle, and add to the list if yes 42 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 43 | num_gpus_assigned=$[$num_gpus_assigned+1] 44 | fi 45 | # once we have enough GPUs, break out of the loop 46 | [ $num_gpus_assigned -eq $num_gpus ] && break 47 | done 48 | [ $num_gpus_assigned -eq $num_gpus ] && break 49 | sleep 300 50 | done 51 | 52 | # Activate the gpu virtualenv 53 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways. 54 | # Limit the GPU number to what we want. 55 | source $TF_ENV/$env/bin/activate 56 | $cmd $nnetdir/log/train_lr_learning.log utils/parallel/limit_num_gpus.sh --num-gpus $num_gpus \ 57 | python nnet/lib/train_lr_learning.py --tune_period $tune_period --config $config $train $train_spklist $valid $valid_spklist $nnetdir 58 | deactivate 59 | 60 | exit 0 -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_finetune_nnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd="run.pl" 4 | continue_training=false 5 | env=tf_gpu 6 | num_gpus=1 7 | checkpoint=-1 8 | 9 | echo "$0 $@" 10 | 11 | if [ -f path.sh ]; then . ./path.sh; fi 12 | . parse_options.sh || exit 1; 13 | 14 | if [ $# != 7 ]; then 15 | echo "Usage: $0 [options] " 16 | echo "Options:" 17 | echo " --continue-training " 18 | echo " --checkpoint <-1>" 19 | echo " --env " 20 | echo " --num-gpus " 21 | exit 100 22 | fi 23 | 24 | config=$1 25 | train=$2 26 | train_spklist=$3 27 | valid=$4 28 | valid_spklist=$5 29 | pretrain_nnetdir=$6 30 | nnetdir=$7 31 | 32 | # add the library to the python path. 
33 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 34 | 35 | mkdir -p $nnetdir/log 36 | 37 | if [ $continue_training == 'true' ]; then 38 | cmdopts="-c" 39 | fi 40 | 41 | # Get available GPUs before we can train the network. 42 | num_total_gpus=`nvidia-smi -L | wc -l` 43 | num_gpus_assigned=0 44 | while [ $num_gpus_assigned -ne $num_gpus ]; do 45 | num_gpus_assigned=0 46 | for i in `seq 0 $[$num_total_gpus-1]`; do 47 | # going over all GPUs and check if it is idle, and add to the list if yes 48 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 49 | num_gpus_assigned=$[$num_gpus_assigned+1] 50 | fi 51 | # once we have enough GPUs, break out of the loop 52 | [ $num_gpus_assigned -eq $num_gpus ] && break 53 | done 54 | [ $num_gpus_assigned -eq $num_gpus ] && break 55 | sleep 300 56 | done 57 | 58 | source $TF_ENV/$env/bin/activate 59 | $cmd $nnetdir/log/train_nnet.log utils/parallel/limit_num_gpus.sh --num-gpus $num_gpus \ 60 | python nnet/lib/finetune.py $cmdopts --checkpoint $checkpoint --config $config $train $train_spklist $valid $valid_spklist $pretrain_nnetdir $nnetdir 61 | deactivate 62 | 63 | exit 0 64 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_train_mi_nnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd="run.pl" 4 | continue_training=false 5 | env=tf_gpu 6 | num_gpus=1 7 | 8 | echo "$0 $@" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . parse_options.sh || exit 1; 12 | 13 | if [ $# != 8 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --continue-training " 17 | echo " --env " 18 | echo " --num-gpus " 19 | exit 100 20 | fi 21 | 22 | config=$1 23 | train=$2 24 | train_aux=$3 25 | train_spklist=$4 26 | valid=$5 27 | valid_aux=$6 28 | valid_spklist=$7 29 | nnetdir=$8 30 | 31 | # add the library to the python path. 
32 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 33 | 34 | mkdir -p $nnetdir/log 35 | 36 | if [ $continue_training == 'true' ]; then 37 | cmdopts="-c" 38 | fi 39 | 40 | # Get available GPUs before we can train the network. 41 | num_total_gpus=`nvidia-smi -L | wc -l` 42 | num_gpus_assigned=0 43 | while [ $num_gpus_assigned -ne $num_gpus ]; do 44 | num_gpus_assigned=0 45 | for i in `seq 0 $[$num_total_gpus-1]`; do 46 | # going over all GPUs and check if it is idle, and add to the list if yes 47 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 48 | num_gpus_assigned=$[$num_gpus_assigned+1] 49 | fi 50 | # once we have enough GPUs, break out of the loop 51 | [ $num_gpus_assigned -eq $num_gpus ] && break 52 | done 53 | [ $num_gpus_assigned -eq $num_gpus ] && break 54 | sleep 300 55 | done 56 | 57 | # Activate the gpu virtualenv 58 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways. 59 | # Limit the GPU number to what we want. 60 | source $TF_ENV/$env/bin/activate 61 | #$cmd $nnetdir/log/train_mi_nnet.log utils/parallel/limit_num_gpus.sh --num-gpus $num_gpus \ 62 | python nnet/lib/train_mi.py $cmdopts --config $config $train $train_aux $train_spklist $valid $valid_aux $valid_spklist $nnetdir 63 | deactivate 64 | 65 | exit 0 -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_train_nnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd="run.pl" 4 | continue_training=false 5 | env=tf_gpu 6 | num_gpus=1 7 | 8 | echo "$0 $@" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . 
parse_options.sh || exit 1; 12 | 13 | if [ $# != 6 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --continue-training " 17 | echo " --env " 18 | echo " --num-gpus " 19 | exit 100 20 | fi 21 | 22 | config=$1 23 | train=$2 24 | train_spklist=$3 25 | valid=$4 26 | valid_spklist=$5 27 | nnetdir=$6 28 | 29 | # add the library to the python path. 30 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 31 | 32 | mkdir -p $nnetdir/log 33 | 34 | if [ $continue_training == 'true' ]; then 35 | cmdopts="-c" 36 | fi 37 | 38 | # Get available GPUs before we can train the network. 39 | num_total_gpus=`nvidia-smi -L | wc -l` 40 | num_gpus_assigned=0 41 | while [ $num_gpus_assigned -ne $num_gpus ]; do 42 | num_gpus_assigned=0 43 | for i in `seq 0 $[$num_total_gpus-1]`; do 44 | # going over all GPUs and check if it is idle, and add to the list if yes 45 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 46 | num_gpus_assigned=$[$num_gpus_assigned+1] 47 | fi 48 | # once we have enough GPUs, break out of the loop 49 | [ $num_gpus_assigned -eq $num_gpus ] && break 50 | done 51 | [ $num_gpus_assigned -eq $num_gpus ] && break 52 | sleep 300 53 | done 54 | 55 | if [ -d $nnetdir/log ] && [ `ls $nnetdir/log | wc -l` -ge 1 ]; then 56 | mkdir -p $nnetdir/.backup/log 57 | cp $nnetdir/log/* $nnetdir/.backup/log 58 | fi 59 | 60 | # Activate the gpu virtualenv 61 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways. 62 | # Limit the GPU number to what we want. 
63 | source $TF_ENV/$env/bin/activate 64 | $cmd $nnetdir/log/train_nnet.log utils/parallel/limit_num_gpus.sh --num-gpus $num_gpus \ 65 | python nnet/lib/train.py $cmdopts --config $config $train $train_spklist $valid $valid_spklist $nnetdir 66 | deactivate 67 | 68 | exit 0 -------------------------------------------------------------------------------- /misc/tuning/target_logit_curve.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | angle = np.arange(0, 180, 1) 5 | softmax = np.cos(angle/180*np.pi) 6 | 7 | m = 0.2 8 | amsoftmax = np.cos(angle/180*np.pi) - m 9 | 10 | m = 0.3 11 | arcsoftmax = np.cos(angle/180*np.pi + m) 12 | 13 | m = 4.0 14 | l = 10.0 15 | angle1 = np.arange(0, 180/4, 1) 16 | a1 = l / (1+l) * np.cos(angle1/180*np.pi) + 1 / (1+l) * np.cos(4 * angle1/180*np.pi) 17 | angle2 = np.arange(180/4, 180/2, 1) 18 | a2 = l / (1+l) * np.cos(angle2/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle2/180*np.pi) - 2) 19 | angle3 = np.arange(180/2, 180*3/4, 1) 20 | a3 = l / (1+l) * np.cos(angle3/180*np.pi) + 1 / (1+l) * (np.cos(4 * angle3/180*np.pi) - 4) 21 | angle4 = np.arange(180*3/4, 180, 1) 22 | a4 = l / (1+l) * np.cos(angle4/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle4/180*np.pi) - 6) 23 | angle_new = np.concatenate([angle1, angle2, angle3, angle4], axis=0) 24 | asoftmax = np.concatenate([a1, a2, a3, a4], axis=0) 25 | 26 | l = 0 27 | a1 = l / (1+l) * np.cos(angle1/180*np.pi) + 1 / (1+l) * np.cos(4 * angle1/180*np.pi) 28 | a2 = l / (1+l) * np.cos(angle2/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle2/180*np.pi) - 2) 29 | a3 = l / (1+l) * np.cos(angle3/180*np.pi) + 1 / (1+l) * (np.cos(4 * angle3/180*np.pi) - 4) 30 | a4 = l / (1+l) * np.cos(angle4/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle4/180*np.pi) - 6) 31 | asoftmax_nolambda = np.concatenate([a1, a2, a3, a4], axis=0) 32 | 33 | m = 1.20 34 | asoftmax_new = np.cos(m * angle / 180 * np.pi) 35 | 36 | plt.figure(1) 37 | 
plt.plot(angle, softmax, 'b', label='Softmax') 38 | plt.plot(angle_new, asoftmax_nolambda, 'r', label='ASoftmax ($m_1=4$, $\lambda=0$)') 39 | plt.plot(angle_new, asoftmax, 'r', label='ASoftmax ($m_1=4$, $\lambda=10$)') 40 | plt.plot(angle, arcsoftmax, 'c', label='ArcSoftmax ($m_2=0.30$)') 41 | plt.plot(angle, amsoftmax, 'm', label='AMSoftmax ($m_3=0.20$)') 42 | plt.xlabel(r'$\theta$', fontsize='x-large') 43 | plt.ylabel(r'$\psi(\theta)$', fontsize='x-large') 44 | plt.xlim((10, 120)) 45 | plt.ylim((-1.0, 1.0)) 46 | plt.legend(loc='lower left', fontsize='medium') 47 | plt.savefig('target_logit_curve.pdf', format='pdf') 48 | plt.show() 49 | -------------------------------------------------------------------------------- /scripts/prepare_bnfeats_for_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nj=30 4 | cmd="run.pl" 5 | stage=0 6 | compress=true 7 | 8 | echo "$0 $@" # Print the command line for logging 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . parse_options.sh || exit 1; 12 | 13 | if [ $# != 3 ]; then 14 | echo "Usage: $0 " 15 | echo "e.g.: $0 data/bnf data/bnf_nosil exp/bnf_nosil" 16 | echo "Options: " 17 | echo " --nj # number of parallel jobs" 18 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 19 | exit 1; 20 | fi 21 | 22 | data_in=$1 23 | data_out=$2 24 | dir=$3 25 | 26 | name=`basename $data_in` 27 | 28 | for f in $data_in/feats.scp $data_in/vad.scp $data_in/cmvn.scp ; do 29 | [ ! -f $f ] && echo "$0: No such file $f" && exit 1; 30 | done 31 | 32 | # Set various variables. 
33 | mkdir -p $dir/log 34 | mkdir -p $data_out 35 | featdir=$(utils/make_absolute.sh $dir) 36 | 37 | cp $data_in/utt2spk $data_out/utt2spk 38 | cp $data_in/spk2utt $data_out/spk2utt 39 | cp $data_in/wav.scp $data_out/wav.scp 40 | [ -f $data_in/segments ] && cp $data_in/segments $data_out/segments 41 | 42 | write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" 43 | 44 | sdata_in=$data_in/split$nj; 45 | utils/split_data.sh $data_in $nj || exit 1; 46 | 47 | $cmd JOB=1:$nj $dir/log/create_bnfeats_${name}.JOB.log \ 48 | apply-cmvn --norm-means=true --norm-vars=false --utt2spk=ark:${sdata_in}/JOB/utt2spk scp:${sdata_in}/JOB/cmvn.scp scp:${sdata_in}/JOB/feats.scp ark:- \| \ 49 | select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ 50 | copy-feats --compress=$compress $write_num_frames_opt ark:- \ 51 | ark,scp:$featdir/bnfeats_${name}.JOB.ark,$featdir/bnfeats_${name}.JOB.scp || exit 1; 52 | 53 | for n in $(seq $nj); do 54 | cat $featdir/bnfeats_${name}.$n.scp || exit 1; 55 | done > ${data_out}/feats.scp || exit 1 56 | 57 | for n in $(seq $nj); do 58 | cat $featdir/log/utt2num_frames.$n || exit 1; 59 | done > $data_out/utt2num_frames || exit 1 60 | rm $featdir/log/utt2num_frames.* 61 | 62 | echo "$0: Succeeded creating bottleneck features with cvmn and vad for $name" 63 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_train_mt_nnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd="run.pl" 4 | continue_training=false 5 | env=tf_gpu 6 | num_gpus=1 7 | 8 | echo "$0 $@" 9 | 10 | if [ -f path.sh ]; then . ./path.sh; fi 11 | . 
parse_options.sh || exit 1; 12 | 13 | if [ $# != 8 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "Options:" 16 | echo " --continue-training " 17 | echo " --env " 18 | echo " --num-gpus " 19 | exit 100 20 | fi 21 | 22 | config=$1 23 | train=$2 24 | train_ali_dir=$3 25 | train_spklist=$4 26 | valid=$5 27 | valid_ali_dir=$6 28 | valid_spklist=$7 29 | nnetdir=$8 30 | 31 | # add the library to the python path. 32 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 33 | 34 | mkdir -p $nnetdir/log 35 | 36 | if [ $continue_training == 'true' ]; then 37 | cmdopts="-c" 38 | fi 39 | 40 | # Get available GPUs before we can train the network. 41 | num_total_gpus=`nvidia-smi -L | wc -l` 42 | num_gpus_assigned=0 43 | while [ $num_gpus_assigned -ne $num_gpus ]; do 44 | num_gpus_assigned=0 45 | for i in `seq 0 $[$num_total_gpus-1]`; do 46 | # going over all GPUs and check if it is idle, and add to the list if yes 47 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 48 | num_gpus_assigned=$[$num_gpus_assigned+1] 49 | fi 50 | # once we have enough GPUs, break out of the loop 51 | [ $num_gpus_assigned -eq $num_gpus ] && break 52 | done 53 | [ $num_gpus_assigned -eq $num_gpus ] && break 54 | sleep 300 55 | done 56 | 57 | if [ -d $nnetdir/log ] && [ `ls $nnetdir/log | wc -l` -ge 1 ]; then 58 | mkdir -p $nnetdir/.backup/log 59 | cp $nnetdir/log/* $nnetdir/.backup/log 60 | fi 61 | 62 | # Activate the gpu virtualenv 63 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways. 64 | # Limit the GPU number to what we want. 
65 | source $TF_ENV/$env/bin/activate 66 | $cmd $nnetdir/log/train_nnet.log utils/parallel/limit_num_gpus.sh --num-gpus $num_gpus \ 67 | python nnet/lib/train_mt.py $cmdopts --config $config $train $train_ali_dir $train_spklist $valid $valid_ali_dir $valid_spklist $nnetdir 68 | deactivate 69 | 70 | exit 0 71 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/lib/train_insight.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | import sys 5 | import numpy, scipy, sklearn 6 | import tensorflow as tf 7 | import numpy as np 8 | from misc.utils import save_codes_and_config, compute_cos_pairwise_eer 9 | from model.trainer import Trainer 10 | from dataset.data_loader import KaldiDataRandomQueue 11 | from dataset.kaldi_io import FeatureReader 12 | 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("data_dir", type=str, help="The data directory of the dataset.") 16 | parser.add_argument("data_spklist", type=str, help="The spklist maps the speakers to the indices.") 17 | parser.add_argument("model", type=str, help="The output model directory.") 18 | 19 | 20 | if __name__ == '__main__': 21 | tf.logging.set_verbosity(tf.logging.INFO) 22 | args = parser.parse_args() 23 | params = save_codes_and_config(True, args.model, None) 24 | 25 | # The model directory always has a folder named nnet 26 | model_dir = os.path.join(args.model, "nnet") 27 | 28 | # Set the random seed. The random operations may appear in data input, batch forming, etc. 
29 | tf.set_random_seed(params.seed) 30 | random.seed(params.seed) 31 | np.random.seed(params.seed) 32 | 33 | dim = FeatureReader(args.data_dir).get_dim() 34 | with open(args.data_spklist, 'r') as f: 35 | num_total_train_speakers = len(f.readlines()) 36 | trainer = Trainer(params, args.model) 37 | trainer.build("valid", 38 | dim=dim, 39 | loss_type=params.loss_func, 40 | num_speakers=num_total_train_speakers) 41 | # valid_loss, valid_embeddings, valid_labels = trainer.valid(args.data_dir, args.data_spklist, 42 | # batch_type=params.batch_type, 43 | # output_embeddings=True) 44 | 45 | valid_loss, valid_embeddings, valid_labels = trainer.insight(args.data_dir, args.data_spklist, 46 | batch_type=params.batch_type, 47 | output_embeddings=True) 48 | eer = compute_cos_pairwise_eer(valid_embeddings, valid_labels) 49 | tf.logging.info("EER: %f" % eer) 50 | trainer.close() 51 | -------------------------------------------------------------------------------- /scripts/prepare_feats_for_multitask_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Apache 2.0. 4 | 5 | nj=40 6 | cmd="run.pl" 7 | stage=0 8 | norm_vars=false 9 | center=true 10 | compress=true 11 | cmn_window=300 12 | 13 | echo "$0 $@" # Print the command line for logging 14 | 15 | if [ -f path.sh ]; then . ./path.sh; fi 16 | . parse_options.sh || exit 1; 17 | if [ $# != 3 ]; then 18 | echo "Usage: $0 " 19 | echo "e.g.: $0 data/train data/train_wcmvn exp/make_xvector_features" 20 | echo "Options: " 21 | echo " --nj # number of parallel jobs" 22 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 23 | echo " --norm-vars # If true, normalize variances in the sliding window cmvn" 24 | exit 1; 25 | fi 26 | 27 | data_in=$1 28 | data_out=$2 29 | dir=$3 30 | 31 | name=`basename $data_in` 32 | 33 | for f in $data_in/feats.scp $data_in/vad.scp ; do 34 | [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; 35 | done 36 | 37 | # Set various variables. 38 | mkdir -p $dir/log 39 | mkdir -p $data_out 40 | featdir=$(utils/make_absolute.sh $dir) 41 | 42 | cp $data_in/utt2spk $data_out/utt2spk 43 | cp $data_in/spk2utt $data_out/spk2utt 44 | cp $data_in/wav.scp $data_out/wav.scp 45 | [ -f $data_in/segments ] && cp $data_in/segments $data_out/segments 46 | [ -f $data_in/vad.scp ] && cp $data_in/vad.scp $data_out/vad.scp 47 | 48 | write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" 49 | 50 | sdata_in=$data_in/split$nj; 51 | utils/split_data.sh $data_in $nj || exit 1; 52 | # Pass the parsed --norm-vars/--center options through instead of hard-coding them (defaults are unchanged). 53 | $cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ 54 | apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=$cmn_window \ 55 | scp:${sdata_in}/JOB/feats.scp ark:- \| \ 56 | copy-feats --compress=$compress $write_num_frames_opt ark:- \ 57 | ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; 58 | 59 | for n in $(seq $nj); do 60 | cat $featdir/xvector_feats_${name}.$n.scp || exit 1; 61 | done > ${data_out}/feats.scp || exit 1 62 | 63 | for n in $(seq $nj); do 64 | cat $featdir/log/utt2num_frames.$n || exit 1; 65 | done > $data_out/utt2num_frames || exit 1 66 | rm $featdir/log/utt2num_frames.* 67 | 68 | echo "$0: Succeeded creating xvector features for $name" 69 | -------------------------------------------------------------------------------- /scripts/lmrescore_const_arpa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | # This script rescores lattices with the ConstArpaLm format language model. 7 | 8 | # Begin configuration section. 9 | cmd=run.pl 10 | skip_scoring=false 11 | stage=1 12 | scoring_opts= 13 | # End configuration section. 14 | 15 | echo "$0 $@" # Print the command line for logging 16 | 17 | .
./utils/parse_options.sh 18 | 19 | if [ $# != 5 ]; then 20 | echo "Does language model rescoring of lattices (remove old LM, add new LM)" 21 | echo "Usage: $0 [options] \\" 22 | echo " " 23 | echo "options: [--cmd (run.pl|queue.pl [queue opts])]" 24 | exit 1; 25 | fi 26 | 27 | [ -f path.sh ] && . ./path.sh; 28 | 29 | oldlang=$1 30 | newlang=$2 31 | data=$3 32 | indir=$4 33 | outdir=$5 34 | 35 | oldlm=$oldlang/G.fst 36 | newlm=$newlang/G.carpa 37 | ! cmp $oldlang/words.txt $newlang/words.txt &&\ 38 | echo "$0: Warning: vocabularies may be incompatible." 39 | [ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; 40 | [ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1; 41 | ! ls $indir/lat.*.gz >/dev/null &&\ 42 | echo "$0: No lattices input directory $indir" && exit 1; 43 | 44 | if ! cmp -s $oldlang/words.txt $newlang/words.txt; then 45 | echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; 46 | fi 47 | 48 | rm -f $outdir/lat.*.gz 49 | 50 | oldlmcommand="fstproject --project_output=true $oldlm |" 51 | mkdir -p $outdir/log 52 | nj=`cat $indir/num_jobs` || exit 1; 53 | cp $indir/num_jobs $outdir 54 | 55 | if [ $stage -le 1 ]; then 56 | $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ 57 | lattice-lmrescore --lm-scale=-1.0 \ 58 | "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ 59 | lattice-lmrescore-const-arpa --lm-scale=1.0 \ 60 | ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; 61 | fi 62 | 63 | if ! $skip_scoring && [ $stage -le 2 ]; then 64 | err_msg="Not scoring because local/score.sh does not exist or not executable." 65 | [ ! -x scripts/diagnostic/score.sh ] && echo $err_msg && exit 1; 66 | scripts/diagnostic/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir 67 | else 68 | echo "Not scoring because requested so..." 
69 | fi 70 | 71 | exit 0; -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/lib/train_mt_lr_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | import sys 5 | import tensorflow as tf 6 | import numpy as np 7 | from misc.utils import ValidLoss, load_valid_loss, save_codes_and_config, compute_cos_pairwise_eer 8 | from dataset.multitask.data_loader_v2 import KaldiDataRandomQueueV2 9 | from dataset.kaldi_io import FeatureReaderV2 10 | from six.moves import range 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-c", "--cont", action="store_true", help="Continue training from an existing model.") 14 | parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.") 15 | parser.add_argument("--config", type=str, help="The configuration file.") 16 | parser.add_argument("train_data_dir", type=str, help="The data directory of the training set.") 17 | parser.add_argument("train_ali_dir", type=str, help="The ali directory of the training set.") 18 | parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.") 19 | parser.add_argument("model", type=str, help="The output model directory.") 20 | 21 | 22 | if __name__ == '__main__': 23 | tf.logging.set_verbosity(tf.logging.INFO) 24 | args = parser.parse_args() 25 | params = save_codes_and_config(False, args.model, args.config) 26 | 27 | # The model directory always has a folder named nnet 28 | model_dir = os.path.join(args.model, "nnet") 29 | 30 | # Set the random seed. The random operations may appear in data input, batch forming, etc. 
31 | tf.set_random_seed(params.seed) 32 | random.seed(params.seed) 33 | np.random.seed(params.seed) 34 | 35 | start_epoch = 0 36 | 37 | feat_reader = FeatureReaderV2(args.train_data_dir, args.train_ali_dir) 38 | dim = feat_reader.get_dim() 39 | 40 | feat_reader = KaldiDataRandomQueueV2(args.train_data_dir, args.train_ali_dir, args.train_spklist) 41 | num_total_speakers = feat_reader.num_total_speakers 42 | num_total_phones = feat_reader.num_total_phones 43 | 44 | from model.multitask_v1.base_v1 import BaseMT 45 | 46 | trainer = BaseMT(params, args.model, dim, num_total_speakers, num_total_phones) 47 | trainer.build("train") 48 | trainer.train_tune_lr(args.train_data_dir, args.train_ali_dir, args.train_spklist, args.tune_period) 49 | trainer.close() 50 | tf.logging.info("Finish tuning.") 51 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/lib/train_vae_lr_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | import tensorflow as tf 5 | import numpy as np 6 | from misc.utils import ValidLoss, save_codes_and_config 7 | from dataset.data_loader import KaldiDataRandomQueueV2 8 | from dataset.kaldi_io import FeatureReaderV2 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.") 12 | parser.add_argument("--config", type=str, help="The configuration file.") 13 | parser.add_argument("train_data_dir", type=str, help="The data directory of the training set.") 14 | parser.add_argument("train_ali_dir", type=str, help="The ali directory of the training set.") 15 | parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.") 16 | parser.add_argument("model", type=str, help="The output model directory.") 17 | 18 | 19 | if __name__ == '__main__': 20 | 
tf.logging.set_verbosity(tf.logging.INFO) 21 | args = parser.parse_args() 22 | params = save_codes_and_config(False, args.model, args.config) 23 | 24 | # The model directory always has a folder named nnet 25 | model_dir = os.path.join(args.model, "nnet") 26 | 27 | # Set the random seed. The random operations may appear in data input, batch forming, etc. 28 | tf.set_random_seed(params.seed) 29 | random.seed(params.seed) 30 | np.random.seed(params.seed) 31 | 32 | start_epoch = 0 33 | 34 | feat_reader = FeatureReaderV2(args.train_data_dir, args.train_ali_dir) 35 | dim = feat_reader.get_dim() 36 | feat_reader = KaldiDataRandomQueueV2(args.train_data_dir, args.train_ali_dir, args.train_spklist) 37 | num_total_speakers = feat_reader.num_total_speakers 38 | num_total_phones = feat_reader.num_total_phones 39 | min_valid_loss = ValidLoss() 40 | 41 | from model.vae.base_v1 import BaseMT 42 | trainer = BaseMT(params, args.model, dim, num_total_speakers, num_total_phones) 43 | trainer.build("train") 44 | 45 | # You can tune the learning rate using the following function. 46 | # After training, you should plot the loss vs. the learning rate and pick a learning rate that decreases the 47 | # loss fastest. 
48 | trainer.train_tune_lr(args.train_data_dir, args.train_ali_dir, args.train_spklist, args.tune_period) 49 | trainer.close() 50 | tf.logging.info("Finish tuning.") 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | share/python-wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 
101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | dmypy.json 112 | 113 | # Pyre type checker 114 | .pyre/ 115 | 116 | .idea/ 117 | 118 | # MacOS 119 | .DS_Store 120 | 121 | # Matlab 122 | *.m~ 123 | ._* 124 | 125 | ._.DS_Store* 126 | 127 | # backup 128 | egs/voxceleb/v1/nnet_conf.bak/* 129 | egs/voxceleb/v2/nnet_conf.bak/* 130 | egs/sre/v1/nnet_conf.bak/* 131 | egs/sre/v1/nnet_conf/test.json 132 | egs/mgb* 133 | egs/leap* 134 | egs/fisher/v1/nnet_conf.bak/* 135 | egs/fisher/v2* 136 | 137 | # Unused 138 | egs/voxceleb/nnet/run_extract_bnf_mi_embeddings.sh 139 | egs/voxceleb/nnet/run_train_mi_nnet.sh 140 | egs/voxceleb/nnet/wrap/extract_mi_wrapper.sh 141 | scripts/prepare_bnfeats_for_egs.sh 142 | scripts/extract_bnf.sh 143 | 144 | # intermediate files 145 | misc/tools/score* 146 | misc/tuning* 147 | 148 | *.png 149 | -------------------------------------------------------------------------------- /model/multitask_v1/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def make_phone_masks(length, resample, num_frames_per_utt): 5 | """Randomly select frames for each utterance. 6 | 7 | Args: 8 | length: The length of each utterance. 9 | resample: If 0, return the beginning frames; otherwise randomly select frames. 10 | resample is designed to give every frame the same probability of being sampled (only the commented-out uniform-sampling code reads it). 11 | num_frames_per_utt: #frames selected. 
if -1, then select all frames 12 | :return: a mat with [n_selected_frames, 2], each row is the index of the selected frame 13 | """ 14 | n_utts = length.shape[0] 15 | 16 | # This sampling strategy will make the sampling probability of each frame the same 17 | if num_frames_per_utt == -1: 18 | mat = [] 19 | for i in range(n_utts): 20 | for j in range(length[i]): 21 | mat.append([i, j]) 22 | mat = np.array(mat, dtype=np.int32) 23 | else: 24 | # # Uniform sampling 25 | # mat = np.zeros((length.shape[0] * num_frames_per_utt, 2), dtype=np.int32) 26 | # assert num_frames_per_utt > 0, "The num of frames should be greater than 0 (or -1)" 27 | # for i in range(n_utts): 28 | # mat[i * num_frames_per_utt:(i+1) * num_frames_per_utt, 0] = i 29 | # if resample[i] == 1: 30 | # # Resample the last segment 31 | # tmp = [] 32 | # for _ in range(num_frames_per_utt): 33 | # while True: 34 | # a = np.random.randint(0, length[i], dtype=np.int32) 35 | # if a not in tmp: 36 | # tmp.append(a) 37 | # break 38 | # mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 1] = tmp 39 | # else: 40 | # mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 1] = np.arange(num_frames_per_utt, dtype=np.int32) 41 | 42 | # Totally random sampling (the central frames will get higher sampling probabilities) 43 | mat = np.zeros((length.shape[0] * num_frames_per_utt, 2), dtype=np.int32) 44 | for i in range(n_utts): 45 | mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 0] = i 46 | # Resample the last segment 47 | tmp = [] 48 | for _ in range(num_frames_per_utt): 49 | while True: 50 | a = np.random.randint(0, length[i], dtype=np.int32) 51 | if a not in tmp: 52 | tmp.append(a) 53 | break 54 | mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 1] = tmp 55 | 56 | return mat 57 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/lib/train_lr_learning.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | import argparse 3 | import random 4 | import sys 5 | import tensorflow as tf 6 | import numpy as np 7 | from misc.utils import ValidLoss, load_valid_loss, save_codes_and_config 8 | from model.trainer import Trainer 9 | from dataset.data_loader import KaldiDataRandomQueue 10 | from dataset.kaldi_io import FeatureReader 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.") 14 | parser.add_argument("--config", type=str, help="The configuration file.") 15 | parser.add_argument("train_dir", type=str, help="The data directory of the training set.") 16 | parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.") 17 | parser.add_argument("valid_dir", type=str, help="The data directory of the validation set.") 18 | parser.add_argument("valid_spklist", type=str, help="The spklist maps the VALID speakers to the indices.") 19 | parser.add_argument("model", type=str, help="The output model directory.") 20 | 21 | 22 | if __name__ == '__main__': 23 | tf.logging.set_verbosity(tf.logging.INFO) 24 | args = parser.parse_args() 25 | params = save_codes_and_config(False, args.model, args.config) 26 | 27 | # The model directory always has a folder named nnet 28 | model_dir = os.path.join(args.model, "nnet") 29 | 30 | # Set the random seed. The random operations may appear in data input, batch forming, etc. 
31 | tf.set_random_seed(params.seed) 32 | random.seed(params.seed) 33 | np.random.seed(params.seed) 34 | 35 | start_epoch = 0 36 | 37 | dim = FeatureReader(args.train_dir).get_dim() 38 | with open(os.path.join(model_dir, "feature_dim"), "w") as f: 39 | f.write("%d\n" % dim) 40 | 41 | num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers 42 | tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim)) 43 | 44 | # Load the history valid loss 45 | min_valid_loss = ValidLoss() 46 | 47 | # The trainer is used to control the training process 48 | trainer = Trainer(params, args.model) 49 | trainer.build("train", 50 | dim=dim, 51 | loss_type=params.loss_func, 52 | num_speakers=num_total_train_speakers) 53 | trainer.build("valid", 54 | dim=dim, 55 | loss_type=params.loss_func, 56 | num_speakers=num_total_train_speakers) 57 | 58 | # You can tune the learning rate using the following function. 59 | # After training, you should plot the loss vs. the learning rate and pick a learning rate that decreases the 60 | # loss fastest. 61 | trainer.train_tune_lr(args.train_dir, args.train_spklist, args.tune_period) 62 | trainer.close() 63 | tf.logging.info("Finish tuning.") 64 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_extract_mt_phone_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nj=32 4 | use_gpu=false 5 | cmd="run.pl" 6 | min_chunk_size=25 7 | chunk_size=10000 8 | stage=0 9 | normalize=false 10 | checkpoint=-1 11 | env=tf_cpu 12 | node="output" 13 | compress=true 14 | 15 | echo "$0 $@" 16 | 17 | if [ -f path.sh ]; then . ./path.sh; fi 18 | . 
parse_options.sh || exit 1; 19 | 20 | if [ $# != 4 ]; then 21 | echo "Usage: $0 [options] " 22 | echo "Options:" 23 | echo " --use-gpu " 24 | echo " --nj <32>" 25 | echo " --min-chunk-size <25>" 26 | echo " --chunk-size <10000>" 27 | echo " --normalize " 28 | echo " --checkpoint <-1>" 29 | echo " --node " 30 | echo " --compress " 31 | echo "" 32 | exit 100 33 | fi 34 | 35 | nnetdir=$1 36 | data=$2 37 | alidir=$3 38 | dir=$4 39 | 40 | for f in $nnetdir/nnet/checkpoint $data/feats.scp $data/vad.scp $alidir/pdf.scp; do 41 | [ ! -f $f ] && echo "No such file $f" && exit 1; 42 | done 43 | 44 | mkdir -p $dir/log 45 | 46 | utils/split_data.sh $data $nj 47 | echo "$0: extracting embeddings for $data" 48 | sdata=$data/split$nj/JOB 49 | 50 | # Filter the alignments to match the feats. 51 | utils/filter_scps.pl JOB=1:$nj \ 52 | $data/split${nj}/JOB/utt2spk $alidir/pdf.scp $data/split${nj}/JOB/pdf.scp || exit 1; 53 | 54 | feat="ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:${sdata}/feats.scp ark:- |" 55 | 56 | # I use conda to load TF (in the CPU case), so some preparation is needed before running python. A wrapper makes things more flexible. 57 | # If no conda is used, simply set "--use-env false" 58 | if [ $stage -le 0 ]; then 59 | echo "$0: extracting xvectors from nnet" 60 | echo "$0: embedding from node $node" 61 | 62 | # Set the checkpoint. 63 | source $TF_ENV/$env/bin/activate 64 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 65 | python nnet/lib/make_checkpoint.py --checkpoint $checkpoint "$nnetdir" 66 | deactivate 67 | 68 | if $use_gpu; then 69 | echo "Using CPU to do inference is a better choice." 
70 | exit 1 71 | # $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \ 72 | # nnet/wrap/extract_wrapper.sh --gpuid JOB --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size --normalize $normalize \ 73 | # "$nnetdir" "$feat" "ark:| copy-vector ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp" 74 | else 75 | $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \ 76 | nnet/wrap/extract_mt_phone_wrapper.sh --gpuid -1 --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size \ 77 | --normalize $normalize --node $node \ 78 | "$nnetdir" "$feat" ${sdata}/pdf.scp "ark:| copy-feats --compress=$compress ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp" 79 | 80 | fi 81 | fi 82 | 83 | if [ $stage -le 1 ]; then 84 | echo "$0: combining xvectors across jobs" 85 | for j in $(seq $nj); do cat $dir/xvector.$j.scp; done > $dir/xvector.scp || exit 1; 86 | fi 87 | 88 | exit 0 -------------------------------------------------------------------------------- /misc/tuning/tune_lr.m: -------------------------------------------------------------------------------- 1 | clear 2 | close all; 3 | 4 | lr = [0.000010 5 | 0.000012 6 | 0.000013 7 | 0.000015 8 | 0.000017 9 | 0.000020 10 | 0.000023 11 | 0.000027 12 | 0.000031 13 | 0.000035 14 | 0.000040 15 | 0.000047 16 | 0.000054 17 | 0.000062 18 | 0.000071 19 | 0.000081 20 | 0.000094 21 | 0.000108 22 | 0.000124 23 | 0.000142 24 | 0.000164 25 | 0.000188 26 | 0.000216 27 | 0.000249 28 | 0.000286 29 | 0.000329 30 | 0.000379 31 | 0.000435 32 | 0.000501 33 | 0.000576 34 | 0.000662 35 | 0.000761 36 | 0.000876 37 | 0.001007 38 | 0.001158 39 | 0.001332 40 | 0.001532 41 | 0.001761 42 | 0.002025 43 | 0.002329 44 | 0.002679 45 | 0.003080 46 | 0.003542 47 | 0.004074 48 | 0.004685 49 | 0.005388 50 | 0.006196 51 | 0.007125 52 | 0.008194 53 | 0.009423 54 | 0.010837 55 | 0.012462 56 | 0.014331 57 | 0.016481 58 | 0.018953 59 | 0.021796 60 | 0.025066 61 | 0.028826 62 | 0.033149 63 | 0.038122 64 | 0.043840 65 | 0.050416 66 | 0.057978 67 | 
0.066675 68 | 0.076676 69 | 0.088178 70 | 0.101405 71 | 0.116615 72 | 0.134108 73 | 0.154224 74 | 0.177357 75 | 0.203961 76 | 0.234555 77 | 0.269738 78 | 0.310199 79 | 0.356729 80 | 0.410238 81 | 0.471774 82 | 0.542540 83 | 0.623921 84 | 0.717509 85 | 0.825135 86 | 0.948905 87 | 1.091241 88 | 1.254927 89 | 1.443166 90 | 1.659641 91 | 1.908588 92 | 2.194876 93 | 2.524107 94 | 2.902723 95 | 3.338132 96 | 3.838852 97 | 4.414679 98 | 5.076881 99 | 5.838413 100 | 6.714175 101 | 7.721302 102 | 8.879497 103 | 10.211421]; 104 | 105 | loss = [7.862374 106 | 7.870405 107 | 7.771949 108 | 7.787009 109 | 7.566071 110 | 7.733312 111 | 6.704276 112 | 7.570509 113 | 6.750152 114 | 7.261982 115 | 6.866084 116 | 6.672805 117 | 6.590648 118 | 6.298755 119 | 6.757052 120 | 6.595728 121 | 6.486756 122 | 5.642969 123 | 6.621517 124 | 6.393176 125 | 6.472243 126 | 6.267687 127 | 6.596249 128 | 6.058064 129 | 6.151696 130 | 6.340888 131 | 5.645424 132 | 6.459932 133 | 6.390144 134 | 5.754430 135 | 5.931551 136 | 5.213816 137 | 5.011546 138 | 6.196012 139 | 5.601851 140 | 4.494273 141 | 5.674572 142 | 5.236257 143 | 5.222935 144 | 5.152613 145 | 5.424874 146 | 4.766945 147 | 4.949891 148 | 4.694318 149 | 4.824037 150 | 4.918430 151 | 4.113710 152 | 4.040040 153 | 4.822907 154 | 3.912708 155 | 4.655045 156 | 4.444558 157 | 4.385447 158 | 4.390773 159 | 4.715075 160 | 4.955003 161 | 4.506167 162 | 4.952337 163 | 4.916849 164 | 4.933915 165 | 5.132740 166 | 4.584652 167 | 5.110647 168 | 5.910470 169 | 5.527468 170 | 5.851896 171 | 5.173183 172 | 5.006588 173 | 5.238864 174 | 6.415123 175 | 5.679238 176 | 6.092204 177 | 5.951892 178 | 6.053728 179 | 5.838257 180 | 6.347813 181 | 5.253940 182 | 5.873345 183 | 5.180672 184 | 6.765231 185 | 6.544772 186 | 6.581923 187 | 6.521677 188 | 6.496094 189 | 6.449677 190 | 6.650800 191 | 6.242509 192 | 6.709395 193 | 6.472134 194 | 6.652347 195 | 6.052146 196 | 7.097000 197 | 7.214063 198 | 6.960054 199 | 6.783081 200 | 6.404983 201 | 6.553833 202 | 
6.387044 203 | 7.082532 204 | 6.591753 205 | ]; 206 | 207 | sma = 1; 208 | derivatives = (loss(1+sma:end) - loss(1:end-sma))/sma; 209 | derivatives = filter(ones(1,5)/5,1,derivatives); 210 | figure(); 211 | semilogx(lr, loss); 212 | figure(); 213 | semilogx(lr(2:end), derivatives) 214 | 215 | -------------------------------------------------------------------------------- /scripts/extract_bnf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | set -e 6 | 7 | # Begin configuration section. 8 | stage=0 9 | nj=30 10 | cmd="run.pl" 11 | use_gpu=false 12 | compress=true 13 | # End configuration options. 14 | 15 | echo "$0 $@" # Print the command line for logging 16 | 17 | if [ -f path.sh ]; then . ./path.sh; fi 18 | . parse_options.sh || exit 1; 19 | 20 | if [ $# != 5 ]; then 21 | echo "Usage: $0 " 22 | echo " e.g.: $0 exp/nnet data/train data/train_bn exp/train_bn" 23 | echo "main options (for others, see top of script file)" 24 | echo " --config # config containing options" 25 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 26 | echo " --use-gpu # If true, use GPU." 27 | echo " --nj # Number of jobs" 28 | echo " --stage # To control partial reruns" 29 | echo " --compress " 30 | exit 1 31 | fi 32 | 33 | srcdir=$1 34 | output_node=$2 35 | data=$3 36 | bnf_data=$4 37 | dir=$5 38 | 39 | for f in $srcdir/final.mdl $data/feats.scp; do 40 | [ ! -f $f ] && echo "No such file $f" && exit 1; 41 | done 42 | 43 | cmvn_opts=`cat $srcdir/cmvn_opts` 44 | name=`basename $data` 45 | sdata=$data/split$nj 46 | utils/split_data.sh $data $nj 47 | 48 | mkdir -p $dir/log 49 | mkdir -p $bnf_data 50 | 51 | echo "$0: extracting bottleneck features for $data" 52 | 53 | echo "$0: Generating bottleneck features using $srcdir/final.mdl as output of " 54 | echo " component-node with name $output_node." 
55 | echo "output-node name=output input=$output_node" > $dir/extract.config 56 | 57 | raw_nnet="nnet3-am-copy --raw=true $srcdir/final.mdl - | nnet3-copy --nnet-config=$dir/extract.config - - |" 58 | # Set up the features 59 | # The feature processing pipeline: 60 | # apply-cmvn --norm-means=true --norm-vars=false --utt2spk=xxx scp:xxx scp:xxx ark:xxx 61 | feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" 62 | 63 | if [ $stage -le 0 ]; then 64 | echo "$0: extracting xvectors from nnet" 65 | if $use_gpu; then 66 | echo "Set use_gpu=false" 67 | exit 1 68 | else 69 | $cmd JOB=1:$nj $dir/log/extract.JOB.log \ 70 | nnet3-compute --use-gpu=no "$raw_nnet" "$feats" ark:- \| \ 71 | copy-feats --compress=$compress ark:- ark,scp:$dir/raw_bnfeat_$name.JOB.ark,$dir/raw_bnfeat_$name.JOB.scp || exit 1; 72 | fi 73 | fi 74 | 75 | N0=$(cat $data/feats.scp | wc -l) 76 | N1=$(cat $dir/raw_bnfeat_$name.*.scp | wc -l) 77 | if [[ "$N0" != "$N1" ]]; then 78 | echo "$0: Error happens when generating bottleneck features for $name (Original:$N0 BNF:$N1)" 79 | exit 1; 80 | fi 81 | 82 | # Concatenate feats.scp into bnf_data 83 | for n in $(seq $nj); do 84 | cat $dir/raw_bnfeat_$name.$n.scp 85 | done > $bnf_data/feats.scp 86 | 87 | for f in segments spk2utt spk2gender text utt2spk wav.scp vad.scp utt2num_frames char.stm glm kws reco2file_and_channel stm; do 88 | [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f 89 | done 90 | 91 | if [ $stage -le 1 ]; then 92 | echo "$0: computing CMVN stats." 93 | steps/compute_cmvn_stats.sh $bnf_data $dir/log $dir 94 | fi 95 | 96 | echo "$0: done making bottleneck features." 
97 | 98 | exit 0; -------------------------------------------------------------------------------- /model/multitask_v1/pooling.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from model.common import shape_list, dense_relu, dense_tanh, split_heads, combine_last_two_dimensions 3 | import sys 4 | 5 | 6 | VAR2STD_EPSILON = 1e-12 7 | 8 | 9 | def statistics_pooling_v2(features, feat_length, endpoints, params, is_training): 10 | """Statistics pooling 11 | Note that we need to take care of the zeros in the variance since the sqrt on 0 will lead to NaN. 12 | 13 | Args: 14 | features: A tensor with shape [batch, length, dim]. 15 | feat_length: The length of each utterance. 16 | endpoints: Outputs of different parts of the network. 17 | params: 18 | is_training: 19 | :return: 20 | Statistics pooling result [mean, stddev] with shape [batch, dim]. 21 | """ 22 | with tf.variable_scope("stat_pooling"): 23 | feat_shape = shape_list(features) 24 | frame_index = tf.tile(tf.expand_dims(tf.range(feat_shape[1]), axis=0), [feat_shape[0], 1]) 25 | feat_length = tf.expand_dims(feat_length, axis=1) 26 | feat_length_new = tf.tile(feat_length, [1, feat_shape[1]]) 27 | mask = tf.expand_dims(tf.to_float(tf.less(frame_index, feat_length_new)), axis=2) 28 | feat_length = tf.to_float(tf.expand_dims(feat_length, axis=2)) 29 | mean = tf.reduce_sum(features * mask, axis=1, keep_dims=True) / (feat_length + 1e-16) 30 | variance = tf.reduce_sum(tf.squared_difference(features, mean) * mask, axis=1, keep_dims=True) / (feat_length + 1e-16) 31 | 32 | mean = tf.squeeze(mean, 1) 33 | variance = tf.squeeze(variance, 1) 34 | 35 | mask = tf.to_float(tf.less_equal(variance, VAR2STD_EPSILON)) 36 | variance = (1.0 - mask) * variance + mask * VAR2STD_EPSILON 37 | stddev = tf.sqrt(variance) 38 | stat_pooling = tf.concat([mean, stddev], 1, name="concat") 39 | 40 | return stat_pooling 41 | 42 | 43 | if __name__ == "__main__": 44 | num_labels = 10 45 
| num_data = 100 46 | num_length = 1000 47 | num_dim = 1500 48 | features = tf.placeholder(tf.float32, shape=[None, None, num_dim], name="features") 49 | feat_length = tf.placeholder(tf.int32, shape=[None], name="feat_length") 50 | from collections import OrderedDict 51 | endpoints = OrderedDict() 52 | from misc.utils import ParamsPlain 53 | 54 | # Self-attention 55 | params = ParamsPlain() 56 | 57 | stat_pooling = statistics_pooling_v2(features, feat_length, endpoints, params, True) 58 | 59 | with tf.Session() as sess: 60 | sess.run(tf.global_variables_initializer()) 61 | import numpy as np 62 | features_val = np.random.rand(num_data, num_length, num_dim).astype(np.float32) 63 | features_val[0, :, :] = 0 64 | length_val = np.random.randint(100, 1001, size=(num_data)) 65 | stat_pooling_tf = sess.run(stat_pooling, feed_dict={features: features_val, 66 | feat_length: length_val}) 67 | 68 | def compute_stat_pooling(features, length): 69 | num_data, l, dim = features.shape 70 | assert num_data == length.shape[0] 71 | mean = np.zeros((num_data, dim)) 72 | stddev = np.zeros((num_data, dim)) 73 | for i in range(num_data): 74 | for j in range(length[i]): 75 | mean[i, :] += features[i, j, :] 76 | stddev[i, :] += np.square(features[i, j, :]) 77 | mean[i, :] /= length[i] 78 | stddev[i, :] /= length[i] 79 | stddev[i, :] = np.sqrt(np.maximum(stddev[i, :] - np.square(mean[i, :]), 1e-12)) 80 | return np.concatenate([mean, stddev], axis=1) 81 | 82 | stat_pooling_np = compute_stat_pooling(features_val, length_val) 83 | assert np.allclose(stat_pooling_tf, stat_pooling_np) 84 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/lib/finetune_lr_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | import sys 5 | import tensorflow as tf 6 | import numpy as np 7 | from misc.utils import get_pretrain_model 8 | from misc.utils import 
ValidLoss, save_codes_and_config, compute_cos_pairwise_eer 9 | from model.trainer import Trainer 10 | from dataset.data_loader import KaldiDataRandomQueue 11 | from dataset.kaldi_io import FeatureReader 12 | from six.moves import range 13 | 14 | # We don't need to use a `continue` option here, because if we want to resume training, we should simply use train.py. 15 | # In the beginning of finetuning, we want to restore a part of the model rather than the entire graph. 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.") 18 | parser.add_argument("--checkpoint", type=str, default="-1", help="The checkpoint in the pre-trained model. The default is to load the BEST checkpoint (according to valid_loss)") 19 | parser.add_argument("--config", type=str, help="The configuration file.") 20 | parser.add_argument("train_dir", type=str, help="The data directory of the training set.") 21 | parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.") 22 | parser.add_argument("valid_dir", type=str, help="The data directory of the validation set.") 23 | parser.add_argument("valid_spklist", type=str, help="The spklist maps the VALID speakers to the indices.") 24 | parser.add_argument("pretrain_model", type=str, help="The pre-trained model directory.") 25 | parser.add_argument("finetune_model", type=str, help="The fine-tuned model directory") 26 | 27 | 28 | if __name__ == '__main__': 29 | tf.logging.set_verbosity(tf.logging.INFO) 30 | args = parser.parse_args() 31 | params = save_codes_and_config(False, args.finetune_model, args.config) 32 | 33 | # Load the pre-trained model to the target model directory. 34 | # The pre-trained model will be copied as the fine-tuned model and can be loaded from the new directory. 35 | # The pre-trained model is now just like an initialized model. 
36 | get_pretrain_model(os.path.join(args.pretrain_model, "nnet"), 37 | os.path.join(args.finetune_model, "nnet"), 38 | args.checkpoint) 39 | 40 | # The model directory always has a folder named nnet 41 | model_dir = os.path.join(args.finetune_model, "nnet") 42 | 43 | # Set the random seed. The random operations may appear in data input, batch forming, etc. 44 | tf.set_random_seed(params.seed) 45 | random.seed(params.seed) 46 | np.random.seed(params.seed) 47 | 48 | dim = FeatureReader(args.train_dir).get_dim() 49 | with open(os.path.join(model_dir, "feature_dim"), "w") as f: 50 | f.write("%d\n" % dim) 51 | 52 | num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers 53 | tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim)) 54 | 55 | min_valid_loss = ValidLoss() 56 | 57 | # The trainer is used to control the training process 58 | trainer = Trainer(params, args.finetune_model) 59 | trainer.build("train", 60 | dim=dim, 61 | loss_type=params.loss_func, 62 | num_speakers=num_total_train_speakers) 63 | trainer.build("valid", 64 | dim=dim, 65 | loss_type=params.loss_func, 66 | num_speakers=num_total_train_speakers) 67 | 68 | # Load the pre-trained model and transfer to current model 69 | trainer.get_finetune_model(params.noload_var_list) 70 | 71 | trainer.train_tune_lr(args.train_dir, args.train_spklist, args.tune_period) 72 | trainer.close() 73 | tf.logging.info("Finish tuning.") 74 | -------------------------------------------------------------------------------- /egs/voxceleb/v1/nnet/run_extract_embeddings_no_vad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nj=32 4 | use_gpu=false 5 | cmd="run.pl" 6 | min_chunk_size=25 7 | chunk_size=10000 8 | stage=0 9 | normalize=false 10 | checkpoint=-1 11 | env=tf_cpu 12 | node="output" 13 | 14 | echo "$0 $@" 15 | 16 | if [ -f path.sh ]; then . 
./path.sh; fi 17 | . parse_options.sh || exit 1; 18 | 19 | if [ $# != 3 ]; then 20 | echo "Usage: $0 [options] " 21 | echo "Options:" 22 | echo " --use-gpu " 23 | echo " --nj <32>" 24 | echo " --min-chunk-size <25>" 25 | echo " --chunk-size <10000>" 26 | echo " --normalize " 27 | echo " --checkpoint <-1>" 28 | echo " --node " 29 | echo "" 30 | exit 100 31 | fi 32 | 33 | nnetdir=$1 34 | data=$2 35 | dir=$3 36 | 37 | for f in $nnetdir/nnet/checkpoint $data/feats.scp; do 38 | [ ! -f $f ] && echo "No such file $f" && exit 1; 39 | done 40 | 41 | mkdir -p $dir/log 42 | 43 | utils/split_data.sh $data $nj 44 | echo "$0: extracting embeddings for $data" 45 | sdata=$data/split$nj/JOB 46 | 47 | feat="ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:${sdata}/feats.scp ark:- |" 48 | 49 | # I use conda to load TF (in cpu case), so some preparations are applied before python. So a wrapper make things more flexible. 50 | # If no conda is used, simply set "--use-env false" 51 | if [ $stage -le 0 ]; then 52 | echo "$0: extracting xvectors from nnet" 53 | echo "$0: embedding from node $node" 54 | 55 | # Set the checkpoint. 56 | source $TF_ENV/$env/bin/activate 57 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH 58 | python nnet/lib/make_checkpoint.py --checkpoint $checkpoint "$nnetdir" 59 | deactivate 60 | 61 | if $use_gpu; then 62 | echo "Using CPU to do inference is a better choice." 
63 | exit 1 64 | # $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \ 65 | # nnet/wrap/extract_wrapper.sh --gpuid JOB --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size --normalize $normalize \ 66 | # "$nnetdir" "$feat" "ark:| copy-vector ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp" 67 | else 68 | $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \ 69 | nnet/wrap/extract_wrapper.sh --gpuid -1 --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size \ 70 | --normalize $normalize --node $node \ 71 | "$nnetdir" "$feat" "ark:| copy-vector ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp" 72 | fi 73 | fi 74 | 75 | if [ $stage -le 1 ]; then 76 | echo "$0: combining xvectors across jobs" 77 | for j in $(seq $nj); do cat $dir/xvector.$j.scp; done >$dir/xvector.scp || exit 1; 78 | fi 79 | 80 | if [ $stage -le 2 ]; then 81 | # Average the utterance-level xvectors to get speaker-level xvectors 82 | echo "$0: computing mean of xvectors for each speaker" 83 | if $normalize; then 84 | echo "$0: Normalize xvectors before computing the mean." 
85 | $cmd $dir/log/speaker_mean.log \ 86 | ivector-normalize-length --scaleup=false scp:$dir/xvector.scp ark:- \| \ 87 | ivector-mean ark:$data/spk2utt ark:- ark:- ark,t:$dir/num_utts.ark \| \ 88 | ivector-normalize-length --scaleup=false ark:- ark,scp:$dir/spk_xvector.ark,$dir/spk_xvector.scp || exit 1 89 | else 90 | $cmd $dir/log/speaker_mean.log \ 91 | ivector-mean ark:$data/spk2utt scp:$dir/xvector.scp \ 92 | ark,scp:$dir/spk_xvector.ark,$dir/spk_xvector.scp ark,t:$dir/num_utts.ark || exit 1; 93 | fi 94 | fi 95 | 96 | if [ $stage -le 3 ]; then 97 | if $normalize; then 98 | # Normalize the output embeddings 99 | cp $dir/xvector.scp $dir/xvector_before_norm.scp 100 | $cmd $dir/log/length_norm.log \ 101 | ivector-normalize-length --scaleup=false scp:$dir/xvector_before_norm.scp ark,scp:$dir/xvector.ark,$dir/xvector.scp 102 | fi 103 | fi 104 | 105 | exit 0 --------------------------------------------------------------------------------