├── egs
    ├── __init__.py
    ├── voxceleb
    │   ├── v1
    │   │   ├── __init__.py
    │   │   ├── nnet
    │   │   │   ├── __init__.py
    │   │   │   ├── lib
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── make_checkpoint.py
    │   │   │   │   ├── train_insight.py
    │   │   │   │   ├── train_mt_lr_learning.py
    │   │   │   │   ├── train_vae_lr_learning.py
    │   │   │   │   ├── train_lr_learning.py
    │   │   │   │   └── finetune_lr_learning.py
    │   │   │   ├── wrap
    │   │   │   │   ├── extract_wrapper.sh
    │   │   │   │   ├── extract_mt_wrapper.sh
    │   │   │   │   ├── extract_mi_wrapper.sh
    │   │   │   │   └── extract_mt_phone_wrapper.sh
    │   │   │   ├── run_finetune_lr_learning.sh
    │   │   │   ├── run_train_lr_learning.sh
    │   │   │   ├── run_finetune_nnet.sh
    │   │   │   ├── run_train_mi_nnet.sh
    │   │   │   ├── run_train_nnet.sh
    │   │   │   ├── run_train_mt_nnet.sh
    │   │   │   ├── run_extract_mt_phone_embeddings.sh
    │   │   │   └── run_extract_embeddings_no_vad.sh
    │   │   ├── slurm_conf
    │   │   │   └── slurm.conf
    │   │   ├── path.sh
    │   │   ├── nnet_conf
    │   │   │   ├── tdnn_softmax_1e-2.json
    │   │   │   ├── tdnn_asoftmax_m1_linear_bn_1e-2.json
    │   │   │   ├── tdnn_asoftmax_m2_linear_bn_1e-2.json
    │   │   │   ├── tdnn_asoftmax_m4_linear_bn_1e-2.json
    │   │   │   ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json
    │   │   │   ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json
    │   │   │   ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json
    │   │   │   ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json
    │   │   │   ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json
    │   │   │   ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json
    │   │   │   ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json
    │   │   │   ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json
    │   │   │   ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json
    │   │   │   ├── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json
    │   │   │   ├── tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json
    │   │   │   ├── tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json
    │   │   │   ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json
    │   │   │   ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json
    │   │   │   └── tdnn_amsoftmax_m0.20_linear_bn_1e-2_tdnn4_att.json
    │   │   └── cmd.sh
    │   └── v2_unfinished
    │   │   ├── path.sh
    │   │   ├── nnet_conf
    │   │       ├── tdnn_softmax_1e-2.json
    │   │       ├── tdnn_asoftmax_m1_linear_bn_1e-2.json
    │   │       ├── tdnn_asoftmax_m2_linear_bn_1e-2.json
    │   │       ├── tdnn_asoftmax_m4_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json
    │   │       └── tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json
    │   │   └── cmd.sh
    ├── sre
    │   └── v1
    │   │   ├── cmd.sh
    │   │   ├── path.sh
    │   │   └── nnet_conf
    │   │       ├── test.json
    │   │       ├── tdnn_softmax_1e-4.json
    │   │       ├── tdnn_softmax_1e-6.json
    │   │       ├── tdnn_softmax_1e-2.json
    │   │       ├── tdnn_asoftmax_m1_linear_bn.json
    │   │       ├── tdnn_asoftmax_m2_linear_bn_1e-2.json
    │   │       ├── tdnn_asoftmax_m4_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.10_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json
    │   │       ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json
    │   │       ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json
    │   │       └── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json
    └── fisher
    │   ├── v1
    │       ├── cmd.sh
    │       ├── path.sh
    │       ├── nnet_conf
    │       │   ├── learning_rate_decay_45
    │       │   ├── test.json
    │       │   ├── tdnn_softmax_1e-2.json
    │       │   ├── tdnn_softmax.json
    │       │   ├── tdnn_asoftmax_m1_1e-2.json
    │       │   ├── tdnn_asoftmax_m2_linear_bn_1e-2.json
    │       │   ├── tdnn_asoftmax_m4_linear_bn_1e-2.json
    │       │   ├── tdnn_amsoftmax_m0.35_linear_bn_1e-2.json
    │       │   ├── tdnn_amsoftmax_m0.10_linear_bn_1e-2.json
    │       │   ├── tdnn_amsoftmax_m0.15_linear_bn_1e-2.json
    │       │   ├── tdnn_amsoftmax_m0.20_linear_bn_1e-2.json
    │       │   ├── tdnn_amsoftmax_m0.25_linear_bn_1e-2.json
    │       │   ├── tdnn_amsoftmax_m0.30_linear_bn_1e-2.json
    │       │   ├── tdnn_amsoftmax_m0.45_linear_bn_1e-2.json
    │       │   ├── tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json
    │       │   ├── tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json
    │       │   ├── tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json
    │       │   ├── tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json
    │       │   ├── tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json
    │       │   ├── tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json
    │       │   ├── tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json
    │       │   ├── tdnn_softmax_tdnn4_att.json
    │       │   ├── tdnn_softmax_tdnn4_att_2.json
    │       │   ├── tdnn_softmax_tdnn4_att_3.json
    │       │   └── tdnn_softmax_tdnn4_att_4.json
    │       ├── eval_cos.sh
    │       └── eval_plda.sh
    │   └── v3
    │       ├── cmd.sh
    │       ├── path.sh
    │       ├── eval_cos.sh
    │       ├── nnet_conf
    │           ├── tdnn_softmax_1e-2.json
    │           ├── mt_softmax.json
    │           ├── mt_softmax_2.json
    │           ├── mt_softmax_3.json
    │           ├── mt_softmax_4.json
    │           ├── mt_softmax_5.json
    │           ├── mt_softmax_6.json
    │           ├── mt_softmax_7.json
    │           ├── mt_softmax_8.2.json
    │           ├── mt_softmax_8.json
    │           └── mt_softmax_8.3.json
    │       └── eval_plda.sh
├── misc
    ├── __init__.py
    ├── ._.DS_Store
    ├── tuning
    │   ├── target_logit_curve
    │   ├── target_logit_curve.pdf
    │   ├── asoftmax_lambda_tuning.m
    │   ├── target_logit_curve.py
    │   └── tune_lr.m
    ├── DETware_v2.1
    │   ├── compute_det.sh
    │   ├── thick.m
    │   ├── readme.txt
    │   ├── Set_DCF.m
    │   ├── Comp_Det.m
    │   ├── Eval_Spkr_Det.m
    │   ├── Get_DCF.m
    │   ├── Min_DCF.m
    │   └── Set_DET_limits.m
    └── tools
    │   ├── score_distribution.m
    │   └── sample_validset_spk2utt.py
├── model
    ├── __init__.py
    ├── multitask_v1
    │   ├── __init__.py
    │   ├── common.py
    │   └── pooling.py
    └── ._.DS_Store
├── dataset
    ├── __init__.py
    ├── multitask
    │   └── __init__.py
    └── ._.DS_Store
├── scripts
    ├── diagnostic
    │   ├── wer_hyp_filter
    │   ├── wer_ref_filter
    │   └── wer_output_filter
    ├── prepare_pdf_for_multitask_egs.sh
    ├── prepare_bnfeats_for_egs.sh
    ├── prepare_feats_for_multitask_egs.sh
    ├── lmrescore_const_arpa.sh
    └── extract_bnf.sh
├── CHANGELOG.md
└── .gitignore


/egs/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/misc/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dataset/multitask/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/multitask_v1/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/egs/sre/v1/cmd.sh:
--------------------------------------------------------------------------------
1 | export train_cmd="run.pl"
2 | export cuda_cmd="run.pl"


--------------------------------------------------------------------------------
/egs/fisher/v1/cmd.sh:
--------------------------------------------------------------------------------
1 | export train_cmd="run.pl"
2 | export cuda_cmd="run.pl"


--------------------------------------------------------------------------------
/egs/fisher/v3/cmd.sh:
--------------------------------------------------------------------------------
1 | export train_cmd="run.pl"
2 | export cuda_cmd="run.pl"


--------------------------------------------------------------------------------
/misc/._.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/misc/._.DS_Store


--------------------------------------------------------------------------------
/dataset/._.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/dataset/._.DS_Store


--------------------------------------------------------------------------------
/model/._.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/model/._.DS_Store


--------------------------------------------------------------------------------
/misc/tuning/target_logit_curve:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/misc/tuning/target_logit_curve


--------------------------------------------------------------------------------
/misc/tuning/target_logit_curve.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mycrazycracy/tf-kaldi-speaker/HEAD/misc/tuning/target_logit_curve.pdf


--------------------------------------------------------------------------------
/scripts/diagnostic/wer_hyp_filter:
--------------------------------------------------------------------------------
 1 | #!/bin/sed -f
 2 | s:<NOISE>::g
 3 | s:<SPOKEN_NOISE>::g
 4 | s:<UNK>::g
 5 | s/://g
 6 | s/\*//g
 7 | s/-HOLDER/HOLDER/g
 8 | s/COMPAIGN/CAMPAIGN/g
 9 | s/APPROACHES-/APPROACHES/g
10 | s/RESEACHERS/RESEARCHERS/g
11 | 
12 | 


--------------------------------------------------------------------------------
/scripts/diagnostic/wer_ref_filter:
--------------------------------------------------------------------------------
 1 | #!/bin/sed -f
 2 | s:<NOISE>::g
 3 | s:<SPOKEN_NOISE>::g
 4 | s:<UNK>::g
 5 | s/://g
 6 | s/\*//g
 7 | s/-HOLDER/HOLDER/g
 8 | s/COMPAIGN/CAMPAIGN/g
 9 | s/APPROACHES-/APPROACHES/g
10 | s/RESEACHERS/RESEARCHERS/g
11 | 
12 | 


--------------------------------------------------------------------------------
/scripts/diagnostic/wer_output_filter:
--------------------------------------------------------------------------------
 1 | #!/bin/sed -f
 2 | s:<NOISE>::g
 3 | s:<SPOKEN_NOISE>::g
 4 | s:<UNK>::g
 5 | s/://g
 6 | s/\*//g
 7 | s/-HOLDER/HOLDER/g
 8 | s/COMPAIGN/CAMPAIGN/g
 9 | s/APPROACHES-/APPROACHES/g
10 | s/RESEACHERS/RESEARCHERS/g
11 | 
12 | 


--------------------------------------------------------------------------------
/misc/DETware_v2.1/compute_det.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ $# != 1 ]; then
 4 |   echo "Usage: $0 <score file>"
 5 |   echo ""
 6 |   exit 100
 7 | fi
 8 | 
 9 | score=$1
10 | 
11 | grep ' target' $score > ${score}.tar
12 | grep ' nontarget' $score > ${score}.imp
13 | 
14 | 


--------------------------------------------------------------------------------
/misc/DETware_v2.1/thick.m:
--------------------------------------------------------------------------------
 1 | function [lh] = thick(w,lh)
 2 | % THICK changes the width of the lines referenced by handles
 3 | %    lh, the line handles
 4 | %     w, new width (default is 0.5)
 5 | % Example usage: thick(2,plot([1:5],[1,0,1,0,1],'b'))
 6 | 
 7 | for i=1:length(lh)
 8 |    set (lh(i),'LineWidth',w);
 9 | end
10 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## [TODO]
 4 | - Train on VoxCeleb 2 dev set and test on VoxCeleb 1.
 5 | - Test attention component.
 6 | - Try other pooling strategies (e.g. "Utterance-level Aggregation For Speaker Recognition In The Wild").
 7 | - Add multitask_v1 learning.
 8 | 
 9 | 
10 | ## [First version]
11 | - Basic x-vector pipeline.
12 | - Large margin softmax loss.
13 | 


--------------------------------------------------------------------------------
/misc/tools/score_distribution.m:
--------------------------------------------------------------------------------
 1 | 
 2 | tar = load('score.target.amsoftmax');
 3 | nontar = load('score.nontarget.amsoftmax');
 4 | 
 5 | [n_tar, c_tar] = hist(tar, 30);
 6 | n_tar = n_tar / sum(n_tar);
 7 | [n_nontar, c_nontar] = hist(nontar, 30);
 8 | n_nontar = n_nontar / sum(n_nontar);
 9 | 
10 | plot(c_tar, n_tar, 'r--');
11 | hold on;
12 | plot(c_nontar, n_nontar, 'b--');


--------------------------------------------------------------------------------
/misc/tuning/asoftmax_lambda_tuning.m:
--------------------------------------------------------------------------------
 1 | clear
 2 | 
 3 | step = 1:1000000;
 4 | 
 5 | lambda_min = 10;
 6 | lambda_base = 1000;
 7 | gamma = 0.00001;
 8 | lambda_power = 5;
 9 | 
10 | lambda = max(lambda_min, lambda_base * (1 + gamma * step).^(-lambda_power));
11 | fa = 1.0 ./ (1.0 + lambda);
12 | figure
13 | plot(step, lambda);
14 | xlim([0 800000])
15 | ylim([0 100])
16 | figure();
17 | plot(step, fa);


--------------------------------------------------------------------------------
/egs/voxceleb/v1/slurm_conf/slurm.conf:
--------------------------------------------------------------------------------
1 | command sbatch --export=PATH --ntasks-per-node=1
2 | option time=* --time $0
3 | option mem=* --mem-per-cpu $0
4 | option mem=0          # Do not add anything to qsub_opts
5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1 # Do not add anything to qsub_opts
7 | default gpu=0
8 | option gpu=0 -p r830all.q
9 | option gpu=* -p c4130all.q --gres=gpu:$0


--------------------------------------------------------------------------------
/egs/sre/v1/path.sh:
--------------------------------------------------------------------------------
1 | # The virtualenv path
2 | export TF_ENV=/home/heliang05/liuyi/venv
3 | 
4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker
5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu
6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
8 | . $KALDI_ROOT/tools/config/common_path.sh
9 | export LC_ALL=C


--------------------------------------------------------------------------------
/egs/fisher/v1/path.sh:
--------------------------------------------------------------------------------
1 | # The virtualenv path
2 | export TF_ENV=/home/heliang05/liuyi/venv
3 | 
4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker
5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu
6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
8 | . $KALDI_ROOT/tools/config/common_path.sh
9 | export LC_ALL=C


--------------------------------------------------------------------------------
/egs/fisher/v3/path.sh:
--------------------------------------------------------------------------------
1 | # The virtualenv path
2 | export TF_ENV=/home/heliang05/liuyi/venv
3 | 
4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker
5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu
6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
8 | . $KALDI_ROOT/tools/config/common_path.sh
9 | export LC_ALL=C


--------------------------------------------------------------------------------
/egs/voxceleb/v1/path.sh:
--------------------------------------------------------------------------------
 1 | # The virtualenv path
 2 | export TF_ENV=/home/heliang05/liuyi/venv
 3 | 
 4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker
 5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu
 6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
 7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
 8 | . $KALDI_ROOT/tools/config/common_path.sh
 9 | export LC_ALL=C
10 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/path.sh:
--------------------------------------------------------------------------------
 1 | # The virtualenv path
 2 | export TF_ENV=/home/heliang05/liuyi/venv
 3 | 
 4 | export TF_KALDI_ROOT=/home/heliang05/liuyi/base/tf-kaldi-speaker
 5 | export KALDI_ROOT=/home/heliang05/liuyi/software/kaldi_gpu
 6 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
 7 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
 8 | . $KALDI_ROOT/tools/config/common_path.sh
 9 | export LC_ALL=C
10 | 


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/learning_rate_decay_45:
--------------------------------------------------------------------------------
 1 | 0.005
 2 | 0.005
 3 | 0.005
 4 | 0.005
 5 | 0.005
 6 | 0.005
 7 | 0.0025
 8 | 0.0025
 9 | 0.0025
10 | 0.0025
11 | 0.00125
12 | 0.00125
13 | 0.00125
14 | 0.00125
15 | 0.000625
16 | 0.000625
17 | 0.000625
18 | 0.000625
19 | 0.0003125
20 | 0.0003125
21 | 0.0003125
22 | 0.00015625
23 | 0.00015625
24 | 0.00015625
25 | 0.00007813
26 | 0.00007813
27 | 0.00007813
28 | 0.00003906
29 | 0.00003906
30 | 0.00003906
31 | 0.00001953
32 | 0.00001953
33 | 0.00001953
34 | 0.00000977
35 | 0.00000977
36 | 0.00000977
37 | 0.00000488
38 | 0.00000488
39 | 0.00000488
40 | 0.00000244
41 | 0.00000244
42 | 0.00000244
43 | 0.00000122
44 | 0.00000122
45 | 0.00000122
46 | 0


--------------------------------------------------------------------------------
/misc/DETware_v2.1/readme.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | NIST is making available these matlab files to be used to produce
 3 | Detection Error Trade-off curves with the matlab software package.
 4 | 
 5 | For a basic test of these matlab scripts, start matlab and type the
 6 | command: "Eval_Spkr_Det"
 7 | 
 8 | This script reads the files:
 9 |   A) true_speaker_scores
10 |   B) impostor_scores
11 | 
12 |   and produces a sample DET curve.
13 | 
14 | By making the appropriate changes to the files true_speaker_scores and
15 | impostor_scores, you can produce your own DET-curves with:
16 | "Eval_Spkr_Det"
17 | 
18 | Run "DET_usage" for a more detailed demonstration of the capabilities
19 | of these scripts.
20 | 
21 | 


--------------------------------------------------------------------------------
/misc/DETware_v2.1/Set_DCF.m:
--------------------------------------------------------------------------------
 1 | function Set_DCF (Cmiss, Cfa, Ptrue)
 2 | %function Set_DCF (Cmiss, Cfa, Ptrue) initializes the detection
 3 | %cost function (DCF) parameters.  The detection cost function is
 4 | %defined as:
 5 | %
 6 | %     DCF = Cmiss * Pmiss * Ptrue  +  Cfa * Pfa * Pfalse
 7 | %
 8 | %  DCF is a function of Pmiss and Pfa, the miss and false alarm
 9 | %  probabilities.  The  DCF parameters are:
10 | %
11 | %     Cmiss, the cost of a miss,
12 | %     Cfa, the cost of a false alarm,
13 | %     Ptrue, the a priori probability of the target, and
14 | %     Pfalse, = 1 - Ptrue.
15 | %
16 | %  See DET_usage for an example of how to use Set_DCF.
17 | 
18 | global DCF_parameters
19 | DCF_parameters = [Cmiss, Cfa, Ptrue, 1-Ptrue];
20 | 
21 | 


--------------------------------------------------------------------------------
/scripts/prepare_pdf_for_multitask_egs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -f path.sh ]; then . ./path.sh; fi
 4 | . parse_options.sh || exit 1;
 5 | 
 6 | if [ $# != 1 ]; then
 7 |   echo "Usage: $0 <ali-dir>"
 8 |   echo "e.g.: $0 data/train exp/tri5a_ali"
 9 |   exit 1;
10 | fi
11 | 
12 | dir=$1
13 | 
14 | for f in $dir/ali.1.gz $dir/final.mdl ; do
15 |   [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
16 | done
17 | 
18 | num_ali_jobs=$(cat $dir/num_jobs) || exit 1;
19 | for id in $(seq $num_ali_jobs); do gunzip -c $dir/ali.$id.gz; done | \
20 |   ali-to-pdf $dir/final.mdl ark:- ark,scp:$dir/pdf.ark,$dir/pdf.scp || exit 1;
21 | 
22 | # TODO: pdf to phones? pdf to phone classes? pdf to ali? We may need to get other types of alignments.
23 | 
24 | exit 0


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/lib/make_checkpoint.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import sys
 4 | from misc.utils import get_checkpoint
 5 | from misc.utils import Params
 6 | import tensorflow as tf
 7 | 
 8 | if __name__ == '__main__':
 9 |     tf.logging.set_verbosity(tf.logging.INFO)
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument("-c", "--checkpoint", type=str, default='-1',
12 |                         help="The checkpoint to load. The default is to load the BEST checkpoint (according to valid_loss).")
13 |     parser.add_argument("model_dir", type=str, help="The model directory.")
14 |     args = parser.parse_args()
15 |     checkpoint = get_checkpoint(os.path.join(args.model_dir, "nnet"), args.checkpoint)
16 |     print("Set the checkpoint to %s" % checkpoint)
17 | 


--------------------------------------------------------------------------------
/egs/fisher/v1/eval_cos.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | nnetdir=$1
 4 | 
 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
 6 | echo "EER: ${eer}%"
 7 | 
 8 | paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_cos.new
 9 | grep ' target$' $nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.target
10 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.nontarget
11 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_cos.target', '$nnetdir/xvector_scores_hires/test_cos.nontarget', '$nnetdir/xvector_scores_hires/test_cos.result')"`
12 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null
13 | tail -n 1 $nnetdir/xvector_scores_hires/test_cos.result


--------------------------------------------------------------------------------
/egs/fisher/v3/eval_cos.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | nnetdir=$1
 4 | 
 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
 6 | echo "EER: ${eer}%"
 7 | 
 8 | paste $trials $nnetdir/xvector_scores_hires/test_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_cos.new
 9 | grep ' target$' $nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.target
10 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_cos.nontarget
11 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_cos.target', '$nnetdir/xvector_scores_hires/test_cos.nontarget', '$nnetdir/xvector_scores_hires/test_cos.result')"`
12 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null
13 | tail -n 1 $nnetdir/xvector_scores_hires/test_cos.result


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/wrap/extract_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | env=
 4 | gpuid=-1
 5 | min_chunk_size=25
 6 | chunk_size=10000
 7 | normalize=false
 8 | node="output"
 9 | 
10 | if [ -f path.sh ]; then . ./path.sh; fi
11 | . parse_options.sh || exit 1;
12 | 
13 | if [ $# != 3 ]; then
14 |   echo "Usage: $0 [options] <nnet-dir> <data> <embeddings-dir>"
15 |   echo "Options:"
16 |   echo "  --gpuid <-1>"
17 |   echo "  --min-chunk-size <25>"
18 |   echo "  --chunk-size <10000>"
19 |   echo "  --normalize <false>"
20 |   echo "  --node <output>"
21 |   echo ""
22 |   exit 100
23 | fi
24 | 
25 | nnetdir=$1
26 | feat=$2
27 | dir=$3
28 | 
29 | if [ ! -z $env ]; then
30 |   source $TF_ENV/$env/bin/activate
31 | fi
32 | 
33 | if $normalize; then
34 |   cmdopt_norm="--normalize"
35 | fi
36 | 
37 | export PYTHONPATH=`pwd`/../../:$PYTHONPATH
38 | 
39 | python nnet/lib/extract.py --gpu $gpuid --node $node --min-chunk-size $min_chunk_size --chunk-size $chunk_size $cmdopt_norm\
40 |          "$nnetdir" "$feat" "$dir"
41 | deactivate


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/test.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.02,
11 |   "optimizer": "momentum",
12 |   "momentum": 0.9,
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-4,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 16000,
21 |   "reduce_lr_epochs": 1,
22 |   "show_training_progress": 200,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 8000,
25 |   "save_checkpoints_steps": 16000,
26 |   "valid_max_iterations": 5000,
27 | 
28 |   "num_parallel_datasets": 4,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 100,
33 |   "max_segment_len": 300,
34 | 
35 |   "early_stop_epochs": 6,
36 |   "min_learning_rate": 1e-6
37 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/test.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.001,
11 |   "optimizer": "momentum",
12 |   "momentum": 0.9,
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-2,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 60000,
21 |   "reduce_lr_epochs": 4,
22 |   "show_training_progress": 200,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 20000,
25 |   "save_checkpoints_steps": 30000,
26 |   "valid_max_iterations": 5000,
27 | 
28 |   "num_parallel_datasets": 8,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 200,
33 |   "max_segment_len": 400,
34 | 
35 |   "early_stop_epochs": 6,
36 |   "min_learning_rate": 1e-6
37 | }


--------------------------------------------------------------------------------
/misc/DETware_v2.1/Comp_Det.m:
--------------------------------------------------------------------------------
 1 | %------------------------------
 2 | %load speaker detection output scores
 3 | load true_speaker_scores
 4 | load impostor_scores
 5 | 
 6 | %------------------------------
 7 | %initialize the DCF parameters
 8 | Set_DCF (10, 1, 0.01);
 9 | 
10 | %------------------------------
11 | %compute Pmiss and Pfa from experimental detection output scores
12 | [P_miss,P_fa] = Compute_DET (true_speaker_scores, impostor_scores);
13 | 
14 | %------------------------------
15 | %plot results
16 | 
17 | % Set tic marks
18 | Pmiss_min = 0.01;
19 | Pmiss_max = 0.45;
20 | Pfa_min = 0.01;
21 | Pfa_max = 0.45;
22 | Set_DET_limits(Pmiss_min,Pmiss_max,Pfa_min,Pfa_max);
23 | 
24 | %call figure, plot DET-curve
25 | figure;
26 | Plot_DET (P_miss, P_fa,'r');
27 | title ('Speaker Detection Performance');
28 | hold on;
29 | 
30 | %find lowest cost point and plot
31 | C_miss = 1;
32 | C_fa = 1;
33 | P_target = 0.5;
34 | Set_DCF(C_miss,C_fa,P_target);
35 | [DCF_opt Popt_miss Popt_fa] = Min_DCF(P_miss,P_fa);
36 | Plot_DET (Popt_miss,Popt_fa,'ko');


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/wrap/extract_mt_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Wrapper around nnet/lib/extract_mt.py.
 4 | #
 5 | # Optionally activates the Python virtual environment $TF_ENV/<env>, puts
 6 | # the directory two levels above the working directory on PYTHONPATH, and
 7 | # runs the Python script with the four positional arguments forwarded as is.
 8 | #
 9 | # Exit status: 1 if sourcing parse_options.sh fails, 100 on a wrong number
10 | # of arguments, otherwise the exit status of the Python script.
11 | 
12 | # Option defaults. The usage text lists most of them as --<name> options;
13 | # presumably parse_options.sh overrides them from the command line.
14 | env=
15 | gpuid=-1
16 | min_chunk_size=25
17 | chunk_size=10000
18 | normalize=false
19 | node="output"
20 | 
21 | if [ -f path.sh ]; then . ./path.sh; fi
22 | . parse_options.sh || exit 1;
23 | 
24 | if [ $# -ne 4 ]; then
25 |   echo "Usage: $0 [options] <nnet-dir> <data> <ali> <embeddings-dir>"
26 |   echo "Options:"
27 |   echo "  --gpuid <-1>"
28 |   echo "  --min-chunk-size <25>"
29 |   echo "  --chunk-size <10000>"
30 |   echo "  --normalize <false>"
31 |   echo "  --node <output>"
32 |   echo ""
33 |   exit 100
34 | fi
35 | 
36 | nnetdir=$1
37 | feat=$2
38 | ali=$3
39 | dir=$4
40 | 
41 | # Activate the virtual environment only when one was requested.
42 | if [ -n "$env" ]; then
43 |   source "$TF_ENV/$env/bin/activate"
44 | fi
45 | 
46 | # Compare as a string instead of executing the option value as a command.
47 | cmdopt_norm=
48 | if [ "$normalize" = "true" ]; then
49 |   cmdopt_norm="--normalize"
50 | fi
51 | 
52 | export PYTHONPATH="$(pwd)/../../:$PYTHONPATH"
53 | 
54 | # $cmdopt_norm is deliberately unquoted: when empty it must expand to no
55 | # argument at all rather than to an empty string.
56 | python nnet/lib/extract_mt.py --gpu "$gpuid" --node "$node" \
57 |   --min-chunk-size "$min_chunk_size" --chunk-size "$chunk_size" $cmdopt_norm \
58 |   "$nnetdir" "$feat" "$ali" "$dir"
59 | status=$?
60 | 
61 | # "deactivate" is only defined once a virtualenv has been activated; calling
62 | # it unconditionally ended the script with "command not found" (status 127)
63 | # and, when it did exist, replaced the Python script's exit status.
64 | if [ -n "$env" ]; then
65 |   deactivate
66 | fi
67 | 
68 | exit $status


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_softmax_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.005,
11 |   "optimizer": "momentum",
12 |   "momentum": 0.9,
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-2,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 16000,
21 |   "reduce_lr_epochs": 3,
22 |   "show_training_progress": 200,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 8000,
25 |   "save_checkpoints_steps": 16000,
26 |   "valid_max_iterations": 5000,
27 | 
28 |   "num_parallel_datasets": 4,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 100,
33 |   "max_segment_len": 300,
34 | 
35 |   "early_stop_epochs": 8,
36 |   "min_learning_rate": 1e-6
37 | }


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/tdnn_softmax_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.005,
11 |   "optimizer": "momentum",
12 |   "momentum": 0.9,
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-2,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 7000,
21 |   "reduce_lr_epochs": 3,
22 |   "show_training_progress": 200,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 3500,
25 |   "save_checkpoints_steps": 7000,
26 |   "valid_max_iterations": 1000,
27 | 
28 |   "num_parallel_datasets": 4,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 100,
33 |   "max_segment_len": 300,
34 | 
35 |   "early_stop_epochs": 8,
36 |   "min_learning_rate": 1e-6
37 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_softmax_1e-4.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.001,
11 |   "optimizer": "momentum",
12 |   "momentum": 0.9,
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-4,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 60000,
21 |   "reduce_lr_epochs": 4,
22 |   "show_training_progress": 200,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 20000,
25 |   "save_checkpoints_steps": 30000,
26 |   "valid_max_iterations": 5000,
27 | 
28 |   "num_parallel_datasets": 16,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 200,
33 |   "max_segment_len": 400,
34 | 
35 |   "early_stop_epochs": 6,
36 |   "min_learning_rate": 1e-6
37 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_softmax_1e-6.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.001,
11 |   "optimizer": "momentum",
12 |   "momentum": 0.9,
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-6,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 60000,
21 |   "reduce_lr_epochs": 4,
22 |   "show_training_progress": 200,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 20000,
25 |   "save_checkpoints_steps": 30000,
26 |   "valid_max_iterations": 5000,
27 | 
28 |   "num_parallel_datasets": 16,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 200,
33 |   "max_segment_len": 400,
34 | 
35 |   "early_stop_epochs": 6,
36 |   "min_learning_rate": 1e-6
37 | }


--------------------------------------------------------------------------------
/misc/DETware_v2.1/Eval_Spkr_Det.m:
--------------------------------------------------------------------------------
 1 | % Example script: plots a DET curve from detection scores and marks the minimum-cost point.
 2 | % Load the speaker detection output scores; the variables true_speaker_scores and impostor_scores are used below.
 3 | load true_speaker_scores
 4 | load impostor_scores
 5 | 
 6 | %------------------------------
 7 | % Initialize the DCF parameters; the argument order (C_miss, C_fa, P_target) matches the named call near the end.
 8 | Set_DCF (10, 1, 0.01);
 9 | 
10 | %------------------------------
11 | % Compute Pmiss and Pfa from the experimental detection output scores.
12 | [P_miss,P_fa] = Compute_DET (true_speaker_scores, impostor_scores);
13 | 
14 | %------------------------------
15 | % Plot results.
16 | 
17 | % Set the DET plot limits passed to Set_DET_limits; Pmiss and Pfa both use the range 0.01 to 0.45.
18 | Pmiss_min = 0.01;
19 | Pmiss_max = 0.45;
20 | Pfa_min = 0.01;
21 | Pfa_max = 0.45;
22 | Set_DET_limits(Pmiss_min,Pmiss_max,Pfa_min,Pfa_max);
23 | 
24 | % Open a new figure and plot the DET curve ('r' is presumably a red line style - confirm in Plot_DET).
25 | figure;
26 | Plot_DET (P_miss, P_fa,'r');
27 | title ('Speaker Detection Performance');
28 | hold on;
29 | 
30 | % Find the lowest-cost point and plot it; this Set_DCF call replaces the parameters set near the top.
31 | C_miss = 1;
32 | C_fa = 1;
33 | P_target = 0.5;
34 | Set_DCF(C_miss,C_fa,P_target);
35 | [DCF_opt Popt_miss Popt_fa] = Min_DCF(P_miss,P_fa);
36 | Plot_DET (Popt_miss,Popt_fa,'ko');
37 | 
38 | 


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_softmax_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.001,
11 |   "optimizer": "momentum",
12 |   "momentum": 0.9,
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-2,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 60000,
21 |   "reduce_lr_epochs": 1,
22 |   "show_training_progress": 200,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 20000,
25 |   "save_checkpoints_steps": 30000,
26 |   "valid_max_iterations": 5000,
27 | 
28 |   "num_parallel_datasets": 8,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 200,
33 |   "max_segment_len": 400,
34 | 
35 |   "early_stop_epochs": 6,
36 |   "min_learning_rate": 1e-6
37 | }
38 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/wrap/extract_mi_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Wrapper around nnet/lib/extract_mi.py.
 4 | #
 5 | # Optionally activates the Python virtual environment $TF_ENV/<env>, puts
 6 | # the directory two levels above the working directory on PYTHONPATH, and
 7 | # runs the Python script with the four positional arguments forwarded as is.
 8 | #
 9 | # Exit status: 1 if sourcing parse_options.sh fails, 100 on a wrong number
10 | # of arguments, otherwise the exit status of the Python script.
11 | 
12 | # Option defaults. The usage text lists most of them as --<name> options;
13 | # presumably parse_options.sh overrides them from the command line.
14 | env=
15 | gpuid=-1
16 | min_chunk_size=25
17 | chunk_size=10000
18 | normalize=false
19 | node="output"
20 | 
21 | if [ -f path.sh ]; then . ./path.sh; fi
22 | . parse_options.sh || exit 1;
23 | 
24 | if [ $# -ne 4 ]; then
25 |   echo "Usage: $0 [options] <nnet-dir> <feat> <aux-feat> <embeddings-dir>"
26 |   echo "Options:"
27 |   echo "  --gpuid <-1>"
28 |   echo "  --min-chunk-size <25>"
29 |   echo "  --chunk-size <10000>"
30 |   echo "  --normalize <false>"
31 |   echo "  --node <output>"
32 |   echo ""
33 |   exit 100
34 | fi
35 | 
36 | nnetdir=$1
37 | feat=$2
38 | feat_aux=$3
39 | dir=$4
40 | 
41 | # Activate the virtual environment only when one was requested.
42 | if [ -n "$env" ]; then
43 |   source "$TF_ENV/$env/bin/activate"
44 | fi
45 | 
46 | # Compare as a string instead of executing the option value as a command.
47 | cmdopt_norm=
48 | if [ "$normalize" = "true" ]; then
49 |   cmdopt_norm="--normalize"
50 | fi
51 | 
52 | export PYTHONPATH="$(pwd)/../../:$PYTHONPATH"
53 | 
54 | # $cmdopt_norm is deliberately unquoted: when empty it must expand to no
55 | # argument at all rather than to an empty string.
56 | python nnet/lib/extract_mi.py --gpu "$gpuid" --node "$node" \
57 |   --min-chunk-size "$min_chunk_size" --chunk-size "$chunk_size" $cmdopt_norm \
58 |   "$nnetdir" "$feat" "$feat_aux" "$dir"
59 | status=$?
60 | 
61 | # "deactivate" is only defined once a virtualenv has been activated; calling
62 | # it unconditionally ended the script with "command not found" (status 127)
63 | # and, when it did exist, replaced the Python script's exit status.
64 | if [ -n "$env" ]; then
65 |   deactivate
66 | fi
67 | 
68 | exit $status


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/wrap/extract_mt_phone_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Wrapper around nnet/lib/extract_mt_phone.py.
 4 | #
 5 | # Optionally activates the Python virtual environment $TF_ENV/<env>, puts
 6 | # the directory two levels above the working directory on PYTHONPATH, and
 7 | # runs the Python script with the four positional arguments forwarded as is.
 8 | #
 9 | # Exit status: 1 if sourcing parse_options.sh fails, 100 on a wrong number
10 | # of arguments, otherwise the exit status of the Python script.
11 | 
12 | # Option defaults. The usage text lists most of them as --<name> options;
13 | # presumably parse_options.sh overrides them from the command line.
14 | env=
15 | gpuid=-1
16 | min_chunk_size=25
17 | chunk_size=10000
18 | normalize=false
19 | node="output"
20 | 
21 | if [ -f path.sh ]; then . ./path.sh; fi
22 | . parse_options.sh || exit 1;
23 | 
24 | if [ $# -ne 4 ]; then
25 |   echo "Usage: $0 [options] <nnet-dir> <data> <ali> <embeddings-dir>"
26 |   echo "Options:"
27 |   echo "  --gpuid <-1>"
28 |   echo "  --min-chunk-size <25>"
29 |   echo "  --chunk-size <10000>"
30 |   echo "  --normalize <false>"
31 |   echo "  --node <output>"
32 |   echo ""
33 |   exit 100
34 | fi
35 | 
36 | nnetdir=$1
37 | feat=$2
38 | ali=$3
39 | dir=$4
40 | 
41 | # Activate the virtual environment only when one was requested.
42 | if [ -n "$env" ]; then
43 |   source "$TF_ENV/$env/bin/activate"
44 | fi
45 | 
46 | # Compare as a string instead of executing the option value as a command.
47 | cmdopt_norm=
48 | if [ "$normalize" = "true" ]; then
49 |   cmdopt_norm="--normalize"
50 | fi
51 | 
52 | export PYTHONPATH="$(pwd)/../../:$PYTHONPATH"
53 | 
54 | # $cmdopt_norm is deliberately unquoted: when empty it must expand to no
55 | # argument at all rather than to an empty string.
56 | python nnet/lib/extract_mt_phone.py --gpu "$gpuid" --node "$node" \
57 |   --min-chunk-size "$min_chunk_size" --chunk-size "$chunk_size" $cmdopt_norm \
58 |   "$nnetdir" "$feat" "$ali" "$dir"
59 | status=$?
60 | 
61 | # "deactivate" is only defined once a virtualenv has been activated; calling
62 | # it unconditionally ended the script with "command not found" (status 127)
63 | # and, when it did exist, replaced the Python script's exit status.
64 | if [ -n "$env" ]; then
65 |   deactivate
66 | fi
67 | 
68 | exit $status


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_softmax.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 |   "pooling_type": "statistics_pooling",
 8 |   "embedding_node": "tdnn6_dense",
 9 | 
10 |   "learning_rate": 0.01,
11 |   "Another option": "learning_rate=0.001, optimizer=momentum, momentum=0.9",
12 | 
13 |   "use_nesterov": false,
14 |   "clip_gradient": false,
15 | 
16 |   "weight_l2_regularizer": 1e-4,
17 |   "batchnorm_momentum": 0.99,
18 | 
19 |   "num_epochs": 100,
20 |   "num_steps_per_epoch": 50000,
21 |   "reduce_lr_epochs": 4,
22 |   "show_training_progress": 100,
23 |   "keep_checkpoint_max": 100,
24 |   "save_summary_steps": 10000,
25 |   "save_checkpoints_steps": 50000,
26 |   "valid_max_iterations": 5000,
27 | 
28 |   "num_parallel_datasets": 8,
29 |   "max_queue_size": 10,
30 |   "num_speakers_per_batch": 64,
31 |   "num_segments_per_speaker": 1,
32 |   "min_segment_len": 200,
33 |   "max_segment_len": 400,
34 | 
35 |   "early_stop_epochs": 8,
36 |   "min_learning_rate": 1e-6
37 | }


--------------------------------------------------------------------------------
/misc/DETware_v2.1/Get_DCF.m:
--------------------------------------------------------------------------------
 1 | function [eer, dcf08, dcf10, dcf12] = Get_DCF(target, imposter, output)
 2 | %function [eer, dcf08, dcf10, dcf12] = Get_DCF(target, imposter, output)
 3 | %
 4 | % Get_DCF computes the EER and several minimum-DCF values for one set of
 5 | % scores and appends a one-line summary to a text file.
 6 | %
 7 | % target   - file name given to load(); its content is passed to
 8 | %            Compute_DET as the first (target) score argument
 9 | % imposter - file name given to load(); its content is passed to
10 | %            Compute_DET as the second (non-target) score argument
11 | % output   - path of the result file; it is opened in append mode
12 | %
13 | % eer   - third output of Compute_DET (written to the file as eer*100)
14 | % dcf08 - Min_DCF cost after Set_DCF(10, 1, 0.01), unscaled
15 | % dcf10 - Min_DCF cost after Set_DCF(1, 1, 0.001), multiplied by 1000
16 | % dcf12 - mean of (Min_DCF cost after Set_DCF(1, 1, 0.01)) * 100 and dcf10
17 | %
18 | % Side effects: calls Set_DET_limits and Set_DCF (the last Set_DCF call
19 | % made is Set_DCF(1, 1, 0.001)), calls Plot_DET once, and appends one line
20 | % to the output file.
21 | 
22 | tar = load(target);
23 | non = load(imposter);
24 | 
25 | lim = [0.0001 0.95];
26 | Set_DET_limits(lim(1), lim(2), lim(1), lim(2));
27 | 
28 | % EER
29 | [Pmiss, Pfa, eer] = Compute_DET(tar, non);
30 | 
31 | % DCF08 term used only for DCF12 (dcf08 is overwritten further down)
32 | Set_DCF(1, 1, 0.01);
33 | DCF_opt = Min_DCF(Pmiss, Pfa);
34 | dcf08 = DCF_opt * 100;
35 | 
36 | % DCF10 term for DCF12; the optimum point is also passed to Plot_DET,
37 | % with Pfa clamped from below to the lower DET limit
38 | Set_DCF(1, 1, 0.001);
39 | [DCF_opt, Popt_miss, Popt_fa] = Min_DCF(Pmiss, Pfa);
40 | Plot_DET(Popt_miss, max(Popt_fa, lim(1)), 'ro', 2);
41 | dcf10 = DCF_opt * 1000;
42 | 
43 | % DCF12
44 | dcf12 = (dcf08 + dcf10) / 2;
45 | 
46 | % DCF08
47 | % NOTE(review): this value is not scaled, yet it is printed with a '%'
48 | % sign below - confirm the intended unit.
49 | Set_DCF(10, 1, 0.01);
50 | DCF_opt = Min_DCF(Pmiss, Pfa);
51 | dcf08 = DCF_opt;
52 | 
53 | % DCF10 (same parameters as above; kept so that the DCF parameters left
54 | % behind after the call are unchanged)
55 | Set_DCF(1, 1, 0.001);
56 | DCF_opt = Min_DCF(Pmiss, Pfa);
57 | dcf10 = DCF_opt * 1000;
58 | 
59 | fid = fopen(output, 'a');
60 | if fid == -1
61 |     % fopen returns -1 on failure; report the path instead of letting
62 |     % fprintf fail on an invalid file identifier.
63 |     error('Get_DCF: cannot open output file ''%s'' for appending', output);
64 | end
65 | fprintf(fid, 'eer: %5.4f%%; mindcf08: %5.4f%%; mindcf10: %5.4f%%; mindcf12: %5.4f%%\n', eer*100, dcf08, dcf10, dcf12);
66 | fclose(fid);


--------------------------------------------------------------------------------
/misc/DETware_v2.1/Min_DCF.m:
--------------------------------------------------------------------------------
 1 | function [min_cost, Pmiss_opt, Pfa_opt] = Min_DCF(Pmiss, Pfa)
 2 | %function [min_cost, Pmiss_opt, Pfa_opt] = Min_DCF(Pmiss, Pfa)
 3 | %
 4 | % Min_DCF finds and returns the minimum value of the detection
 5 | % cost function for a given detection error trade-off curve.
 6 | %
 7 | % Pmiss and Pfa are the corresponding miss and false alarm
 8 | % trade-off probabilities (vectors with the same number of points).
 9 | % Pmiss_opt and Pfa_opt are the curve values at the minimum-cost point.
10 | % Set_DCF must be called first to define the global DCF_parameters.
11 | % See DET_usage for an example of how to use Min_DCF.
12 | 
13 | global DCF_parameters
14 | 
15 | if isempty(DCF_parameters)
16 |         error ('call Set_DCF to define DCF parameters before calling Min_DCF');
17 | end
18 | 
19 | Cmiss = DCF_parameters(1);
20 | Cfa = DCF_parameters(2);
21 | Ptrue = DCF_parameters(3);
22 | Pfalse = DCF_parameters(4);
23 | 
24 | npts = max(size(Pmiss));
25 | if npts ~= max(size(Pfa))
26 |         error ('vector size of Pmiss and Pfa not equal in call to Min_DCF');
27 | end
28 | 
29 | %-------------------------
30 | %Find DCF_best; (:) makes the sum independent of row/column orientation
31 | 
32 | DCF_vector = Cmiss * Pmiss(:) * Ptrue  + Cfa * Pfa(:) * Pfalse;
33 | [min_cost min_ptr] = min (DCF_vector);
34 | Pmiss_opt = Pmiss(min_ptr(1)) ;
35 | Pfa_opt = Pfa(min_ptr(1));
36 | 
37 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_softmax_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Note": "If the weight decay is 1e-2, vanilla SGD (learning rate 0.01) is used. With momentum=0.9, reducing the learning rate to 0.001 may be better",
 3 | 
 4 |   "seed": 0,
 5 |   "network_type": "tdnn",
 6 |   "last_layer_linear": false,
 7 |   "loss_func": "softmax",
 8 |   "batch_type": "softmax",
 9 |   "pooling_type": "statistics_pooling",
10 |   "embedding_node": "tdnn6_dense",
11 | 
12 |   "learning_rate": 0.01,
13 |   "Another option": "learning_rate=0.001, optimizer=momentum, momentum=0.9",
14 | 
15 |   "use_nesterov": false,
16 |   "clip_gradient": false,
17 | 
18 |   "weight_l2_regularizer": 1e-2,
19 |   "batchnorm_momentum": 0.99,
20 | 
21 |   "num_epochs": 100,
22 |   "num_steps_per_epoch": 30000,
23 |   "reduce_lr_epochs": 4,
24 |   "show_training_progress": 100,
25 |   "keep_checkpoint_max": 100,
26 |   "save_summary_steps": 10000,
27 |   "save_checkpoints_steps": 30000,
28 |   "valid_max_iterations": 1000,
29 | 
30 |   "num_parallel_datasets": 16,
31 |   "max_queue_size": 10,
32 |   "num_speakers_per_batch": 64,
33 |   "num_segments_per_speaker": 1,
34 |   "min_segment_len": 200,
35 |   "max_segment_len": 400,
36 | 
37 |   "early_stop_epochs": 10,
38 |   "min_learning_rate": 1e-6
39 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_softmax_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Note": "If the weight decay is 1e-2, vanilla SGD (learning rate 0.01) is used. With momentum=0.9, reducing the learning rate to 0.001 may be better",
 3 | 
 4 |   "seed": 0,
 5 |   "network_type": "tdnn",
 6 |   "last_layer_linear": false,
 7 |   "loss_func": "softmax",
 8 |   "batch_type": "softmax",
 9 |   "pooling_type": "statistics_pooling",
10 |   "embedding_node": "tdnn6_dense",
11 | 
12 |   "learning_rate": 0.01,
13 |   "Another option": "learning_rate=0.001, optimizer=momentum, momentum=0.9",
14 | 
15 |   "use_nesterov": false,
16 |   "clip_gradient": false,
17 | 
18 |   "weight_l2_regularizer": 1e-2,
19 |   "batchnorm_momentum": 0.99,
20 | 
21 |   "num_epochs": 100,
22 |   "num_steps_per_epoch": 30000,
23 |   "reduce_lr_epochs": 4,
24 |   "show_training_progress": 100,
25 |   "keep_checkpoint_max": 100,
26 |   "save_summary_steps": 10000,
27 |   "save_checkpoints_steps": 30000,
28 |   "valid_max_iterations": 1000,
29 | 
30 |   "num_parallel_datasets": 16,
31 |   "max_queue_size": 10,
32 |   "num_speakers_per_batch": 64,
33 |   "num_segments_per_speaker": 1,
34 |   "min_segment_len": 200,
35 |   "max_segment_len": 400,
36 | 
37 |   "early_stop_epochs": 10,
38 |   "min_learning_rate": 1e-6
39 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_asoftmax_m1_linear_bn.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "asoftmax",
 8 |   "asoftmax_m": 1,
 9 |   "asoftmax_lambda_min": 0,
10 |   "asoftmax_lambda_base": 0,
11 |   "asoftmax_lambda_gamma": 1,
12 |   "asoftmax_lambda_power": 1,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 20000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "asoftmax",
 8 |   "asoftmax_m": 2,
 9 |   "asoftmax_lambda_min": 10,
10 |   "asoftmax_lambda_base": 1000,
11 |   "asoftmax_lambda_gamma": 0.00001,
12 |   "asoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "asoftmax",
 8 |   "asoftmax_m": 4,
 9 |   "asoftmax_lambda_min": 10,
10 |   "asoftmax_lambda_base": 1000,
11 |   "asoftmax_lambda_gamma": 0.00001,
12 |   "asoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_asoftmax_m1_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 1,
10 |   "asoftmax_lambda_min": 0,
11 |   "asoftmax_lambda_base": 0,
12 |   "asoftmax_lambda_gamma": 1,
13 |   "asoftmax_lambda_power": 1,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_asoftmax_m1_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": false,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 1,
10 |   "asoftmax_lambda_min": 0,
11 |   "asoftmax_lambda_base": 0,
12 |   "asoftmax_lambda_gamma": 1,
13 |   "asoftmax_lambda_power": 1,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 100,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 2,
10 |   "asoftmax_lambda_min": 10,
11 |   "asoftmax_lambda_base": 1000,
12 |   "asoftmax_lambda_gamma": 0.00001,
13 |   "asoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 4,
10 |   "asoftmax_lambda_min": 10,
11 |   "asoftmax_lambda_base": 1000,
12 |   "asoftmax_lambda_gamma": 0.00001,
13 |   "asoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_asoftmax_m1_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 1,
10 |   "asoftmax_lambda_min": 0,
11 |   "asoftmax_lambda_base": 0,
12 |   "asoftmax_lambda_gamma": 1,
13 |   "asoftmax_lambda_power": 1,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.10_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_margin_softmax",
 8 |   "amsoftmax_m": 0.10,
 9 |   "amsoftmax_lambda_min": 0,
10 |   "amsoftmax_lambda_base": 1000,
11 |   "amsoftmax_lambda_gamma": 0.0001,
12 |   "amsoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_margin_softmax",
 8 |   "amsoftmax_m": 0.15,
 9 |   "amsoftmax_lambda_min": 0,
10 |   "amsoftmax_lambda_base": 1000,
11 |   "amsoftmax_lambda_gamma": 0.0001,
12 |   "amsoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_margin_softmax",
 8 |   "amsoftmax_m": 0.20,
 9 |   "amsoftmax_lambda_min": 0,
10 |   "amsoftmax_lambda_base": 1000,
11 |   "amsoftmax_lambda_gamma": 0.0001,
12 |   "amsoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_margin_softmax",
 8 |   "amsoftmax_m": 0.25,
 9 |   "amsoftmax_lambda_min": 0,
10 |   "amsoftmax_lambda_base": 1000,
11 |   "amsoftmax_lambda_gamma": 0.0001,
12 |   "amsoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_margin_softmax",
 8 |   "amsoftmax_m": 0.30,
 9 |   "amsoftmax_lambda_min": 0,
10 |   "amsoftmax_lambda_base": 1000,
11 |   "amsoftmax_lambda_gamma": 0.0001,
12 |   "amsoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_margin_softmax",
 8 |   "amsoftmax_m": 0.35,
 9 |   "amsoftmax_lambda_min": 0,
10 |   "amsoftmax_lambda_base": 1000,
11 |   "amsoftmax_lambda_gamma": 0.0001,
12 |   "amsoftmax_lambda_power": 5,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 2,
10 |   "asoftmax_lambda_min": 10,
11 |   "asoftmax_lambda_base": 1000,
12 |   "asoftmax_lambda_gamma": 0.00001,
13 |   "asoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 4,
10 |   "asoftmax_lambda_min": 10,
11 |   "asoftmax_lambda_base": 1000,
12 |   "asoftmax_lambda_gamma": 0.00001,
13 |   "asoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_asoftmax_m2_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 2,
10 |   "asoftmax_lambda_min": 10,
11 |   "asoftmax_lambda_base": 1000,
12 |   "asoftmax_lambda_gamma": 0.00008,
13 |   "asoftmax_lambda_power": 2,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_asoftmax_m4_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "asoftmax",
 9 |   "asoftmax_m": 4,
10 |   "asoftmax_lambda_min": 10,
11 |   "asoftmax_lambda_base": 1000,
12 |   "asoftmax_lambda_gamma": 0.00008,
13 |   "asoftmax_lambda_power": 2,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_angular_margin_softmax",
 8 |   "arcsoftmax_m": 0.10,
 9 |   "arcsoftmax_lambda_min": 0,
10 |   "arcsoftmax_lambda_base": 1000,
11 |   "arcsoftmax_lambda_gamma": 0.00001,
12 |   "arcsoftmax_lambda_power": 6,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_angular_margin_softmax",
 8 |   "arcsoftmax_m": 0.15,
 9 |   "arcsoftmax_lambda_min": 0,
10 |   "arcsoftmax_lambda_base": 1000,
11 |   "arcsoftmax_lambda_gamma": 0.00001,
12 |   "arcsoftmax_lambda_power": 6,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_angular_margin_softmax",
 8 |   "arcsoftmax_m": 0.20,
 9 |   "arcsoftmax_lambda_min": 0,
10 |   "arcsoftmax_lambda_base": 1000,
11 |   "arcsoftmax_lambda_gamma": 0.00001,
12 |   "arcsoftmax_lambda_power": 6,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_angular_margin_softmax",
 8 |   "arcsoftmax_m": 0.25,
 9 |   "arcsoftmax_lambda_min": 0,
10 |   "arcsoftmax_lambda_base": 1000,
11 |   "arcsoftmax_lambda_gamma": 0.00001,
12 |   "arcsoftmax_lambda_power": 6,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_angular_margin_softmax",
 8 |   "arcsoftmax_m": 0.30,
 9 |   "arcsoftmax_lambda_min": 0,
10 |   "arcsoftmax_lambda_base": 1000,
11 |   "arcsoftmax_lambda_gamma": 0.00001,
12 |   "arcsoftmax_lambda_power": 6,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/sre/v1/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": true,
 5 |   "feature_norm": false,
 6 | 
 7 |   "loss_func": "additive_angular_margin_softmax",
 8 |   "arcsoftmax_m": 0.35,
 9 |   "arcsoftmax_lambda_min": 0,
10 |   "arcsoftmax_lambda_base": 1000,
11 |   "arcsoftmax_lambda_gamma": 0.00001,
12 |   "arcsoftmax_lambda_power": 6,
13 | 
14 |   "batch_type": "softmax",
15 |   "pooling_type": "statistics_pooling",
16 |   "embedding_node": "tdnn6_dense",
17 | 
18 |   "learning_rate": 0.001,
19 |   "optimizer": "momentum",
20 |   "momentum": 0.9,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 200,
28 |   "num_steps_per_epoch": 60000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 200,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 30000,
33 |   "save_checkpoints_steps": 60000,
34 |   "valid_max_iterations": 5000,
35 | 
36 |   "num_parallel_datasets": 8,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.15,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.25,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.30,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.35,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.15,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.25,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.30,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.35,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.35,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 0,
12 |   "amsoftmax_lambda_gamma": 1,
13 |   "amsoftmax_lambda_power": 1,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.15,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.20,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.25,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.30,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.35,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.40,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.10_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.10,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.15,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.25,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.30,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_amsoftmax_m0.45_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.45,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.15,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.20,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.25,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.30,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.35,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.40,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00001,
13 |   "arcsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.01,
20 |   "use_nesterov": false,
21 |   "clip_gradient": false,
22 |   "clip_gradient_norm": 3,
23 | 
24 |   "weight_l2_regularizer": 1e-2,
25 |   "batchnorm_momentum": 0.99,
26 | 
27 |   "num_epochs": 100,
28 |   "num_steps_per_epoch": 30000,
29 |   "reduce_lr_epochs": 4,
30 |   "show_training_progress": 100,
31 |   "keep_checkpoint_max": 100,
32 |   "save_summary_steps": 10000,
33 |   "save_checkpoints_steps": 30000,
34 |   "valid_max_iterations": 1000,
35 | 
36 |   "num_parallel_datasets": 16,
37 |   "max_queue_size": 10,
38 |   "num_speakers_per_batch": 64,
39 |   "num_segments_per_speaker": 1,
40 |   "min_segment_len": 200,
41 |   "max_segment_len": 400,
42 | 
43 |   "early_stop_epochs": 10,
44 |   "min_learning_rate": 1e-6
45 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.10_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.10,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00002,
13 |   "arcsoftmax_lambda_power": 8,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.15_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.15,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00002,
13 |   "arcsoftmax_lambda_power": 8,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.20_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.20,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00002,
13 |   "arcsoftmax_lambda_power": 8,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.25_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.25,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00002,
13 |   "arcsoftmax_lambda_power": 8,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.30_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.30,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00002,
13 |   "arcsoftmax_lambda_power": 8,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.35_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.35,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00002,
13 |   "arcsoftmax_lambda_power": 8,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_arcsoftmax_m0.40_linear_bn_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_angular_margin_softmax",
 9 |   "arcsoftmax_m": 0.40,
10 |   "arcsoftmax_lambda_min": 0,
11 |   "arcsoftmax_lambda_base": 1000,
12 |   "arcsoftmax_lambda_gamma": 0.00002,
13 |   "arcsoftmax_lambda_power": 8,
14 | 
15 |   "batch_type": "softmax",
16 |   "pooling_type": "statistics_pooling",
17 |   "embedding_node": "tdnn6_dense",
18 | 
19 |   "learning_rate": 0.005,
20 |   "optimizer": "momentum",
21 |   "momentum": 0.9,
22 |   "use_nesterov": false,
23 |   "clip_gradient": false,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 200,
29 |   "num_steps_per_epoch": 16000,
30 |   "reduce_lr_epochs": 3,
31 |   "show_training_progress": 200,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 8000,
34 |   "save_checkpoints_steps": 16000,
35 |   "valid_max_iterations": 5000,
36 | 
37 |   "num_parallel_datasets": 4,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 100,
42 |   "max_segment_len": 300,
43 | 
44 |   "early_stop_epochs": 8,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": true,
 7 |   "feature_scaling_factor": 30,
 8 | 
 9 |   "loss_func": "additive_margin_softmax",
10 |   "amsoftmax_m": 0.20,
11 |   "amsoftmax_lambda_min": 0,
12 |   "amsoftmax_lambda_base": 1000,
13 |   "amsoftmax_lambda_gamma": 0.0001,
14 |   "amsoftmax_lambda_power": 5,
15 | 
16 |   "batch_type": "softmax",
17 |   "pooling_type": "statistics_pooling",
18 |   "embedding_node": "tdnn6_dense",
19 | 
20 |   "learning_rate": 0.01,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 |   "clip_gradient_norm": 3,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 100,
29 |   "num_steps_per_epoch": 30000,
30 |   "reduce_lr_epochs": 4,
31 |   "show_training_progress": 100,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 10000,
34 |   "save_checkpoints_steps": 30000,
35 |   "valid_max_iterations": 1000,
36 | 
37 |   "num_parallel_datasets": 16,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 200,
42 |   "max_segment_len": 400,
43 | 
44 |   "early_stop_epochs": 10,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_fn30_1e-2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": true,
 7 |   "feature_scaling_factor": 30,
 8 | 
 9 |   "loss_func": "additive_margin_softmax",
10 |   "amsoftmax_m": 0.20,
11 |   "amsoftmax_lambda_min": 0,
12 |   "amsoftmax_lambda_base": 1000,
13 |   "amsoftmax_lambda_gamma": 0.0001,
14 |   "amsoftmax_lambda_power": 5,
15 | 
16 |   "batch_type": "softmax",
17 |   "pooling_type": "statistics_pooling",
18 |   "embedding_node": "tdnn6_dense",
19 | 
20 |   "learning_rate": 0.01,
21 |   "use_nesterov": false,
22 |   "clip_gradient": false,
23 |   "clip_gradient_norm": 3,
24 | 
25 |   "weight_l2_regularizer": 1e-2,
26 |   "batchnorm_momentum": 0.99,
27 | 
28 |   "num_epochs": 100,
29 |   "num_steps_per_epoch": 30000,
30 |   "reduce_lr_epochs": 4,
31 |   "show_training_progress": 100,
32 |   "keep_checkpoint_max": 100,
33 |   "save_summary_steps": 10000,
34 |   "save_checkpoints_steps": 30000,
35 |   "valid_max_iterations": 1000,
36 | 
37 |   "num_parallel_datasets": 16,
38 |   "max_queue_size": 10,
39 |   "num_speakers_per_batch": 64,
40 |   "num_segments_per_speaker": 1,
41 |   "min_segment_len": 200,
42 |   "max_segment_len": 400,
43 | 
44 |   "early_stop_epochs": 10,
45 |   "min_learning_rate": 1e-6
46 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 | 
 8 |   "pooling_type": "self_attention",
 9 |   "att_key_input": "tdnn4_relu",
10 |   "att_key_num_nodes": [1500, 1500],
11 |   "att_key_network_type": 3,
12 |   "att_value_input": "tdnn5_relu",
13 |   "att_value_num_nodes": [],
14 |   "att_value_network_type": 0,
15 |   "att_apply_nonlinear": false,
16 |   "att_use_scale": true,
17 |   "att_num_heads": 1,
18 |   "att_split_key": false,
19 |   "att_penalty_term": 0,
20 | 
21 |   "learning_rate": 0.005,
22 |   "optimizer": "momentum",
23 |   "momentum": 0.9,
24 |   "use_nesterov": false,
25 |   "clip_gradient": false,
26 | 
27 |   "weight_l2_regularizer": 1e-2,
28 |   "batchnorm_momentum": 0.99,
29 | 
30 |   "num_epochs": 200,
31 |   "num_steps_per_epoch": 16000,
32 |   "reduce_lr_epochs": 3,
33 |   "show_training_progress": 200,
34 |   "keep_checkpoint_max": 100,
35 |   "save_summary_steps": 8000,
36 |   "save_checkpoints_steps": 16000,
37 |   "valid_max_iterations": 5000,
38 | 
39 |   "num_parallel_datasets": 4,
40 |   "max_queue_size": 10,
41 |   "num_speakers_per_batch": 64,
42 |   "num_segments_per_speaker": 1,
43 |   "min_segment_len": 100,
44 |   "max_segment_len": 300,
45 | 
46 |   "early_stop_epochs": 8,
47 |   "min_learning_rate": 1e-6
48 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att_2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 | 
 8 |   "pooling_type": "self_attention",
 9 |   "att_key_input": "tdnn4_relu",
10 |   "att_key_num_nodes": [1500, 1500],
11 |   "att_key_network_type": 1,
12 |   "att_value_input": "tdnn5_relu",
13 |   "att_value_num_nodes": [],
14 |   "att_value_network_type": 0,
15 |   "att_apply_nonlinear": false,
16 |   "att_use_scale": true,
17 |   "att_num_heads": 1,
18 |   "att_split_key": false,
19 |   "att_penalty_term": 0,
20 | 
21 |   "learning_rate": 0.005,
22 |   "optimizer": "momentum",
23 |   "momentum": 0.9,
24 |   "use_nesterov": false,
25 |   "clip_gradient": false,
26 | 
27 |   "weight_l2_regularizer": 1e-2,
28 |   "batchnorm_momentum": 0.99,
29 | 
30 |   "num_epochs": 200,
31 |   "num_steps_per_epoch": 16000,
32 |   "reduce_lr_epochs": 3,
33 |   "show_training_progress": 200,
34 |   "keep_checkpoint_max": 100,
35 |   "save_summary_steps": 8000,
36 |   "save_checkpoints_steps": 16000,
37 |   "valid_max_iterations": 5000,
38 | 
39 |   "num_parallel_datasets": 4,
40 |   "max_queue_size": 10,
41 |   "num_speakers_per_batch": 64,
42 |   "num_segments_per_speaker": 1,
43 |   "min_segment_len": 100,
44 |   "max_segment_len": 300,
45 | 
46 |   "early_stop_epochs": 8,
47 |   "min_learning_rate": 1e-6
48 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att_3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 | 
 8 |   "pooling_type": "self_attention",
 9 |   "att_key_input": "tdnn4_relu",
10 |   "att_key_num_nodes": [1500, 1500],
11 |   "att_key_network_type": 2,
12 |   "att_value_input": "tdnn5_relu",
13 |   "att_value_num_nodes": [],
14 |   "att_value_network_type": 0,
15 |   "att_apply_nonlinear": false,
16 |   "att_use_scale": true,
17 |   "att_num_heads": 1,
18 |   "att_split_key": false,
19 |   "att_penalty_term": 0,
20 | 
21 |   "learning_rate": 0.005,
22 |   "optimizer": "momentum",
23 |   "momentum": 0.9,
24 |   "use_nesterov": false,
25 |   "clip_gradient": false,
26 | 
27 |   "weight_l2_regularizer": 1e-2,
28 |   "batchnorm_momentum": 0.99,
29 | 
30 |   "num_epochs": 200,
31 |   "num_steps_per_epoch": 16000,
32 |   "reduce_lr_epochs": 3,
33 |   "show_training_progress": 200,
34 |   "keep_checkpoint_max": 100,
35 |   "save_summary_steps": 8000,
36 |   "save_checkpoints_steps": 16000,
37 |   "valid_max_iterations": 5000,
38 | 
39 |   "num_parallel_datasets": 4,
40 |   "max_queue_size": 10,
41 |   "num_speakers_per_batch": 64,
42 |   "num_segments_per_speaker": 1,
43 |   "min_segment_len": 100,
44 |   "max_segment_len": 300,
45 | 
46 |   "early_stop_epochs": 8,
47 |   "min_learning_rate": 1e-6
48 | }


--------------------------------------------------------------------------------
/egs/fisher/v1/nnet_conf/tdnn_softmax_tdnn4_att_4.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_linear": false,
 5 |   "loss_func": "softmax",
 6 |   "batch_type": "softmax",
 7 | 
 8 |   "pooling_type": "self_attention",
 9 |   "att_key_input": "tdnn4_relu",
10 |   "att_key_num_nodes": [1500, 1500],
11 |   "att_key_network_type": 0,
12 |   "att_value_input": "tdnn5_relu",
13 |   "att_value_num_nodes": [],
14 |   "att_value_network_type": 0,
15 |   "att_apply_nonlinear": false,
16 |   "att_use_scale": true,
17 |   "att_num_heads": 1,
18 |   "att_split_key": false,
19 |   "att_penalty_term": 0,
20 | 
21 |   "learning_rate": 0.005,
22 |   "optimizer": "momentum",
23 |   "momentum": 0.9,
24 |   "use_nesterov": false,
25 |   "clip_gradient": false,
26 | 
27 |   "weight_l2_regularizer": 1e-2,
28 |   "batchnorm_momentum": 0.99,
29 | 
30 |   "num_epochs": 200,
31 |   "num_steps_per_epoch": 16000,
32 |   "reduce_lr_epochs": 3,
33 |   "show_training_progress": 200,
34 |   "keep_checkpoint_max": 100,
35 |   "save_summary_steps": 8000,
36 |   "save_checkpoints_steps": 16000,
37 |   "valid_max_iterations": 5000,
38 | 
39 |   "num_parallel_datasets": 4,
40 |   "max_queue_size": 10,
41 |   "num_speakers_per_batch": 64,
42 |   "num_segments_per_speaker": 1,
43 |   "min_segment_len": 100,
44 |   "max_segment_len": 300,
45 | 
46 |   "early_stop_epochs": 8,
47 |   "min_learning_rate": 1e-6
48 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "aux_loss_func": ["ring_loss"],
16 |   "ring_loss_init": 20,
17 |   "ring_loss_lambda": 0.01,
18 | 
19 |   "batch_type": "softmax",
20 |   "pooling_type": "statistics_pooling",
21 |   "embedding_node": "tdnn6_dense",
22 | 
23 |   "learning_rate": 0.01,
24 |   "use_nesterov": false,
25 |   "clip_gradient": false,
26 |   "clip_gradient_norm": 3,
27 | 
28 |   "weight_l2_regularizer": 1e-2,
29 |   "batchnorm_momentum": 0.99,
30 | 
31 |   "num_epochs": 100,
32 |   "num_steps_per_epoch": 30000,
33 |   "reduce_lr_epochs": 4,
34 |   "show_training_progress": 100,
35 |   "keep_checkpoint_max": 100,
36 |   "save_summary_steps": 10000,
37 |   "save_checkpoints_steps": 30000,
38 |   "valid_max_iterations": 1000,
39 | 
40 |   "num_parallel_datasets": 8,
41 |   "max_queue_size": 10,
42 |   "num_speakers_per_batch": 64,
43 |   "num_segments_per_speaker": 1,
44 |   "min_segment_len": 200,
45 |   "max_segment_len": 400,
46 | 
47 |   "early_stop_epochs": 10,
48 |   "min_learning_rate": 1e-6
49 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_r0.01.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "aux_loss_func": ["ring_loss"],
16 |   "ring_loss_init": 20,
17 |   "ring_loss_lambda": 0.01,
18 | 
19 |   "batch_type": "softmax",
20 |   "pooling_type": "statistics_pooling",
21 |   "embedding_node": "tdnn6_dense",
22 | 
23 |   "learning_rate": 0.01,
24 |   "use_nesterov": false,
25 |   "clip_gradient": false,
26 |   "clip_gradient_norm": 3,
27 | 
28 |   "weight_l2_regularizer": 1e-2,
29 |   "batchnorm_momentum": 0.99,
30 | 
31 |   "num_epochs": 100,
32 |   "num_steps_per_epoch": 30000,
33 |   "reduce_lr_epochs": 4,
34 |   "show_training_progress": 100,
35 |   "keep_checkpoint_max": 100,
36 |   "save_summary_steps": 10000,
37 |   "save_checkpoints_steps": 30000,
38 |   "valid_max_iterations": 1000,
39 | 
40 |   "num_parallel_datasets": 8,
41 |   "max_queue_size": 10,
42 |   "num_speakers_per_batch": 64,
43 |   "num_segments_per_speaker": 1,
44 |   "min_segment_len": 200,
45 |   "max_segment_len": 400,
46 | 
47 |   "early_stop_epochs": 10,
48 |   "min_learning_rate": 1e-6
49 | }


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "aux_loss_func": ["mhe_loss"],
16 |   "mhe_lambda": 0.01,
17 | 
18 |   "noupdate_var_list": [],
19 |   "noload_var_list": [],
20 | 
21 |   "batch_type": "softmax",
22 |   "pooling_type": "statistics_pooling",
23 |   "embedding_node": "tdnn6_dense",
24 | 
25 |   "learning_rate": 0.01,
26 |   "use_nesterov": false,
27 |   "clip_gradient": false,
28 |   "clip_gradient_norm": 3,
29 | 
30 |   "weight_l2_regularizer": 1e-2,
31 |   "batchnorm_momentum": 0.99,
32 | 
33 |   "num_epochs": 100,
34 |   "num_steps_per_epoch": 30000,
35 |   "reduce_lr_epochs": 4,
36 |   "show_training_progress": 100,
37 |   "keep_checkpoint_max": 100,
38 |   "save_summary_steps": 10000,
39 |   "save_checkpoints_steps": 30000,
40 |   "valid_max_iterations": 1000,
41 | 
42 |   "num_parallel_datasets": 8,
43 |   "max_queue_size": 10,
44 |   "num_speakers_per_batch": 64,
45 |   "num_segments_per_speaker": 1,
46 |   "min_segment_len": 200,
47 |   "max_segment_len": 400,
48 | 
49 |   "early_stop_epochs": 10,
50 |   "min_learning_rate": 1e-6
51 | }


--------------------------------------------------------------------------------
/misc/tools/sample_validset_spk2utt.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | 
 4 | if __name__ == "__main__":
 5 |     if len(sys.argv) != 4:
 6 |         print('usage: %s num_heldout_spk num_heldout_utts_per_spk input_spk2utt' % sys.argv[0])
 7 |         quit()
 8 | 
 9 |     num_spks = int(sys.argv[1])
10 |     num_utts_per_spk = int(sys.argv[2])
11 | 
12 |     satisfy_spks = []
13 |     not_satisfy_spks = []
14 |     with open(sys.argv[3], 'r') as f:
15 |         for line in f.readlines():
16 |             spk, utts = line.strip().split(' ', 1)
17 |             utts = utts.split(' ')
18 |             if len(utts) >= num_utts_per_spk + 2:
19 |                 satisfy_spks.append([spk, utts])
20 |             else:
21 |                 not_satisfy_spks.append([spk, utts])
22 | 
23 |     if len(satisfy_spks) < num_spks:
24 |         satisfy_spks += random.sample(not_satisfy_spks, num_spks - len(satisfy_spks))
25 | 
26 |     sampled_spks = random.sample(satisfy_spks, num_spks)
27 |     for spk in sampled_spks:
28 |         sys.stdout.write('%s' % spk[0])
29 | 
30 |         # We should ensure at lease one utterance of each speaker is left in the training set.
31 |         if len(spk[1]) > num_utts_per_spk:
32 |             spk[1] = random.sample(spk[1], num_utts_per_spk)
33 |         else:
34 |             spk[1] = random.sample(spk[1], len(spk[1]) - 1)
35 | 
36 |         for utt in spk[1]:
37 |             sys.stdout.write(' %s' % utt)
38 |         sys.stdout.write('\n')
39 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_mhe0.01.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "aux_loss_func": ["mhe_loss"],
16 |   "mhe_lambda": 0.01,
17 | 
18 |   "noupdate_var_list": [],
19 |   "noload_var_list": [],
20 | 
21 |   "batch_type": "softmax",
22 |   "pooling_type": "statistics_pooling",
23 |   "embedding_node": "tdnn6_dense",
24 | 
25 |   "learning_rate": 0.01,
26 |   "use_nesterov": false,
27 |   "clip_gradient": false,
28 |   "clip_gradient_norm": 3,
29 | 
30 |   "weight_l2_regularizer": 1e-2,
31 |   "batchnorm_momentum": 0.99,
32 | 
33 |   "num_epochs": 100,
34 |   "num_steps_per_epoch": 30000,
35 |   "reduce_lr_epochs": 4,
36 |   "show_training_progress": 100,
37 |   "keep_checkpoint_max": 100,
38 |   "save_summary_steps": 10000,
39 |   "save_checkpoints_steps": 30000,
40 |   "valid_max_iterations": 1000,
41 | 
42 |   "num_parallel_datasets": 8,
43 |   "max_queue_size": 10,
44 |   "num_speakers_per_batch": 64,
45 |   "num_segments_per_speaker": 1,
46 |   "min_segment_len": 200,
47 |   "max_segment_len": 400,
48 | 
49 |   "early_stop_epochs": 10,
50 |   "min_learning_rate": 1e-6
51 | }


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.005,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "?context_size": "context_size ensures that the phonetic outputs have exactly the same number of frames as the alignment",
16 |   "context_size": 7,
17 |   "num_shared_layers": 0,
18 |   "pooling_type": "statistics_pooling",
19 | 
20 |   "spk_loss_weight": 1.0,
21 |   "speaker_dim": 512,
22 |   "spk_last_layer_no_bn": false,
23 |   "spk_last_layer_linear": false,
24 |   "spk_loss_type": "softmax",
25 | 
26 |   "phn_loss_weight": 0.0,
27 |   "phone_dim": 512,
28 |   "phn_loss_type": "softmax",
29 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
30 |   "num_frames_per_utt": 4,
31 | 
32 |   "spk_embedding_node": "zs_mu_relu",
33 |   "phn_embedding_node": "zp_mu_relu",
34 | 
35 |   "num_parallel_datasets": 8,
36 |   "max_queue_size": 10,
37 |   "num_speakers_per_batch": 64,
38 |   "num_segments_per_speaker": 1,
39 |   "min_segment_len": 100,
40 |   "max_segment_len": 300,
41 | 
42 |   "num_epochs": 200,
43 |   "num_steps_per_epoch": 7000,
44 |   "show_training_progress": 200,
45 |   "keep_checkpoint_max": 100,
46 |   "save_summary_steps": 3500,
47 |   "save_checkpoints_steps": 7000,
48 |   "valid_max_iterations": 1000,
49 | 
50 |   "reduce_lr_epochs": 3,
51 |   "early_stop_epochs": 8,
52 |   "min_learning_rate": 1e-6
53 | }
54 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.005,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment",
16 |   "context_size": 7,
17 |   "num_shared_layers": 0,
18 |   "pooling_type": "statistics_pooling",
19 | 
20 |   "spk_loss_weight": 1.0,
21 |   "speaker_dim": 512,
22 |   "spk_last_layer_no_bn": false,
23 |   "spk_last_layer_linear": false,
24 |   "spk_loss_type": "softmax",
25 | 
26 |   "phn_loss_weight": 1.0,
27 |   "phone_dim": 512,
28 |   "phn_loss_type": "softmax",
29 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
30 |   "num_frames_per_utt": 4,
31 | 
32 |   "spk_embedding_node": "zs_mu_relu",
33 |   "phn_embedding_node": "zp_mu_relu",
34 | 
35 |   "num_parallel_datasets": 8,
36 |   "max_queue_size": 10,
37 |   "num_speakers_per_batch": 64,
38 |   "num_segments_per_speaker": 1,
39 |   "min_segment_len": 100,
40 |   "max_segment_len": 300,
41 | 
42 |   "num_epochs": 200,
43 |   "num_steps_per_epoch": 7000,
44 |   "show_training_progress": 200,
45 |   "keep_checkpoint_max": 100,
46 |   "save_summary_steps": 3500,
47 |   "save_checkpoints_steps": 7000,
48 |   "valid_max_iterations": 1000,
49 | 
50 |   "reduce_lr_epochs": 3,
51 |   "early_stop_epochs": 8,
52 |   "min_learning_rate": 1e-6
53 | }
54 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.001,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment",
16 |   "context_size": 7,
17 |   "num_shared_layers": 0,
18 |   "pooling_type": "statistics_pooling",
19 | 
20 |   "spk_loss_weight": 0,
21 |   "speaker_dim": 512,
22 |   "spk_last_layer_no_bn": false,
23 |   "spk_last_layer_linear": false,
24 |   "spk_loss_type": "softmax",
25 | 
26 |   "phn_loss_weight": 1.0,
27 |   "phone_dim": 512,
28 |   "phn_loss_type": "softmax",
29 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
30 |   "num_frames_per_utt": 4,
31 | 
32 |   "spk_embedding_node": "zs_mu_relu",
33 |   "phn_embedding_node": "zp_mu_relu",
34 | 
35 |   "num_parallel_datasets": 8,
36 |   "max_queue_size": 10,
37 |   "num_speakers_per_batch": 64,
38 |   "num_segments_per_speaker": 1,
39 |   "min_segment_len": 100,
40 |   "max_segment_len": 300,
41 | 
42 |   "num_epochs": 200,
43 |   "num_steps_per_epoch": 7000,
44 |   "show_training_progress": 200,
45 |   "keep_checkpoint_max": 100,
46 |   "save_summary_steps": 3500,
47 |   "save_checkpoints_steps": 7000,
48 |   "valid_max_iterations": 1000,
49 | 
50 |   "reduce_lr_epochs": 3,
51 |   "early_stop_epochs": 8,
52 |   "min_learning_rate": 1e-6
53 | }
54 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_4.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.001,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-4,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment",
16 |   "context_size": 7,
17 |   "num_shared_layers": 0,
18 |   "pooling_type": "statistics_pooling",
19 | 
20 |   "spk_loss_weight": 0,
21 |   "speaker_dim": 512,
22 |   "spk_last_layer_no_bn": false,
23 |   "spk_last_layer_linear": false,
24 |   "spk_loss_type": "softmax",
25 | 
26 |   "phn_loss_weight": 1.0,
27 |   "phone_dim": 512,
28 |   "phn_loss_type": "softmax",
29 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
30 |   "num_frames_per_utt": 4,
31 | 
32 |   "spk_embedding_node": "zs_mu_relu",
33 |   "phn_embedding_node": "zp_mu_relu",
34 | 
35 |   "num_parallel_datasets": 8,
36 |   "max_queue_size": 10,
37 |   "num_speakers_per_batch": 64,
38 |   "num_segments_per_speaker": 1,
39 |   "min_segment_len": 100,
40 |   "max_segment_len": 300,
41 | 
42 |   "num_epochs": 200,
43 |   "num_steps_per_epoch": 7000,
44 |   "show_training_progress": 200,
45 |   "keep_checkpoint_max": 100,
46 |   "save_summary_steps": 3500,
47 |   "save_checkpoints_steps": 7000,
48 |   "valid_max_iterations": 1000,
49 | 
50 |   "reduce_lr_epochs": 3,
51 |   "early_stop_epochs": 8,
52 |   "min_learning_rate": 1e-6
53 | }
54 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_5.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.001,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-4,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "?context_size": "context_size is to make sure the phonetic outputs have exactly the same #frames with the alignment",
16 |   "context_size": 11,
17 |   "num_shared_layers": 0,
18 |   "pooling_type": "statistics_pooling",
19 | 
20 |   "spk_loss_weight": 0,
21 |   "speaker_dim": 512,
22 |   "spk_last_layer_no_bn": false,
23 |   "spk_last_layer_linear": false,
24 |   "spk_loss_type": "softmax",
25 | 
26 |   "phn_loss_weight": 1.0,
27 |   "phone_dim": 512,
28 |   "phn_loss_type": "softmax",
29 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
30 |   "num_frames_per_utt": 4,
31 | 
32 |   "spk_embedding_node": "zs_mu_relu",
33 |   "phn_embedding_node": "zp_mu_relu",
34 | 
35 |   "num_parallel_datasets": 8,
36 |   "max_queue_size": 10,
37 |   "num_speakers_per_batch": 64,
38 |   "num_segments_per_speaker": 1,
39 |   "min_segment_len": 100,
40 |   "max_segment_len": 300,
41 | 
42 |   "num_epochs": 100,
43 |   "num_steps_per_epoch": 7000,
44 |   "show_training_progress": 200,
45 |   "keep_checkpoint_max": 100,
46 |   "save_summary_steps": 3500,
47 |   "save_checkpoints_steps": 7000,
48 |   "valid_max_iterations": 1000,
49 | 
50 |   "reduce_lr_epochs": 1,
51 |   "early_stop_epochs": 8,
52 |   "min_learning_rate": 1e-6
53 | }
54 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet_conf/tdnn_amsoftmax_m0.20_linear_bn_1e-2_tdnn4_att.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 |   "network_type": "tdnn",
 4 |   "last_layer_no_bn": false,
 5 |   "last_layer_linear": true,
 6 |   "feature_norm": false,
 7 | 
 8 |   "loss_func": "additive_margin_softmax",
 9 |   "amsoftmax_m": 0.20,
10 |   "amsoftmax_lambda_min": 0,
11 |   "amsoftmax_lambda_base": 1000,
12 |   "amsoftmax_lambda_gamma": 0.0001,
13 |   "amsoftmax_lambda_power": 5,
14 | 
15 |   "batch_type": "softmax",
16 | 
17 |   "pooling_type": "self_attention",
18 |   "att_key_input": "tdnn4_relu",
19 |   "att_key_num_nodes": [1500, 1500],
20 |   "att_key_network_type": 3,
21 |   "att_value_input": "tdnn5_relu",
22 |   "att_value_num_nodes": [],
23 |   "att_value_network_type": 0,
24 |   "att_apply_nonlinear": false,
25 |   "att_use_scale": true,
26 |   "att_num_heads": 1,
27 |   "att_split_key": false,
28 |   "att_penalty_term": 0,
29 | 
30 |   "learning_rate": 0.01,
31 |   "use_nesterov": false,
32 |   "clip_gradient": false,
33 |   "clip_gradient_norm": 3,
34 | 
35 |   "weight_l2_regularizer": 1e-2,
36 |   "batchnorm_momentum": 0.99,
37 | 
38 |   "num_epochs": 100,
39 |   "num_steps_per_epoch": 30000,
40 |   "reduce_lr_epochs": 4,
41 |   "show_training_progress": 100,
42 |   "keep_checkpoint_max": 100,
43 |   "save_summary_steps": 10000,
44 |   "save_checkpoints_steps": 30000,
45 |   "valid_max_iterations": 1000,
46 | 
47 |   "num_parallel_datasets": 16,
48 |   "max_queue_size": 10,
49 |   "num_speakers_per_batch": 64,
50 |   "num_segments_per_speaker": 1,
51 |   "min_segment_len": 200,
52 |   "max_segment_len": 400,
53 | 
54 |   "early_stop_epochs": 10,
55 |   "min_learning_rate": 1e-6
56 | }


--------------------------------------------------------------------------------
/misc/DETware_v2.1/Set_DET_limits.m:
--------------------------------------------------------------------------------
 1 | function Set_DET_limits(Pmiss_min, Pmiss_max, Pfa_min, Pfa_max)
 2 | % function Set_DET_limits(Pmiss_min, Pmiss_max, Pfa_min, Pfa_max)
 3 | %
 4 | %  Set_DET_limits initializes the min/max plotting limits for P_miss and P_fa.
 5 | %  The limits are stored in the global DET_limits as [Pmiss_min Pmiss_max Pfa_min Pfa_max].
 6 | %  See DET_usage for an example of how to use Set_DET_limits.
 7 | 
 8 | Pmiss_min_default = 0.0005+eps;
 9 | Pmiss_max_default = 0.5-eps;
10 | Pfa_min_default = 0.0005+eps;
11 | Pfa_max_default = 0.5-eps;
12 | 
13 | global DET_limits;
14 | 
15 | %-------------------------
16 | % If a value is not supplied as an argument, then use the previous value,
17 | % or use the default value if DET_limits hasn't been initialized.
18 | 
19 | if (~isempty(DET_limits))
20 | 	Pmiss_min_default = DET_limits(1);
21 | 	Pmiss_max_default = DET_limits(2);
22 | 	Pfa_min_default  = DET_limits(3);
23 | 	Pfa_max_default  = DET_limits(4);
24 | end
25 | 
26 | if ~(exist('Pmiss_min')); Pmiss_min = Pmiss_min_default; end;
27 | if ~(exist('Pmiss_max')); Pmiss_max = Pmiss_max_default; end;
28 | if ~(exist('Pfa_min')); Pfa_min = Pfa_min_default; end;
29 | if ~(exist('Pfa_max')); Pfa_max = Pfa_max_default; end;
30 | 
31 | %-------------------------
32 | % Clamp bounds to [eps, 1-eps]; an empty or inverted range is reset to that full range
33 | 
34 | Pmiss_min = max(Pmiss_min,eps);
35 | Pmiss_max = min(Pmiss_max,1-eps);
36 | if Pmiss_max <= Pmiss_min
37 | 	Pmiss_min = eps;
38 | 	Pmiss_max = 1-eps;
39 | end
40 | 
41 | Pfa_min = max(Pfa_min,eps);
42 | Pfa_max = min(Pfa_max,1-eps);
43 | if Pfa_max <= Pfa_min
44 | 	Pfa_min = eps;
45 | 	Pfa_max = 1-eps;
46 | end
47 | 
48 | %--------------------------
49 | % Load DET_limits with bounds to use
50 | 
51 | DET_limits = [Pmiss_min Pmiss_max Pfa_min Pfa_max];
52 | 
53 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | #a) Old options
14 | #export cuda_cmd="queue.pl -l qp=cuda-low -l osrel='*' -l gpuclass='*'"
15 | #export decode_cmd="queue.pl -l qp=low -l osrel='*' -l osrel='*'"
16 | #export mkgraph_cmd="queue.pl -l qp=low -l osrel='*'"
17 | 
18 | #b) THU Tianjin Cluster
19 | # Default: run every job on the local machine with run.pl. Within this file,
20 | # queue_conf is only used by the commented-out slurm.pl line.
21 | queue_conf=$PWD/slurm_conf/slurm.conf
22 | #export train_cmd="slurm.pl --config $queue_conf"
23 | export train_cmd="run.pl"
24 | export cuda_cmd="run.pl"
25 | 
26 | #c) Hosts under fit.vutbr.cz: submit through queue.pl instead.
27 | # The right-hand side of == must stay unquoted so that [[ ]] treats it as a glob
28 | # pattern; a quoted "*.fit.vutbr.cz" is compared literally and never matches a
29 | # real host name, which would leave this branch dead.
30 | if [[ "$(hostname -f)" == *.fit.vutbr.cz ]]; then
31 |   queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf,
32 |   export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2"
33 |   export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1"
34 |   export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G"
35 | fi


--------------------------------------------------------------------------------
/egs/voxceleb/v2_unfinished/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | #a) Old options
14 | #export cuda_cmd="queue.pl -l qp=cuda-low -l osrel='*' -l gpuclass='*'"
15 | #export decode_cmd="queue.pl -l qp=low -l osrel='*' -l osrel='*'"
16 | #export mkgraph_cmd="queue.pl -l qp=low -l osrel='*'"
17 | 
18 | #b) THU Tianjin Cluster
19 | # Default: run every job on the local machine with run.pl. Within this file,
20 | # queue_conf is only used by the commented-out slurm.pl line.
21 | queue_conf=$PWD/slurm_conf/slurm.conf
22 | #export train_cmd="slurm.pl --config $queue_conf"
23 | export train_cmd="run.pl"
24 | export cuda_cmd="run.pl"
25 | 
26 | #c) Hosts under fit.vutbr.cz: submit through queue.pl instead.
27 | # The right-hand side of == must stay unquoted so that [[ ]] treats it as a glob
28 | # pattern; a quoted "*.fit.vutbr.cz" is compared literally and never matches a
29 | # real host name, which would leave this branch dead.
30 | if [[ "$(hostname -f)" == *.fit.vutbr.cz ]]; then
31 |   queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf,
32 |   export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2"
33 |   export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1"
34 |   export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G"
35 | fi


--------------------------------------------------------------------------------
/egs/fisher/v1/eval_plda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | nnetdir=$1
 4 | 
 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
 6 | echo "EER: ${eer}%"
 7 | paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test.new
 8 | grep ' target$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.target
 9 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.nontarget
10 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test.target', '$nnetdir/xvector_scores_hires/test.nontarget', '$nnetdir/xvector_scores_hires/test_lda_plda.result')"`
11 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null
12 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_plda.result
13 | 
14 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
15 | echo "EER: ${eer}%"
16 | paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_lda_cos.new
17 | grep ' target$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.target
18 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.nontarget
19 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_lda_cos.target', '$nnetdir/xvector_scores_hires/test_lda_cos.nontarget', '$nnetdir/xvector_scores_hires/test_lda_cos.result')"`
20 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null
21 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_cos.result
22 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/eval_plda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | nnetdir=$1
 4 | 
 5 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
 6 | echo "EER: ${eer}%"
 7 | paste $trials $nnetdir/xvector_scores_hires/test | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test.new
 8 | grep ' target$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.target
 9 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test.nontarget
10 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test.target', '$nnetdir/xvector_scores_hires/test.nontarget', '$nnetdir/xvector_scores_hires/test_lda_plda.result')"`
11 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null
12 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_plda.result
13 | 
14 | eer=$(paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
15 | echo "EER: ${eer}%"
16 | paste $trials $nnetdir/xvector_scores_hires/test_lda_cos | awk '{print $6, $3}' > $nnetdir/xvector_scores_hires/test_lda_cos.new
17 | grep ' target$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.target
18 | grep ' nontarget$' $nnetdir/xvector_scores_hires/test_lda_cos.new | cut -d ' ' -f 1 > $nnetdir/xvector_scores_hires/test_lda_cos.nontarget
19 | comm=`echo "addpath('../../../misc/DETware_v2.1'); Get_DCF('$nnetdir/xvector_scores_hires/test_lda_cos.target', '$nnetdir/xvector_scores_hires/test_lda_cos.nontarget', '$nnetdir/xvector_scores_hires/test_lda_cos.result')"`
20 | echo "$comm"| matlab -nodesktop -noFigureWindows > /dev/null
21 | tail -n 1 $nnetdir/xvector_scores_hires/test_lda_cos.result
22 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_6.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.005,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "phone_layer_size": [512, 512, 512, 512, 512],
16 |   "phone_kernel_size": [5, 5, 7, 1, 3],
17 |   "phone_dilation_size": [1, 1, 1, 1, 4],
18 | 
19 |   "?left_and_right_context": "The context is used in feature expansion",
20 |   "speaker_left_context": 7,
21 |   "speaker_right_context": 7,
22 |   "phone_left_context": 11,
23 |   "phone_right_context": 11,
24 |   "num_shared_layers": 0,
25 | 
26 |   "pooling_type": "statistics_pooling",
27 |   "spk_loss_weight": 0,
28 |   "speaker_dim": 512,
29 |   "spk_last_layer_no_bn": false,
30 |   "spk_last_layer_linear": false,
31 |   "spk_loss_type": "softmax",
32 | 
33 |   "phn_loss_weight": 1.0,
34 |   "phone_dim": 512,
35 |   "phn_loss_type": "softmax",
36 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
37 |   "num_frames_per_utt": 4,
38 | 
39 |   "spk_embedding_node": "zs_mu_relu",
40 |   "phn_embedding_node": "zp_mu_relu",
41 | 
42 |   "num_parallel_datasets": 4,
43 |   "max_queue_size": 10,
44 |   "num_speakers_per_batch": 64,
45 |   "num_segments_per_speaker": 1,
46 |   "min_segment_len": 100,
47 |   "max_segment_len": 300,
48 | 
49 |   "num_epochs": 100,
50 |   "num_steps_per_epoch": 7000,
51 |   "show_training_progress": 200,
52 |   "keep_checkpoint_max": 100,
53 |   "save_summary_steps": 3500,
54 |   "save_checkpoints_steps": 7000,
55 |   "valid_max_iterations": 1000,
56 | 
57 |   "reduce_lr_epochs": 3,
58 |   "early_stop_epochs": 8,
59 |   "min_learning_rate": 1e-6
60 | }
61 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_7.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.005,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "phone_layer_size": [512, 512, 512, 512, 512],
16 |   "phone_kernel_size": [5, 5, 7, 1, 3],
17 |   "phone_dilation_size": [1, 1, 1, 1, 4],
18 | 
19 |   "?left_and_right_context": "The context is used in feature expansion",
20 |   "speaker_left_context": 7,
21 |   "speaker_right_context": 7,
22 |   "phone_left_context": 11,
23 |   "phone_right_context": 11,
24 |   "num_shared_layers": 0,
25 | 
26 |   "pooling_type": "statistics_pooling",
27 |   "spk_loss_weight": 0,
28 |   "speaker_dim": 512,
29 |   "spk_last_layer_no_bn": false,
30 |   "spk_last_layer_linear": false,
31 |   "spk_loss_type": "softmax",
32 | 
33 |   "phn_loss_weight": 1.0,
34 |   "phone_dim": 512,
35 |   "phn_loss_type": "softmax",
36 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
37 |   "num_frames_per_utt": 4,
38 | 
39 |   "spk_embedding_node": "zs_mu_relu",
40 |   "phn_embedding_node": "zp_mu_relu",
41 | 
42 |   "num_parallel_datasets": 4,
43 |   "max_queue_size": 10,
44 |   "num_speakers_per_batch": 64,
45 |   "num_segments_per_speaker": 1,
46 |   "min_segment_len": 30,
47 |   "max_segment_len": 30,
48 | 
49 |   "num_epochs": 100,
50 |   "num_steps_per_epoch": 7000,
51 |   "show_training_progress": 200,
52 |   "keep_checkpoint_max": 100,
53 |   "save_summary_steps": 3500,
54 |   "save_checkpoints_steps": 7000,
55 |   "valid_max_iterations": 1000,
56 | 
57 |   "reduce_lr_epochs": 3,
58 |   "early_stop_epochs": 8,
59 |   "min_learning_rate": 1e-6
60 | }
61 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_8.2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.005,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "phone_layer_size": [512, 512, 512, 512, 512],
16 |   "phone_kernel_size": [5, 5, 7, 1, 3],
17 |   "phone_dilation_size": [1, 1, 1, 1, 1],
18 | 
19 |   "?left_and_right_context": "The context is used in feature expansion",
20 |   "speaker_left_context": 7,
21 |   "speaker_right_context": 7,
22 |   "phone_left_context": 8,
23 |   "phone_right_context": 8,
24 |   "num_shared_layers": 0,
25 | 
26 |   "pooling_type": "statistics_pooling",
27 |   "spk_loss_weight": 0,
28 |   "speaker_dim": 512,
29 |   "spk_last_layer_no_bn": false,
30 |   "spk_last_layer_linear": false,
31 |   "spk_loss_type": "softmax",
32 | 
33 |   "phn_loss_weight": 1.0,
34 |   "phone_dim": 512,
35 |   "phn_loss_type": "softmax",
36 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
37 |   "num_frames_per_utt": 4,
38 | 
39 |   "spk_embedding_node": "zs_mu_relu",
40 |   "phn_embedding_node": "zp_mu_relu",
41 | 
42 |   "num_parallel_datasets": 4,
43 |   "max_queue_size": 10,
44 |   "num_speakers_per_batch": 64,
45 |   "num_segments_per_speaker": 1,
46 |   "min_segment_len": 100,
47 |   "max_segment_len": 300,
48 | 
49 |   "num_epochs": 100,
50 |   "num_steps_per_epoch": 7000,
51 |   "show_training_progress": 200,
52 |   "keep_checkpoint_max": 100,
53 |   "save_summary_steps": 3500,
54 |   "save_checkpoints_steps": 7000,
55 |   "valid_max_iterations": 1000,
56 | 
57 |   "reduce_lr_epochs": 3,
58 |   "early_stop_epochs": 8,
59 |   "min_learning_rate": 1e-6
60 | }
61 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_8.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.005,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "phone_layer_size": [512, 512, 512, 512, 512],
16 |   "phone_kernel_size": [5, 5, 7, 1, 3],
17 |   "phone_dilation_size": [1, 1, 1, 1, 1],
18 | 
19 |   "?left_and_right_context": "The context is used in feature expansion",
20 |   "speaker_left_context": 7,
21 |   "speaker_right_context": 7,
22 |   "phone_left_context": 8,
23 |   "phone_right_context": 8,
24 |   "num_shared_layers": 0,
25 | 
26 |   "pooling_type": "statistics_pooling",
27 |   "spk_loss_weight": 0,
28 |   "speaker_dim": 512,
29 |   "spk_last_layer_no_bn": false,
30 |   "spk_last_layer_linear": false,
31 |   "spk_loss_type": "softmax",
32 | 
33 |   "phn_loss_weight": 1.0,
34 |   "phone_dim": 512,
35 |   "phn_loss_type": "softmax",
36 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
37 |   "num_frames_per_utt": 1,
38 | 
39 |   "spk_embedding_node": "zs_mu_relu",
40 |   "phn_embedding_node": "zp_mu_relu",
41 | 
42 |   "num_parallel_datasets": 4,
43 |   "max_queue_size": 10,
44 |   "num_speakers_per_batch": 64,
45 |   "num_segments_per_speaker": 1,
46 |   "min_segment_len": 100,
47 |   "max_segment_len": 300,
48 | 
49 |   "num_epochs": 100,
50 |   "num_steps_per_epoch": 7000,
51 |   "show_training_progress": 200,
52 |   "keep_checkpoint_max": 100,
53 |   "save_summary_steps": 3500,
54 |   "save_checkpoints_steps": 7000,
55 |   "valid_max_iterations": 1000,
56 | 
57 |   "reduce_lr_epochs": 3,
58 |   "early_stop_epochs": 8,
59 |   "min_learning_rate": 1e-6
60 | }
61 | 


--------------------------------------------------------------------------------
/egs/fisher/v3/nnet_conf/mt_softmax_8.3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "seed": 0,
 3 | 
 4 |   "learning_rate": 0.005,
 5 |   "optimizer": "momentum",
 6 |   "momentum": 0.9,
 7 |   "use_nesterov": false,
 8 |   "clip_gradient": false,
 9 | 
10 |   "weight_l2_regularizer": 1e-2,
11 |   "batchnorm_momentum": 0.99,
12 | 
13 |   "batch_type": "softmax",
14 | 
15 |   "phone_layer_size": [512, 512, 512, 512, 512],
16 |   "phone_kernel_size": [5, 5, 7, 1, 3],
17 |   "phone_dilation_size": [1, 1, 1, 1, 1],
18 | 
19 |   "?left_and_right_context": "The context is used in feature expansion",
20 |   "speaker_left_context": 7,
21 |   "speaker_right_context": 7,
22 |   "phone_left_context": 8,
23 |   "phone_right_context": 8,
24 |   "num_shared_layers": 0,
25 | 
26 |   "pooling_type": "statistics_pooling",
27 |   "spk_loss_weight": 0,
28 |   "speaker_dim": 512,
29 |   "spk_last_layer_no_bn": false,
30 |   "spk_last_layer_linear": false,
31 |   "spk_loss_type": "softmax",
32 | 
33 |   "phn_loss_weight": 1.0,
34 |   "phone_dim": 512,
35 |   "phn_loss_type": "softmax",
36 |   "?num_frames_per_utt": "How many frames in a segment should be used to train the phone network. If -1, use all.",
37 |   "num_frames_per_utt": -1,
38 | 
39 |   "spk_embedding_node": "zs_mu_relu",
40 |   "phn_embedding_node": "zp_mu_relu",
41 | 
42 |   "num_parallel_datasets": 4,
43 |   "max_queue_size": 10,
44 |   "num_speakers_per_batch": 64,
45 |   "num_segments_per_speaker": 1,
46 |   "min_segment_len": 100,
47 |   "max_segment_len": 300,
48 | 
49 |   "num_epochs": 100,
50 |   "num_steps_per_epoch": 7000,
51 |   "show_training_progress": 200,
52 |   "keep_checkpoint_max": 100,
53 |   "save_summary_steps": 3500,
54 |   "save_checkpoints_steps": 7000,
55 |   "valid_max_iterations": 1000,
56 | 
57 |   "reduce_lr_epochs": 3,
58 |   "early_stop_epochs": 8,
59 |   "min_learning_rate": 1e-6
60 | }
61 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_finetune_lr_learning.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Wait until enough idle GPUs are available, then run
 4 | # nnet/lib/finetune_lr_learning.py inside the $TF_ENV/$env virtualenv.
 5 | # TF_ENV and TF_KALDI_ROOT are not set here; presumably path.sh provides them.
 6 | 
 7 | # Option defaults (see the usage message below). parse_options.sh is expected to
 8 | # overwrite them from the matching --option flags.
 9 | cmd="run.pl"
10 | env=tf_gpu
11 | num_gpus=1
12 | checkpoint=-1
13 | tune_period=100
14 | 
15 | echo "$0 $*"
16 | 
17 | if [ -f path.sh ]; then . ./path.sh; fi
18 | . parse_options.sh || exit 1;
19 | 
20 | if [ $# -ne 7 ]; then
21 |   echo "Usage: $0 [options] <config> <train-dir> <train-spklist> <valid-dir> <valid-spklist> <pretrained-nnet> <nnet>"
22 |   echo "Options:"
23 |   echo "  --tune-period <100>"
24 |   echo "  --checkpoint <-1>"
25 |   echo "  --env <tf_gpu>"
26 |   echo "  --num-gpus <n_gpus>"
27 |   exit 100
28 | fi
29 | 
30 | config=$1
31 | train=$2
32 | train_spklist=$3
33 | valid=$4
34 | valid_spklist=$5
35 | pretrain_nnetdir=$6
36 | nnetdir=$7
37 | 
38 | # add the library to the python path.
39 | export PYTHONPATH="$TF_KALDI_ROOT:$PYTHONPATH"
40 | 
41 | mkdir -p "$nnetdir/log"
42 | 
43 | # Get available GPUs before we can train the network: poll every 300 seconds
44 | # until $num_gpus GPUs report "No running processes found".
45 | num_total_gpus=$(nvidia-smi -L | wc -l)
46 | num_gpus_assigned=0
47 | while [ "$num_gpus_assigned" -ne "$num_gpus" ]; do
48 |   num_gpus_assigned=0
49 |   for ((i = 0; i < num_total_gpus; i++)); do
50 |     # count every GPU that is currently idle
51 |     if nvidia-smi -i "$i" | grep "No running processes found" >/dev/null; then
52 |       num_gpus_assigned=$((num_gpus_assigned + 1))
53 |     fi
54 |     # once we have enough GPUs, break out of the loop
55 |     [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
56 |   done
57 |   [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
58 |   sleep 300
59 | done
60 | 
61 | source "$TF_ENV/$env/bin/activate"
62 | # $cmd stays unquoted on purpose: it may hold a command plus options, which must
63 | # be split into separate words.
64 | $cmd "$nnetdir/log/finetune_lr_learning.log" utils/parallel/limit_num_gpus.sh --num-gpus "$num_gpus" \
65 |   python nnet/lib/finetune_lr_learning.py --tune_period "$tune_period" --checkpoint "$checkpoint" --config "$config" \
66 |   "$train" "$train_spklist" "$valid" "$valid_spklist" "$pretrain_nnetdir" "$nnetdir"
67 | deactivate
68 | 
69 | # NOTE(review): the exit status of the training command is not checked; the
70 | # script reports success regardless. Inspect the log file for failures.
71 | exit 0


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_train_lr_learning.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Launch nnet/lib/train_lr_learning.py (learning-rate tuning run).
 4 | # Waits until enough idle GPUs are visible on this machine, activates the
 5 | # virtualenv $TF_ENV/$env and runs the job through $cmd, logging to
 6 | # <nnet>/log/train_lr_learning.log.
 7 | # TF_KALDI_ROOT and TF_ENV are read from the environment (presumably set by path.sh).
 8 | 
 9 | cmd="run.pl"
10 | env=tf_gpu
11 | num_gpus=1
12 | tune_period=100
13 | 
14 | echo "$0 $@"
15 | 
16 | if [ -f path.sh ]; then . ./path.sh; fi
17 | . parse_options.sh || exit 1;
18 | 
19 | if [ $# != 6 ]; then
20 |   echo "Usage: $0 [options] <config> <train-dir> <train-spklist> <valid-dir> <valid-spklist> <nnet>"
21 |   echo "Options:"
22 |   echo "  --tune-period <100>"
23 |   echo "  --env <tf_gpu>"
24 |   echo "  --num-gpus <n_gpus>"
25 |   exit 100
26 | fi
27 | 
28 | config=$1
29 | train=$2
30 | train_spklist=$3
31 | valid=$4
32 | valid_spklist=$5
33 | nnetdir=$6
34 | 
35 | # add the library to the python path.
36 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH
37 | 
38 | mkdir -p "$nnetdir/log"
39 | 
40 | # Get available GPUs before we can train the network.
41 | num_total_gpus=$(nvidia-smi -L | wc -l)
42 | # Waiting can never succeed if more GPUs are requested than nvidia-smi lists,
43 | # so fail fast instead of sleeping forever.
44 | if [ "$num_gpus" -gt "$num_total_gpus" ]; then
45 |   echo "$0: requested $num_gpus GPU(s) but only $num_total_gpus available" >&2
46 |   exit 1
47 | fi
48 | num_gpus_assigned=0
49 | while [ "$num_gpus_assigned" -ne "$num_gpus" ]; do
50 |   num_gpus_assigned=0
51 |   for i in $(seq 0 $((num_total_gpus - 1))); do
52 |     # A GPU counts as idle when nvidia-smi reports no process running on it.
53 |     if nvidia-smi -i "$i" | grep "No running processes found" >/dev/null; then
54 |       num_gpus_assigned=$((num_gpus_assigned + 1))
55 |     fi
56 |     # once we have enough GPUs, break out of the loop
57 |     [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
58 |   done
59 |   [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
60 |   # Not enough idle GPUs yet: poll again in five minutes.
61 |   sleep 300
62 | done
63 | 
64 | # Activate the gpu virtualenv
65 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways.
66 | # Limit the GPU number to what we want.
67 | source "$TF_ENV/$env/bin/activate"
68 | $cmd "$nnetdir/log/train_lr_learning.log" utils/parallel/limit_num_gpus.sh --num-gpus "$num_gpus" \
69 |     python nnet/lib/train_lr_learning.py --tune_period "$tune_period" --config "$config" "$train" "$train_spklist" "$valid" "$valid_spklist" "$nnetdir"
70 | status=$?
71 | deactivate
72 | 
73 | # Report the job's status to the caller rather than always claiming success.
74 | exit $status


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_finetune_nnet.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Fine-tune a network from a pre-trained model with nnet/lib/finetune.py.
 4 | # Waits until enough idle GPUs are visible on this machine, activates the
 5 | # virtualenv $TF_ENV/$env and runs the job through $cmd, logging to
 6 | # <nnet>/log/train_nnet.log.
 7 | # TF_KALDI_ROOT and TF_ENV are read from the environment (presumably set by path.sh).
 8 | 
 9 | cmd="run.pl"
10 | continue_training=false
11 | env=tf_gpu
12 | num_gpus=1
13 | checkpoint=-1
14 | 
15 | echo "$0 $@"
16 | 
17 | if [ -f path.sh ]; then . ./path.sh; fi
18 | . parse_options.sh || exit 1;
19 | 
20 | if [ $# != 7 ]; then
21 |   echo "Usage: $0 [options] <config> <train-dir> <train-spklist> <valid-dir> <valid-spklist> <pretrained-nnet> <nnet>"
22 |   echo "Options:"
23 |   echo "  --continue-training <false>"
24 |   echo "  --checkpoint <-1>"
25 |   echo "  --env <tf_gpu>"
26 |   echo "  --num-gpus <n_gpus>"
27 |   exit 100
28 | fi
29 | 
30 | config=$1
31 | train=$2
32 | train_spklist=$3
33 | valid=$4
34 | valid_spklist=$5
35 | pretrain_nnetdir=$6
36 | nnetdir=$7
37 | 
38 | # add the library to the python path.
39 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH
40 | 
41 | mkdir -p "$nnetdir/log"
42 | 
43 | # -c is forwarded to the Python script only with --continue-training true.
44 | # Start from an empty value so nothing inherited from the environment leaks in.
45 | cmdopts=
46 | if [ "$continue_training" == 'true' ]; then
47 |   cmdopts="-c"
48 | fi
49 | 
50 | # Get available GPUs before we can train the network.
51 | num_total_gpus=$(nvidia-smi -L | wc -l)
52 | # Waiting can never succeed if more GPUs are requested than nvidia-smi lists,
53 | # so fail fast instead of sleeping forever.
54 | if [ "$num_gpus" -gt "$num_total_gpus" ]; then
55 |   echo "$0: requested $num_gpus GPU(s) but only $num_total_gpus available" >&2
56 |   exit 1
57 | fi
58 | num_gpus_assigned=0
59 | while [ "$num_gpus_assigned" -ne "$num_gpus" ]; do
60 |   num_gpus_assigned=0
61 |   for i in $(seq 0 $((num_total_gpus - 1))); do
62 |     # A GPU counts as idle when nvidia-smi reports no process running on it.
63 |     if nvidia-smi -i "$i" | grep "No running processes found" >/dev/null; then
64 |       num_gpus_assigned=$((num_gpus_assigned + 1))
65 |     fi
66 |     # once we have enough GPUs, break out of the loop
67 |     [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
68 |   done
69 |   [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
70 |   # Not enough idle GPUs yet: poll again in five minutes.
71 |   sleep 300
72 | done
73 | 
74 | # Activate the virtualenv, then launch the job with the GPU count limited.
75 | # $cmdopts is deliberately unquoted: it expands to nothing when empty.
76 | source "$TF_ENV/$env/bin/activate"
77 | $cmd "$nnetdir/log/train_nnet.log" utils/parallel/limit_num_gpus.sh --num-gpus "$num_gpus" \
78 |   python nnet/lib/finetune.py $cmdopts --checkpoint "$checkpoint" --config "$config" "$train" "$train_spklist" "$valid" "$valid_spklist" "$pretrain_nnetdir" "$nnetdir"
79 | status=$?
80 | deactivate
81 | 
82 | # Report the job's status to the caller rather than always claiming success.
83 | exit $status
84 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_train_mi_nnet.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Train a network with nnet/lib/train_mi.py, which takes an auxiliary data
 4 | # directory next to each of the training and validation sets.
 5 | # Waits until enough idle GPUs are visible on this machine, activates the
 6 | # virtualenv $TF_ENV/$env and runs the Python script directly (the $cmd
 7 | # wrapper is currently disabled, so no log file is written by this script).
 8 | # TF_KALDI_ROOT and TF_ENV are read from the environment (presumably set by path.sh).
 9 | 
10 | cmd="run.pl"
11 | continue_training=false
12 | env=tf_gpu
13 | num_gpus=1
14 | 
15 | echo "$0 $@"
16 | 
17 | if [ -f path.sh ]; then . ./path.sh; fi
18 | . parse_options.sh || exit 1;
19 | 
20 | if [ $# != 8 ]; then
21 |   echo "Usage: $0 [options] <config> <train-dir> <train-aux-dir> <train-spklist> <valid-dir> <valid-aux-dir> <valid-spklist> <nnet>"
22 |   echo "Options:"
23 |   echo "  --continue-training <false>"
24 |   echo "  --env <tf_gpu>"
25 |   echo "  --num-gpus <n_gpus>"
26 |   exit 100
27 | fi
28 | 
29 | config=$1
30 | train=$2
31 | train_aux=$3
32 | train_spklist=$4
33 | valid=$5
34 | valid_aux=$6
35 | valid_spklist=$7
36 | nnetdir=$8
37 | 
38 | # add the library to the python path.
39 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH
40 | 
41 | mkdir -p "$nnetdir/log"
42 | 
43 | # -c is forwarded to the Python script only with --continue-training true.
44 | # Start from an empty value so nothing inherited from the environment leaks in.
45 | cmdopts=
46 | if [ "$continue_training" == 'true' ]; then
47 |   cmdopts="-c"
48 | fi
49 | 
50 | # Get available GPUs before we can train the network.
51 | num_total_gpus=$(nvidia-smi -L | wc -l)
52 | # Waiting can never succeed if more GPUs are requested than nvidia-smi lists,
53 | # so fail fast instead of sleeping forever.
54 | if [ "$num_gpus" -gt "$num_total_gpus" ]; then
55 |   echo "$0: requested $num_gpus GPU(s) but only $num_total_gpus available" >&2
56 |   exit 1
57 | fi
58 | num_gpus_assigned=0
59 | while [ "$num_gpus_assigned" -ne "$num_gpus" ]; do
60 |   num_gpus_assigned=0
61 |   for i in $(seq 0 $((num_total_gpus - 1))); do
62 |     # A GPU counts as idle when nvidia-smi reports no process running on it.
63 |     if nvidia-smi -i "$i" | grep "No running processes found" >/dev/null; then
64 |       num_gpus_assigned=$((num_gpus_assigned + 1))
65 |     fi
66 |     # once we have enough GPUs, break out of the loop
67 |     [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
68 |   done
69 |   [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
70 |   # Not enough idle GPUs yet: poll again in five minutes.
71 |   sleep 300
72 | done
73 | 
74 | # Activate the gpu virtualenv
75 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways.
76 | source "$TF_ENV/$env/bin/activate"
77 | # Disabled wrapper (would log to the file and limit the visible GPUs):
78 | #   $cmd $nnetdir/log/train_mi_nnet.log utils/parallel/limit_num_gpus.sh --num-gpus $num_gpus
79 | # $cmdopts is deliberately unquoted: it expands to nothing when empty.
80 | python nnet/lib/train_mi.py $cmdopts --config "$config" "$train" "$train_aux" "$train_spklist" "$valid" "$valid_aux" "$valid_spklist" "$nnetdir"
81 | status=$?
82 | deactivate
83 | 
84 | # Report the job's status to the caller rather than always claiming success.
85 | exit $status


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_train_nnet.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Train a network with nnet/lib/train.py.
 4 | # Waits until enough idle GPUs are visible on this machine, backs up the logs
 5 | # already present in <nnet>/log, activates the virtualenv $TF_ENV/$env and
 6 | # runs the job through $cmd, logging to <nnet>/log/train_nnet.log.
 7 | # TF_KALDI_ROOT and TF_ENV are read from the environment (presumably set by path.sh).
 8 | 
 9 | cmd="run.pl"
10 | continue_training=false
11 | env=tf_gpu
12 | num_gpus=1
13 | 
14 | echo "$0 $@"
15 | 
16 | if [ -f path.sh ]; then . ./path.sh; fi
17 | . parse_options.sh || exit 1;
18 | 
19 | if [ $# != 6 ]; then
20 |   echo "Usage: $0 [options] <config> <train-dir> <train-spklist> <valid-dir> <valid-spklist> <nnet>"
21 |   echo "Options:"
22 |   echo "  --continue-training <false>"
23 |   echo "  --env <tf_gpu>"
24 |   echo "  --num-gpus <n_gpus>"
25 |   exit 100
26 | fi
27 | 
28 | config=$1
29 | train=$2
30 | train_spklist=$3
31 | valid=$4
32 | valid_spklist=$5
33 | nnetdir=$6
34 | 
35 | # add the library to the python path.
36 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH
37 | 
38 | mkdir -p "$nnetdir/log"
39 | 
40 | # -c is forwarded to the Python script only with --continue-training true.
41 | # Start from an empty value so nothing inherited from the environment leaks in.
42 | cmdopts=
43 | if [ "$continue_training" == 'true' ]; then
44 |   cmdopts="-c"
45 | fi
46 | 
47 | # Get available GPUs before we can train the network.
48 | num_total_gpus=$(nvidia-smi -L | wc -l)
49 | # Waiting can never succeed if more GPUs are requested than nvidia-smi lists,
50 | # so fail fast instead of sleeping forever.
51 | if [ "$num_gpus" -gt "$num_total_gpus" ]; then
52 |   echo "$0: requested $num_gpus GPU(s) but only $num_total_gpus available" >&2
53 |   exit 1
54 | fi
55 | num_gpus_assigned=0
56 | while [ "$num_gpus_assigned" -ne "$num_gpus" ]; do
57 |   num_gpus_assigned=0
58 |   for i in $(seq 0 $((num_total_gpus - 1))); do
59 |     # A GPU counts as idle when nvidia-smi reports no process running on it.
60 |     if nvidia-smi -i "$i" | grep "No running processes found" >/dev/null; then
61 |       num_gpus_assigned=$((num_gpus_assigned + 1))
62 |     fi
63 |     # once we have enough GPUs, break out of the loop
64 |     [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
65 |   done
66 |   [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
67 |   # Not enough idle GPUs yet: poll again in five minutes.
68 |   sleep 300
69 | done
70 | 
71 | # Copy whatever is already in the log directory to <nnet>/.backup/log.
72 | if [ -d "$nnetdir/log" ] && [ "$(ls "$nnetdir/log" | wc -l)" -ge 1 ]; then
73 |   mkdir -p "$nnetdir/.backup/log"
74 |   cp "$nnetdir"/log/* "$nnetdir/.backup/log"
75 | fi
76 | 
77 | # Activate the gpu virtualenv
78 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways.
79 | # Limit the GPU number to what we want.
80 | # $cmdopts is deliberately unquoted: it expands to nothing when empty.
81 | source "$TF_ENV/$env/bin/activate"
82 | $cmd "$nnetdir/log/train_nnet.log" utils/parallel/limit_num_gpus.sh --num-gpus "$num_gpus" \
83 |     python nnet/lib/train.py $cmdopts --config "$config" "$train" "$train_spklist" "$valid" "$valid_spklist" "$nnetdir"
84 | status=$?
85 | deactivate
86 | 
87 | # Report the job's status to the caller rather than always claiming success.
88 | exit $status


--------------------------------------------------------------------------------
/misc/tuning/target_logit_curve.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | 
 4 | angle = np.arange(0, 180, 1)
 5 | softmax = np.cos(angle/180*np.pi)
 6 | 
 7 | m = 0.2
 8 | amsoftmax = np.cos(angle/180*np.pi) - m
 9 | 
10 | m = 0.3
11 | arcsoftmax = np.cos(angle/180*np.pi + m)
12 | 
13 | m = 4.0
14 | l = 10.0
15 | angle1 = np.arange(0, 180/4, 1)
16 | a1 = l / (1+l) * np.cos(angle1/180*np.pi) + 1 / (1+l) * np.cos(4 * angle1/180*np.pi)
17 | angle2 = np.arange(180/4, 180/2, 1)
18 | a2 = l / (1+l) * np.cos(angle2/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle2/180*np.pi) - 2)
19 | angle3 = np.arange(180/2, 180*3/4, 1)
20 | a3 = l / (1+l) * np.cos(angle3/180*np.pi) + 1 / (1+l) * (np.cos(4 * angle3/180*np.pi) - 4)
21 | angle4 = np.arange(180*3/4, 180, 1)
22 | a4 = l / (1+l) * np.cos(angle4/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle4/180*np.pi) - 6)
23 | angle_new = np.concatenate([angle1, angle2, angle3, angle4], axis=0)
24 | asoftmax = np.concatenate([a1, a2, a3, a4], axis=0)
25 | 
26 | l = 0
27 | a1 = l / (1+l) * np.cos(angle1/180*np.pi) + 1 / (1+l) * np.cos(4 * angle1/180*np.pi)
28 | a2 = l / (1+l) * np.cos(angle2/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle2/180*np.pi) - 2)
29 | a3 = l / (1+l) * np.cos(angle3/180*np.pi) + 1 / (1+l) * (np.cos(4 * angle3/180*np.pi) - 4)
30 | a4 = l / (1+l) * np.cos(angle4/180*np.pi) + 1 / (1+l) * (-np.cos(4 * angle4/180*np.pi) - 6)
31 | asoftmax_nolambda = np.concatenate([a1, a2, a3, a4], axis=0)
32 | 
33 | m = 1.20
34 | asoftmax_new = np.cos(m * angle / 180 * np.pi)
35 | 
36 | plt.figure(1)
37 | plt.plot(angle, softmax, 'b', label='Softmax')
38 | plt.plot(angle_new, asoftmax_nolambda, 'r', label='ASoftmax ($m_1=4$, $\lambda=0$)')
39 | plt.plot(angle_new, asoftmax, 'r', label='ASoftmax ($m_1=4$, $\lambda=10$)')
40 | plt.plot(angle, arcsoftmax, 'c', label='ArcSoftmax ($m_2=0.30$)')
41 | plt.plot(angle, amsoftmax, 'm', label='AMSoftmax ($m_3=0.20$)')
42 | plt.xlabel(r'$\theta$', fontsize='x-large')
43 | plt.ylabel(r'$\psi(\theta)$', fontsize='x-large')
44 | plt.xlim((10, 120))
45 | plt.ylim((-1.0, 1.0))
46 | plt.legend(loc='lower left', fontsize='medium')
47 | plt.savefig('target_logit_curve.pdf', format='pdf')
48 | plt.show()
49 | 


--------------------------------------------------------------------------------
/scripts/prepare_bnfeats_for_egs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Normalize the input features with the CMVN statistics in cmvn.scp (means
 4 | # only), keep the frames selected through vad.scp and store the result as
 5 | # ark/scp pairs in <feat-dir>. <out-data-dir> receives feats.scp,
 6 | # utt2num_frames and copies of utt2spk, spk2utt, wav.scp (plus segments when
 7 | # it exists).
 8 | 
 9 | nj=30
10 | cmd="run.pl"
11 | stage=0
12 | compress=true
13 | 
14 | echo "$0 $@"  # Print the command line for logging
15 | 
16 | if [ -f path.sh ]; then . ./path.sh; fi
17 | . parse_options.sh || exit 1;
18 | 
19 | if [ $# != 3 ]; then
20 |   echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
21 |   echo "e.g.: $0 data/bnf data/bnf_nosil exp/bnf_nosil"
22 |   echo "Options: "
23 |   echo "  --nj <nj>                                        # number of parallel jobs"
24 |   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
25 |   exit 1;
26 | fi
27 | 
28 | data_in=$1
29 | data_out=$2
30 | dir=$3
31 | 
32 | name=$(basename $data_in)
33 | 
34 | # Refuse to start unless every required input list is present.
35 | for f in $data_in/feats.scp $data_in/vad.scp $data_in/cmvn.scp ; do
36 |   if [ ! -f $f ]; then
37 |     echo "$0: No such file $f"
38 |     exit 1
39 |   fi
40 | done
41 | 
42 | # Output locations; featdir is the absolute form of <feat-dir>.
43 | mkdir -p $dir/log $data_out
44 | featdir=$(utils/make_absolute.sh $dir)
45 | 
46 | # Carry the bookkeeping files over unchanged.
47 | for meta in utt2spk spk2utt wav.scp; do
48 |   cp $data_in/$meta $data_out/$meta
49 | done
50 | if [ -f $data_in/segments ]; then
51 |   cp $data_in/segments $data_out/segments
52 | fi
53 | 
54 | utils/split_data.sh $data_in $nj || exit 1;
55 | sdata_in=$data_in/split$nj
56 | 
57 | # Each job also writes the per-utterance frame counts of its share.
58 | write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB"
59 | 
60 | $cmd JOB=1:$nj $dir/log/create_bnfeats_${name}.JOB.log \
61 |   apply-cmvn --norm-means=true --norm-vars=false --utt2spk=ark:${sdata_in}/JOB/utt2spk scp:${sdata_in}/JOB/cmvn.scp scp:${sdata_in}/JOB/feats.scp ark:- \| \
62 |   select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \
63 |   copy-feats --compress=$compress $write_num_frames_opt ark:- \
64 |   ark,scp:$featdir/bnfeats_${name}.JOB.ark,$featdir/bnfeats_${name}.JOB.scp || exit 1;
65 | 
66 | # Merge the per-job feature lists, in job order, into one feats.scp.
67 | for ((n = 1; n <= nj; n++)); do
68 |   cat $featdir/bnfeats_${name}.$n.scp || exit 1;
69 | done > ${data_out}/feats.scp || exit 1
70 | 
71 | # Merge the per-job frame counts the same way, then drop the partial files.
72 | for ((n = 1; n <= nj; n++)); do
73 |   cat $featdir/log/utt2num_frames.$n || exit 1;
74 | done > $data_out/utt2num_frames || exit 1
75 | rm $featdir/log/utt2num_frames.*
76 | 
77 | echo "$0: Succeeded creating bottleneck features with cvmn and vad for $name"
78 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_train_mt_nnet.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Train a network with nnet/lib/train_mt.py, which takes an alignment
 4 | # directory next to each of the training and validation sets.
 5 | # Waits until enough idle GPUs are visible on this machine, backs up the logs
 6 | # already present in <nnet>/log, activates the virtualenv $TF_ENV/$env and
 7 | # runs the job through $cmd, logging to <nnet>/log/train_nnet.log.
 8 | # TF_KALDI_ROOT and TF_ENV are read from the environment (presumably set by path.sh).
 9 | 
10 | cmd="run.pl"
11 | continue_training=false
12 | env=tf_gpu
13 | num_gpus=1
14 | 
15 | echo "$0 $@"
16 | 
17 | if [ -f path.sh ]; then . ./path.sh; fi
18 | . parse_options.sh || exit 1;
19 | 
20 | if [ $# != 8 ]; then
21 |   echo "Usage: $0 [options] <config> <train-dir> <train-ali-dir> <train-spklist> <valid-dir> <valid-ali-dir> <valid-spklist> <nnet>"
22 |   echo "Options:"
23 |   echo "  --continue-training <false>"
24 |   echo "  --env <tf_gpu>"
25 |   echo "  --num-gpus <n_gpus>"
26 |   exit 100
27 | fi
28 | 
29 | config=$1
30 | train=$2
31 | train_ali_dir=$3
32 | train_spklist=$4
33 | valid=$5
34 | valid_ali_dir=$6
35 | valid_spklist=$7
36 | nnetdir=$8
37 | 
38 | # add the library to the python path.
39 | export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH
40 | 
41 | mkdir -p "$nnetdir/log"
42 | 
43 | # -c is forwarded to the Python script only with --continue-training true.
44 | # Start from an empty value so nothing inherited from the environment leaks in.
45 | cmdopts=
46 | if [ "$continue_training" == 'true' ]; then
47 |   cmdopts="-c"
48 | fi
49 | 
50 | # Get available GPUs before we can train the network.
51 | num_total_gpus=$(nvidia-smi -L | wc -l)
52 | # Waiting can never succeed if more GPUs are requested than nvidia-smi lists,
53 | # so fail fast instead of sleeping forever.
54 | if [ "$num_gpus" -gt "$num_total_gpus" ]; then
55 |   echo "$0: requested $num_gpus GPU(s) but only $num_total_gpus available" >&2
56 |   exit 1
57 | fi
58 | num_gpus_assigned=0
59 | while [ "$num_gpus_assigned" -ne "$num_gpus" ]; do
60 |   num_gpus_assigned=0
61 |   for i in $(seq 0 $((num_total_gpus - 1))); do
62 |     # A GPU counts as idle when nvidia-smi reports no process running on it.
63 |     if nvidia-smi -i "$i" | grep "No running processes found" >/dev/null; then
64 |       num_gpus_assigned=$((num_gpus_assigned + 1))
65 |     fi
66 |     # once we have enough GPUs, break out of the loop
67 |     [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
68 |   done
69 |   [ "$num_gpus_assigned" -eq "$num_gpus" ] && break
70 |   # Not enough idle GPUs yet: poll again in five minutes.
71 |   sleep 300
72 | done
73 | 
74 | # Copy whatever is already in the log directory to <nnet>/.backup/log.
75 | if [ -d "$nnetdir/log" ] && [ "$(ls "$nnetdir/log" | wc -l)" -ge 1 ]; then
76 |   mkdir -p "$nnetdir/.backup/log"
77 |   cp "$nnetdir"/log/* "$nnetdir/.backup/log"
78 | fi
79 | 
80 | # Activate the gpu virtualenv
81 | # The tensorflow is installed using pip (virtualenv). Modify the code if you activate TF by other ways.
82 | # Limit the GPU number to what we want.
83 | # $cmdopts is deliberately unquoted: it expands to nothing when empty.
84 | source "$TF_ENV/$env/bin/activate"
85 | $cmd "$nnetdir/log/train_nnet.log" utils/parallel/limit_num_gpus.sh --num-gpus "$num_gpus" \
86 |     python nnet/lib/train_mt.py $cmdopts --config "$config" "$train" "$train_ali_dir" "$train_spklist" "$valid" "$valid_ali_dir" "$valid_spklist" "$nnetdir"
87 | status=$?
88 | deactivate
89 | 
90 | # Report the job's status to the caller rather than always claiming success.
91 | exit $status
92 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/lib/train_insight.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import random
 4 | import sys
 5 | import numpy, scipy, sklearn
 6 | import tensorflow as tf
 7 | import numpy as np
 8 | from misc.utils import save_codes_and_config, compute_cos_pairwise_eer
 9 | from model.trainer import Trainer
10 | from dataset.data_loader import KaldiDataRandomQueue
11 | from dataset.kaldi_io import FeatureReader
12 | 
13 | 
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument("data_dir", type=str, help="The data directory of the dataset.")
16 | parser.add_argument("data_spklist", type=str, help="The spklist maps the speakers to the indices.")
17 | parser.add_argument("model", type=str, help="The output model directory.")
18 | 
19 | 
20 | if __name__ == '__main__':
21 |     tf.logging.set_verbosity(tf.logging.INFO)
22 |     args = parser.parse_args()
23 |     params = save_codes_and_config(True, args.model, None)
24 | 
25 |     # The model directory always has a folder named nnet
26 |     model_dir = os.path.join(args.model, "nnet")
27 | 
28 |     # Set the random seed. The random operations may appear in data input, batch forming, etc.
29 |     tf.set_random_seed(params.seed)
30 |     random.seed(params.seed)
31 |     np.random.seed(params.seed)
32 | 
33 |     dim = FeatureReader(args.data_dir).get_dim()
34 |     with open(args.data_spklist, 'r') as f:
35 |         num_total_train_speakers = len(f.readlines())
36 |     trainer = Trainer(params, args.model)
37 |     trainer.build("valid",
38 |                   dim=dim,
39 |                   loss_type=params.loss_func,
40 |                   num_speakers=num_total_train_speakers)
41 |     # valid_loss, valid_embeddings, valid_labels = trainer.valid(args.data_dir, args.data_spklist,
42 |     #                                                            batch_type=params.batch_type,
43 |     #                                                            output_embeddings=True)
44 | 
45 |     valid_loss, valid_embeddings, valid_labels = trainer.insight(args.data_dir, args.data_spklist,
46 |                                                      batch_type=params.batch_type,
47 |                                                      output_embeddings=True)
48 |     eer = compute_cos_pairwise_eer(valid_embeddings, valid_labels)
49 |     tf.logging.info("EER: %f" % eer)
50 |     trainer.close()
51 | 


--------------------------------------------------------------------------------
/scripts/prepare_feats_for_multitask_egs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Apache 2.0.
 4 | #
 5 | # Apply sliding-window CMVN to the input features and store the result as
 6 | # ark/scp pairs in <feat-dir>. No frames are removed; vad.scp is only copied
 7 | # to <out-data-dir> together with utt2spk, spk2utt, wav.scp (and segments when
 8 | # it exists). <out-data-dir> also receives feats.scp and utt2num_frames.
 9 | 
10 | nj=40
11 | cmd="run.pl"
12 | stage=0
13 | norm_vars=false
14 | center=true
15 | compress=true
16 | cmn_window=300
17 | 
18 | echo "$0 $@"  # Print the command line for logging
19 | 
20 | if [ -f path.sh ]; then . ./path.sh; fi
21 | . parse_options.sh || exit 1;
22 | if [ $# != 3 ]; then
23 |   echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
24 |   echo "e.g.: $0 data/train data/train_wcmvn exp/make_xvector_features"
25 |   echo "Options: "
26 |   echo "  --nj <nj>                                        # number of parallel jobs"
27 |   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
28 |   echo "  --norm-vars <true|false>                         # If true, normalize variances in the sliding window cmvn"
29 |   echo "  --center <true|false>                            # Value passed to apply-cmvn-sliding --center"
30 |   echo "  --cmn-window <n>                                 # Value passed to apply-cmvn-sliding --cmn-window"
31 |   exit 1;
32 | fi
33 | 
34 | data_in=$1
35 | data_out=$2
36 | dir=$3
37 | 
38 | name=$(basename $data_in)
39 | 
40 | for f in $data_in/feats.scp $data_in/vad.scp ; do
41 |   [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
42 | done
43 | 
44 | # Set various variables.
45 | mkdir -p $dir/log
46 | mkdir -p $data_out
47 | featdir=$(utils/make_absolute.sh $dir)
48 | 
49 | cp $data_in/utt2spk $data_out/utt2spk
50 | cp $data_in/spk2utt $data_out/spk2utt
51 | cp $data_in/wav.scp $data_out/wav.scp
52 | [ -f $data_in/segments ] && cp $data_in/segments $data_out/segments
53 | [ -f $data_in/vad.scp ] && cp $data_in/vad.scp $data_out/vad.scp
54 | 
55 | # Each job also writes the per-utterance frame counts of its share.
56 | write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB"
57 | 
58 | sdata_in=$data_in/split$nj;
59 | utils/split_data.sh $data_in $nj || exit 1;
60 | 
61 | # Honor --norm-vars and --center instead of hard-coding their defaults, so
62 | # that the documented options actually take effect.
63 | $cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \
64 |   apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=$cmn_window \
65 |   scp:${sdata_in}/JOB/feats.scp ark:- \| \
66 |   copy-feats --compress=$compress $write_num_frames_opt ark:- \
67 |   ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1;
68 | 
69 | # Merge the per-job feature lists, in job order, into one feats.scp.
70 | for n in $(seq $nj); do
71 |   cat $featdir/xvector_feats_${name}.$n.scp || exit 1;
72 | done > ${data_out}/feats.scp || exit 1
73 | 
74 | # Merge the per-job frame counts the same way, then drop the partial files.
75 | for n in $(seq $nj); do
76 |   cat $featdir/log/utt2num_frames.$n || exit 1;
77 | done > $data_out/utt2num_frames || exit 1
78 | rm $featdir/log/utt2num_frames.*
79 | 
80 | echo "$0: Succeeded creating xvector features for $name"
81 | 


--------------------------------------------------------------------------------
/scripts/lmrescore_const_arpa.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2014  Guoguo Chen
 4 | # Apache 2.0
 5 | 
 6 | # This script rescores lattices with the ConstArpaLm format language model.
 7 | # The old LM score (<old-lang-dir>/G.fst) is removed with scale -1.0 and the
 8 | # new one (<new-lang-dir>/G.carpa) is added with scale 1.0; the rescored
 9 | # lattices are written to <output-decode-dir>/lat.JOB.gz. Unless
10 | # --skip-scoring true is given, scripts/diagnostic/score.sh is run afterwards.
11 | 
12 | # Begin configuration section.
13 | cmd=run.pl
14 | skip_scoring=false
15 | stage=1
16 | scoring_opts=
17 | # End configuration section.
18 | 
19 | echo "$0 $@"  # Print the command line for logging
20 | 
21 | . ./utils/parse_options.sh
22 | 
23 | if [ $# != 5 ]; then
24 |    echo "Does language model rescoring of lattices (remove old LM, add new LM)"
25 |    echo "Usage: $0 [options] <old-lang-dir> <new-lang-dir> \\"
26 |    echo "                   <data-dir> <input-decode-dir> <output-decode-dir>"
27 |    echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
28 |    exit 1;
29 | fi
30 | 
31 | [ -f path.sh ] && . ./path.sh;
32 | 
33 | oldlang=$1
34 | newlang=$2
35 | data=$3
36 | indir=$4
37 | outdir=$5
38 | 
39 | oldlm=$oldlang/G.fst
40 | newlm=$newlang/G.carpa
41 | # Differing word lists only produce warnings; rescoring still proceeds.
42 | ! cmp $oldlang/words.txt $newlang/words.txt &&\
43 |   echo "$0: Warning: vocabularies may be incompatible."
44 | [ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
45 | [ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1;
46 | ! ls $indir/lat.*.gz >/dev/null &&\
47 |   echo "$0: No lattices input directory $indir" && exit 1;
48 | 
49 | if ! cmp -s $oldlang/words.txt $newlang/words.txt; then
50 |   echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing.";
51 | fi
52 | 
53 | # Remove lattices left in the output directory by an earlier run.
54 | rm -f $outdir/lat.*.gz
55 | 
56 | # The old LM is read through fstproject, as a piped input for lattice-lmrescore.
57 | oldlmcommand="fstproject --project_output=true $oldlm |"
58 | mkdir -p $outdir/log
59 | nj=$(cat $indir/num_jobs) || exit 1;
60 | cp $indir/num_jobs $outdir
61 | 
62 | if [ $stage -le 1 ]; then
63 |   $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
64 |     lattice-lmrescore --lm-scale=-1.0 \
65 |     "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:-  \| \
66 |     lattice-lmrescore-const-arpa --lm-scale=1.0 \
67 |     ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
68 | fi
69 | 
70 | if ! $skip_scoring && [ $stage -le 2 ]; then
71 |   # The message names the script that is actually checked and run below.
72 |   err_msg="Not scoring because scripts/diagnostic/score.sh does not exist or not executable."
73 |   [ ! -x scripts/diagnostic/score.sh ] && echo "$err_msg" && exit 1;
74 |   scripts/diagnostic/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir
75 | else
76 |   echo "Not scoring because requested so..."
77 | fi
78 | 
79 | exit 0;


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/lib/train_mt_lr_learning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import random
 4 | import sys
 5 | import tensorflow as tf
 6 | import numpy as np
 7 | from misc.utils import ValidLoss, load_valid_loss, save_codes_and_config, compute_cos_pairwise_eer
 8 | from dataset.multitask.data_loader_v2 import KaldiDataRandomQueueV2
 9 | from dataset.kaldi_io import FeatureReaderV2
10 | from six.moves import range
11 | 
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("-c", "--cont", action="store_true", help="Continue training from an existing model.")
14 | parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.")
15 | parser.add_argument("--config", type=str, help="The configuration file.")
16 | parser.add_argument("train_data_dir", type=str, help="The data directory of the training set.")
17 | parser.add_argument("train_ali_dir", type=str, help="The ali directory of the training set.")
18 | parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.")
19 | parser.add_argument("model", type=str, help="The output model directory.")
20 | 
21 | 
22 | if __name__ == '__main__':
23 |     tf.logging.set_verbosity(tf.logging.INFO)
24 |     args = parser.parse_args()
25 |     params = save_codes_and_config(False, args.model, args.config)
26 | 
27 |     # The model directory always has a folder named nnet
28 |     model_dir = os.path.join(args.model, "nnet")
29 | 
30 |     # Set the random seed. The random operations may appear in data input, batch forming, etc.
31 |     tf.set_random_seed(params.seed)
32 |     random.seed(params.seed)
33 |     np.random.seed(params.seed)
34 | 
35 |     start_epoch = 0
36 | 
37 |     feat_reader = FeatureReaderV2(args.train_data_dir, args.train_ali_dir)
38 |     dim = feat_reader.get_dim()
39 | 
40 |     feat_reader = KaldiDataRandomQueueV2(args.train_data_dir, args.train_ali_dir, args.train_spklist)
41 |     num_total_speakers = feat_reader.num_total_speakers
42 |     num_total_phones = feat_reader.num_total_phones
43 | 
44 |     from model.multitask_v1.base_v1 import BaseMT
45 | 
46 |     trainer = BaseMT(params, args.model, dim, num_total_speakers, num_total_phones)
47 |     trainer.build("train")
48 |     trainer.train_tune_lr(args.train_data_dir, args.train_ali_dir, args.train_spklist, args.tune_period)
49 |     trainer.close()
50 |     tf.logging.info("Finish tuning.")
51 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/lib/train_vae_lr_learning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import random
 4 | import tensorflow as tf
 5 | import numpy as np
 6 | from misc.utils import ValidLoss, save_codes_and_config
 7 | from dataset.data_loader import KaldiDataRandomQueueV2
 8 | from dataset.kaldi_io import FeatureReaderV2
 9 | 
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.")
12 | parser.add_argument("--config", type=str, help="The configuration file.")
13 | parser.add_argument("train_data_dir", type=str, help="The data directory of the training set.")
14 | parser.add_argument("train_ali_dir", type=str, help="The ali directory of the training set.")
15 | parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.")
16 | parser.add_argument("model", type=str, help="The output model directory.")
17 | 
18 | 
19 | if __name__ == '__main__':
20 |     tf.logging.set_verbosity(tf.logging.INFO)
21 |     args = parser.parse_args()
22 |     params = save_codes_and_config(False, args.model, args.config)
23 | 
24 |     # The model directory always has a folder named nnet
25 |     model_dir = os.path.join(args.model, "nnet")
26 | 
27 |     # Set the random seed. The random operations may appear in data input, batch forming, etc.
28 |     tf.set_random_seed(params.seed)
29 |     random.seed(params.seed)
30 |     np.random.seed(params.seed)
31 | 
32 |     start_epoch = 0
33 | 
34 |     feat_reader = FeatureReaderV2(args.train_data_dir, args.train_ali_dir)
35 |     dim = feat_reader.get_dim()
36 |     feat_reader = KaldiDataRandomQueueV2(args.train_data_dir, args.train_ali_dir, args.train_spklist)
37 |     num_total_speakers = feat_reader.num_total_speakers
38 |     num_total_phones = feat_reader.num_total_phones
39 |     min_valid_loss = ValidLoss()
40 | 
41 |     from model.vae.base_v1 import BaseMT
42 |     trainer = BaseMT(params, args.model, dim, num_total_speakers, num_total_phones)
43 |     trainer.build("train")
44 | 
45 |     # You can tune the learning rate using the following function.
46 |     # After training, you should plot the loss v.s. the learning rate and pich a learning rate that decrease the
47 |     # loss fastest.
48 |     trainer.train_tune_lr(args.train_data_dir, args.train_ali_dir, args.train_spklist, args.tune_period)
49 |     trainer.close()
50 |     tf.logging.info("Finish tuning.")
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib64/
 18 | parts/
 19 | sdist/
 20 | var/
 21 | wheels/
 22 | share/python-wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .nox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | db.sqlite3
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # IPython
 77 | profile_default/
 78 | ipython_config.py
 79 | 
 80 | # pyenv
 81 | .python-version
 82 | 
 83 | # celery beat schedule file
 84 | celerybeat-schedule
 85 | 
 86 | # SageMath parsed files
 87 | *.sage.py
 88 | 
 89 | # Environments
 90 | .env
 91 | .venv
 92 | env/
 93 | venv/
 94 | ENV/
 95 | env.bak/
 96 | venv.bak/
 97 | 
 98 | # Spyder project settings
 99 | .spyderproject
100 | .spyproject
101 | 
102 | # Rope project settings
103 | .ropeproject
104 | 
105 | # mkdocs documentation
106 | /site
107 | 
108 | # mypy
109 | .mypy_cache/
110 | .dmypy.json
111 | dmypy.json
112 | 
113 | # Pyre type checker
114 | .pyre/
115 | 
116 | .idea/
117 | 
118 | # MacOS
119 | .DS_Store
120 | 
121 | # Matlab
122 | *.m~
123 | ._*
124 | 
125 | ._.DS_Store*
126 | 
127 | # backup
128 | egs/voxceleb/v1/nnet_conf.bak/*
129 | egs/voxceleb/v2/nnet_conf.bak/*
130 | egs/sre/v1/nnet_conf.bak/*
131 | egs/sre/v1/nnet_conf/test.json
132 | egs/mgb*
133 | egs/leap*
134 | egs/fisher/v1/nnet_conf.bak/*
135 | egs/fisher/v2*
136 | 
137 | # Unused 
138 | egs/voxceleb/nnet/run_extract_bnf_mi_embeddings.sh
139 | egs/voxceleb/nnet/run_train_mi_nnet.sh
140 | egs/voxceleb/nnet/wrap/extract_mi_wrapper.sh
141 | scripts/prepare_bnfeats_for_egs.sh
142 | scripts/extract_bnf.sh
143 | 
144 | # intermediate files
145 | misc/tools/score*
146 | misc/tuning*
147 | 
148 | *.png
149 | 


--------------------------------------------------------------------------------
/model/multitask_v1/common.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | def make_phone_masks(length, resample, num_frames_per_utt):
 5 |     """Randomly select frames for each utterance.
 6 | 
 7 |     Args:
 8 |         length: The length of each utterance.
 9 |         resample: If 0, return the beginning frame; otherwise random select a frame.
10 |                   resample is designed to try to make every frame has the same probability to be sampled.
11 |         num_frames_per_utt: #frames selected. if -1, then select all frames
12 |     :return: a mat with [n_selected_frames, 2], each row is the index of the selected frame
13 |     """
14 |     n_utts = length.shape[0]
15 | 
16 |     # This sampling strategy will make the sampling probability of each frame the same
17 |     if num_frames_per_utt == -1:
18 |         mat = []
19 |         for i in range(n_utts):
20 |             for j in range(length[i]):
21 |                 mat.append([i, j])
22 |         mat = np.array(mat, dtype=np.int32)
23 |     else:
24 |         # # Uniform sampling
25 |         # mat = np.zeros((length.shape[0] * num_frames_per_utt, 2), dtype=np.int32)
26 |         # assert num_frames_per_utt > 0, "The num of frames should be greater than 0 (or -1)"
27 |         # for i in range(n_utts):
28 |         #     mat[i * num_frames_per_utt:(i+1) * num_frames_per_utt, 0] = i
29 |         #     if resample[i] == 1:
30 |         #         # Resample the last segment
31 |         #         tmp = []
32 |         #         for _ in range(num_frames_per_utt):
33 |         #             while True:
34 |         #                 a = np.random.randint(0, length[i], dtype=np.int32)
35 |         #                 if a not in tmp:
36 |         #                     tmp.append(a)
37 |         #                     break
38 |         #         mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 1] = tmp
39 |         #     else:
40 |         #         mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 1] = np.arange(num_frames_per_utt, dtype=np.int32)
41 | 
42 |         # Totally random sampling (the central frames will get higher sampling probabilities)
43 |         mat = np.zeros((length.shape[0] * num_frames_per_utt, 2), dtype=np.int32)
44 |         for i in range(n_utts):
45 |             mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 0] = i
46 |             # Resample the last segment
47 |             tmp = []
48 |             for _ in range(num_frames_per_utt):
49 |                 while True:
50 |                     a = np.random.randint(0, length[i], dtype=np.int32)
51 |                     if a not in tmp:
52 |                         tmp.append(a)
53 |                         break
54 |             mat[i * num_frames_per_utt:(i + 1) * num_frames_per_utt, 1] = tmp
55 | 
56 |     return mat
57 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/lib/train_lr_learning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import random
 4 | import sys
 5 | import tensorflow as tf
 6 | import numpy as np
 7 | from misc.utils import ValidLoss, load_valid_loss, save_codes_and_config
 8 | from model.trainer import Trainer
 9 | from dataset.data_loader import KaldiDataRandomQueue
10 | from dataset.kaldi_io import FeatureReader
11 | 
# Command line of the learning-rate range test for training from scratch.
# NOTE(review): valid_dir and valid_spklist are required positionals but are not read below;
# they are kept so existing launch scripts keep working.
parser = argparse.ArgumentParser()
parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.")
parser.add_argument("--config", type=str, help="The configuration file.")
parser.add_argument("train_dir", type=str, help="The data directory of the training set.")
parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.")
parser.add_argument("valid_dir", type=str, help="The data directory of the validation set.")
parser.add_argument("valid_spklist", type=str, help="The spklist maps the VALID speakers to the indices.")
parser.add_argument("model", type=str, help="The output model directory.")


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    args = parser.parse_args()
    # The returned params object supplies seed and loss_func below.
    params = save_codes_and_config(False, args.model, args.config)

    # The model directory always has a folder named nnet
    model_dir = os.path.join(args.model, "nnet")

    # Set the random seed. The random operations may appear in data input, batch forming, etc.
    tf.set_random_seed(params.seed)
    random.seed(params.seed)
    np.random.seed(params.seed)

    # Record the input feature dimension next to the model.
    dim = FeatureReader(args.train_dir).get_dim()
    with open(os.path.join(model_dir, "feature_dim"), "w") as f:
        f.write("%d\n" % dim)

    num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
    tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

    # The trainer is used to control the training process.
    # Both graphs are built with the number of TRAINING speakers.
    trainer = Trainer(params, args.model)
    trainer.build("train",
                  dim=dim,
                  loss_type=params.loss_func,
                  num_speakers=num_total_train_speakers)
    trainer.build("valid",
                  dim=dim,
                  loss_type=params.loss_func,
                  num_speakers=num_total_train_speakers)

    # You can tune the learning rate using the following function.
    # After training, you should plot the loss vs. the learning rate and pick a learning rate that decreases the
    # loss fastest.
    trainer.train_tune_lr(args.train_dir, args.train_spklist, args.tune_period)
    trainer.close()
    tf.logging.info("Finish tuning.")
64 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_extract_mt_phone_embeddings.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | nj=32
 4 | use_gpu=false
 5 | cmd="run.pl"
 6 | min_chunk_size=25
 7 | chunk_size=10000
 8 | stage=0
 9 | normalize=false
10 | checkpoint=-1
11 | env=tf_cpu
12 | node="output"
13 | compress=true
14 | 
15 | echo "$0 $@"
16 | 
17 | if [ -f path.sh ]; then . ./path.sh; fi
18 | . parse_options.sh || exit 1;
19 | 
20 | if [ $# != 4 ]; then
21 |   echo "Usage: $0 [options] <nnet-dir> <data> <ali-dir> <embeddings-dir>"
22 |   echo "Options:"
23 |   echo "  --use-gpu <false>"
24 |   echo "  --nj <32>"
25 |   echo "  --min-chunk-size <25>"
26 |   echo "  --chunk-size <10000>"
27 |   echo "  --normalize <false>"
28 |   echo "  --checkpoint <-1>"
29 |   echo "  --node <output>"
30 |   echo "  --compress <true>"
31 |   echo ""
32 |   exit 100
33 | fi
34 | 
35 | nnetdir=$1
36 | data=$2
37 | alidir=$3
38 | dir=$4
39 | 
40 | for f in $nnetdir/nnet/checkpoint $data/feats.scp $data/vad.scp $alidir/pdf.scp; do
41 |   [ ! -f $f ] && echo "No such file $f" && exit 1;
42 | done
43 | 
44 | mkdir -p $dir/log
45 | 
46 | utils/split_data.sh $data $nj
47 | echo "$0: extracting embeddings for $data"
48 | sdata=$data/split$nj/JOB
49 | 
50 | # Filter the alignments to match the feats.
51 | utils/filter_scps.pl JOB=1:$nj \
52 |   $data/split${nj}/JOB/utt2spk $alidir/pdf.scp $data/split${nj}/JOB/pdf.scp || exit 1;
53 | 
54 | feat="ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:${sdata}/feats.scp ark:- |"
55 | 
56 | # I use conda to load TF (in cpu case), so some preparations are applied before python. So a wrapper make things more flexible.
57 | # If no conda is used, simply set "--use-env false"
58 | if [ $stage -le 0 ]; then
59 |   echo "$0: extracting xvectors from nnet"
60 |   echo "$0: embedding from node $node"
61 | 
62 |   # Set the checkpoint.
63 |   source $TF_ENV/$env/bin/activate
64 |   export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH
65 |   python nnet/lib/make_checkpoint.py --checkpoint $checkpoint "$nnetdir"
66 |   deactivate
67 | 
68 |   if $use_gpu; then
69 |     echo "Using CPU to do inference is a better choice."
70 |     exit 1
71 | #    $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \
72 | #      nnet/wrap/extract_wrapper.sh --gpuid JOB --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size --normalize $normalize \
73 | #        "$nnetdir" "$feat" "ark:| copy-vector ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp"
74 |   else
75 |     $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \
76 |       nnet/wrap/extract_mt_phone_wrapper.sh --gpuid -1 --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size \
77 |         --normalize $normalize --node $node \
78 |         "$nnetdir" "$feat" ${sdata}/pdf.scp "ark:| copy-feats --compress=$compress ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp"
79 | 
80 |   fi
81 | fi
82 | 
83 | if [ $stage -le 1 ]; then
84 |   echo "$0: combining xvectors across jobs"
85 |   for j in $(seq $nj); do cat $dir/xvector.$j.scp; done > $dir/xvector.scp || exit 1;
86 | fi
87 | 
88 | exit 0


--------------------------------------------------------------------------------
/misc/tuning/tune_lr.m:
--------------------------------------------------------------------------------
  1 | clear
  2 | close all;
  3 | 
  4 | lr = [0.000010
  5 | 0.000012
  6 | 0.000013
  7 | 0.000015
  8 | 0.000017
  9 | 0.000020
 10 | 0.000023
 11 | 0.000027
 12 | 0.000031
 13 | 0.000035
 14 | 0.000040
 15 | 0.000047
 16 | 0.000054
 17 | 0.000062
 18 | 0.000071
 19 | 0.000081
 20 | 0.000094
 21 | 0.000108
 22 | 0.000124
 23 | 0.000142
 24 | 0.000164
 25 | 0.000188
 26 | 0.000216
 27 | 0.000249
 28 | 0.000286
 29 | 0.000329
 30 | 0.000379
 31 | 0.000435
 32 | 0.000501
 33 | 0.000576
 34 | 0.000662
 35 | 0.000761
 36 | 0.000876
 37 | 0.001007
 38 | 0.001158
 39 | 0.001332
 40 | 0.001532
 41 | 0.001761
 42 | 0.002025
 43 | 0.002329
 44 | 0.002679
 45 | 0.003080
 46 | 0.003542
 47 | 0.004074
 48 | 0.004685
 49 | 0.005388
 50 | 0.006196
 51 | 0.007125
 52 | 0.008194
 53 | 0.009423
 54 | 0.010837
 55 | 0.012462
 56 | 0.014331
 57 | 0.016481
 58 | 0.018953
 59 | 0.021796
 60 | 0.025066
 61 | 0.028826
 62 | 0.033149
 63 | 0.038122
 64 | 0.043840
 65 | 0.050416
 66 | 0.057978
 67 | 0.066675
 68 | 0.076676
 69 | 0.088178
 70 | 0.101405
 71 | 0.116615
 72 | 0.134108
 73 | 0.154224
 74 | 0.177357
 75 | 0.203961
 76 | 0.234555
 77 | 0.269738
 78 | 0.310199
 79 | 0.356729
 80 | 0.410238
 81 | 0.471774
 82 | 0.542540
 83 | 0.623921
 84 | 0.717509
 85 | 0.825135
 86 | 0.948905
 87 | 1.091241
 88 | 1.254927
 89 | 1.443166
 90 | 1.659641
 91 | 1.908588
 92 | 2.194876
 93 | 2.524107
 94 | 2.902723
 95 | 3.338132
 96 | 3.838852
 97 | 4.414679
 98 | 5.076881
 99 | 5.838413
100 | 6.714175
101 | 7.721302
102 | 8.879497
103 | 10.211421];
104 | 
105 | loss = [7.862374
106 | 7.870405
107 | 7.771949
108 | 7.787009
109 | 7.566071
110 | 7.733312
111 | 6.704276
112 | 7.570509
113 | 6.750152
114 | 7.261982
115 | 6.866084
116 | 6.672805
117 | 6.590648
118 | 6.298755
119 | 6.757052
120 | 6.595728
121 | 6.486756
122 | 5.642969
123 | 6.621517
124 | 6.393176
125 | 6.472243
126 | 6.267687
127 | 6.596249
128 | 6.058064
129 | 6.151696
130 | 6.340888
131 | 5.645424
132 | 6.459932
133 | 6.390144
134 | 5.754430
135 | 5.931551
136 | 5.213816
137 | 5.011546
138 | 6.196012
139 | 5.601851
140 | 4.494273
141 | 5.674572
142 | 5.236257
143 | 5.222935
144 | 5.152613
145 | 5.424874
146 | 4.766945
147 | 4.949891
148 | 4.694318
149 | 4.824037
150 | 4.918430
151 | 4.113710
152 | 4.040040
153 | 4.822907
154 | 3.912708
155 | 4.655045
156 | 4.444558
157 | 4.385447
158 | 4.390773
159 | 4.715075
160 | 4.955003
161 | 4.506167
162 | 4.952337
163 | 4.916849
164 | 4.933915
165 | 5.132740
166 | 4.584652
167 | 5.110647
168 | 5.910470
169 | 5.527468
170 | 5.851896
171 | 5.173183
172 | 5.006588
173 | 5.238864
174 | 6.415123
175 | 5.679238
176 | 6.092204
177 | 5.951892
178 | 6.053728
179 | 5.838257
180 | 6.347813
181 | 5.253940
182 | 5.873345
183 | 5.180672
184 | 6.765231
185 | 6.544772
186 | 6.581923
187 | 6.521677
188 | 6.496094
189 | 6.449677
190 | 6.650800
191 | 6.242509
192 | 6.709395
193 | 6.472134
194 | 6.652347
195 | 6.052146
196 | 7.097000
197 | 7.214063
198 | 6.960054
199 | 6.783081
200 | 6.404983
201 | 6.553833
202 | 6.387044
203 | 7.082532
204 | 6.591753
205 | ];
206 | 
% Plot the loss and its smoothed slope against the learning rate (log x-axis).
% sma is the step (in samples) of the finite difference.
sma = 1;
derivatives = (loss(1+sma:end) - loss(1:end-sma))/sma;
% 5-tap moving average to suppress the noise of the per-step loss.
derivatives = filter(ones(1,5)/5,1,derivatives);
figure();
semilogx(lr, loss);
figure();
% derivatives has numel(loss)-sma entries; index lr the same way so the two
% vectors stay aligned for any sma (identical to lr(2:end) when sma == 1).
semilogx(lr(1+sma:end), derivatives)
214 | 
215 | 


--------------------------------------------------------------------------------
/scripts/extract_bnf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ./cmd.sh
 4 | . ./path.sh
 5 | set -e
 6 | 
 7 | # Begin configuration section.
 8 | stage=0
 9 | nj=30
10 | cmd="run.pl"
11 | use_gpu=false
12 | compress=true
13 | # End configuration options.
14 | 
15 | echo "$0 $@"  # Print the command line for logging
16 | 
17 | if [ -f path.sh ]; then . ./path.sh; fi
18 | . parse_options.sh || exit 1;
19 | 
20 | if [ $# != 5 ]; then
21 |   echo "Usage: $0 <nnet-dir> <output-node> <input-data> <output-data> <dir>"
22 |   echo " e.g.: $0 exp/nnet data/train data/train_bn exp/train_bn"
23 |   echo "main options (for others, see top of script file)"
24 |   echo "  --config <config-file>                           # config containing options"
25 |   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
26 |   echo "  --use-gpu <bool|false>                           # If true, use GPU."
27 |   echo "  --nj <n|30>                                      # Number of jobs"
28 |   echo "  --stage <stage|0>                                # To control partial reruns"
29 |   echo "  --compress <true>"
30 |   exit 1
31 | fi
32 | 
33 | srcdir=$1
34 | output_node=$2
35 | data=$3
36 | bnf_data=$4
37 | dir=$5
38 | 
39 | for f in $srcdir/final.mdl $data/feats.scp; do
40 |   [ ! -f $f ] && echo "No such file $f" && exit 1;
41 | done
42 | 
43 | cmvn_opts=`cat $srcdir/cmvn_opts`
44 | name=`basename $data`
45 | sdata=$data/split$nj
46 | utils/split_data.sh $data $nj
47 | 
48 | mkdir -p $dir/log
49 | mkdir -p $bnf_data
50 | 
51 | echo "$0: extracting bottleneck features for $data"
52 | 
53 | echo "$0: Generating bottleneck features using $srcdir/final.mdl as output of "
54 | echo "    component-node with name $output_node."
55 | echo "output-node name=output input=$output_node" > $dir/extract.config
56 | 
57 | raw_nnet="nnet3-am-copy --raw=true $srcdir/final.mdl - | nnet3-copy --nnet-config=$dir/extract.config - - |"
58 | # Set up the features
59 | # The feature processing pipeline:
60 | # apply-cmvn --norm-means=true --norm-vars=false --utt2spk=xxx scp:xxx scp:xxx ark:xxx
61 | feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
62 | 
63 | if [ $stage -le 0 ]; then
64 |   echo "$0: extracting xvectors from nnet"
65 |   if $use_gpu; then
66 |     echo "Set use_gpu=false"
67 |     exit 1
68 |   else
69 |     $cmd JOB=1:$nj $dir/log/extract.JOB.log \
70 |       nnet3-compute --use-gpu=no "$raw_nnet" "$feats" ark:- \| \
71 |         copy-feats --compress=$compress ark:- ark,scp:$dir/raw_bnfeat_$name.JOB.ark,$dir/raw_bnfeat_$name.JOB.scp || exit 1;
72 |   fi
73 | fi
74 | 
75 | N0=$(cat $data/feats.scp | wc -l)
76 | N1=$(cat $dir/raw_bnfeat_$name.*.scp | wc -l)
77 | if [[ "$N0" != "$N1" ]]; then
78 |   echo "$0: Error happens when generating bottleneck features for $name (Original:$N0  BNF:$N1)"
79 |   exit 1;
80 | fi
81 | 
82 | # Concatenate feats.scp into bnf_data
83 | for n in $(seq $nj); do
84 |   cat $dir/raw_bnfeat_$name.$n.scp
85 | done > $bnf_data/feats.scp
86 | 
87 | for f in segments spk2utt spk2gender text utt2spk wav.scp vad.scp utt2num_frames char.stm glm kws reco2file_and_channel stm; do
88 |   [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f
89 | done
90 | 
91 | if [ $stage -le 1 ]; then
92 |   echo "$0: computing CMVN stats."
93 |   steps/compute_cmvn_stats.sh $bnf_data $dir/log $dir
94 | fi
95 | 
96 | echo "$0: done making bottleneck features."
97 | 
98 | exit 0;


--------------------------------------------------------------------------------
/model/multitask_v1/pooling.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from model.common import shape_list, dense_relu, dense_tanh, split_heads, combine_last_two_dimensions
 3 | import sys
 4 | 
 5 | 
 6 | VAR2STD_EPSILON = 1e-12
 7 | 
 8 | 
 9 | def statistics_pooling_v2(features, feat_length, endpoints, params, is_training):
10 |     """Statistics pooling
11 |     Note that we need to take care of the zeros in the variance since the sqrt on 0 will lead to NaN.
12 | 
13 |     Args:
14 |         features: A tensor with shape [batch, length, dim].
15 |         feat_length: The length of each utterance.
16 |         endpoints: Outputs of different parts of the network.
17 |         params:
18 |         is_training:
19 |     :return:
20 |         Statistics pooling result [mean, stddev] with shape [batch, dim].
21 |     """
22 |     with tf.variable_scope("stat_pooling"):
23 |         feat_shape = shape_list(features)
24 |         frame_index = tf.tile(tf.expand_dims(tf.range(feat_shape[1]), axis=0), [feat_shape[0], 1])
25 |         feat_length = tf.expand_dims(feat_length, axis=1)
26 |         feat_length_new = tf.tile(feat_length, [1, feat_shape[1]])
27 |         mask = tf.expand_dims(tf.to_float(tf.less(frame_index, feat_length_new)), axis=2)
28 |         feat_length = tf.to_float(tf.expand_dims(feat_length, axis=2))
29 |         mean = tf.reduce_sum(features * mask, axis=1, keep_dims=True) / (feat_length + 1e-16)
30 |         variance = tf.reduce_sum(tf.squared_difference(features, mean) * mask, axis=1, keep_dims=True) / (feat_length + 1e-16)
31 | 
32 |         mean = tf.squeeze(mean, 1)
33 |         variance = tf.squeeze(variance, 1)
34 | 
35 |         mask = tf.to_float(tf.less_equal(variance, VAR2STD_EPSILON))
36 |         variance = (1.0 - mask) * variance + mask * VAR2STD_EPSILON
37 |         stddev = tf.sqrt(variance)
38 |         stat_pooling = tf.concat([mean, stddev], 1, name="concat")
39 | 
40 |     return stat_pooling
41 | 
42 | 
if __name__ == "__main__":
    # Self-check: compare the TF pooling graph against a plain numpy reference.
    from collections import OrderedDict
    from misc.utils import ParamsPlain
    import numpy as np

    num_labels = 10
    num_data = 100
    num_length = 1000
    num_dim = 1500

    features = tf.placeholder(tf.float32, shape=[None, None, num_dim], name="features")
    feat_length = tf.placeholder(tf.int32, shape=[None], name="feat_length")
    endpoints = OrderedDict()
    params = ParamsPlain()

    stat_pooling = statistics_pooling_v2(features, feat_length, endpoints, params, True)

    def compute_stat_pooling(features, length):
        """Numpy reference: per-utterance mean and floored stddev over the first length[i] frames."""
        num_data, l, dim = features.shape
        assert num_data == length.shape[0]
        first_moment = np.zeros((num_data, dim))
        second_moment = np.zeros((num_data, dim))
        for utt in range(num_data):
            n_frames = length[utt]
            for t in range(n_frames):
                first_moment[utt, :] += features[utt, t, :]
                second_moment[utt, :] += np.square(features[utt, t, :])
            first_moment[utt, :] /= n_frames
            second_moment[utt, :] /= n_frames
            # var = E[x^2] - E[x]^2, floored at 1e-12 before the sqrt.
            second_moment[utt, :] = np.sqrt(
                np.maximum(second_moment[utt, :] - np.square(first_moment[utt, :]), 1e-12))
        return np.concatenate([first_moment, second_moment], axis=1)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        features_val = np.random.rand(num_data, num_length, num_dim).astype(np.float32)
        # An all-zero utterance exercises the variance floor.
        features_val[0, :, :] = 0
        length_val = np.random.randint(100, 1001, size=(num_data))

        feed = {features: features_val, feat_length: length_val}
        stat_pooling_tf = sess.run(stat_pooling, feed_dict=feed)

        stat_pooling_np = compute_stat_pooling(features_val, length_val)
        assert np.allclose(stat_pooling_tf, stat_pooling_np)
84 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/lib/finetune_lr_learning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import random
 4 | import sys
 5 | import tensorflow as tf
 6 | import numpy as np
 7 | from misc.utils import get_pretrain_model
 8 | from misc.utils import ValidLoss, save_codes_and_config, compute_cos_pairwise_eer
 9 | from model.trainer import Trainer
10 | from dataset.data_loader import KaldiDataRandomQueue
11 | from dataset.kaldi_io import FeatureReader
12 | from six.moves import range
13 | 
# We don't need to use a `continue` option here, because if we want to resume training, we should simply use train.py.
# In the beginning of finetuning, we want to restore a part of the model rather than the entire graph.
# NOTE(review): valid_dir and valid_spklist are required positionals but are not read below;
# they are kept so existing launch scripts keep working.
parser = argparse.ArgumentParser()
parser.add_argument("--tune_period", type=int, default=100, help="How many steps per learning rate.")
parser.add_argument("--checkpoint", type=str, default="-1", help="The checkpoint in the pre-trained model. The default is to load the BEST checkpoint (according to valid_loss)")
parser.add_argument("--config", type=str, help="The configuration file.")
parser.add_argument("train_dir", type=str, help="The data directory of the training set.")
parser.add_argument("train_spklist", type=str, help="The spklist file maps the TRAINING speakers to the indices.")
parser.add_argument("valid_dir", type=str, help="The data directory of the validation set.")
parser.add_argument("valid_spklist", type=str, help="The spklist maps the VALID speakers to the indices.")
parser.add_argument("pretrain_model", type=str, help="The pre-trained model directory.")
parser.add_argument("finetune_model", type=str, help="The fine-tuned model directory")


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    args = parser.parse_args()
    # The returned params object supplies seed, loss_func and noload_var_list below.
    params = save_codes_and_config(False, args.finetune_model, args.config)

    # Load the pre-trained model to the target model directory.
    # The pre-trained model will be copied as the fine-tuned model and can be loaded from the new directory.
    # The pre-trained model is now just like an initialized model.
    get_pretrain_model(os.path.join(args.pretrain_model, "nnet"),
                       os.path.join(args.finetune_model, "nnet"),
                       args.checkpoint)

    # The model directory always has a folder named nnet
    model_dir = os.path.join(args.finetune_model, "nnet")

    # Set the random seed. The random operations may appear in data input, batch forming, etc.
    tf.set_random_seed(params.seed)
    random.seed(params.seed)
    np.random.seed(params.seed)

    # Record the input feature dimension next to the model.
    dim = FeatureReader(args.train_dir).get_dim()
    with open(os.path.join(model_dir, "feature_dim"), "w") as f:
        f.write("%d\n" % dim)

    num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
    tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

    # The trainer is used to control the training process.
    # Both graphs are built with the number of TRAINING speakers.
    trainer = Trainer(params, args.finetune_model)
    trainer.build("train",
                  dim=dim,
                  loss_type=params.loss_func,
                  num_speakers=num_total_train_speakers)
    trainer.build("valid",
                  dim=dim,
                  loss_type=params.loss_func,
                  num_speakers=num_total_train_speakers)

    # Load the pre-trained model and transfer to current model
    trainer.get_finetune_model(params.noload_var_list)

    # Sweep the learning rate, spending tune_period steps on each value.
    trainer.train_tune_lr(args.train_dir, args.train_spklist, args.tune_period)
    trainer.close()
    tf.logging.info("Finish tuning.")
74 | 


--------------------------------------------------------------------------------
/egs/voxceleb/v1/nnet/run_extract_embeddings_no_vad.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | nj=32
  4 | use_gpu=false
  5 | cmd="run.pl"
  6 | min_chunk_size=25
  7 | chunk_size=10000
  8 | stage=0
  9 | normalize=false
 10 | checkpoint=-1
 11 | env=tf_cpu
 12 | node="output"
 13 | 
 14 | echo "$0 $@"
 15 | 
 16 | if [ -f path.sh ]; then . ./path.sh; fi
 17 | . parse_options.sh || exit 1;
 18 | 
 19 | if [ $# != 3 ]; then
 20 |   echo "Usage: $0 [options] <nnet-dir> <data> <embeddings-dir>"
 21 |   echo "Options:"
 22 |   echo "  --use-gpu <false>"
 23 |   echo "  --nj <32>"
 24 |   echo "  --min-chunk-size <25>"
 25 |   echo "  --chunk-size <10000>"
 26 |   echo "  --normalize <false>"
 27 |   echo "  --checkpoint <-1>"
 28 |   echo "  --node <output>"
 29 |   echo ""
 30 |   exit 100
 31 | fi
 32 | 
 33 | nnetdir=$1
 34 | data=$2
 35 | dir=$3
 36 | 
 37 | for f in $nnetdir/nnet/checkpoint $data/feats.scp; do
 38 |   [ ! -f $f ] && echo "No such file $f" && exit 1;
 39 | done
 40 | 
 41 | mkdir -p $dir/log
 42 | 
 43 | utils/split_data.sh $data $nj
 44 | echo "$0: extracting embeddings for $data"
 45 | sdata=$data/split$nj/JOB
 46 | 
 47 | feat="ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:${sdata}/feats.scp ark:- |"
 48 | 
 49 | # I use conda to load TF (in cpu case), so some preparations are applied before python. So a wrapper make things more flexible.
 50 | # If no conda is used, simply set "--use-env false"
 51 | if [ $stage -le 0 ]; then
 52 |   echo "$0: extracting xvectors from nnet"
 53 |   echo "$0: embedding from node $node"
 54 | 
 55 |   # Set the checkpoint.
 56 |   source $TF_ENV/$env/bin/activate
 57 |   export PYTHONPATH=$TF_KALDI_ROOT:$PYTHONPATH
 58 |   python nnet/lib/make_checkpoint.py --checkpoint $checkpoint "$nnetdir"
 59 |   deactivate
 60 | 
 61 |   if $use_gpu; then
 62 |     echo "Using CPU to do inference is a better choice."
 63 |     exit 1
 64 | #    $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \
 65 | #      nnet/wrap/extract_wrapper.sh --gpuid JOB --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size --normalize $normalize \
 66 | #        "$nnetdir" "$feat" "ark:| copy-vector ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp"
 67 |   else
 68 |     $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \
 69 |       nnet/wrap/extract_wrapper.sh --gpuid -1 --env $env --min-chunk-size $min_chunk_size --chunk-size $chunk_size \
 70 |         --normalize $normalize --node $node \
 71 |         "$nnetdir" "$feat" "ark:| copy-vector ark:- ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp"
 72 |   fi
 73 | fi
 74 | 
 75 | if [ $stage -le 1 ]; then
 76 |   echo "$0: combining xvectors across jobs"
 77 |   for j in $(seq $nj); do cat $dir/xvector.$j.scp; done >$dir/xvector.scp || exit 1;
 78 | fi
 79 | 
 80 | if [ $stage -le 2 ]; then
 81 |   # Average the utterance-level xvectors to get speaker-level xvectors
 82 |   echo "$0: computing mean of xvectors for each speaker"
 83 |   if $normalize; then
 84 |     echo "$0:   Normalize xvectors before computing the mean."
 85 |     $cmd $dir/log/speaker_mean.log \
 86 |       ivector-normalize-length --scaleup=false scp:$dir/xvector.scp ark:- \| \
 87 |       ivector-mean ark:$data/spk2utt ark:- ark:- ark,t:$dir/num_utts.ark \| \
 88 |       ivector-normalize-length --scaleup=false ark:- ark,scp:$dir/spk_xvector.ark,$dir/spk_xvector.scp || exit 1
 89 |   else
 90 |     $cmd $dir/log/speaker_mean.log \
 91 |       ivector-mean ark:$data/spk2utt scp:$dir/xvector.scp \
 92 |         ark,scp:$dir/spk_xvector.ark,$dir/spk_xvector.scp ark,t:$dir/num_utts.ark || exit 1;
 93 |   fi
 94 | fi
 95 | 
 96 | if [ $stage -le 3 ]; then
 97 |   if $normalize; then
 98 |     # Normalize the output embeddings
 99 |     cp $dir/xvector.scp $dir/xvector_before_norm.scp
100 |     $cmd $dir/log/length_norm.log \
101 |       ivector-normalize-length --scaleup=false scp:$dir/xvector_before_norm.scp ark,scp:$dir/xvector.ark,$dir/xvector.scp
102 |   fi
103 | fi
104 | 
105 | exit 0


--------------------------------------------------------------------------------