├── figs
    └── meta_learning_arch.png
├── .gitignore
├── path.sh
├── cmd.sh
├── LICENSE.md
├── compute_valid_acc.py
├── egs
    ├── convert_rttm_to_vad.py
    ├── wrapper_eval.sh
    ├── sv_voices_kaldi.sh
    ├── sv_voices.sh
    ├── diarize_kaldi.sh
    └── diarize.sh
├── subsetEgsIntoHdf5.py
├── extract.py
├── train_relation.py
├── train_proto.py
├── train_xent.py
├── README.md
├── pytorch_run.sh
├── models.py
└── train_utils.py


/figs/meta_learning_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manojpamk/pytorch_xvectors/HEAD/figs/meta_learning_arch.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | benchmarks
 2 | TODO
 3 | *.pyc
 4 | final_models/
 5 | models/
 6 | archived_setups/
 7 | *cifar*
 8 | *.config
 9 | *elessar*
10 | old*
11 | train_hdf5.py
12 | convert_egs_to_hdf5.py
13 | local/
14 | sid/
15 | steps/
16 | utils/
17 | 


--------------------------------------------------------------------------------
/path.sh:
--------------------------------------------------------------------------------
# Location of the Kaldi checkout whose tools are put on PATH below.
export KALDI_ROOT=/home/manoj/kaldi/
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH

# Abort early when Kaldi's common path setup file is missing.
if [ ! -f $KALDI_ROOT/tools/config/common_path.sh ]; then
  echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!"
  exit 1
fi
. $KALDI_ROOT/tools/config/common_path.sh

export LC_ALL=C
6 | 


--------------------------------------------------------------------------------
/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | export train_cmd="run.pl"
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Manoj Kumar
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/compute_valid_acc.py:
--------------------------------------------------------------------------------
 1 | #!/bin/python3.6
 2 | 
 3 | """ Date Created: Feb 17 2020
 4 |     This script computes the validation accuracy
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | import torch
10 | import socket
11 | import kaldi_python_io
12 | from train_utils import *
13 | 
14 | 
import glob  # used below; not imported explicitly at the top of this script

# The egs directory was left unassigned in the original script (a syntax
# error), so it is taken from the command line instead.
if len(sys.argv) != 2:
    print('Usage: python compute_valid_acc.py <egsDir>')
    sys.exit(1)
egsDir = sys.argv[1]
modelDir = '/home/manoj/Projects/pytorch_spkembed/xvectors_voxceleb/models/isXvec_False_modelType_3_event_202002-1719-0729'

# Pick the most recently created checkpoint inside modelDir. Globbing the
# directory path itself would only ever return the directory.
checkpointFiles = glob.glob(modelDir + '/*.tar')
if not checkpointFiles:
    print('[ERROR] No trained model has been found in {}.'.format(modelDir))
    sys.exit(1)
modelFile = max(checkpointFiles, key=os.path.getctime)

# Load the model
# NOTE(review): `params` is not defined anywhere in this script; it must be
# supplied (e.g. by train_utils) or replaced with the number of speakers.
net = simpleTDNN(params['numSpkrs'], p_dropout=0)
checkpoint = torch.load(modelFile)
net.load_state_dict(checkpoint['model_state_dict'])
net.eval()

correct, incorrect = 0, 0
with torch.no_grad():  # inference only, no gradients needed
    for validArk in glob.glob(egsDir + '/valid_egs.*.ark'):
        reader = kaldi_python_io.Nnet3EgsReader(validArk)
        for key, mat in reader:
            out = net(torch.Tensor(mat[0]['matrix']).permute(1, 0).unsqueeze(0))
            # The stored label is compared against the predicted index + 1
            if mat[1]['matrix'][0][0][0] == torch.argmax(out) + 1:
                correct += 1
            else:
                incorrect += 1

total = correct + incorrect
if total == 0:
    print('[ERROR] No validation examples found in {}'.format(egsDir))
    sys.exit(1)
# Scale to a percentage so the value matches the "percent" label
print('Valid accuracy: %1.2f percent' % (100.0 * correct / total))
36 | 


--------------------------------------------------------------------------------
/egs/convert_rttm_to_vad.py:
--------------------------------------------------------------------------------
 1 | #!/bin/python
 2 | 
 3 | """
 4 | 	This script converts the RTTM ground truth files into oracleVAD files to be
 5 | 	used for computing SER (speaker error rate)
 6 | 
 7 | 	Output format (xxx.csv):
 8 | 	<time>,<label>
 9 | 
10 | """
11 | 
12 | import os, sys
13 | import numpy as np
14 | 
import subprocess  # used for the soxi duration query below

if len(sys.argv) != 4:
    print("Usage: convert_rttm_to_vad.py <audio_dir> <in_rttm_dir> <out_oracleVad_dir>\n")
    print("Converts RTTM ground truth files into oracleVAD files")
    sys.exit(1)

inWavDir = sys.argv[1]
inRttmDir = sys.argv[2]
outOracleVadDir = sys.argv[3]
frameRate = 100  # VAD frames per second

if not os.path.exists(outOracleVadDir):
    os.makedirs(outOracleVadDir)

for wavFile in sorted(os.listdir(inWavDir)):

    wavBase = wavFile.replace('.wav', '')
    rttmFile = inRttmDir + '/' + wavBase + '.rttm'
    if not os.path.exists(rttmFile):
        # Without ground truth there is nothing to convert; skip this file
        # instead of failing when the RTTM is opened.
        print('No rttm file for %s' % wavFile)
        continue

    # Audio duration in seconds as reported by `soxi -D`. An argument list
    # avoids the shell, so file names with spaces or metacharacters are safe.
    soxiOut = subprocess.check_output(
        ['soxi', '-D', inWavDir + '/' + wavFile], universal_newlines=True)
    audioDur = np.round(float(soxiOut.splitlines()[0].strip()), 2)
    binVad = np.zeros(int(np.ceil(frameRate * audioDur))).astype('int')

    # Fields 4 and 5 of each RTTM line are read as segment onset and duration
    with open(rttmFile) as fid:
        rttmFields = [line.split() for line in fid.read().splitlines() if line.strip()]
    startTimes = [float(fields[3]) for fields in rttmFields]
    endTimes = [float(fields[3]) + float(fields[4]) for fields in rttmFields]

    for s, e in zip(startTimes, endTimes):
        binVad[int(np.ceil(frameRate * s)):int(np.ceil(frameRate * e))] = 1

    # One timestamp per frame: 1/frameRate, 2/frameRate, ..., len/frameRate.
    # (linspace with endpoint=False produced a step slightly below 1/frameRate,
    # so the printed times drifted away from the frame grid.)
    timeStamps = np.arange(1, len(binVad) + 1) / float(frameRate)
    np.savetxt(outOracleVadDir + '/' + wavBase + '.csv',
               np.vstack((timeStamps, binVad)).T, fmt="%1.2f,%1.2f")
47 | 


--------------------------------------------------------------------------------
/egs/wrapper_eval.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | : ' Date Created: Apr 27 2019
 4 | 
 5 |     A wrapper script for the diarization evals. For every model checkpoint,
 6 |     this computes the oracle and est spkr DERs
 7 | 
 8 | '
 9 | 
currDir=$PWD

# 'xvecTDNN' or 'proto_xvecTDNN' or 'relation_encoder_xvecTDNN'
modelType=proto_xvecTDNN
# 'fc2' or 'fc3' or 'fc4'
layerName=fc3
modelDir=$currDir/../models/temp_eval/

# 'dihard' or 'ami' or 'adosMod3'
evalCorpus=dihard
wavDir=$currDir/${evalCorpus}_wav
rttmDir=$currDir/${evalCorpus}_rttm

# 'plda' or 'SC'
method=SC

expDir=$currDir/exp
baseScript=$currDir/diarize.sh
collarCmd="--ignore_overlaps"
outFile=$currDir/RESULTS.txt

rm -f $outFile

# Rewrite the configuration assignments inside $baseScript so that it runs
# with the settings chosen above.
sed -i "/^wavDir=/c\wavDir=$wavDir" $baseScript
sed -i "/^rttmDir=/c\rttmDir=$rttmDir" $baseScript
sed -i "/^modelDir=/c\modelDir=$modelDir" $baseScript
sed -i "/^expDir=/c\expDir=$expDir" $baseScript
sed -i "/^method=/c\method=$method" $baseScript
sed -i "/^modelType=/c\modelType=$modelType" $baseScript
sed -i "/^layerName=/c\layerName=$layerName" $baseScript

# Usage: scoreOverall <clusteringDirName> [extra score.py options]
# Runs dscore's score.py from inside $currDir/dscore and prints fields 4-5 of
# its OVERALL line. The subshell keeps the caller's working directory intact.
scoreOverall() {
  local clusterDir=$1
  shift
  (
    cd $currDir/dscore/ || exit 1
    python score.py "$@" -R <(ls $rttmDir/*) \
      -S <(ls $expDir/$method/$clusterDir/rttm)
  ) | grep OVERALL | tr -s ' ' | cut -f 4-5 -d ' '
}

for modelFile in $modelDir/*.tar; do

  # An unmatched glob is left as the literal pattern; skip it
  [ -e "$modelFile" ] || continue

  # Refresh the checkpoint's timestamp before running the diarization script
  # (presumably so it is picked up as the most recent model - confirm in
  # diarize.sh)
  touch $modelFile
  echo "Evaluating $modelFile"

  # Run the same script that was configured above, not whatever diarize.sh
  # happens to be in the current directory
  bash $baseScript > /dev/null 2>&1

  # Eval is a repetition from inside $baseScript, but it's OK
  oracleResults=$(scoreOverall clustering_oracleNumSpkr)
  estResults=$(scoreOverall clustering_estNumSpkr)
  echo "       WITHOUT COLLAR                "
  echo "$(basename $modelFile) $(echo $oracleResults | cut -f 1 -d ' ') $(echo $estResults | cut -f 1 -d ' ')" >> $outFile

  # $collarCmd is intentionally unquoted so it can hold several options
  oracleResults=$(scoreOverall clustering_oracleNumSpkr $collarCmd)
  estResults=$(scoreOverall clustering_estNumSpkr $collarCmd)
  echo "       WITH COLLAR                "
  echo "$(basename $modelFile) $(echo $oracleResults | cut -f 1 -d ' ') $(echo $estResults | cut -f 1 -d ' ')" >> $outFile

done
72 | 


--------------------------------------------------------------------------------
/egs/sv_voices_kaldi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | : ' Date Created: Apr 2 2020
 3 | 
 4 |     Speaker verification on the voices corpora
 5 | 
 6 |     voices-eval:
 7 |     EER: 10.3%
 8 |     minDCF(p-target=0.01): 0.7845
 9 |     minDCF(p-target=0.001): 0.9406
10 | '
11 | 
currDir=$PWD
kaldiDir=/home/manoj/kaldi
expDir=$currDir/exp_voices_kaldi
dataDir=$expDir/data
featDir=$expDir/feats
wavDir=/home/manoj/Datasets/voices/Speaker_Recognition/sid_eval
trialsFile=/home/manoj/Datasets/voices/eval_trials

# Absolute paths of every file inside the evaluation audio directory
wavList=$currDir/wavList
readlink -f $wavDir/* > $wavList

# Link the Kaldi voxceleb recipe directories into the working directory
for f in sid steps utils local conf diarization; do
  if [ ! -L $f ]; then
    ln -s $kaldiDir/egs/voxceleb/v2/$f
  fi
done

. cmd.sh
. path.sh

# Kaldi data preparation
rm -rf $dataDir
mkdir -p $dataDir

# Utterance ids: first column of wav.scp with the "-rec" suffix removed
uttIds() {
  cut -f 1 -d ' ' $dataDir/wav.scp | sed "s/-rec$//g"
}

# wav.scp: "<basename>-rec <absolute wav path>"
paste -d ' ' <(sed -e "s:.*/::" -e "s/\.wav$/-rec/g" $wavList) \
  <(xargs readlink -f < $wavList) > $dataDir/wav.scp
# Every utterance is its own speaker, so utt2spk and spk2utt are identical
paste -d ' ' <(uttIds) <(uttIds) > $dataDir/utt2spk
cp $dataDir/utt2spk $dataDir/spk2utt
# segments: "<utt> <rec> 0 <duration>", one segment spanning each recording
numUtts=$(wc -l < $dataDir/utt2spk)
paste -d ' ' <(cut -f 1 -d ' ' $dataDir/utt2spk) \
  <(cut -f 1 -d ' ' $dataDir/wav.scp) \
  <(yes "0" | head -n $numUtts) \
  <(xargs soxi -D < $wavList) > $dataDir/segments

# Feature extraction pipeline
steps/make_mfcc.sh --write-utt2num-frames true \
  --mfcc-config conf/mfcc.conf --nj 16 --cmd "$train_cmd" \
  $dataDir
utils/fix_data_dir.sh $dataDir
sid/compute_vad_decision.sh --nj 16 --cmd "$train_cmd" $dataDir
utils/fix_data_dir.sh $dataDir

local/nnet3/xvector/prepare_feats_for_egs.sh --nj 8 --cmd "$train_cmd" \
  $dataDir $featDir $expDir/data_no_sil
utils/fix_data_dir.sh $featDir

# Kaldi xvectors
nnetDir=$kaldiDir/egs/voxceleb/v2/exp/xvector_nnet_1a
transformDir=$nnetDir/xvectors_train
sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 4G" --nj 16 \
  $nnetDir $dataDir \
  $expDir/kaldi_xvectors

# Scoring
# The same mean-subtract / transform / length-normalise pipeline is applied to
# both sides of every trial, so the rspecifier is built once and passed twice.
pldaRspec="ivector-copy-plda --smoothing=0.0 $transformDir/plda - |"
xvecRspec="ark:ivector-subtract-global-mean $transformDir/mean.vec scp:$expDir/kaldi_xvectors/xvector.scp ark:- | transform-vec $transformDir/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |"
trialsRspec="cat '$trialsFile' | cut -d\  --fields=1,2 |"
$train_cmd $expDir/log_scores.log \
  ivector-plda-scoring --normalize-length=true \
  "$pldaRspec" "$xvecRspec" "$xvecRspec" "$trialsRspec" $expDir/scores_eval

eer=$(compute-eer <(local/prepare_for_eer.py $trialsFile $expDir/scores_eval) 2> /dev/null)
mindcf1=$(sid/compute_min_dcf.py --p-target 0.01 $expDir/scores_eval $trialsFile 2> /dev/null)
mindcf2=$(sid/compute_min_dcf.py --p-target 0.001 $expDir/scores_eval $trialsFile 2> /dev/null)
echo "EER: $eer%"
echo "minDCF(p-target=0.01): $mindcf1"
echo "minDCF(p-target=0.001): $mindcf2"
74 | 


--------------------------------------------------------------------------------
/egs/sv_voices.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | : ' Date Created: Apr 2 2020
 3 | 
 4 |     Speaker verification on the voices corpora using pytorch embeddings
 5 | 
 6 |     voices-eval:
 7 |     EER: 8.591%
 8 |     minDCF(p-target=0.01): 0.6961
 9 |     minDCF(p-target=0.001): 0.8934
10 | 
11 | '
12 | 
currDir=$PWD
kaldiDir=/home/manoj/kaldi
expDir=$currDir/exp_voices_pytorch
wavDir=/home/manoj/Datasets/voices/Speaker_Recognition/sid_eval
trialsFile=/home/manoj/Datasets/voices/eval_trials
# Absolute paths of every file inside the evaluation audio directory
wavList=$currDir/wavList
readlink -f $wavDir/* > $wavList
dataDir=$expDir/data
featDir=$expDir/feats

# Link the Kaldi voxceleb recipe directories into the working directory
for f in sid steps utils local conf diarization; do
  [ ! -L $f ] && ln -s $kaldiDir/egs/voxceleb/v2/$f;
done

. cmd.sh
. path.sh

# Kaldi data preparation
rm -rf $dataDir; mkdir -p $dataDir
# wav.scp: "<basename>-rec <absolute wav path>"
paste -d ' ' <(rev $wavList | cut -f 1 -d '/' | rev | sed "s/\.wav$/-rec/g") \
  <(cat $wavList | xargs readlink -f) > $dataDir/wav.scp
# Every utterance is its own speaker, so utt2spk and spk2utt are identical
paste -d ' ' <(cut -f 1 -d ' ' $dataDir/wav.scp | sed "s/-rec$//g") \
  <(cut -f 1 -d ' ' $dataDir/wav.scp | sed "s/-rec$//g") > $dataDir/utt2spk
cp $dataDir/utt2spk $dataDir/spk2utt
# segments: "<utt> <rec> 0 <duration>", one segment spanning each recording
numUtts=$(wc -l $dataDir/utt2spk | cut -f 1 -d ' ')
paste -d ' ' <(cut -f 1 -d ' ' $dataDir/utt2spk) \
  <(cut -f 1 -d ' ' $dataDir/wav.scp) <(yes "0" | head -n $numUtts) <(cat $wavList | xargs soxi -D) \
  >  $dataDir/segments

# Feature extraction pipeline
steps/make_mfcc.sh --write-utt2num-frames true \
  --mfcc-config conf/mfcc.conf --nj 16 --cmd "$train_cmd" \
  $dataDir
utils/fix_data_dir.sh $dataDir
sid/compute_vad_decision.sh --nj 16 --cmd "$train_cmd" $dataDir
utils/fix_data_dir.sh $dataDir

local/nnet3/xvector/prepare_feats_for_egs.sh --nj 16 --cmd "$train_cmd" \
  $dataDir $featDir $expDir/data_no_sil
utils/fix_data_dir.sh $featDir
# extract.py requires a <featDir>/splitN directory
utils/split_data.sh $featDir 8

# Pytorch embeddings
modelDir=$currDir/models/xvec_preTrained
transformDir=$currDir/xvectors/xvec_preTrained/train
# extract.py sits one directory above this script's working directory
cd ..
python extract.py $modelDir $featDir $expDir/pytorch_xvectors
# Return to where we started, whatever that directory is called
cd $currDir

# Scoring
$train_cmd $expDir/log_scores.log \
ivector-plda-scoring --normalize-length=true \
"ivector-copy-plda --smoothing=0.0 $transformDir/plda - |" \
"ark:ivector-subtract-global-mean $transformDir/mean.vec scp:$expDir/pytorch_xvectors/xvector.scp ark:- | transform-vec $transformDir/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
"ark:ivector-subtract-global-mean $transformDir/mean.vec scp:$expDir/pytorch_xvectors/xvector.scp ark:- | transform-vec $transformDir/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
"cat '$trialsFile' | cut -d\  --fields=1,2 |" $expDir/scores_eval

eer=$(compute-eer <(local/prepare_for_eer.py $trialsFile $expDir/scores_eval) 2> /dev/null)
mindcf1=$(sid/compute_min_dcf.py --p-target 0.01 $expDir/scores_eval $trialsFile 2> /dev/null)
mindcf2=$(sid/compute_min_dcf.py --p-target 0.001 $expDir/scores_eval $trialsFile 2> /dev/null)
echo "EER: $eer%"
echo "minDCF(p-target=0.01): $mindcf1"
echo "minDCF(p-target=0.001): $mindcf2"
76 | 


--------------------------------------------------------------------------------
/subsetEgsIntoHdf5.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python
  2 | 
  3 | """ Date Created: Apr 8 2020
  4 | 
  5 |     This script breaks down an nnet3-egs ark file into multiple hdf5 files
  6 |     suitable for protonet training
  7 |     [ Looking for a more direct way for this (or) Bypassing HDF5 for protonet ]
  8 | 
  9 | """
 10 | 
 11 | import os
 12 | import sys
 13 | import glob
 14 | import h5py
 15 | import random
 16 | import subprocess
 17 | import numpy as np
 18 | import kaldi_python_io
 19 | from multiprocessing import Pool
 20 | 
 21 | def writeHdf5File(egsFile, scpFile, chunkLen, hdf5File):
 22 | 
 23 |     featDim = 30
 24 |     output = subprocess.run(['wc','-l',scpFile], stdout=subprocess.PIPE).stdout.decode('utf-8')
 25 |     numSamples = int(output.split()[0])
 26 |     x = kaldi_python_io.Nnet3EgsReader(egsFile)
 27 |     with h5py.File(hdf5File,'w') as fid:
 28 |         feats = fid.create_dataset('feats',(numSamples,chunkLen,featDim), dtype='f')#, compression="gzip")
 29 |         labels = fid.create_dataset('labels',(numSamples,1), dtype='i8')#, compression="gzip")
 30 |         count = 0
 31 |         for key,mat in x:
 32 |             labels[count] = mat[1]['matrix'][0][0][0]
 33 |             feats[count] = mat[0]['matrix']
 34 |             count += 1
 35 | 
 36 | if len(sys.argv) != 3:
 37 |     print('Usage: python subsetEgsIntoHdf5.py <egsDir> <hdf5Dir>')
 38 |     sys.exit(1)
 39 | 
 40 | egsDir = sys.argv[1]
 41 | hdf5Dir = sys.argv[2]
 42 | tempDir = hdf5Dir + '/temp/'
 43 | numSplits = 8
 44 | 
 45 | os.system('rm -rf %s' %hdf5Dir)
 46 | os.system('mkdir -p %s' %tempDir)
 47 | 
 48 | arkCount = 0
 49 | for fileI, scpFile in enumerate(sorted(glob.glob(egsDir+'/egs.*.scp'))):
 50 |    
 51 |     print('working on archive %d' %(arkCount+1))
 52 |     #if arkCount == 45:
 53 |     #    break
 54 |     arkCount += 1
 55 |     inputArkNum = int(os.path.basename(scpFile).split('.')[1])
 56 |     # First, read all the labels alongwith indices
 57 |     with open(scpFile,'r') as fid:
 58 |         data = fid.read().splitlines()
 59 |     chunkLen = int(data[0].split()[0].split('-')[-2])
 60 | 
 61 |     # Divide the speakers into splits with approx. equal speakers
 62 |     print('Creating new speaker lists..')
 63 |     spkrLoc = {}
 64 |     for i,x in enumerate(data):
 65 |         spkrID = int(x.split()[0].split('-')[-1])
 66 |         if spkrID in spkrLoc:
 67 |             spkrLoc[spkrID].append(i)
 68 |         else:
 69 |             spkrLoc[spkrID] = [i]
 70 |     uniqSpkrs = np.fromiter(spkrLoc.keys(), dtype=int)
 71 |     spkrSplits = [uniqSpkrs[i::numSplits] for i in range(numSplits)]
 72 |     for splitI,split in enumerate(spkrSplits):
 73 |         with open(tempDir+'/temp.{}.scp'.format(splitI+1),'w') as fid:
 74 |             for spkr in split:
 75 |                 for loc in spkrLoc[spkr]:
 76 |                     fid.write('%s\n' %data[loc])
 77 | 
 78 |     print('Creating temporary egs files')
 79 |     copyCommands = [ 'nnet3-copy-egs scp:%s ark:%s > /dev/null 2>&1' %(
 80 |         tempDir+'/temp.{}.scp'.format(splitI+1),
 81 |         tempDir+'/temp.{}.ark'.format(splitI+1)) for splitI in range(numSplits)]
 82 | 
 83 |     # start all programs
 84 |     processes = [subprocess.Popen(program, shell=True) for program in copyCommands]
 85 |     # wait
 86 |     for process in processes:
 87 |         process.wait()
 88 | 
 89 |     print('Creating hdf5 files now..')
 90 |     nProcs = 8
 91 |     L = [(tempDir+'/temp.{}.ark'.format(splitI+1),
 92 |         tempDir+'/temp.{}.scp'.format(splitI+1),
 93 |         chunkLen,
 94 |         hdf5Dir+'/egs.{}.{}.hdf5'.format(inputArkNum,splitI+1)) for splitI in range(numSplits)]
 95 |     pool = Pool(processes=nProcs)
 96 |     results = pool.starmap(writeHdf5File, L)
 97 |     pool.terminate()
 98 | 
 99 |     # Cleanup
100 |     os.system('rm -f %s/*' %tempDir)
101 | 
102 | os.system('rm -rf %s' %tempDir)
103 | 


--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
 1 | #!/bin/python
 2 | 
 3 | """
 4 |     Date Created: Feb 26 2018
 5 | 
 6 |     This script extracts trained embeddings given the model directory, and saves them in kaldi format
 7 | 
 8 | """
 9 | import os
10 | import sys
11 | import glob
12 | import argparse
13 | import kaldi_io
14 | from models import *
15 | import kaldi_python_io
16 | import socket
17 | from train_utils import *
18 | from collections import OrderedDict
19 | from torch.multiprocessing import Pool, Process, set_start_method
20 | torch.multiprocessing.set_start_method('spawn', force=True)
21 | 
def getSplitNum(text):
    """Return N as an int for a path whose last component is 'splitN'."""
    lastComponent = text.rsplit('/', 1)[-1]
    # lstrip removes any leading run of the characters s, p, l, i, t
    numberPart = lastComponent.lstrip('split')
    return int(numberPart)
24 | 
def main():
    """Extract embeddings with the newest checkpoint and save them in kaldi format.

    Command-line arguments are described by the argparse help strings below.
    The features are read from <featDir>/split<N>/<i>/feats.scp, one job per
    split, and each job writes xvector.<i>.ark / xvector.<i>.scp into
    <embeddingDir>. The per-split scp files are finally merged into
    <embeddingDir>/xvector.scp.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-modelType', default='xvecTDNN', help='Refer train_utils.py ')
    parser.add_argument('-numSpkrs', default=7323, type=int, help='Number of output labels for model')
    parser.add_argument('-layerName', default='fc1', help="DNN layer for embeddings")
    parser.add_argument('-nProcs', default=0, type=int, help='Number of parallel processes. Default=0(Number of input directory splits)')
    parser.add_argument('modelDirectory', help='Directory containing the model checkpoints')
    parser.add_argument('featDir', help='Directory containing features ready for extraction')
    parser.add_argument('embeddingDir', help='Output directory')
    args = parser.parse_args()

    # Checking for input features and splitN directories. Only names of the
    # exact form split<digits> count, so other entries matching split* cannot
    # break the lookup, and "no split directory" is reported cleanly.
    splitNums = []
    for splitDir in glob.glob(args.featDir+'/split*'):
        suffix = os.path.basename(os.path.normpath(splitDir))[len('split'):]
        if suffix.isdigit():
            splitNums.append(int(suffix))
    if not splitNums:
        print('[ERROR] Cannot find %s/splitN directory' %args.featDir)
        print('Use utils/split_data.sh to create this directory')
        sys.exit(1)
    nSplits = max(splitNums)

    if not os.path.isfile('%s/split%d/1/feats.scp' %(args.featDir, nSplits)):
        print('Cannot find input features')
        sys.exit(1)

    # Check for trained model: use the most recently created checkpoint
    try:
        modelFile = max(glob.glob(args.modelDirectory+'/*.tar'), key=os.path.getctime)
    except ValueError:
        print("[ERROR] No trained model has been found in {}.".format(args.modelDirectory) )
        sys.exit(1)

    # Load model definition. The class is looked up by name among the names
    # imported into this module instead of eval()-ing a command-line string.
    modelClass = globals().get(args.modelType)
    if not callable(modelClass):
        print('[ERROR] Unknown model type: {}'.format(args.modelType))
        sys.exit(1)
    net = modelClass(args.numSpkrs, p_dropout=0)

    checkpoint = torch.load(modelFile,map_location=torch.device('cuda'))
    new_state_dict = OrderedDict()
    if 'relation' in args.modelType:
        checkpoint_dict = checkpoint['encoder_state_dict']
    else:
        checkpoint_dict = checkpoint['model_state_dict']
    for k, v in checkpoint_dict.items():
        if k.startswith('module.'):
            new_state_dict[k[7:]] = v  # ugly fix to remove 'module' from key
        else:
            new_state_dict[k] = v

    # load trained weights
    net.load_state_dict(new_state_dict)
    net = net.cuda()
    net.eval()

    if not os.path.isdir(args.embeddingDir):
        os.makedirs(args.embeddingDir)

    print('Extracting xvectors by distributing jobs to pool workers... ')
    if not args.nProcs:
        args.nProcs = nSplits

    L = [('%s/split%d/%d/feats.scp' %(args.featDir, nSplits, i),
        '%s/xvector.%d.ark' %(args.embeddingDir, i),
        '%s/xvector.%d.scp' %(args.embeddingDir, i), net, args.layerName ) for i in range(1,nSplits+1)]
    pool2 = Pool(processes=args.nProcs)
    try:
        # par_core_extractXvectors is defined outside this file
        pool2.starmap(par_core_extractXvectors, L)
    finally:
        # Always release the workers, even when extraction raises
        pool2.terminate()
    print('Multithread job has been finished.')

    print('Writing xvectors to {}'.format(args.embeddingDir))
    # Merge the per-split scp files explicitly. A shell glob xvector.*.scp
    # would also match xvector.scp itself when the output directory is reused.
    with open('%s/xvector.scp' %args.embeddingDir, 'w') as outFid:
        for i in range(1, nSplits+1):
            partFile = '%s/xvector.%d.scp' %(args.embeddingDir, i)
            if not os.path.isfile(partFile):
                print('[WARNING] Missing %s' %partFile)
                continue
            with open(partFile, 'r') as inFid:
                outFid.write(inFid.read())
93 | 
94 | 
95 | if __name__ == "__main__":
96 |     main()
97 | 


--------------------------------------------------------------------------------
/train_relation.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python3.6
  2 | 
  3 | """
  4 |     Date Created: Apr 6 2020
  5 | 
  6 |     Training script for relation networks
  7 | 
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | import glob
 13 | import time
 14 | import socket
 15 | import torch
 16 | import numpy as np
 17 | from train_utils import *
 18 | import torch.multiprocessing as mp
 19 | from torch.utils.data import DataLoader
 20 | from torch.nn.functional import softmax
 21 | 
 22 | # SEEDS
 23 | torch.manual_seed(0)
 24 | np.random.seed(0)
 25 | 
 26 | # PARAMS, MODEL PREP
 27 | parser = getParams()
 28 | args = parser.parse_args()
 29 | checkParams(args)
 30 | print(args)
 31 | 
 32 | totalEpisodes = args.totalEpisodes
 33 | encoder_net, relation_net, encoder_optimizer, relation_optimizer, episodeI, saveDir = prepareRelationModel(args)
 34 | currLR = encoder_optimizer.param_groups[0]['lr']
 35 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 36 | numBatchesPerArk = int(args.numEgsPerArk/args.batchSize)
 37 | 
 38 | # LR SCHEDULERS
 39 | encoder_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(encoder_optimizer, gamma=0.95)
 40 | relation_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(relation_optimizer, gamma=0.95)
 41 | criterion = nn.MSELoss()
 42 | criterion_xent = nn.CrossEntropyLoss()
 43 | 
 44 | encoder_optimizer.param_groups[0]['lr'] = currLR
 45 | relation_optimizer.param_groups[0]['lr'] = currLR
 46 | eps = args.noiseEps
 47 | featDir = args.featDir
 48 | 
 49 | # TRAINING
 50 | while episodeI < totalEpisodes:
 51 | 
 52 |     hdf5File = np.random.choice(glob.glob(featDir+'/*.hdf5'))
 53 |     print('Reading from archive %s' %os.path.basename(hdf5File))
 54 |     dataSet = myH5DL(hdf5File)
 55 |     samplesPerClass = np.random.randint(3,4)
 56 |     numSupports = samplesPerClass - 1
 57 |     numQueries = 1
 58 |     batchSampler = myH5DL_sampler(hdf5File,
 59 |                        minClasses=args.protoMinClasses,
 60 |                        maxClasses=args.protoMaxClasses,
 61 |                        samplesPerClass=samplesPerClass,
 62 |                        numEpisodes=args.protoEpisodesPerArk)
 63 |     dataLoader = DataLoader(dataSet, batch_sampler=batchSampler, num_workers=0)
 64 | 
 65 |     loggingLoss = []
 66 |     archive_start_time = time.time()
 67 |     for x, _ in dataLoader:
 68 |         encoder_optimizer.zero_grad()
 69 |         relation_optimizer.zero_grad()
 70 |         episode_start_time = time.time()
 71 |         numClasses = int(len(x)/samplesPerClass)
 72 |         x = x.view(samplesPerClass, numClasses, -1, args.featDim)
 73 |         supports = x[:numSupports,:,:,:].detach()
 74 |         queries = x[numSupports:,:,:,:].detach()
 75 | 
 76 |         encoder_sup = encoder_net(
 77 |             supports.view(-1, supports.shape[2], args.featDim).permute(0,2,1).to(device), eps)
 78 |         encoder_quer = encoder_net(
 79 |             queries.view(-1, queries.shape[2], args.featDim).permute(0,2,1).to(device), eps)
 80 | 
 81 |         # Computing sum across supports within each class
 82 |         encoder_dim = encoder_sup.shape[-1]
 83 |         encoder_sup = torch.sum(encoder_sup.view(-1, numClasses, encoder_dim), dim=0)
 84 | 
 85 |         encoder_sup = encoder_sup.unsqueeze(0).expand(numClasses, numClasses, encoder_dim)
 86 |         encoder_quer = encoder_quer.unsqueeze(1).expand(numClasses, numClasses, encoder_dim)
 87 | 
 88 |         relation_out = relation_net(torch.cat((encoder_sup, encoder_quer), dim=2).view(-1, 2*encoder_dim))
 89 | 
 90 |         # X-ent loss
 91 |         labels = torch.arange(0,numClasses)
 92 |         loss = criterion_xent(relation_out.view(numClasses, numClasses), labels.cuda())        
 93 |         loggingLoss.append(loss.item())
 94 | 
 95 |         if np.isnan(loss.item()):
 96 |             print('Nan encountered at iter %d. Exiting..' %iter)
 97 |             sys.exit(1)
 98 |         loss.backward()
 99 |         encoder_optimizer.step()
100 |         relation_optimizer.step()
101 |         print('Episode time: %1.3f   Episode Loss: %1.3f'  %(time.time()-episode_start_time, loss.item()))
102 |         episodeI += 1
103 | 
104 |     if episodeI%(10*args.protoEpisodesPerArk) == 0:
105 |         encoder_lr_scheduler.step()
106 |         relation_lr_scheduler.step()
107 | 
108 |     # Log, as long as episodeI <= totalEpisodes
109 |     print('Episode: (%d/%d)     Avg Loss/batch: %1.3f' %(
110 |         episodeI,
111 |         totalEpisodes,
112 |         np.mean(loggingLoss)))
113 | 
114 |     print('Archive time: %1.3f' %(time.time()-archive_start_time))
115 | 
116 |     # Save checkpoint
117 |     torch.save({
118 |         'episodeI': episodeI,
119 |         'encoder_state_dict': encoder_net.state_dict(),
120 |         'relation_state_dict': relation_net.state_dict(),
121 |         'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
122 |         'relation_optimizer_state_dict': relation_optimizer.state_dict(),
123 |         'args': args,
124 |         }, '{}/checkpoint_episode_{}.tar'.format(saveDir, episodeI))
125 | 
126 |     if episodeI > 10*args.protoEpisodesPerArk:
127 |         if os.path.exists('%s/checkpoint_step_%d.tar' %(saveDir,episodeI-10*args.protoEpisodesPerArk)):
128 |             if episodeI%(50*args.protoEpisodesPerArk) !=0:
129 |                 os.remove('%s/checkpoint_step_%d.tar' %(saveDir,episodeI-10*args.protoEpisodesPerArk))
130 | 


--------------------------------------------------------------------------------
/train_proto.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python3.6
  2 | 
  3 | """
  4 |     Date Created: Apr 6 2020
  5 | 
  6 |     Training script for prototypical networks
  7 | 
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | import glob
 13 | import time
 14 | import socket
 15 | import torch
 16 | import numpy as np
 17 | from train_utils import *
 18 | import torch.multiprocessing as mp
 19 | from torch.utils.data import DataLoader
 20 | from torch.nn.functional import softmax
 21 | 
 22 | def euclideanLoss(embed_quer, prototypes):
 23 |     """
 24 |         prototypes: (N, D)
 25 |         embed_quer: (M, N, D)
 26 | 
 27 |         D: embedding dimension
 28 |         N: number of classes
 29 |         M: samples per class
 30 | 
 31 |     """
 32 |     M, N, D = embed_quer.shape
 33 |     embed_quer = embed_quer.unsqueeze(2).expand(-1, -1, N, -1)
 34 |     prototypes = prototypes.view(1, 1, N, D).expand(M, N, -1, -1)
 35 |     logits = ((embed_quer - prototypes)**2).sum(dim=3)
 36 |     return -logits
 37 | 
 38 | # SEEDS
 39 | torch.manual_seed(0)
 40 | np.random.seed(0)
 41 | 
 42 | # PARAMS, MODEL PREP
 43 | parser = getParams()
 44 | args = parser.parse_args()
 45 | checkParams(args)
 46 | print(args)
 47 | 
 48 | totalEpisodes = args.totalEpisodes
 49 | net, optimizer, episodeI, saveDir = prepareProtoModel(args)
 50 | currLR = optimizer.param_groups[0]['lr']
 51 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 52 | numBatchesPerArk = int(args.numEgsPerArk/args.batchSize)
 53 | 
 54 | # LR SCHEDULERS
 55 | cyclic_lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
 56 |                           max_lr=args.maxLR,
 57 |                           cycle_momentum=False,
 58 |                           div_factor=5,
 59 |                           final_div_factor=1e+3,
 60 |                           total_steps=totalEpisodes,
 61 |                           pct_start=0.15)
 62 | exponential_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
 63 |                           gamma=0.95)
 64 | criterion = nn.CrossEntropyLoss()
 65 | optimizer.param_groups[0]['lr'] = currLR
 66 | eps = args.noiseEps
 67 | featDir = args.featDir
 68 | 
 69 | # TRAINING
 70 | while episodeI < totalEpisodes:
 71 | 
 72 |     hdf5File = np.random.choice(glob.glob(featDir+'/*.hdf5'))
 73 |     print('Reading from archive %s' %os.path.basename(hdf5File))
 74 |     dataSet = myH5DL(hdf5File)
 75 |     samplesPerClass = np.random.randint(3,4)
 76 |     numSupports = samplesPerClass - 1
 77 |     numQueries = 1
 78 |     batchSampler = myH5DL_sampler(hdf5File,
 79 |                        minClasses=args.protoMinClasses,
 80 |                        maxClasses=args.protoMaxClasses,
 81 |                        samplesPerClass=samplesPerClass,
 82 |                        numEpisodes=args.protoEpisodesPerArk)
 83 |     dataLoader = DataLoader(dataSet, batch_sampler=batchSampler, num_workers=0)
 84 | 
 85 |     loggingLoss = []
 86 |     archive_start_time = time.time()
 87 |     for x, _ in dataLoader:
 88 |         optimizer.zero_grad()
 89 |         episode_start_time = time.time()
 90 |         numClasses = int(len(x)/samplesPerClass)
 91 |         x = x.view(samplesPerClass, numClasses, -1, args.featDim)
 92 |         supports = x[:numSupports,:,:,:].detach()
 93 |         queries = x[numSupports:,:,:,:].detach()
 94 |         labels = torch.arange(numClasses).repeat(numQueries)
 95 | 
 96 |         embed_sup = net(
 97 |             supports.view(-1, supports.shape[2], args.featDim).permute(0,2,1).to(device), eps)
 98 |         embed_quer = net(
 99 |             queries.view(-1, queries.shape[2], args.featDim).permute(0,2,1).to(device), eps)
100 | 
101 |         # Prototype computation
102 |         prototypes = embed_sup.view(supports.shape[0], supports.shape[1], -1).mean(dim=0)
103 | 
104 |         # Euclidean-softmax
105 |         logits = euclideanLoss(embed_quer.view(queries.shape[0], queries.shape[1], -1), prototypes)
106 | 
107 |         # Original implementation of loss function
108 |         loss = criterion(logits.view(numQueries*numClasses,numClasses), labels.to(device))
109 | 
110 |         # print(loss.item())
111 |         loggingLoss.append(loss.item())
112 | 
113 |         loss.backward()
114 |         optimizer.step()
115 |         print('Episode time: %1.3f   Episode Loss: %1.3f'  %(time.time()-episode_start_time, loss.item()))
116 |         del x, supports, queries, embed_sup, embed_quer, loss, logits, prototypes
117 |         episodeI += 1
118 | 
119 |     if episodeI%(10*args.protoEpisodesPerArk) == 0:
120 |         exponential_lr_scheduler.step()
121 |     # Log, as long as episodeI <= totalEpisodes
122 |     print('Episode: (%d/%d)     Avg Loss/batch: %1.3f' %(
123 |         episodeI,
124 |         totalEpisodes,
125 |         np.mean(loggingLoss)))
126 | 
127 |     print('Archive time: %1.3f' %(time.time()-archive_start_time))
128 | 
129 |     # Save checkpoint
130 |     torch.save({
131 |         'episodeI': episodeI,
132 |         'model_state_dict': net.state_dict(),
133 |         'optimizer_state_dict': optimizer.state_dict(),
134 |         'args': args,
135 |         }, '{}/checkpoint_episode_{}.tar'.format(saveDir, episodeI))
136 | 
137 |     if episodeI > 10*args.protoEpisodesPerArk:
138 |         if os.path.exists('%s/checkpoint_step_%d.tar' %(saveDir,episodeI-10*args.protoEpisodesPerArk)):
139 |             if episodeI%(50*args.protoEpisodesPerArk) !=0:
140 |                 os.remove('%s/checkpoint_step_%d.tar' %(saveDir,episodeI-10*args.protoEpisodesPerArk))
141 | 


--------------------------------------------------------------------------------
/train_xent.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python3.6
  2 | 
  3 | """
  4 |     Date Created: Feb 10 2020
  5 | 
  6 |     This is the main training script for speaker embeddings, which will evolve
  7 |     over time
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | import glob
 13 | import time
 14 | import socket
 15 | import torch
 16 | import numpy as np
 17 | from train_utils import *
 18 | import torch.multiprocessing as mp
 19 | from torch.utils.data import DataLoader
 20 | 
 21 | 
 22 | # SEEDS
 23 | torch.manual_seed(0)
 24 | np.random.seed(0)
 25 | 
 26 | # PARAMS, MODEL PREP
 27 | parser = getParams()
 28 | args = parser.parse_args()
 29 | print(args)
 30 | 
 31 | totalSteps = args.numEpochs * args.numArchives
 32 | net, optimizer, step, saveDir = prepareModel(args)
 33 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 34 | numBatchesPerArk = int(args.numEgsPerArk/args.batchSize)
 35 | 
 36 | # LR SCHEDULERS
 37 | cyclic_lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
 38 |                           max_lr=args.maxLR,
 39 |                           cycle_momentum=False,
 40 |                           div_factor=5,
 41 |                           final_div_factor=1e+3,
 42 |                           total_steps=totalSteps*numBatchesPerArk,
 43 |                           pct_start=0.15)
 44 | criterion = nn.CrossEntropyLoss()
 45 | eps = args.noiseEps
 46 | 
 47 | 
 48 | # TRAINING
 49 | while step < totalSteps:
 50 | 
 51 |     archiveI = step%args.numArchives + 1
 52 |     archive_start_time = time.time()
 53 |     ark_file = '{}/egs.{}.ark'.format(args.featDir,archiveI)
 54 |     print('Reading from archive %d' %archiveI)
 55 | 
 56 |     preFetchRatio = args.preFetchRatio
 57 |     # Read with data data_loader
 58 |     data_loader = nnet3EgsDL(ark_file)
 59 |     par_data_loader = DataLoader(data_loader,
 60 |                                  batch_size=preFetchRatio*args.batchSize,
 61 |                                  shuffle=False,
 62 |                                  num_workers=0,
 63 |                                  drop_last=False,
 64 |                                  pin_memory=True)
 65 | 
 66 |     batchI, loggedBatch = 0, 0
 67 |     loggingLoss =  0.0
 68 |     start_time = time.time()
 69 |     for _,(X, Y) in par_data_loader:
 70 |         Y = Y['matrix'][0][0][0].to(device)
 71 |         X = X['matrix'].to(device)
 72 |         try:
 73 |             assert max(Y) < args.numSpkrs and min(Y) >= 0
 74 |         except:
 75 |             print('Read an out of range value at iter %d' %iter)
 76 |             continue
 77 |         if torch.isnan(X).any():
 78 |             print('Read a nan value at iter %d' %iter)
 79 |             continue
 80 | 
 81 |         accumulateStepSize = 4
 82 |         preFetchBatchI = 0  # this counter within the prefetched batches only
 83 |         while preFetchBatchI < int(len(Y)/args.batchSize) - accumulateStepSize:
 84 | 
 85 |             # Accumulated gradients used
 86 |             optimizer.zero_grad()
 87 |             for _ in range(accumulateStepSize):
 88 |                 batchI += 1
 89 |                 preFetchBatchI += 1
 90 |                 # fwd + bckwd + optim
 91 |                 output = net(X[preFetchBatchI*args.batchSize:(preFetchBatchI+1)*args.batchSize,:,:].permute(0,2,1), eps)
 92 |                 loss = criterion(output, Y[preFetchBatchI*args.batchSize:(preFetchBatchI+1)*args.batchSize].squeeze())
 93 |                 if np.isnan(loss.item()):
 94 |                     print('Nan encountered at iter %d. Exiting..' %iter)
 95 |                     sys.exit(1)
 96 |                 loss.backward()
 97 |                 loggingLoss += loss.item()
 98 | 
 99 |             optimizer.step()    # Does the update
100 |             cyclic_lr_scheduler.step()
101 | 
102 |             # Log
103 |             if batchI-loggedBatch >= args.logStepSize:
104 |                 logStepTime = time.time() - start_time
105 |                 print('Batch: (%d/%d)     Avg Time/batch: %1.3f      Avg Loss/batch: %1.3f' %(
106 |                     batchI,
107 |                     numBatchesPerArk,
108 |                     logStepTime/(batchI-loggedBatch),
109 |                     loggingLoss/(batchI-loggedBatch)))
110 |                 loggingLoss = 0.0
111 |                 start_time = time.time()
112 |                 loggedBatch = batchI
113 | 
114 |     print('Archive processing time: %1.3f' %(time.time()-archive_start_time))
115 |     # Update dropout
116 |     if 1.0*step < args.stepFrac*totalSteps:
117 |         p_drop = args.pDropMax*step/(args.stepFrac*totalSteps)
118 |     else:
119 |         p_drop = max(0,args.pDropMax*(2*step - totalSteps*(args.stepFrac+1))/(totalSteps*(args.stepFrac-1))) # fast decay
120 |     for x in net.modules():
121 |         if isinstance(x, torch.nn.Dropout):
122 |             x.p = p_drop
123 |     print('Dropout updated to %f' %p_drop)
124 | 
125 |     # Save checkpoint
126 |     torch.save({
127 |         'step': step,
128 |         'archiveI':archiveI,
129 |         'model_state_dict': net.state_dict(),
130 |         'optimizer_state_dict': optimizer.state_dict(),
131 |         'loss': loss,
132 |         'args': args,
133 |         }, '{}/checkpoint_step{}.tar'.format(saveDir, step))
134 | 
135 |     # Compute validation loss, update LR if using plateau rule
136 |     valAcc = computeValidAccuracy(args, saveDir)
137 |     print('Validation accuracy is %1.2f precent' %(valAcc))
138 | 
139 |     # Cleanup. We always retain the last 10 models
140 |     if step > 10:
141 |         if os.path.exists('%s/checkpoint_step%d.tar' %(saveDir,step-10)):
142 |             os.remove('%s/checkpoint_step%d.tar' %(saveDir,step-10))
143 |     step += 1
144 | 


--------------------------------------------------------------------------------
/egs/diarize_kaldi.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | : ' Date Created: Mar 27 2020
  4 |     Perform speaker diarization using kaldi xvectors
  5 | '
  6 | 
  7 | currDir=$PWD
  8 | kaldiDir=/home/manoj/kaldi
  9 | expDir=$currDir/exp_kaldi
 10 | wavDir=$currDir/demo_wav
 11 | rttmDir=$currDir/demo_rttm
 12 | wavList=$currDir/wavList
 13 | readlink -f $wavDir/* > $wavList
 14 | 
 15 | # Extraction parameters
 16 | window=1.5
 17 | window_period=0.75
 18 | min_segment=0.5
 19 | nnetDir=$kaldiDir/egs/voxceleb/v2/exp/xvector_nnet_1a/
 20 | transformDir=$nnetDir/xvectors_train/
 21 | 
 22 | # Evaluation parameters
 23 | method=plda # plda or SC (spectral clustering)
 24 | useOracleNumSpkr=1
 25 | useCollar=1
 26 | skipDataPrep=1
 27 | dataDir=$expDir/data
 28 | nj=16
 29 | 
 30 | if [[ "$method" == "SC" ]] && [[ ! -d Auto-Tuning-Spectral-Clustering ]]; then
 31 |   echo "Please install https://github.com/tango4j/Auto-Tuning-Spectral-Clustering"
 32 |   exit 1
 33 | fi
 34 | 
 35 | if [[ ! -d dscore ]]; then
 36 |   echo "Please install https://github.com/nryant/dscore"
 37 |   exit 1
 38 | fi
 39 | 
 40 | for f in sid steps utils local conf diarization; do
 41 |   [ ! -L $f ] && ln -s $kaldiDir/egs/voxceleb/v2/$f;
 42 | done
 43 | 
 44 | if [[ "$useCollar" == "1" ]]; then
 45 |   collarCmd="-1 -c 0.25"
 46 | else
 47 |   collarCmd=""
 48 | fi
 49 | 
 50 | . cmd.sh
 51 | . path.sh
 52 | 
 53 | # Kaldi directory preparation
 54 | if [ "$skipDataPrep" == "0" ]; then
 55 | 
 56 |   rm -rf $dataDir; mkdir -p $dataDir
 57 |   paste -d ' ' <(rev $wavList | cut -f 1 -d '/' | rev | sed "s/\.wav$/-rec/g") \
 58 |     <(cat $wavList | xargs readlink -f) > $dataDir/wav.scp
 59 |   paste -d ' ' <(cut -f 1 -d ' ' $dataDir/wav.scp | sed "s/-rec$//g") \
 60 |     <(cut -f 1 -d ' ' $dataDir/wav.scp | sed "s/-rec$//g") > $dataDir/utt2spk
 61 |   cp $dataDir/utt2spk $dataDir/spk2utt
 62 |   numUtts=`wc -l $dataDir/utt2spk | cut -f 1 -d ' '`
 63 |   paste -d ' ' <(cut -f 1 -d ' ' $dataDir/utt2spk) \
 64 |     <(cut -f 1 -d ' ' $dataDir/wav.scp) <(yes "0" | head -n $numUtts) <(cat $wavList | xargs soxi -D) \
 65 |     >  $dataDir/segments
 66 |   if [ "$useOracleNumSpkr" == "1" ]; then
 67 |     for rttmFile in $rttmDir/*.rttm; do
 68 |       n=`cut -f 8 -d ' ' $rttmFile | sort | uniq | wc -l`
 69 |       echo "`basename $rttmFile .rttm` $n" >> $dataDir/reco2num_spk
 70 |     done
 71 |   fi
 72 | 
 73 |   # Create VAD directory
 74 |   echo "Creating VAD files.."
 75 |   python convert_rttm_to_vad.py $wavDir $rttmDir $expDir/oracleVAD
 76 |   while read -r line; do
 77 |       uttID=`echo $line | cut -f 1 -d ' '`
 78 |       inVadFile=$expDir/oracleVAD/$uttID.csv # this change yet to be verified
 79 |       [ ! -f $inVadFile ] && { echo "Input vad file does not exist"; exit 0; }
 80 |       paste -d ' ' <(echo $uttID) <(cut -f 2 -d ',' $inVadFile | tr "\n" " " | sed "s/^/ [ /g" | sed "s/$/ ]/g") >> $dataDir/vad.txt
 81 |   done < $dataDir/utt2spk
 82 |   copy-vector ark,t:$dataDir/vad.txt ark,scp:$dataDir/vad.ark,$dataDir/vad.scp
 83 |   echo "Done"
 84 | 
 85 |   # Feature preparation pipeline, until train_combined_no_sil
 86 |   utils/fix_data_dir.sh $dataDir
 87 |   steps/make_mfcc.sh --nj $nj \
 88 |                  --cmd "$train_cmd" \
 89 |                  --mfcc-config conf/mfcc.conf \
 90 |                  --write-utt2num-frames true \
 91 |                  $dataDir || exit 1
 92 |   utils/fix_data_dir.sh $dataDir
 93 | 
 94 |   diarization/vad_to_segments.sh --nj $nj \
 95 |                  --cmd "$train_cmd" \
 96 |                  --segmentation-opts '--silence-proportion 0.01001' \
 97 |                  --min-duration 0.5 \
 98 |                  $dataDir $dataDir/segmented || exit 1
 99 | 
100 |   local/nnet3/xvector/prepare_feats.sh --nj $nj \
101 |                  --cmd "$train_cmd" \
102 |                  $dataDir/segmented \
103 |                  $dataDir/segmented_cmn \
104 |                  $dataDir/segmented_cmn/feats || exit 1
105 |   cp $dataDir/segmented/segments $dataDir/segmented_cmn/segments
106 |   utils/fix_data_dir.sh $dataDir/segmented_cmn
107 |   utils/split_data.sh $dataDir/segmented_cmn $nj
108 | 
109 |   # Use extract.py or kaldi's extract_xvectors.sh
110 |   diarization/nnet3/xvector/extract_xvectors.sh --nj $nj \
111 |                --cmd "$train_cmd --mem 5G" \
112 |                --window $window \
113 |                --period $window_period \
114 |                --apply-cmn false \
115 |                --min-segment $min_segment \
116 |                $nnetDir \
117 |                $dataDir/segmented_cmn \
118 |                $dataDir/kaldi_xvectors/
119 | 
120 | else
121 |   [ ! -f $dataDir/kaldi_xvectors/xvector.scp ] && echo "Cannot find features" && exit 1;
122 | fi
123 | 
124 | if [ "$method" == "plda" ]; then
125 | 
126 |   diarization/nnet3/xvector/score_plda.sh --nj $nj \
127 |                --cmd "$train_cmd" \
128 |                $transformDir \
129 |                $dataDir/kaldi_xvectors \
130 |                $expDir/plda/scoring
131 | 
132 |   diarization/cluster.sh --nj $nj \
133 |                --cmd "$train_cmd --mem 5G" \
134 |                --reco2num-spk $dataDir/reco2num_spk \
135 |                $expDir/plda/scoring \
136 |                $expDir/plda/clustering_oracleNumSpkr
137 | 
138 |   diarization/cluster.sh --nj $nj \
139 |               --cmd "$train_cmd --mem 5G" \
140 |               --threshold 0 \
141 |               $expDir/plda/scoring \
142 |               $expDir/plda/clustering_estNumSpkr
143 | 
144 | else
145 | 
146 |   # Compute the cosine affinity
147 |   cd Auto-Tuning-Spectral-Clustering/sc_utils
148 |   bash score_embedding.sh --cmd "$train_cmd" --nj 16 \
149 |                --python_env ~/virtualenv/keras_fixed/bin/activate \
150 |                --score_metric cos --out_dir $expDir/SC/cos_scores  \
151 |                $dataDir/kaldi_xvectors $expDir/SC/cos_scores
152 |   cd ..
153 | 
154 |   # Perform spectral clustering
155 |   python spectral_opt.py --affinity_score_file $expDir/SC/cos_scores/scores.scp \
156 |                --threshold 'None' --score_metric "cos" --max_speaker 10 \
157 |                --spt_est_thres 'NMESC' --reco2num_spk $dataDir/reco2num_spk \
158 |                --segment_file_input_path $dataDir/kaldi_xvectors/segments \
159 |                --spk_labels_out_path $expDir/SC/labels_oracleNumSpkr \
160 |                --sparse_search True
161 |   mkdir -p $expDir/SC/clustering_oracleNumSpkr
162 |   python sc_utils/make_rttm.py $dataDir/kaldi_xvectors/segments \
163 |     $expDir/SC/labels_oracleNumSpkr $expDir/SC/clustering_oracleNumSpkr/rttm
164 | 
165 |   python spectral_opt.py --affinity_score_file $expDir/SC/cos_scores/scores.scp \
166 |                --threshold 'None' --score_metric "cos" --max_speaker 10 \
167 |                --spt_est_thres 'NMESC' \
168 |                --segment_file_input_path $dataDir/kaldi_xvectors/segments \
169 |                --spk_labels_out_path $expDir/SC/labels_estNumSpkr \
170 |                --sparse_search True
171 |   mkdir -p $expDir/SC/clustering_estNumSpkr
172 |   python sc_utils/make_rttm.py $dataDir/kaldi_xvectors/segments \
173 |     $expDir/SC/labels_estNumSpkr $expDir/SC/clustering_estNumSpkr/rttm
174 |   cd ..
175 | 
176 | fi
177 | 
178 | 
179 | # Evaluation
180 | 
181 | sed -i "s/-rec//g" $expDir/$method/clustering_oracleNumSpkr/rttm
182 | sed -i "s/-rec//g" $expDir/$method/clustering_estNumSpkr/rttm
183 | 
184 | cd dscore/
185 | oracleResults=`python score.py $collarCmd -R <(ls $rttmDir/*) \
186 |   -S <(ls $expDir/$method/clustering_oracleNumSpkr/rttm) |\
187 |   grep OVERALL | tr -s ' ' | cut -f 4-5 -d ' '`
188 | estResults=`python score.py $collarCmd -R <(ls $rttmDir/*) \
189 |   -S <(ls $expDir/$method/clustering_estNumSpkr/rttm) |\
190 |   grep OVERALL | tr -s ' ' | cut -f 4-5 -d ' '`
191 | cd ..
192 | rm $wavList
193 | 
194 | echo "DER with Oracle #Spkrs: `echo $oracleResults | cut -f 1 -d ' '`"
195 | echo "DER with Est #Spkrs   : `echo $estResults | cut -f 1 -d ' '`"
196 | 


--------------------------------------------------------------------------------
/egs/diarize.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | : ' Date Created: Mar 30 2020
  4 |     Perform speaker diarization using pytorch embeddings
  5 | '
  6 | 
  7 | source /usr/usc/sox/14.4.2/setup.sh;
  8 | currDir=$PWD
  9 | kaldiDir=/home/manoj/kaldi
 10 | expDir=/scratch/prabakar/par3_pytorch_xvectors/egs/exp
 11 | wavDir=/scratch/prabakar/par3_pytorch_xvectors/egs/adosMod3_wav
 12 | rttmDir=/scratch/prabakar/par3_pytorch_xvectors/egs/adosMod3_rttm
 13 | wavList=$currDir/wavList
 14 | readlink -f $wavDir/* > $wavList
 15 | 
 16 | # Extraction parameters
 17 | window=1.5
 18 | window_period=0.75
 19 | min_segment=0.5
 20 | modelDir=/scratch/prabakar/par3_pytorch_xvectors/egs/../models/temp_eval/
 21 | transformDir=../xvectors/xvec_preTrained_fc2/train
 22 | 
 23 | # Evaluation parameters
 24 | modelType=relation_encoder_xvecTDNN
 25 | layerName=fc3
 26 | method=SC
 27 | useCollar=0
 28 | skipDataPrep=0
 29 | dataDir=$expDir/data
 30 | nj=8
 31 | 
 32 | if [[ "$method" == "SC" ]] && [[ ! -d Auto-Tuning-Spectral-Clustering ]]; then
 33 |   echo "Please install https://github.com/tango4j/Auto-Tuning-Spectral-Clustering"
 34 |   exit 1
 35 | fi
 36 | 
 37 | if [[ ! -d dscore ]]; then
 38 |   echo "Please install https://github.com/nryant/dscore"
 39 |   exit 1
 40 | fi
 41 | 
 42 | for f in sid steps utils conf diarization; do
 43 |   [ ! -L $f ] && ln -s $kaldiDir/egs/voxceleb/v2/$f;
 44 | done
 45 | 
 46 | if [[ "$useCollar" == "1" ]]; then
 47 |   collarCmd="--collar 0.25 --ignore_overlaps"
 48 | else
 49 |   collarCmd="--ignore_overlaps"
 50 | fi
 51 | 
 52 | . cmd.sh
 53 | . path.sh
 54 | 
 55 | # Kaldi directory preparation
 56 | if [ "$skipDataPrep" == "0" ]; then
 57 | 
 58 |   rm -rf $expDir; mkdir -p $dataDir
 59 |   paste -d ' ' <(rev $wavList | cut -f 1 -d '/' | rev | sed "s/\.wav$/-rec/g") \
 60 |     <(cat $wavList | xargs readlink -f) > $dataDir/wav.scp
 61 |   paste -d ' ' <(cut -f 1 -d ' ' $dataDir/wav.scp | sed "s/-rec$//g") \
 62 |     <(cut -f 1 -d ' ' $dataDir/wav.scp | sed "s/-rec$//g") > $dataDir/utt2spk
 63 |   cp $dataDir/utt2spk $dataDir/spk2utt
 64 |   numUtts=`wc -l $dataDir/utt2spk | cut -f 1 -d ' '`
 65 |   paste -d ' ' <(cut -f 1 -d ' ' $dataDir/utt2spk) \
 66 |     <(cut -f 1 -d ' ' $dataDir/wav.scp) <(yes "0" | head -n $numUtts) <(cat $wavList | xargs soxi -D) \
 67 |     >  $dataDir/segments
 68 |   for rttmFile in $rttmDir/*.rttm; do
 69 |     n=`cut -f 8 -d ' ' $rttmFile | sort | uniq | wc -l`
 70 |     echo "`basename $rttmFile .rttm` $n" >> $dataDir/reco2num_spk
 71 |   done
 72 | 
 73 |   # Create VAD directory
 74 |   echo "Creating VAD files.."
 75 |   python convert_rttm_to_vad.py $wavDir $rttmDir $expDir/oracleVAD
 76 |   while read -r line; do
 77 |       uttID=`echo $line | cut -f 1 -d ' '`
 78 |       inVadFile=$expDir/oracleVAD/$uttID.csv
 79 |       [ ! -f $inVadFile ] && { echo "Input vad file does not exist"; exit 0; }
 80 |       paste -d ' ' <(echo $uttID) <(cut -f 2 -d ',' $inVadFile | tr "\n" " " | sed "s/^/ [ /g" | sed "s/$/ ]/g") >> $dataDir/vad.txt
 81 |   done < $dataDir/utt2spk
 82 |   copy-vector ark,t:$dataDir/vad.txt ark,scp:$dataDir/vad.ark,$dataDir/vad.scp
 83 |   echo "Done"
 84 | 
 85 |   # Feature preparation pipeline, until train_combined_no_sil
 86 |   utils/fix_data_dir.sh $dataDir
 87 |   steps/make_mfcc.sh --nj $nj \
 88 |                  --cmd "$train_cmd" \
 89 |                  --mfcc-config conf/mfcc.conf \
 90 |                  --write-utt2num-frames true \
 91 |                  $dataDir || exit 1
 92 |   utils/fix_data_dir.sh $dataDir
 93 | 
 94 |   diarization/vad_to_segments.sh --nj $nj \
 95 |                  --cmd "$train_cmd" \
 96 |                  --segmentation-opts '--silence-proportion 0.01001' \
 97 |                  --min-duration 0.5 \
 98 |                  $dataDir $dataDir/segmented || exit 1
 99 | 
100 |   local/nnet3/xvector/prepare_feats.sh --nj $nj \
101 |                  --cmd "$train_cmd" \
102 |                  $dataDir/segmented \
103 |                  $dataDir/segmented_cmn \
104 |                  $dataDir/segmented_cmn/feats || exit 1
105 |   cp $dataDir/segmented/segments $dataDir/segmented_cmn/segments
106 |   utils/fix_data_dir.sh $dataDir/segmented_cmn
107 |   utils/split_data.sh $dataDir/segmented_cmn $nj
108 | 
109 |   # Compute the subsegments directory
110 |   utils/data/get_uniform_subsegments.py \
111 |                --max-segment-duration=$window \
112 |                --overlap-duration=$(perl -e "print ($window-$window_period);") \
113 |                --max-remaining-duration=$min_segment \
114 |                --constant-duration=True \
115 |               $dataDir/segmented_cmn/segments > $dataDir/segmented_cmn/subsegments
116 |   utils/data/subsegment_data_dir.sh $dataDir/segmented_cmn \
117 |     $dataDir/segmented_cmn/subsegments $dataDir/pytorch_xvectors/subsegments
118 |   utils/split_data.sh $dataDir/pytorch_xvectors/subsegments $nj
119 | 
120 |   # Extract x-vectors
121 |   cd ..
122 |   python extract.py -modelType $modelType -layerName $layerName $modelDir \
123 |     $dataDir/pytorch_xvectors/subsegments \
124 |     $dataDir/pytorch_xvectors || exit 1
125 |   cd egs/
126 | 
127 |   for f in segments utt2spk spk2utt; do
128 |     cp $dataDir/pytorch_xvectors/subsegments/$f $dataDir/pytorch_xvectors/$f
129 |   done
130 | 
131 | else
132 |   [ ! -f $dataDir/pytorch_xvectors/xvector.scp ] && echo "Cannot find features" && exit 1;
133 | fi
134 | 
135 | if [ "$method" == "plda" ]; then
136 | 
137 |   diarization/nnet3/xvector/score_plda.sh --nj 16 \
138 |                --cmd "$train_cmd" \
139 |                $transformDir \
140 |                $dataDir/pytorch_xvectors \
141 |                $expDir/plda/scoring
142 | 
143 |   diarization/cluster.sh --nj 8 \
144 |                --cmd "$train_cmd --mem 5G" \
145 |                --reco2num-spk $dataDir/reco2num_spk \
146 |                $expDir/plda/scoring \
147 |                $expDir/plda/clustering_oracleNumSpkr
148 | 
149 |   diarization/cluster.sh --nj 8 \
150 |                --cmd "$train_cmd --mem 5G" \
151 |                --threshold 0 \
152 |                $expDir/plda/scoring \
153 |                $expDir/plda/clustering_estNumSpkr
154 | 
155 | else
156 | 
157 |   # Compute the cosine affinity
158 |   cd Auto-Tuning-Spectral-Clustering/sc_utils
159 |   rm -rf $expDir/SC/cos_scores
160 |   bash score_embedding.sh --cmd "$train_cmd" --nj 16 \
161 |                --score_metric cos --out_dir $expDir/SC/cos_scores  \
162 |                $dataDir/pytorch_xvectors $expDir/SC/cos_scores || exit 1
163 |   cd ..
164 | 
165 |   # Perform spectral clustering
166 |   rm -rf $expDir/SC/labels_oracleNumSpkr
167 |   python spectral_opt.py --affinity_score_file $expDir/SC/cos_scores/scores.scp \
168 |                --threshold 'None' --score_metric "cos" --max_speaker 10 \
169 |                --spt_est_thres 'NMESC' --reco2num_spk $dataDir/reco2num_spk \
170 |                --segment_file_input_path $dataDir/pytorch_xvectors/segments \
171 |                --spk_labels_out_path $expDir/SC/labels_oracleNumSpkr \
172 |                --sparse_search True || exit 1
173 |   mkdir -p $expDir/SC/clustering_oracleNumSpkr
174 |   python sc_utils/make_rttm.py $dataDir/pytorch_xvectors/segments \
175 |     $expDir/SC/labels_oracleNumSpkr $expDir/SC/clustering_oracleNumSpkr/rttm
176 | 
177 |   rm -rf $expDir/SC/labels_estNumSpkr
178 |   python spectral_opt.py --affinity_score_file $expDir/SC/cos_scores/scores.scp \
179 |                --threshold 'None' --score_metric "cos" --max_speaker 10 \
180 |                --spt_est_thres 'NMESC' \
181 |                --segment_file_input_path $dataDir/pytorch_xvectors/segments \
182 |                --spk_labels_out_path $expDir/SC/labels_estNumSpkr \
183 |                --sparse_search True || exit 1
184 |   mkdir -p $expDir/SC/clustering_estNumSpkr
185 |   python sc_utils/make_rttm.py $dataDir/pytorch_xvectors/segments \
186 |     $expDir/SC/labels_estNumSpkr $expDir/SC/clustering_estNumSpkr/rttm
187 |   cd ..
188 | 
189 | fi
190 | 
191 | # Evaluation
192 | 
193 | sed -i "s/-rec//g" $expDir/$method/clustering_oracleNumSpkr/rttm
194 | sed -i "s/-rec//g" $expDir/$method/clustering_estNumSpkr/rttm
195 | 
196 | cd dscore/
197 | oracleResults=`python score.py $collarCmd -R <(ls $rttmDir/*) \
198 |   -S <(ls $expDir/$method/clustering_oracleNumSpkr/rttm) |\
199 |   grep OVERALL | tr -s ' ' | cut -f 4-5 -d ' '`
200 | estResults=`python score.py $collarCmd -R <(ls $rttmDir/*) \
201 |   -S <(ls $expDir/$method/clustering_estNumSpkr/rttm) |\
202 |   grep OVERALL | tr -s ' ' | cut -f 4-5 -d ' '`
203 | cd ..
204 | rm $wavList
205 | 
206 | echo "DER with Oracle #Spkrs: `echo $oracleResults | cut -f 1 -d ' '`"
207 | echo "DER with Est #Spkrs   : `echo $estResults | cut -f 1 -d ' '`"
208 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## <div align="center">Deep speaker embeddings in PyTorch</div>
  2 | 
  3 |  * [Requirements:](#requirements)
  4 |        * [Other Tools:](#other-tools)
  5 |  * [Installation:](#installation)
  6 |  * [Data preparation](#data-preparation)
  7 |     * [Training data preparation](#training-data-preparation)
  8 |     * [Dataset for data augmentation](#dataset-for-data-augmentation)
  9 |  * [Training](#training)
 10 |  * [Embedding extraction](#embedding-extraction)
 11 |  * [Pretrained model](#pretrained-model)
 12 |     * [Downloading](#downloading)
 13 |     * [Speaker Verification](#speaker-verification)
 14 |     * [Speaker Diarization](#speaker-diarization)
 15 |  * [Results](#results)
 16 |     * [1. Speaker Verification (%EER)](#1-speaker-verification-eer)
 17 |     * [2. Speaker Diarization (%DER)](#2-speaker-diarization-der)
 18 | 
 19 | 
 20 | 
 21 | This repository contains code and models for training an x-vector speaker recognition model using Kaldi for feature preparation and PyTorch for DNN model training. MFCC feature configurations and TDNN model architecture follow the Voxceleb recipe in Kaldi (commit hash `9b4dc93c9`). Training procedures including optimizer and step count are similar to, but not exactly the same as Kaldi.
 22 | 
 23 | Additionally, code for training meta-learning embeddings is available in [train_proto.py](train_proto.py) and [train_relation.py](train_relation.py). An overview of these models is available at [https://arxiv.org/abs/2007.16196](https://arxiv.org/abs/2007.16196) and in the figure below:
 24 | 
 25 | ![Overview: Meta Learning Models](figs/meta_learning_arch.png)
 26 | 
 27 | 
 28 | ### Citation
 29 | 
 30 | If you found this toolkit useful in your research, consider citing the following:
 31 | 
 32 | ```
 33 | @misc{kumar2020designing,
 34 |     title={Designing Neural Speaker Embeddings with Meta Learning},
 35 |     author={Manoj Kumar and Tae Jin-Park and Somer Bishop and Catherine Lord and Shrikanth Narayanan},
 36 |     year={2020},    
 37 |     eprint={2007.16196},
 38 |     archivePrefix={arXiv}  
 39 | }
 40 | ```
 41 | 
 42 | ### Requirements:
 43 | Python Libraries
 44 | ```
 45 | python==3.6.10
 46 | torch==1.4.0
 47 | kaldiio==2.15.1
 48 | kaldi-python-io==1.0.4
 49 | ```
 50 | 
 51 | ##### Other Tools:
 52 | 
 53 | * Spectral Clustering using normalized maximum eigengap [GitHub](https://github.com/tango4j/Auto-Tuning-Spectral-Clustering)
 54 |   * Used for speaker clustering during diarization
 55 | * Diarization scoring tool [GitHub](https://github.com/nryant/dscore)
 56 |   * Used for computing diarization error rate (DER)
 57 | 
 58 | 
 59 | 
 60 | ### Installation:
 61 | 
 62 | * Install the python libraries listed in [Requirements](#requirements)
 63 | * Install [Kaldi toolkit](https://github.com/kaldi-asr/kaldi/blob/master/INSTALL).
 64 |   * This repository is tested with commit hash `9b4dc93c9` of the above [Kaldi repository](https://github.com/kaldi-asr/kaldi/blob/master/INSTALL).
 65 |   * Kaldi is recommended to be installed in `$HOME/kaldi`.
 66 | * Download this repository. NOTE: Destination need not be inside Kaldi installation.
 67 | * Set the `voxcelebDir` variable inside [pytorch_run.sh](pytorch_run.sh)
 68 | * (Optional) Install the Other Tools listed in [Requirements](#requirements)
 69 | 
 70 | ### Data preparation
 71 | 
 72 | #### Training data preparation
 73 | 
 74 | * Training features are expected in Kaldi nnet3 egs format, and read using the `nnet3EgsDL` class defined in [train_utils.py](train_utils.py).
 75 | * The voxceleb recipe is provided in [pytorch_run.sh](pytorch_run.sh) to prepare them.
 76 | * Extracted embeddings are written in Kaldi vector format, similar to `xvector.ark`.
 77 | 
 78 | #### Dataset for data augmentation
 79 | 
 80 | [pytorch_run.sh](pytorch_run.sh) script augments the training data using the following two datasets.
 81 | * Download [MUSAN](https://openslr.org/17/) and extract to ./musan.
 82 | * Download [RIRS_NOISES](https://openslr.org/28/) and extract to ./RIRS_NOISES.
 83 | 
 84 | 
 85 | ### Training
 86 | ```
 87 | CUDA_VISIBLE_DEVICES=0 python -m torch.distributed.launch --nproc_per_node=1 train_xent.py <egsDir>
 88 | ```
 89 | ```
 90 | usage: train_xent.py [-h] [--local_rank LOCAL_RANK] [-modelType MODELTYPE]
 91 |                      [-featDim FEATDIM] [-resumeTraining RESUMETRAINING]
 92 |                      [-resumeModelDir RESUMEMODELDIR]
 93 |                      [-numArchives NUMARCHIVES] [-numSpkrs NUMSPKRS]
 94 |                      [-logStepSize LOGSTEPSIZE] [-batchSize BATCHSIZE]
 95 |                      [-numEgsPerArk NUMEGSPERARK]
 96 |                      [-preFetchRatio PREFETCHRATIO]
 97 |                      [-optimMomentum OPTIMMOMENTUM] [-baseLR BASELR]
 98 |                      [-maxLR MAXLR] [-numEpochs NUMEPOCHS]
 99 |                      [-noiseEps NOISEEPS] [-pDropMax PDROPMAX]
100 |                      [-stepFrac STEPFRAC]
101 |                      egsDir
102 | 
103 | positional arguments:
104 |   egsDir                Directory with training archives
105 | 
106 | optional arguments:
107 |   -h, --help            show this help message and exit
108 |   --local_rank LOCAL_RANK
109 |   -modelType MODELTYPE  Refer train_utils.py
110 |   -featDim FEATDIM      Frame-level feature dimension
111 |   -resumeTraining RESUMETRAINING
112 |                         (1) Resume training, or (0) Train from scratch
113 |   -resumeModelDir RESUMEMODELDIR
114 |                         Path containing training checkpoints
115 |   -numArchives NUMARCHIVES
116 |                         Number of egs.*.ark files
117 |   -numSpkrs NUMSPKRS    Number of output labels
118 |   -logStepSize LOGSTEPSIZE
119 |                         Iterations per log
120 |   -batchSize BATCHSIZE  Batch size
121 |   -numEgsPerArk NUMEGSPERARK
122 |                         Number of training examples per egs file
123 |   -preFetchRatio PREFETCHRATIO
124 |                         xbatchSize to fetch from dataloader
125 |   -optimMomentum OPTIMMOMENTUM
126 |                         Optimizer momentum
127 |   -baseLR BASELR        Initial LR
128 |   -maxLR MAXLR          Maximum LR
129 |   -numEpochs NUMEPOCHS  Number of training epochs
130 |   -noiseEps NOISEEPS    Noise strength before pooling
131 |   -pDropMax PDROPMAX    Maximum dropout probability
132 |   -stepFrac STEPFRAC    Training iteration when dropout = pDropMax
133 | 
134 | ```
135 | `egsDir` contains the nnet3 egs files.
136 | 
137 | ### Embedding extraction
138 | ```
139 | usage: extract.py [-h] [-modelType MODELTYPE] [-numSpkrs NUMSPKRS]
140 |                   modelDirectory featDir embeddingDir
141 | 
142 | positional arguments:
143 |   modelDirectory        Directory containing the model checkpoints
144 |   featDir               Directory containing features ready for extraction
145 |   embeddingDir          Output directory
146 | 
147 | optional arguments:
148 |   -h, --help            show this help message and exit
149 |   -modelType MODELTYPE  Refer train_utils.py
150 |   -numSpkrs NUMSPKRS    Number of output labels for model
151 | ```
152 | The script [pytorch_run.sh](pytorch_run.sh) can be used to train embeddings on the voxceleb recipe on an end-to-end basis.
153 | 
154 | ### Pretrained model
155 | 
156 | #### Downloading
157 | Two ways to download the pre-trained model:
158 | 1. Google Drive [link](https://drive.google.com/file/d/1gbAWDdWN_pkOim4rWVXUlfuYjfyJqUHZ/view?usp=sharing) *(or)*
159 | 2. Command line ([reference](https://medium.com/@acpanjan/download-google-drive-files-using-wget-3c2c025a8b99))
160 |     ```
161 |     wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1gbAWDdWN_pkOim4rWVXUlfuYjfyJqUHZ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1gbAWDdWN_pkOim4rWVXUlfuYjfyJqUHZ" -O preTrainedModel.zip && rm -rf /tmp/cookies.txt
162 |     ```
163 | 
164 | #### Speaker Verification
165 | To reproduce voxceleb EER results with the pretrained model, follow the below steps.
166 | NOTE: The voxceleb features must be prepared using `prepare_feats_for_egs.sh` prior to evaluation.
167 | 
168 | 1) Extract `models/` and `xvectors/` from the pre-trained archive into the installation directory
169 | 2) Set the following variables in [pytorch_run.sh](pytorch_run.sh):
170 |     ```
171 |     modelDir=models/xvec_preTrained
172 |     trainFeatDir=data/train_combined_no_sil
173 |     trainXvecDir=xvectors/xvec_preTrained/train
174 |     testFeatDir=data/voxceleb1_test_no_sil
175 |     testXvecDir=xvectors/xvec_preTrained/test
176 |     ```
177 | 3) Extract embeddings and compute EER, minDCF. Set `stage=7` in [pytorch_run.sh](pytorch_run.sh) and execute:
178 |    ```
179 |    bash pytorch_run.sh
180 |    ```
181 | 4) Alternatively, a pretrained PLDA model is available inside the `xvectors/train` directory. Set `stage=9` in [pytorch_run.sh](pytorch_run.sh) and execute:
182 |    ```
183 |    bash pytorch_run.sh
184 |    ```
185 | #### Speaker Diarization
186 | 
187 | ```
188 | cd egs/
189 | ```
190 | Place the audio files to diarize and their corresponding RTTM files in `demo_wav/` and `demo_rttm/` directories. Execute:
191 | ```
192 | bash diarize.sh
193 | ```
194 | 
195 | ### Results
196 | 
197 | #### 1. Speaker Verification (%EER)
198 | 
199 | |         | Kaldi           | pytorch_xvectors  |
200 | |:-------------|:-------------:|:-----:|
201 | | Vox1-test      | 3.13 | 2.82 |
202 | | VOICES-dev      | 10.30 | 8.59 |
203 | 
204 | 
205 | #### 2. Speaker Diarization (%DER)
206 | 
207 | NOTE: Clustering using [https://github.com/tango4j/Auto-Tuning-Spectral-Clustering](https://github.com/tango4j/Auto-Tuning-Spectral-Clustering)
208 | 
209 | |         | Kaldi           | pytorch_xvectors  |
210 | |:-------------|:-------------:|:-----:|
211 | | DIHARD2 dev (no collar, oracle #spk)      | 26.97 | 27.50 |
212 | | DIHARD2 dev (no collar, est #spk)      | 24.49 | 24.66 |
213 | | AMI dev+test (26 meetings, collar, oracle #spk) | 6.39 | 6.30 |
214 | | AMI dev+test (26 meetings, collar, est #spk) | 7.29 | 10.14 |
215 | 


--------------------------------------------------------------------------------
/pytorch_run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash	
  2 | # Copyright   2017   Johns Hopkins University (Author: Daniel Garcia-Romero)
  3 | #             2017   Johns Hopkins University (Author: Daniel Povey)
  4 | #        2017-2018   David Snyder	
  5 | #             2018   Ewald Enzinger
  6 | #             2020   Manoj Kumar	
  7 | # Apache 2.0.	
  8 | 
# Grabs Kaldi directory and replace the first line of path.sh
# NOTE: this rewrites path.sh in place on every run (sed '1c' replaces line 1),
# so KALDI_ROOT always ends up pointing at $HOME/kaldi.
EXPORT_LINE="export KALDI_ROOT=$HOME/kaldi"
sed -i '1c\'"$EXPORT_LINE" path.sh

# Location of the Kaldi voxceleb v2 recipe whose helper directories are linked below.
voxcelebDir=$HOME/kaldi/egs/voxceleb/v2/
# configFile is not referenced anywhere else in this script.
configFile=local.config

# Get symlinks, if not present
# (only an existing symlink is skipped; the link is created in the current directory)
for f in sid steps utils local conf; do
  [ ! -L $f ] && ln -s $voxcelebDir/$f;
done

# cmd.sh and path.sh are sourced after the symlinks exist; $train_cmd used by the
# stages below is presumably defined by cmd.sh -- verify there.
. ./cmd.sh
. ./path.sh

# MFCC features and VAD decisions are written to the same directory.
mfccdir=mfcc
vaddir=mfcc

# The trials file is downloaded by local/make_voxceleb1_v2.pl.
voxceleb1_trials=data/voxceleb1_test/trials
# Placeholders: stage 0 refuses to run until these point at the real corpora.
voxceleb1_root=/path/to/VOXCELEB1
voxceleb2_root=/path/to/VOXCELEB2
# Augmentation corpora are expected inside the current directory.
musan_root=$PWD/musan
RIRS_NOISES_root=$PWD/RIRS_NOISES

# Model checkpoint directory, and the feature / embedding directories for the
# training and test sets.
modelDir=models/xvec_preTrained
trainFeatDir=data/train_combined_no_sil
trainXvecDir=xvectors/xvec_preTrained/train
testFeatDir=data/voxceleb1_test_no_sil
testXvecDir=xvectors/xvec_preTrained/test
# Every "if [ $stage -le N ]" block with N >= stage is executed; with stage=7 the
# script starts at embedding extraction.
stage=7
 40 | 
 41 | if [ $stage -le 0 ]; then
 42 |    if [ "$voxceleb1_root" = "/path/to/VOXCELEB1" ] || [ "$voxceleb2_root" = "path/to/VOXCELEB2" ]; then
 43 |          echo "ERROR: VOXCELEB1&2 data path should be specified."
 44 |          exit 1
 45 |    fi
 46 |   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
 47 |   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
 48 |   
 49 |   # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
 50 |   # Our evaluation set is the test portion of VoxCeleb1.
 51 |   local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
 52 |   local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
 53 |   # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
 54 |   # local/make_voxceleb1.pl $voxceleb1_root data
 55 |   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
 56 |   # This should give 7,323 speakers and 1,276,888 utterances.
 57 |   utils/combine_data.sh $PWD/data/train $PWD/data/voxceleb2_train $PWD/data/voxceleb2_test $PWD/data/voxceleb1_train
 58 | fi
 59 | 
# Stage 1: feature extraction and voice activity detection for the clean
# training set and the VoxCeleb1 test set.
if [ $stage -le 1 ]; then
  # Make MFCCs and compute the energy-based VAD for each dataset
  for name in train voxceleb1_test; do
    # --write-utt2num-frames produces the utt2num_frames file that stage 2
    # reads for data/train.
    steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
      $PWD/data/${name} $PWD/exp/make_mfcc $mfccdir
    utils/fix_data_dir.sh $PWD/data/${name}
    sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
      $PWD/data/${name} exp/make_vad $vaddir
    # fix_data_dir.sh is run again after the VAD step.
    utils/fix_data_dir.sh $PWD/data/${name}
  done
fi
 71 | 
# In this section, we augment the VoxCeleb2 data with reverberation,
# noise, music, and babble, and combine it with the clean data.
# Requires the RIRS_NOISES and MUSAN corpora at $RIRS_NOISES_root and $musan_root;
# the stage exits with status 1 if either directory is missing.
if [ $stage -le 2 ]; then
  # Convert frame counts into durations in seconds (frames * frame_shift).
  frame_shift=0.01
  awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur

  if [[ ! -d "$RIRS_NOISES_root" ]]; then
      echo "ERROR: RIRS_NOISES noise dataset directory is not setup."
      exit 1
  fi

  # Make a version with reverberated speech
  # (two simulated RIR sets, small and medium rooms, each given the value 0.5)
  rvb_opts=()
  rvb_opts+=(--rir-set-parameters "0.5, $RIRS_NOISES_root/simulated_rirs/smallroom/rir_list")
  rvb_opts+=(--rir-set-parameters "0.5, $RIRS_NOISES_root/simulated_rirs/mediumroom/rir_list")

  # Make a reverberated version of the VoxCeleb2 list.  Note that we don't add any
  # additive noise here.
  steps/data/reverberate_data_dir.py \
    "${rvb_opts[@]}" \
    --speech-rvb-probability 1 \
    --pointsource-noise-addition-probability 0 \
    --isotropic-noise-addition-probability 0 \
    --num-replications 1 \
    --source-sampling-rate 16000 \
    data/train data/train_reverb
  # Reuse the clean VAD decisions, then rebuild the directory with "-reverb"
  # appended to every utterance id and swap it into place.
  cp data/train/vad.scp data/train_reverb/
  utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new
  rm -rf data/train_reverb
  mv data/train_reverb.new data/train_reverb

  # Prepare the MUSAN corpus, which consists of music, speech, and noise
  # suitable for augmentation.
  
  if [[ ! -d "$musan_root" ]]; then
      echo "ERROR: MUSAN noise dataset directory is not setup."
      exit 1
  fi
  steps/data/make_musan.sh --sampling-rate 16000 $musan_root data

  # Get the duration of the MUSAN recordings.  This will be used by the
  # script augment_data_dir.py.
  for name in speech noise music; do
    utils/data/get_utt2dur.sh data/musan_${name}
    # The durations are stored under the name reco2dur.
    mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
  done

  # Augment with musan_noise
  steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise
  # Augment with musan_music
  steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music
  # Augment with musan_speech
  steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble

  # Combine reverb, noise, music, and babble into one directory.
  utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble
fi
129 | 
130 | 
# Stage 3: subsample the augmented data, extract its MFCCs and merge it with
# the clean training data into data/train_combined.
if [ $stage -le 3 ]; then
  # Take a random subset of the augmentations
  # (1000000 utterances, written to data/train_aug_1m)
  utils/subset_data_dir.sh data/train_aug 1000000 data/train_aug_1m
  utils/fix_data_dir.sh data/train_aug_1m

  # Make MFCCs for the augmented data.  Note that we do not compute a new
  # vad.scp file here.  Instead, we use the vad.scp from the clean version of
  # the list.
  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
    data/train_aug_1m exp/make_mfcc $mfccdir

  # Combine the clean and augmented VoxCeleb2 list.  This is now roughly
  # double the size of the original clean list.
  utils/combine_data.sh data/train_combined data/train_aug_1m data/train
fi
146 | 
147 | 
# Now we prepare the features to generate examples for xvector training.
# Outputs go to $trainFeatDir and $testFeatDir, which stage 7 reads for
# embedding extraction.
if [ $stage -le 4 ]; then
  # This script applies CMVN and removes nonspeech frames.  Note that this is somewhat
  # wasteful, as it roughly doubles the amount of training data on disk.  After
  # creating training examples, this can be removed.
  local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \
    data/train_combined $trainFeatDir exp/train_combined_no_sil
  utils/fix_data_dir.sh $trainFeatDir

  # Preparing the test features as well. This will be used only during testing
  local/nnet3/xvector/prepare_feats_for_egs.sh --nj 10 --cmd "$train_cmd" \
    data/voxceleb1_test $testFeatDir exp/voxceleb1_test_no_sil
  utils/fix_data_dir.sh $testFeatDir

fi
163 | 
# Stage 5: drop short utterances and speakers with too few utterances from
# $trainFeatDir, keeping utt2spk, spk2utt and utt2num_frames consistent.
if [ $stage -le 5 ]; then
  # Now, we need to remove features that are too short after removing silence
  # frames.  We keep only utterances with more than min_len (400) frames,
  # i.e. roughly 4s assuming a 10 ms frame shift.
  min_len=400
  # The unfiltered list is kept as utt2num_frames.bak.
  mv $trainFeatDir/utt2num_frames $trainFeatDir/utt2num_frames.bak
  awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' $trainFeatDir/utt2num_frames.bak > $trainFeatDir/utt2num_frames
  utils/filter_scp.pl $trainFeatDir/utt2num_frames $trainFeatDir/utt2spk > $trainFeatDir/utt2spk.new
  mv $trainFeatDir/utt2spk.new $trainFeatDir/utt2spk
  utils/fix_data_dir.sh $trainFeatDir

  # We also want several utterances per speaker. Now we'll throw out speakers
  # with fewer than 8 utterances.
  min_num_utts=8
  # spk2num holds "<speaker> <number of utterances>" (field count minus the speaker id).
  awk '{print $1, NF-1}' $trainFeatDir/spk2utt > $trainFeatDir/spk2num
  awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' $trainFeatDir/spk2num | utils/filter_scp.pl - $trainFeatDir/spk2utt > $trainFeatDir/spk2utt.new
  mv $trainFeatDir/spk2utt.new $trainFeatDir/spk2utt
  # Regenerate utt2spk from the filtered spk2utt.
  utils/spk2utt_to_utt2spk.pl $trainFeatDir/spk2utt > $trainFeatDir/utt2spk

  # Restrict utt2num_frames to the utterances that survived the speaker filter.
  utils/filter_scp.pl $trainFeatDir/utt2spk $trainFeatDir/utt2num_frames > $trainFeatDir/utt2num_frames.new
  mv $trainFeatDir/utt2num_frames.new $trainFeatDir/utt2num_frames

  # Now we're ready to create training examples.
  utils/fix_data_dir.sh $trainFeatDir
fi
188 | 
# Stage 6: generate nnet3 training examples (egs) and train the x-vector
# network with train_xent.py.
if [ $stage -le 6 ]; then

  # Prepare the egs
  sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \
    --nj 8 \
    --stage 0 \
    --frames-per-iter 1000000000 \
    --frames-per-iter-diagnostic 100000 \
    --min-frames-per-chunk 200 \
    --max-frames-per-chunk 400 \
    --num-diagnostic-archives 3 \
    --num-repeats 50 \
    $trainFeatDir exp/xvector_nnet_1a/egs/

  # Main DNN training
  CUDA_VISIBLE_DEVICES=0 python -m torch.distributed.launch --nproc_per_node=1 \
  train_xent.py exp/xvector_nnet_1a/egs/
  # Point modelDir at the most recently modified entry inside models/ so the
  # following stages use the model that was just trained. The listing must be
  # taken from models/ itself: listing the current directory would yield an
  # unrelated name (or "models"), producing a non-existent path.
  modelDir=models/$(ls -t models | head -n1)

fi
209 | 
210 | 
# Stage 7: extract embeddings for the training and test features with the
# model in $modelDir (extract.py arguments: modelDirectory featDir embeddingDir).
if [ $stage -le 7 ]; then

  python extract.py $modelDir $trainFeatDir $trainXvecDir
  python extract.py $modelDir $testFeatDir $testXvecDir

fi
217 | 
# Stage 8: train the scoring back-end on the training embeddings. Writes
# mean.vec, transform.mat and plda into $trainXvecDir, which stage 9 reads.
if [ $stage -le 8 ]; then

  # Reproducing voxceleb results
  # Compute the mean vector for centering the evaluation xvectors.
  $train_cmd $trainXvecDir/log/compute_mean.log \
    ivector-mean scp:$trainXvecDir/xvector.scp \
    $trainXvecDir/mean.vec

  # This script uses LDA to decrease the dimensionality prior to PLDA.
  lda_dim=200
  $train_cmd $trainXvecDir/log/lda.log \
    ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \
    "ark:ivector-subtract-global-mean scp:$trainXvecDir/xvector.scp ark:- |" \
    ark:$trainFeatDir/utt2spk $trainXvecDir/transform.mat

  # Train the PLDA model.
  # Input pipeline: mean subtraction -> LDA transform -> length normalization.
  $train_cmd $trainXvecDir/log/plda.log \
    ivector-compute-plda ark:$trainFeatDir/spk2utt \
    "ark:ivector-subtract-global-mean scp:$trainXvecDir/xvector.scp ark:- | transform-vec $trainXvecDir/transform.mat ark:- ark:- | ivector-normalize-length ark:-  ark:- |" \
    $trainXvecDir/plda

fi
240 | 
# Stage 9: score the VoxCeleb1 test trials with the PLDA back-end and print
# EER and minDCF.
if [ $stage -le 9 ]; then

  # Both sides of each trial are read from the same test xvector.scp and pass
  # through the same pipeline: subtract the training mean, apply the LDA
  # transform, length-normalize. Only the first two fields of the trials file
  # are passed to the scorer.
  $train_cmd $testXvecDir/log/voxceleb1_test_scoring.log \
    ivector-plda-scoring --normalize-length=true \
    "ivector-copy-plda --smoothing=0.0 $trainXvecDir/plda - |" \
    "ark:ivector-subtract-global-mean $trainXvecDir/mean.vec scp:$testXvecDir/xvector.scp ark:- | transform-vec $trainXvecDir/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "ark:ivector-subtract-global-mean $trainXvecDir/mean.vec scp:$testXvecDir/xvector.scp ark:- | transform-vec $trainXvecDir/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "cat '$voxceleb1_trials' | cut -d\  --fields=1,2 |" $testXvecDir/scores_voxceleb1_test

  # stderr of the metric tools is discarded, so a failing tool shows up as an
  # empty value in the printed results.
  eer=`compute-eer <(local/prepare_for_eer.py $voxceleb1_trials $testXvecDir/scores_voxceleb1_test) 2> /dev/null`
  mindcf1=`sid/compute_min_dcf.py --p-target 0.01 $testXvecDir/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null`
  mindcf2=`sid/compute_min_dcf.py --p-target 0.001 $testXvecDir/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null`
  echo "EER: $eer%"
  echo "minDCF(p-target=0.01): $mindcf1"
  echo "minDCF(p-target=0.001): $mindcf2"

fi
258 | 


--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python3.6
  2 | 
  3 | """
  4 |     Date Created: Feb 10 2020
  5 | 
  6 |     This file contains the model descriptions, including original x-vector
  7 |     architecture. The first two models are in active development. All others
  8 |     are provided below
  9 | """
 10 | 
 11 | import torch
 12 | import torch.nn as nn
 13 | from torch.nn import functional as F
 14 | 
 15 | 
 16 | class simpleTDNN(nn.Module):
 17 | 
 18 |     def __init__(self, numSpkrs, p_dropout):
 19 |         super(simpleTDNN, self).__init__()
 20 |         self.tdnn1 = nn.Conv1d(in_channels=30, out_channels=128, kernel_size=5, dilation=1)
 21 |         self.bn_tdnn1 = nn.BatchNorm1d(128, momentum=0.1, affine=False)
 22 |         self.dropout_tdnn1 = nn.Dropout(p=p_dropout)
 23 | 
 24 |         self.tdnn2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, dilation=2)
 25 |         self.bn_tdnn2 = nn.BatchNorm1d(128, momentum=0.1, affine=False)
 26 |         self.dropout_tdnn2 = nn.Dropout(p=p_dropout)
 27 | 
 28 |         self.tdnn3 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=1, dilation=1)
 29 |         self.bn_tdnn3 = nn.BatchNorm1d(128, momentum=0.1, affine=False)
 30 |         self.dropout_tdnn3 = nn.Dropout(p=p_dropout)
 31 | 
 32 |         self.fc1 = nn.Linear(2*128,128)
 33 |         self.bn_fc1 = nn.BatchNorm1d(128, momentum=0.1, affine=False)
 34 |         self.dropout_fc1 = nn.Dropout(p=p_dropout)
 35 | 
 36 |         self.fc2 = nn.Linear(128,64)
 37 |         self.bn_fc2 = nn.BatchNorm1d(64, momentum=0.1, affine=False)
 38 |         self.dropout_fc2 = nn.Dropout(p=p_dropout)
 39 | 
 40 |         self.fc3 = nn.Linear(64,numSpkrs)
 41 | 
 42 |     def forward(self, x, eps):
 43 |         # Note: x must be (batch_size, feat_dim, chunk_len)
 44 | 
 45 |         x = self.dropout_tdnn1(self.bn_tdnn1(F.relu(self.tdnn1(x))))
 46 |         x = self.dropout_tdnn2(self.bn_tdnn2(F.relu(self.tdnn2(x))))
 47 |         x = self.dropout_tdnn3(self.bn_tdnn3(F.relu(self.tdnn3(x))))
 48 | 
 49 |         if self.training:
 50 |             x = x + torch.randn(x.size()).cuda()*eps
 51 |         stats = torch.cat((x.mean(dim=2), x.std(dim=2)), dim=1)
 52 |         x = self.dropout_fc1(self.bn_fc1(F.relu(self.fc1(stats))))
 53 |         x = self.dropout_fc2(self.bn_fc2(F.relu(self.fc2(x))))
 54 |         x = self.fc3(x)
 55 |         return x
 56 | 
 57 | 
 58 | class xvecTDNN(nn.Module):
 59 | 
 60 |     def __init__(self, numSpkrs, p_dropout):
 61 |         super(xvecTDNN, self).__init__()
 62 |         self.tdnn1 = nn.Conv1d(in_channels=30, out_channels=512, kernel_size=5, dilation=1)
 63 |         self.bn_tdnn1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
 64 |         self.dropout_tdnn1 = nn.Dropout(p=p_dropout)
 65 | 
 66 |         self.tdnn2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=5, dilation=2)
 67 |         self.bn_tdnn2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
 68 |         self.dropout_tdnn2 = nn.Dropout(p=p_dropout)
 69 | 
 70 |         self.tdnn3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=7, dilation=3)
 71 |         self.bn_tdnn3 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
 72 |         self.dropout_tdnn3 = nn.Dropout(p=p_dropout)
 73 | 
 74 |         self.tdnn4 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=1, dilation=1)
 75 |         self.bn_tdnn4 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
 76 |         self.dropout_tdnn4 = nn.Dropout(p=p_dropout)
 77 | 
 78 |         self.tdnn5 = nn.Conv1d(in_channels=512, out_channels=1500, kernel_size=1, dilation=1)
 79 |         self.bn_tdnn5 = nn.BatchNorm1d(1500, momentum=0.1, affine=False)
 80 |         self.dropout_tdnn5 = nn.Dropout(p=p_dropout)
 81 | 
 82 |         self.fc1 = nn.Linear(3000,512)
 83 |         self.bn_fc1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
 84 |         self.dropout_fc1 = nn.Dropout(p=p_dropout)
 85 | 
 86 |         self.fc2 = nn.Linear(512,512)
 87 |         self.bn_fc2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
 88 |         self.dropout_fc2 = nn.Dropout(p=p_dropout)
 89 | 
 90 |         self.fc3 = nn.Linear(512,numSpkrs)
 91 | 
 92 |     def forward(self, x, eps):
 93 |         # Note: x must be (batch_size, feat_dim, chunk_len)
 94 | 
 95 |         x = self.dropout_tdnn1(self.bn_tdnn1(F.relu(self.tdnn1(x))))
 96 |         x = self.dropout_tdnn2(self.bn_tdnn2(F.relu(self.tdnn2(x))))
 97 |         x = self.dropout_tdnn3(self.bn_tdnn3(F.relu(self.tdnn3(x))))
 98 |         x = self.dropout_tdnn4(self.bn_tdnn4(F.relu(self.tdnn4(x))))
 99 |         x = self.dropout_tdnn5(self.bn_tdnn5(F.relu(self.tdnn5(x))))
100 | 
101 |         if self.training:
102 |             shape = x.size()
103 |             noise = torch.cuda.FloatTensor(shape)
104 |             torch.randn(shape, out=noise)
105 |             x += noise*eps
106 | 
107 |         stats = torch.cat((x.mean(dim=2), x.std(dim=2)), dim=1)
108 |         x = self.dropout_fc1(self.bn_fc1(F.relu(self.fc1(stats))))
109 |         x = self.dropout_fc2(self.bn_fc2(F.relu(self.fc2(x))))
110 |         x = self.fc3(x)
111 |         return x
112 | 
113 | 
class proto_xvecTDNN(nn.Module):
    """x-vector style TDNN encoder producing a 512-dimensional output: five
    1-D conv layers, statistics pooling and four fully connected layers.

    Args:
        numSpkrs: not used by this model; presumably kept so that all models
            share one constructor signature.
        p_dropout: dropout probability used after every hidden layer.
    """

    def __init__(self, numSpkrs, p_dropout):
        super(proto_xvecTDNN, self).__init__()
        self.tdnn1 = nn.Conv1d(in_channels=30, out_channels=512, kernel_size=5, dilation=1)
        self.bn_tdnn1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn1 = nn.Dropout(p=p_dropout)

        self.tdnn2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=5, dilation=2)
        self.bn_tdnn2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn2 = nn.Dropout(p=p_dropout)

        self.tdnn3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=7, dilation=3)
        self.bn_tdnn3 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn3 = nn.Dropout(p=p_dropout)

        self.tdnn4 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=1, dilation=1)
        self.bn_tdnn4 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn4 = nn.Dropout(p=p_dropout)

        self.tdnn5 = nn.Conv1d(in_channels=512, out_channels=1500, kernel_size=1, dilation=1)
        self.bn_tdnn5 = nn.BatchNorm1d(1500, momentum=0.1, affine=False)
        self.dropout_tdnn5 = nn.Dropout(p=p_dropout)

        # Statistics pooling concatenates mean and std of the 1500 channels.
        self.fc1 = nn.Linear(3000,512)
        self.bn_fc1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_fc1 = nn.Dropout(p=p_dropout)

        self.fc2 = nn.Linear(512,512)
        self.bn_fc2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_fc2 = nn.Dropout(p=p_dropout)

        self.fc3 = nn.Linear(512,512)
        self.bn_fc3 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_fc3 = nn.Dropout(p=p_dropout)

        self.fc4 = nn.Linear(512,512)

    def forward(self, x, eps):
        """Return a tensor of shape (batch_size, 512).

        Args:
            x: input of shape (batch_size, feat_dim, chunk_len) with
                feat_dim == 30 (the in_channels of the first conv layer).
            eps: scale of the Gaussian noise added to the frame-level
                activations before pooling; applied in training mode only.
        """
        # Each layer applies conv/linear -> ReLU -> batch norm -> dropout.
        x = self.dropout_tdnn1(self.bn_tdnn1(F.relu(self.tdnn1(x))))
        x = self.dropout_tdnn2(self.bn_tdnn2(F.relu(self.tdnn2(x))))
        x = self.dropout_tdnn3(self.bn_tdnn3(F.relu(self.tdnn3(x))))
        x = self.dropout_tdnn4(self.bn_tdnn4(F.relu(self.tdnn4(x))))
        x = self.dropout_tdnn5(self.bn_tdnn5(F.relu(self.tdnn5(x))))

        if self.training:
            # randn_like creates the noise on x's own device and dtype, so the
            # model is not tied to CUDA/float32; the out-of-place add avoids
            # mutating the previous layer's output.
            x = x + torch.randn_like(x) * eps

        # Statistics pooling over the time axis.
        stats = torch.cat((x.mean(dim=2), x.std(dim=2)), dim=1)
        x = self.dropout_fc1(self.bn_fc1(F.relu(self.fc1(stats))))
        x = self.dropout_fc2(self.bn_fc2(F.relu(self.fc2(x))))
        x = self.dropout_fc3(self.bn_fc3(F.relu(self.fc3(x))))
        x = self.fc4(x)
        return x
173 | 
174 | 
class relation_encoder_xvecTDNN(nn.Module):
    """x-vector style TDNN encoder producing a 512-dimensional output: five
    1-D conv layers, statistics pooling and three fully connected layers.

    Args:
        numSpkrs: not used by this model; presumably kept so that all models
            share one constructor signature.
        p_dropout: dropout probability used after every hidden layer.
    """

    def __init__(self, numSpkrs, p_dropout):
        super(relation_encoder_xvecTDNN, self).__init__()
        self.tdnn1 = nn.Conv1d(in_channels=30, out_channels=512, kernel_size=5, dilation=1)
        self.bn_tdnn1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn1 = nn.Dropout(p=p_dropout)

        self.tdnn2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=5, dilation=2)
        self.bn_tdnn2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn2 = nn.Dropout(p=p_dropout)

        self.tdnn3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=7, dilation=3)
        self.bn_tdnn3 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn3 = nn.Dropout(p=p_dropout)

        self.tdnn4 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=1, dilation=1)
        self.bn_tdnn4 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn4 = nn.Dropout(p=p_dropout)

        self.tdnn5 = nn.Conv1d(in_channels=512, out_channels=1500, kernel_size=1, dilation=1)
        self.bn_tdnn5 = nn.BatchNorm1d(1500, momentum=0.1, affine=False)
        self.dropout_tdnn5 = nn.Dropout(p=p_dropout)

        # Statistics pooling concatenates mean and std of the 1500 channels.
        self.fc1 = nn.Linear(3000,512)
        self.bn_fc1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_fc1 = nn.Dropout(p=p_dropout)

        self.fc2 = nn.Linear(512,512)
        self.bn_fc2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_fc2 = nn.Dropout(p=p_dropout)

        self.fc3 = nn.Linear(512,512)

    def forward(self, x, eps):
        """Return a tensor of shape (batch_size, 512).

        Args:
            x: input of shape (batch_size, feat_dim, chunk_len) with
                feat_dim == 30 (the in_channels of the first conv layer).
            eps: scale of the Gaussian noise added to the frame-level
                activations before pooling; applied in training mode only.
        """
        # Each layer applies conv/linear -> ReLU -> batch norm -> dropout.
        x = self.dropout_tdnn1(self.bn_tdnn1(F.relu(self.tdnn1(x))))
        x = self.dropout_tdnn2(self.bn_tdnn2(F.relu(self.tdnn2(x))))
        x = self.dropout_tdnn3(self.bn_tdnn3(F.relu(self.tdnn3(x))))
        x = self.dropout_tdnn4(self.bn_tdnn4(F.relu(self.tdnn4(x))))
        x = self.dropout_tdnn5(self.bn_tdnn5(F.relu(self.tdnn5(x))))

        if self.training:
            # randn_like creates the noise on x's own device and dtype, so the
            # model is not tied to CUDA/float32; the out-of-place add avoids
            # mutating the previous layer's output.
            x = x + torch.randn_like(x) * eps

        # Statistics pooling over the time axis.
        stats = torch.cat((x.mean(dim=2), x.std(dim=2)), dim=1)
        x = self.dropout_fc1(self.bn_fc1(F.relu(self.fc1(stats))))
        x = self.dropout_fc2(self.bn_fc2(F.relu(self.fc2(x))))
        x = self.fc3(x)
        return x
229 | 
230 | 
class relation_relation_xvecTDNN(nn.Module):
    """Three-layer fully connected network mapping a 1024-dimensional input
    to a single output value per example.

    The two hidden layers are 512 units wide and are each followed by ReLU and
    a non-affine batch normalization; the output layer is linear.
    """

    def __init__(self):
        super().__init__()
        # Hidden layer 1: 1024 -> 512
        self.fc1 = nn.Linear(1024, 512)
        self.bn_fc1 = nn.BatchNorm1d(512, affine=False, momentum=0.1)

        # Hidden layer 2: 512 -> 512
        self.fc2 = nn.Linear(512, 512)
        self.bn_fc2 = nn.BatchNorm1d(512, affine=False, momentum=0.1)

        # Output layer: 512 -> 1
        self.fc3 = nn.Linear(512, 1)

    def forward(self, x):
        """Return a tensor of shape (batch_size, 1) for x of shape
        (batch_size, 1024)."""
        hidden = self.fc1(x)
        hidden = self.bn_fc1(F.relu(hidden))
        hidden = self.fc2(hidden)
        hidden = self.bn_fc2(F.relu(hidden))
        return self.fc3(hidden)
248 | 
249 | """============================ OLD MODELS ==============================="""
250 | 
class simpleCNN(nn.Module):
    """Two-layer 2-D CNN followed by three fully connected layers with 460
    outputs.

    The first linear layer expects 16 * 21 * 1 = 336 flattened features after
    the second pooling step, which constrains the accepted input size.
    """

    def __init__(self):
        super().__init__()
        # conv1: 1 input channel -> 16 channels, 5x5 kernel
        self.conv1 = nn.Conv2d(1, 16, 5)
        # conv2: 16 -> 16 channels, 3x3 kernel
        self.conv2 = nn.Conv2d(16, 16, 3)
        # Fully connected head: 336 -> 64 -> 64 -> 460
        self.fc1 = nn.Linear(16 * 21 * 1, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 460)

    def forward(self, x):
        """Return a (batch_size, 460) tensor for x of shape (batch_size, 1, H, W)."""
        # conv -> ReLU -> 5x5 max pooling
        pooled = F.max_pool2d(F.relu(self.conv1(x)), (5, 5))
        # conv -> ReLU -> 3x3 max pooling (a single int means a square window)
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)), 3)
        # Flatten everything except the batch dimension.
        flat = pooled.view(-1, self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of elements per example, i.e. the product of all
        dimensions of x except the first (batch) dimension."""
        return torch.Size(x.size()[1:]).numel()
281 | 
class simpleLSTM(nn.Module):
    """ Single-layer LSTM (30-dim input, 128 hidden units) whose final hidden
        state feeds a three-layer MLP with 460 outputs.
    """

    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size=30, hidden_size=128, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 460)

    def forward(self, x):
        """ x must be shaped (batch, seq_len, input_size); returns (batch, 460) """
        # keep only the final hidden state, drop per-step outputs and cell state
        _, finalState = self.lstm1(x)
        lastHidden = finalState[0]
        # (num_layers=1, batch, hidden) -> (batch, hidden)
        batch, width = lastHidden.shape[1], lastHidden.shape[2]
        hidden = lastHidden.view(batch, width)
        hidden = F.relu(self.fc1(hidden))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
298 | 


--------------------------------------------------------------------------------
/train_utils.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python3.6
  2 | 
  3 | """
  4 |     Date Created: Feb 11 2020
  5 |     This file will contain the training utils
  6 | 
  7 | """
  8 | 
  9 | import os
 10 | import sys
 11 | import glob
 12 | import h5py
 13 | import torch
 14 | import random
 15 | import configparser
 16 | import argparse
 17 | from datetime import datetime
 18 | import numpy as np
 19 | from models import *
 20 | import kaldi_python_io
 21 | from kaldiio import ReadHelper
 22 | from torch.utils.data import Dataset, IterableDataset
 23 | from collections import OrderedDict
 24 | 
 25 | 
 26 | def readHdf5File_full(fileName):
 27 |     """ Read at-once from the hdf5 file. Rarely used
 28 |         Outputs:
 29 |         feats: (N,1,chunkLen,30)
 30 |         labels: (N,1)
 31 |     """
 32 |     with h5py.File(fileName,'r') as x:
 33 |         feats, labels = np.array(x.get('feats')), np.array(x.get('labels'))
 34 |     chunkLen = feats.shape[1]
 35 |     feats = torch.from_numpy(feats).unsqueeze(1) # make in (N,1,chunkLen,30)
 36 |     labels = torch.from_numpy(labels)
 37 |     return feats, labels
 38 | 
 39 | class nnet3EgsDL(IterableDataset):
 40 |     """ Data loader class to read directly from egs files, no HDF5
 41 |     """
 42 | 
 43 |     def __init__(self, arkFile):
 44 |         self.fid = kaldi_python_io.Nnet3EgsReader(arkFile)
 45 | 
 46 |     def __iter__(self):
 47 |         return iter(self.fid)
 48 | 
 49 | 
 50 | class myH5DL(Dataset):
 51 |     """ Data loader class customized to reading from hdf5 files
 52 |     """
 53 | 
 54 |     def __init__(self, hdf5File):
 55 |         x = h5py.File(hdf5File,'r')
 56 |         self.feats = x.get('feats')
 57 |         self.labels = x.get('labels')
 58 | 
 59 |     def __len__(self):
 60 |         return len(self.labels)
 61 | 
 62 |     def __getitem__(self, idx):
 63 |         """ Return samples from idx:idx+batch_size
 64 |         """
 65 |         X = self.feats[idx,:,:]
 66 |         Y = self.labels[idx]
 67 |         return X, Y
 68 | 
 69 | class myH5DL_sampler(Dataset):
 70 |     """ Data loader class customized to reading from hdf5 files
 71 |         Based on https://github.com/cyvius96/prototypical-network-pytorch/blob/master/samplers.py
 72 |     """
 73 | 
 74 |     def __init__(self, hdf5File, minClasses, maxClasses, samplesPerClass, numEpisodes=100):
 75 |         self.samplesPerClass = samplesPerClass
 76 |         self.minClasses = minClasses
 77 |         self.maxClasses = maxClasses
 78 |         self.numEpisodes = numEpisodes
 79 |         x = h5py.File(hdf5File,'r')
 80 |         self.feats = x.get('feats')
 81 |         self.labels = x.get('labels')
 82 |         npLabels = self.labels[()].reshape(-1)
 83 |         self.uniqLabels = np.ndarray.tolist(np.unique(npLabels))
 84 |         try:
 85 |             assert self.maxClasses <= len(self.uniqLabels)
 86 |         except:
 87 |             print('Requesting more classes (%d) than available (%d)' %(self.maxClasses, len(self.uniqLabels)))
 88 |             sys.exit(1)
 89 | 
 90 |         self.labelIndices = {}
 91 |         for lab in self.uniqLabels:
 92 |             ind = np.argwhere(npLabels==lab).reshape(-1)
 93 |             # self.labelIndices[lab] = torch.from_numpy(ind)
 94 |             self.labelIndices[lab] = np.ndarray.tolist(ind)
 95 |         self.minSamplesPerClass = min([len(v) for v in self.labelIndices.values()])
 96 |         try:
 97 |             assert self.samplesPerClass <= self.minSamplesPerClass
 98 |         except:
 99 |             print('Requesting more samples (%d) than available (%d)' %(self.samplesPerClass, self.minSamplesPerClass))
100 |             sys.exit(1)
101 |         self.nClasses = random.randint(self.minClasses, self.maxClasses+1)
102 | 
103 | 
104 |     def __iter__(self):
105 |         for _ in range(self.numEpisodes):
106 |             classes = random.sample(self.uniqLabels, self.nClasses)
107 |             batchInd = np.empty((self.samplesPerClass, self.nClasses))
108 |             for i,c in enumerate(classes):
109 |                 selectSampleInd = np.random.choice(self.labelIndices[c], self.samplesPerClass)
110 |                 batchInd[:,i] = selectSampleInd
111 |             yield batchInd.ravel()
112 | 
113 | 
def prepareModel(args):
    """ Build (or restore) the model and optimizer for standard training.

        Supported args.trainingMode values:
        init:         fresh model; a new timestamped directory is created
                      under ./models
        resume:       full state dict loaded from the most recently changed
                      file in args.resumeModelDir
        sanity_check: only the tdnn1..tdnn5 weights and biases are taken from
                      the most recently changed file in args.resumeModelDir

        Outputs:
        net, optimizer, step, saveDir

        Raises:
        ValueError for any other training mode (previously this surfaced as
        an UnboundLocalError at the return statement)
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.backends.cudnn.benchmark = True

    if args.trainingMode in ('resume', 'sanity_check'):
        # select the latest model from modelDir
        modelFile = max(glob.glob(args.resumeModelDir+'/*'), key=os.path.getctime)
        # NOTE(review): eval() on a command-line string; only safe as long as
        # -modelType comes from a trusted invocation.
        net = eval('{}({}, p_dropout=0)'.format(args.modelType, args.numSpkrs))
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=args.baseLR)
        net.to(device)

        checkpoint = torch.load(modelFile,map_location=torch.device('cuda'))
        # checkpoints written from a DataParallel model prefix every key with
        # 'module.'; strip it so the keys match the bare model
        new_state_dict = OrderedDict(
            (k[7:] if k.startswith('module.') else k, v)
            for k, v in checkpoint['model_state_dict'].items())

        # Parameters are restored on the BARE model, before any DataParallel
        # wrapping: a wrapped model expects 'module.'-prefixed keys and does
        # not expose net.tdnnN, so restoring after wrapping fails on
        # multi-GPU machines.
        if args.trainingMode == 'resume':
            net.load_state_dict(new_state_dict)
        else:
            # NOTE(review): the optimizer above was built before these
            # Parameter objects were created, so it still references the
            # replaced tensors -- confirm whether leaving tdnn1-5 out of the
            # optimizer is intended.
            for layerName in ('tdnn1', 'tdnn2', 'tdnn3', 'tdnn4', 'tdnn5'):
                layer = getattr(net, layerName)
                layer.weight = torch.nn.Parameter(new_state_dict[layerName+'.weight'])
                layer.bias = torch.nn.Parameter(new_state_dict[layerName+'.bias'])

        if torch.cuda.device_count() > 1:
            print("Using ", torch.cuda.device_count(), "GPUs!")
            net = nn.DataParallel(net)

        step = checkpoint['step']
        totalSteps = args.numEpochs * args.numArchives
        print('Resuming training from step %d' %step)

        # set the dropout
        # NOTE(review): the ramp-up multiplies by stepFrac instead of dividing,
        # so p_drop equals pDropMax*stepFrac**2 at step == stepFrac*totalSteps;
        # kept as-is, confirm against the schedule used in the training loop.
        if 1.0*step < args.stepFrac*totalSteps:
            p_drop = args.pDropMax*step*args.stepFrac/totalSteps
        else:
            p_drop = max(0,args.pDropMax*(totalSteps + args.stepFrac - 2*step)/(totalSteps - totalSteps*args.stepFrac))
        for x in net.modules():
            if isinstance(x, torch.nn.Dropout):
                x.p = p_drop
        saveDir = args.resumeModelDir
        if args.trainingMode == 'sanity_check':
            step += 1

    elif args.trainingMode == 'init':
        print('Initializing Model..')
        step = 0
        net = eval('{}({}, p_dropout=0)'.format(args.modelType, args.numSpkrs))
        optimizer = torch.optim.Adam(net.parameters(), lr=args.baseLR)

        net.to(device)
        net = torch.nn.parallel.DistributedDataParallel(net,
                                                     device_ids=[0],
                                                     output_device=0)
        # NOTE(review): with more than one visible GPU the DDP module is
        # additionally wrapped in DataParallel -- confirm this is intended.
        if torch.cuda.device_count() > 1:
            print("Using ", torch.cuda.device_count(), "GPUs!")
            net = nn.DataParallel(net)
        eventID = datetime.now().strftime('%Y%m-%d%H-%M%S')
        saveDir = './models/modelType_{}_event_{}' .format(args.modelType, eventID)
        os.makedirs(saveDir)

    else:
        raise ValueError('prepareModel does not support trainingMode %r' % (args.trainingMode,))

    return net, optimizer, step, saveDir
220 | 
221 | 
def prepareProtoModel(args):
    """ Build (or restore) the prototypical-network model and its optimizer.

        Supported args.trainingMode values:
        initMeta:   new proto_xvecTDNN whose tdnn1..tdnn5 weights and biases
                    come from the most recently changed file in
                    args.preTrainedModelDir; a new directory is created under
                    ./models
        resumeMeta: full state dict and learning rate restored from the most
                    recently changed file in args.resumeModelDir

        Outputs:
        net, optimizer, episodeI, saveDir

        Raises:
        ValueError for any other training mode (previously this surfaced as
        an UnboundLocalError at the return statement)
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.backends.cudnn.benchmark = True

    if args.trainingMode == 'initMeta':

        print('Loading pre-trained model..')
        episodeI = 0
        modelFile = max(glob.glob(args.preTrainedModelDir+'/*'), key=os.path.getctime)
        net = proto_xvecTDNN(args.numSpkrs, p_dropout=0)
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=args.baseLR)

        checkpoint = torch.load(modelFile,map_location=torch.device('cuda'))
        # drop the 'module.' prefix that DataParallel adds to every key
        new_state_dict = OrderedDict(
            (k[7:] if k.startswith('module.') else k, v)
            for k, v in checkpoint['model_state_dict'].items())

        # NOTE(review): the optimizer above was built before these Parameter
        # objects were created, so it still references the replaced tensors --
        # confirm whether leaving tdnn1-5 out of the optimizer is intended.
        for layerName in ('tdnn1', 'tdnn2', 'tdnn3', 'tdnn4', 'tdnn5'):
            layer = getattr(net, layerName)
            layer.weight = torch.nn.Parameter(new_state_dict[layerName+'.weight'])
            layer.bias = torch.nn.Parameter(new_state_dict[layerName+'.bias'])
        net.to(device)

        if torch.cuda.device_count() > 1:
            print("Using ", torch.cuda.device_count(), "GPUs!")
            net = nn.DataParallel(net)

        eventID = datetime.now().strftime('%Y%m-%d%H-%M%S')
        saveDir = './models/modelType_{}_event_{}_proto_{}_{}_{}'.format(
            args.modelType, eventID, args.protoMinClasses, args.protoMaxClasses, args.supportFrac)
        os.makedirs(saveDir)

    elif args.trainingMode == 'resumeMeta':

        # read the last checkpoint, assign step value
        modelFile = max(glob.glob(args.resumeModelDir+'/*'), key=os.path.getctime)
        checkpoint = torch.load(modelFile,map_location=torch.device('cuda'))
        # NOTE(review): eval() on a command-line string; only safe as long as
        # -modelType comes from a trusted invocation.
        net = eval('proto_{}({}, p_dropout=0)'.format(args.modelType, args.numSpkrs))

        # only the learning rate is carried over; the rest of the optimizer
        # state in the checkpoint is not loaded here
        currLR = checkpoint['optimizer_state_dict']['param_groups'][0]['lr']
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=currLR)
        new_state_dict = OrderedDict(
            (k[7:] if k.startswith('module.') else k, v)
            for k, v in checkpoint['model_state_dict'].items())
        # load params
        net.load_state_dict(new_state_dict)
        net.to(device)
        episodeI = checkpoint['episodeI']
        print('Resuming training from episodeI %d' %episodeI)
        saveDir = args.resumeModelDir

    else:
        raise ValueError('prepareProtoModel does not support trainingMode %r' % (args.trainingMode,))

    return net, optimizer, episodeI, saveDir
289 | 
290 | 
def prepareRelationModel(args):
    """ Build (or restore) the relation-network encoder/relation pair and
        their optimizers.

        Supported args.trainingMode values:
        initMeta:   new encoder whose tdnn1..tdnn5 weights and biases come
                    from the most recently changed file in
                    args.preTrainedModelDir, plus a fresh relation head; a new
                    directory is created under ./models
        resumeMeta: both state dicts restored from the most recently changed
                    file in args.resumeModelDir; both optimizers start from
                    the encoder optimizer's stored learning rate

        Outputs:
        encoder_net, relation_net, encoder_optimizer, relation_optimizer,
        episodeI, saveDir

        Raises:
        ValueError for any other training mode (previously this surfaced as
        an UnboundLocalError at the return statement)
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.backends.cudnn.benchmark = True

    if args.trainingMode == 'initMeta':

        print('Loading pre-trained model..')
        episodeI = 0
        modelFile = max(glob.glob(args.preTrainedModelDir+'/*'), key=os.path.getctime)
        encoder_net = relation_encoder_xvecTDNN(args.numSpkrs, p_dropout=0)
        relation_net = relation_relation_xvecTDNN()
        encoder_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, encoder_net.parameters()), lr=args.baseLR)
        relation_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, relation_net.parameters()), lr=args.baseLR)

        checkpoint = torch.load(modelFile,map_location=torch.device('cuda'))
        # drop the 'module.' prefix that DataParallel adds to every key
        new_state_dict = OrderedDict(
            (k[7:] if k.startswith('module.') else k, v)
            for k, v in checkpoint['model_state_dict'].items())

        # NOTE(review): encoder_optimizer was built before these Parameter
        # objects were created, so it still references the replaced tensors --
        # confirm whether leaving tdnn1-5 out of the optimizer is intended.
        for layerName in ('tdnn1', 'tdnn2', 'tdnn3', 'tdnn4', 'tdnn5'):
            layer = getattr(encoder_net, layerName)
            layer.weight = torch.nn.Parameter(new_state_dict[layerName+'.weight'])
            layer.bias = torch.nn.Parameter(new_state_dict[layerName+'.bias'])
        encoder_net.to(device)
        relation_net.to(device)

        if torch.cuda.device_count() > 1:
            print("Using ", torch.cuda.device_count(), "GPUs!")
            encoder_net = nn.DataParallel(encoder_net)
            relation_net = nn.DataParallel(relation_net)

        eventID = datetime.now().strftime('%Y%m-%d%H-%M%S')
        saveDir = './models/modelType_{}_event_{}_proto_{}_{}_{}'.format(
            args.modelType, eventID, args.protoMinClasses, args.protoMaxClasses, args.supportFrac)
        os.makedirs(saveDir)

    elif args.trainingMode == 'resumeMeta':

        # read the last checkpoint, assign step value
        modelFile = max(glob.glob(args.resumeModelDir+'/*'), key=os.path.getctime)
        checkpoint = torch.load(modelFile,map_location=torch.device('cuda'))

        encoder_net = relation_encoder_xvecTDNN(args.numSpkrs, p_dropout=0)
        relation_net = relation_relation_xvecTDNN()
        # only the learning rate is carried over, and the encoder's value is
        # used for both optimizers
        currLR = checkpoint['encoder_optimizer_state_dict']['param_groups'][0]['lr']
        encoder_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, encoder_net.parameters()), lr=currLR)
        relation_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, relation_net.parameters()), lr=currLR)

        # restore both networks, stripping any DataParallel 'module.' prefix
        encoder_state = OrderedDict(
            (k[7:] if k.startswith('module.') else k, v)
            for k, v in checkpoint['encoder_state_dict'].items())
        encoder_net.load_state_dict(encoder_state)
        relation_state = OrderedDict(
            (k[7:] if k.startswith('module.') else k, v)
            for k, v in checkpoint['relation_state_dict'].items())
        relation_net.load_state_dict(relation_state)

        encoder_net.to(device)
        relation_net.to(device)

        episodeI = checkpoint['episodeI']
        print('Resuming training from episodeI %d' %episodeI)
        saveDir = args.resumeModelDir

    else:
        raise ValueError('prepareRelationModel does not support trainingMode %r' % (args.trainingMode,))

    return encoder_net, relation_net, encoder_optimizer, relation_optimizer, episodeI, saveDir
374 | 
375 | 
376 | 
def getParams():
    """ Build the command-line parser shared by the training scripts.

        Outputs:
        parser: argparse.ArgumentParser; the caller is expected to run
                parse_args() on it. 'featDir' is the only positional argument.
    """
    parser = argparse.ArgumentParser()

    # PyTorch distributed run
    parser.add_argument("--local_rank", type=int, default=0)

    # General Parameters
    parser.add_argument('-modelType', default='xvecTDNN', help='Model class. Check models.py')
    parser.add_argument('-featDim', default=30, type=int, help='Frame-level feature dimension')
    # help text lists exactly the modes accepted by checkParams
    parser.add_argument('-trainingMode', default='init',
        help='(init) Train from scratch, (resume) Resume training, '
             '(sanity_check) Reload only the TDNN layers from a checkpoint, '
             '(initMeta) Start meta-learning from a pretrained model, '
             '(resumeMeta) Resume meta-learning')
    parser.add_argument('-resumeModelDir', default=None, help='Path containing training checkpoints')
    parser.add_argument('featDir', default=None, help='Directory with training archives')

    # Training Parameters - no more trainFullXvector = 0
    trainingArgs = parser.add_argument_group('General Training Parameters')
    trainingArgs.add_argument('-numArchives', default=84, type=int, help='Number of egs.*.ark files')
    trainingArgs.add_argument('-numSpkrs', default=7323, type=int, help='Number of output labels')
    trainingArgs.add_argument('-logStepSize', default=200, type=int, help='Iterations per log')
    trainingArgs.add_argument('-batchSize', default=32, type=int, help='Batch size')
    trainingArgs.add_argument('-numEgsPerArk', default=366150, type=int,
        help='Number of training examples per egs file')

    # Optimization Params
    optArgs = parser.add_argument_group('Optimization Parameters')
    optArgs.add_argument('-preFetchRatio', default=30, type=int, help='xbatchSize to fetch from dataloader')
    optArgs.add_argument('-optimMomentum', default=0.5, type=float, help='Optimizer momentum')
    optArgs.add_argument('-baseLR', default=1e-3, type=float, help='Initial LR')
    optArgs.add_argument('-maxLR', default=2e-3, type=float, help='Maximum LR')
    optArgs.add_argument('-numEpochs', default=2, type=int, help='Number of training epochs')
    optArgs.add_argument('-noiseEps', default=1e-5, type=float, help='Noise strength before pooling')
    optArgs.add_argument('-pDropMax', default=0.2, type=float, help='Maximum dropout probability')
    # a fraction of the total step count, not an absolute iteration number
    optArgs.add_argument('-stepFrac', default=0.5, type=float,
        help='Fraction of the total training steps at which the dropout '
             'schedule switches from ramp-up to ramp-down')

    # Metalearning params
    protoArgs = parser.add_argument_group('Protonet Parameters')
    protoArgs.add_argument('-preTrainedModelDir', default=None, help='Embedding model to initialize training')
    protoArgs.add_argument('-protoMinClasses', default=5, type=int, help='Minimum N-way')
    protoArgs.add_argument('-protoMaxClasses', default=35, type=int, help='Maximum N-way')
    protoArgs.add_argument('-protoEpisodesPerArk', default=25, type=int, help='Episodes per ark file')
    protoArgs.add_argument('-totalEpisodes', default=100, type=int, help='Number of training episodes')
    protoArgs.add_argument('-supportFrac', default=0.7, type=float, help='Fraction of samples as supports')

    return parser
422 | 
def checkParams(args):
    """ Validate the parsed arguments. On the first violated rule a message
        is printed and the process exits with status 1; returns None when
        every check passes.
    """
    if args.featDir is None:
        print('Features directory cannot be empty!')
        # a bare sys.exit() would report success (status 0) to the caller
        sys.exit(1)

    if args.protoMinClasses > args.protoMaxClasses:
        print('Max Classes must be greater than or equal to min classes')
        sys.exit(1)

    if args.trainingMode not in [ 'init', 'resume', 'sanity_check', 'initMeta', 'resumeMeta' ]:
        print('Invalid training mode')
        sys.exit(1)

    # substring tests: 'Meta' matches initMeta and resumeMeta,
    # 'resume' matches resume and resumeMeta
    if 'Meta' in args.trainingMode and args.preTrainedModelDir is None:
        print('Missing pretrained model directory')
        sys.exit(1)

    if 'resume' in args.trainingMode and args.resumeModelDir is None:
        print('Provide model directory to resume training from')
        sys.exit(1)
443 | 
444 | 
445 | 
def computeValidAccuracy(args, modelDir):
    """ Computes validation accuracy (in percent) over every
        valid_egs.*.ark file found in args.featDir, using the most recently
        changed checkpoint in modelDir.

        Inputs:
        args: needs modelType, numSpkrs and featDir
        modelDir: directory holding the checkpoints

        Outputs:
        100 * correct / total as a float

        Raises:
        ZeroDivisionError when no validation example is found
    """
    modelFile = max(glob.glob(modelDir+'/*'), key=os.path.getctime)
    # Load the model
    # NOTE(review): eval() on a command-line string; only safe as long as
    # -modelType comes from a trusted invocation.
    net = eval('{}({}, p_dropout=0)'.format(args.modelType, args.numSpkrs))

    checkpoint = torch.load(modelFile,map_location=torch.device('cuda'))
    # drop the 'module.' prefix that DataParallel adds to every key
    new_state_dict = OrderedDict(
        (k[7:] if k.startswith('module.') else k, v)
        for k, v in checkpoint['model_state_dict'].items())
    # load params
    net.load_state_dict(new_state_dict)
    net = net.cuda()
    net.eval()

    correct, incorrect = 0, 0
    # pure inference: no autograd graph needs to be recorded
    with torch.no_grad():
        for validArk in glob.glob(args.featDir+'/valid_egs.*.ark'):
            egsReader = kaldi_python_io.Nnet3EgsReader(validArk)
            for key, mat in egsReader:
                # mat[0] carries the feature matrix; transposed and given a
                # leading batch axis of size 1 before the forward pass
                feats = torch.Tensor(mat[0]['matrix']).permute(1,0).unsqueeze(0).cuda()
                out = net(x=feats, eps=0)
                # presumably the first index of the sparse label row is the
                # target class id -- verify against kaldi_python_io
                label = mat[1]['matrix'][0][0][0]
                if label == torch.argmax(out).item():
                    correct += 1
                else:
                    incorrect += 1
    return 100.0*correct/(correct+incorrect)
475 | 
476 | 
def par_core_extractXvectors(inFeatsScp, outXvecArk, outXvecScp, net, layerName):
    """ To be called using pytorch multiprocessing.

        Runs net on every utterance listed in inFeatsScp and writes the output
        of the layer called layerName (captured through a forward hook) to
        outXvecArk / outXvecScp, one vector per utterance key.

        Note (from the original author): make sure feats.scp is not too big
        (Hint: use split_data_dir.sh).

        Inputs:
        inFeatsScp: Kaldi feats.scp to read
        outXvecArk, outXvecScp: output archive and its index
        net: model already placed on the GPU (inputs are moved with .cuda())
        layerName: attribute path of the layer below net, e.g. 'fc1'
    """

    activation = {}
    def get_activation(name):
        def hook(model, input, output):
            activation[name] = output.detach()
        return hook
    # NOTE(review): eval() on a caller-supplied string; only safe as long as
    # layerName comes from a trusted invocation.
    hookHandle = eval('net.%s.register_forward_hook(get_activation(layerName))' %layerName)

    try:
        # pure inference: no autograd graph needs to be recorded
        with torch.no_grad():
            with kaldi_python_io.ArchiveWriter(outXvecArk, outXvecScp, matrix=False) as writer:
                with ReadHelper('scp:%s'%inFeatsScp) as reader:
                    for key, mat in reader:
                        # the forward pass is run only for its side effect of
                        # filling activation[layerName] through the hook
                        net(x=torch.Tensor(mat).permute(1,0).unsqueeze(0).cuda(),
                            eps=0)
                        writer.write(key, np.squeeze(activation[layerName].cpu().numpy()))
    finally:
        # detach the hook so repeated calls on the same net do not stack hooks
        hookHandle.remove()
497 | 


--------------------------------------------------------------------------------