├── exp
│   ├── model_l4_c320
│   │   ├── .epoch
│   │   ├── .halving
│   │   ├── .cvacc
│   │   └── .lrate
│   └── train_phn_l3_c320
│       ├── .epoch
│       ├── .halving
│       ├── .cvacc
│       └── .lrate
├── conf
│   ├── pitch.conf
│   ├── fbank.conf
│   └── mfcc.conf
├── run.sh
├── path.sh
├── utils
│   ├── training_trans_fst.py
│   ├── ctc_token_fst.py
│   ├── eps2disambig.pl
│   ├── spk2utt_to_utt2spk.pl
│   ├── s2eps.pl
│   ├── distribute_scp.pl
│   ├── build_const_arpa_lm.sh
│   ├── shuffle_list.pl
│   ├── utt2spk_to_spk2utt.pl
│   ├── best_wer.sh
│   ├── remove_oovs.pl
│   ├── int2sym.pl
│   ├── prep_scps.sh
│   ├── find_arpa_oovs.pl
│   ├── prep_ctc_trans_bkup.py
│   ├── subset_scp.pl
│   ├── prep_ctc_trans.py
│   ├── filter_scp.pl
│   ├── convert_ctm.pl
│   ├── create_data_link.pl
│   ├── sym2int.pl
│   ├── parse_options.sh
│   ├── pinyin_map.pl
│   ├── subset_data_dir_tr_cv.sh
│   ├── add_lex_disambig.pl
│   ├── split_data.sh
│   ├── format_lm_sri.sh
│   ├── ctc_compile_dict_token.sh
│   ├── model_topo.py
│   ├── fix_data_dir.sh
│   ├── run_rocks.pl
│   ├── make_lexicon_fst.pl
│   ├── subset_data_dir.sh
│   ├── split_scp.pl
│   ├── run.pl
│   └── validate_data_dir.sh
├── decode.sh
├── cmd.sh
├── local
│   ├── hkust_normalize.pl
│   ├── thchs-30_data_prep.sh
│   ├── thchs-30_prepare_phn_dict.sh
│   ├── thchs-30_decode_graph.sh
│   ├── hkust_train_lms.sh
│   └── score.sh
├── make_TLG_WFST.sh
├── feature.sh
├── README.md
├── train.sh
└── steps
    ├── align_ctc_single_utt.sh
    ├── decode_ctc.sh
    ├── compute_cmvn_stats.sh
    ├── make_fbank.sh
    ├── decode_ctc_lat.sh
    ├── make_fbank_pitch.sh
    ├── train_ctc_parallel.sh
    ├── train_ctc_parallel_h.sh
    └── train_ctc_parallel_x3.sh

/exp/model_l4_c320/.epoch:
--------------------------------------------------------------------------------
25
--------------------------------------------------------------------------------
/exp/model_l4_c320/.halving:
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
/exp/model_l4_c320/.cvacc:
--------------------------------------------------------------------------------
85.833
--------------------------------------------------------------------------------
/exp/model_l4_c320/.lrate:
--------------------------------------------------------------------------------
3.125e-07
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.epoch:
--------------------------------------------------------------------------------
19
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.halving:
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.cvacc:
--------------------------------------------------------------------------------
90.5068
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.lrate:
--------------------------------------------------------------------------------
6.25e-07
--------------------------------------------------------------------------------
/conf/pitch.conf:
--------------------------------------------------------------------------------
--sample-frequency=16000
--------------------------------------------------------------------------------
/conf/fbank.conf:
--------------------------------------------------------------------------------
--num-mel-bins=40
--sample-frequency=16000
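
The four dotfiles per experiment directory are the trainer's record of its
learning-rate schedule: .epoch is the last finished epoch, .cvacc the
cross-validation accuracy, .halving whether learning-rate halving has kicked
in, and .lrate the current rate. Consistent with that reading (an inference
from the values, not stated anywhere in the repo), 3.125e-07 is exactly the
initial rate 0.00004 set in train.sh halved seven times: 0.00004 * 0.5^7 =
3.125e-07. A minimal sketch of reading the state back before resuming:

    dir=exp/model_l4_c320
    epoch=$(cat $dir/.epoch)   # e.g. 25
    lrate=$(cat $dir/.lrate)   # e.g. 3.125e-07
    echo "resuming at epoch $((epoch + 1)) with learn rate $lrate"
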
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

./make_TLG_WFST.sh

./feature.sh

./train.sh

./decode.sh
--------------------------------------------------------------------------------
/conf/mfcc.conf:
--------------------------------------------------------------------------------
--use-energy=false   # only non-default option.
--sample-frequency=16000  # THCHS-30 is sampled at 16kHz
--------------------------------------------------------------------------------
/path.sh:
--------------------------------------------------------------------------------
export EESEN_ROOT=`pwd`/../../..
export PATH=$PWD/utils/:$EESEN_ROOT/src/netbin:$EESEN_ROOT/src/featbin:$EESEN_ROOT/src/decoderbin:$EESEN_ROOT/src/fstbin:$EESEN_ROOT/tools/openfst/bin:$EESEN_ROOT/tools/irstlm/bin/:$PWD:$PATH
export LC_ALL=C

--------------------------------------------------------------------------------
/utils/training_trans_fst.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Apache 2.0

import sys

fread = open(sys.argv[1], 'r')

for entry in fread.readlines():
    entry = entry.replace('\n','').strip()
    fields = entry.split(' ')
    uttid = fields[0]

    for n in range(1, len(fields)):
        print str(n-1) + ' ' + str(n) + ' ' + fields[n] + ' ' + fields[n]

    print str(n) + ' ' + '0' + ' ' + '0' + ' ' + '0' # assume that <eps> is 0 in words.txt

    print '0'

fread.close()
--------------------------------------------------------------------------------
/decode.sh:
--------------------------------------------------------------------------------
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. path.sh

. parse_options.sh

model_dir=exp/model_l4_c320

echo =====================================================================
echo "                             Decoding                              "
echo =====================================================================
# decoding
steps/decode_ctc_lat.sh --cmd "$decode_cmd" --nj 5 --beam 17.0 --lattice_beam 8.0 --max-active 5000 --acwt 0.9 \
  data/search_Graph data/test $model_dir $model_dir/decode_test || exit 1;
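
Once decoding has run, local/score.sh (further down in this dump) leaves one
wer_N file per tested acoustic weight in the decode directory; assuming that
layout, the best operating point can be pulled out with:

    cat exp/model_l4_c320/decode_test/wer_* | utils/best_wer.sh
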
--------------------------------------------------------------------------------
/cmd.sh:
--------------------------------------------------------------------------------
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine).

#a) JHU cluster options
#export train_cmd="queue.pl -l arch=*64"
#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
#export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G"
#export cuda_cmd="queue.pl -l gpu=1"

#c) run it locally... works for CMU rocks cluster
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
--------------------------------------------------------------------------------
/utils/ctc_token_fst.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Apache 2.0

import sys

fread = open(sys.argv[1], 'r')

print '0 1 <eps> <eps>'
print '1 1 <blk> <eps>'
print '2 2 <blk> <eps>'
print '2 0 <eps> <eps>'

nodeX = 3
for entry in fread.readlines():
    entry = entry.replace('\n','').strip()
    fields = entry.split(' ')
    phone = fields[0]
    if phone == '<eps>' or phone == '<blk>':
        continue

    if '#' in phone:
        print str(0) + ' ' + str(0) + ' ' + '<eps>' + ' ' + phone;
    else:
        print str(1) + ' ' + str(nodeX) + ' ' + phone + ' ' + phone;
        print str(nodeX) + ' ' + str(nodeX) + ' ' + phone + ' ' + '<eps>';
        print str(nodeX) + ' ' + str(2) + ' ' + '<eps>' + ' ' + '<eps>';
    nodeX += 1
print '0'

fread.close()
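
A quick way to see the token FST (T.fst) this script emits is to run it on a
toy tokens file (contents hypothetical): states 1 and 2 absorb label
repetitions and blanks, and disambiguation symbols like #0 become <eps>
self-loops on the start state:

    $ printf '<eps>\n<blk>\na\n#0\n' > toy_tokens.txt
    $ python utils/ctc_token_fst.py toy_tokens.txt
    0 1 <eps> <eps>
    1 1 <blk> <eps>
    2 2 <blk> <eps>
    2 0 <eps> <eps>
    1 3 a a
    3 3 a <eps>
    3 2 <eps> <eps>
    0 0 <eps> #0
    0
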
--------------------------------------------------------------------------------
/utils/eps2disambig.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script replaces epsilon with #0 on the input side only, of the G.fst
# acceptor.

while(<>){
  s:^(\d+\s+\d+\s+)<eps>(\s+):$1#0$2:;
  print;
}
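
For example (a toy arc in OpenFst text format; symbols illustrative), an
epsilon-input arc of G.fst becomes a #0 arc, which keeps the grammar
determinizable after composition:

    $ echo '0 1 <eps> foo 2.5' | utils/eps2disambig.pl
    0 1 #0 foo 2.5
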
"\n"; 27 | } 28 | -------------------------------------------------------------------------------- /make_TLG_WFST.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. 3 | ## This relates to the queue. 4 | . path.sh 5 | 6 | . parse_options.sh 7 | H=`pwd` 8 | corpus_dir=$H/corpus 9 | 10 | echo ===================================================================== 11 | echo " TLG WFST Construction " 12 | echo ===================================================================== 13 | #Data preparation 14 | local/thchs-30_data_prep.sh $H $corpus_dir 15 | 16 | # Construct the phoneme-based dict. 17 | # We get 216 tokens, representing phonemes with tonality. 18 | local/thchs-30_prepare_phn_dict.sh || exit 1; 19 | 20 | # Compile the lexicon and token FSTs 21 | utils/ctc_compile_dict_token.sh --dict-type "phn" data/dict_phn data/lang_tmp data/lang || exit 1; 22 | 23 | # Train and compile LMs. 24 | #local/hkust_train_lms.sh corpus/train/text data/dict_phn/lexicon.txt data/language_model || exit 1; 25 | 26 | # Compile the language-model FST and the final decoding graph TLG.fst 27 | local/thchs-30_decode_graph.sh data/language_model data/lang data/search_Graph || exit 1; 28 | -------------------------------------------------------------------------------- /local/thchs-30_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. 3 | 4 | #This script pepares the data directory for thchs30 recipe. 5 | #It reads the corpus and get wav.scp and transcriptions. 6 | 7 | dir=$1 8 | corpus_dir=$2 9 | 10 | 11 | cd $dir 12 | echo ======================================== 13 | echo " Data Preparation " 14 | echo ======================================== 15 | echo "creating data/{train,dev,test}" 16 | mkdir -p data/{train,dev,test} 17 | 18 | #create wav.scp, utt2spk.scp, spk2utt.scp, text 19 | ( 20 | for x in train dev test; do 21 | echo "cleaning data/$x" 22 | cd $dir/data/$x 23 | rm -rf wav.scp utt2spk spk2utt word.txt text 24 | echo "preparing scps and text in data/$x" 25 | for nn in `find $corpus_dir/$x/*.wav | sort -u | xargs -i basename {} .wav`; do 26 | echo $nn $corpus_dir/$x/$nn.wav >> wav.scp 27 | echo $nn $nn >> utt2spk 28 | echo $nn $nn >> spk2utt 29 | echo $nn `sed -n 1p $corpus_dir/data/$nn.wav.trn` >> word.txt 30 | done 31 | cp word.txt text 32 | done 33 | ) || exit 1 34 | echo " Data prepration succeeded " 35 | echo -e "\n" 36 | -------------------------------------------------------------------------------- /local/thchs-30_prepare_phn_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script prepares the phoneme-based lexicon. It also generates the list of lexicon units 4 | # and represents the lexicon using the indices of the units. 5 | 6 | dir=data/dict_phn 7 | mkdir -p $dir 8 | srcdict=data/dict/lexicon.txt 9 | 10 | [ -f path.sh ] && . ./path.sh 11 | 12 | [ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1; 13 | 14 | echo ============================================== 15 | echo " Phoneme-based Dictionary Preparation " 16 | echo ============================================== 17 | 18 | # Raw dictionary preparation 19 | cat $srcdict | grep -v "!SIL" | \ 20 | perl -e 'while(<>){@A = split; if(! 
--------------------------------------------------------------------------------
/local/thchs-30_prepare_phn_dict.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# This script prepares the phoneme-based lexicon. It also generates the list of lexicon units
# and represents the lexicon using the indices of the units.

dir=data/dict_phn
mkdir -p $dir
srcdict=data/dict/lexicon.txt

[ -f path.sh ] && . ./path.sh

[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1;

echo ==============================================
echo "    Phoneme-based Dictionary Preparation    "
echo ==============================================

# Raw dictionary preparation
cat $srcdict | grep -v "!SIL" | \
  perl -e 'while(<>){@A = split; if(! $seen{$A[0]}) {$seen{$A[0]} = 1; print $_;}}' \
  > $dir/lexicon.txt || exit 1;

# Get the set of lexicon units without noises
cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u | awk '{print $1 " " NR}' > $dir/units.txt

# Convert phoneme sequences into the corresponding sequences of unit indices, encoded by units.txt
utils/sym2int.pl -f 2- $dir/units.txt < $dir/lexicon.txt > $dir/lexicon_numbers.txt

echo "Phoneme-based dictionary preparation succeeded"
echo -e "\n"
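
Putting the three outputs side by side on a toy entry (symbols and indices
illustrative): units.txt numbers each distinct unit by its line in the sorted
list, and sym2int.pl rewrites every pronunciation with those numbers:

    lexicon.txt:          好 h ao3
    units.txt:            ao3 7
                          h 33
    lexicon_numbers.txt:  好 33 7
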
--------------------------------------------------------------------------------
/utils/distribute_scp.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w

# Copyright 2015  Hang Su. Apache 2.0.

# This script splits an scp list either by length of the frames or in round-robin manner

$mode = 'frame';
if ($ARGV[0] eq '--mode') {
  shift @ARGV;
  $mode = shift @ARGV;
}

$num_jobs = $ARGV[0]; shift;
$base_filename = $ARGV[0]; shift;

@num_frames = (0) x $num_jobs;

foreach $i (1..$num_jobs) {
  local *FILE;
  open(FILE, "> $base_filename.$i.scp") || die;
  push(@file_handles, *FILE);
}

$count = 0;
while (<>) {
  chomp;
  if ($mode eq "utt") {
    $id = ($count % $num_jobs) ;
    print {$file_handles[$id]} $_,"\n";
  } elsif ($mode eq "frame") {
    @A = split /\s+/;
    $id_min = 0;
    $num_frames[$id_min] < $num_frames[$_] or $id_min = $_ for 1 .. $#num_frames; # find the smallest index
    $id = $id_min;
    $num_frames[$id_min] += $A[1];
    print {$file_handles[$id]} $A[0],"\n";
  } else {
    die "Un-recognized mode $mode!";
  }
  $count += 1;
}

$id_min = 0;
$num_frames[$id_min] < $num_frames[$_] or $id_min = $_ for 1 .. $#num_frames; # find the smallest index
print "$num_frames[$id_min]";
--------------------------------------------------------------------------------
/utils/build_const_arpa_lm.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# This script reads in an Arpa format language model, and converts it into the
# ConstArpaLm format language model.

# begin configuration section
# end configuration section

[ -f path.sh ] && . ./path.sh;

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: "
  echo "  $0 [options] <arpa-lm-path> <old-lang-dir> <new-lang-dir>"
  echo "e.g.:"
  echo "  $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed"
  echo "Options"
  exit 1;
fi

export LC_ALL=C

arpa_lm=$1
old_lang=$2
new_lang=$3

mkdir -p $new_lang
cp -r $old_lang/* $new_lang


unk=`cat $new_lang/oov.int`
bos=`grep "<s>" $new_lang/words.txt | awk '{print $2}'`
eos=`grep "</s>" $new_lang/words.txt | awk '{print $2}'`
if [[ -z $bos || -z $eos ]]; then
  echo "$0: <s> and </s> symbols are not in $new_lang/words.txt"
  exit 1
fi


arpa-to-const-arpa --bos-symbol=$bos \
  --eos-symbol=$eos --unk-symbol=$unk \
  "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1;

exit 0;
--------------------------------------------------------------------------------
/feature.sh:
--------------------------------------------------------------------------------
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. path.sh

. parse_options.sh

echo =====================================================================
echo "                    FBank Feature Generation                       "
echo =====================================================================
fbankdir=fbank

# Generate the fbank features; by default 40-dimensional fbanks on each frame
#make train fbank
steps/make_fbank.sh --cmd "$train_cmd" --nj 32 data/train exp/make_fbank/train $fbankdir || exit 1;
utils/fix_data_dir.sh data/train || exit;
steps/compute_cmvn_stats.sh data/train exp/make_fbank/train $fbankdir || exit 1;
echo -e "\n"

#make test fbank
steps/make_fbank.sh --cmd "$train_cmd" --nj 10 data/test exp/make_fbank/test $fbankdir || exit 1;
utils/fix_data_dir.sh data/test || exit;
steps/compute_cmvn_stats.sh data/test exp/make_fbank/test $fbankdir || exit 1;
echo -e "\n"

#make dev fbank
steps/make_fbank.sh --cmd "$train_cmd" --nj 10 data/dev exp/make_fbank/dev $fbankdir || exit 1;
utils/fix_data_dir.sh data/dev || exit;
steps/compute_cmvn_stats.sh data/dev exp/make_fbank/dev $fbankdir || exit 1;
echo -e "\n"
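
The fbank features are 40-dimensional (conf/fbank.conf); train.sh later turns
on --add-deltas, so deltas and double deltas triple this to the
input_feat_dim=120 it declares. Assuming the stock featbin tools from path.sh
are available, the base dimension can be checked with:

    feat-to-dim scp:data/train/feats.scp -   # should print 40
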
--------------------------------------------------------------------------------
/utils/shuffle_list.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


if ($ARGV[0] eq "--srand") {
  $n = $ARGV[1];
  $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\"";
  srand($ARGV[1]);
  shift;
  shift;
} else {
  srand(0); # Gives inconsistent behavior if we don't seed.
}

if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we
                                        # don't understand.
  print "Usage: shuffle_list.pl [--srand N] [input file] > output\n";
  print "randomizes the order of lines of input.\n";
  exit(1);
}

@lines = <>;
@lines = sort { rand() <=> rand() } @lines;
print @lines;
--------------------------------------------------------------------------------
/utils/utt2spk_to_spk2utt.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.

if ( @ARGV > 1 ) {
  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}

while(<>){
  @A = split(" ", $_);
  @A == 2 || die "Invalid line in utt2spk file: $_";
  ($u,$s) = @A;
  if(!$seen_spk{$s}) {
    $seen_spk{$s} = 1;
    push @spklist, $s;
  }
  push (@{$spk_hash{$s}}, "$u");
}
foreach $s (@spklist) {
  $l = join(' ',@{$spk_hash{$s}});
  print "$s $l\n";
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This recipe, adapted from the EESEN codebase, does Mandarin speech recognition on the Tsinghua corpus (data_thchs30).
===
## 1 What it offers:
1) Mandarin speech recognition out of the box

2) Other Chinese corpora can be plugged in for algorithm research

3) The WFST-based decoder can also be studied on its own, converting the phonemes produced by the acoustic model into words


## 2 Algorithm: BiLSTM + CTC + WFST

1) BiLSTM: 3 layers + 1 projection layer, 320 hidden units

2) CTC: 216 initial/final (tonal phoneme) labels + 1 blank label

3) WFST: CTC token FST (T.fst), lexicon FST (L.fst), language-model FST (G.fst)

## 3 Results:
1) CTC training label accuracy: about 92%

2) CTC cross-validation label accuracy: about 90%

3) Final decoding WER: about 25%



## 4 Files in this directory:

1) To run the project: ./run.sh

Each of the shell scripts it calls can also be run on its own, where

make_TLG_WFST.sh: generates TLG.fst. Run without arguments; output goes to data/{train,test,dev,lang,search_Graph}.

feature.sh: generates fbank features (40 + delta + double delta) from the wav audio. Run without arguments; output goes to data/{train,test,dev} and fbank.

train.sh: trains the acoustic model. Run without arguments; the network parameters can be edited inside the script. Output goes to exp/model_l$_c$.

decode.sh: decodes using the acoustic model and the WFST that integrates the language model. Output goes to exp/model_l$_c$/decode_test.


2) Data preparation:

Put the language model under data/language_model; its file format follows the Tsinghua one.

Put the lexicon lexicon.txt under data/dict.

Put the training data (wav + text) under corpus/train.

Put the test data (wav + text) under corpus/test.

Put the validation data (wav + text) under corpus/dev.

## 5 Installing EESEN:
1) To run this project you must install EESEN following the INSTALL instructions at https://github.com/srvk/eesen
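
Concretely, the layout the README asks you to prepare looks like this (paths
from the README; the corpus subdirectories hold wav files plus their
transcripts):

    data/language_model/   # n-gram LM, THCHS-30-style format
    data/dict/lexicon.txt  # pronunciation lexicon
    corpus/train/          # training wav + text
    corpus/test/           # test wav + text
    corpus/dev/            # validation wav + text
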
--------------------------------------------------------------------------------
/utils/best_wer.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# To be run from one directory above this script.

perl -e 'while(<>){
  s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g;
  if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
  elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|:
    && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite.
  if (defined $bestline){ print $bestline; } ' | \
  awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \
  awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \
  awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \
  sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||'



--------------------------------------------------------------------------------
/utils/remove_oovs.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script removes lines that contain these OOVs on either the
# third or fourth fields of the line. It is intended to remove arcs
# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).

if ( @ARGV < 1 || @ARGV > 2) {
  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
}

$unklist = shift @ARGV;
open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
while(<S>){
  @A = split(" ", $_);
  @A == 1 || die "Bad line in unknown-symbol list: $_";
  $unk{$A[0]} = 1;
}

$num_removed = 0;
while(<>){
  @A = split(" ", $_);
  if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
    $num_removed++;
  } else {
    print;
  }
}
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";

--------------------------------------------------------------------------------
/local/thchs-30_decode_graph.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#

if [ -f path.sh ]; then . path.sh; fi

lm_dir=$1
src_lang=$2
tgt_lang=$3

arpa_lm=${lm_dir}/lm.gz
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;

rm -rf $tgt_lang
cp -r $src_lang $tgt_lang

echo ==============================================================
echo "  Generating The Language Model FST And Composing TLG.fst  "
echo ==============================================================
# Compose the language model to FST
gunzip -c "$arpa_lm" | \
  grep -v '<s> <s>' | \
  grep -v '</s> <s>' | \
  grep -v '</s> </s>' | \
  arpa2fst - | fstprint | \
  utils/remove_oovs.pl /dev/null | \
  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
#exit 1;

echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic $tgt_lang/G.fst

# Compose the token, lexicon and language-model FST into the final decoding graph
fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;

echo "Composing decoding graph TLG.fst succeeded"
rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
echo -e "\n"
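
After this script succeeds, a cheap sanity check on the result, assuming the
OpenFst binaries from path.sh are on the PATH, is:

    fstinfo data/search_Graph/TLG.fst | head   # state/arc counts of the decoding graph
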
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. path.sh

. parse_options.sh


echo =====================================================================
echo "                          Model Training                           "
echo =====================================================================
# Specify network structure and generate the network topology
input_feat_dim=120  # dimension of the input features; we will use 40-dimensional fbanks with deltas and double deltas
lstm_layer_num=4    # number of LSTM layers
lstm_cell_dim=320   # number of memory cells in every LSTM layer

dir=exp/model_l${lstm_layer_num}_c${lstm_cell_dim}
mkdir -p $dir

target_num=`cat data/lang/units.txt | wc -l`; target_num=$[$target_num+1]; # #targets = #labels + 1 (the blank)

# Output the network topology
utils/model_topo.py --input-feat-dim $input_feat_dim --lstm-layer-num $lstm_layer_num --lstm-cell-dim $lstm_cell_dim --target-num $target_num --fgate-bias-init 1.0 > $dir/nnet.proto || exit 1;

# Label sequences; simply convert words into their label indices
utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/train/text "<UNK>" | gzip -c - > $dir/labels.tr.gz

utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/dev/text "<UNK>" | gzip -c - > $dir/labels.cv.gz

# Train the network with CTC. Refer to the script for details about the arguments
steps/train_ctc_parallel.sh --add-deltas true --num-sequence 10 --learn-rate 0.00004 --report-step 10 --halving-after-epoch 12 --feats-tmpdir $dir/XXXXX data/train data/dev $dir || exit 1;
echo -e "\n"
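
The target_num arithmetic matches the README: units.txt has one line per
label (216 tonal initials/finals here), and the "+1" is the CTC blank, so the
network ends up with 217 outputs:

    target_num=$(($(wc -l < data/lang/units.txt) + 1))   # 216 + 1 = 217
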
--------------------------------------------------------------------------------
/utils/int2sym.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

undef $field_begin;
undef $field_end;


if ($ARGV[0] eq "-f") {
  shift @ARGV;
  $field_spec = shift @ARGV;
  if ($field_spec =~ m/^\d+$/) {
    $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
  }
  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
    if ($1 ne "") {
      $field_begin = $1 - 1; # Change to zero-based indexing.
    }
    if ($2 ne "") {
      $field_end = $2 - 1; # Change to zero-based indexing.
    }
  }
  if (!defined $field_begin && !defined $field_end) {
    die "Bad argument to -f option: $field_spec";
  }
}
$symtab = shift @ARGV;
if(!defined $symtab) {
  print STDERR "Usage: int2sym.pl [options] symtab [input] > output\n" .
    "options: [-f (<field>|<field_start>-<field-end>)]\n" .
    "e.g.: -f 2, or -f 3-4\n";
  exit(1);
}

open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
  @A = split(" ", $_);
  @A == 2 || die "bad line in symbol table file: $_";
  $int2sym{$A[1]} = $A[0];
}

sub int2sym {
  my $a = shift @_;
  my $pos = shift @_;
  if($a !~ m:^\d+$:) { # not all digits..
    $pos1 = $pos+1; # make it one-based.
    die "int2sym.pl: found noninteger token $a [in position $pos1]\n";
  }
  $s = $int2sym{$a};
  if(!defined ($s)) {
    die "int2sym.pl: integer $a not in symbol table $symtab.";
  }
  return $s;
}

$error = 0;
while (<>) {
  @A = split(" ", $_);
  for ($pos = 0; $pos <= $#A; $pos++) {
    $a = $A[$pos];
    if ( (!defined $field_begin || $pos >= $field_begin)
         && (!defined $field_end || $pos <= $field_end)) {
      $a = int2sym($a, $pos);
    }
    print $a . " ";
  }
  print "\n";
}



--------------------------------------------------------------------------------
/utils/prep_scps.sh:
--------------------------------------------------------------------------------
#!/bin/bash
{
# Copyright 2015  Hang Su
# Apache 2.0

# This script prepares feature scp files for CTC training

set -e
set -o pipefail

## Begin configuration section
clean_up=true
seed=
cmd=
nj=1
# End of configuration

echo "$0 $@" # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh;

. utils/parse_options.sh || exit 1;

if [ $# != 6 ]; then
  echo "Usage: $0 <feat_tr> <feat_cv> <num_sequence> <frame_num_limit> <tmpdir> <dir>"
  echo " e.g.: "
  exit 1
fi

feat_tr=$1
feat_cv=$2
num_sequence=$3
frame_num_limit=$4
tmpdir=$5
dir=$6

for part in tr cv; do
  feat=$(eval echo "\$feat_${part}")

  feat-to-len scp:$feat ark,t:- | sort -k2 -n | \
    awk -v num_sequence=$num_sequence -v frame_num_limit=$frame_num_limit '
    BEGIN {max_frame_num = 0; num_utts = 0;}
    {
      printf("%s ",$1);
      num_utts++;
      if (max_frame_num < $2) {
        max_frame_num = $2;
      }
      if (num_utts >= num_sequence || num_utts * max_frame_num > frame_num_limit) {
        printf("\n");
        num_utts = 0;
        max_frame_num = 0;
      }
    }' | utils/shuffle_list.pl --srand ${seed:-777} > $dir/batch.$part.list

  split_batches=""
  for n in $(seq $nj); do
    split_batches="$split_batches $tmpdir/batch.$part.$n.list"
  done
  utils/split_scp.pl $dir/batch.$part.list $split_batches

  for n in $(seq $nj); do
    awk '
    NR==FNR {a[$1]=$2;next}
    {
      for (i=1; i<=NF; i++) {
        printf("%s %s\n", $i, a[$i]);
      }
    }' $feat $tmpdir/batch.$part.$n.list > $tmpdir/batch.$part.$n.scp
  done
  if [ $nj -ne 1 ]; then
    $cmd JOB=1:$nj $dir/log/prepare_feats_$part.JOB.log \
      copy-feats scp:$tmpdir/batch.$part.JOB.scp ark,scp:$tmpdir/feats_$part.JOB.ark,$dir/feats_$part.JOB.scp
  else
    copy-feats scp:$tmpdir/batch.$part.1.scp ark,scp:$tmpdir/feats_$part.1.ark,$dir/feats_$part.1.scp
  fi

done

}
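
A minimal round trip for int2sym.pl (toy symbol table, contents hypothetical):

    $ printf '<eps> 0\nhello 1\nworld 2\n' > toy_words.txt
    $ echo 'utt1 1 2' | utils/int2sym.pl -f 2- toy_words.txt
    utt1 hello world
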
--------------------------------------------------------------------------------
/utils/find_arpa_oovs.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


if ( @ARGV < 1 || @ARGV > 2) {
  die "Usage: find_arpa_oovs.pl words.txt [lm.arpa]\n";
  # This program finds words in the arpa file that are not symbols
  # in the OpenFst-format symbol table words.txt. It prints them
  # on the standard output, one per line.
}

$symtab = shift @ARGV;
open(S, "<$symtab") || die "Failed opening symbol table file $symtab\n";
while(<S>){
  @A = split(" ", $_);
  @A == 2 || die "Bad line in symbol table file: $_";
  $seen{$A[0]} = 1;
}

$curgram=0;
while(<>) { # Find the \data\ marker.
  if(m:^\\data\\$:) { last; }
}
while(<>) {
  if(m/^\\(\d+)\-grams:\s*$/) {
    $curgram = $1;
    if($curgram > 1) {
      last; # This is an optimization as we can get the vocab from the 1-grams
    }
  } elsif($curgram > 0) {
    @A = split(" ", $_);
    if(@A > 1) {
      shift @A;
      for($n=0;$n<$curgram;$n++) {
        $word = $A[$n];
        if(!defined $word) { print STDERR "Unusual line $_ (line $.) in arpa file.\n"; }
        $in_arpa{$word} = 1;
      }
    } else {
      if(@A > 0 && $A[0] !~ m:\\end\\:) {
        print STDERR "Unusual line $_ (line $.) in arpa file\n";
      }
    }
  }
}

foreach $w (keys %in_arpa) {
  if(!defined $seen{$w} && $w ne "<s>" && $w ne "</s>") {
    print "$w\n";
  }
}
--------------------------------------------------------------------------------
/utils/prep_ctc_trans_bkup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright 2015  Yajie Miao (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This python script converts the word-based transcripts into label sequences. The labels are
# represented by their indices.

import sys

if __name__ == '__main__':

    if len(sys.argv) != 4:
        print "Usage: {0} <dict_file> <trans_file> <unk_word>".format(sys.argv[0])
        print "e.g., utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/train/text <UNK>"
        print "<dict_file> - the lexicon file in which entries have been represented by indices"
        print "<trans_file> - the word-based transcript file"
        print "<unk_word> - the word which represents OOVs in transcripts"
        exit(1)

    dict_file = sys.argv[1]
    trans_file = sys.argv[2]
    unk_word = sys.argv[3]

    # read the lexicon into a dictionary data structure
    fread = open(dict_file,'r')
    dict = {}
    for line in fread.readlines():
        line = line.replace('\n','')
        splits = line.split(' ') # assume there are no multiple spaces
        word = splits[0]
        letters = ''
        for n in range(1, len(splits)):
            letters += splits[n] + ' '
        dict[word] = letters.strip()
    fread.close()

    # assume that each line is formatted as "uttid word1 word2 word3 ...", with no multiple spaces appearing
    fread = open(trans_file,'r')
    for line in fread.readlines():
        out_line = ''
        line = line.replace('\n','').strip();
        splits = line.split(' ');

        out_line += splits[0] + ' '
        for n in range(1, len(splits)):
            try:
                out_line += dict[splits[n]] + ' '
            except Exception:
                out_line += dict[unk_word] + ' '
        print out_line.strip()
--------------------------------------------------------------------------------
/local/hkust_train_lms.sh:
--------------------------------------------------------------------------------
#!/bin/bash


# To be run from one directory above this script.


#text=data/local/train/text
#lexicon=data/local/dict/lexicon.txt
text=$1
lexicon=$2
dir=$3
for f in "$text" "$lexicon"; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

# Unlike the original Switchboard version (which took no arguments and read
# data/local/train/text and data/local/dict/lexicon.txt directly), this copy
# takes the text, the lexicon and the output directory as $1, $2 and $3.
mkdir -p $dir
export LC_ALL=C # You'll get errors about things being not sorted, if you
                # have a different locale.
export PATH=$PATH:`pwd`/../../tools/kaldi_lm


cleantext=$dir/text.no_oov

cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<UNK> ");} } printf("\n");}' \
  > $cleantext || exit 1;


cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;


# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# note: we probably won't really make use of <UNK> as there aren't any OOVs
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map \
  || exit 1;

# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz


# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).
heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $sdir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $sdir/train

cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist


ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
  -map-unk "<UNK>" -interpolate -lm $sdir/lm.kn.gz
ngram -lm $sdir/lm.kn.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
--------------------------------------------------------------------------------
/utils/subset_scp.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program selects a subset of N elements in the scp.

# By default, it selects them evenly from throughout the scp, in order to avoid
# selecting too many from the same speaker. It prints them on the standard
# output.
# With the option --first, it just selects the N first utterances.
# With the option --last, it just selects the N last utterances.

# Last modified by JHU & HKUST @2013


$quiet = 0;
$first = 0;
$last = 0;
if ($ARGV[0] eq "--quiet") {
  shift;
  $quiet = 1;
}
if ($ARGV[0] eq "--first") {
  shift;
  $first = 1;
}
if ($ARGV[0] eq "--last") {
  shift;
  $last = 1;
}

if(@ARGV < 2 ) {
  die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
    " --quiet causes it to not die if N < num lines in scp.\n" .
    " --first and --last make it equivalent to head or tail.\n";
}

$N = shift @ARGV;
if($N == 0) {
  die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
}
$inscp = shift @ARGV;
open(I, "<$inscp") || die "Opening input scp file $inscp";

@F = ();
while(<I>) {
  push @F, $_;
}
$numlines = @F;
if($N > $numlines) {
  if ($quiet) {
    $N = $numlines;
  } else {
    die "You requested from subset_scp.pl more elements than available: $N > $numlines";
  }
}

sub select_n {
  my ($start,$end,$num_needed) = @_;
  my $diff = $end - $start;
  if($num_needed > $diff) { die "select_n: code error"; }
  if($diff == 1 ) {
    if($num_needed > 0) {
      print $F[$start];
    }
  } else {
    my $halfdiff = int($diff/2);
    my $halfneeded = int($num_needed/2);
    select_n($start, $start+$halfdiff, $halfneeded);
    select_n($start+$halfdiff, $end, $num_needed - $halfneeded);
  }
}

if ( ! $first && ! $last) {
  select_n(0, $numlines, $N);
} else {
  if ($first) { # --first option: same as head.
    for ($n = 0; $n < $N; $n++) {
      print $F[$n];
    }
  } else { # --last option: same as tail.
    for ($n = @F - $N; $n < @F; $n++) {
      print $F[$n];
    }
  }
}
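
Typical use, per the usage notes above (file names illustrative):

    utils/subset_scp.pl 1000 data/train/feats.scp > subset.scp      # 1000 utts, spread evenly
    utils/subset_scp.pl --first 1000 data/train/feats.scp > head.scp  # same as head -1000
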
--------------------------------------------------------------------------------
/utils/prep_ctc_trans.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright 2015  Yajie Miao (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This python script converts the word-based transcripts into label sequences. The labels are
# represented by their indices.

import sys

if __name__ == '__main__':

    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print "Usage: {0} <dict_file> <trans_file> <unk_word> [space_word]".format(sys.argv[0])
        print "e.g., utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/train/text <UNK>"
        print "<dict_file> - the lexicon file in which entries have been represented by indices"
        print "<trans_file> - the word-based transcript file"
        print "<unk_word> - the word which represents OOVs in transcripts"
        print "[space_word] - optional, the word representing spaces in the transcripts"
        exit(1)

    dict_file = sys.argv[1]
    trans_file = sys.argv[2]
    unk_word = sys.argv[3]

    is_char = False
    if len(sys.argv) == 5:
        is_char = True
        space_word = sys.argv[4]

    # read the lexicon into a dictionary data structure
    fread = open(dict_file,'r')
    dict = {}
    for line in fread.readlines():
        line = line.replace('\n','')
        splits = line.split(' ') # assume there are no multiple spaces
        word = splits[0]
        letters = ''
        for n in range(1, len(splits)):
            letters += splits[n] + ' '
        dict[word] = letters.strip()
    fread.close()

    # assume that each line is formatted as "uttid word1 word2 word3 ...", with no multiple spaces appearing
    fread = open(trans_file,'r')
    for line in fread.readlines():
        out_line = ''
        line = line.replace('\n','').strip()
        while '  ' in line:
            line = line.replace('  ', ' ') # remove multiple spaces in the transcripts

        uttid = line.split(' ')[0] # the first field is always utterance id
        trans = line.replace(uttid, '').strip()
        if is_char:
            trans = trans.replace(' ', ' ' + space_word + ' ')
        splits = trans.split(' ')

        out_line += uttid + ' '
        for n in range(0, len(splits)):
            try:
                out_line += dict[splits[n]] + ' '
            except Exception:
                out_line += dict[unk_word] + ' '
        print out_line.strip()
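
A toy run (files hypothetical): with a lexicon_numbers.txt containing
"<UNK> 1" and "好 33 7", an out-of-vocabulary word falls back to the <UNK>
entry:

    $ printf 'utt1 好 什么\n' > toy_text
    $ utils/prep_ctc_trans.py toy_lexicon_numbers.txt toy_text '<UNK>'
    utt1 33 7 1
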
--------------------------------------------------------------------------------
/utils/filter_scp.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2012  Microsoft Corporation
#                      Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f switch

$exclude = 0;
$field = 1;
$shifted = 0;

do {
  $shifted=0;
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
    $shifted=1;
  }
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
    $shifted=1
  }
} while ($shifted);

if(@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
    "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
    "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
    "only the lines that were *not* in id_list.\n" .
    "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
    "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
    "-f option, add 1 to the argument.\n";
}


$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
  @A = split;
  @A>=1 || die "Invalid id-list file line $_";
  $seen{$A[0]} = 1;
}

if ($field == 1) { # Treat this as special case, since it is common.
  while(<>) {
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
      print $_;
    }
  }
} else {
  while(<>) {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
      print $_;
    }
  }
}

# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
--------------------------------------------------------------------------------
/utils/convert_ctm.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey). Apache 2.0.

# This takes as standard input a ctm file that's "relative to the utterance",
# i.e. times are measured relative to the beginning of the segments, and it
# uses a "segments" file (format:
# utterance-id recording-id start-time end-time
# ) and a "reco2file_and_channel" file (format:
# recording-id basename-of-file channel
# ) to produce a ctm that's relative to the whole recording.

$skip_unknown=undef;
if ( $ARGV[0] eq "--skip-unknown" ) {
  $skip_unknown=1;
  shift @ARGV;
}

if (@ARGV < 2 || @ARGV > 3) {
  print STDERR "Usage: convert_ctm.pl <segments-file> <reco2file_and_channel-file> [<utterance-ctm>] > real-ctm\n";
  exit(1);
}

$segments = shift @ARGV;
$reco2file_and_channel = shift @ARGV;

open(S, "<$segments") || die "opening segments file $segments";
while(<S>) {
  @A = split(" ", $_);
  @A == 4 || die "Bad line in segments file: $_";
  ($utt, $recording_id, $begin_time, $end_time) = @A;
  $utt2reco{$utt} = $recording_id;
  $begin{$utt} = $begin_time;
  $end{$utt} = $end_time;
}
close(S);
open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel";
while(<R>) {
  @A = split(" ", $_);
  @A == 3 || die "Bad line in reco2file_and_channel file: $_";
  ($recording_id, $file, $channel) = @A;
  $reco2file{$recording_id} = $file;
  $reco2channel{$recording_id} = $channel;
}


# Now process the ctm file, which is either the standard input or the third
# command-line argument.
$num_done = 0;
while(<>) {
  @A= split(" ", $_);
  ( @A == 5 || @A == 6 ) || die "Unexpected ctm format: $_";
  # lines look like:
  # <utterance-id> 1 <begin-time> <length> <word> [ confidence ]
  ($utt, $one, $wbegin, $wlen, $w, $conf) = @A;
  $reco = $utt2reco{$utt};
  if (!defined $reco) {
    next if defined $skip_unknown;
    die "Utterance-id $utt not defined in segments file $segments";
  }
  $file = $reco2file{$reco};
  $channel = $reco2channel{$reco};
  if (!defined $file || !defined $channel) {
    die "Recording-id $reco not defined in reco2file_and_channel file $reco2file_and_channel";
  }
  $b = $begin{$utt};
  $e = $end{$utt};
  $wbegin_r = $wbegin + $b; # Make it relative to beginning of the recording.
  $wbegin_r = sprintf("%.2f", $wbegin_r);
  $wlen = sprintf("%.2f", $wlen);
  if (defined $conf) {
    $line = "$file $channel $wbegin_r $wlen $w $conf\n";
  } else {
    $line = "$file $channel $wbegin_r $wlen $w\n";
  }
  if ($wbegin_r + $wlen > $e + 0.01) {
    print STDERR "Warning: word appears to be past end of recording; line is $line";
  }
  print $line; # goes to stdout.
  $num_done++;
}

if ($num_done == 0) { exit 1; } else { exit 0; }

__END__

# Test example [also test it without the 0.5's]
echo utt reco 10.0 20.0 > segments
echo reco file A > reco2file_and_channel
echo utt 1 8.0 1.0 word 0.5 > ctm_in
echo file A 18.00 1.00 word 0.5 > ctm_out
utils/convert_ctm.pl segments reco2file_and_channel ctm_in | cmp - ctm_out || echo error
rm segments reco2file_and_channel ctm_in ctm_out
However, the job submission script can only take integers as the 13 | # job marker. That's why we set the acwt to be integers (5 ~ 10), but scale them with 0.1 14 | # when they are actually used. 15 | #end configuration section. 16 | 17 | [ -f ./path.sh ] && . ./path.sh 18 | . parse_options.sh || exit 1; 19 | 20 | if [ $# -ne 3 ]; then 21 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " 22 | echo " Options:" 23 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 24 | echo " --min_acwt # minumum LM-weight for lattice rescoring " 25 | echo " --max_acwt # maximum LM-weight for lattice rescoring " 26 | exit 1; 27 | fi 28 | 29 | data=$1 30 | lang_or_graph=$2 31 | dir=$3 32 | 33 | symtab=$lang_or_graph/words.txt 34 | 35 | for f in $symtab $dir/lat.1.gz $data/text; do 36 | [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; 37 | done 38 | 39 | mkdir -p $dir/scoring/log 40 | 41 | function filter_text { 42 | perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } 43 | while() { @A = split(" ", $_); $id = shift @A; print "$id "; 44 | foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ 45 | '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' 46 | } 47 | filter_text <$data/text >$dir/scoring/text.filt 48 | 49 | $cmd ACWT=$min_acwt:$max_acwt $dir/scoring/log/best_path.ACWT.log \ 50 | lattice-scale --acoustic-scale=ACWT --ascale-factor=$acwt_factor "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 51 | lattice-best-path --word-symbol-table=$symtab ark:- ark,t:$dir/scoring/ACWT.tra || exit 1; 52 | 53 | for acwt in `seq $min_acwt $max_acwt`; do 54 | cat $dir/scoring/${acwt}.tra | utils/int2sym.pl -f 2- $symtab | \ 55 | filter_text > $dir/scoring/$acwt.txt || exit 1; 56 | done 57 | 58 | unset LC_ALL 59 | #for character error rate 60 | cat $dir/scoring/text.filt | awk '{ print $1}' > $dir/scoring/utt_id 61 | cat $dir/scoring/text.filt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra 62 | paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/char.filt 63 | 64 | for acwt in `seq $min_acwt $max_acwt`; do 65 | cat $dir/scoring/$acwt.txt | awk '{ print $1}' > $dir/scoring/utt_id 66 | cat $dir/scoring/$acwt.txt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra 67 | paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/${acwt}.char 68 | done 69 | 70 | rm $dir/scoring/utt_tra $dir/scoring/utt_id 71 | 72 | export LC_ALL=C 73 | 74 | $cmd ACWT=$min_acwt:$max_acwt $dir/scoring/log/score.ACWT.log \ 75 | compute-wer --text --mode=present \ 76 | ark:$dir/scoring/text.filt ark:$dir/scoring/ACWT.txt ">&" $dir/wer_ACWT || exit 1; 77 | 78 | $cmd ACWT=$min_acwt:$max_acwt $dir/scoring/log/score.ACWT.cer.log \ 79 | compute-wer --text --mode=present \ 80 | ark:$dir/scoring/char.filt ark:$dir/scoring/ACWT.char ">&" $dir/cer_ACWT || exit 1; 81 | 82 | exit 0; 83 | -------------------------------------------------------------------------------- /utils/create_data_link.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013 Guoguo Chen 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0. 6 | # 7 | # This script distributes data onto different file systems by making symbolic 8 | # links. 
It is supposed to use together with utils/create_split_dir.pl, which 9 | # creates a "storage" directory that links to different file systems. 10 | # 11 | # If a sub-directory egs/storage does not exist, it does nothing. If it exists, 12 | # then it selects pseudo-randomly a number from those available in egs/storage/* 13 | # creates a link such as 14 | # 15 | # egs/egs.3.4.ark -> storage/4/egs.3.4.ark 16 | # 17 | use strict; 18 | use warnings; 19 | use File::Basename; 20 | use File::Spec; 21 | use Getopt::Long; 22 | 23 | sub GetGCD { 24 | my ($a, $b) = @_; 25 | while ($a != $b) { 26 | if ($a > $b) { 27 | $a = $a - $b; 28 | } else { 29 | $b = $b - $a; 30 | } 31 | } 32 | return $a; 33 | } 34 | 35 | my $Usage = < storage/4/egs.3.4.ark 45 | 46 | Usage: utils/create_data_link.pl 47 | e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark 48 | 49 | See also utils/remove_data_links.sh 50 | EOU 51 | 52 | GetOptions(); 53 | 54 | if (@ARGV != 1) { 55 | die $Usage; 56 | } 57 | 58 | my $fullpath = shift(@ARGV); 59 | 60 | # Check if the storage has been created. If so, do nothing. 61 | my $dirname = dirname($fullpath); 62 | if (! -d "$dirname/storage") { 63 | exit(0); 64 | } 65 | 66 | # Storage exists, create symbolic links in the next few steps. 67 | 68 | # First, get a list of the available storage direstories, and check if they are 69 | # properly created. 70 | opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n"; 71 | my @storage_dirs = grep(/^[0-9]*$/, readdir($dh)); 72 | closedir($dh); 73 | my $num_storage = scalar(@storage_dirs); 74 | for (my $x = 1; $x <= $num_storage; $x++) { 75 | (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n"; 76 | } 77 | 78 | # Second, get the coprime list. 79 | my @coprimes; 80 | for (my $n = 1; $n < $num_storage; $n++) { 81 | if (GetGCD($n, $num_storage) == 1) { 82 | push(@coprimes, $n); 83 | } 84 | } 85 | 86 | # Finally, work out the directory index where we should put the data to. 87 | my $basename = basename($fullpath); 88 | my $filename_numbers = $basename; 89 | $filename_numbers =~ s/[^0-9]+/ /g; 90 | my @filename_numbers = split(" ", $filename_numbers); 91 | my $total = 0; 92 | my $index = 0; 93 | foreach my $x (@filename_numbers) { 94 | if ($index >= scalar(@coprimes)) { 95 | $index = 0; 96 | } 97 | $total += $x * $coprimes[$index]; 98 | $index++; 99 | } 100 | my $dir_index = $total % $num_storage + 1; 101 | 102 | # Make the symbolic link. 103 | if (-e $fullpath) { 104 | unlink($fullpath); 105 | } 106 | my $ret = symlink("storage/$dir_index/$basename", $fullpath); 107 | exit($ret == 1 ? 0 : 1); 108 | -------------------------------------------------------------------------------- /steps/align_ctc_single_utt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Yajie Miao 4 | # Apache 2.0 5 | 6 | # Generate word-level alignment for a single utterance. 7 | 8 | ## Begin configuration section 9 | stage=0 10 | cmd=run.pl 11 | num_threads=1 12 | 13 | max_active=7000 # max-active 14 | beam=15.0 # beam used 15 | lattice_beam=8.0 16 | max_mem=50000000 # approx. 
limit to memory consumption during minimization in bytes 17 | 18 | acoustic_scale=0.6 # the acoustic scale to be used 19 | oov_word="<UNK>" # the oov word, used to convert oov words in the transcripts 20 | # feature configurations; will be read from the training dir if not provided 21 | norm_vars= 22 | add_deltas= 23 | ## End configuration section 24 | 25 | echo "$0 $@" # Print the command line for logging 26 | 27 | [ -f ./path.sh ] && . ./path.sh; 28 | . parse_options.sh || exit 1; 29 | 30 | if [ $# != 5 ]; then 31 | echo "Wrong #arguments ($#, expected 5)" 32 | echo "Usage: steps/align_ctc_single_utt.sh [options] <lang-dir> <data-dir> <utt-data-dir> <model-dir> <align-dir>" 33 | echo " e.g.: steps/align_ctc_single_utt.sh data/lang_phn data/train data/uttdata exp/train_phn_l5_c320 exp/train_phn_l5_c320/align" 34 | echo "main options (for others, see top of script file)" 35 | echo " --stage # starts from which stage" 36 | echo " --nj # number of parallel jobs" 37 | echo " --cmd # command to run in parallel with" 38 | echo " --acoustic_scale # default 0.6 the value of acoustic scale to be used" 39 | exit 1; 40 | fi 41 | 42 | langdir=$1 43 | data=$2 44 | uttdata=$3 45 | mdldir=$4 46 | dir=`echo $5 | sed 's:/$::g'` # remove any trailing slash. 47 | 48 | thread_string= 49 | [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" 50 | 51 | [ -z "$add_deltas" ] && add_deltas=`cat $mdldir/add_deltas 2>/dev/null` 52 | [ -z "$norm_vars" ] && norm_vars=`cat $mdldir/norm_vars 2>/dev/null` 53 | 54 | mkdir -p $dir/log 55 | 56 | # Check if necessary files exist. 57 | for f in $mdldir/final.nnet $mdldir/label.counts $data/feats.scp $uttdata/feats.scp $uttdata/text; do 58 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 59 | done 60 | 61 | ## Set up the features 62 | echo "$0: feature: norm_vars(${norm_vars}) add_deltas(${add_deltas})" 63 | feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$uttdata/feats.scp ark:- |" 64 | $add_deltas && feats="$feats add-deltas ark:- ark:- |" 65 | ## 66 | 67 | ## Create the "decoding" graph for this utterance 68 | oov_int=`grep $oov_word $langdir/words.txt | awk '{print $2}'` 69 | 70 | utils/sym2int.pl --map-oov $oov_int -f 2- $langdir/words.txt $uttdata/text > $dir/text_int 71 | 72 | utils/training_trans_fst.py $dir/text_int | fstcompile | fstarcsort --sort_type=olabel > $dir/G.fst 73 | 74 | fsttablecompose ${langdir}/L.fst $dir/G.fst | fstdeterminizestar --use-log=true | \ 75 | fstminimizeencoded | fstarcsort --sort_type=ilabel > $dir/LG.fst || exit 1; 76 | 77 | fsttablecompose ${langdir}/T.fst $dir/LG.fst > $dir/TLG.fst || exit 1; 78 | 79 | ## Generate alignments 80 | net-output-extract --class-frame-counts=$mdldir/label.counts --apply-log=true $mdldir/final.nnet "$feats" ark:- | \ 81 | latgen-faster --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$lattice_beam \ 82 | --acoustic-scale=$acoustic_scale --word-symbol-table=$langdir/words.txt --allow-partial=true $dir/TLG.fst ark:- ark:- | \ 83 | lattice-1best --acoustic-scale=$acoustic_scale --ascale-factor=1 ark:- ark:- | \ 84 | nbest-to-ctm ark:- - | \ 85 | utils/int2sym.pl -f 5 $langdir/words.txt > $dir/ali 86 | 87 | exit 0; 88 | -------------------------------------------------------------------------------- /utils/sym2int.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 3 | 4 | # Licensed under the Apache License, Version 2.0
(the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | $ignore_oov = 0; 19 | 20 | for($x = 0; $x < 2; $x++) { 21 | if ($ARGV[0] eq "--map-oov") { 22 | shift @ARGV; 23 | $map_oov = shift @ARGV; 24 | if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { 25 | # disallow '-f', the empty string and anything ending in words.txt as the 26 | # OOV symbol because these are likely command-line errors. 27 | die "the --map-oov option requires an argument"; 28 | } 29 | } 30 | if ($ARGV[0] eq "-f") { 31 | shift @ARGV; 32 | $field_spec = shift @ARGV; 33 | if ($field_spec =~ m/^\d+$/) { 34 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 35 | } 36 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 37 | if ($1 ne "") { 38 | $field_begin = $1 - 1; # Change to zero-based indexing. 39 | } 40 | if ($2 ne "") { 41 | $field_end = $2 - 1; # Change to zero-based indexing. 42 | } 43 | } 44 | if (!defined $field_begin && !defined $field_end) { 45 | die "Bad argument to -f option: $field_spec"; 46 | } 47 | } 48 | } 49 | 50 | $symtab = shift @ARGV; 51 | if (!defined $symtab) { 52 | print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . 53 | "options: [--map-oov <oov-symbol>] [-f <field-range>]\n" .
54 | "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n"; 55 | } 56 | open(F, "<$symtab") || die "Error opening symbol table file $symtab"; 57 | while(<F>) { 58 | @A = split(" ", $_); 59 | @A == 2 || die "bad line in symbol table file: $_"; 60 | $sym2int{$A[0]} = $A[1] + 0; 61 | } 62 | 63 | if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up 64 | if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } 65 | $map_oov = $sym2int{$map_oov}; 66 | } 67 | 68 | $num_warning = 0; 69 | $max_warning = 20; 70 | 71 | while (<>) { 72 | @A = split(" ", $_); 73 | @B = (); 74 | for ($n = 0; $n < @A; $n++) { 75 | $a = $A[$n]; 76 | if ( (!defined $field_begin || $n >= $field_begin) 77 | && (!defined $field_end || $n <= $field_end)) { 78 | $i = $sym2int{$a}; 79 | if (!defined ($i)) { 80 | if (defined $map_oov) { 81 | if ($num_warning++ < $max_warning) { 82 | print STDERR "sym2int.pl: replacing $a with $map_oov\n"; 83 | if ($num_warning == $max_warning) { 84 | print STDERR "sym2int.pl: not warning for OOVs any more times\n"; 85 | } 86 | } 87 | $i = $map_oov; 88 | } else { 89 | $pos = $n+1; 90 | die "sym2int.pl: undefined symbol $a (in position $pos)\n"; 91 | } 92 | } 93 | $a = $i; 94 | } 95 | push @B, $a; 96 | } 97 | print join(" ", @B); 98 | print "\n"; 99 | } 100 | if ($num_warning > 0) { 101 | print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; 102 | } 103 | 104 | exit(0); 105 | -------------------------------------------------------------------------------- /steps/decode_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 4 | # 2015 Yajie Miao (Carnegie Mellon University) 5 | # Apache 2.0 6 | 7 | # Decode the CTC-trained model. Currently we are using the simplest best-path decoding. Lattice-based 8 | # decoding and other formats of decoding outputs (e.g., CTM) will be added in our future development. 9 | 10 | 11 | ## Begin configuration section 12 | stage=0 13 | nj=16 14 | cmd=run.pl 15 | num_threads=1 16 | 17 | max_active=7000 # max-active 18 | beam=15.0 # beam used 19 | 20 | skip_scoring=false # whether to skip WER scoring 21 | acoustic_scales="0.5 0.6 0.7 0.8" # the acoustic scales to be used 22 | 23 | # feature configurations; will be read from the training dir if not provided 24 | norm_vars= 25 | add_deltas= 26 | ## End configuration section 27 | 28 | echo "$0 $@" # Print the command line for logging 29 | 30 | [ -f ./path.sh ] && . ./path.sh; 31 | . parse_options.sh || exit 1; 32 | 33 | if [ $# != 3 ]; then 34 | echo "Wrong #arguments ($#, expected 3)" 35 | echo "Usage: steps/decode_ctc.sh [options] <graph-dir> <data-dir> <decode-dir>" 36 | echo " e.g.: steps/decode_ctc.sh data/lang data/test exp/train_l4_c320/decode" 37 | echo "main options (for others, see top of script file)" 38 | echo " --stage # starts from which stage" 39 | echo " --nj # number of parallel jobs" 40 | echo " --cmd # command to run in parallel with" 41 | echo " --acoustic_scales # default 0.5 0.6 0.7 0.8 ... the values of acoustic scales to be used" 42 | exit 1; 43 | fi 44 | 45 | graphdir=$1 46 | data=$2 47 | dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. 48 | 49 | srcdir=`dirname $dir`; # assume model directory one level up from decoding directory.
50 | sdata=$data/split$nj; 51 | 52 | thread_string= 53 | [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" 54 | 55 | [ -z "$add_deltas" ] && add_deltas=`cat $srcdir/add_deltas 2>/dev/null` 56 | [ -z "$norm_vars" ] && norm_vars=`cat $srcdir/norm_vars 2>/dev/null` 57 | 58 | mkdir -p $dir/log 59 | split_data.sh $data $nj || exit 1; 60 | echo $nj > $dir/num_jobs 61 | 62 | # Check if necessary files exist. 63 | for f in $graphdir/TLG.fst $srcdir/label.counts $data/feats.scp; do 64 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 65 | done 66 | 67 | ## Set up the features 68 | echo "$0: feature: norm_vars(${norm_vars}) add_deltas(${add_deltas})" 69 | feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" 70 | $add_deltas && feats="$feats add-deltas ark:- ark:- |" 71 | ## 72 | 73 | # Decode for each of the acoustic scales 74 | for ascale in $acoustic_scales; do 75 | echo "$0: decoding with acoustic scale $ascale" 76 | $cmd JOB=1:$nj $dir/log/decode.$ascale.JOB.log \ 77 | net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true $srcdir/final.nnet "$feats" ark:- \| \ 78 | decode-faster --beam=$beam --max-active=$max_active --acoustic-scale=$ascale --word-symbol-table=$graphdir/words.txt \ 79 | --allow-partial=true $graphdir/TLG.fst ark:- ark,t:$dir/trans.$ascale.JOB 80 | cat $dir/trans.$ascale.* > $dir/trans.$ascale 81 | rm -f $dir/trans.$ascale.* 82 | done 83 | 84 | # Scoring 85 | cat $data/text | sed 's:::g' | sed 's:::g' | sed 's:::g' > $dir/text_filt 86 | if ! $skip_scoring ; then 87 | for ascale in $acoustic_scales; do 88 | cat $dir/trans.$ascale | utils/int2sym.pl -f 2- $graphdir/words.txt | \ 89 | sed 's:::g' | sed 's:::g' | sed 's:::g' | \ 90 | compute-wer --text --mode=present ark:$dir/text_filt ark,p:- >& $dir/wer_$ascale || exit 1; 91 | done 92 | fi 93 | 94 | exit 0; 95 | -------------------------------------------------------------------------------- /utils/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); 4 | # Arnab Ghoshal, Karel Vesely 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | # Parse command-line options. 21 | # To be sourced by another script (as in ". parse_options.sh"). 22 | # Option format is: --option-name arg 23 | # and shell variable "option_name" gets set to value "arg." 24 | # The exception is --help, which takes no arguments, but prints the 25 | # $help_message variable (if defined). 26 | 27 | 28 | ### 29 | ### The --config file options have lower priority to command line 30 | ### options, so we need to import them first... 
31 | ### 32 | 33 | # Now import all the configs specified by command-line, in left-to-right order 34 | for ((argpos=1; argpos<$#; argpos++)); do 35 | if [ "${!argpos}" == "--config" ]; then 36 | argpos_plus1=$((argpos+1)) 37 | config=${!argpos_plus1} 38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 39 | . $config # source the config file. 40 | fi 41 | done 42 | 43 | 44 | ### 45 | ### Now we process the command line options 46 | ### 47 | while true; do 48 | [ -z "${1:-}" ] && break; # break if there are no arguments 49 | case "$1" in 50 | # If the enclosing script is called with --help option, print the help 51 | # message and exit. Scripts should put help messages in $help_message 52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; 53 | else printf "$help_message\n" 1>&2 ; fi; 54 | exit 0 ;; 55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 56 | exit 1 ;; 57 | # If the first command-line argument begins with "--" (e.g. --foo-bar), 58 | # then work out the variable name as $name, which will equal "foo_bar". 59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 60 | # Next we test whether the variable in question is undefined-- if so it's 61 | # an invalid option and we die. Note: $0 evaluates to the name of the 62 | # enclosing script. 63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar 64 | # is undefined. We then have to wrap this test inside "eval" because 65 | # foo_bar is itself inside a variable ($name). 66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; 67 | 68 | oldval="`eval echo \\$$name`"; 69 | # Work out whether we seem to be expecting a Boolean argument. 70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | eval $name=\"$2\"; 79 | 80 | # Check that Boolean-valued arguments are really Boolean. 81 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 82 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 83 | exit 1; 84 | fi 85 | shift 2; 86 | ;; 87 | *) break; 88 | esac 89 | done 90 | 91 | 92 | # Check for an empty argument to the --cmd option, which can easily occur as a 93 | # result of scripting errors. 94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 95 | 96 | 97 | true; # so this script returns exit code 0. 98 | -------------------------------------------------------------------------------- /utils/pinyin_map.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | $num_args = $#ARGV + 1; 4 | if ($num_args != 1) { 5 | print "\nUsage: pinyin_map.pl pinyin2phone\n"; 6 | exit; 7 | } 8 | 9 | open(MAPS, $ARGV[0]) or die("Could not open pinyin map file."); 10 | my %py2ph; foreach $line (<MAPS>) { @A = split(" ", $line); 11 | $py = shift(@A); 12 | $py2ph{$py} = [@A]; 13 | } 14 | 15 | #foreach $word ( keys %py2ph ) { 16 | #foreach $i ( 0 ..
$#{ $py2ph{$word} } ) { 17 | # print " $word = $py2ph{$word}[$i]"; 18 | #} 19 | #print " $#{ $py2ph{$word} }"; 20 | #print "\n"; 21 | #} 22 | 23 | my @entry; 24 | 25 | while () { 26 | @A = split(" ", $_); 27 | @entry = (); 28 | $W = shift(@A); 29 | push(@entry, $W); 30 | for($i = 0; $i < @A; $i++) { 31 | $initial= $A[$i]; $final = $A[$i]; 32 | #print $initial, " ", $final, "\n"; 33 | if ($A[$i] =~ /^CH[A-Z0-9]+$/) {$initial =~ s:(CH)[A-Z0-9]+:$1:; $final =~ s:CH([A-Z0-9]+):$1:;} 34 | elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} 35 | elsif ($A[$i] =~ /^ZH[A-Z0-9]+$/) {$initial =~ s:(ZH)[A-Z0-9]+:$1:; $final =~ s:ZH([A-Z0-9]+):$1:;} 36 | elsif ($A[$i] =~ /^B[A-Z0-9]+$/) {$initial =~ s:(B)[A-Z0-9]+:$1:; $final =~ s:B([A-Z0-9]+):$1:;} 37 | elsif ($A[$i] =~ /^C[A-Z0-9]+$/) {$initial =~ s:(C)[A-Z0-9]+:$1:; $final =~ s:C([A-Z0-9]+):$1:;} 38 | elsif ($A[$i] =~ /^D[A-Z0-9]+$/) {$initial =~ s:(D)[A-Z0-9]+:$1:; $final =~ s:D([A-Z0-9]+):$1:;} 39 | elsif ($A[$i] =~ /^F[A-Z0-9]+$/) {$initial =~ s:(F)[A-Z0-9]+:$1:; $final =~ s:F([A-Z0-9]+):$1:;} 40 | elsif ($A[$i] =~ /^G[A-Z0-9]+$/) {$initial =~ s:(G)[A-Z0-9]+:$1:; $final =~ s:G([A-Z0-9]+):$1:;} 41 | elsif ($A[$i] =~ /^H[A-Z0-9]+$/) {$initial =~ s:(H)[A-Z0-9]+:$1:; $final =~ s:H([A-Z0-9]+):$1:;} 42 | elsif ($A[$i] =~ /^J[A-Z0-9]+$/) {$initial =~ s:(J)[A-Z0-9]+:$1:; $final =~ s:J([A-Z0-9]+):$1:;} 43 | elsif ($A[$i] =~ /^K[A-Z0-9]+$/) {$initial =~ s:(K)[A-Z0-9]+:$1:; $final =~ s:K([A-Z0-9]+):$1:;} 44 | elsif ($A[$i] =~ /^L[A-Z0-9]+$/) {$initial =~ s:(L)[A-Z0-9]+:$1:; $final =~ s:L([A-Z0-9]+):$1:;} 45 | elsif ($A[$i] =~ /^M[A-Z0-9]+$/) {$initial =~ s:(M)[A-Z0-9]+:$1:; $final =~ s:M([A-Z0-9]+):$1:;} 46 | elsif ($A[$i] =~ /^N[A-Z0-9]+$/) {$initial =~ s:(N)[A-Z0-9]+:$1:; $final =~ s:N([A-Z0-9]+):$1:;} 47 | elsif ($A[$i] =~ /^P[A-Z0-9]+$/) {$initial =~ s:(P)[A-Z0-9]+:$1:; $final =~ s:P([A-Z0-9]+):$1:;} 48 | elsif ($A[$i] =~ /^Q[A-Z0-9]+$/) {$initial =~ s:(Q)[A-Z0-9]+:$1:; $final =~ s:Q([A-Z0-9]+):$1:;} 49 | elsif ($A[$i] =~ /^R[A-Z0-9]+$/) {$initial =~ s:(R)[A-Z0-9]+:$1:; $final =~ s:R([A-Z0-9]+):$1:;} 50 | elsif ($A[$i] =~ /^S[A-Z0-9]+$/) {$initial =~ s:(S)[A-Z0-9]+:$1:; $final =~ s:S([A-Z0-9]+):$1:;} 51 | elsif ($A[$i] =~ /^T[A-Z0-9]+$/) {$initial =~ s:(T)[A-Z0-9]+:$1:; $final =~ s:T([A-Z0-9]+):$1:;} 52 | elsif ($A[$i] =~ /^W[A-Z0-9]+$/) {$initial =~ s:(W)[A-Z0-9]+:$1:; $final =~ s:W([A-Z0-9]+):$1:;} 53 | elsif ($A[$i] =~ /^X[A-Z0-9]+$/) {$initial =~ s:(X)[A-Z0-9]+:$1:; $final =~ s:X([A-Z0-9]+):$1:;} 54 | elsif ($A[$i] =~ /^Y[A-Z0-9]+$/) {$initial =~ s:(Y)[A-Z0-9]+:$1:; $final =~ s:Y([A-Z0-9]+):$1:;} 55 | elsif ($A[$i] =~ /^Z[A-Z0-9]+$/) {$initial =~ s:(Z)[A-Z0-9]+:$1:; $final =~ s:Z([A-Z0-9]+):$1:;} 56 | if ($initial ne $A[$i]) { 57 | $tone = $final; 58 | $final =~ s:([A-Z]+)[0-9]:$1:; 59 | $tone =~ s:[A-Z]+([0-9]):$1:; 60 | if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { print "1: no entry find for ", $A[$i], " ", $initial, " ", $final; exit;} 61 | push(@entry, @{$py2ph{$initial}}); 62 | @tmp = @{$py2ph{$final}}; 63 | for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} 64 | push(@entry, @tmp); 65 | } 66 | else { 67 | $tone = $A[$i]; 68 | $A[$i] =~ s:([A-Z]+)[0-9]:$1:; 69 | $tone =~ s:[A-Z]+([0-9]):$1:; 70 | if (!(exists $py2ph{$A[$i]})) { print "2: no entry find for ", $A[$i]; exit;} 71 | @tmp = @{$py2ph{$A[$i]}}; 72 | for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} 73 | push(@entry, @tmp); 74 | } 75 | } 76 | print "@entry"; 77 | print "\n"; 78 | } 79 | 
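[Editor's sketch, not part of the original repo: one worked example of pinyin_map.pl. Assume a hypothetical map file pinyin2phone containing the two lines "ZH zh" and "ANG ang". For a lexicon line "zhang ZHANG1", the script splits the toned syllable into initial ZH and final ANG, moves the tone digit onto the final's phones, and prints the word followed by the phones:]
echo "zhang ZHANG1" | utils/pinyin_map.pl pinyin2phone   # -> "zhang zh ang1"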
-------------------------------------------------------------------------------- /utils/subset_data_dir_tr_cv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin); 3 | # Brno University of Technology (Author: Karel Vesely); 4 | # Johns Hopkins University (Author: Daniel Povey); 5 | # Apache 2.0 6 | 7 | # This script splits dataset to two parts : 8 | # training set from (100-P)% of speakers/utterances and 9 | # held-out set (or cross-validation) from P% of remaining speakers/remaining utterances, 10 | # which will be later on used for neural network training 11 | # 12 | # There are two options for choosing held-out (or cross-validation) set, either by 13 | # --cv-spk-percent P , which will give you CV set based on random chosen P% of speakers, or 14 | # --cv-utt-percent P , which will give you CV set based on last P% utterances in the dataset 15 | # 16 | # If you don't apply the above two options, by default the script will use --cv-utt-percent option, 17 | # and the default cross validation percentage portion is equal to 10% (i.e. P=10) 18 | # 19 | # The --cv-spk-percent option is useful if you would like to have subset chosen from random speakers order, 20 | # especially for the cases where dataset contains multiple different corpora, 21 | # where type of speakers or recording channels may be quite different 22 | 23 | # Begin configuration. 24 | cv_spk_percent= # % of speakers is parsed by option 25 | cv_utt_percent=10 # default 10% of total utterances 26 | seed=777 # use seed for speaker shuffling 27 | # End configuration. 28 | 29 | echo "$0 $@" # Print the command line for logging 30 | 31 | uttbase=true; # by default, we choose last 10% utterances for CV 32 | 33 | if [ "$1" == "--cv-spk-percent" ]; then 34 | uttbase=false; 35 | spkbase=true; 36 | fi 37 | 38 | [ -f path.sh ] && . ./path.sh; 39 | 40 | . parse_options.sh || exit 1; 41 | 42 | if [ $# != 3 ]; then 43 | echo "Usage: $0 [--cv-spk-percent P|--cv-utt-percent P] " 44 | echo " --cv-spk-percent P Cross Validation portion of the total speakers, recommend value is 10% (i.e. P=10)" 45 | echo " --cv-utt-percent P Cross Validation portion of the total utterances, default is 10% (i.e. P=10)" 46 | echo " " 47 | exit 1; 48 | fi 49 | 50 | srcdir=$1 51 | trndir=$2 52 | cvdir=$3 53 | 54 | ## use simple last P% utterance for CV 55 | if $uttbase; then 56 | if [ ! -f $srcdir/utt2spk ]; then 57 | echo "$0: no such file $srcdir/utt2spk" 58 | exit 1; 59 | fi 60 | 61 | #total number of lines 62 | N=$(cat $srcdir/utt2spk | wc -l) 63 | #get line number where (100-P)% of the data lies 64 | P_utt=$((N * cv_utt_percent / 100)) 65 | N_head=$((N -P_utt)) 66 | #move the boundary so it is located on speaker change 67 | N_head=$(cat $srcdir/utt2spk | uniq -f1 -c | awk '{ if(n+$1<='$N_head') { n += $1 } else { nextfile } } END{ print n }') 68 | #the rest of the data will be that big 69 | N_tail=$((N-N_head)) 70 | 71 | #now call the subset_data_dir.sh and fix the directories 72 | subset_data_dir.sh --first $srcdir $N_head $trndir 73 | subset_data_dir.sh --last $srcdir $N_tail $cvdir 74 | 75 | exit 0; 76 | fi 77 | 78 | ## use random chosen P% speakers for CV 79 | if [ ! 
-f $srcdir/spk2utt ]; then 80 | echo "$0: no such file $srcdir/spk2utt" 81 | exit 1; 82 | fi 83 | 84 | #total, cv, train number of speakers 85 | N=$(cat $srcdir/spk2utt | wc -l) 86 | N_spk_cv=$((N * cv_spk_percent / 100)) 87 | N_spk_trn=$((N - N_spk_cv)) 88 | 89 | mkdir -p $cvdir $trndir 90 | 91 | #shuffle the speaker list 92 | awk '{print $1}' $srcdir/spk2utt | shuffle_list.pl --srand $seed > $trndir/_tmpf_randspk 93 | 94 | #split the train/cv 95 | head -n $N_spk_cv $trndir/_tmpf_randspk > $cvdir/_tmpf_cvspk 96 | tail -n $N_spk_trn $trndir/_tmpf_randspk > $trndir/_tmpf_trainspk 97 | 98 | #now call the subset_data_dir.sh 99 | subset_data_dir.sh --spk-list $trndir/_tmpf_trainspk $srcdir $trndir 100 | subset_data_dir.sh --spk-list $cvdir/_tmpf_cvspk $srcdir $cvdir 101 | 102 | #clean-up 103 | rm -f $trndir/_tmpf_randspk $trndir/_tmpf_trainspk $cvdir/_tmpf_cvspk 104 | 105 | -------------------------------------------------------------------------------- /utils/add_lex_disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # Adds disambiguation symbols to a lexicon. 20 | # Outputs still in the normal lexicon format. 21 | # Disambig syms are numbered #1, #2, #3, etc. (#0 22 | # reserved for symbol in grammar). 23 | # Outputs the number of disambig syms to the standard output. 24 | # With the --pron-probs option, expects the second field 25 | # of each lexicon line to be a pron-prob. 26 | 27 | $pron_probs = 0; 28 | 29 | if ($ARGV[0] eq "--pron-probs") { 30 | $pron_probs = 1; 31 | shift @ARGV; 32 | } 33 | 34 | if(@ARGV != 2) { 35 | die "Usage: add_lex_disambig.pl [--pron-probs] lexicon.txt lexicon_disambig.txt " 36 | } 37 | 38 | 39 | $lexfn = shift @ARGV; 40 | $lexoutfn = shift @ARGV; 41 | 42 | open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; 43 | 44 | # (1) Read in the lexicon. 45 | @L = ( ); 46 | while() { 47 | @A = split(" ", $_); 48 | push @L, join(" ", @A); 49 | } 50 | 51 | # (2) Work out the count of each phone-sequence in the 52 | # lexicon. 53 | 54 | foreach $l (@L) { 55 | @A = split(" ", $l); 56 | shift @A; # Remove word. 57 | if ($pron_probs) { 58 | $p = shift @A; 59 | if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } 60 | } 61 | $count{join(" ",@A)}++; 62 | } 63 | 64 | # (3) For each left sub-sequence of each phone-sequence, note down 65 | # that exists (for identifying prefixes of longer strings). 66 | 67 | foreach $l (@L) { 68 | @A = split(" ", $l); 69 | shift @A; # Remove word. 70 | if ($pron_probs) { shift @A; } # remove pron-prob. 
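# (Editor's annotation, a hedged toy example of the bookkeeping below.) For a
# lexicon entry "abc a b c", the inner loop records "a b", "a" and "" as seen
# sub-sequences; a later word pronounced just "a b" is then detected as a
# prefix of another entry, and step (4) appends a disambiguation symbol to its
# pronunciation, e.g. "a b #1".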
71 | while(@A > 0) { 72 | pop @A; # Remove last phone 73 | $issubseq{join(" ",@A)} = 1; 74 | } 75 | } 76 | 77 | # (4) For each entry in the lexicon: 78 | # if the phone sequence is unique and is not a 79 | # prefix of another word, no diambig symbol. 80 | # Else output #1, or #2, #3, ... if the same phone-seq 81 | # has already been assigned a disambig symbol. 82 | 83 | 84 | open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; 85 | 86 | $max_disambig = 0; 87 | foreach $l (@L) { 88 | @A = split(" ", $l); 89 | $word = shift @A; 90 | if ($pron_probs) { $pron_prob = shift @A; } 91 | $phnseq = join(" ",@A); 92 | if(!defined $issubseq{$phnseq} 93 | && $count{$phnseq} == 1) { 94 | ; # Do nothing. 95 | } else { 96 | if($phnseq eq "") { # need disambig symbols for the empty string 97 | # that are not use anywhere else. 98 | $max_disambig++; 99 | $reserved{$max_disambig} = 1; 100 | $phnseq = "#$max_disambig"; 101 | } else { 102 | $curnumber = $disambig_of{$phnseq}; 103 | if(!defined{$curnumber}) { $curnumber = 0; } 104 | $curnumber++; # now 1 or 2, ... 105 | while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols 106 | if($curnumber > $max_disambig) { 107 | $max_disambig = $curnumber; 108 | } 109 | $disambig_of{$phnseq} = $curnumber; 110 | $phnseq = $phnseq . " #" . $curnumber; 111 | } 112 | } 113 | if ($pron_probs) { print O "$word\t$pron_prob\t$phnseq\n"; } 114 | else { print O "$word\t$phnseq\n"; } 115 | } 116 | 117 | print $max_disambig . "\n"; 118 | 119 | -------------------------------------------------------------------------------- /steps/compute_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0 5 | # To be run from .. (one directory up from here) 6 | # see ../run.sh for example 7 | 8 | # Compute cepstral mean and variance statistics per speaker. 9 | # We do this in just one job; it's fast. 10 | # This script takes no options. 11 | # 12 | # Note: there is no option to do CMVN per utterance. The idea is 13 | # that if you did it per utterance it would not make sense to do 14 | # per-speaker fMLLR on top of that (since you'd be doing fMLLR on 15 | # top of different offsets). Therefore what would be the use 16 | # of the speaker information? In this case you should probably 17 | # make the speaker-ids identical to the utterance-ids. The 18 | # speaker information does not have to correspond to actual 19 | # speakers, it's just the level you want to adapt at. 20 | 21 | echo "$0 $@" # Print the command line for logging 22 | 23 | fake=false 24 | fake_dims= # If specified, can generate 'fake' stats (that won't normalize) 25 | # from a specified dimension. 26 | two_channel=false 27 | 28 | if [ "$1" == "--fake" ]; then 29 | fake=true 30 | shift 31 | fi 32 | if [ "$1" == "--fake-dims" ]; then 33 | fake_dims=$2 34 | shift 35 | shift 36 | fi 37 | if [ "$1" == "--two-channel" ]; then 38 | two_channel=true 39 | shift 40 | fi 41 | 42 | if [ $# != 3 ]; then 43 | echo "Usage: $0 [options] "; 44 | echo "e.g.: $0 data/train exp/make_mfcc/train mfcc" 45 | echo "Options:" 46 | echo " --fake gives you fake cmvn stats that do no normalization." 47 | echo " --two-channel is for two-channel telephone data, there must be no segments " 48 | echo " file and reco2file_and_channel must be present. It will take" 49 | echo " only frames that are louder than the other channel." 
50 | echo " --fake-dims Generate stats that won't cause normalization for these" 51 | echo " dimensions (e.g. 13:14:15)" 52 | exit 1; 53 | fi 54 | 55 | if [ -f path.sh ]; then . ./path.sh; fi 56 | 57 | data=$1 58 | logdir=$2 59 | cmvndir=$3 60 | 61 | # make $cmvndir an absolute pathname. 62 | cmvndir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $cmvndir ${PWD}` 63 | 64 | # use "name" as part of name of the archive. 65 | name=`basename $data` 66 | 67 | mkdir -p $cmvndir || exit 1; 68 | mkdir -p $logdir || exit 1; 69 | 70 | 71 | required="$data/feats.scp $data/spk2utt" 72 | 73 | for f in $required; do 74 | if [ ! -f $f ]; then 75 | echo "make_cmvn.sh: no such file $f" 76 | exit 1; 77 | fi 78 | done 79 | 80 | if $fake; then 81 | dim=`feat-to-dim scp:$data/feats.scp -` 82 | ! cat $data/spk2utt | awk -v dim=$dim '{print $1, "["; for (n=0; n < dim; n++) { printf("0 "); } print "1"; 83 | for (n=0; n < dim; n++) { printf("1 "); } print "0 ]";}' | \ 84 | copy-matrix ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \ 85 | echo "Error creating fake CMVN stats" && exit 1; 86 | elif $two_channel; then 87 | ! compute-cmvn-stats-two-channel $data/reco2file_and_channel scp:$data/feats.scp \ 88 | ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \ 89 | 2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats (using two-channel method)" && exit 1; 90 | elif [ ! -z "$fake_dims" ]; then 91 | ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | \ 92 | modify-cmvn-stats "$fake_dims" ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \ 93 | echo "Error computing (partially fake) CMVN stats" && exit 1; 94 | else 95 | ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \ 96 | 2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats" && exit 1; 97 | fi 98 | 99 | cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1; 100 | 101 | nc=`cat $data/cmvn.scp | wc -l` 102 | nu=`cat $data/spk2utt | wc -l` 103 | if [ $nc -ne $nu ]; then 104 | echo "$0: warning: it seems not all of the speakers got cmvn stats ($nc != $nu);" 105 | [ $nc -eq 0 ] && exit 1; 106 | fi 107 | 108 | echo "Succeeded creating CMVN stats for $name" 109 | -------------------------------------------------------------------------------- /steps/make_fbank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Karel Vesely Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0 5 | # To be run from .. (one directory up from here) 6 | # see ../run.sh for example 7 | 8 | # Begin configuration section. 9 | nj=4 10 | cmd=run.pl 11 | fbank_config=conf/fbank.conf 12 | compress=true 13 | # End configuration section. 14 | 15 | echo "$0 $@" # Print the command line for logging 16 | 17 | if [ -f path.sh ]; then . ./path.sh; fi 18 | . parse_options.sh || exit 1; 19 | 20 | if [ $# != 3 ]; then 21 | echo "usage: make_fbank.sh [options] "; 22 | echo "options: " 23 | echo " --fbank-config # config passed to compute-fbank-feats " 24 | echo " --nj # number of parallel jobs" 25 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 26 | exit 1; 27 | fi 28 | 29 | data=$1 30 | logdir=$2 31 | fbankdir=$3 32 | 33 | 34 | # make $fbankdir an absolute pathname. 
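# (Editor's sketch, not part of the original script.) The perl one-liner below
# prepends $PWD only when the given path is relative; a pure-bash equivalent:
#   case $fbankdir in /*) ;; *) fbankdir=$PWD/$fbankdir ;; esac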
35 | fbankdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbankdir ${PWD}` 36 | 37 | # use "name" as part of name of the archive. 38 | name=`basename $data` 39 | 40 | mkdir -p $fbankdir || exit 1; 41 | mkdir -p $logdir || exit 1; 42 | 43 | if [ -f $data/feats.scp ]; then 44 | mkdir -p $data/.backup 45 | echo "$0: moving $data/feats.scp to $data/.backup" 46 | mv $data/feats.scp $data/.backup 47 | fi 48 | 49 | scp=$data/wav.scp 50 | 51 | required="$scp $fbank_config" 52 | 53 | for f in $required; do 54 | if [ ! -f $f ]; then 55 | echo "make_fbank.sh: no such file $f" 56 | exit 1; 57 | fi 58 | done 59 | 60 | utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; 61 | 62 | if [ -f $data/spk2warp ]; then 63 | echo "$0 [info]: using VTLN warp factors from $data/spk2warp" 64 | vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk" 65 | elif [ -f $data/utt2warp ]; then 66 | echo "$0 [info]: using VTLN warp factors from $data/utt2warp" 67 | vtln_opts="--vtln-map=ark:$data/utt2warp" 68 | fi 69 | 70 | for n in $(seq $nj); do 71 | # the next command does nothing unless $fbankdir/storage/ exists, see 72 | # utils/create_data_link.pl for more info. 73 | utils/create_data_link.pl $fbankdir/raw_fbank_$name.$n.ark 74 | done 75 | 76 | if [ -f $data/segments ]; then 77 | echo "$0 [info]: segments file exists: using that." 78 | split_segments="" 79 | for n in $(seq $nj); do 80 | split_segments="$split_segments $logdir/segments.$n" 81 | done 82 | 83 | utils/split_scp.pl $data/segments $split_segments || exit 1; 84 | rm $logdir/.error 2>/dev/null 85 | 86 | $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ 87 | extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \ 88 | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- \| \ 89 | copy-feats --compress=$compress ark:- \ 90 | ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ 91 | || exit 1; 92 | 93 | else 94 | echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 95 | split_scps="" 96 | for n in $(seq $nj); do 97 | split_scps="$split_scps $logdir/wav.$n.scp" 98 | done 99 | 100 | utils/split_scp.pl $scp $split_scps || exit 1; 101 | 102 | $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ 103 | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \ 104 | copy-feats --compress=$compress ark:- \ 105 | ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ 106 | || exit 1; 107 | 108 | fi 109 | 110 | 111 | if [ -f $logdir/.error.$name ]; then 112 | echo "Error producing fbank features for $name:" 113 | tail $logdir/make_fbank_${name}.1.log 114 | exit 1; 115 | fi 116 | 117 | # concatenate the .scp files together. 
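# (Editor's note.) Each line of a Kaldi-style .scp maps an utterance id to an
# archive plus byte offset, e.g. "utt001 /data/raw_fbank_train.1.ark:12"
# (id, path and offset here are hypothetical); the loop below simply merges
# the per-job lists, in job order, into one feats.scp for the whole data dir.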
118 | for n in $(seq $nj); do 119 | cat $fbankdir/raw_fbank_$name.$n.scp || exit 1; 120 | done > $data/feats.scp 121 | 122 | rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null 123 | 124 | nf=`cat $data/feats.scp | wc -l` 125 | nu=`cat $data/utt2spk | wc -l` 126 | if [ $nf -ne $nu ]; then 127 | echo "It seems not all of the feature files were successfully ($nf != $nu);" 128 | echo "consider using utils/fix_data_dir.sh $data" 129 | fi 130 | 131 | echo "Succeeded creating filterbank features for $name" 132 | #echo -e "\n" 133 | -------------------------------------------------------------------------------- /utils/split_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2010-2013 Microsoft Corporation 3 | # Johns Hopkins University (Author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | split_per_spk=true 19 | if [ "$1" == "--per-utt" ]; then 20 | split_per_spk=false 21 | shift 22 | fi 23 | 24 | if [ $# != 2 ]; then 25 | echo "Usage: split_data.sh " 26 | echo "This script will not split the data-dir if it detects that the output is newer than the input." 27 | exit 1 28 | fi 29 | 30 | data=$1 31 | numsplit=$2 32 | 33 | if [ $numsplit -le 0 ]; then 34 | echo "Invalid num-split argument $numsplit"; 35 | exit 1; 36 | fi 37 | 38 | n=0; 39 | feats="" 40 | wavs="" 41 | utt2spks="" 42 | texts="" 43 | 44 | nu=`cat $data/utt2spk | wc -l` 45 | nf=`cat $data/feats.scp 2>/dev/null | wc -l` 46 | nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file 47 | if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then 48 | echo "split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); this script " 49 | echo " may produce incorrectly split data." 50 | echo "use utils/fix_data_dir.sh $data to fix this." 51 | fi 52 | if [ -f $data/text ] && [ $nu -ne $nt ]; then 53 | echo "split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); this script " 54 | echo " may produce incorrectly split data." 55 | echo "use utils/fix_data_dir.sh to fix this." 56 | fi 57 | 58 | s1=$data/split$numsplit/1 59 | if [ ! -d $s1 ]; then 60 | need_to_split=true 61 | else 62 | need_to_split=false 63 | for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \ 64 | vad.scp segments reco2file_and_channel utt2lang; do 65 | if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then 66 | need_to_split=true 67 | fi 68 | done 69 | fi 70 | 71 | if ! 
$need_to_split; then 72 | exit 0; 73 | fi 74 | 75 | for n in `seq $numsplit`; do 76 | mkdir -p $data/split$numsplit/$n 77 | feats="$feats $data/split$numsplit/$n/feats.scp" 78 | vads="$vads $data/split$numsplit/$n/vad.scp" 79 | texts="$texts $data/split$numsplit/$n/text" 80 | utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk" 81 | utt2langs="$utt2langs $data/split$numsplit/$n/utt2lang" 82 | done 83 | 84 | if $split_per_spk; then 85 | utt2spk_opt="--utt2spk=$data/utt2spk" 86 | else 87 | utt2spk_opt= 88 | fi 89 | 90 | utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1 91 | 92 | [ -f $data/feats.scp ] && utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats 93 | 94 | [ -f $data/text ] && utils/split_scp.pl $utt2spk_opt $data/text $texts 95 | 96 | [ -f $data/vad.scp ] && utils/split_scp.pl $utt2spk_opt $data/vad.scp $vads 97 | 98 | [ -f $data/utt2lang ] && utils/split_scp.pl $utt2spk_opt $data/utt2lang $utt2langs 99 | 100 | # If lockfile is not installed, just don't lock it. It's not a big deal. 101 | which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock 102 | 103 | for n in `seq $numsplit`; do 104 | dsn=$data/split$numsplit/$n 105 | utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1; 106 | for f in spk2gender spk2warp cmvn.scp; do 107 | [ -f $data/$f ] && \ 108 | utils/filter_scp.pl $dsn/spk2utt $data/$f > $dsn/$f 109 | done 110 | if [ -f $data/segments ]; then 111 | utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments 112 | awk '{print $2;}' $dsn/segments |sort|uniq > $data/tmp.reco # recording-ids. 113 | [ -f $data/reco2file_and_channel ] && 114 | utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel 115 | [ -f $data/wav.scp ] && utils/filter_scp.pl $data/tmp.reco $data/wav.scp > $dsn/wav.scp 116 | rm $data/tmp.reco 117 | else # else wav indexed by utterance -> filter on this. 118 | [ -f $data/wav.scp ] && 119 | utils/filter_scp.pl $dsn/utt2spk $data/wav.scp > $dsn/wav.scp 120 | fi 121 | done 122 | 123 | rm -f $data/.split_lock 124 | 125 | exit 0 126 | -------------------------------------------------------------------------------- /utils/format_lm_sri.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Arnab Ghoshal 4 | # Copyright 2010-2011 Microsoft Corporation 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Begin configuration section. 20 | srilm_opts="-subset -prune-lowprobs -unk -tolower" 21 | # end configuration sections 22 | 23 | 24 | . utils/parse_options.sh 25 | 26 | if [ $# -ne 4 ] && [ $# -ne 3 ]; then 27 | echo "Usage: $0 [options] [] " 28 | echo "The argument is no longer needed but is supported for back compatibility" 29 | echo "E.g.: utils/format_lm_sri.sh data/lang data/local/lm/foo.kn.gz data/local/dict/lexicon.txt data/lang_test" 30 | echo "Converts ARPA-format language models to FSTs. 
Change the LM vocabulary using SRILM." 31 | echo "Note: if you want to just convert ARPA LMs to FSTs, there is a simpler way to do this" 32 | echo "that doesn't require SRILM: see examples in egs/wsj/s5/local/wsj_format_local_lms.sh" 33 | echo "options:" 34 | echo " --help # print this message and exit" 35 | echo " --srilm-opts STRING # options to pass to SRILM tools (default: '$srilm_opts')" 36 | exit 1; 37 | fi 38 | 39 | 40 | if [ $# -eq 4 ] ; then 41 | lang_dir=$1 42 | lm=$2 43 | lexicon=$3 44 | out_dir=$4 45 | else 46 | lang_dir=$1 47 | lm=$2 48 | out_dir=$3 49 | fi 50 | 51 | mkdir -p $out_dir 52 | 53 | for f in $lm $lang_dir/words.txt; do 54 | if [ ! -f $f ]; then 55 | echo "$0: expected input file $f to exist." 56 | exit 1; 57 | fi 58 | done 59 | 60 | [ -f ./path.sh ] && . ./path.sh 61 | 62 | loc=`which change-lm-vocab` 63 | if [ -z $loc ]; then 64 | if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... 65 | sdir=`pwd`/../../../tools/srilm/bin/i686-m64 66 | else 67 | sdir=`pwd`/../../../tools/srilm/bin/i686 68 | fi 69 | if [ -f $sdir/../change-lm-vocab ]; then 70 | echo Using SRILM tools from $sdir 71 | export PATH=$PATH:$sdir:$sdir/.. 72 | else 73 | echo You appear to not have SRILM tools installed, either on your path, 74 | echo or installed in $sdir. See tools/install_srilm.sh for installation 75 | echo instructions. 76 | exit 1 77 | fi 78 | fi 79 | 80 | echo "Converting '$lm' to FST" 81 | tmpdir=data/local/format_sri_tmp 82 | mkdir -p $tmpdir 83 | trap 'rm -rf "$tmpdir"' EXIT 84 | 85 | mkdir -p $out_dir 86 | cp -r $lang_dir/* $out_dir || exit 1; 87 | 88 | lm_base=$(basename $lm '.gz') 89 | gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ 90 | > $out_dir/oovs_${lm_base}.txt || exit 1; 91 | 92 | # Removing all "illegal" combinations of and , which are supposed to 93 | # occur only at being/end of utt. These can cause determinization failures 94 | # of CLG [ends up being epsilon cycles]. 95 | gunzip -c $lm \ 96 | | egrep -v ' | | ' \ 97 | | gzip -c > $tmpdir/lm.gz || exit 1; 98 | 99 | awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; 100 | 101 | # Change the LM vocabulary to be the intersection of the current LM vocabulary 102 | # and the set of words in the pronunciation lexicon. This also renormalizes the 103 | # LM by recomputing the backoff weights, and remove those ngrams whose 104 | # probabilities are lower than the backed-off estimates. 105 | change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ 106 | $srilm_opts || exit 1; 107 | 108 | arpa2fst $tmpdir/out_lm | fstprint \ 109 | | utils/eps2disambig.pl | utils/s2eps.pl \ 110 | | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \ 111 | --keep_isymbols=false --keep_osymbols=false \ 112 | | fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst || exit 1; 113 | 114 | fstisstochastic $out_dir/G.fst 115 | 116 | # The output is like: 117 | # 9.14233e-05 -0.259833 118 | # we do expect the first of these 2 numbers to be close to zero (the second is 119 | # nonzero because the backoff weights make the states sum to >1). 120 | 121 | echo "Succeeded in formatting LM '$lm' -> '$out_dir/G.fst'" 122 | -------------------------------------------------------------------------------- /steps/decode_ctc_lat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Apache 2.0 4 | 5 | # Decode the CTC-trained model by generating lattices. 
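# (Editor's note, a hypothetical invocation.) Unlike decode_ctc.sh, this
# script takes four positional arguments, with the model dir given explicitly:
#   steps/decode_ctc_lat.sh --nj 8 --acwt 0.9 \
#     data/lang_phn_test data/test exp/train_phn_l3_c320 \
#     exp/train_phn_l3_c320/decode_test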
6 | 7 | 8 | ## Begin configuration section 9 | stage=0 10 | nj=16 11 | cmd=run.pl 12 | num_threads=1 13 | 14 | acwt=0.9 15 | min_active=200 16 | max_active=7000 # max-active 17 | beam=15.0 # beam used 18 | lattice_beam=8.0 19 | max_mem=50000000 # approx. limit to memory consumption during minimization in bytes 20 | mdl=final.nnet 21 | 22 | skip_scoring=false # whether to skip WER scoring 23 | scoring_opts="--min-acwt 5 --max-acwt 10 --acwt-factor 0.1" 24 | score_with_conf=false 25 | 26 | # feature configurations; will be read from the training dir if not provided 27 | norm_vars= 28 | add_deltas= 29 | subsample_feats= 30 | splice_feats= 31 | ## End configuration section 32 | 33 | echo "$0 $@" # Print the command line for logging 34 | 35 | [ -f ./path.sh ] && . ./path.sh; 36 | . parse_options.sh || exit 1; 37 | 38 | if [ $# != 4 ]; then 39 | echo "Wrong #arguments ($#, expected 4)" 40 | echo "Usage: steps/decode_ctc_lat.sh [options] <graph-dir> <data-dir> <model-dir> <decode-dir>" 41 | echo " e.g.: steps/decode_ctc_lat.sh data/lang data/test exp/train_l4_c320 exp/train_l4_c320/decode" 42 | echo "main options (for others, see top of script file)" 43 | echo " --stage # starts from which stage" 44 | echo " --nj # number of parallel jobs" 45 | echo " --cmd # command to run in parallel with" 46 | echo " --acwt # default 0.9, the acoustic scale to be used" 47 | exit 1; 48 | fi 49 | 50 | graphdir=$1 51 | data=$2 52 | srcdir=$3 53 | dir=`echo $4 | sed 's:/$::g'` # remove any trailing slash. 54 | 55 | #srcdir=`dirname $dir`; # assume model directory one level up from decoding directory. 56 | #srcdir=/home/sundy/work/egs/hkust/exp/train_phn_l3_c320 57 | 58 | sdata=$data/split$nj; 59 | 60 | thread_string= 61 | [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" 62 | 63 | [ -z "$add_deltas" ] && add_deltas=`cat $srcdir/add_deltas 2>/dev/null` 64 | [ -z "$norm_vars" ] && norm_vars=`cat $srcdir/norm_vars 2>/dev/null` 65 | [ -z "$subsample_feats" ] && subsample_feats=`cat $srcdir/subsample_feats 2>/dev/null` || subsample_feats=false 66 | [ -z "$splice_feats" ] && splice_feats=`cat $srcdir/splice_feats 2>/dev/null` || splice_feats=false 67 | 68 | mkdir -p $dir/log 69 | split_data.sh $data $nj || exit 1; 70 | echo $nj > $dir/num_jobs 71 | 72 | # Check if necessary files exist. 73 | for f in $graphdir/TLG.fst $srcdir/label.counts $data/feats.scp; do 74 | #for f in $graphdir/TLG.fst ./exp/train_phn_l5_c320/label.counts $data/feats.scp; do 75 | [ !
-f $f ] && echo "$0: no such file $f" && exit 1; 76 | done 77 | 78 | ## Set up the features 79 | echo "$0: feature: norm_vars(${norm_vars}) add_deltas(${add_deltas})" 80 | feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" 81 | $add_deltas && feats="$feats add-deltas ark:- ark:- |" 82 | $splice_feats && feats="$feats splice-feats --left-context=1 --right-context=1 ark:- ark:- |" 83 | $subsample_feats && feats="$feats subsample-feats --n=3 --offset=0 ark:- ark:- |" 84 | ## 85 | 86 | #$cmd JOB=1:$nj $dir/log/decode.JOB.log \ 87 | # net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true --use-gpu="no" $srcdir/$mdl "$feats" ark,t:output.txt 88 | #exit 1; 89 | # Decode for each of the acoustic scales 90 | $cmd JOB=1:$nj $dir/log/decode.JOB.log \ 91 | net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true --use-gpu="yes" $srcdir/$mdl "$feats" ark:- \| \ 92 | latgen-faster --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$lattice_beam \ 93 | --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ 94 | $graphdir/TLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; 95 | 96 | # Scoring 97 | if ! $skip_scoring ; then 98 | if [ -f $data/stm ]; then # use sclite scoring. 99 | if $score_with_conf ; then 100 | [ ! -x local/score_sclite_conf.sh ] && echo "Not scoring because local/score_sclite_conf.sh does not exist or not executable." && exit 1; 101 | local/score_sclite_conf.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1; 102 | else 103 | [ ! -x local/score_sclite.sh ] && echo "Not scoring because local/score_sclite.sh does not exist or not executable." && exit 1; 104 | local/score_sclite.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1; 105 | fi 106 | else 107 | [ ! -x local/score.sh ] && echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; 108 | local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1; 109 | fi 110 | fi 111 | 112 | exit 0; 113 | -------------------------------------------------------------------------------- /utils/ctc_compile_dict_token.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the 18 | # phoneme and character-based lexicons. 19 | 20 | dict_type="phn" # the type of lexicon, either "phn" or "char" 21 | space_char="" # the character you have used to represent spaces 22 | 23 | . 
utils/parse_options.sh 24 | 25 | if [ $# -ne 3 ]; then 26 | echo "usage: utils/ctc_compile_dict_token.sh " 27 | echo "e.g.: utils/ctc_compile_dict_token.sh data/local/dict_phn data/local/lang_phn_tmp data/lang_phn" 28 | echo " should contain the following files:" 29 | echo "lexicon.txt lexicon_numbers.txt units.txt" 30 | echo "options: " 31 | echo " --dict-type # default: phn." 32 | echo " --space-char # default: , the character to represent spaces." 33 | exit 1; 34 | fi 35 | 36 | echo ============================================= 37 | echo " Generating Lexicon FST and CTC tokens FST " 38 | echo ============================================= 39 | srcdir=$1 40 | tmpdir=$2 41 | dir=$3 42 | mkdir -p $dir $tmpdir 43 | 44 | [ -f path.sh ] && . ./path.sh 45 | 46 | cp $srcdir/{lexicon_numbers.txt,units.txt} $dir 47 | 48 | # Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. 49 | # But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. 50 | perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; 51 | 52 | # Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. 53 | # Without these symbols, determinization will fail. 54 | ndisambig=`utils/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` 55 | ndisambig=$[$ndisambig+1]; 56 | 57 | ( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list 58 | 59 | # Get the full list of CTC tokens used in FST. These tokens include , the blank , the actual labels (e.g., 60 | # phonemes), and the disambiguation symbols. 61 | cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list 62 | (echo ''; echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt 63 | 64 | # Compile the tokens into FST 65 | utils/ctc_token_fst.py $dir/tokens.txt | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \ 66 | --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; 67 | 68 | # Encode the words with indices. Will be used in lexicon and language model FST compiling. 69 | cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' 70 | BEGIN { 71 | print " 0"; 72 | } 73 | { 74 | printf("%s %d\n", $1, NR); 75 | } 76 | END { 77 | printf("#0 %d\n", NR+1); 78 | }' > $dir/words.txt || exit 1; 79 | 80 | # Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. 
81 | token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` 82 | word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` 83 | 84 | case $dict_type in 85 | phn) 86 | utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ 87 | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ 88 | --keep_isymbols=false --keep_osymbols=false | \ 89 | fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ 90 | fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; 91 | ;; 92 | char) 93 | echo "Building a character-based lexicon, with $space_char as the space" 94 | utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0.5 "$space_char" '#'$ndisambig | \ 95 | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ 96 | --keep_isymbols=false --keep_osymbols=false | \ 97 | fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ 98 | fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; 99 | ;; 100 | *) echo "$0: invalid dictionary type $dict_type" && exit 1; 101 | esac 102 | 103 | echo "Dict and token FSTs compiling succeeded" 104 | echo -e "\n" 105 | -------------------------------------------------------------------------------- /utils/model_topo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import sys 19 | 20 | def parse_arguments(arg_elements): 21 | args = {} 22 | arg_num = len(arg_elements) / 2 23 | for i in xrange(arg_num): 24 | key = arg_elements[2*i].replace("--","").replace("-", "_"); 25 | args[key] = arg_elements[2*i+1] 26 | return args 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | """ 32 | Python script to generate the network topology. Parameters: 33 | ------------------ 34 | --input-feat-dim : int 35 | Dimension of the input features 36 | Required. 37 | --lstm-layer-num : int 38 | Number of LSTM layers 39 | Required. 40 | --lstm-cell-dim : int 41 | Number of memory cells in LSTM. For the bi-directional case, this is the number of cells 42 | in either the forward or the backward sub-layer. 43 | Required. 44 | --target-num : int 45 | Number of labels as the targets 46 | Required. 47 | --param-range : float 48 | Range to randomly draw the initial values of model parameters. For example, setting it to 49 | 0.1 means model parameters are drawn uniformly from [-0.1, 0.1] 50 | Optional. By default it is set to 0.1. 51 | --lstm-type : string 52 | Type of LSTMs. Optional. Either "bi" (bi-directional) or "uni" (uni-directional). By default, 53 | "bi" (bi-directional). 54 | --fgate-bias-init : float 55 | Initial value of the forget-gate bias. 
Not specifying this option means the forget-gate bias 56 | will be initialized randomly, in the same way as the other parameters. 57 | --input-dim : int 58 | Reduce the input feature to a given dimensionality before passing to the LSTM. 59 | Optional. 60 | --projection-dim : int 61 | Project the feature vector down to a given dimensionality between LSTM layers. 62 | Optional. 63 | 64 | """ 65 | 66 | 67 | # parse arguments 68 | arg_elements = [sys.argv[i] for i in range(1, len(sys.argv))] 69 | arguments = parse_arguments(arg_elements) 70 | 71 | # these 4 arguments are mandatory 72 | input_feat_dim=int(arguments['input_feat_dim']) 73 | lstm_layer_num=int(arguments['lstm_layer_num']) 74 | lstm_cell_dim=int(arguments['lstm_cell_dim']) 75 | target_num=int(arguments['target_num']) 76 | 77 | # by default, the range of the parameters is set to 0.1; however, you can change it by specifying "--param-range" 78 | # this means for initialization, model parameters are drawn uniformly from the interval [-0.1, 0.1] 79 | param_range='0.1' 80 | if arguments.has_key('param_range'): 81 | param_range = arguments['param_range'] 82 | 83 | actual_cell_dim = 2*lstm_cell_dim 84 | model_type = '<BiLstmParallel>' # by default 85 | if arguments.has_key('lstm_type') and arguments['lstm_type'] == 'uni': 86 | actual_cell_dim = lstm_cell_dim 87 | model_type = '<LstmParallel>' 88 | 89 | # add the option to set the initial value of the forget-gate bias; the two fixed fields below are the learning-rate coefficient and the gradient-clipping limit (tag names as in common Eesen protos) 90 | lstm_comm = ' <ParamRange> ' + param_range + ' <LearnRateCoef> 1.0 <MaxGrad> 50.0' 91 | if arguments.has_key('fgate_bias_init'): 92 | lstm_comm = lstm_comm + ' <FgateBias> ' + arguments['fgate_bias_init'] 93 | 94 | # add the option to specify projection layers 95 | if arguments.has_key('projection_dim'): 96 | proj_dim = int(arguments['projection_dim']) 97 | else: 98 | proj_dim = 0 99 | 100 | # add the option to reduce the dimensionality of the input features 101 | if arguments.has_key('input_dim'): 102 | input_dim = int(arguments['input_dim']) 103 | else: 104 | input_dim = 0 105 | 106 | 107 | # pre-amble 108 | print '<Nnet>' 109 | 110 | # optional dimensionality reduction layer 111 | if input_dim > 0: 112 | print '<AffineTransform> <InputDim> ' + str(input_feat_dim) + ' <OutputDim> ' + str(input_dim) + ' <ParamRange> ' + param_range 113 | input_feat_dim = input_dim 114 | 115 | # the first layer takes input features 116 | print model_type + ' <InputDim> ' + str(input_feat_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm 117 | # the remaining LSTM layers 118 | for n in range(1, lstm_layer_num): 119 | if proj_dim > 0: 120 | print '<AffineTransform> <InputDim> ' + str(actual_cell_dim) + ' <OutputDim> ' + str(proj_dim) + ' <ParamRange> ' + param_range 121 | print model_type + ' <InputDim> ' + str(proj_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm 122 | else: 123 | print model_type + ' <InputDim> ' + str(actual_cell_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm 124 | 125 | # the final affine-transform and softmax layer 126 | print '<AffineTransform> <InputDim> ' + str(actual_cell_dim) + ' <OutputDim> ' + str(target_num) + ' <ParamRange> ' + param_range 127 | print '<Softmax> <InputDim> ' + str(target_num) + ' <OutputDim> ' + str(target_num) 128 | print '</Nnet>' 129 | -------------------------------------------------------------------------------- /steps/make_fbank_pitch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, 4 | # PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) 5 | # Apache 2.0 6 | # Combine filterbank and pitch features together 7 | # Note: This file is based on make_fbank.sh and make_pitch_kaldi.sh 8 | 9 | # Begin configuration section.
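# (All of the defaults below can be overridden from the command line once parse_options.sh has been
#  sourced; e.g., a hypothetical invocation with made-up paths:
#    steps/make_fbank_pitch.sh --nj 8 --compress false data/train exp/make_fbank/train fbank_pitch )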
10 | nj=4 11 | cmd=run.pl 12 | fbank_config=conf/fbank.conf 13 | pitch_config=conf/pitch.conf 14 | pitch_postprocess_config= 15 | paste_length_tolerance=2 16 | compress=true 17 | # End configuration section. 18 | 19 | echo "$0 $@" # Print the command line for logging 20 | 21 | if [ -f path.sh ]; then . ./path.sh; fi 22 | . parse_options.sh || exit 1; 23 | 24 | if [ $# != 3 ]; then 25 | echo "usage: make_fbank_pitch.sh [options] <data-dir> <log-dir> <fbank-pitch-dir>"; 26 | echo "options: " 27 | echo " --fbank-config <config-file> # config passed to compute-fbank-feats " 28 | echo " --pitch-config <config-file> # config passed to compute-kaldi-pitch-feats " 29 | echo " --pitch-postprocess-config <config-file> # config passed to process-kaldi-pitch-feats " 30 | echo " --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats" 31 | echo " --nj <nj> # number of parallel jobs" 32 | echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." 33 | exit 1; 34 | fi 35 | 36 | data=$1 37 | logdir=$2 38 | fbank_pitch_dir=$3 39 | 40 | 41 | # make $fbank_pitch_dir an absolute pathname. 42 | fbank_pitch_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbank_pitch_dir ${PWD}` 43 | 44 | # use "name" as part of name of the archive. 45 | name=`basename $data` 46 | 47 | mkdir -p $fbank_pitch_dir || exit 1; 48 | mkdir -p $logdir || exit 1; 49 | 50 | if [ -f $data/feats.scp ]; then 51 | mkdir -p $data/.backup 52 | echo "$0: moving $data/feats.scp to $data/.backup" 53 | mv $data/feats.scp $data/.backup 54 | fi 55 | 56 | scp=$data/wav.scp 57 | 58 | required="$scp $fbank_config $pitch_config" 59 | 60 | for f in $required; do 61 | if [ ! -f $f ]; then 62 | echo "make_fbank_pitch.sh: no such file $f" 63 | exit 1; 64 | fi 65 | done 66 | 67 | if [ ! -z "$pitch_postprocess_config" ]; then 68 | postprocess_config_opt="--config=$pitch_postprocess_config"; 69 | else 70 | postprocess_config_opt= 71 | fi 72 | 73 | utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; 74 | 75 | if [ -f $data/spk2warp ]; then 76 | echo "$0 [info]: using VTLN warp factors from $data/spk2warp" 77 | vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk" 78 | elif [ -f $data/utt2warp ]; then 79 | echo "$0 [info]: using VTLN warp factors from $data/utt2warp" 80 | vtln_opts="--vtln-map=ark:$data/utt2warp" 81 | fi 82 | 83 | for n in $(seq $nj); do 84 | # the next command does nothing unless $fbank_pitch_dir/storage/ exists, see 85 | # utils/create_data_link.pl for more info. 86 | utils/create_data_link.pl $fbank_pitch_dir/raw_fbank_pitch_$name.$n.ark 87 | done 88 | 89 | if [ -f $data/segments ]; then 90 | echo "$0 [info]: segments file exists: using that."
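# (Each line of $data/segments has the form "<utt-id> <recording-id> <start-second> <end-second>",
#  so extract-segments below cuts the utterances out of the recordings before the filterbank
#  and pitch features are computed and pasted together.)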
91 | split_segments="" 92 | for n in $(seq $nj); do 93 | split_segments="$split_segments $logdir/segments.$n" 94 | done 95 | 96 | utils/split_scp.pl $data/segments $split_segments || exit 1; 97 | rm $logdir/.error 2>/dev/null 98 | 99 | fbank_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- |" 100 | pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" 101 | 102 | $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \ 103 | paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \ 104 | copy-feats --compress=$compress ark:- \ 105 | ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \ 106 | || exit 1; 107 | 108 | else 109 | echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 110 | split_scps="" 111 | for n in $(seq $nj); do 112 | split_scps="$split_scps $logdir/wav.$n.scp" 113 | done 114 | 115 | utils/split_scp.pl $scp $split_scps || exit 1; 116 | 117 | fbank_feats="ark:compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- |" 118 | pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" 119 | 120 | $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \ 121 | paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \ 122 | copy-feats --compress=$compress ark:- \ 123 | ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \ 124 | || exit 1; 125 | 126 | fi 127 | 128 | 129 | if [ -f $logdir/.error.$name ]; then 130 | echo "Error producing fbank & pitch features for $name:" 131 | tail $logdir/make_fbank_pitch_${name}.1.log 132 | exit 1; 133 | fi 134 | 135 | # concatenate the .scp files together. 136 | for n in $(seq $nj); do 137 | cat $fbank_pitch_dir/raw_fbank_pitch_$name.$n.scp || exit 1; 138 | done > $data/feats.scp 139 | 140 | rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null 141 | 142 | nf=`cat $data/feats.scp | wc -l` 143 | nu=`cat $data/utt2spk | wc -l` 144 | if [ $nf -ne $nu ]; then 145 | echo "It seems not all of the feature files were successfully processed ($nf != $nu);" 146 | echo "consider using utils/fix_data_dir.sh $data" 147 | fi 148 | 149 | echo "Succeeded creating filterbank & pitch features for $name" 150 | -------------------------------------------------------------------------------- /utils/fix_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script makes sure that only the segments present in 4 | # all of "feats.scp", "wav.scp" [if present], segments [if present] 5 | # text, and utt2spk are present in any of them. 6 | # It puts the original contents of data-dir into 7 | # data-dir/.backup 8 | 9 | if [ $# != 1 ]; then 10 | echo "Usage: fix_data_dir.sh data-dir" 11 | exit 1 12 | fi 13 | 14 | data=$1 15 | mkdir -p $data/.backup 16 | 17 | [ ! -d $data ] && echo "$0: no such directory $data" && exit 1; 18 | 19 | [ ! 
-f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; 20 | 21 | tmpdir=$(mktemp -d); 22 | trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM 23 | 24 | export LC_ALL=C 25 | 26 | 27 | function check_sorted { 28 | file=$1 29 | sort -k1,1 -u <$file >$file.tmp 30 | if ! cmp -s $file $file.tmp; then 31 | echo "$0: file $1 is not in sorted order or not unique, sorting it" 32 | mv $file.tmp $file 33 | else 34 | rm $file.tmp 35 | fi 36 | } 37 | 38 | for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp reco2file_and_channel spk2gender utt2lang; do 39 | if [ -f $data/$x ]; then 40 | cp $data/$x $data/.backup/$x 41 | check_sorted $data/$x 42 | fi 43 | done 44 | 45 | 46 | function filter_file { 47 | filter=$1 48 | file_to_filter=$2 49 | cp $file_to_filter ${file_to_filter}.tmp 50 | utils/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter 51 | if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then 52 | length1=`cat ${file_to_filter}.tmp | wc -l` 53 | length2=`cat ${file_to_filter} | wc -l` 54 | if [ $length1 -ne $length2 ]; then 55 | echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." 56 | fi 57 | fi 58 | rm $file_to_filter.tmp 59 | } 60 | 61 | function filter_recordings { 62 | # We call this once before the stage when we filter on utterance-id, and once 63 | # after. 64 | 65 | if [ -f $data/segments ]; then 66 | # We have a segments file -> we need to filter this and the file wav.scp, and 67 | # reco2file_and_utt, if it exists, to make sure they have the same list of 68 | # recording-ids. 69 | 70 | if [ ! -f $data/wav.scp ]; then 71 | echo "$0: $data/segments exists but not $data/wav.scp" 72 | exit 1; 73 | fi 74 | awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings 75 | n1=`cat $tmpdir/recordings | wc -l` 76 | [ ! -s $tmpdir/recordings ] && \ 77 | echo "Empty list of recordings (bad file $data/segments)?" && exit 1; 78 | utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp 79 | mv $tmpdir/recordings.tmp $tmpdir/recordings 80 | 81 | 82 | cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments 83 | filter_file $tmpdir/recordings $data/segments 84 | cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments 85 | rm $data/segments.tmp 86 | 87 | filter_file $tmpdir/recordings $data/wav.scp 88 | [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel 89 | 90 | fi 91 | } 92 | 93 | function filter_speakers { 94 | # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... 95 | utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt 96 | 97 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 98 | for s in cmvn.scp spk2gender; do 99 | f=$data/$s 100 | if [ -f $f ]; then 101 | filter_file $f $tmpdir/speakers 102 | fi 103 | done 104 | 105 | filter_file $tmpdir/speakers $data/spk2utt 106 | utils/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk 107 | 108 | for s in cmvn.scp spk2gender; do 109 | f=$data/$s 110 | if [ -f $f ]; then 111 | filter_file $tmpdir/speakers $f 112 | fi 113 | done 114 | } 115 | 116 | function filter_utts { 117 | cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts 118 | 119 | # Do a check. 120 | 121 | ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ 122 | echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; 123 | 124 | ! 
cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ 125 | echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ 126 | echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; 127 | 128 | ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ 129 | echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; 130 | 131 | 132 | maybe_wav= 133 | [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. 134 | for x in feats.scp text segments utt2lang $maybe_wav; do 135 | if [ -f $data/$x ]; then 136 | utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp 137 | mv $tmpdir/utts.tmp $tmpdir/utts 138 | fi 139 | done 140 | [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ 141 | rm $tmpdir/utts && exit 1; 142 | 143 | 144 | if [ -f $data/utt2spk ]; then 145 | new_nutts=$(cat $tmpdir/utts | wc -l) 146 | old_nutts=$(cat $data/utt2spk | wc -l) 147 | if [ $new_nutts -ne $old_nutts ]; then 148 | echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" 149 | else 150 | echo "fix_data_dir.sh: kept all $old_nutts utterances." 151 | fi 152 | fi 153 | 154 | for x in utt2spk feats.scp vad.scp text segments utt2lang $maybe_wav; do 155 | if [ -f $data/$x ]; then 156 | cp $data/$x $data/.backup/$x 157 | if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then 158 | utils/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x 159 | fi 160 | fi 161 | done 162 | 163 | } 164 | 165 | filter_recordings 166 | filter_speakers 167 | filter_utts 168 | filter_speakers 169 | filter_recordings 170 | 171 | 172 | 173 | utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt 174 | 175 | echo "fix_data_dir.sh: old files are kept in $data/.backup" 176 | -------------------------------------------------------------------------------- /utils/run_rocks.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # In general, doing 4 | # run.pl some.log a b c is like running the command a b c in 5 | # the bash shell, and putting the standard error and output into some.log. 6 | # To run parallel jobs (backgrounded on the host machine), you can do (e.g.) 7 | # run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB 8 | # and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier]. 9 | # If any of the jobs fails, this script will fail. 10 | 11 | # A typical example is: 12 | # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz 13 | # and run.pl will run something like: 14 | # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log 15 | # 16 | # Basically it takes the command-line arguments, quotes them 17 | # as necessary to preserve spaces, and evaluates them with bash. 18 | # In addition it puts the command line at the top of the log, and 19 | # the start and end times of the command at the beginning and end. 20 | # The reason why this is useful is so that we can create a different 21 | # version of this program that uses a queueing system instead. 22 | 23 | @ARGV < 2 && die "usage: run.pl log-file command-line arguments..."; 24 | 25 | $jobstart=1; 26 | $jobend=1; 27 | $qsub_opts=""; # These will be ignored. 28 | 29 | # First parse an option like JOB=1:4, and any 30 | # options that would normally be given to 31 | # queue.pl, which we will just discard. 
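# For example, an invocation such as
#   run.pl -pe smp 5 JOB=1:2 exp/foo/log/run.JOB.log echo JOB
# (log paths made up) would discard "-pe smp 5" with a warning, then write each job's
# command to ./cmds/cmd.N and wait, polling every 13 s, for ./dones/done.N to appear
# (created, presumably, by an external runner on the cluster).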
32 | 33 | if (@ARGV > 0) { 34 | while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options 35 | # that would normally go to qsub, but which will be ignored here. 36 | $switch = shift @ARGV; 37 | if ($switch eq "-V") { 38 | $qsub_opts .= "-V "; 39 | } else { 40 | $option = shift @ARGV; 41 | if ($switch eq "-sync" && $option =~ m/^[yY]/) { 42 | $qsub_opts .= "-sync "; # Note: in the 43 | # corresponding code in queue.pl it says instead, just "$sync = 1;". 44 | } 45 | $qsub_opts .= "$switch $option "; 46 | if ($switch eq "-pe") { # e.g. -pe smp 5 47 | $option2 = shift @ARGV; 48 | $qsub_opts .= "$option2 "; 49 | } 50 | } 51 | } 52 | if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 53 | $jobname = $1; 54 | $jobstart = $2; 55 | $jobend = $3; 56 | shift; 57 | if ($jobstart > $jobend) { 58 | die "run.pl: invalid job range $ARGV[0]"; 59 | } 60 | } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. 61 | $jobname = $1; 62 | $jobstart = $2; 63 | $jobend = $2; 64 | shift; 65 | } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { 66 | print STDERR "Warning: suspicious first argument to run.pl: $ARGV[0]\n"; 67 | } 68 | } 69 | 70 | if ($qsub_opts ne "") { 71 | print STDERR "Warning: run.pl ignoring options \"$qsub_opts\"\n"; 72 | } 73 | 74 | $logfile = shift @ARGV; 75 | 76 | if (defined $jobname && $logfile !~ m/$jobname/ && 77 | $jobend > $jobstart) { 78 | print STDERR "run.pl: you are trying to run a parallel job but " 79 | . "you are putting the output into just one log file ($logfile)\n"; 80 | exit(1); 81 | } 82 | 83 | $cmd = ""; 84 | 85 | foreach $x (@ARGV) { 86 | if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } 87 | elsif ($x =~ m:\":) { $cmd .= "'$x' "; } 88 | else { $cmd .= "\"$x\" "; } 89 | } 90 | 91 | 92 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 93 | $childpid = fork(); 94 | if (!defined $childpid) { die "Error forking in run.pl (writing to $logfile)"; } 95 | if ($childpid == 0) { # We're in the child... this branch 96 | # executes the job and returns (possibly with an error status). 97 | if (defined $jobname) { 98 | $cmd =~ s/$jobname/$jobid/g; 99 | $logfile =~ s/$jobname/$jobid/g; 100 | } 101 | system("mkdir -p `dirname $logfile` 2>/dev/null"); 102 | open(F, ">$logfile") || die "Error opening log file $logfile"; 103 | print F "# " . $cmd . "\n"; 104 | print F "# Started at " . `date`; 105 | $starttime = `date +'%s'`; 106 | print F "#\n"; 107 | close(F); 108 | 109 | $cmdid = $jobid - $jobstart + 1; 110 | $cmdfile = "./cmds/cmd." . $cmdid; 111 | $donefile = "./dones/done." . $cmdid; 112 | unlink $donefile; 113 | open(F, ">$cmdfile") || die "Error opening cmd file $cmdfile"; 114 | # print F "( " . $cmd . ") 2>>$logfile >> $logfile"; 115 | print F "" . $cmd . " 2>> $logfile\n"; 116 | close(F); 117 | # Pipe into bash.. make sure we're not using any other shell. 118 | # open(B, "|bash") || die "Error opening shell command"; 119 | # print B "( " . $cmd . ") 2>>$logfile >> $logfile"; 120 | # close(B); # If there was an error, exit status is in $? 121 | while (1) { 122 | if (-e $donefile) { 123 | last; 124 | } else { 125 | sleep(13); 126 | } 127 | } 128 | $ret = $?; 129 | 130 | $endtime = `date +'%s'`; 131 | open(F, ">>$logfile") || die "Error opening log file $logfile (again)"; 132 | $enddate = `date`; 133 | chop $enddate; 134 | print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n"; 135 | print F "# Ended (code $ret) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n"; 136 | close(F); 137 | exit($ret == 0 ? 
0 : 1); 138 | } 139 | } 140 | 141 | $ret = 0; 142 | $numfail = 0; 143 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 144 | $r = wait(); 145 | if ($r == -1) { die "Error waiting for child process"; } # should never happen. 146 | if ($? != 0) { $numfail++; $ret = 1; } # The child process failed. 147 | } 148 | 149 | if ($ret != 0) { 150 | $njobs = $jobend - $jobstart + 1; 151 | if ($njobs == 1) { 152 | if (defined $jobname) { 153 | $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with 154 | # that job. 155 | } 156 | print STDERR "run.pl: job failed, log is in $logfile\n"; 157 | if ($logfile =~ m/JOB/) { 158 | print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script."; 159 | } 160 | } 161 | else { 162 | $logfile =~ s/$jobname/*/g; 163 | print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n"; 164 | } 165 | } 166 | 167 | 168 | exit ($ret); 169 | -------------------------------------------------------------------------------- /utils/make_lexicon_fst.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). 20 | 21 | $pron_probs = 0; 22 | 23 | if ($ARGV[0] eq "--pron-probs") { 24 | $pron_probs = 1; 25 | shift @ARGV; 26 | } 27 | 28 | if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { 29 | print STDERR 30 | "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt 31 | Creates a lexicon FST that transduces phones to words, and may allow optional silence. 32 | Note: ordinarily, each line of lexicon.txt is: word phone1 phone2 ... phoneN; if the --pron-probs option is 33 | used, each line is: word pronunciation-probability phone1 phone2 ... phoneN. The probability 'prob' will 34 | typically be between zero and one, and note that it's generally helpful to normalize so the largest one 35 | for each word is 1.0, but this is your responsibility. The silence disambiguation symbol, e.g. something 36 | like #5, is used only when creating a lexicon with disambiguation symbols, e.g. 
L_disambig.fst, and was 37 | introduced to fix a particular case of non-determinism of decoding graphs.\n"; 38 | exit(1); 39 | } 40 | 41 | $lexfn = shift @ARGV; 42 | if (@ARGV == 0) { 43 | $silprob = 0.0; 44 | } elsif (@ARGV == 2) { 45 | ($silprob,$silphone) = @ARGV; 46 | } else { 47 | ($silprob,$silphone,$sildisambig) = @ARGV; 48 | } 49 | if ($silprob != 0.0) { 50 | $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; 51 | $silcost = -log($silprob); 52 | $nosilcost = -log(1.0 - $silprob); 53 | } 54 | 55 | 56 | open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; 57 | 58 | 59 | sub is_sil { 60 | # Return true (1) if provided with a phone-sequence 61 | # that means silence. 62 | # @_ is the parameters of the function 63 | # This function returns true if @_ equals ( $silphone ) 64 | # or something of the form ( "#0", $silphone, "#1" ) 65 | # where the "#0" and "#1" are disambiguation symbols. 66 | return ( @_ == 1 && $_[0] eq $silphone || 67 | (@_ == 3 && $_[1] eq $silphone && 68 | $_[0] =~ m/^\#\d+$/ && 69 | $_[2] =~ m/^\#\d+$/)); 70 | } 71 | 72 | if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. 73 | $loopstate = 0; 74 | $nextstate = 1; # next unallocated state. 75 | while (<L>) { 76 | @A = split(" ", $_); 77 | @A == 0 && die "Empty lexicon line."; 78 | $w = shift @A; 79 | if (! $pron_probs) { 80 | $pron_cost = 0.0; 81 | } else { 82 | $pron_prob = shift @A; 83 | if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { 84 | die "Bad pronunciation probability in line $_"; 85 | } 86 | $pron_cost = -log($pron_prob); 87 | } 88 | if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } 89 | 90 | $s = $loopstate; 91 | $word_or_eps = $w; 92 | while (@A > 0) { 93 | $p = shift @A; 94 | if (@A > 0) { 95 | $ns = $nextstate++; 96 | } else { 97 | $ns = $loopstate; 98 | } 99 | print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; 100 | $word_or_eps = "<eps>"; 101 | $pron_cost_string = ""; # so we only print it on the first arc of the word. 102 | $s = $ns; 103 | } 104 | } 105 | print "$loopstate\t0\n"; # final-cost. 106 | } else { # have silence probs. 107 | $startstate = 0; 108 | $loopstate = 1; 109 | $silstate = 2; # state from where we go to loopstate after emitting silence. 110 | print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence. 111 | if (!defined $sildisambig) { 112 | print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence. 113 | print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost. 114 | $nextstate = 3; 115 | } else { 116 | $disambigstate = 3; 117 | $nextstate = 4; 118 | print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence. 119 | print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost. 120 | print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol. 121 | } 122 | while (<L>) { 123 | @A = split(" ", $_); 124 | $w = shift @A; 125 | if (! $pron_probs) { 126 | $pron_cost = 0.0; 127 | } else { 128 | $pron_prob = shift @A; 129 | if (!
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { 130 | die "Bad pronunciation probability in line $_"; 131 | } 132 | $pron_cost = -log($pron_prob); 133 | } 134 | if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } 135 | $s = $loopstate; 136 | $word_or_eps = $w; 137 | while (@A > 0) { 138 | $p = shift @A; 139 | if (@A > 0) { 140 | $ns = $nextstate++; 141 | print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; 142 | $word_or_eps = "<eps>"; 143 | $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. 144 | $s = $ns; 145 | } else { 146 | if (!is_sil($p)) { 147 | # This is non-deterministic but relatively compact, 148 | # and avoids epsilons. 149 | $local_nosilcost = $nosilcost + $pron_cost; 150 | $local_silcost = $silcost + $pron_cost; 151 | print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; 152 | print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; 153 | } else { 154 | # no point putting opt-sil after silence word. 155 | print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; 156 | } 157 | } 158 | } 159 | } 160 | print "$loopstate\t0\n"; # final-cost. 161 | } 162 | -------------------------------------------------------------------------------- /utils/subset_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2012-2013 Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0 5 | 6 | 7 | # This script operates on a data directory, such as in data/train/. 8 | # See http://kaldi.sourceforge.net/data_prep.html#data_prep_data 9 | # for what these directories contain. 10 | 11 | # This script creates a subset of that data, consisting of some specified 12 | # number of utterances. (The selected utterances are distributed evenly 13 | # throughout the file, by the program ./subset_scp.pl). 14 | 15 | # There are seven options, none compatible with any other. 16 | 17 | # If you give the --per-spk option, it will attempt to select the supplied 18 | # number of utterances for each speaker (typically you would supply a much 19 | # smaller number in this case). 20 | 21 | # If you give the --speakers option, it selects a subset of n randomly 22 | # selected speakers. 23 | 24 | # If you give the --shortest option, it will give you the n shortest utterances. 25 | 26 | # If you give the --first option, it will just give you the n first utterances. 27 | 28 | # If you give the --last option, it will just give you the n last utterances. 29 | 30 | # If you give the --spk-list option, it reads the speakers to keep from <spk-list-file>; the --utt-list option 31 | # similarly reads the utterances to keep. (note, in these cases there is no <num-utt> positional parameter; see usage message.)
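# (Two illustrative invocations, with made-up directory names:
#    utils/subset_data_dir.sh --per-spk data/train 10 data/train_10utt_per_spk
#    utils/subset_data_dir.sh data/train 1000 data/train_1k
#  The second form picks 1000 utterances spread evenly through the list, via subset_scp.pl.)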
32 | 33 | 34 | shortest=false 35 | perspk=false 36 | first_opt="" 37 | speakers=false 38 | spk_list_specified=false 39 | utt_list_specified=false 40 | 41 | if [ "$1" == "--per-spk" ]; then 42 | perspk=true; 43 | shift; 44 | elif [ "$1" == "--shortest" ]; then 45 | shortest=true; 46 | shift; 47 | elif [ "$1" == "--first" ]; then 48 | first_opt="--first"; 49 | shift; 50 | elif [ "$1" == "--speakers" ]; then 51 | speakers=true 52 | shift; 53 | elif [ "$1" == "--last" ]; then 54 | first_opt="--last"; 55 | shift; 56 | elif [ "$1" == "--spk-list" ]; then 57 | spk_list_specified=true 58 | shift; 59 | elif [ "$1" == "--utt-list" ]; then 60 | utt_list_specified=true 61 | shift; 62 | fi 63 | 64 | 65 | 66 | 67 | if [ $# != 3 ]; then 68 | echo "Usage: " 69 | echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>" 70 | echo " subset_data_dir.sh [--spk-list <spk-list-file>] <srcdir> <destdir>" 71 | echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>" 72 | echo "By default, randomly selects <num-utt> utterances from the data directory." 73 | echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances" 74 | echo "With --per-spk, selects <num-utt> utterances per speaker, if available." 75 | echo "With --first, selects the first <num-utt> utterances" 76 | echo "With --last, selects the last <num-utt> utterances" 77 | echo "With --shortest, selects the <num-utt> shortest utterances." 78 | echo "With --spk-list or --utt-list, reads the speakers or utterances to keep from the given file" 79 | exit 1; 80 | fi 81 | 82 | if $spk_list_specified; then 83 | spk_list=$1 84 | srcdir=$2 85 | destdir=$3 86 | elif $utt_list_specified; then 87 | utt_list=$1 88 | srcdir=$2 89 | destdir=$3 90 | else 91 | srcdir=$1 92 | numutt=$2 93 | destdir=$3 94 | fi 95 | 96 | 97 | export LC_ALL=C 98 | 99 | if [ ! -f $srcdir/utt2spk ]; then 100 | echo "subset_data_dir.sh: no such file $srcdir/utt2spk" 101 | exit 1; 102 | fi 103 | 104 | function do_filtering { 105 | # assumes the utt2spk and spk2utt files already exist. 106 | [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp 107 | [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp 108 | [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang 109 | [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp 110 | [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp 111 | [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp 112 | [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text 113 | [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender 114 | [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp 115 | if [ -f $srcdir/segments ]; then 116 | utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments 117 | awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings. 118 | # The next line overrides the wav.scp created above, which would be incorrect here: when a segments file exists, wav.scp is indexed by recording-id, not utterance-id.
119 | [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp 120 | [ -f $srcdir/reco2file_and_channel ] && \ 121 | utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel 122 | 123 | # Filter the STM file for proper sclite scoring (this will also remove the comments lines) 124 | [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm 125 | 126 | rm $destdir/reco 127 | fi 128 | srcutts=`cat $srcdir/utt2spk | wc -l` 129 | destutts=`cat $destdir/utt2spk | wc -l` 130 | echo "$0: reducing #utt from $srcutts to $destutts" 131 | } 132 | 133 | 134 | if $spk_list_specified; then 135 | mkdir -p $destdir 136 | utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; 137 | utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; 138 | do_filtering; # bash function. 139 | exit 0; 140 | elif $utt_list_specified; then 141 | mkdir -p $destdir 142 | utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; 143 | utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; 144 | do_filtering; # bash function. 145 | exit 0; 146 | elif $speakers; then 147 | mkdir -p $destdir 148 | utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | \ 149 | sort > $destdir/spk2utt 150 | utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk 151 | do_filtering; # bash function. 152 | exit 0; 153 | elif $perspk; then 154 | mkdir -p $destdir 155 | awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; } 156 | for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); } 157 | printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt 158 | utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk 159 | do_filtering; # bash function. 160 | exit 0; 161 | else 162 | if [ $numutt -gt `cat $srcdir/feats.scp | wc -l` ]; then 163 | echo "subset_data_dir.sh: cannot subset to more utterances than you originally had." 164 | exit 1; 165 | fi 166 | mkdir -p $destdir || exit 1; 167 | 168 | ## scripting note: $shortest evaluates to true or false 169 | ## so this becomes the command true or false. 170 | if $shortest; then 171 | # select the n shortest utterances. 172 | . ./path.sh 173 | [ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exist." && exit 1; 174 | feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; 175 | sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist 176 | utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk 177 | rm $destdir/tmp.uttlist $destdir/tmp.len 178 | else 179 | utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; 180 | fi 181 | utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt 182 | do_filtering; 183 | exit 0; 184 | fi 185 | -------------------------------------------------------------------------------- /steps/train_ctc_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | # Apache 2.0 5 | 6 | # This script trains acoustic models based on CTC and using SGD. 
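# (Expected inputs, as checked below: the two data dirs must contain feats.scp and cmvn.scp, and the
#  experiment dir must already hold labels.tr.gz, labels.cv.gz and nnet.proto. A sketch of a typical
#  call, with made-up paths:
#    steps/train_ctc_parallel.sh --add-deltas true --num-sequence 10 --learn-rate 4e-5 \
#      data/train_tr95 data/train_cv05 exp/train_phn_l4_c320 )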
7 | 8 | ## Begin configuration section 9 | train_tool=train-ctc-parallel # the command for training; by default, we use the 10 | # parallel version which processes multiple utterances at the same time 11 | 12 | # configs for multiple sequences 13 | num_sequence=5 # during training, how many utterances are processed in parallel 14 | valid_num_sequence=10 # number of parallel sequences in validation 15 | frame_num_limit=12500 # the number of frames to be processed at a time in training; this config acts 16 | # to prevent running out of GPU memory when num_sequence very long sequences are processed; a batch 17 | # is closed as soon as either num_sequence or frame_num_limit is reached. 18 | 19 | # learning rate 20 | learn_rate=0.0001 # learning rate 21 | momentum=0.9 # momentum 22 | 23 | # learning rate schedule 24 | max_iters=25 # max number of iterations 25 | min_iters= # min number of iterations 26 | start_epoch_num=1 # start from which epoch, used for resuming training from a break point 27 | 28 | start_halving_inc=0.5 # start halving learning rates when the accuracy improvement falls below this amount 29 | end_halving_inc=0.1 # terminate training when the accuracy improvement falls below this amount 30 | halving_factor=0.5 # learning rate decay factor 31 | halving_after_epoch=1 # halving becomes enabled after this many epochs 32 | 33 | # logging 34 | report_step=1 # during training, the step (number of utterances) of reporting objective and accuracy 35 | verbose=1 36 | 37 | # feature configs 38 | sort_by_len=true # whether to sort the utterances by their lengths 39 | min_len=0 # minimum length of utterances to consider 40 | 41 | norm_vars=true # whether to apply variance normalization when we do cmn 42 | add_deltas=true # whether to add deltas 43 | copy_feats=true # whether to copy features into a local dir (on the GPU machine) 44 | feats_tmpdir= # the tmp dir to save the copied features, when copy_feats=true 45 | 46 | # status of learning rate schedule; useful when training is resumed from a break point 47 | cvacc=0 48 | halving=0 49 | 50 | ## End configuration section 51 | 52 | echo "$0 $@" # Print the command line for logging 53 | 54 | [ -f path.sh ] && . ./path.sh; 55 | 56 | . utils/parse_options.sh || exit 1; 57 | 58 | if [ $# != 3 ]; then 59 | echo "Usage: $0 <train-data-dir> <cv-data-dir> <exp-dir>" 60 | echo " e.g.: $0 data/train_tr data/train_cv exp/train_phn" 61 | exit 1; 62 | fi 63 | 64 | data_tr=$1 65 | data_cv=$2 66 | dir=$3 67 | 68 | mkdir -p $dir/log $dir/nnet 69 | 70 | for f in $data_tr/feats.scp $data_cv/feats.scp $dir/labels.tr.gz $dir/labels.cv.gz $dir/nnet.proto; do 71 | [ !
-f $f ] && echo "train_ctc_parallel.sh: no such file $f" && exit 1; 72 | done 73 | 74 | ## Read the training status for resuming 75 | [ -f $dir/.epoch ] && start_epoch_num=`cat $dir/.epoch 2>/dev/null` 76 | [ -f $dir/.cvacc ] && cvacc=`cat $dir/.cvacc 2>/dev/null` 77 | [ -f $dir/.halving ] && halving=`cat $dir/.halving 2>/dev/null` 78 | [ -f $dir/.lrate ] && learn_rate=`cat $dir/.lrate 2>/dev/null` 79 | 80 | ## Set up features 81 | echo $norm_vars > $dir/norm_vars # output feature configs which will be used in decoding 82 | echo $add_deltas > $dir/add_deltas 83 | 84 | if $sort_by_len; then 85 | feat-to-len scp:$data_tr/feats.scp ark,t:- | awk '{print $2}' > $dir/len.tmp || exit 1; 86 | paste -d " " $data_tr/feats.scp $dir/len.tmp | sort -k3 -n - | awk -v m=$min_len '{ if ($3 >= m) {print $1 " " $2} }' > $dir/train.scp || exit 1; 87 | feat-to-len scp:$data_cv/feats.scp ark,t:- | awk '{print $2}' > $dir/len.tmp || exit 1; 88 | paste -d " " $data_cv/feats.scp $dir/len.tmp | sort -k3 -n - | awk '{print $1 " " $2}' > $dir/cv.scp || exit 1; 89 | rm -f $dir/len.tmp 90 | else 91 | cat $data_tr/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp 92 | cat $data_cv/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/cv.scp 93 | fi 94 | 95 | feats_tr="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_tr/utt2spk scp:$data_tr/cmvn.scp scp:$dir/train.scp ark:- |" 96 | feats_cv="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp scp:$dir/cv.scp ark:- |" 97 | 98 | # Save the features to a local dir on the GPU machine. On Linux, this usually points to /tmp 99 | if $copy_feats; then 100 | tmpdir=$(mktemp -d $feats_tmpdir); 101 | copy-feats "$feats_tr" ark,scp:$tmpdir/train.ark,$dir/train_local.scp || exit 1; 102 | copy-feats "$feats_cv" ark,scp:$tmpdir/cv.ark,$dir/cv_local.scp || exit 1; 103 | feats_tr="ark,s,cs:copy-feats scp:$dir/train_local.scp ark:- |" 104 | feats_cv="ark,s,cs:copy-feats scp:$dir/cv_local.scp ark:- |" 105 | trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT 106 | fi 107 | 108 | if $add_deltas; then 109 | feats_tr="$feats_tr add-deltas ark:- ark:- |" 110 | feats_cv="$feats_cv add-deltas ark:- ark:- |" 111 | fi 112 | ## End of feature setup 113 | 114 | ## Set up labels 115 | labels_tr="ark:gunzip -c $dir/labels.tr.gz|" 116 | labels_cv="ark:gunzip -c $dir/labels.cv.gz|" 117 | # Compute the occurrence counts of labels in the label sequences. These counts will be used to derive prior probabilities of 118 | # the labels. 119 | gunzip -c $dir/labels.tr.gz | awk '{line=$0; gsub(" "," 0 ",line); print line " 0";}' | \ 120 | analyze-counts --verbose=1 --binary=false ark:- $dir/label.counts >& $dir/log/compute_label_counts.log || exit 1 121 | ## 122 | 123 | # Initialize model parameters 124 | if [ ! -f $dir/nnet/nnet.iter0 ]; then 125 | echo "Initializing model as $dir/nnet/nnet.iter0" 126 | net-initialize --binary=true $dir/nnet.proto $dir/nnet/nnet.iter0 >& $dir/log/initialize_model.log || exit 1; 127 | fi 128 | 129 | cur_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 130 | echo "TRAINING STARTS [$cur_time]" 131 | echo "[NOTE] TOKEN_ACCURACY refers to token accuracy, i.e., (1.0 - token_error_rate)." 132 | for iter in $(seq $start_epoch_num $max_iters); do 133 | cvacc_prev=$cvacc 134 | echo -n "EPOCH $iter RUNNING ... 
" 135 | 136 | # train 137 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 138 | --learn-rate=$learn_rate --momentum=$momentum \ 139 | --verbose=$verbose \ 140 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} \ 141 | >& $dir/log/tr.iter$iter.log || exit 1; 142 | 143 | end_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 144 | echo -n "ENDS [$end_time]: " 145 | 146 | tracc=$(cat $dir/log/tr.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 147 | echo -n "lrate $(printf "%.6g" $learn_rate), TRAIN ACCURACY $(printf "%.4f" $tracc)%, " 148 | 149 | # validation 150 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 151 | --cross-validate=true \ 152 | --learn-rate=$learn_rate \ 153 | --momentum=$momentum \ 154 | --verbose=$verbose \ 155 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} \ 156 | >& $dir/log/cv.iter$iter.log || exit 1; 157 | 158 | cvacc=$(cat $dir/log/cv.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 159 | echo "VALID ACCURACY $(printf "%.4f" $cvacc)%" 160 | 161 | # stopping criterion 162 | rel_impr=$(bc <<< "($cvacc-$cvacc_prev)") 163 | if [ 1 == $halving -a 1 == $(bc <<< "$rel_impr < $end_halving_inc") ]; then 164 | if [[ "$min_iters" != "" ]]; then 165 | if [ $min_iters -gt $iter ]; then 166 | echo we were supposed to finish, but we continue as min_iters : $min_iters 167 | continue 168 | fi 169 | fi 170 | echo finished, too small rel. improvement $rel_impr 171 | break 172 | fi 173 | 174 | # start annealing when improvement is low 175 | if [ 1 == $(bc <<< "$rel_impr < $start_halving_inc") ]; then 176 | if [ $iter -gt $halving_after_epoch ]; then 177 | halving=1 178 | fi 179 | fi 180 | 181 | # do annealing 182 | if [ 1 == $halving ]; then 183 | learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") 184 | fi 185 | # save the status 186 | echo $[$iter+1] > $dir/.epoch # +1 because we save the epoch to start from 187 | echo $cvacc > $dir/.cvacc 188 | echo $halving > $dir/.halving 189 | echo $learn_rate > $dir/.lrate 190 | done 191 | 192 | # Convert the model markers from the parallel form (e.g. "<BiLstmParallel>") to the corresponding non-parallel form 193 | format-to-nonparallel $dir/nnet/nnet.iter${iter} $dir/final.nnet >& $dir/log/model_to_nonparal.log || exit 1; 194 | 195 | echo "Training succeeded. The final model is $dir/final.nnet" 196 | -------------------------------------------------------------------------------- /steps/train_ctc_parallel_h.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | { 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | # 2015 Hang Su 5 | # Apache 2.0 6 | 7 | # This script trains acoustic models based on CTC and using SGD.
8 | 9 | ## Begin configuration section 10 | train_tool=train-ctc-parallel # the command for training; by default, we use the 11 | # parallel version which processes multiple utterances at the same time 12 | 13 | # configs for multiple sequences 14 | num_sequence=5 # during training, how many utterances are processed in parallel 15 | valid_num_sequence=10 # number of parallel sequences in validation 16 | frame_num_limit=1000000 # the number of frames to be processed at a time in training; this config acts 17 | # to prevent running out of GPU memory when num_sequence very long sequences are processed; a batch 18 | # is closed as soon as either num_sequence or frame_num_limit is reached. 19 | 20 | # learning rate 21 | learn_rate=0.0001 # learning rate 22 | momentum=0.9 # momentum 23 | 24 | # learning rate schedule 25 | max_iters=25 # max number of iterations 26 | min_iters= # min number of iterations 27 | start_epoch_num=1 # start from which epoch, used for resuming training from a break point 28 | 29 | start_halving_inc=0.5 # start halving learning rates when the accuracy improvement falls below this amount 30 | end_halving_inc=0.1 # terminate training when the accuracy improvement falls below this amount 31 | halving_factor=0.5 # learning rate decay factor 32 | halving_after_epoch=1 # halving becomes enabled after this many epochs 33 | 34 | # logging 35 | report_step=100 # during training, the step (number of utterances) of reporting objective and accuracy 36 | verbose=1 37 | 38 | # feature configs 39 | sort_by_len=true # whether to sort the utterances by their lengths 40 | 41 | norm_vars=true # whether to apply variance normalization when we do cmn 42 | add_deltas=true # whether to add deltas 43 | 44 | # status of learning rate schedule; useful when training is resumed from a break point 45 | cvacc=-1 46 | halving=0 47 | 48 | # Multi-GPU training 49 | nj=1 50 | utts_per_avg=700 51 | 52 | clean_up=true 53 | 54 | ## End configuration section 55 | 56 | echo "$0 $@" # Print the command line for logging 57 | 58 | [ -f ./path.sh ] && . ./path.sh; 59 | [ -f ./cmd.sh ] && . ./cmd.sh; 60 | 61 | . utils/parse_options.sh || exit 1; 62 | 63 | if [ $# != 3 ]; then 64 | echo "Usage: $0 <train-data-dir> <cv-data-dir> <exp-dir>" 65 | echo " e.g.: $0 data/train_tr data/train_cv exp/train_phn" 66 | exit 1; 67 | fi 68 | 69 | data_tr=$1 70 | data_cv=$2 71 | dir=$3 72 | 73 | mkdir -p $dir/log $dir/nnet 74 | 75 | for f in $data_tr/feats.scp $data_cv/feats.scp $dir/labels.tr.gz $dir/labels.cv.gz $dir/nnet.proto; do 76 | [ !
-f $f ] && echo "train_ctc_parallel.sh: no such file $f" && exit 1; 77 | done 78 | 79 | ## Read the training status for resuming 80 | [ -f $dir/.epoch ] && start_epoch_num=`cat $dir/.epoch 2>/dev/null` 81 | [ -f $dir/.cvacc ] && cvacc=`cat $dir/.cvacc 2>/dev/null` 82 | [ -f $dir/.halving ] && halving=`cat $dir/.halving 2>/dev/null` 83 | [ -f $dir/.lrate ] && learn_rate=`cat $dir/.lrate 2>/dev/null` 84 | 85 | ## Setup up features 86 | echo $norm_vars > $dir/norm_vars # output feature configs which will be used in decoding 87 | echo $add_deltas > $dir/add_deltas 88 | 89 | echo "Preparing train and cv features" 90 | tmpdir=$dir/feats; 91 | [ -d $tmpdir ] || mkdir -p $tmpdir 92 | [ $clean_up == true ] && trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT 93 | utils/prep_scps.sh --nj $nj --cmd "$train_cmd" ${seed:+ --seed=$seed} --clean-up $clean_up \ 94 | $data_tr/feats.scp $data_cv/feats.scp $num_sequence $frame_num_limit $tmpdir $dir || exit 1; 95 | 96 | feats_tr="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_tr/utt2spk scp:$data_tr/cmvn.scp scp:$dir/feats_tr.JOB.scp ark:- |" 97 | feats_cv="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp scp:$dir/feats_cv.JOB.scp ark:- |" 98 | 99 | if [ $nj -eq 1 ]; then 100 | feats_tr=$(echo $feats_tr | sed 's#JOB#1#') 101 | feats_cv=$(echo $feats_cv | sed 's#JOB#1#') 102 | fi 103 | 104 | if $add_deltas; then 105 | feats_tr="$feats_tr add-deltas ark:- ark:- |" 106 | feats_cv="$feats_cv add-deltas ark:- ark:- |" 107 | fi 108 | ## End of feature setup 109 | 110 | ## Set up labels 111 | labels_tr="ark:gunzip -c $dir/labels.tr.gz|" 112 | labels_cv="ark:gunzip -c $dir/labels.cv.gz|" 113 | # Compute the occurrence counts of labels in the label sequences. These counts will be used to derive prior probabilities of 114 | # the labels. 115 | gunzip -c $dir/labels.tr.gz | awk '{line=$0; gsub(" "," 0 ",line); print line " 0";}' | \ 116 | analyze-counts --verbose=1 --binary=false ark:- $dir/label.counts >& $dir/log/compute_label_counts.log || exit 1 117 | ## 118 | 119 | # Initialize model parameters 120 | if [ ! -f $dir/nnet/nnet.iter0 ]; then 121 | echo "Initializing model as $dir/nnet/nnet.iter0" 122 | net-initialize --binary=true $dir/nnet.proto $dir/nnet/nnet.iter0 >& $dir/log/initialize_model.log || exit 1; 123 | fi 124 | 125 | cur_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 126 | echo "TRAINING STARTS [$cur_time]" 127 | echo "[NOTE] TOKEN_ACCURACY refers to token accuracy, i.e., (1.0 - token_error_rate)." 128 | for iter in $(seq $start_epoch_num $max_iters); do 129 | cvacc_prev=$cvacc 130 | echo -n "EPOCH $iter RUNNING ... 
" 131 | 132 | # train 133 | if [ -z "$nj" ]; then 134 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 135 | --learn-rate=$learn_rate --momentum=$momentum \ 136 | --verbose=$verbose \ 137 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} \ 138 | >& $dir/log/tr.iter$iter.log || exit 1; 139 | tracc=$(cat $dir/log/tr.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 140 | else 141 | $cuda_cmd JOB=1:$nj $dir/log/tr.iter$iter.JOB.log \ 142 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 143 | --learn-rate=$learn_rate --momentum=$momentum --num-jobs=$nj --job-id=JOB \ 144 | --verbose=$verbose \ 145 | ${utts_per_avg:+ --utts-per-avg=$utts_per_avg} \ 146 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} >& $dir/log/tr.iter$iter.log || exit 1 147 | tracc=$(cat $dir/log/tr.iter${iter}.1.log | grep "TOTAL TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$(NF-1); gsub("%","",acc); print acc; }') 148 | fi 149 | 150 | 151 | echo -n "lrate $(printf "%.6g" $learn_rate), TRAIN ACCURACY $(printf "%.4f" $tracc)%, " 152 | end_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 153 | echo -n "ENDS [$end_time]: " 154 | 155 | # validation 156 | if [ -z "$nj" ]; then 157 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 158 | --cross-validate=true \ 159 | --learn-rate=$learn_rate \ 160 | --verbose=$verbose \ 161 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} \ 162 | >& $dir/log/cv.iter$iter.log || exit 1; 163 | cvacc=$(cat $dir/log/cv.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 164 | else 165 | $cuda_cmd JOB=1:$nj $dir/log/cv.iter$iter.JOB.log \ 166 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 167 | --cross-validate=true --num-jobs=$nj --job-id=JOB \ 168 | --learn-rate=$learn_rate \ 169 | --verbose=$verbose \ 170 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} >& $dir/log/cv.iter$iter.log || exit 1; 171 | cvacc=$(cat $dir/log/cv.iter${iter}.1.log | grep "TOTAL TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$(NF-1); gsub("%","",acc); print acc; }') 172 | fi 173 | 174 | echo "VALID ACCURACY $(printf "%.4f" $cvacc)%" 175 | 176 | # stopping criterion 177 | rel_impr=$(bc <<< "($cvacc-$cvacc_prev)") 178 | if [[ 1 == "$halving" && 1 == $(bc <<< "$rel_impr < $end_halving_inc") ]]; then 179 | if [[ "$min_iters" != "" ]]; then 180 | if [ $min_iters -gt $iter ]; then 181 | echo we were supposed to finish, but we continue as min_iters : $min_iters 182 | continue 183 | fi 184 | fi 185 | echo finished, too small rel. 
improvement $rel_impr 186 | break 187 | fi 188 | 189 | # start annealing when improvement is low 190 | if [ 1 == $(bc <<< "$rel_impr < $start_halving_inc") ]; then 191 | if [ $iter -gt $halving_after_epoch ]; then 192 | halving=1 193 | fi 194 | fi 195 | 196 | # do annealing 197 | if [ 1 == $halving ]; then 198 | learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") 199 | fi 200 | # save the status 201 | echo $[$iter+1] > $dir/.epoch # +1 because we save the epoch to start from 202 | echo $cvacc > $dir/.cvacc 203 | echo $halving > $dir/.halving 204 | echo $learn_rate > $dir/.lrate 205 | done 206 | 207 | # Convert the model marker from the parallel form (e.g. "<BiLstmParallel>") to the non-parallel form (e.g. "<BiLstm>") 208 | format-to-nonparallel $dir/nnet/nnet.iter${iter} $dir/final.nnet >& $dir/log/model_to_nonparal.log || exit 1; 209 | 210 | echo "Training succeeded. The final model is $dir/final.nnet" 211 | } 212 | -------------------------------------------------------------------------------- /utils/split_scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | 19 | # This program splits up any kind of .scp or archive-type file. 20 | # If there is no utt2spk option it will work on any text file and 21 | # will split it up with an approximately equal number of lines in 22 | # each one. 23 | # With the --utt2spk option it will work on anything that has the 24 | # utterance-id as the first entry on each line; the utt2spk file is 25 | # of the form "utterance speaker" (on each line). 26 | # It splits it into equal size chunks as far as it can. If you use 27 | # the utt2spk option it will make sure these chunks coincide with 28 | # speaker boundaries. In this case, if there are more chunks 29 | # than speakers (and in some other circumstances), some of the 30 | # resulting chunks will be empty and it 31 | # will print a warning. 32 | # You will normally call this like: 33 | # split_scp.pl scp scp.1 scp.2 scp.3 ... 34 | # or 35 | # split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ... 36 | # Note that you can use this script to split the utt2spk file itself, 37 | # e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ... 38 | 39 | # You can also call this script like: 40 | # split_scp.pl -j 3 0 scp scp.0 41 | # [note: with this option, it assumes zero-based indexing of the split parts, 42 | # i.e. the second number must be 0 <= n < num-jobs.] 
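# (Illustrative example: "split_scp.pl -j 3 1 data.scp out.scp" writes only the second of the three chunks to out.scp and sends the other two to /dev/null, so running it once per job-id 0..2 covers the whole file.)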
43 | 44 | $num_jobs = 0; 45 | $job_id = 0; 46 | $utt2spk_file = ""; 47 | 48 | for ($x = 1; $x <= 2; $x++) { 49 | if ($ARGV[0] eq "-j") { 50 | shift @ARGV; 51 | $num_jobs = shift @ARGV; 52 | $job_id = shift @ARGV; 53 | if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) { 54 | die "Invalid num-jobs and job-id: $num_jobs and $job_id"; 55 | } 56 | } 57 | if ($ARGV[0] =~ "--utt2spk=(.+)") { 58 | $utt2spk_file=$1; 59 | shift; 60 | } 61 | } 62 | 63 | if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) { 64 | die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" . 65 | " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" . 66 | " ... where 0 <= job-id < num-jobs."; 67 | } 68 | 69 | $error = 0; 70 | $inscp = shift @ARGV; 71 | if ($num_jobs == 0) { # without -j option 72 | @OUTPUTS = @ARGV; 73 | } else { 74 | for ($j = 0; $j < $num_jobs; $j++) { 75 | if ($j == $job_id) { 76 | if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; } 77 | else { push @OUTPUTS, "-"; } 78 | } else { 79 | push @OUTPUTS, "/dev/null"; 80 | } 81 | } 82 | } 83 | 84 | if ($utt2spk_file ne "") { # We have the --utt2spk option... 85 | open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; 86 | while(<U>) { 87 | @A = split; 88 | @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; 89 | ($u,$s) = @A; 90 | $utt2spk{$u} = $s; 91 | } 92 | open(I, "<$inscp") || die "Opening input scp file $inscp"; 93 | @spkrs = (); 94 | while(<I>) { 95 | @A = split; 96 | if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } 97 | $u = $A[0]; 98 | $s = $utt2spk{$u}; 99 | if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } 100 | if(!defined $spk_count{$s}) { 101 | push @spkrs, $s; 102 | $spk_count{$s} = 0; 103 | $spk_data{$s} = ""; 104 | } 105 | $spk_count{$s}++; 106 | $spk_data{$s} = $spk_data{$s} . $_; 107 | } 108 | # Now split as equally as possible .. 109 | # First allocate spks to files by allocating an approximately 110 | # equal number of speakers. 111 | $numspks = @spkrs; # number of speakers. 112 | $numscps = @OUTPUTS; # number of output files. 113 | if ($numspks < $numscps) { 114 | die "Refusing to split data because number of speakers $numspks is less " . 115 | "than the number of output .scp files $numscps"; 116 | } 117 | for($scpidx = 0; $scpidx < $numscps; $scpidx++) { 118 | $scparray[$scpidx] = []; # [] is array reference. 119 | } 120 | for ($spkidx = 0; $spkidx < $numspks; $spkidx++) { 121 | $scpidx = int(($spkidx*$numscps) / $numspks); 122 | $spk = $spkrs[$spkidx]; 123 | push @{$scparray[$scpidx]}, $spk; 124 | $scpcount[$scpidx] += $spk_count{$spk}; 125 | } 126 | 127 | # Now will try to reassign beginning + ending speakers 128 | # to different scp's and see if it gets more balanced. 129 | # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2. 130 | # We can show that if considering changing just 2 scp's, we minimize 131 | # this by minimizing the squared difference in sizes. This is 132 | # equivalent to minimizing the absolute difference in sizes. This 133 | # shows this method is bound to converge. 134 | 135 | $changed = 1; 136 | while($changed) { 137 | $changed = 0; 138 | for($scpidx = 0; $scpidx < $numscps; $scpidx++) { 139 | # First try to reassign ending spk of this scp. 
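# (Illustrative example: if this scp holds speakers A,B,C with 4+4+3=11 utterances and the next holds only D with 2, moving C gives sizes 8 and 5, i.e. |8-5| < |11-2|, so C is reassigned.)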
140 | if($scpidx < $numscps-1) { 141 | $sz = @{$scparray[$scpidx]}; 142 | if($sz > 0) { 143 | $spk = $scparray[$scpidx]->[$sz-1]; 144 | $count = $spk_count{$spk}; 145 | $nutt1 = $scpcount[$scpidx]; 146 | $nutt2 = $scpcount[$scpidx+1]; 147 | if( abs( ($nutt2+$count) - ($nutt1-$count)) 148 | < abs($nutt2 - $nutt1)) { # Would decrease 149 | # size-diff by reassigning spk... 150 | $scpcount[$scpidx+1] += $count; 151 | $scpcount[$scpidx] -= $count; 152 | pop @{$scparray[$scpidx]}; 153 | unshift @{$scparray[$scpidx+1]}, $spk; 154 | $changed = 1; 155 | } 156 | } 157 | } 158 | if($scpidx > 0 && @{$scparray[$scpidx]} > 0) { 159 | $spk = $scparray[$scpidx]->[0]; 160 | $count = $spk_count{$spk}; 161 | $nutt1 = $scpcount[$scpidx-1]; 162 | $nutt2 = $scpcount[$scpidx]; 163 | if( abs( ($nutt2-$count) - ($nutt1+$count)) 164 | < abs($nutt2 - $nutt1)) { # Would decrease 165 | # size-diff by reassigning spk... 166 | $scpcount[$scpidx-1] += $count; 167 | $scpcount[$scpidx] -= $count; 168 | shift @{$scparray[$scpidx]}; 169 | push @{$scparray[$scpidx-1]}, $spk; 170 | $changed = 1; 171 | } 172 | } 173 | } 174 | } 175 | # Now print out the files... 176 | for($scpidx = 0; $scpidx < $numscps; $scpidx++) { 177 | $scpfn = $OUTPUTS[$scpidx]; 178 | open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; 179 | $count = 0; 180 | if(@{$scparray[$scpidx]} == 0) { 181 | print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n"; 182 | $error = 1; 183 | } else { 184 | foreach $spk ( @{$scparray[$scpidx]} ) { 185 | print F $spk_data{$spk}; 186 | $count += $spk_count{$spk}; 187 | } 188 | if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } 189 | } 190 | close(F); 191 | } 192 | } else { 193 | # This block is the "normal" case where there is no --utt2spk 194 | # option and we just break into equal size chunks. 195 | 196 | open(I, "<$inscp") || die "Opening input scp file $inscp"; 197 | 198 | $numscps = @OUTPUTS; # size of array. 199 | @F = (); 200 | while(<I>) { 201 | push @F, $_; 202 | } 203 | $numlines = @F; 204 | if($numlines == 0) { 205 | print STDERR "split_scp.pl: error: empty input scp file $inscp\n"; 206 | $error = 1; 207 | } 208 | $linesperscp = int( $numlines / $numscps); # the "whole part".. 209 | $linesperscp >= 1 || die "You are splitting into too many pieces!"; 210 | $remainder = $numlines - ($linesperscp * $numscps); 211 | ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder"; 212 | # [just doing int() rounds down]. 213 | $n = 0; 214 | for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) { 215 | $scpfile = $OUTPUTS[$scpidx]; 216 | open(O, ">$scpfile") || die "Opening output scp file $scpfile"; 217 | for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) { 218 | print O $F[$n++]; 219 | } 220 | close(O) || die "Closing scp file $scpfile"; 221 | } 222 | $n == $numlines || die "split_scp.pl: code error, $n != $numlines"; 223 | } 224 | 225 | exit ($error ? 1 : 0); 226 | -------------------------------------------------------------------------------- /utils/run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | # In general, doing 5 | # run.pl some.log a b c is like running the command a b c in 6 | # the bash shell, and putting the standard error and output into some.log. 7 | # To run parallel jobs (backgrounded on the host machine), you can do (e.g.) 
8 | # run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB 9 | # and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier]. 10 | # If any of the jobs fails, this script will fail. 11 | 12 | # A typical example is: 13 | # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz 14 | # and run.pl will run something like: 15 | # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log 16 | # 17 | # Basically it takes the command-line arguments, quotes them 18 | # as necessary to preserve spaces, and evaluates them with bash. 19 | # In addition it puts the command line at the top of the log, and 20 | # the start and end times of the command at the beginning and end. 21 | # The reason why this is useful is so that we can create a different 22 | # version of this program that uses a queueing system instead. 23 | 24 | # use Data::Dumper; 25 | 26 | @ARGV < 2 && die "usage: run.pl log-file command-line arguments..."; 27 | 28 | 29 | $max_jobs_run = -1; 30 | $jobstart = 1; 31 | $jobend = 1; 32 | $ignored_opts = ""; # These will be ignored. 33 | 34 | # First parse an option like JOB=1:4, and any 35 | # options that would normally be given to 36 | # queue.pl, which we will just discard. 37 | 38 | if (@ARGV > 0) { 39 | while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options 40 | # that would normally go to qsub, but which will be ignored here. 41 | $switch = shift @ARGV; 42 | if ($switch eq "-V") { 43 | $ignored_opts .= "-V "; 44 | } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") { 45 | # we do support the option --max-jobs-run n, and its GridEngine form -tc n. 46 | $max_jobs_run = shift @ARGV; 47 | if (! ($max_jobs_run > 0)) { 48 | die "run.pl: invalid option --max-jobs-run $max_jobs_run"; 49 | } 50 | } else { 51 | $option = shift @ARGV; 52 | if ($switch eq "-sync" && $option =~ m/^[yY]/) { 53 | $ignored_opts .= "-sync "; # Note: in the 54 | # corresponding code in queue.pl it says instead, just "$sync = 1;". 55 | } 56 | $ignored_opts .= "$switch $option "; 57 | if ($switch eq "-pe") { # e.g. -pe smp 5 58 | $option2 = shift @ARGV; 59 | $ignored_opts .= "$option2 "; 60 | } 61 | } 62 | } 63 | if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 64 | $jobname = $1; 65 | $jobstart = $2; 66 | $jobend = $3; 67 | shift; 68 | if ($jobstart > $jobend) { 69 | die "run.pl: invalid job range $ARGV[0]"; 70 | } 71 | if ($jobstart <= 0) { 72 | die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility)."; 73 | } 74 | } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. 75 | $jobname = $1; 76 | $jobstart = $2; 77 | $jobend = $2; 78 | shift; 79 | } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { 80 | print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n"; 81 | } 82 | } 83 | 84 | # Users found this message confusing so we are removing it. 85 | # if ($ignored_opts ne "") { 86 | # print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; 87 | # } 88 | 89 | if ($max_jobs_run == -1) { # If --max-jobs-run option not set, 90 | # then work out the number of processors if possible, 91 | # and set it based on that. 92 | $max_jobs_run = 0; 93 | if (open(P, "</proc/cpuinfo")) { # Linux 94 | while (<P>) { if (m/^processor/) { $max_jobs_run++; } } 95 | if ($max_jobs_run == 0) { 96 | print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n"; 97 | $max_jobs_run = 10; # reasonable default. 
98 | } 99 | close(P); 100 | } elsif (open(P, "sysctl -a |")) { # BSD/Darwin 101 | while (<P>) {
102 | if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4 103 | $max_jobs_run = $1; 104 | last; 105 | } 106 | } 107 | close(P); 108 | if ($max_jobs_run == 0) { 109 | print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n"; 110 | $max_jobs_run = 10; # reasonable default. 111 | } 112 | } else { 113 | # allow at most 32 jobs at once, on non-UNIX systems; change this code 114 | # if you need to change this default. 115 | $max_jobs_run = 32; 116 | } 117 | # The just-computed value of $max_jobs_run is just the number of processors 118 | # (or our best guess); and if it happens that the number of jobs we need to 119 | # run is just slightly above $max_jobs_run, it will make sense to increase 120 | # $max_jobs_run to equal the number of jobs, so we don't have a small number 121 | # of leftover jobs. 122 | $num_jobs = $jobend - $jobstart + 1; 123 | if ($num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) { 124 | $max_jobs_run = $num_jobs; 125 | } 126 | } 127 | 128 | $logfile = shift @ARGV; 129 | 130 | if (defined $jobname && $logfile !~ m/$jobname/ && 131 | $jobend > $jobstart) { 132 | print STDERR "run.pl: you are trying to run a parallel job but " 133 | . "you are putting the output into just one log file ($logfile)\n"; 134 | exit(1); 135 | } 136 | 137 | $cmd = ""; 138 | 139 | foreach $x (@ARGV) { 140 | if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } 141 | elsif ($x =~ m:\":) { $cmd .= "'$x' "; } 142 | else { $cmd .= "\"$x\" "; } 143 | } 144 | 145 | #$Data::Dumper::Indent=0; 146 | $ret = 0; 147 | $numfail = 0; 148 | %active_pids=(); 149 | 150 | use POSIX ":sys_wait_h"; 151 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 152 | if (scalar(keys %active_pids) >= $max_jobs_run) { 153 | 154 | # Let's wait for a change in any child's status 155 | # Then we have to work out which child finished 156 | $r = waitpid(-1, 0); 157 | $code = $?; 158 | if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen. 159 | if ( defined $active_pids{$r} ) { 160 | $jid=$active_pids{$r}; 161 | $fail[$jid]=$code; 162 | if ($code !=0) { $numfail++;} 163 | delete $active_pids{$r}; 164 | # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n"; 165 | } else { 166 | die "run.pl: Cannot find the PID of the child process that just finished."; 167 | } 168 | 169 | # In theory we could do a non-blocking waitpid over all jobs running just 170 | # to find out if only one or more jobs finished during the previous waitpid() 171 | # However, we just omit this and will reap the next one in the next pass 172 | # through the for(;;) cycle 173 | } 174 | $childpid = fork(); 175 | if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; } 176 | if ($childpid == 0) { # We're in the child... this branch 177 | # executes the job and returns (possibly with an error status). 178 | if (defined $jobname) { 179 | $cmd =~ s/$jobname/$jobid/g; 180 | $logfile =~ s/$jobname/$jobid/g; 181 | } 182 | system("mkdir -p `dirname $logfile` 2>/dev/null"); 183 | open(F, ">$logfile") || die "run.pl: Error opening log file $logfile"; 184 | print F "# " . $cmd . "\n"; 185 | print F "# Started at " . `date`; 186 | $starttime = `date +'%s'`; 187 | print F "#\n"; 188 | close(F); 189 | 190 | # Pipe into bash.. make sure we're not using any other shell. 191 | open(B, "|bash") || die "run.pl: Error opening shell command"; 192 | print B "( " . $cmd . ") 2>>$logfile >> $logfile"; 193 | close(B); # If there was an error, exit status is in $? 
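# ($? follows the wait(2) status layout: the low 7 bits hold the terminating signal, 0 if the job exited normally, and the next 8 bits hold the exit code; it is decoded into $return_str below.)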
194 | $ret = $?; 195 | 196 | $lowbits = $ret & 127; 197 | $highbits = $ret >> 8; 198 | if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" } 199 | else { $return_str = "code $highbits"; } 200 | 201 | $endtime = `date +'%s'`; 202 | open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)"; 203 | $enddate = `date`; 204 | chop $enddate; 205 | print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n"; 206 | print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n"; 207 | close(F); 208 | exit($ret == 0 ? 0 : 1); 209 | } else { 210 | $pid[$jobid] = $childpid; 211 | $active_pids{$childpid} = $jobid; 212 | # print STDERR "Queued: " . Dumper(\%active_pids) . "\n"; 213 | } 214 | } 215 | 216 | # Now we have submitted all the jobs, let's wait until all the jobs finish 217 | foreach $child (keys %active_pids) { 218 | $jobid=$active_pids{$child}; 219 | $r = waitpid($pid[$jobid], 0); 220 | $code = $?; 221 | if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen. 222 | if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # the job completed; record a failure on nonzero exit 223 | } 224 | 225 | # Some sanity checks: 226 | # The $fail array should not contain undefined codes 227 | # The number of non-zeros in that array should be equal to $numfail 228 | # We cannot do foreach() here, as the JOB ids do not necessarily start by zero 229 | $failed_jids=0; 230 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 231 | $job_return = $fail[$jobid]; 232 | if (not defined $job_return ) { 233 | # print Dumper(\@fail); 234 | 235 | die "run.pl: Sanity check failed: we have indication that some jobs are running " . 236 | "even after we waited for all jobs to finish" ; 237 | } 238 | if ($job_return != 0 ){ $failed_jids++;} 239 | } 240 | if ($failed_jids != $numfail) { 241 | die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)." 242 | } 243 | if ($numfail > 0) { $ret = 1; } 244 | 245 | if ($ret != 0) { 246 | $njobs = $jobend - $jobstart + 1; 247 | if ($njobs == 1) { 248 | if (defined $jobname) { 249 | $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with 250 | # that job. 251 | } 252 | print STDERR "run.pl: job failed, log is in $logfile\n"; 253 | if ($logfile =~ m/JOB/) { 254 | print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script."; 255 | } 256 | } 257 | else { 258 | $logfile =~ s/$jobname/*/g; 259 | print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n"; 260 | } 261 | } 262 | 263 | 264 | exit ($ret); 265 | -------------------------------------------------------------------------------- /utils/validate_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | no_feats=false 5 | no_wav=false 6 | no_text=false 7 | 8 | for x in `seq 3`; do 9 | if [ "$1" == "--no-feats" ]; then 10 | no_feats=true 11 | shift; 12 | fi 13 | if [ "$1" == "--no-text" ]; then 14 | no_text=true 15 | shift; 16 | fi 17 | if [ "$1" == "--no-wav" ]; then 18 | no_wav=true 19 | shift; 20 | fi 21 | done 22 | 23 | if [ $# -ne 1 ]; then 24 | echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] <data-dir>" 25 | echo "e.g.: $0 data/train"; exit 1; 26 | fi 27 | 28 | data=$1 29 | 30 | if [ ! -d $data ]; then 31 | echo "$0: no such directory $data" 32 | exit 1; 33 | fi 34 | 35 | for f in spk2utt utt2spk; do 36 | if [ ! 
-f $data/$f ]; then 37 | echo "$0: no such file $f" 38 | exit 1; 39 | fi 40 | if [ ! -s $data/$f ]; then 41 | echo "$0: empty file $f" 42 | exit 1; 43 | fi 44 | done 45 | 46 | ! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ 47 | echo "$0: $data/utt2spk has wrong format." && exit 1; 48 | 49 | ns=$(wc -l < $data/spk2utt) 50 | if [ "$ns" == 1 ]; then 51 | echo "$0: WARNING: you have only one speaker. This is probably a bad idea." 52 | echo " Search for the word 'bold' in http://kaldi.sourceforge.net/data_prep.html" 53 | echo " for more information." 54 | fi 55 | 56 | 57 | tmpdir=$(mktemp -d); 58 | trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM 59 | 60 | export LC_ALL=C 61 | 62 | function check_sorted_and_uniq { 63 | ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ 64 | echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; 65 | } 66 | 67 | function partial_diff { 68 | diff $1 $2 | head -n 6 69 | echo "..." 70 | diff $1 $2 | tail -n 6 71 | n1=`cat $1 | wc -l` 72 | n2=`cat $2 | wc -l` 73 | echo "[Lengths are $1=$n1 versus $2=$n2]" 74 | } 75 | 76 | check_sorted_and_uniq $data/utt2spk 77 | 78 | ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ 79 | echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ 80 | echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; 81 | 82 | check_sorted_and_uniq $data/spk2utt 83 | 84 | ! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ 85 | <(utils/spk2utt_to_utt2spk.pl $data/spk2utt) && \ 86 | echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; 87 | 88 | cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts 89 | 90 | if [ ! -f $data/text ] && ! $no_text; then 91 | echo "$0: no such file $data/text (if this is by design, specify --no-text)" 92 | exit 1; 93 | fi 94 | 95 | num_utts=`cat $tmpdir/utts | wc -l` 96 | if [ -f $data/text ]; then 97 | check_sorted_and_uniq $data/text 98 | text_len=`cat $data/text | wc -l` 99 | illegal_sym_list="<s> </s> #0" 100 | for x in $illegal_sym_list; do 101 | if grep -w "$x" $data/text > /dev/null; then 102 | echo "$0: Error: in $data, text contains illegal symbol $x" 103 | exit 1; 104 | fi 105 | done 106 | awk '{print $1}' < $data/text > $tmpdir/utts.txt 107 | if ! cmp -s $tmpdir/utts{,.txt}; then 108 | echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" 109 | echo "$0: differ, partial diff is:" 110 | partial_diff $tmpdir/utts{,.txt} 111 | exit 1; 112 | fi 113 | fi 114 | 115 | if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then 116 | echo "$0: in directory $data, segments file exists but no wav.scp" 117 | exit 1; 118 | fi 119 | 120 | 121 | if [ ! -f $data/wav.scp ] && ! $no_wav; then 122 | echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" 123 | exit 1; 124 | fi 125 | 126 | if [ -f $data/wav.scp ]; then 127 | check_sorted_and_uniq $data/wav.scp 128 | 129 | if [ -f $data/segments ]; then 130 | 131 | check_sorted_and_uniq $data/segments 132 | # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. 133 | ! cat $data/segments | \ 134 | awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ 135 | echo "$0: badly formatted segments file" && exit 1; 136 | 137 | segments_len=`cat $data/segments | wc -l` 138 | if [ -f $data/text ]; then 139 | ! 
cmp -s $tmpdir/utts <(awk '{print $1}' <$data/text) && \ 140 | echo "$0: Utterance list differs between $data/text and $data/segments " && \ 141 | echo "$0: Lengths are $segments_len vs $num_utts"; 142 | fi 143 | 144 | cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings 145 | awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav 146 | if ! cmp -s $tmpdir/recordings{,.wav}; then 147 | echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" 148 | echo "$0: differ, partial diff is:" 149 | partial_diff $tmpdir/recordings{,.wav} 150 | exit 1; 151 | fi 152 | if [ -f $data/reco2file_and_channel ]; then 153 | # this file is needed only for ctm scoring; it's indexed by recording-id. 154 | check_sorted_and_uniq $data/reco2file_and_channel 155 | ! cat $data/reco2file_and_channel | \ 156 | awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { 157 | if ( NF == 3 && $3 == "1" ) { 158 | warning_issued = 1; 159 | } else { 160 | print "Bad line ", $0; exit 1; 161 | } 162 | } 163 | } 164 | END { 165 | if (warning_issued == 1) { 166 | print "The channel should be marked as A or B, not 1! You should change it ASAP! " 167 | } 168 | }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; 169 | cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc 170 | if ! cmp -s $tmpdir/recordings{,.r2fc}; then 171 | echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" 172 | echo "$0: differ, partial diff is:" 173 | partial_diff $tmpdir/recordings{,.r2fc} 174 | exit 1; 175 | fi 176 | fi 177 | else 178 | # No segments file -> assume wav.scp indexed by utterance. 179 | cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav 180 | if ! cmp -s $tmpdir/utts{,.wav}; then 181 | echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" 182 | echo "$0: differ, partial diff is:" 183 | partial_diff $tmpdir/utts{,.wav} 184 | exit 1; 185 | fi 186 | 187 | if [ -f $data/reco2file_and_channel ]; then 188 | # this file is needed only for ctm scoring; it's indexed by recording-id. 189 | check_sorted_and_uniq $data/reco2file_and_channel 190 | ! cat $data/reco2file_and_channel | \ 191 | awk '{if (NF != 3 || ($3 != "A" && $3 != "B")) { print "Bad line ", $0; exit 1; }}' && \ 192 | echo "$0: badly formatted reco2file_and_channel file" && exit 1; 193 | cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc 194 | if ! cmp -s $tmpdir/utts{,.r2fc}; then 195 | echo "$0: Error: in $data, utterance-ids extracted from utt2spk and reco2file_and_channel" 196 | echo "$0: differ, partial diff is:" 197 | partial_diff $tmpdir/utts{,.r2fc} 198 | exit 1; 199 | fi 200 | fi 201 | fi 202 | fi 203 | 204 | if [ ! -f $data/feats.scp ] && ! $no_feats; then 205 | echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" 206 | exit 1; 207 | fi 208 | 209 | if [ -f $data/feats.scp ]; then 210 | check_sorted_and_uniq $data/feats.scp 211 | cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats 212 | if ! cmp -s $tmpdir/utts{,.feats}; then 213 | echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" 214 | echo "$0: differ, partial diff is:" 215 | partial_diff $tmpdir/utts{,.feats} 216 | exit 1; 217 | fi 218 | fi 219 | 220 | if [ -f $data/cmvn.scp ]; then 221 | check_sorted_and_uniq $data/cmvn.scp 222 | cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn 223 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 224 | if ! 
cmp -s $tmpdir/speakers{,.cmvn}; then 225 | echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" 226 | echo "$0: differ, partial diff is:" 227 | partial_diff $tmpdir/speakers{,.cmvn} 228 | exit 1; 229 | fi 230 | fi 231 | 232 | if [ -f $data/spk2gender ]; then 233 | check_sorted_and_uniq $data/spk2gender 234 | ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ 235 | echo "Mal-formed spk2gender file" && exit 1; 236 | cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender 237 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 238 | if ! cmp -s $tmpdir/speakers{,.spk2gender}; then 239 | echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" 240 | echo "$0: differ, partial diff is:" 241 | partial_diff $tmpdir/speakers{,.spk2gender} 242 | exit 1; 243 | fi 244 | fi 245 | 246 | if [ -f $data/spk2warp ]; then 247 | check_sorted_and_uniq $data/spk2warp 248 | ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ 249 | echo "Mal-formed spk2warp file" && exit 1; 250 | cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp 251 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 252 | if ! cmp -s $tmpdir/speakers{,.spk2warp}; then 253 | echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" 254 | echo "$0: differ, partial diff is:" 255 | partial_diff $tmpdir/speakers{,.spk2warp} 256 | exit 1; 257 | fi 258 | fi 259 | 260 | if [ -f $data/utt2warp ]; then 261 | check_sorted_and_uniq $data/utt2warp 262 | ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ 263 | echo "Mal-formed utt2warp file" && exit 1; 264 | cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp 265 | cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts 266 | if ! cmp -s $tmpdir/utts{,.utt2warp}; then 267 | echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" 268 | echo "$0: differ, partial diff is:" 269 | partial_diff $tmpdir/utts{,.utt2warp} 270 | exit 1; 271 | fi 272 | fi 273 | 274 | # check some optionally-required things 275 | for f in vad.scp utt2lang; do 276 | if [ -f $data/$f ]; then 277 | check_sorted_and_uniq $data/$f 278 | if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ 279 | <( awk '{print $1}' $data/$f ); then 280 | echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" 281 | exit 1; 282 | fi 283 | fi 284 | done 285 | 286 | echo "$0: Successfully validated data-directory $data" 287 | -------------------------------------------------------------------------------- /steps/train_ctc_parallel_x3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | # 2016 Florian Metze (Carnegie Mellon University) 5 | # Apache 2.0 6 | 7 | # This script trains CTC-based acoustic models using SGD. 
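# Compared to the plain parallel recipe, this "x3" variant can triple the amount of training data via 3-fold frame subsampling (three frame-decimated copies of each utterance at offsets 0, 1 and 2 -- see subsample_feats below).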
8 | 9 | ## Begin configuration section 10 | train_tool=train-ctc-parallel # the command for training; by default, we use the 11 | # parallel version which processes multiple utterances at the same time 12 | 13 | # configs for multiple sequences 14 | num_sequence=5 # during training, how many utterances to be processed in parallel 15 | valid_num_sequence=10 # number of parallel sequences in validation 16 | frame_num_limit=1000000 # the number of frames to be processed at a time in training; this config acts 17 | # to prevent running out of GPU memory if #num_sequence very long sequences are processed; the max 18 | # number of training examples is decided by whichever of num_sequence or frame_num_limit is reached first. 19 | 20 | # learning rate 21 | learn_rate=0.0001 # learning rate 22 | final_learn_rate=0.0 # final learning rate 23 | momentum=0.9 # momentum 24 | 25 | # learning rate schedule 26 | max_iters=25 # max number of iterations 27 | min_iters= # min number of iterations 28 | start_epoch_num=1 # start from which epoch, used for resuming training from a break point 29 | 30 | start_halving_inc=0.5 # start halving learning rates when the accuracy improvement falls below this amount 31 | end_halving_inc=0.1 # terminate training when the accuracy improvement falls below this amount 32 | halving_factor=0.5 # learning rate decay factor 33 | halving_after_epoch=1 # halving becomes enabled after this many epochs 34 | 35 | # logging 36 | report_step=100 # during training, the step (number of utterances) of reporting objective and accuracy 37 | verbose=1 38 | 39 | # feature configs 40 | sort_by_len=true # whether to sort the utterances by their lengths 41 | min_len=0 # minimal length of utterances to consider 42 | 43 | splice_feats=false # whether to splice neighboring frames 44 | subsample_feats=false # whether to subsample features 45 | norm_vars=true # whether to apply variance normalization when we do cmn 46 | add_deltas=true # whether to add deltas 47 | copy_feats=true # whether to copy features into a local dir (on the GPU machine) 48 | feats_tmpdir= # the tmp dir to save the copied features, when copy_feats=true 49 | 50 | 51 | # status of learning rate schedule; useful when training is resumed from a break point 52 | cvacc=0 53 | halving=0 54 | 55 | ## End configuration section 56 | 57 | echo "$0 $@" # Print the command line for logging 58 | 59 | [ -f path.sh ] && . ./path.sh; 60 | 61 | . utils/parse_options.sh || exit 1; 62 | 63 | if [ $# != 3 ]; then 64 | echo "Usage: $0 <data-tr> <data-cv> <exp-dir>" 65 | echo " e.g.: $0 data/train_tr data/train_cv exp/train_phn" 66 | exit 1; 67 | fi 68 | 69 | data_tr=$1 70 | data_cv=$2 71 | dir=$3 72 | 73 | mkdir -p $dir/log $dir/nnet 74 | 75 | for f in $data_tr/feats.scp $data_cv/feats.scp $dir/labels.tr.gz $dir/labels.cv.gz $dir/nnet.proto; do 76 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 77 | done 78 | 79 | ## Read the training status for resuming 80 | [ -f $dir/.epoch ] && start_epoch_num=`cat $dir/.epoch 2>/dev/null` 81 | [ -f $dir/.cvacc ] && cvacc=`cat $dir/.cvacc 2>/dev/null` 82 | [ -f $dir/.halving ] && halving=`cat $dir/.halving 2>/dev/null` 83 | [ -f $dir/.lrate ] && learn_rate=`cat $dir/.lrate 2>/dev/null` 84 | 85 | ## Set up labels 86 | labels_tr="ark:gunzip -c $dir/labels.tr.gz|" 87 | labels_cv="ark:gunzip -c $dir/labels.cv.gz|" 88 | # Compute the occurrence counts of labels in the label sequences. These counts will be used to 89 | # derive prior probabilities of the labels. 
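# (Illustrative example: a label line "utt1 5 12 7" becomes "utt1 0 5 0 12 0 7 0" below -- the awk command interleaves the CTC blank label, 0, so that blank occurrences are counted too.)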
90 | gunzip -c $dir/labels.tr.gz | awk '{line=$0; gsub(" "," 0 ",line); print line " 0";}' | \ 91 | analyze-counts --verbose=1 --binary=false ark:- $dir/label.counts >& $dir/log/compute_label_counts.log || exit 1 92 | ## 93 | 94 | ## Set up features 95 | # output feature configs which will be used in decoding 96 | echo $norm_vars > $dir/norm_vars 97 | echo $add_deltas > $dir/add_deltas 98 | echo $splice_feats > $dir/splice_feats 99 | echo $subsample_feats > $dir/subsample_feats 100 | 101 | if $sort_by_len; then 102 | feat-to-len scp:$data_tr/feats.scp ark,t:- | awk '{print $2}' | \ 103 | paste -d " " $data_tr/feats.scp - | sort -k3 -n - | awk -v m=$min_len '{ if ($3 >= m) {print $1 " " $2} }' > $dir/train.scp & 104 | feat-to-len scp:$data_cv/feats.scp ark,t:- | awk '{print $2}' | \ 105 | paste -d " " $data_cv/feats.scp - | sort -k3 -n - | awk '{print $1 " " $2}' > $dir/cv.scp & 106 | wait || exit 1; 107 | else 108 | cat $data_tr/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp 109 | cat $data_cv/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/cv.scp 110 | fi 111 | 112 | feats_tr="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_tr/utt2spk scp:$data_tr/cmvn.scp scp:$dir/train.scp ark:- |" 113 | feats_cv="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp scp:$dir/cv.scp ark:- |" 114 | 115 | if $splice_feats; then 116 | feats_tr="$feats_tr splice-feats --left-context=1 --right-context=1 ark:- ark:- |" 117 | feats_cv="$feats_cv splice-feats --left-context=1 --right-context=1 ark:- ark:- |" 118 | fi 119 | 120 | if $subsample_feats; then 121 | #tmpdir=$(mktemp -d --tmpdir=$feats_tmpdir); 122 | tmpdir=$(mktemp -d $feats_tmpdir); 123 | 124 | copy-feats "$feats_tr subsample-feats --n=3 --offset=0 ark:- ark:- |" \ 125 | ark,scp:$tmpdir/train0.ark,$tmpdir/train0local.scp || exit 1; 126 | copy-feats "$feats_cv subsample-feats --n=3 --offset=0 ark:- ark:- |" \ 127 | ark,scp:$tmpdir/cv0.ark,$tmpdir/cv0local.scp || exit 1; 128 | copy-feats "$feats_tr subsample-feats --n=3 --offset=1 ark:- ark:- |" \ 129 | ark,scp:$tmpdir/train1.ark,$tmpdir/train1local.scp || exit 1; 130 | copy-feats "$feats_cv subsample-feats --n=3 --offset=1 ark:- ark:- |" \ 131 | ark,scp:$tmpdir/cv1.ark,$tmpdir/cv1local.scp || exit 1; 132 | copy-feats "$feats_tr subsample-feats --n=3 --offset=2 ark:- ark:- |" \ 133 | ark,scp:$tmpdir/train2.ark,$tmpdir/train2local.scp || exit 1; 134 | copy-feats "$feats_cv subsample-feats --n=3 --offset=2 ark:- ark:- |" \ 135 | ark,scp:$tmpdir/cv2.ark,$tmpdir/cv2local.scp || exit 1; 136 | 137 | # this code is experimental - we may need to sort the data carefully 138 | sed 's/^/0x/' $tmpdir/train0local.scp > $tmpdir/train_local.scp 139 | sed 's/^/0x/' $tmpdir/cv0local.scp > $tmpdir/cv_local.scp 140 | sed 's/^/1x/' $tmpdir/train1local.scp | tac >> $tmpdir/train_local.scp 141 | sed 's/^/1x/' $tmpdir/cv1local.scp | tac >> $tmpdir/cv_local.scp 142 | sed 's/^/2x/' $tmpdir/train2local.scp >> $tmpdir/train_local.scp 143 | sed 's/^/2x/' $tmpdir/cv2local.scp >> $tmpdir/cv_local.scp 144 | 145 | feats_tr="ark,s,cs:copy-feats scp:$tmpdir/train_local.scp ark:- |" 146 | feats_cv="ark,s,cs:copy-feats scp:$tmpdir/cv_local.scp ark:- |" 147 | 148 | gzip -cd $dir/labels.tr.gz | sed 's/^/0x/' > $tmpdir/labels.tr 149 | gzip -cd $dir/labels.cv.gz | sed 's/^/0x/' > $tmpdir/labels.cv 150 | gzip -cd $dir/labels.tr.gz | sed 's/^/1x/' >> $tmpdir/labels.tr 151 | gzip -cd $dir/labels.cv.gz | sed 's/^/1x/' >> $tmpdir/labels.cv 152 | 
gzip -cd $dir/labels.tr.gz | sed 's/^/2x/' >> $tmpdir/labels.tr 153 | gzip -cd $dir/labels.cv.gz | sed 's/^/2x/' >> $tmpdir/labels.cv 154 | 155 | labels_tr="ark:cat $tmpdir/labels.tr|" 156 | labels_cv="ark:cat $tmpdir/labels.cv|" 157 | 158 | trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls -l $tmpdir; rm -r $tmpdir" EXIT 159 | else 160 | 161 | # Save the features to a local dir on the GPU machine. On Linux, this usually points to /tmp 162 | if $copy_feats; then 163 | tmpdir=$(mktemp -d $feats_tmpdir); 164 | copy-feats "$feats_tr" ark,scp:$tmpdir/train.ark,$tmpdir/train_local.scp || exit 1; 165 | copy-feats "$feats_cv" ark,scp:$tmpdir/cv.ark,$tmpdir/cv_local.scp || exit 1; 166 | feats_tr="ark,s,cs:copy-feats scp:$tmpdir/train_local.scp ark:- |" 167 | feats_cv="ark,s,cs:copy-feats scp:$tmpdir/cv_local.scp ark:- |" 168 | trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT 169 | fi 170 | fi 171 | 172 | if $add_deltas; then 173 | feats_tr="$feats_tr add-deltas ark:- ark:- |" 174 | feats_cv="$feats_cv add-deltas ark:- ark:- |" 175 | fi 176 | ## End of feature setup 177 | 178 | # Initialize model parameters 179 | if [ ! -f $dir/nnet/nnet.iter0 ]; then 180 | echo "Initializing model as $dir/nnet/nnet.iter0" 181 | net-initialize --binary=true $dir/nnet.proto $dir/nnet/nnet.iter0 >& $dir/log/initialize_model.log || exit 1; 182 | fi 183 | 184 | cur_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 185 | echo "TRAINING STARTS [$cur_time]" 186 | echo "[NOTE] TOKEN_ACCURACY refers to token accuracy, i.e., (1.0 - token_error_rate)." 187 | for iter in $(seq $start_epoch_num $max_iters); do 188 | cvacc_prev=$cvacc 189 | echo -n "EPOCH $iter RUNNING ... " 190 | 191 | # train 192 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 193 | --learn-rate=$learn_rate --momentum=$momentum \ 194 | --verbose=$verbose \ 195 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} \ 196 | >& $dir/log/tr.iter$iter.log || exit 1; 197 | 198 | end_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 199 | echo -n "ENDS [$end_time]: " 200 | 201 | tracc=$(cat $dir/log/tr.iter${iter}.log | grep -a "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 202 | echo -n "lrate $(printf "%.6g" $learn_rate), TRAIN ACCURACY $(printf "%.4f" $tracc)%, " 203 | 204 | # validation 205 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 206 | --cross-validate=true \ 207 | --learn-rate=$learn_rate \ 208 | --momentum=$momentum \ 209 | --verbose=$verbose \ 210 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} \ 211 | >& $dir/log/cv.iter$iter.log || exit 1; 212 | 213 | cvacc=$(cat $dir/log/cv.iter${iter}.log | grep -a "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 214 | echo "VALID ACCURACY $(printf "%.4f" $cvacc)%" 215 | 216 | # stopping criterion 217 | rel_impr=$(bc <<< "($cvacc-$cvacc_prev)") 218 | if [ 1 == $halving -a 1 == $(bc <<< "$rel_impr < $end_halving_inc") ]; then 219 | if [[ "$min_iters" != "" ]]; then 220 | if [ $min_iters -gt $iter ]; then 221 | echo we were supposed to finish, but we continue as min_iters : $min_iters 222 | continue 223 | fi 224 | fi 225 | echo finished, too small rel. 
improvement $rel_impr 226 | break 227 | fi 228 | 229 | # start annealing when improvement is low 230 | if [ 1 == $(bc <<< "$rel_impr < $start_halving_inc") ]; then 231 | if [ $iter -gt $halving_after_epoch ]; then 232 | halving=1 233 | fi 234 | fi 235 | 236 | # do annealing 237 | if [ 1 == $halving ]; then 238 | learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") 239 | learn_rate=$(awk "BEGIN{if ($learn_rate<$final_learn_rate) {print $final_learn_rate} else {print $learn_rate}}") 240 | fi 241 | # save the status 242 | echo $[$iter+1] > $dir/.epoch # +1 because we save the epoch to start from 243 | echo $cvacc > $dir/.cvacc 244 | echo $halving > $dir/.halving 245 | echo $learn_rate > $dir/.lrate 246 | done 247 | 248 | # Convert the model marker from the parallel form (e.g. "<BiLstmParallel>") to the non-parallel form (e.g. "<BiLstm>") (no longer needed) 249 | format-to-nonparallel $dir/nnet/nnet.iter${iter} $dir/final.nnet >& $dir/log/model_to_nonparal.log || exit 1; 250 | 251 | echo "Training succeeded. The final model is $dir/final.nnet" 252 | --------------------------------------------------------------------------------