├── exp
│   ├── model_l4_c320
│   │   ├── .epoch
│   │   ├── .halving
│   │   ├── .cvacc
│   │   └── .lrate
│   └── train_phn_l3_c320
│       ├── .epoch
│       ├── .halving
│       ├── .cvacc
│       └── .lrate
├── conf
│   ├── pitch.conf
│   ├── fbank.conf
│   └── mfcc.conf
├── run.sh
├── path.sh
├── utils
│   ├── training_trans_fst.py
│   ├── ctc_token_fst.py
│   ├── eps2disambig.pl
│   ├── spk2utt_to_utt2spk.pl
│   ├── s2eps.pl
│   ├── distribute_scp.pl
│   ├── build_const_arpa_lm.sh
│   ├── shuffle_list.pl
│   ├── utt2spk_to_spk2utt.pl
│   ├── best_wer.sh
│   ├── remove_oovs.pl
│   ├── int2sym.pl
│   ├── prep_scps.sh
│   ├── find_arpa_oovs.pl
│   ├── prep_ctc_trans_bkup.py
│   ├── subset_scp.pl
│   ├── prep_ctc_trans.py
│   ├── filter_scp.pl
│   ├── convert_ctm.pl
│   ├── create_data_link.pl
│   ├── sym2int.pl
│   ├── parse_options.sh
│   ├── pinyin_map.pl
│   ├── subset_data_dir_tr_cv.sh
│   ├── add_lex_disambig.pl
│   ├── split_data.sh
│   ├── format_lm_sri.sh
│   ├── ctc_compile_dict_token.sh
│   ├── model_topo.py
│   ├── fix_data_dir.sh
│   ├── run_rocks.pl
│   ├── make_lexicon_fst.pl
│   ├── subset_data_dir.sh
│   ├── split_scp.pl
│   ├── run.pl
│   └── validate_data_dir.sh
├── decode.sh
├── cmd.sh
├── local
│   ├── hkust_normalize.pl
│   ├── thchs-30_data_prep.sh
│   ├── thchs-30_prepare_phn_dict.sh
│   ├── thchs-30_decode_graph.sh
│   ├── hkust_train_lms.sh
│   └── score.sh
├── make_TLG_WFST.sh
├── feature.sh
├── README.md
├── train.sh
└── steps
    ├── align_ctc_single_utt.sh
    ├── decode_ctc.sh
    ├── compute_cmvn_stats.sh
    ├── make_fbank.sh
    ├── decode_ctc_lat.sh
    ├── make_fbank_pitch.sh
    ├── train_ctc_parallel.sh
    ├── train_ctc_parallel_h.sh
    └── train_ctc_parallel_x3.sh

/exp/model_l4_c320/.epoch:
--------------------------------------------------------------------------------
25
--------------------------------------------------------------------------------
/exp/model_l4_c320/.halving:
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
/exp/model_l4_c320/.cvacc:
--------------------------------------------------------------------------------
85.833
--------------------------------------------------------------------------------
/exp/model_l4_c320/.lrate:
--------------------------------------------------------------------------------
3.125e-07
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.epoch:
--------------------------------------------------------------------------------
19
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.halving:
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.cvacc:
--------------------------------------------------------------------------------
90.5068
--------------------------------------------------------------------------------
/exp/train_phn_l3_c320/.lrate:
--------------------------------------------------------------------------------
6.25e-07
--------------------------------------------------------------------------------
/conf/pitch.conf:
--------------------------------------------------------------------------------
--sample-frequency=16000
--------------------------------------------------------------------------------
/conf/fbank.conf:
--------------------------------------------------------------------------------
--num-mel-bins=40
--sample-frequency=16000
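
The four dotfiles per experiment directory are the trainer's record of its
learning-rate schedule: .epoch is the last finished epoch, .cvacc the
cross-validation accuracy, .halving whether learning-rate halving has kicked
in, and .lrate the current rate. Consistent with that reading (an inference
from the values, not stated anywhere in the repo), 3.125e-07 is exactly the
initial rate 0.00004 set in train.sh halved seven times: 0.00004 * 0.5^7 =
3.125e-07. A minimal sketch of reading the state back before resuming:

    dir=exp/model_l4_c320
    epoch=$(cat $dir/.epoch)   # e.g. 25
    lrate=$(cat $dir/.lrate)   # e.g. 3.125e-07
    echo "resuming at epoch $((epoch + 1)) with learn rate $lrate"
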
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

./make_TLG_WFST.sh

./feature.sh

./train.sh

./decode.sh
--------------------------------------------------------------------------------
/conf/mfcc.conf:
--------------------------------------------------------------------------------
--use-energy=false   # only non-default option.
--sample-frequency=16000  # THCHS-30 is sampled at 16kHz
--------------------------------------------------------------------------------
/path.sh:
--------------------------------------------------------------------------------
export EESEN_ROOT=`pwd`/../../..
export PATH=$PWD/utils/:$EESEN_ROOT/src/netbin:$EESEN_ROOT/src/featbin:$EESEN_ROOT/src/decoderbin:$EESEN_ROOT/src/fstbin:$EESEN_ROOT/tools/openfst/bin:$EESEN_ROOT/tools/irstlm/bin/:$PWD:$PATH
export LC_ALL=C

--------------------------------------------------------------------------------
/utils/training_trans_fst.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Apache 2.0

import sys

fread = open(sys.argv[1], 'r')

for entry in fread.readlines():
    entry = entry.replace('\n','').strip()
    fields = entry.split(' ')
    uttid = fields[0]

    for n in range(1, len(fields)):
        print str(n-1) + ' ' + str(n) + ' ' + fields[n] + ' ' + fields[n]

    print str(n) + ' ' + '0' + ' ' + '0' + ' ' + '0' # assume that <eps> is 0 in words.txt

    print '0'

fread.close()
--------------------------------------------------------------------------------
/decode.sh:
--------------------------------------------------------------------------------
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. path.sh

. parse_options.sh

model_dir=exp/model_l4_c320

echo =====================================================================
echo "                             Decoding                              "
echo =====================================================================
# decoding
steps/decode_ctc_lat.sh --cmd "$decode_cmd" --nj 5 --beam 17.0 --lattice_beam 8.0 --max-active 5000 --acwt 0.9 \
  data/search_Graph data/test $model_dir $model_dir/decode_test || exit 1;
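
Once decoding has run, local/score.sh (further down in this dump) leaves one
wer_N file per tested acoustic weight in the decode directory; assuming that
layout, the best operating point can be pulled out with:

    cat exp/model_l4_c320/decode_test/wer_* | utils/best_wer.sh
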
--------------------------------------------------------------------------------
/cmd.sh:
--------------------------------------------------------------------------------
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine).

#a) JHU cluster options
#export train_cmd="queue.pl -l arch=*64"
#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
#export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G"
#export cuda_cmd="queue.pl -l gpu=1"

#c) run it locally... works for CMU rocks cluster
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
--------------------------------------------------------------------------------
/utils/ctc_token_fst.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Apache 2.0

import sys

fread = open(sys.argv[1], 'r')

print '0 1 <eps> <eps>'
print '1 1 <blk> <eps>'
print '2 2 <blk> <eps>'
print '2 0 <eps> <eps>'

nodeX = 3
for entry in fread.readlines():
    entry = entry.replace('\n','').strip()
    fields = entry.split(' ')
    phone = fields[0]
    if phone == '<eps>' or phone == '<blk>':
        continue

    if '#' in phone:
        print str(0) + ' ' + str(0) + ' ' + '<eps>' + ' ' + phone;
    else:
        print str(1) + ' ' + str(nodeX) + ' ' + phone + ' ' + phone;
        print str(nodeX) + ' ' + str(nodeX) + ' ' + phone + ' ' + '<eps>';
        print str(nodeX) + ' ' + str(2) + ' ' + '<eps>' + ' ' + '<eps>';
    nodeX += 1
print '0'

fread.close()
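
A quick way to see the token FST (T.fst) this script emits is to run it on a
toy tokens file (contents hypothetical): states 1 and 2 absorb label
repetitions and blanks, and disambiguation symbols like #0 become <eps>
self-loops on the start state:

    $ printf '<eps>\n<blk>\na\n#0\n' > toy_tokens.txt
    $ python utils/ctc_token_fst.py toy_tokens.txt
    0 1 <eps> <eps>
    1 1 <blk> <eps>
    2 2 <blk> <eps>
    2 0 <eps> <eps>
    1 3 a a
    3 3 a <eps>
    3 2 <eps> <eps>
    0 0 <eps> #0
    0
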
--------------------------------------------------------------------------------
/utils/eps2disambig.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script replaces epsilon with #0 on the input side only, of the G.fst
# acceptor.

while(<>){
  s:^(\d+\s+\d+\s+)<eps>(\s+):$1#0$2:;
  print;
}
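
For example (a toy arc in OpenFst text format; symbols illustrative), an
epsilon-input arc of G.fst becomes a #0 arc, which keeps the grammar
determinizable after composition:

    $ echo '0 1 <eps> foo 2.5' | utils/eps2disambig.pl
    0 1 #0 foo 2.5
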
"\n"; 27 | } 28 | -------------------------------------------------------------------------------- /make_TLG_WFST.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. 3 | ## This relates to the queue. 4 | . path.sh 5 | 6 | . parse_options.sh 7 | H=`pwd` 8 | corpus_dir=$H/corpus 9 | 10 | echo ===================================================================== 11 | echo " TLG WFST Construction " 12 | echo ===================================================================== 13 | #Data preparation 14 | local/thchs-30_data_prep.sh $H $corpus_dir 15 | 16 | # Construct the phoneme-based dict. 17 | # We get 216 tokens, representing phonemes with tonality. 18 | local/thchs-30_prepare_phn_dict.sh || exit 1; 19 | 20 | # Compile the lexicon and token FSTs 21 | utils/ctc_compile_dict_token.sh --dict-type "phn" data/dict_phn data/lang_tmp data/lang || exit 1; 22 | 23 | # Train and compile LMs. 24 | #local/hkust_train_lms.sh corpus/train/text data/dict_phn/lexicon.txt data/language_model || exit 1; 25 | 26 | # Compile the language-model FST and the final decoding graph TLG.fst 27 | local/thchs-30_decode_graph.sh data/language_model data/lang data/search_Graph || exit 1; 28 | -------------------------------------------------------------------------------- /local/thchs-30_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. 3 | 4 | #This script pepares the data directory for thchs30 recipe. 5 | #It reads the corpus and get wav.scp and transcriptions. 6 | 7 | dir=$1 8 | corpus_dir=$2 9 | 10 | 11 | cd $dir 12 | echo ======================================== 13 | echo " Data Preparation " 14 | echo ======================================== 15 | echo "creating data/{train,dev,test}" 16 | mkdir -p data/{train,dev,test} 17 | 18 | #create wav.scp, utt2spk.scp, spk2utt.scp, text 19 | ( 20 | for x in train dev test; do 21 | echo "cleaning data/$x" 22 | cd $dir/data/$x 23 | rm -rf wav.scp utt2spk spk2utt word.txt text 24 | echo "preparing scps and text in data/$x" 25 | for nn in `find $corpus_dir/$x/*.wav | sort -u | xargs -i basename {} .wav`; do 26 | echo $nn $corpus_dir/$x/$nn.wav >> wav.scp 27 | echo $nn $nn >> utt2spk 28 | echo $nn $nn >> spk2utt 29 | echo $nn `sed -n 1p $corpus_dir/data/$nn.wav.trn` >> word.txt 30 | done 31 | cp word.txt text 32 | done 33 | ) || exit 1 34 | echo " Data prepration succeeded " 35 | echo -e "\n" 36 | -------------------------------------------------------------------------------- /local/thchs-30_prepare_phn_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script prepares the phoneme-based lexicon. It also generates the list of lexicon units 4 | # and represents the lexicon using the indices of the units. 5 | 6 | dir=data/dict_phn 7 | mkdir -p $dir 8 | srcdict=data/dict/lexicon.txt 9 | 10 | [ -f path.sh ] && . ./path.sh 11 | 12 | [ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1; 13 | 14 | echo ============================================== 15 | echo " Phoneme-based Dictionary Preparation " 16 | echo ============================================== 17 | 18 | # Raw dictionary preparation 19 | cat $srcdict | grep -v "!SIL" | \ 20 | perl -e 'while(<>){@A = split; if(! 
--------------------------------------------------------------------------------
/local/thchs-30_prepare_phn_dict.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# This script prepares the phoneme-based lexicon. It also generates the list of lexicon units
# and represents the lexicon using the indices of the units.

dir=data/dict_phn
mkdir -p $dir
srcdict=data/dict/lexicon.txt

[ -f path.sh ] && . ./path.sh

[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1;

echo ==============================================
echo "    Phoneme-based Dictionary Preparation    "
echo ==============================================

# Raw dictionary preparation
cat $srcdict | grep -v "!SIL" | \
  perl -e 'while(<>){@A = split; if(! $seen{$A[0]}) {$seen{$A[0]} = 1; print $_;}}' \
  > $dir/lexicon.txt || exit 1;

# Get the set of lexicon units without noises
cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u | awk '{print $1 " " NR}' > $dir/units.txt

# Convert phoneme sequences into the corresponding sequences of unit indices, encoded by units.txt
utils/sym2int.pl -f 2- $dir/units.txt < $dir/lexicon.txt > $dir/lexicon_numbers.txt

echo "Phoneme-based dictionary preparation succeeded"
echo -e "\n"
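
Putting the three outputs side by side on a toy entry (symbols and indices
illustrative): units.txt numbers each distinct unit by its line in the sorted
list, and sym2int.pl rewrites every pronunciation with those numbers:

    lexicon.txt:          好 h ao3
    units.txt:            ao3 7
                          h 33
    lexicon_numbers.txt:  好 33 7
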
--------------------------------------------------------------------------------
/utils/distribute_scp.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w

# Copyright 2015  Hang Su. Apache 2.0.

# This script splits an scp list either by length of the frames or in round-robin manner

$mode = 'frame';
if ($ARGV[0] eq '--mode') {
  shift @ARGV;
  $mode = shift @ARGV;
}

$num_jobs = $ARGV[0]; shift;
$base_filename = $ARGV[0]; shift;

@num_frames = (0) x $num_jobs;

foreach $i (1..$num_jobs) {
  local *FILE;
  open(FILE, "> $base_filename.$i.scp") || die;
  push(@file_handles, *FILE);
}

$count = 0;
while (<>) {
  chomp;
  if ($mode eq "utt") {
    $id = ($count % $num_jobs) ;
    print {$file_handles[$id]} $_,"\n";
  } elsif ($mode eq "frame") {
    @A = split /\s+/;
    $id_min = 0;
    $num_frames[$id_min] < $num_frames[$_] or $id_min = $_ for 1 .. $#num_frames; # find the smallest index
    $id = $id_min;
    $num_frames[$id_min] += $A[1];
    print {$file_handles[$id]} $A[0],"\n";
  } else {
    die "Un-recognized mode $mode!";
  }
  $count += 1;
}

$id_min = 0;
$num_frames[$id_min] < $num_frames[$_] or $id_min = $_ for 1 .. $#num_frames; # find the smallest index
print "$num_frames[$id_min]";
--------------------------------------------------------------------------------
/utils/build_const_arpa_lm.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# This script reads in an Arpa format language model, and converts it into the
# ConstArpaLm format language model.

# begin configuration section
# end configuration section

[ -f path.sh ] && . ./path.sh;

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: "
  echo "  $0 [options] <arpa-lm-path> <old-lang-dir> <new-lang-dir>"
  echo "e.g.:"
  echo "  $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed"
  echo "Options"
  exit 1;
fi

export LC_ALL=C

arpa_lm=$1
old_lang=$2
new_lang=$3

mkdir -p $new_lang
cp -r $old_lang/* $new_lang


unk=`cat $new_lang/oov.int`
bos=`grep "<s>" $new_lang/words.txt | awk '{print $2}'`
eos=`grep "</s>" $new_lang/words.txt | awk '{print $2}'`
if [[ -z $bos || -z $eos ]]; then
  echo "$0: <s> and </s> symbols are not in $new_lang/words.txt"
  exit 1
fi


arpa-to-const-arpa --bos-symbol=$bos \
  --eos-symbol=$eos --unk-symbol=$unk \
  "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1;

exit 0;
--------------------------------------------------------------------------------
/feature.sh:
--------------------------------------------------------------------------------
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. path.sh

. parse_options.sh

echo =====================================================================
echo "                    FBank Feature Generation                       "
echo =====================================================================
fbankdir=fbank

# Generate the fbank features; by default 40-dimensional fbanks on each frame
#make train fbank
steps/make_fbank.sh --cmd "$train_cmd" --nj 32 data/train exp/make_fbank/train $fbankdir || exit 1;
utils/fix_data_dir.sh data/train || exit;
steps/compute_cmvn_stats.sh data/train exp/make_fbank/train $fbankdir || exit 1;
echo -e "\n"

#make test fbank
steps/make_fbank.sh --cmd "$train_cmd" --nj 10 data/test exp/make_fbank/test $fbankdir || exit 1;
utils/fix_data_dir.sh data/test || exit;
steps/compute_cmvn_stats.sh data/test exp/make_fbank/test $fbankdir || exit 1;
echo -e "\n"

#make dev fbank
steps/make_fbank.sh --cmd "$train_cmd" --nj 10 data/dev exp/make_fbank/dev $fbankdir || exit 1;
utils/fix_data_dir.sh data/dev || exit;
steps/compute_cmvn_stats.sh data/dev exp/make_fbank/dev $fbankdir || exit 1;
echo -e "\n"
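
The fbank features are 40-dimensional (conf/fbank.conf); train.sh later turns
on --add-deltas, so deltas and double deltas triple this to the
input_feat_dim=120 it declares. Assuming the stock featbin tools from path.sh
are available, the base dimension can be checked with:

    feat-to-dim scp:data/train/feats.scp -   # should print 40
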
--------------------------------------------------------------------------------
/utils/shuffle_list.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


if ($ARGV[0] eq "--srand") {
  $n = $ARGV[1];
  $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\"";
  srand($ARGV[1]);
  shift;
  shift;
} else {
  srand(0); # Gives inconsistent behavior if we don't seed.
}

if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we
                                        # don't understand.
  print "Usage: shuffle_list.pl [--srand N] [input file] > output\n";
  print "randomizes the order of lines of input.\n";
  exit(1);
}

@lines = <>;
@lines = sort { rand() <=> rand() } @lines;
print @lines;
--------------------------------------------------------------------------------
/utils/utt2spk_to_spk2utt.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.

if ( @ARGV > 1 ) {
  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}

while(<>){
  @A = split(" ", $_);
  @A == 2 || die "Invalid line in utt2spk file: $_";
  ($u,$s) = @A;
  if(!$seen_spk{$s}) {
    $seen_spk{$s} = 1;
    push @spklist, $s;
  }
  push (@{$spk_hash{$s}}, "$u");
}
foreach $s (@spklist) {
  $l = join(' ',@{$spk_hash{$s}});
  print "$s $l\n";
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This recipe, adapted from the EESEN codebase, does Mandarin speech recognition on the Tsinghua corpus (data_thchs30).
===
## 1 What it offers:
1) Mandarin speech recognition out of the box

2) Other Chinese corpora can be plugged in for algorithm research

3) The WFST-based decoder can also be studied on its own, converting the phonemes produced by the acoustic model into words


## 2 Algorithm: BiLSTM + CTC + WFST

1) BiLSTM: 3 layers + 1 projection layer, 320 hidden units

2) CTC: 216 initial/final (tonal phoneme) labels + 1 blank label

3) WFST: CTC token FST (T.fst), lexicon FST (L.fst), language-model FST (G.fst)

## 3 Results:
1) CTC training label accuracy: about 92%

2) CTC cross-validation label accuracy: about 90%

3) Final decoding WER: about 25%



## 4 Files in this directory:

1) To run the project: ./run.sh

Each of the shell scripts it calls can also be run on its own, where

make_TLG_WFST.sh: generates TLG.fst. Run without arguments; output goes to data/{train,test,dev,lang,search_Graph}.

feature.sh: generates fbank features (40 + delta + double delta) from the wav audio. Run without arguments; output goes to data/{train,test,dev} and fbank.

train.sh: trains the acoustic model. Run without arguments; the network parameters can be edited inside the script. Output goes to exp/model_l$_c$.

decode.sh: decodes using the acoustic model and the WFST that integrates the language model. Output goes to exp/model_l$_c$/decode_test.


2) Data preparation:

Put the language model under data/language_model; its file format follows the Tsinghua one.

Put the lexicon lexicon.txt under data/dict.

Put the training data (wav + text) under corpus/train.

Put the test data (wav + text) under corpus/test.

Put the validation data (wav + text) under corpus/dev.

## 5 Installing EESEN:
1) To run this project you must install EESEN following the INSTALL instructions at https://github.com/srvk/eesen
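
Concretely, the layout the README asks you to prepare looks like this (paths
from the README; the corpus subdirectories hold wav files plus their
transcripts):

    data/language_model/   # n-gram LM, THCHS-30-style format
    data/dict/lexicon.txt  # pronunciation lexicon
    corpus/train/          # training wav + text
    corpus/test/           # test wav + text
    corpus/dev/            # validation wav + text
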
--------------------------------------------------------------------------------
/utils/best_wer.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# To be run from one directory above this script.

perl -e 'while(<>){
  s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g;
  if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
  elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|:
    && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite.
  if (defined $bestline){ print $bestline; } ' | \
  awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \
  awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \
  awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \
  sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||'



--------------------------------------------------------------------------------
/utils/remove_oovs.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script removes lines that contain these OOVs on either the
# third or fourth fields of the line. It is intended to remove arcs
# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).

if ( @ARGV < 1 || @ARGV > 2) {
  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
}

$unklist = shift @ARGV;
open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
while(<S>){
  @A = split(" ", $_);
  @A == 1 || die "Bad line in unknown-symbol list: $_";
  $unk{$A[0]} = 1;
}

$num_removed = 0;
while(<>){
  @A = split(" ", $_);
  if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
    $num_removed++;
  } else {
    print;
  }
}
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";

--------------------------------------------------------------------------------
/local/thchs-30_decode_graph.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#

if [ -f path.sh ]; then . path.sh; fi

lm_dir=$1
src_lang=$2
tgt_lang=$3

arpa_lm=${lm_dir}/lm.gz
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;

rm -rf $tgt_lang
cp -r $src_lang $tgt_lang

echo ==============================================================
echo "  Generating The Language Model FST And Composing TLG.fst  "
echo ==============================================================
# Compose the language model to FST
gunzip -c "$arpa_lm" | \
  grep -v '<s> <s>' | \
  grep -v '</s> <s>' | \
  grep -v '</s> </s>' | \
  arpa2fst - | fstprint | \
  utils/remove_oovs.pl /dev/null | \
  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
#exit 1;

echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic $tgt_lang/G.fst

# Compose the token, lexicon and language-model FST into the final decoding graph
fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;

echo "Composing decoding graph TLG.fst succeeded"
rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
echo -e "\n"
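
After this script succeeds, a cheap sanity check on the result, assuming the
OpenFst binaries from path.sh are on the PATH, is:

    fstinfo data/search_Graph/TLG.fst | head   # state/arc counts of the decoding graph
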
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. path.sh

. parse_options.sh


echo =====================================================================
echo "                          Model Training                           "
echo =====================================================================
# Specify network structure and generate the network topology
input_feat_dim=120  # dimension of the input features; we will use 40-dimensional fbanks with deltas and double deltas
lstm_layer_num=4    # number of LSTM layers
lstm_cell_dim=320   # number of memory cells in every LSTM layer

dir=exp/model_l${lstm_layer_num}_c${lstm_cell_dim}
mkdir -p $dir

target_num=`cat data/lang/units.txt | wc -l`; target_num=$[$target_num+1]; # #targets = #labels + 1 (the blank)

# Output the network topology
utils/model_topo.py --input-feat-dim $input_feat_dim --lstm-layer-num $lstm_layer_num --lstm-cell-dim $lstm_cell_dim --target-num $target_num --fgate-bias-init 1.0 > $dir/nnet.proto || exit 1;

# Label sequences; simply convert words into their label indices
utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/train/text "<UNK>" | gzip -c - > $dir/labels.tr.gz

utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/dev/text "<UNK>" | gzip -c - > $dir/labels.cv.gz

# Train the network with CTC. Refer to the script for details about the arguments
steps/train_ctc_parallel.sh --add-deltas true --num-sequence 10 --learn-rate 0.00004 --report-step 10 --halving-after-epoch 12 --feats-tmpdir $dir/XXXXX data/train data/dev $dir || exit 1;
echo -e "\n"
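
The target_num arithmetic matches the README: units.txt has one line per
label (216 tonal initials/finals here), and the "+1" is the CTC blank, so the
network ends up with 217 outputs:

    target_num=$(($(wc -l < data/lang/units.txt) + 1))   # 216 + 1 = 217
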
--------------------------------------------------------------------------------
/utils/int2sym.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

undef $field_begin;
undef $field_end;


if ($ARGV[0] eq "-f") {
  shift @ARGV;
  $field_spec = shift @ARGV;
  if ($field_spec =~ m/^\d+$/) {
    $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
  }
  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
    if ($1 ne "") {
      $field_begin = $1 - 1; # Change to zero-based indexing.
    }
    if ($2 ne "") {
      $field_end = $2 - 1; # Change to zero-based indexing.
    }
  }
  if (!defined $field_begin && !defined $field_end) {
    die "Bad argument to -f option: $field_spec";
  }
}
$symtab = shift @ARGV;
if(!defined $symtab) {
  print STDERR "Usage: int2sym.pl [options] symtab [input] > output\n" .
    "options: [-f (<field>|<field_start>-<field-end>)]\n" .
    "e.g.: -f 2, or -f 3-4\n";
  exit(1);
}

open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
  @A = split(" ", $_);
  @A == 2 || die "bad line in symbol table file: $_";
  $int2sym{$A[1]} = $A[0];
}

sub int2sym {
  my $a = shift @_;
  my $pos = shift @_;
  if($a !~ m:^\d+$:) { # not all digits..
    $pos1 = $pos+1; # make it one-based.
    die "int2sym.pl: found noninteger token $a [in position $pos1]\n";
  }
  $s = $int2sym{$a};
  if(!defined ($s)) {
    die "int2sym.pl: integer $a not in symbol table $symtab.";
  }
  return $s;
}

$error = 0;
while (<>) {
  @A = split(" ", $_);
  for ($pos = 0; $pos <= $#A; $pos++) {
    $a = $A[$pos];
    if ( (!defined $field_begin || $pos >= $field_begin)
         && (!defined $field_end || $pos <= $field_end)) {
      $a = int2sym($a, $pos);
    }
    print $a . " ";
  }
  print "\n";
}



--------------------------------------------------------------------------------
/utils/prep_scps.sh:
--------------------------------------------------------------------------------
#!/bin/bash
{
# Copyright 2015  Hang Su
# Apache 2.0

# This script prepares feature scp files for CTC training

set -e
set -o pipefail

## Begin configuration section
clean_up=true
seed=
cmd=
nj=1
# End of configuration

echo "$0 $@" # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh;

. utils/parse_options.sh || exit 1;

if [ $# != 6 ]; then
  echo "Usage: $0 <feat_tr> <feat_cv> <num_sequence> <frame_num_limit> <tmpdir> <dir>"
  echo " e.g.: "
  exit 1
fi

feat_tr=$1
feat_cv=$2
num_sequence=$3
frame_num_limit=$4
tmpdir=$5
dir=$6

for part in tr cv; do
  feat=$(eval echo "\$feat_${part}")

  feat-to-len scp:$feat ark,t:- | sort -k2 -n | \
    awk -v num_sequence=$num_sequence -v frame_num_limit=$frame_num_limit '
    BEGIN {max_frame_num = 0; num_utts = 0;}
    {
      printf("%s ",$1);
      num_utts++;
      if (max_frame_num < $2) {
        max_frame_num = $2;
      }
      if (num_utts >= num_sequence || num_utts * max_frame_num > frame_num_limit) {
        printf("\n");
        num_utts = 0;
        max_frame_num = 0;
      }
    }' | utils/shuffle_list.pl --srand ${seed:-777} > $dir/batch.$part.list

  split_batches=""
  for n in $(seq $nj); do
    split_batches="$split_batches $tmpdir/batch.$part.$n.list"
  done
  utils/split_scp.pl $dir/batch.$part.list $split_batches

  for n in $(seq $nj); do
    awk '
    NR==FNR {a[$1]=$2;next}
    {
      for (i=1; i<=NF; i++) {
        printf("%s %s\n", $i, a[$i]);
      }
    }' $feat $tmpdir/batch.$part.$n.list > $tmpdir/batch.$part.$n.scp
  done
  if [ $nj -ne 1 ]; then
    $cmd JOB=1:$nj $dir/log/prepare_feats_$part.JOB.log \
      copy-feats scp:$tmpdir/batch.$part.JOB.scp ark,scp:$tmpdir/feats_$part.JOB.ark,$dir/feats_$part.JOB.scp
  else
    copy-feats scp:$tmpdir/batch.$part.1.scp ark,scp:$tmpdir/feats_$part.1.ark,$dir/feats_$part.1.scp
  fi

done

}
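
A minimal round trip for int2sym.pl (toy symbol table, contents hypothetical):

    $ printf '<eps> 0\nhello 1\nworld 2\n' > toy_words.txt
    $ echo 'utt1 1 2' | utils/int2sym.pl -f 2- toy_words.txt
    utt1 hello world
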
--------------------------------------------------------------------------------
/utils/find_arpa_oovs.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


if ( @ARGV < 1 || @ARGV > 2) {
  die "Usage: find_arpa_oovs.pl words.txt [lm.arpa]\n";
  # This program finds words in the arpa file that are not symbols
  # in the OpenFst-format symbol table words.txt. It prints them
  # on the standard output, one per line.
}

$symtab = shift @ARGV;
open(S, "<$symtab") || die "Failed opening symbol table file $symtab\n";
while(<S>){
  @A = split(" ", $_);
  @A == 2 || die "Bad line in symbol table file: $_";
  $seen{$A[0]} = 1;
}

$curgram=0;
while(<>) { # Find the \data\ marker.
  if(m:^\\data\\$:) { last; }
}
while(<>) {
  if(m/^\\(\d+)\-grams:\s*$/) {
    $curgram = $1;
    if($curgram > 1) {
      last; # This is an optimization as we can get the vocab from the 1-grams
    }
  } elsif($curgram > 0) {
    @A = split(" ", $_);
    if(@A > 1) {
      shift @A;
      for($n=0;$n<$curgram;$n++) {
        $word = $A[$n];
        if(!defined $word) { print STDERR "Unusual line $_ (line $.) in arpa file.\n"; }
        $in_arpa{$word} = 1;
      }
    } else {
      if(@A > 0 && $A[0] !~ m:\\end\\:) {
        print STDERR "Unusual line $_ (line $.) in arpa file\n";
      }
    }
  }
}

foreach $w (keys %in_arpa) {
  if(!defined $seen{$w} && $w ne "<s>" && $w ne "</s>") {
    print "$w\n";
  }
}
--------------------------------------------------------------------------------
/utils/prep_ctc_trans_bkup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright 2015  Yajie Miao (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This python script converts the word-based transcripts into label sequences. The labels are
# represented by their indices.

import sys

if __name__ == '__main__':

    if len(sys.argv) != 4:
        print "Usage: {0} <dict_file> <trans_file> <unk_word>".format(sys.argv[0])
        print "e.g., utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/train/text <UNK>"
        print "<dict_file> - the lexicon file in which entries have been represented by indices"
        print "<trans_file> - the word-based transcript file"
        print "<unk_word> - the word which represents OOVs in transcripts"
        exit(1)

    dict_file = sys.argv[1]
    trans_file = sys.argv[2]
    unk_word = sys.argv[3]

    # read the lexicon into a dictionary data structure
    fread = open(dict_file,'r')
    dict = {}
    for line in fread.readlines():
        line = line.replace('\n','')
        splits = line.split(' ') # assume there are no multiple spaces
        word = splits[0]
        letters = ''
        for n in range(1, len(splits)):
            letters += splits[n] + ' '
        dict[word] = letters.strip()
    fread.close()

    # assume that each line is formatted as "uttid word1 word2 word3 ...", with no multiple spaces appearing
    fread = open(trans_file,'r')
    for line in fread.readlines():
        out_line = ''
        line = line.replace('\n','').strip();
        splits = line.split(' ');

        out_line += splits[0] + ' '
        for n in range(1, len(splits)):
            try:
                out_line += dict[splits[n]] + ' '
            except Exception:
                out_line += dict[unk_word] + ' '
        print out_line.strip()
--------------------------------------------------------------------------------
/local/hkust_train_lms.sh:
--------------------------------------------------------------------------------
#!/bin/bash


# To be run from one directory above this script.


#text=data/local/train/text
#lexicon=data/local/dict/lexicon.txt
text=$1
lexicon=$2
dir=$3
for f in "$text" "$lexicon"; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

# Unlike the original Switchboard version (which took no arguments and read
# data/local/train/text and data/local/dict/lexicon.txt directly), this copy
# takes the text, the lexicon and the output directory as $1, $2 and $3.
mkdir -p $dir
export LC_ALL=C # You'll get errors about things being not sorted, if you
                # have a different locale.
export PATH=$PATH:`pwd`/../../tools/kaldi_lm


cleantext=$dir/text.no_oov

cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<UNK> ");} } printf("\n");}' \
  > $cleantext || exit 1;


cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;


# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# note: we probably won't really make use of <UNK> as there aren't any OOVs
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map \
  || exit 1;

# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz


# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).
heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $sdir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $sdir/train

cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist


ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
  -map-unk "<UNK>" -interpolate -lm $sdir/lm.kn.gz
ngram -lm $sdir/lm.kn.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
--------------------------------------------------------------------------------
/utils/subset_scp.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program selects a subset of N elements in the scp.

# By default, it selects them evenly from throughout the scp, in order to avoid
# selecting too many from the same speaker. It prints them on the standard
# output.
# With the option --first, it just selects the N first utterances.
# With the option --last, it just selects the N last utterances.

# Last modified by JHU & HKUST @2013


$quiet = 0;
$first = 0;
$last = 0;
if ($ARGV[0] eq "--quiet") {
  shift;
  $quiet = 1;
}
if ($ARGV[0] eq "--first") {
  shift;
  $first = 1;
}
if ($ARGV[0] eq "--last") {
  shift;
  $last = 1;
}

if(@ARGV < 2 ) {
  die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
    " --quiet causes it to not die if N < num lines in scp.\n" .
    " --first and --last make it equivalent to head or tail.\n";
}

$N = shift @ARGV;
if($N == 0) {
  die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
}
$inscp = shift @ARGV;
open(I, "<$inscp") || die "Opening input scp file $inscp";

@F = ();
while(<I>) {
  push @F, $_;
}
$numlines = @F;
if($N > $numlines) {
  if ($quiet) {
    $N = $numlines;
  } else {
    die "You requested from subset_scp.pl more elements than available: $N > $numlines";
  }
}

sub select_n {
  my ($start,$end,$num_needed) = @_;
  my $diff = $end - $start;
  if($num_needed > $diff) { die "select_n: code error"; }
  if($diff == 1 ) {
    if($num_needed > 0) {
      print $F[$start];
    }
  } else {
    my $halfdiff = int($diff/2);
    my $halfneeded = int($num_needed/2);
    select_n($start, $start+$halfdiff, $halfneeded);
    select_n($start+$halfdiff, $end, $num_needed - $halfneeded);
  }
}

if ( ! $first && ! $last) {
  select_n(0, $numlines, $N);
} else {
  if ($first) { # --first option: same as head.
    for ($n = 0; $n < $N; $n++) {
      print $F[$n];
    }
  } else { # --last option: same as tail.
    for ($n = @F - $N; $n < @F; $n++) {
      print $F[$n];
    }
  }
}
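
Typical use, per the usage notes above (file names illustrative):

    utils/subset_scp.pl 1000 data/train/feats.scp > subset.scp      # 1000 utts, spread evenly
    utils/subset_scp.pl --first 1000 data/train/feats.scp > head.scp  # same as head -1000
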
--------------------------------------------------------------------------------
/utils/prep_ctc_trans.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright 2015  Yajie Miao (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This python script converts the word-based transcripts into label sequences. The labels are
# represented by their indices.

import sys

if __name__ == '__main__':

    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print "Usage: {0} <dict_file> <trans_file> <unk_word> [space_word]".format(sys.argv[0])
        print "e.g., utils/prep_ctc_trans.py data/lang/lexicon_numbers.txt data/train/text <UNK>"
        print "<dict_file> - the lexicon file in which entries have been represented by indices"
        print "<trans_file> - the word-based transcript file"
        print "<unk_word> - the word which represents OOVs in transcripts"
        print "[space_word] - optional, the word representing spaces in the transcripts"
        exit(1)

    dict_file = sys.argv[1]
    trans_file = sys.argv[2]
    unk_word = sys.argv[3]

    is_char = False
    if len(sys.argv) == 5:
        is_char = True
        space_word = sys.argv[4]

    # read the lexicon into a dictionary data structure
    fread = open(dict_file,'r')
    dict = {}
    for line in fread.readlines():
        line = line.replace('\n','')
        splits = line.split(' ') # assume there are no multiple spaces
        word = splits[0]
        letters = ''
        for n in range(1, len(splits)):
            letters += splits[n] + ' '
        dict[word] = letters.strip()
    fread.close()

    # assume that each line is formatted as "uttid word1 word2 word3 ...", with no multiple spaces appearing
    fread = open(trans_file,'r')
    for line in fread.readlines():
        out_line = ''
        line = line.replace('\n','').strip()
        while '  ' in line:
            line = line.replace('  ', ' ') # remove multiple spaces in the transcripts

        uttid = line.split(' ')[0] # the first field is always utterance id
        trans = line.replace(uttid, '').strip()
        if is_char:
            trans = trans.replace(' ', ' ' + space_word + ' ')
        splits = trans.split(' ')

        out_line += uttid + ' '
        for n in range(0, len(splits)):
            try:
                out_line += dict[splits[n]] + ' '
            except Exception:
                out_line += dict[unk_word] + ' '
        print out_line.strip()
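
A toy run (files hypothetical): with a lexicon_numbers.txt containing
"<UNK> 1" and "好 33 7", an out-of-vocabulary word falls back to the <UNK>
entry:

    $ printf 'utt1 好 什么\n' > toy_text
    $ utils/prep_ctc_trans.py toy_lexicon_numbers.txt toy_text '<UNK>'
    utt1 33 7 1
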
--------------------------------------------------------------------------------
/utils/filter_scp.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright 2010-2012  Microsoft Corporation
#                      Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f switch

$exclude = 0;
$field = 1;
$shifted = 0;

do {
  $shifted=0;
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
    $shifted=1;
  }
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
    $shifted=1
  }
} while ($shifted);

if(@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
    "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
    "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
    "only the lines that were *not* in id_list.\n" .
    "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
    "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
    "-f option, add 1 to the argument.\n";
}


$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
  @A = split;
  @A>=1 || die "Invalid id-list file line $_";
  $seen{$A[0]} = 1;
}

if ($field == 1) { # Treat this as special case, since it is common.
  while(<>) {
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
      print $_;
    }
  }
} else {
  while(<>) {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
      print $_;
    }
  }
}

# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
--------------------------------------------------------------------------------
/utils/convert_ctm.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey). Apache 2.0.

# This takes as standard input a ctm file that's "relative to the utterance",
# i.e. times are measured relative to the beginning of the segments, and it
# uses a "segments" file (format:
# utterance-id recording-id start-time end-time
# ) and a "reco2file_and_channel" file (format:
# recording-id basename-of-file channel
# ) to produce a ctm that's relative to the whole recording.

$skip_unknown=undef;
if ( $ARGV[0] eq "--skip-unknown" ) {
  $skip_unknown=1;
  shift @ARGV;
}

if (@ARGV < 2 || @ARGV > 3) {
  print STDERR "Usage: convert_ctm.pl <segments-file> <reco2file_and_channel-file> [<utterance-ctm>] > real-ctm\n";
  exit(1);
}

$segments = shift @ARGV;
$reco2file_and_channel = shift @ARGV;

open(S, "<$segments") || die "opening segments file $segments";
while(<S>) {
  @A = split(" ", $_);
  @A == 4 || die "Bad line in segments file: $_";
  ($utt, $recording_id, $begin_time, $end_time) = @A;
  $utt2reco{$utt} = $recording_id;
  $begin{$utt} = $begin_time;
  $end{$utt} = $end_time;
}
close(S);
open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel";
while(<R>) {
  @A = split(" ", $_);
  @A == 3 || die "Bad line in reco2file_and_channel file: $_";
  ($recording_id, $file, $channel) = @A;
  $reco2file{$recording_id} = $file;
  $reco2channel{$recording_id} = $channel;
}


# Now process the ctm file, which is either the standard input or the third
# command-line argument.
$num_done = 0;
while(<>) {
  @A= split(" ", $_);
  ( @A == 5 || @A == 6 ) || die "Unexpected ctm format: $_";
  # lines look like:
  # <utterance-id> 1 <begin-time> <length> <word> [ confidence ]
  ($utt, $one, $wbegin, $wlen, $w, $conf) = @A;
  $reco = $utt2reco{$utt};
  if (!defined $reco) {
    next if defined $skip_unknown;
    die "Utterance-id $utt not defined in segments file $segments";
  }
  $file = $reco2file{$reco};
  $channel = $reco2channel{$reco};
  if (!defined $file || !defined $channel) {
    die "Recording-id $reco not defined in reco2file_and_channel file $reco2file_and_channel";
  }
  $b = $begin{$utt};
  $e = $end{$utt};
  $wbegin_r = $wbegin + $b; # Make it relative to beginning of the recording.
  $wbegin_r = sprintf("%.2f", $wbegin_r);
  $wlen = sprintf("%.2f", $wlen);
  if (defined $conf) {
    $line = "$file $channel $wbegin_r $wlen $w $conf\n";
  } else {
    $line = "$file $channel $wbegin_r $wlen $w\n";
  }
  if ($wbegin_r + $wlen > $e + 0.01) {
    print STDERR "Warning: word appears to be past end of recording; line is $line";
  }
  print $line; # goes to stdout.
  $num_done++;
}

if ($num_done == 0) { exit 1; } else { exit 0; }

__END__

# Test example [also test it without the 0.5's]
echo utt reco 10.0 20.0 > segments
echo reco file A > reco2file_and_channel
echo utt 1 8.0 1.0 word 0.5 > ctm_in
echo file A 18.00 1.00 word 0.5 > ctm_out
utils/convert_ctm.pl segments reco2file_and_channel ctm_in | cmp - ctm_out || echo error
rm segments reco2file_and_channel ctm_in ctm_out
However, the job submission script can only take integers as the 13 | # job marker. That's why we set the acwt to be integers (5 ~ 10), but scale them with 0.1 14 | # when they are actually used. 15 | #end configuration section. 16 | 17 | [ -f ./path.sh ] && . ./path.sh 18 | . parse_options.sh || exit 1; 19 | 20 | if [ $# -ne 3 ]; then 21 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " 22 | echo " Options:" 23 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 24 | echo " --min_acwt # minumum LM-weight for lattice rescoring " 25 | echo " --max_acwt # maximum LM-weight for lattice rescoring " 26 | exit 1; 27 | fi 28 | 29 | data=$1 30 | lang_or_graph=$2 31 | dir=$3 32 | 33 | symtab=$lang_or_graph/words.txt 34 | 35 | for f in $symtab $dir/lat.1.gz $data/text; do 36 | [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; 37 | done 38 | 39 | mkdir -p $dir/scoring/log 40 | 41 | function filter_text { 42 | perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } 43 | while() { @A = split(" ", $_); $id = shift @A; print "$id "; 44 | foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ 45 | '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' 46 | } 47 | filter_text <$data/text >$dir/scoring/text.filt 48 | 49 | $cmd ACWT=$min_acwt:$max_acwt $dir/scoring/log/best_path.ACWT.log \ 50 | lattice-scale --acoustic-scale=ACWT --ascale-factor=$acwt_factor "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 51 | lattice-best-path --word-symbol-table=$symtab ark:- ark,t:$dir/scoring/ACWT.tra || exit 1; 52 | 53 | for acwt in `seq $min_acwt $max_acwt`; do 54 | cat $dir/scoring/${acwt}.tra | utils/int2sym.pl -f 2- $symtab | \ 55 | filter_text > $dir/scoring/$acwt.txt || exit 1; 56 | done 57 | 58 | unset LC_ALL 59 | #for character error rate 60 | cat $dir/scoring/text.filt | awk '{ print $1}' > $dir/scoring/utt_id 61 | cat $dir/scoring/text.filt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra 62 | paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/char.filt 63 | 64 | for acwt in `seq $min_acwt $max_acwt`; do 65 | cat $dir/scoring/$acwt.txt | awk '{ print $1}' > $dir/scoring/utt_id 66 | cat $dir/scoring/$acwt.txt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra 67 | paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/${acwt}.char 68 | done 69 | 70 | rm $dir/scoring/utt_tra $dir/scoring/utt_id 71 | 72 | export LC_ALL=C 73 | 74 | $cmd ACWT=$min_acwt:$max_acwt $dir/scoring/log/score.ACWT.log \ 75 | compute-wer --text --mode=present \ 76 | ark:$dir/scoring/text.filt ark:$dir/scoring/ACWT.txt ">&" $dir/wer_ACWT || exit 1; 77 | 78 | $cmd ACWT=$min_acwt:$max_acwt $dir/scoring/log/score.ACWT.cer.log \ 79 | compute-wer --text --mode=present \ 80 | ark:$dir/scoring/char.filt ark:$dir/scoring/ACWT.char ">&" $dir/cer_ACWT || exit 1; 81 | 82 | exit 0; 83 | -------------------------------------------------------------------------------- /utils/create_data_link.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013 Guoguo Chen 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0. 6 | # 7 | # This script distributes data onto different file systems by making symbolic 8 | # links. 
It is supposed to use together with utils/create_split_dir.pl, which 9 | # creates a "storage" directory that links to different file systems. 10 | # 11 | # If a sub-directory egs/storage does not exist, it does nothing. If it exists, 12 | # then it selects pseudo-randomly a number from those available in egs/storage/* 13 | # creates a link such as 14 | # 15 | # egs/egs.3.4.ark -> storage/4/egs.3.4.ark 16 | # 17 | use strict; 18 | use warnings; 19 | use File::Basename; 20 | use File::Spec; 21 | use Getopt::Long; 22 | 23 | sub GetGCD { 24 | my ($a, $b) = @_; 25 | while ($a != $b) { 26 | if ($a > $b) { 27 | $a = $a - $b; 28 | } else { 29 | $b = $b - $a; 30 | } 31 | } 32 | return $a; 33 | } 34 | 35 | my $Usage = < storage/4/egs.3.4.ark 45 | 46 | Usage: utils/create_data_link.pl 47 | e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark 48 | 49 | See also utils/remove_data_links.sh 50 | EOU 51 | 52 | GetOptions(); 53 | 54 | if (@ARGV != 1) { 55 | die $Usage; 56 | } 57 | 58 | my $fullpath = shift(@ARGV); 59 | 60 | # Check if the storage has been created. If so, do nothing. 61 | my $dirname = dirname($fullpath); 62 | if (! -d "$dirname/storage") { 63 | exit(0); 64 | } 65 | 66 | # Storage exists, create symbolic links in the next few steps. 67 | 68 | # First, get a list of the available storage direstories, and check if they are 69 | # properly created. 70 | opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n"; 71 | my @storage_dirs = grep(/^[0-9]*$/, readdir($dh)); 72 | closedir($dh); 73 | my $num_storage = scalar(@storage_dirs); 74 | for (my $x = 1; $x <= $num_storage; $x++) { 75 | (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n"; 76 | } 77 | 78 | # Second, get the coprime list. 79 | my @coprimes; 80 | for (my $n = 1; $n < $num_storage; $n++) { 81 | if (GetGCD($n, $num_storage) == 1) { 82 | push(@coprimes, $n); 83 | } 84 | } 85 | 86 | # Finally, work out the directory index where we should put the data to. 87 | my $basename = basename($fullpath); 88 | my $filename_numbers = $basename; 89 | $filename_numbers =~ s/[^0-9]+/ /g; 90 | my @filename_numbers = split(" ", $filename_numbers); 91 | my $total = 0; 92 | my $index = 0; 93 | foreach my $x (@filename_numbers) { 94 | if ($index >= scalar(@coprimes)) { 95 | $index = 0; 96 | } 97 | $total += $x * $coprimes[$index]; 98 | $index++; 99 | } 100 | my $dir_index = $total % $num_storage + 1; 101 | 102 | # Make the symbolic link. 103 | if (-e $fullpath) { 104 | unlink($fullpath); 105 | } 106 | my $ret = symlink("storage/$dir_index/$basename", $fullpath); 107 | exit($ret == 1 ? 0 : 1); 108 | -------------------------------------------------------------------------------- /steps/align_ctc_single_utt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Yajie Miao 4 | # Apache 2.0 5 | 6 | # Generate word-level alignment for a single utterance. 7 | 8 | ## Begin configuration section 9 | stage=0 10 | cmd=run.pl 11 | num_threads=1 12 | 13 | max_active=7000 # max-active 14 | beam=15.0 # beam used 15 | lattice_beam=8.0 16 | max_mem=50000000 # approx. 
limit to memory consumption during minimization in bytes 17 | 18 | acoustic_scale=0.6 # the acoustic scale to be used 19 | oov_word="<UNK>" # the oov word, used to convert oov words in the transcripts 20 | # feature configurations; will be read from the training dir if not provided 21 | norm_vars= 22 | add_deltas= 23 | ## End configuration section 24 | 25 | echo "$0 $@" # Print the command line for logging 26 | 27 | [ -f ./path.sh ] && . ./path.sh; 28 | . parse_options.sh || exit 1; 29 | 30 | if [ $# != 5 ]; then 31 | echo "Wrong #arguments ($#, expected 5)" 32 | echo "Usage: steps/align_ctc_single_utt.sh [options] <lang-dir> <data-dir> <utt-data-dir> <model-dir> <align-dir>" 33 | echo " e.g.: steps/align_ctc_single_utt.sh data/lang_phn data/train data/uttdata exp/train_phn_l5_c320 exp/train_phn_l5_c320/align" 34 | echo "main options (for others, see top of script file)" 35 | echo " --stage # starts from which stage" 36 | echo " --nj # number of parallel jobs" 37 | echo " --cmd # command to run in parallel with" 38 | echo " --acoustic_scale # default 0.6 the value of acoustic scale to be used" 39 | exit 1; 40 | fi 41 | 42 | langdir=$1 43 | data=$2 44 | uttdata=$3 45 | mdldir=$4 46 | dir=`echo $5 | sed 's:/$::g'` # remove any trailing slash. 47 | 48 | thread_string= 49 | [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" 50 | 51 | [ -z "$add_deltas" ] && add_deltas=`cat $mdldir/add_deltas 2>/dev/null` 52 | [ -z "$norm_vars" ] && norm_vars=`cat $mdldir/norm_vars 2>/dev/null` 53 | 54 | mkdir -p $dir/log 55 | 56 | # Check if necessary files exist. 57 | for f in $mdldir/final.nnet $mdldir/label.counts $data/feats.scp $uttdata/feats.scp $uttdata/text; do 58 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 59 | done 60 | 61 | ## Set up the features 62 | echo "$0: feature: norm_vars(${norm_vars}) add_deltas(${add_deltas})" 63 | feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$uttdata/feats.scp ark:- |" 64 | $add_deltas && feats="$feats add-deltas ark:- ark:- |" 65 | ## 66 | 67 | ## Create the "decoding" graph for this utterance 68 | oov_int=`grep $oov_word $langdir/words.txt | awk '{print $2}'` 69 | 70 | utils/sym2int.pl --map-oov $oov_int -f 2- $langdir/words.txt $uttdata/text > $dir/text_int 71 | 72 | utils/training_trans_fst.py $dir/text_int | fstcompile | fstarcsort --sort_type=olabel > $dir/G.fst 73 | 74 | fsttablecompose ${langdir}/L.fst $dir/G.fst | fstdeterminizestar --use-log=true | \ 75 | fstminimizeencoded | fstarcsort --sort_type=ilabel > $dir/LG.fst || exit 1; 76 | 77 | fsttablecompose ${langdir}/T.fst $dir/LG.fst > $dir/TLG.fst || exit 1; 78 | 79 | ## Generate alignments 80 | net-output-extract --class-frame-counts=$mdldir/label.counts --apply-log=true $mdldir/final.nnet "$feats" ark:- | \ 81 | latgen-faster --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$lattice_beam \ 82 | --acoustic-scale=$acoustic_scale --word-symbol-table=$langdir/words.txt --allow-partial=true $dir/TLG.fst ark:- ark:- | \ 83 | lattice-1best --acoustic-scale=$acoustic_scale --ascale-factor=1 ark:- ark:- | \ 84 | nbest-to-ctm ark:- - | \ 85 | utils/int2sym.pl -f 5 $langdir/words.txt > $dir/ali 86 | 87 | exit 0; 88 | -------------------------------------------------------------------------------- /utils/sym2int.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 3 | 4 | # Licensed under the Apache License, Version 2.0
(the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | $ignore_oov = 0; 19 | 20 | for($x = 0; $x < 2; $x++) { 21 | if ($ARGV[0] eq "--map-oov") { 22 | shift @ARGV; 23 | $map_oov = shift @ARGV; 24 | if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { 25 | # disallow '-f', the empty string and anything ending in words.txt as the 26 | # OOV symbol because these are likely command-line errors. 27 | die "the --map-oov option requires an argument"; 28 | } 29 | } 30 | if ($ARGV[0] eq "-f") { 31 | shift @ARGV; 32 | $field_spec = shift @ARGV; 33 | if ($field_spec =~ m/^\d+$/) { 34 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 35 | } 36 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 37 | if ($1 ne "") { 38 | $field_begin = $1 - 1; # Change to zero-based indexing. 39 | } 40 | if ($2 ne "") { 41 | $field_end = $2 - 1; # Change to zero-based indexing. 42 | } 43 | } 44 | if (!defined $field_begin && !defined $field_end) { 45 | die "Bad argument to -f option: $field_spec"; 46 | } 47 | } 48 | } 49 | 50 | $symtab = shift @ARGV; 51 | if (!defined $symtab) { 52 | print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . 53 | "options: [--map-oov <oov-symbol>] [-f <field-range>]\n" .
54 | "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n"; 55 | } 56 | open(F, "<$symtab") || die "Error opening symbol table file $symtab"; 57 | while(<F>) { 58 | @A = split(" ", $_); 59 | @A == 2 || die "bad line in symbol table file: $_"; 60 | $sym2int{$A[0]} = $A[1] + 0; 61 | } 62 | 63 | if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up 64 | if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } 65 | $map_oov = $sym2int{$map_oov}; 66 | } 67 | 68 | $num_warning = 0; 69 | $max_warning = 20; 70 | 71 | while (<>) { 72 | @A = split(" ", $_); 73 | @B = (); 74 | for ($n = 0; $n < @A; $n++) { 75 | $a = $A[$n]; 76 | if ( (!defined $field_begin || $n >= $field_begin) 77 | && (!defined $field_end || $n <= $field_end)) { 78 | $i = $sym2int{$a}; 79 | if (!defined ($i)) { 80 | if (defined $map_oov) { 81 | if ($num_warning++ < $max_warning) { 82 | print STDERR "sym2int.pl: replacing $a with $map_oov\n"; 83 | if ($num_warning == $max_warning) { 84 | print STDERR "sym2int.pl: not warning for OOVs any more times\n"; 85 | } 86 | } 87 | $i = $map_oov; 88 | } else { 89 | $pos = $n+1; 90 | die "sym2int.pl: undefined symbol $a (in position $pos)\n"; 91 | } 92 | } 93 | $a = $i; 94 | } 95 | push @B, $a; 96 | } 97 | print join(" ", @B); 98 | print "\n"; 99 | } 100 | if ($num_warning > 0) { 101 | print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; 102 | } 103 | 104 | exit(0); 105 | -------------------------------------------------------------------------------- /steps/decode_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 4 | # 2015 Yajie Miao (Carnegie Mellon University) 5 | # Apache 2.0 6 | 7 | # Decode the CTC-trained model. Currently we are using the simplest best-path decoding. Lattice-based 8 | # decoding and other formats of decoding outputs (e.g., CTM) will be added in our future development. 9 | 10 | 11 | ## Begin configuration section 12 | stage=0 13 | nj=16 14 | cmd=run.pl 15 | num_threads=1 16 | 17 | max_active=7000 # max-active 18 | beam=15.0 # beam used 19 | 20 | skip_scoring=false # whether to skip WER scoring 21 | acoustic_scales="0.5 0.6 0.7 0.8" # the acoustic scales to be used 22 | 23 | # feature configurations; will be read from the training dir if not provided 24 | norm_vars= 25 | add_deltas= 26 | ## End configuration section 27 | 28 | echo "$0 $@" # Print the command line for logging 29 | 30 | [ -f ./path.sh ] && . ./path.sh; 31 | . parse_options.sh || exit 1; 32 | 33 | if [ $# != 3 ]; then 34 | echo "Wrong #arguments ($#, expected 3)" 35 | echo "Usage: steps/decode_ctc.sh [options] <graph-dir> <data-dir> <decode-dir>" 36 | echo " e.g.: steps/decode_ctc.sh data/lang data/test exp/train_l4_c320/decode" 37 | echo "main options (for others, see top of script file)" 38 | echo " --stage # starts from which stage" 39 | echo " --nj # number of parallel jobs" 40 | echo " --cmd # command to run in parallel with" 41 | echo " --acoustic_scales # default 0.5 0.6 0.7 0.8 ... the values of acoustic scales to be used" 42 | exit 1; 43 | fi 44 | 45 | graphdir=$1 46 | data=$2 47 | dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. 48 | 49 | srcdir=`dirname $dir`; # assume model directory one level up from decoding directory.
50 | sdata=$data/split$nj; 51 | 52 | thread_string= 53 | [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" 54 | 55 | [ -z "$add_deltas" ] && add_deltas=`cat $srcdir/add_deltas 2>/dev/null` 56 | [ -z "$norm_vars" ] && norm_vars=`cat $srcdir/norm_vars 2>/dev/null` 57 | 58 | mkdir -p $dir/log 59 | split_data.sh $data $nj || exit 1; 60 | echo $nj > $dir/num_jobs 61 | 62 | # Check if necessary files exist. 63 | for f in $graphdir/TLG.fst $srcdir/label.counts $data/feats.scp; do 64 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 65 | done 66 | 67 | ## Set up the features 68 | echo "$0: feature: norm_vars(${norm_vars}) add_deltas(${add_deltas})" 69 | feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" 70 | $add_deltas && feats="$feats add-deltas ark:- ark:- |" 71 | ## 72 | 73 | # Decode for each of the acoustic scales 74 | for ascale in $acoustic_scales; do 75 | echo "$0: decoding with acoustic scale $ascale" 76 | $cmd JOB=1:$nj $dir/log/decode.$ascale.JOB.log \ 77 | net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true $srcdir/final.nnet "$feats" ark:- \| \ 78 | decode-faster --beam=$beam --max-active=$max_active --acoustic-scale=$ascale --word-symbol-table=$graphdir/words.txt \ 79 | --allow-partial=true $graphdir/TLG.fst ark:- ark,t:$dir/trans.$ascale.JOB 80 | cat $dir/trans.$ascale.* > $dir/trans.$ascale 81 | rm -f $dir/trans.$ascale.* 82 | done 83 | 84 | # Scoring 85 | cat $data/text | sed 's:::g' | sed 's:::g' | sed 's:::g' > $dir/text_filt 86 | if ! $skip_scoring ; then 87 | for ascale in $acoustic_scales; do 88 | cat $dir/trans.$ascale | utils/int2sym.pl -f 2- $graphdir/words.txt | \ 89 | sed 's:::g' | sed 's:::g' | sed 's:::g' | \ 90 | compute-wer --text --mode=present ark:$dir/text_filt ark,p:- >& $dir/wer_$ascale || exit 1; 91 | done 92 | fi 93 | 94 | exit 0; 95 | -------------------------------------------------------------------------------- /utils/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); 4 | # Arnab Ghoshal, Karel Vesely 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | # Parse command-line options. 21 | # To be sourced by another script (as in ". parse_options.sh"). 22 | # Option format is: --option-name arg 23 | # and shell variable "option_name" gets set to value "arg." 24 | # The exception is --help, which takes no arguments, but prints the 25 | # $help_message variable (if defined). 26 | 27 | 28 | ### 29 | ### The --config file options have lower priority to command line 30 | ### options, so we need to import them first... 
31 | ### 32 | 33 | # Now import all the configs specified by command-line, in left-to-right order 34 | for ((argpos=1; argpos<$#; argpos++)); do 35 | if [ "${!argpos}" == "--config" ]; then 36 | argpos_plus1=$((argpos+1)) 37 | config=${!argpos_plus1} 38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 39 | . $config # source the config file. 40 | fi 41 | done 42 | 43 | 44 | ### 45 | ### Now we process the command line options 46 | ### 47 | while true; do 48 | [ -z "${1:-}" ] && break; # break if there are no arguments 49 | case "$1" in 50 | # If the enclosing script is called with --help option, print the help 51 | # message and exit. Scripts should put help messages in $help_message 52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; 53 | else printf "$help_message\n" 1>&2 ; fi; 54 | exit 0 ;; 55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 56 | exit 1 ;; 57 | # If the first command-line argument begins with "--" (e.g. --foo-bar), 58 | # then work out the variable name as $name, which will equal "foo_bar". 59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 60 | # Next we test whether the variable in question is undefined-- if so it's 61 | # an invalid option and we die. Note: $0 evaluates to the name of the 62 | # enclosing script. 63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar 64 | # is undefined. We then have to wrap this test inside "eval" because 65 | # foo_bar is itself inside a variable ($name). 66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; 67 | 68 | oldval="`eval echo \\$$name`"; 69 | # Work out whether we seem to be expecting a Boolean argument. 70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | eval $name=\"$2\"; 79 | 80 | # Check that Boolean-valued arguments are really Boolean. 81 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 82 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 83 | exit 1; 84 | fi 85 | shift 2; 86 | ;; 87 | *) break; 88 | esac 89 | done 90 | 91 | 92 | # Check for an empty argument to the --cmd option, which can easily occur as a 93 | # result of scripting errors. 94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 95 | 96 | 97 | true; # so this script returns exit code 0. 98 | -------------------------------------------------------------------------------- /utils/pinyin_map.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | $num_args = $#ARGV + 1; 4 | if ($num_args != 1) { 5 | print "\nUsage: pinyin_map.pl pinyin2phone\n"; 6 | exit; 7 | } 8 | 9 | open(MAPS, $ARGV[0]) or die("Could not open pinyin map file."); 10 | my %py2ph; foreach $line (<MAPS>) { @A = split(" ", $line); 11 | $py = shift(@A); 12 | $py2ph{$py} = [@A]; 13 | } 14 | 15 | #foreach $word ( keys %py2ph ) { 16 | #foreach $i ( 0 ..
$#{ $py2ph{$word} } ) { 17 | # print " $word = $py2ph{$word}[$i]"; 18 | #} 19 | #print " $#{ $py2ph{$word} }"; 20 | #print "\n"; 21 | #} 22 | 23 | my @entry; 24 | 25 | while () { 26 | @A = split(" ", $_); 27 | @entry = (); 28 | $W = shift(@A); 29 | push(@entry, $W); 30 | for($i = 0; $i < @A; $i++) { 31 | $initial= $A[$i]; $final = $A[$i]; 32 | #print $initial, " ", $final, "\n"; 33 | if ($A[$i] =~ /^CH[A-Z0-9]+$/) {$initial =~ s:(CH)[A-Z0-9]+:$1:; $final =~ s:CH([A-Z0-9]+):$1:;} 34 | elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} 35 | elsif ($A[$i] =~ /^ZH[A-Z0-9]+$/) {$initial =~ s:(ZH)[A-Z0-9]+:$1:; $final =~ s:ZH([A-Z0-9]+):$1:;} 36 | elsif ($A[$i] =~ /^B[A-Z0-9]+$/) {$initial =~ s:(B)[A-Z0-9]+:$1:; $final =~ s:B([A-Z0-9]+):$1:;} 37 | elsif ($A[$i] =~ /^C[A-Z0-9]+$/) {$initial =~ s:(C)[A-Z0-9]+:$1:; $final =~ s:C([A-Z0-9]+):$1:;} 38 | elsif ($A[$i] =~ /^D[A-Z0-9]+$/) {$initial =~ s:(D)[A-Z0-9]+:$1:; $final =~ s:D([A-Z0-9]+):$1:;} 39 | elsif ($A[$i] =~ /^F[A-Z0-9]+$/) {$initial =~ s:(F)[A-Z0-9]+:$1:; $final =~ s:F([A-Z0-9]+):$1:;} 40 | elsif ($A[$i] =~ /^G[A-Z0-9]+$/) {$initial =~ s:(G)[A-Z0-9]+:$1:; $final =~ s:G([A-Z0-9]+):$1:;} 41 | elsif ($A[$i] =~ /^H[A-Z0-9]+$/) {$initial =~ s:(H)[A-Z0-9]+:$1:; $final =~ s:H([A-Z0-9]+):$1:;} 42 | elsif ($A[$i] =~ /^J[A-Z0-9]+$/) {$initial =~ s:(J)[A-Z0-9]+:$1:; $final =~ s:J([A-Z0-9]+):$1:;} 43 | elsif ($A[$i] =~ /^K[A-Z0-9]+$/) {$initial =~ s:(K)[A-Z0-9]+:$1:; $final =~ s:K([A-Z0-9]+):$1:;} 44 | elsif ($A[$i] =~ /^L[A-Z0-9]+$/) {$initial =~ s:(L)[A-Z0-9]+:$1:; $final =~ s:L([A-Z0-9]+):$1:;} 45 | elsif ($A[$i] =~ /^M[A-Z0-9]+$/) {$initial =~ s:(M)[A-Z0-9]+:$1:; $final =~ s:M([A-Z0-9]+):$1:;} 46 | elsif ($A[$i] =~ /^N[A-Z0-9]+$/) {$initial =~ s:(N)[A-Z0-9]+:$1:; $final =~ s:N([A-Z0-9]+):$1:;} 47 | elsif ($A[$i] =~ /^P[A-Z0-9]+$/) {$initial =~ s:(P)[A-Z0-9]+:$1:; $final =~ s:P([A-Z0-9]+):$1:;} 48 | elsif ($A[$i] =~ /^Q[A-Z0-9]+$/) {$initial =~ s:(Q)[A-Z0-9]+:$1:; $final =~ s:Q([A-Z0-9]+):$1:;} 49 | elsif ($A[$i] =~ /^R[A-Z0-9]+$/) {$initial =~ s:(R)[A-Z0-9]+:$1:; $final =~ s:R([A-Z0-9]+):$1:;} 50 | elsif ($A[$i] =~ /^S[A-Z0-9]+$/) {$initial =~ s:(S)[A-Z0-9]+:$1:; $final =~ s:S([A-Z0-9]+):$1:;} 51 | elsif ($A[$i] =~ /^T[A-Z0-9]+$/) {$initial =~ s:(T)[A-Z0-9]+:$1:; $final =~ s:T([A-Z0-9]+):$1:;} 52 | elsif ($A[$i] =~ /^W[A-Z0-9]+$/) {$initial =~ s:(W)[A-Z0-9]+:$1:; $final =~ s:W([A-Z0-9]+):$1:;} 53 | elsif ($A[$i] =~ /^X[A-Z0-9]+$/) {$initial =~ s:(X)[A-Z0-9]+:$1:; $final =~ s:X([A-Z0-9]+):$1:;} 54 | elsif ($A[$i] =~ /^Y[A-Z0-9]+$/) {$initial =~ s:(Y)[A-Z0-9]+:$1:; $final =~ s:Y([A-Z0-9]+):$1:;} 55 | elsif ($A[$i] =~ /^Z[A-Z0-9]+$/) {$initial =~ s:(Z)[A-Z0-9]+:$1:; $final =~ s:Z([A-Z0-9]+):$1:;} 56 | if ($initial ne $A[$i]) { 57 | $tone = $final; 58 | $final =~ s:([A-Z]+)[0-9]:$1:; 59 | $tone =~ s:[A-Z]+([0-9]):$1:; 60 | if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { print "1: no entry find for ", $A[$i], " ", $initial, " ", $final; exit;} 61 | push(@entry, @{$py2ph{$initial}}); 62 | @tmp = @{$py2ph{$final}}; 63 | for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} 64 | push(@entry, @tmp); 65 | } 66 | else { 67 | $tone = $A[$i]; 68 | $A[$i] =~ s:([A-Z]+)[0-9]:$1:; 69 | $tone =~ s:[A-Z]+([0-9]):$1:; 70 | if (!(exists $py2ph{$A[$i]})) { print "2: no entry find for ", $A[$i]; exit;} 71 | @tmp = @{$py2ph{$A[$i]}}; 72 | for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} 73 | push(@entry, @tmp); 74 | } 75 | } 76 | print "@entry"; 77 | print "\n"; 78 | } 79 | 
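[Editor's sketch, not part of the original repo: one worked example of pinyin_map.pl. Assume a hypothetical map file pinyin2phone containing the two lines "ZH zh" and "ANG ang". For a lexicon line "zhang ZHANG1", the script splits the toned syllable into initial ZH and final ANG, moves the tone digit onto the final's phones, and prints the word followed by the phones:]
echo "zhang ZHANG1" | utils/pinyin_map.pl pinyin2phone   # -> "zhang zh ang1"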
-------------------------------------------------------------------------------- /utils/subset_data_dir_tr_cv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin); 3 | # Brno University of Technology (Author: Karel Vesely); 4 | # Johns Hopkins University (Author: Daniel Povey); 5 | # Apache 2.0 6 | 7 | # This script splits dataset to two parts : 8 | # training set from (100-P)% of speakers/utterances and 9 | # held-out set (or cross-validation) from P% of remaining speakers/remaining utterances, 10 | # which will be later on used for neural network training 11 | # 12 | # There are two options for choosing held-out (or cross-validation) set, either by 13 | # --cv-spk-percent P , which will give you CV set based on random chosen P% of speakers, or 14 | # --cv-utt-percent P , which will give you CV set based on last P% utterances in the dataset 15 | # 16 | # If you don't apply the above two options, by default the script will use --cv-utt-percent option, 17 | # and the default cross validation percentage portion is equal to 10% (i.e. P=10) 18 | # 19 | # The --cv-spk-percent option is useful if you would like to have subset chosen from random speakers order, 20 | # especially for the cases where dataset contains multiple different corpora, 21 | # where type of speakers or recording channels may be quite different 22 | 23 | # Begin configuration. 24 | cv_spk_percent= # % of speakers is parsed by option 25 | cv_utt_percent=10 # default 10% of total utterances 26 | seed=777 # use seed for speaker shuffling 27 | # End configuration. 28 | 29 | echo "$0 $@" # Print the command line for logging 30 | 31 | uttbase=true; # by default, we choose last 10% utterances for CV 32 | 33 | if [ "$1" == "--cv-spk-percent" ]; then 34 | uttbase=false; 35 | spkbase=true; 36 | fi 37 | 38 | [ -f path.sh ] && . ./path.sh; 39 | 40 | . parse_options.sh || exit 1; 41 | 42 | if [ $# != 3 ]; then 43 | echo "Usage: $0 [--cv-spk-percent P|--cv-utt-percent P] " 44 | echo " --cv-spk-percent P Cross Validation portion of the total speakers, recommend value is 10% (i.e. P=10)" 45 | echo " --cv-utt-percent P Cross Validation portion of the total utterances, default is 10% (i.e. P=10)" 46 | echo " " 47 | exit 1; 48 | fi 49 | 50 | srcdir=$1 51 | trndir=$2 52 | cvdir=$3 53 | 54 | ## use simple last P% utterance for CV 55 | if $uttbase; then 56 | if [ ! -f $srcdir/utt2spk ]; then 57 | echo "$0: no such file $srcdir/utt2spk" 58 | exit 1; 59 | fi 60 | 61 | #total number of lines 62 | N=$(cat $srcdir/utt2spk | wc -l) 63 | #get line number where (100-P)% of the data lies 64 | P_utt=$((N * cv_utt_percent / 100)) 65 | N_head=$((N -P_utt)) 66 | #move the boundary so it is located on speaker change 67 | N_head=$(cat $srcdir/utt2spk | uniq -f1 -c | awk '{ if(n+$1<='$N_head') { n += $1 } else { nextfile } } END{ print n }') 68 | #the rest of the data will be that big 69 | N_tail=$((N-N_head)) 70 | 71 | #now call the subset_data_dir.sh and fix the directories 72 | subset_data_dir.sh --first $srcdir $N_head $trndir 73 | subset_data_dir.sh --last $srcdir $N_tail $cvdir 74 | 75 | exit 0; 76 | fi 77 | 78 | ## use random chosen P% speakers for CV 79 | if [ ! 
-f $srcdir/spk2utt ]; then 80 | echo "$0: no such file $srcdir/spk2utt" 81 | exit 1; 82 | fi 83 | 84 | #total, cv, train number of speakers 85 | N=$(cat $srcdir/spk2utt | wc -l) 86 | N_spk_cv=$((N * cv_spk_percent / 100)) 87 | N_spk_trn=$((N - N_spk_cv)) 88 | 89 | mkdir -p $cvdir $trndir 90 | 91 | #shuffle the speaker list 92 | awk '{print $1}' $srcdir/spk2utt | shuffle_list.pl --srand $seed > $trndir/_tmpf_randspk 93 | 94 | #split the train/cv 95 | head -n $N_spk_cv $trndir/_tmpf_randspk > $cvdir/_tmpf_cvspk 96 | tail -n $N_spk_trn $trndir/_tmpf_randspk > $trndir/_tmpf_trainspk 97 | 98 | #now call the subset_data_dir.sh 99 | subset_data_dir.sh --spk-list $trndir/_tmpf_trainspk $srcdir $trndir 100 | subset_data_dir.sh --spk-list $cvdir/_tmpf_cvspk $srcdir $cvdir 101 | 102 | #clean-up 103 | rm -f $trndir/_tmpf_randspk $trndir/_tmpf_trainspk $cvdir/_tmpf_cvspk 104 | 105 | -------------------------------------------------------------------------------- /utils/add_lex_disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # Adds disambiguation symbols to a lexicon. 20 | # Outputs still in the normal lexicon format. 21 | # Disambig syms are numbered #1, #2, #3, etc. (#0 22 | # reserved for symbol in grammar). 23 | # Outputs the number of disambig syms to the standard output. 24 | # With the --pron-probs option, expects the second field 25 | # of each lexicon line to be a pron-prob. 26 | 27 | $pron_probs = 0; 28 | 29 | if ($ARGV[0] eq "--pron-probs") { 30 | $pron_probs = 1; 31 | shift @ARGV; 32 | } 33 | 34 | if(@ARGV != 2) { 35 | die "Usage: add_lex_disambig.pl [--pron-probs] lexicon.txt lexicon_disambig.txt " 36 | } 37 | 38 | 39 | $lexfn = shift @ARGV; 40 | $lexoutfn = shift @ARGV; 41 | 42 | open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; 43 | 44 | # (1) Read in the lexicon. 45 | @L = ( ); 46 | while() { 47 | @A = split(" ", $_); 48 | push @L, join(" ", @A); 49 | } 50 | 51 | # (2) Work out the count of each phone-sequence in the 52 | # lexicon. 53 | 54 | foreach $l (@L) { 55 | @A = split(" ", $l); 56 | shift @A; # Remove word. 57 | if ($pron_probs) { 58 | $p = shift @A; 59 | if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } 60 | } 61 | $count{join(" ",@A)}++; 62 | } 63 | 64 | # (3) For each left sub-sequence of each phone-sequence, note down 65 | # that exists (for identifying prefixes of longer strings). 66 | 67 | foreach $l (@L) { 68 | @A = split(" ", $l); 69 | shift @A; # Remove word. 70 | if ($pron_probs) { shift @A; } # remove pron-prob. 
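# (Editor's annotation, a hedged toy example of the bookkeeping below.) For a
# lexicon entry "abc a b c", the inner loop records "a b", "a" and "" as seen
# sub-sequences; a later word pronounced just "a b" is then detected as a
# prefix of another entry, and step (4) appends a disambiguation symbol to its
# pronunciation, e.g. "a b #1".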
71 | while(@A > 0) { 72 | pop @A; # Remove last phone 73 | $issubseq{join(" ",@A)} = 1; 74 | } 75 | } 76 | 77 | # (4) For each entry in the lexicon: 78 | # if the phone sequence is unique and is not a 79 | # prefix of another word, no diambig symbol. 80 | # Else output #1, or #2, #3, ... if the same phone-seq 81 | # has already been assigned a disambig symbol. 82 | 83 | 84 | open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; 85 | 86 | $max_disambig = 0; 87 | foreach $l (@L) { 88 | @A = split(" ", $l); 89 | $word = shift @A; 90 | if ($pron_probs) { $pron_prob = shift @A; } 91 | $phnseq = join(" ",@A); 92 | if(!defined $issubseq{$phnseq} 93 | && $count{$phnseq} == 1) { 94 | ; # Do nothing. 95 | } else { 96 | if($phnseq eq "") { # need disambig symbols for the empty string 97 | # that are not use anywhere else. 98 | $max_disambig++; 99 | $reserved{$max_disambig} = 1; 100 | $phnseq = "#$max_disambig"; 101 | } else { 102 | $curnumber = $disambig_of{$phnseq}; 103 | if(!defined{$curnumber}) { $curnumber = 0; } 104 | $curnumber++; # now 1 or 2, ... 105 | while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols 106 | if($curnumber > $max_disambig) { 107 | $max_disambig = $curnumber; 108 | } 109 | $disambig_of{$phnseq} = $curnumber; 110 | $phnseq = $phnseq . " #" . $curnumber; 111 | } 112 | } 113 | if ($pron_probs) { print O "$word\t$pron_prob\t$phnseq\n"; } 114 | else { print O "$word\t$phnseq\n"; } 115 | } 116 | 117 | print $max_disambig . "\n"; 118 | 119 | -------------------------------------------------------------------------------- /steps/compute_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0 5 | # To be run from .. (one directory up from here) 6 | # see ../run.sh for example 7 | 8 | # Compute cepstral mean and variance statistics per speaker. 9 | # We do this in just one job; it's fast. 10 | # This script takes no options. 11 | # 12 | # Note: there is no option to do CMVN per utterance. The idea is 13 | # that if you did it per utterance it would not make sense to do 14 | # per-speaker fMLLR on top of that (since you'd be doing fMLLR on 15 | # top of different offsets). Therefore what would be the use 16 | # of the speaker information? In this case you should probably 17 | # make the speaker-ids identical to the utterance-ids. The 18 | # speaker information does not have to correspond to actual 19 | # speakers, it's just the level you want to adapt at. 20 | 21 | echo "$0 $@" # Print the command line for logging 22 | 23 | fake=false 24 | fake_dims= # If specified, can generate 'fake' stats (that won't normalize) 25 | # from a specified dimension. 26 | two_channel=false 27 | 28 | if [ "$1" == "--fake" ]; then 29 | fake=true 30 | shift 31 | fi 32 | if [ "$1" == "--fake-dims" ]; then 33 | fake_dims=$2 34 | shift 35 | shift 36 | fi 37 | if [ "$1" == "--two-channel" ]; then 38 | two_channel=true 39 | shift 40 | fi 41 | 42 | if [ $# != 3 ]; then 43 | echo "Usage: $0 [options] "; 44 | echo "e.g.: $0 data/train exp/make_mfcc/train mfcc" 45 | echo "Options:" 46 | echo " --fake gives you fake cmvn stats that do no normalization." 47 | echo " --two-channel is for two-channel telephone data, there must be no segments " 48 | echo " file and reco2file_and_channel must be present. It will take" 49 | echo " only frames that are louder than the other channel." 
50 | echo " --fake-dims Generate stats that won't cause normalization for these" 51 | echo " dimensions (e.g. 13:14:15)" 52 | exit 1; 53 | fi 54 | 55 | if [ -f path.sh ]; then . ./path.sh; fi 56 | 57 | data=$1 58 | logdir=$2 59 | cmvndir=$3 60 | 61 | # make $cmvndir an absolute pathname. 62 | cmvndir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $cmvndir ${PWD}` 63 | 64 | # use "name" as part of name of the archive. 65 | name=`basename $data` 66 | 67 | mkdir -p $cmvndir || exit 1; 68 | mkdir -p $logdir || exit 1; 69 | 70 | 71 | required="$data/feats.scp $data/spk2utt" 72 | 73 | for f in $required; do 74 | if [ ! -f $f ]; then 75 | echo "make_cmvn.sh: no such file $f" 76 | exit 1; 77 | fi 78 | done 79 | 80 | if $fake; then 81 | dim=`feat-to-dim scp:$data/feats.scp -` 82 | ! cat $data/spk2utt | awk -v dim=$dim '{print $1, "["; for (n=0; n < dim; n++) { printf("0 "); } print "1"; 83 | for (n=0; n < dim; n++) { printf("1 "); } print "0 ]";}' | \ 84 | copy-matrix ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \ 85 | echo "Error creating fake CMVN stats" && exit 1; 86 | elif $two_channel; then 87 | ! compute-cmvn-stats-two-channel $data/reco2file_and_channel scp:$data/feats.scp \ 88 | ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \ 89 | 2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats (using two-channel method)" && exit 1; 90 | elif [ ! -z "$fake_dims" ]; then 91 | ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | \ 92 | modify-cmvn-stats "$fake_dims" ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \ 93 | echo "Error computing (partially fake) CMVN stats" && exit 1; 94 | else 95 | ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \ 96 | 2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats" && exit 1; 97 | fi 98 | 99 | cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1; 100 | 101 | nc=`cat $data/cmvn.scp | wc -l` 102 | nu=`cat $data/spk2utt | wc -l` 103 | if [ $nc -ne $nu ]; then 104 | echo "$0: warning: it seems not all of the speakers got cmvn stats ($nc != $nu);" 105 | [ $nc -eq 0 ] && exit 1; 106 | fi 107 | 108 | echo "Succeeded creating CMVN stats for $name" 109 | -------------------------------------------------------------------------------- /steps/make_fbank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Karel Vesely Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0 5 | # To be run from .. (one directory up from here) 6 | # see ../run.sh for example 7 | 8 | # Begin configuration section. 9 | nj=4 10 | cmd=run.pl 11 | fbank_config=conf/fbank.conf 12 | compress=true 13 | # End configuration section. 14 | 15 | echo "$0 $@" # Print the command line for logging 16 | 17 | if [ -f path.sh ]; then . ./path.sh; fi 18 | . parse_options.sh || exit 1; 19 | 20 | if [ $# != 3 ]; then 21 | echo "usage: make_fbank.sh [options] "; 22 | echo "options: " 23 | echo " --fbank-config # config passed to compute-fbank-feats " 24 | echo " --nj # number of parallel jobs" 25 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 26 | exit 1; 27 | fi 28 | 29 | data=$1 30 | logdir=$2 31 | fbankdir=$3 32 | 33 | 34 | # make $fbankdir an absolute pathname. 
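# (Editor's sketch, not part of the original script.) The perl one-liner below
# prepends $PWD only when the given path is relative; a pure-bash equivalent:
#   case $fbankdir in /*) ;; *) fbankdir=$PWD/$fbankdir ;; esac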
35 | fbankdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbankdir ${PWD}` 36 | 37 | # use "name" as part of name of the archive. 38 | name=`basename $data` 39 | 40 | mkdir -p $fbankdir || exit 1; 41 | mkdir -p $logdir || exit 1; 42 | 43 | if [ -f $data/feats.scp ]; then 44 | mkdir -p $data/.backup 45 | echo "$0: moving $data/feats.scp to $data/.backup" 46 | mv $data/feats.scp $data/.backup 47 | fi 48 | 49 | scp=$data/wav.scp 50 | 51 | required="$scp $fbank_config" 52 | 53 | for f in $required; do 54 | if [ ! -f $f ]; then 55 | echo "make_fbank.sh: no such file $f" 56 | exit 1; 57 | fi 58 | done 59 | 60 | utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; 61 | 62 | if [ -f $data/spk2warp ]; then 63 | echo "$0 [info]: using VTLN warp factors from $data/spk2warp" 64 | vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk" 65 | elif [ -f $data/utt2warp ]; then 66 | echo "$0 [info]: using VTLN warp factors from $data/utt2warp" 67 | vtln_opts="--vtln-map=ark:$data/utt2warp" 68 | fi 69 | 70 | for n in $(seq $nj); do 71 | # the next command does nothing unless $fbankdir/storage/ exists, see 72 | # utils/create_data_link.pl for more info. 73 | utils/create_data_link.pl $fbankdir/raw_fbank_$name.$n.ark 74 | done 75 | 76 | if [ -f $data/segments ]; then 77 | echo "$0 [info]: segments file exists: using that." 78 | split_segments="" 79 | for n in $(seq $nj); do 80 | split_segments="$split_segments $logdir/segments.$n" 81 | done 82 | 83 | utils/split_scp.pl $data/segments $split_segments || exit 1; 84 | rm $logdir/.error 2>/dev/null 85 | 86 | $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ 87 | extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \ 88 | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- \| \ 89 | copy-feats --compress=$compress ark:- \ 90 | ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ 91 | || exit 1; 92 | 93 | else 94 | echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 95 | split_scps="" 96 | for n in $(seq $nj); do 97 | split_scps="$split_scps $logdir/wav.$n.scp" 98 | done 99 | 100 | utils/split_scp.pl $scp $split_scps || exit 1; 101 | 102 | $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ 103 | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \ 104 | copy-feats --compress=$compress ark:- \ 105 | ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ 106 | || exit 1; 107 | 108 | fi 109 | 110 | 111 | if [ -f $logdir/.error.$name ]; then 112 | echo "Error producing fbank features for $name:" 113 | tail $logdir/make_fbank_${name}.1.log 114 | exit 1; 115 | fi 116 | 117 | # concatenate the .scp files together. 
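# (Editor's note.) Each line of a Kaldi-style .scp maps an utterance id to an
# archive plus byte offset, e.g. "utt001 /data/raw_fbank_train.1.ark:12"
# (id, path and offset here are hypothetical); the loop below simply merges
# the per-job lists, in job order, into one feats.scp for the whole data dir.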
118 | for n in $(seq $nj); do 119 | cat $fbankdir/raw_fbank_$name.$n.scp || exit 1; 120 | done > $data/feats.scp 121 | 122 | rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null 123 | 124 | nf=`cat $data/feats.scp | wc -l` 125 | nu=`cat $data/utt2spk | wc -l` 126 | if [ $nf -ne $nu ]; then 127 | echo "It seems not all of the feature files were successfully ($nf != $nu);" 128 | echo "consider using utils/fix_data_dir.sh $data" 129 | fi 130 | 131 | echo "Succeeded creating filterbank features for $name" 132 | #echo -e "\n" 133 | -------------------------------------------------------------------------------- /utils/split_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2010-2013 Microsoft Corporation 3 | # Johns Hopkins University (Author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | split_per_spk=true 19 | if [ "$1" == "--per-utt" ]; then 20 | split_per_spk=false 21 | shift 22 | fi 23 | 24 | if [ $# != 2 ]; then 25 | echo "Usage: split_data.sh " 26 | echo "This script will not split the data-dir if it detects that the output is newer than the input." 27 | exit 1 28 | fi 29 | 30 | data=$1 31 | numsplit=$2 32 | 33 | if [ $numsplit -le 0 ]; then 34 | echo "Invalid num-split argument $numsplit"; 35 | exit 1; 36 | fi 37 | 38 | n=0; 39 | feats="" 40 | wavs="" 41 | utt2spks="" 42 | texts="" 43 | 44 | nu=`cat $data/utt2spk | wc -l` 45 | nf=`cat $data/feats.scp 2>/dev/null | wc -l` 46 | nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file 47 | if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then 48 | echo "split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); this script " 49 | echo " may produce incorrectly split data." 50 | echo "use utils/fix_data_dir.sh $data to fix this." 51 | fi 52 | if [ -f $data/text ] && [ $nu -ne $nt ]; then 53 | echo "split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); this script " 54 | echo " may produce incorrectly split data." 55 | echo "use utils/fix_data_dir.sh to fix this." 56 | fi 57 | 58 | s1=$data/split$numsplit/1 59 | if [ ! -d $s1 ]; then 60 | need_to_split=true 61 | else 62 | need_to_split=false 63 | for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \ 64 | vad.scp segments reco2file_and_channel utt2lang; do 65 | if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then 66 | need_to_split=true 67 | fi 68 | done 69 | fi 70 | 71 | if ! 
$need_to_split; then 72 | exit 0; 73 | fi 74 | 75 | for n in `seq $numsplit`; do 76 | mkdir -p $data/split$numsplit/$n 77 | feats="$feats $data/split$numsplit/$n/feats.scp" 78 | vads="$vads $data/split$numsplit/$n/vad.scp" 79 | texts="$texts $data/split$numsplit/$n/text" 80 | utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk" 81 | utt2langs="$utt2langs $data/split$numsplit/$n/utt2lang" 82 | done 83 | 84 | if $split_per_spk; then 85 | utt2spk_opt="--utt2spk=$data/utt2spk" 86 | else 87 | utt2spk_opt= 88 | fi 89 | 90 | utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1 91 | 92 | [ -f $data/feats.scp ] && utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats 93 | 94 | [ -f $data/text ] && utils/split_scp.pl $utt2spk_opt $data/text $texts 95 | 96 | [ -f $data/vad.scp ] && utils/split_scp.pl $utt2spk_opt $data/vad.scp $vads 97 | 98 | [ -f $data/utt2lang ] && utils/split_scp.pl $utt2spk_opt $data/utt2lang $utt2langs 99 | 100 | # If lockfile is not installed, just don't lock it. It's not a big deal. 101 | which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock 102 | 103 | for n in `seq $numsplit`; do 104 | dsn=$data/split$numsplit/$n 105 | utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1; 106 | for f in spk2gender spk2warp cmvn.scp; do 107 | [ -f $data/$f ] && \ 108 | utils/filter_scp.pl $dsn/spk2utt $data/$f > $dsn/$f 109 | done 110 | if [ -f $data/segments ]; then 111 | utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments 112 | awk '{print $2;}' $dsn/segments |sort|uniq > $data/tmp.reco # recording-ids. 113 | [ -f $data/reco2file_and_channel ] && 114 | utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel 115 | [ -f $data/wav.scp ] && utils/filter_scp.pl $data/tmp.reco $data/wav.scp > $dsn/wav.scp 116 | rm $data/tmp.reco 117 | else # else wav indexed by utterance -> filter on this. 118 | [ -f $data/wav.scp ] && 119 | utils/filter_scp.pl $dsn/utt2spk $data/wav.scp > $dsn/wav.scp 120 | fi 121 | done 122 | 123 | rm -f $data/.split_lock 124 | 125 | exit 0 126 | -------------------------------------------------------------------------------- /utils/format_lm_sri.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Arnab Ghoshal 4 | # Copyright 2010-2011 Microsoft Corporation 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Begin configuration section. 20 | srilm_opts="-subset -prune-lowprobs -unk -tolower" 21 | # end configuration sections 22 | 23 | 24 | . utils/parse_options.sh 25 | 26 | if [ $# -ne 4 ] && [ $# -ne 3 ]; then 27 | echo "Usage: $0 [options] [] " 28 | echo "The argument is no longer needed but is supported for back compatibility" 29 | echo "E.g.: utils/format_lm_sri.sh data/lang data/local/lm/foo.kn.gz data/local/dict/lexicon.txt data/lang_test" 30 | echo "Converts ARPA-format language models to FSTs. 
Change the LM vocabulary using SRILM." 31 | echo "Note: if you want to just convert ARPA LMs to FSTs, there is a simpler way to do this" 32 | echo "that doesn't require SRILM: see examples in egs/wsj/s5/local/wsj_format_local_lms.sh" 33 | echo "options:" 34 | echo " --help # print this message and exit" 35 | echo " --srilm-opts STRING # options to pass to SRILM tools (default: '$srilm_opts')" 36 | exit 1; 37 | fi 38 | 39 | 40 | if [ $# -eq 4 ] ; then 41 | lang_dir=$1 42 | lm=$2 43 | lexicon=$3 44 | out_dir=$4 45 | else 46 | lang_dir=$1 47 | lm=$2 48 | out_dir=$3 49 | fi 50 | 51 | mkdir -p $out_dir 52 | 53 | for f in $lm $lang_dir/words.txt; do 54 | if [ ! -f $f ]; then 55 | echo "$0: expected input file $f to exist." 56 | exit 1; 57 | fi 58 | done 59 | 60 | [ -f ./path.sh ] && . ./path.sh 61 | 62 | loc=`which change-lm-vocab` 63 | if [ -z $loc ]; then 64 | if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... 65 | sdir=`pwd`/../../../tools/srilm/bin/i686-m64 66 | else 67 | sdir=`pwd`/../../../tools/srilm/bin/i686 68 | fi 69 | if [ -f $sdir/../change-lm-vocab ]; then 70 | echo Using SRILM tools from $sdir 71 | export PATH=$PATH:$sdir:$sdir/.. 72 | else 73 | echo You appear to not have SRILM tools installed, either on your path, 74 | echo or installed in $sdir. See tools/install_srilm.sh for installation 75 | echo instructions. 76 | exit 1 77 | fi 78 | fi 79 | 80 | echo "Converting '$lm' to FST" 81 | tmpdir=data/local/format_sri_tmp 82 | mkdir -p $tmpdir 83 | trap 'rm -rf "$tmpdir"' EXIT 84 | 85 | mkdir -p $out_dir 86 | cp -r $lang_dir/* $out_dir || exit 1; 87 | 88 | lm_base=$(basename $lm '.gz') 89 | gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ 90 | > $out_dir/oovs_${lm_base}.txt || exit 1; 91 | 92 | # Removing all "illegal" combinations of and , which are supposed to 93 | # occur only at being/end of utt. These can cause determinization failures 94 | # of CLG [ends up being epsilon cycles]. 95 | gunzip -c $lm \ 96 | | egrep -v ' | | ' \ 97 | | gzip -c > $tmpdir/lm.gz || exit 1; 98 | 99 | awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; 100 | 101 | # Change the LM vocabulary to be the intersection of the current LM vocabulary 102 | # and the set of words in the pronunciation lexicon. This also renormalizes the 103 | # LM by recomputing the backoff weights, and remove those ngrams whose 104 | # probabilities are lower than the backed-off estimates. 105 | change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ 106 | $srilm_opts || exit 1; 107 | 108 | arpa2fst $tmpdir/out_lm | fstprint \ 109 | | utils/eps2disambig.pl | utils/s2eps.pl \ 110 | | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \ 111 | --keep_isymbols=false --keep_osymbols=false \ 112 | | fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst || exit 1; 113 | 114 | fstisstochastic $out_dir/G.fst 115 | 116 | # The output is like: 117 | # 9.14233e-05 -0.259833 118 | # we do expect the first of these 2 numbers to be close to zero (the second is 119 | # nonzero because the backoff weights make the states sum to >1). 120 | 121 | echo "Succeeded in formatting LM '$lm' -> '$out_dir/G.fst'" 122 | -------------------------------------------------------------------------------- /steps/decode_ctc_lat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Apache 2.0 4 | 5 | # Decode the CTC-trained model by generating lattices. 
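# (Editor's note, a hypothetical invocation.) Unlike decode_ctc.sh, this
# script takes four positional arguments, with the model dir given explicitly:
#   steps/decode_ctc_lat.sh --nj 8 --acwt 0.9 \
#     data/lang_phn_test data/test exp/train_phn_l3_c320 \
#     exp/train_phn_l3_c320/decode_test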
6 | 7 | 8 | ## Begin configuration section 9 | stage=0 10 | nj=16 11 | cmd=run.pl 12 | num_threads=1 13 | 14 | acwt=0.9 15 | min_active=200 16 | max_active=7000 # max-active 17 | beam=15.0 # beam used 18 | lattice_beam=8.0 19 | max_mem=50000000 # approx. limit to memory consumption during minimization in bytes 20 | mdl=final.nnet 21 | 22 | skip_scoring=false # whether to skip WER scoring 23 | scoring_opts="--min-acwt 5 --max-acwt 10 --acwt-factor 0.1" 24 | score_with_conf=false 25 | 26 | # feature configurations; will be read from the training dir if not provided 27 | norm_vars= 28 | add_deltas= 29 | subsample_feats= 30 | splice_feats= 31 | ## End configuration section 32 | 33 | echo "$0 $@" # Print the command line for logging 34 | 35 | [ -f ./path.sh ] && . ./path.sh; 36 | . parse_options.sh || exit 1; 37 | 38 | if [ $# != 4 ]; then 39 | echo "Wrong #arguments ($#, expected 4)" 40 | echo "Usage: steps/decode_ctc_lat.sh [options] <graph-dir> <data-dir> <model-dir> <decode-dir>" 41 | echo " e.g.: steps/decode_ctc_lat.sh data/lang data/test exp/train_l4_c320 exp/train_l4_c320/decode" 42 | echo "main options (for others, see top of script file)" 43 | echo " --stage # starts from which stage" 44 | echo " --nj # number of parallel jobs" 45 | echo " --cmd # command to run in parallel with" 46 | echo " --acwt # default 0.9, the acoustic scale to be used" 47 | exit 1; 48 | fi 49 | 50 | graphdir=$1 51 | data=$2 52 | srcdir=$3 53 | dir=`echo $4 | sed 's:/$::g'` # remove any trailing slash. 54 | 55 | #srcdir=`dirname $dir`; # assume model directory one level up from decoding directory. 56 | #srcdir=/home/sundy/work/egs/hkust/exp/train_phn_l3_c320 57 | 58 | sdata=$data/split$nj; 59 | 60 | thread_string= 61 | [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" 62 | 63 | [ -z "$add_deltas" ] && add_deltas=`cat $srcdir/add_deltas 2>/dev/null` 64 | [ -z "$norm_vars" ] && norm_vars=`cat $srcdir/norm_vars 2>/dev/null` 65 | [ -z "$subsample_feats" ] && subsample_feats=`cat $srcdir/subsample_feats 2>/dev/null` || subsample_feats=false 66 | [ -z "$splice_feats" ] && splice_feats=`cat $srcdir/splice_feats 2>/dev/null` || splice_feats=false 67 | 68 | mkdir -p $dir/log 69 | split_data.sh $data $nj || exit 1; 70 | echo $nj > $dir/num_jobs 71 | 72 | # Check if necessary files exist. 73 | for f in $graphdir/TLG.fst $srcdir/label.counts $data/feats.scp; do 74 | #for f in $graphdir/TLG.fst ./exp/train_phn_l5_c320/label.counts $data/feats.scp; do 75 | [ !
-f $f ] && echo "$0: no such file $f" && exit 1; 76 | done 77 | 78 | ## Set up the features 79 | echo "$0: feature: norm_vars(${norm_vars}) add_deltas(${add_deltas})" 80 | feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" 81 | $add_deltas && feats="$feats add-deltas ark:- ark:- |" 82 | $splice_feats && feats="$feats splice-feats --left-context=1 --right-context=1 ark:- ark:- |" 83 | $subsample_feats && feats="$feats subsample-feats --n=3 --offset=0 ark:- ark:- |" 84 | ## 85 | 86 | #$cmd JOB=1:$nj $dir/log/decode.JOB.log \ 87 | # net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true --use-gpu="no" $srcdir/$mdl "$feats" ark,t:output.txt 88 | #exit 1; 89 | # Decode for each of the acoustic scales 90 | $cmd JOB=1:$nj $dir/log/decode.JOB.log \ 91 | net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true --use-gpu="yes" $srcdir/$mdl "$feats" ark:- \| \ 92 | latgen-faster --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$lattice_beam \ 93 | --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ 94 | $graphdir/TLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; 95 | 96 | # Scoring 97 | if ! $skip_scoring ; then 98 | if [ -f $data/stm ]; then # use sclite scoring. 99 | if $score_with_conf ; then 100 | [ ! -x local/score_sclite_conf.sh ] && echo "Not scoring because local/score_sclite_conf.sh does not exist or not executable." && exit 1; 101 | local/score_sclite_conf.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1; 102 | else 103 | [ ! -x local/score_sclite.sh ] && echo "Not scoring because local/score_sclite.sh does not exist or not executable." && exit 1; 104 | local/score_sclite.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1; 105 | fi 106 | else 107 | [ ! -x local/score.sh ] && echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; 108 | local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1; 109 | fi 110 | fi 111 | 112 | exit 0; 113 | -------------------------------------------------------------------------------- /utils/ctc_compile_dict_token.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the 18 | # phoneme and character-based lexicons. 19 | 20 | dict_type="phn" # the type of lexicon, either "phn" or "char" 21 | space_char="" # the character you have used to represent spaces 22 | 23 | . 
utils/parse_options.sh 24 | 25 | if [ $# -ne 3 ]; then 26 | echo "usage: utils/ctc_compile_dict_token.sh " 27 | echo "e.g.: utils/ctc_compile_dict_token.sh data/local/dict_phn data/local/lang_phn_tmp data/lang_phn" 28 | echo " should contain the following files:" 29 | echo "lexicon.txt lexicon_numbers.txt units.txt" 30 | echo "options: " 31 | echo " --dict-type # default: phn." 32 | echo " --space-char # default: , the character to represent spaces." 33 | exit 1; 34 | fi 35 | 36 | echo ============================================= 37 | echo " Generating Lexicon FST and CTC tokens FST " 38 | echo ============================================= 39 | srcdir=$1 40 | tmpdir=$2 41 | dir=$3 42 | mkdir -p $dir $tmpdir 43 | 44 | [ -f path.sh ] && . ./path.sh 45 | 46 | cp $srcdir/{lexicon_numbers.txt,units.txt} $dir 47 | 48 | # Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. 49 | # But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. 50 | perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; 51 | 52 | # Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. 53 | # Without these symbols, determinization will fail. 54 | ndisambig=`utils/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` 55 | ndisambig=$[$ndisambig+1]; 56 | 57 | ( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list 58 | 59 | # Get the full list of CTC tokens used in FST. These tokens include , the blank , the actual labels (e.g., 60 | # phonemes), and the disambiguation symbols. 61 | cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list 62 | (echo ''; echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt 63 | 64 | # Compile the tokens into FST 65 | utils/ctc_token_fst.py $dir/tokens.txt | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \ 66 | --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; 67 | 68 | # Encode the words with indices. Will be used in lexicon and language model FST compiling. 69 | cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' 70 | BEGIN { 71 | print " 0"; 72 | } 73 | { 74 | printf("%s %d\n", $1, NR); 75 | } 76 | END { 77 | printf("#0 %d\n", NR+1); 78 | }' > $dir/words.txt || exit 1; 79 | 80 | # Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. 
81 | token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` 82 | word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` 83 | 84 | case $dict_type in 85 | phn) 86 | utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ 87 | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ 88 | --keep_isymbols=false --keep_osymbols=false | \ 89 | fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ 90 | fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; 91 | ;; 92 | char) 93 | echo "Building a character-based lexicon, with $space_char as the space" 94 | utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0.5 "$space_char" '#'$ndisambig | \ 95 | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ 96 | --keep_isymbols=false --keep_osymbols=false | \ 97 | fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ 98 | fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; 99 | ;; 100 | *) echo "$0: invalid dictionary type $dict_type" && exit 1; 101 | esac 102 | 103 | echo "Dict and token FSTs compiling succeeded" 104 | echo -e "\n" 105 | -------------------------------------------------------------------------------- /utils/model_topo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import sys 19 | 20 | def parse_arguments(arg_elements): 21 | args = {} 22 | arg_num = len(arg_elements) / 2 23 | for i in xrange(arg_num): 24 | key = arg_elements[2*i].replace("--","").replace("-", "_"); 25 | args[key] = arg_elements[2*i+1] 26 | return args 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | """ 32 | Python script to generate the network topology. Parameters: 33 | ------------------ 34 | --input-feat-dim : int 35 | Dimension of the input features 36 | Required. 37 | --lstm-layer-num : int 38 | Number of LSTM layers 39 | Required. 40 | --lstm-cell-dim : int 41 | Number of memory cells in LSTM. For the bi-directional case, this is the number of cells 42 | in either the forward or the backward sub-layer. 43 | Required. 44 | --target-num : int 45 | Number of labels as the targets 46 | Required. 47 | --param-range : float 48 | Range to randomly draw the initial values of model parameters. For example, setting it to 49 | 0.1 means model parameters are drawn uniformly from [-0.1, 0.1] 50 | Optional. By default it is set to 0.1. 51 | --lstm-type : string 52 | Type of LSTMs. Optional. Either "bi" (bi-directional) or "uni" (uni-directional). By default, 53 | "bi" (bi-directional). 54 | --fgate-bias-init : float 55 | Initial value of the forget-gate bias. 
Not specifying this option means the forget-gate bias 56 | will be initialized randomly, in the same way as the other parameters. 57 | --input-dim : int 58 | Reduce the input feature to a given dimensionality before passing to the LSTM. 59 | Optional. 60 | --projection-dim : int 61 | Project the feature vector down to a given dimensionality between LSTM layers. 62 | Optional. 63 | 64 | """ 65 | 66 | 67 | # parse arguments 68 | arg_elements = [sys.argv[i] for i in range(1, len(sys.argv))] 69 | arguments = parse_arguments(arg_elements) 70 | 71 | # these 4 arguments are mandatory 72 | input_feat_dim=int(arguments['input_feat_dim']) 73 | lstm_layer_num=int(arguments['lstm_layer_num']) 74 | lstm_cell_dim=int(arguments['lstm_cell_dim']) 75 | target_num=int(arguments['target_num']) 76 | 77 | # by default, the range of the parameters is set to 0.1; however, you can change it by specifying "--param-range" 78 | # this means for initialization, model parameters are drawn uniformly from the interval [-0.1, 0.1] 79 | param_range='0.1' 80 | if arguments.has_key('param_range'): 81 | param_range = arguments['param_range'] 82 | 83 | actual_cell_dim = 2*lstm_cell_dim 84 | model_type = '<BiLstmParallel>' # by default 85 | if arguments.has_key('lstm_type') and arguments['lstm_type'] == 'uni': 86 | actual_cell_dim = lstm_cell_dim 87 | model_type = '<LstmParallel>' 88 | 89 | # add the option to set the initial value of the forget-gate bias; the two fixed fields below are the learning-rate coefficient and the gradient-clipping limit (tag names as in common Eesen protos) 90 | lstm_comm = ' <ParamRange> ' + param_range + ' <LearnRateCoef> 1.0 <MaxGrad> 50.0' 91 | if arguments.has_key('fgate_bias_init'): 92 | lstm_comm = lstm_comm + ' <FgateBias> ' + arguments['fgate_bias_init'] 93 | 94 | # add the option to specify projection layers 95 | if arguments.has_key('projection_dim'): 96 | proj_dim = int(arguments['projection_dim']) 97 | else: 98 | proj_dim = 0 99 | 100 | # add the option to reduce the dimensionality of the input features 101 | if arguments.has_key('input_dim'): 102 | input_dim = int(arguments['input_dim']) 103 | else: 104 | input_dim = 0 105 | 106 | 107 | # pre-amble 108 | print '<Nnet>' 109 | 110 | # optional dimensionality reduction layer 111 | if input_dim > 0: 112 | print '<AffineTransform> <InputDim> ' + str(input_feat_dim) + ' <OutputDim> ' + str(input_dim) + ' <ParamRange> ' + param_range 113 | input_feat_dim = input_dim 114 | 115 | # the first layer takes input features 116 | print model_type + ' <InputDim> ' + str(input_feat_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm 117 | # the remaining LSTM layers 118 | for n in range(1, lstm_layer_num): 119 | if proj_dim > 0: 120 | print '<AffineTransform> <InputDim> ' + str(actual_cell_dim) + ' <OutputDim> ' + str(proj_dim) + ' <ParamRange> ' + param_range 121 | print model_type + ' <InputDim> ' + str(proj_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm 122 | else: 123 | print model_type + ' <InputDim> ' + str(actual_cell_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm 124 | 125 | # the final affine-transform and softmax layer 126 | print '<AffineTransform> <InputDim> ' + str(actual_cell_dim) + ' <OutputDim> ' + str(target_num) + ' <ParamRange> ' + param_range 127 | print '<Softmax> <InputDim> ' + str(target_num) + ' <OutputDim> ' + str(target_num) 128 | print '</Nnet>' 129 | -------------------------------------------------------------------------------- /steps/make_fbank_pitch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, 4 | # PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) 5 | # Apache 2.0 6 | # Combine filterbank and pitch features together 7 | # Note: This file is based on make_fbank.sh and make_pitch_kaldi.sh 8 | 9 | # Begin configuration section.
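# (All of the defaults below can be overridden from the command line once parse_options.sh has been
#  sourced; e.g., a hypothetical invocation with made-up paths:
#    steps/make_fbank_pitch.sh --nj 8 --compress false data/train exp/make_fbank/train fbank_pitch )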
10 | nj=4 11 | cmd=run.pl 12 | fbank_config=conf/fbank.conf 13 | pitch_config=conf/pitch.conf 14 | pitch_postprocess_config= 15 | paste_length_tolerance=2 16 | compress=true 17 | # End configuration section. 18 | 19 | echo "$0 $@" # Print the command line for logging 20 | 21 | if [ -f path.sh ]; then . ./path.sh; fi 22 | . parse_options.sh || exit 1; 23 | 24 | if [ $# != 3 ]; then 25 | echo "usage: make_fbank_pitch.sh [options] <data-dir> <log-dir> <fbank-pitch-dir>"; 26 | echo "options: " 27 | echo " --fbank-config <config-file> # config passed to compute-fbank-feats " 28 | echo " --pitch-config <config-file> # config passed to compute-kaldi-pitch-feats " 29 | echo " --pitch-postprocess-config <config-file> # config passed to process-kaldi-pitch-feats " 30 | echo " --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats" 31 | echo " --nj <nj> # number of parallel jobs" 32 | echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." 33 | exit 1; 34 | fi 35 | 36 | data=$1 37 | logdir=$2 38 | fbank_pitch_dir=$3 39 | 40 | 41 | # make $fbank_pitch_dir an absolute pathname. 42 | fbank_pitch_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbank_pitch_dir ${PWD}` 43 | 44 | # use "name" as part of name of the archive. 45 | name=`basename $data` 46 | 47 | mkdir -p $fbank_pitch_dir || exit 1; 48 | mkdir -p $logdir || exit 1; 49 | 50 | if [ -f $data/feats.scp ]; then 51 | mkdir -p $data/.backup 52 | echo "$0: moving $data/feats.scp to $data/.backup" 53 | mv $data/feats.scp $data/.backup 54 | fi 55 | 56 | scp=$data/wav.scp 57 | 58 | required="$scp $fbank_config $pitch_config" 59 | 60 | for f in $required; do 61 | if [ ! -f $f ]; then 62 | echo "make_fbank_pitch.sh: no such file $f" 63 | exit 1; 64 | fi 65 | done 66 | 67 | if [ ! -z "$pitch_postprocess_config" ]; then 68 | postprocess_config_opt="--config=$pitch_postprocess_config"; 69 | else 70 | postprocess_config_opt= 71 | fi 72 | 73 | utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; 74 | 75 | if [ -f $data/spk2warp ]; then 76 | echo "$0 [info]: using VTLN warp factors from $data/spk2warp" 77 | vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk" 78 | elif [ -f $data/utt2warp ]; then 79 | echo "$0 [info]: using VTLN warp factors from $data/utt2warp" 80 | vtln_opts="--vtln-map=ark:$data/utt2warp" 81 | fi 82 | 83 | for n in $(seq $nj); do 84 | # the next command does nothing unless $fbank_pitch_dir/storage/ exists, see 85 | # utils/create_data_link.pl for more info. 86 | utils/create_data_link.pl $fbank_pitch_dir/raw_fbank_pitch_$name.$n.ark 87 | done 88 | 89 | if [ -f $data/segments ]; then 90 | echo "$0 [info]: segments file exists: using that."
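# (Each line of $data/segments has the form "<utt-id> <recording-id> <start-second> <end-second>",
#  so extract-segments below cuts the utterances out of the recordings before the filterbank
#  and pitch features are computed and pasted together.)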
91 | split_segments="" 92 | for n in $(seq $nj); do 93 | split_segments="$split_segments $logdir/segments.$n" 94 | done 95 | 96 | utils/split_scp.pl $data/segments $split_segments || exit 1; 97 | rm $logdir/.error 2>/dev/null 98 | 99 | fbank_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- |" 100 | pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" 101 | 102 | $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \ 103 | paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \ 104 | copy-feats --compress=$compress ark:- \ 105 | ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \ 106 | || exit 1; 107 | 108 | else 109 | echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 110 | split_scps="" 111 | for n in $(seq $nj); do 112 | split_scps="$split_scps $logdir/wav.$n.scp" 113 | done 114 | 115 | utils/split_scp.pl $scp $split_scps || exit 1; 116 | 117 | fbank_feats="ark:compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- |" 118 | pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" 119 | 120 | $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \ 121 | paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \ 122 | copy-feats --compress=$compress ark:- \ 123 | ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \ 124 | || exit 1; 125 | 126 | fi 127 | 128 | 129 | if [ -f $logdir/.error.$name ]; then 130 | echo "Error producing fbank & pitch features for $name:" 131 | tail $logdir/make_fbank_pitch_${name}.1.log 132 | exit 1; 133 | fi 134 | 135 | # concatenate the .scp files together. 136 | for n in $(seq $nj); do 137 | cat $fbank_pitch_dir/raw_fbank_pitch_$name.$n.scp || exit 1; 138 | done > $data/feats.scp 139 | 140 | rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null 141 | 142 | nf=`cat $data/feats.scp | wc -l` 143 | nu=`cat $data/utt2spk | wc -l` 144 | if [ $nf -ne $nu ]; then 145 | echo "It seems not all of the feature files were successfully processed ($nf != $nu);" 146 | echo "consider using utils/fix_data_dir.sh $data" 147 | fi 148 | 149 | echo "Succeeded creating filterbank & pitch features for $name" 150 | -------------------------------------------------------------------------------- /utils/fix_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script makes sure that only the segments present in 4 | # all of "feats.scp", "wav.scp" [if present], segments [if present] 5 | # text, and utt2spk are present in any of them. 6 | # It puts the original contents of data-dir into 7 | # data-dir/.backup 8 | 9 | if [ $# != 1 ]; then 10 | echo "Usage: fix_data_dir.sh data-dir" 11 | exit 1 12 | fi 13 | 14 | data=$1 15 | mkdir -p $data/.backup 16 | 17 | [ ! -d $data ] && echo "$0: no such directory $data" && exit 1; 18 | 19 | [ ! 
-f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; 20 | 21 | tmpdir=$(mktemp -d); 22 | trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM 23 | 24 | export LC_ALL=C 25 | 26 | 27 | function check_sorted { 28 | file=$1 29 | sort -k1,1 -u <$file >$file.tmp 30 | if ! cmp -s $file $file.tmp; then 31 | echo "$0: file $1 is not in sorted order or not unique, sorting it" 32 | mv $file.tmp $file 33 | else 34 | rm $file.tmp 35 | fi 36 | } 37 | 38 | for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp reco2file_and_channel spk2gender utt2lang; do 39 | if [ -f $data/$x ]; then 40 | cp $data/$x $data/.backup/$x 41 | check_sorted $data/$x 42 | fi 43 | done 44 | 45 | 46 | function filter_file { 47 | filter=$1 48 | file_to_filter=$2 49 | cp $file_to_filter ${file_to_filter}.tmp 50 | utils/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter 51 | if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then 52 | length1=`cat ${file_to_filter}.tmp | wc -l` 53 | length2=`cat ${file_to_filter} | wc -l` 54 | if [ $length1 -ne $length2 ]; then 55 | echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." 56 | fi 57 | fi 58 | rm $file_to_filter.tmp 59 | } 60 | 61 | function filter_recordings { 62 | # We call this once before the stage when we filter on utterance-id, and once 63 | # after. 64 | 65 | if [ -f $data/segments ]; then 66 | # We have a segments file -> we need to filter this and the file wav.scp, and 67 | # reco2file_and_utt, if it exists, to make sure they have the same list of 68 | # recording-ids. 69 | 70 | if [ ! -f $data/wav.scp ]; then 71 | echo "$0: $data/segments exists but not $data/wav.scp" 72 | exit 1; 73 | fi 74 | awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings 75 | n1=`cat $tmpdir/recordings | wc -l` 76 | [ ! -s $tmpdir/recordings ] && \ 77 | echo "Empty list of recordings (bad file $data/segments)?" && exit 1; 78 | utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp 79 | mv $tmpdir/recordings.tmp $tmpdir/recordings 80 | 81 | 82 | cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments 83 | filter_file $tmpdir/recordings $data/segments 84 | cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments 85 | rm $data/segments.tmp 86 | 87 | filter_file $tmpdir/recordings $data/wav.scp 88 | [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel 89 | 90 | fi 91 | } 92 | 93 | function filter_speakers { 94 | # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... 95 | utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt 96 | 97 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 98 | for s in cmvn.scp spk2gender; do 99 | f=$data/$s 100 | if [ -f $f ]; then 101 | filter_file $f $tmpdir/speakers 102 | fi 103 | done 104 | 105 | filter_file $tmpdir/speakers $data/spk2utt 106 | utils/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk 107 | 108 | for s in cmvn.scp spk2gender; do 109 | f=$data/$s 110 | if [ -f $f ]; then 111 | filter_file $tmpdir/speakers $f 112 | fi 113 | done 114 | } 115 | 116 | function filter_utts { 117 | cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts 118 | 119 | # Do a check. 120 | 121 | ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ 122 | echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; 123 | 124 | ! 
cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ 125 | echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ 126 | echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; 127 | 128 | ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ 129 | echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; 130 | 131 | 132 | maybe_wav= 133 | [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. 134 | for x in feats.scp text segments utt2lang $maybe_wav; do 135 | if [ -f $data/$x ]; then 136 | utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp 137 | mv $tmpdir/utts.tmp $tmpdir/utts 138 | fi 139 | done 140 | [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ 141 | rm $tmpdir/utts && exit 1; 142 | 143 | 144 | if [ -f $data/utt2spk ]; then 145 | new_nutts=$(cat $tmpdir/utts | wc -l) 146 | old_nutts=$(cat $data/utt2spk | wc -l) 147 | if [ $new_nutts -ne $old_nutts ]; then 148 | echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" 149 | else 150 | echo "fix_data_dir.sh: kept all $old_nutts utterances." 151 | fi 152 | fi 153 | 154 | for x in utt2spk feats.scp vad.scp text segments utt2lang $maybe_wav; do 155 | if [ -f $data/$x ]; then 156 | cp $data/$x $data/.backup/$x 157 | if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then 158 | utils/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x 159 | fi 160 | fi 161 | done 162 | 163 | } 164 | 165 | filter_recordings 166 | filter_speakers 167 | filter_utts 168 | filter_speakers 169 | filter_recordings 170 | 171 | 172 | 173 | utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt 174 | 175 | echo "fix_data_dir.sh: old files are kept in $data/.backup" 176 | -------------------------------------------------------------------------------- /utils/run_rocks.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # In general, doing 4 | # run.pl some.log a b c is like running the command a b c in 5 | # the bash shell, and putting the standard error and output into some.log. 6 | # To run parallel jobs (backgrounded on the host machine), you can do (e.g.) 7 | # run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB 8 | # and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier]. 9 | # If any of the jobs fails, this script will fail. 10 | 11 | # A typical example is: 12 | # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz 13 | # and run.pl will run something like: 14 | # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log 15 | # 16 | # Basically it takes the command-line arguments, quotes them 17 | # as necessary to preserve spaces, and evaluates them with bash. 18 | # In addition it puts the command line at the top of the log, and 19 | # the start and end times of the command at the beginning and end. 20 | # The reason why this is useful is so that we can create a different 21 | # version of this program that uses a queueing system instead. 22 | 23 | @ARGV < 2 && die "usage: run.pl log-file command-line arguments..."; 24 | 25 | $jobstart=1; 26 | $jobend=1; 27 | $qsub_opts=""; # These will be ignored. 28 | 29 | # First parse an option like JOB=1:4, and any 30 | # options that would normally be given to 31 | # queue.pl, which we will just discard. 
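# For example, an invocation such as
#   run.pl -pe smp 5 JOB=1:2 exp/foo/log/run.JOB.log echo JOB
# (log paths made up) would discard "-pe smp 5" with a warning, then write each job's
# command to ./cmds/cmd.N and wait, polling every 13 s, for ./dones/done.N to appear
# (created, presumably, by an external runner on the cluster).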
32 | 33 | if (@ARGV > 0) { 34 | while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options 35 | # that would normally go to qsub, but which will be ignored here. 36 | $switch = shift @ARGV; 37 | if ($switch eq "-V") { 38 | $qsub_opts .= "-V "; 39 | } else { 40 | $option = shift @ARGV; 41 | if ($switch eq "-sync" && $option =~ m/^[yY]/) { 42 | $qsub_opts .= "-sync "; # Note: in the 43 | # corresponding code in queue.pl it says instead, just "$sync = 1;". 44 | } 45 | $qsub_opts .= "$switch $option "; 46 | if ($switch eq "-pe") { # e.g. -pe smp 5 47 | $option2 = shift @ARGV; 48 | $qsub_opts .= "$option2 "; 49 | } 50 | } 51 | } 52 | if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 53 | $jobname = $1; 54 | $jobstart = $2; 55 | $jobend = $3; 56 | shift; 57 | if ($jobstart > $jobend) { 58 | die "run.pl: invalid job range $ARGV[0]"; 59 | } 60 | } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. 61 | $jobname = $1; 62 | $jobstart = $2; 63 | $jobend = $2; 64 | shift; 65 | } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { 66 | print STDERR "Warning: suspicious first argument to run.pl: $ARGV[0]\n"; 67 | } 68 | } 69 | 70 | if ($qsub_opts ne "") { 71 | print STDERR "Warning: run.pl ignoring options \"$qsub_opts\"\n"; 72 | } 73 | 74 | $logfile = shift @ARGV; 75 | 76 | if (defined $jobname && $logfile !~ m/$jobname/ && 77 | $jobend > $jobstart) { 78 | print STDERR "run.pl: you are trying to run a parallel job but " 79 | . "you are putting the output into just one log file ($logfile)\n"; 80 | exit(1); 81 | } 82 | 83 | $cmd = ""; 84 | 85 | foreach $x (@ARGV) { 86 | if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } 87 | elsif ($x =~ m:\":) { $cmd .= "'$x' "; } 88 | else { $cmd .= "\"$x\" "; } 89 | } 90 | 91 | 92 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 93 | $childpid = fork(); 94 | if (!defined $childpid) { die "Error forking in run.pl (writing to $logfile)"; } 95 | if ($childpid == 0) { # We're in the child... this branch 96 | # executes the job and returns (possibly with an error status). 97 | if (defined $jobname) { 98 | $cmd =~ s/$jobname/$jobid/g; 99 | $logfile =~ s/$jobname/$jobid/g; 100 | } 101 | system("mkdir -p `dirname $logfile` 2>/dev/null"); 102 | open(F, ">$logfile") || die "Error opening log file $logfile"; 103 | print F "# " . $cmd . "\n"; 104 | print F "# Started at " . `date`; 105 | $starttime = `date +'%s'`; 106 | print F "#\n"; 107 | close(F); 108 | 109 | $cmdid = $jobid - $jobstart + 1; 110 | $cmdfile = "./cmds/cmd." . $cmdid; 111 | $donefile = "./dones/done." . $cmdid; 112 | unlink $donefile; 113 | open(F, ">$cmdfile") || die "Error opening cmd file $cmdfile"; 114 | # print F "( " . $cmd . ") 2>>$logfile >> $logfile"; 115 | print F "" . $cmd . " 2>> $logfile\n"; 116 | close(F); 117 | # Pipe into bash.. make sure we're not using any other shell. 118 | # open(B, "|bash") || die "Error opening shell command"; 119 | # print B "( " . $cmd . ") 2>>$logfile >> $logfile"; 120 | # close(B); # If there was an error, exit status is in $? 121 | while (1) { 122 | if (-e $donefile) { 123 | last; 124 | } else { 125 | sleep(13); 126 | } 127 | } 128 | $ret = $?; 129 | 130 | $endtime = `date +'%s'`; 131 | open(F, ">>$logfile") || die "Error opening log file $logfile (again)"; 132 | $enddate = `date`; 133 | chop $enddate; 134 | print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n"; 135 | print F "# Ended (code $ret) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n"; 136 | close(F); 137 | exit($ret == 0 ? 
0 : 1); 138 | } 139 | } 140 | 141 | $ret = 0; 142 | $numfail = 0; 143 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 144 | $r = wait(); 145 | if ($r == -1) { die "Error waiting for child process"; } # should never happen. 146 | if ($? != 0) { $numfail++; $ret = 1; } # The child process failed. 147 | } 148 | 149 | if ($ret != 0) { 150 | $njobs = $jobend - $jobstart + 1; 151 | if ($njobs == 1) { 152 | if (defined $jobname) { 153 | $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with 154 | # that job. 155 | } 156 | print STDERR "run.pl: job failed, log is in $logfile\n"; 157 | if ($logfile =~ m/JOB/) { 158 | print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script."; 159 | } 160 | } 161 | else { 162 | $logfile =~ s/$jobname/*/g; 163 | print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n"; 164 | } 165 | } 166 | 167 | 168 | exit ($ret); 169 | -------------------------------------------------------------------------------- /utils/make_lexicon_fst.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). 20 | 21 | $pron_probs = 0; 22 | 23 | if ($ARGV[0] eq "--pron-probs") { 24 | $pron_probs = 1; 25 | shift @ARGV; 26 | } 27 | 28 | if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { 29 | print STDERR 30 | "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt 31 | Creates a lexicon FST that transduces phones to words, and may allow optional silence. 32 | Note: ordinarily, each line of lexicon.txt is: word phone1 phone2 ... phoneN; if the --pron-probs option is 33 | used, each line is: word pronunciation-probability phone1 phone2 ... phoneN. The probability 'prob' will 34 | typically be between zero and one, and note that it's generally helpful to normalize so the largest one 35 | for each word is 1.0, but this is your responsibility. The silence disambiguation symbol, e.g. something 36 | like #5, is used only when creating a lexicon with disambiguation symbols, e.g. 
L_disambig.fst, and was 37 | introduced to fix a particular case of non-determinism of decoding graphs.\n"; 38 | exit(1); 39 | } 40 | 41 | $lexfn = shift @ARGV; 42 | if (@ARGV == 0) { 43 | $silprob = 0.0; 44 | } elsif (@ARGV == 2) { 45 | ($silprob,$silphone) = @ARGV; 46 | } else { 47 | ($silprob,$silphone,$sildisambig) = @ARGV; 48 | } 49 | if ($silprob != 0.0) { 50 | $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; 51 | $silcost = -log($silprob); 52 | $nosilcost = -log(1.0 - $silprob); 53 | } 54 | 55 | 56 | open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; 57 | 58 | 59 | sub is_sil { 60 | # Return true (1) if provided with a phone-sequence 61 | # that means silence. 62 | # @_ is the parameters of the function 63 | # This function returns true if @_ equals ( $silphone ) 64 | # or something of the form ( "#0", $silphone, "#1" ) 65 | # where the "#0" and "#1" are disambiguation symbols. 66 | return ( @_ == 1 && $_[0] eq $silphone || 67 | (@_ == 3 && $_[1] eq $silphone && 68 | $_[0] =~ m/^\#\d+$/ && 69 | $_[2] =~ m/^\#\d+$/)); 70 | } 71 | 72 | if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. 73 | $loopstate = 0; 74 | $nextstate = 1; # next unallocated state. 75 | while (<L>) { 76 | @A = split(" ", $_); 77 | @A == 0 && die "Empty lexicon line."; 78 | $w = shift @A; 79 | if (! $pron_probs) { 80 | $pron_cost = 0.0; 81 | } else { 82 | $pron_prob = shift @A; 83 | if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { 84 | die "Bad pronunciation probability in line $_"; 85 | } 86 | $pron_cost = -log($pron_prob); 87 | } 88 | if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } 89 | 90 | $s = $loopstate; 91 | $word_or_eps = $w; 92 | while (@A > 0) { 93 | $p = shift @A; 94 | if (@A > 0) { 95 | $ns = $nextstate++; 96 | } else { 97 | $ns = $loopstate; 98 | } 99 | print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; 100 | $word_or_eps = "<eps>"; 101 | $pron_cost_string = ""; # so we only print it on the first arc of the word. 102 | $s = $ns; 103 | } 104 | } 105 | print "$loopstate\t0\n"; # final-cost. 106 | } else { # have silence probs. 107 | $startstate = 0; 108 | $loopstate = 1; 109 | $silstate = 2; # state from where we go to loopstate after emitting silence. 110 | print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence. 111 | if (!defined $sildisambig) { 112 | print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence. 113 | print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost. 114 | $nextstate = 3; 115 | } else { 116 | $disambigstate = 3; 117 | $nextstate = 4; 118 | print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence. 119 | print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost. 120 | print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol. 121 | } 122 | while (<L>) { 123 | @A = split(" ", $_); 124 | $w = shift @A; 125 | if (! $pron_probs) { 126 | $pron_cost = 0.0; 127 | } else { 128 | $pron_prob = shift @A; 129 | if (!
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { 130 | die "Bad pronunciation probability in line $_"; 131 | } 132 | $pron_cost = -log($pron_prob); 133 | } 134 | if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } 135 | $s = $loopstate; 136 | $word_or_eps = $w; 137 | while (@A > 0) { 138 | $p = shift @A; 139 | if (@A > 0) { 140 | $ns = $nextstate++; 141 | print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; 142 | $word_or_eps = "<eps>"; 143 | $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. 144 | $s = $ns; 145 | } else { 146 | if (!is_sil($p)) { 147 | # This is non-deterministic but relatively compact, 148 | # and avoids epsilons. 149 | $local_nosilcost = $nosilcost + $pron_cost; 150 | $local_silcost = $silcost + $pron_cost; 151 | print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; 152 | print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; 153 | } else { 154 | # no point putting opt-sil after silence word. 155 | print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; 156 | } 157 | } 158 | } 159 | } 160 | print "$loopstate\t0\n"; # final-cost. 161 | } 162 | -------------------------------------------------------------------------------- /utils/subset_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2012-2013 Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0 5 | 6 | 7 | # This script operates on a data directory, such as in data/train/. 8 | # See http://kaldi.sourceforge.net/data_prep.html#data_prep_data 9 | # for what these directories contain. 10 | 11 | # This script creates a subset of that data, consisting of some specified 12 | # number of utterances. (The selected utterances are distributed evenly 13 | # throughout the file, by the program ./subset_scp.pl). 14 | 15 | # There are seven options, none compatible with any other. 16 | 17 | # If you give the --per-spk option, it will attempt to select the supplied 18 | # number of utterances for each speaker (typically you would supply a much 19 | # smaller number in this case). 20 | 21 | # If you give the --speakers option, it selects a subset of n randomly 22 | # selected speakers. 23 | 24 | # If you give the --shortest option, it will give you the n shortest utterances. 25 | 26 | # If you give the --first option, it will just give you the n first utterances. 27 | 28 | # If you give the --last option, it will just give you the n last utterances. 29 | 30 | # If you give the --spk-list option, it reads the speakers to keep from <spk-list-file>; the --utt-list option 31 | # similarly reads the utterances to keep. (note, in these cases there is no <num-utt> positional parameter; see usage message.)
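# (Two illustrative invocations, with made-up directory names:
#    utils/subset_data_dir.sh --per-spk data/train 10 data/train_10utt_per_spk
#    utils/subset_data_dir.sh data/train 1000 data/train_1k
#  The second form picks 1000 utterances spread evenly through the list, via subset_scp.pl.)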
32 | 33 | 34 | shortest=false 35 | perspk=false 36 | first_opt="" 37 | speakers=false 38 | spk_list_specified=false 39 | utt_list_specified=false 40 | 41 | if [ "$1" == "--per-spk" ]; then 42 | perspk=true; 43 | shift; 44 | elif [ "$1" == "--shortest" ]; then 45 | shortest=true; 46 | shift; 47 | elif [ "$1" == "--first" ]; then 48 | first_opt="--first"; 49 | shift; 50 | elif [ "$1" == "--speakers" ]; then 51 | speakers=true 52 | shift; 53 | elif [ "$1" == "--last" ]; then 54 | first_opt="--last"; 55 | shift; 56 | elif [ "$1" == "--spk-list" ]; then 57 | spk_list_specified=true 58 | shift; 59 | elif [ "$1" == "--utt-list" ]; then 60 | utt_list_specified=true 61 | shift; 62 | fi 63 | 64 | 65 | 66 | 67 | if [ $# != 3 ]; then 68 | echo "Usage: " 69 | echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>" 70 | echo " subset_data_dir.sh [--spk-list <spk-list-file>] <srcdir> <destdir>" 71 | echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>" 72 | echo "By default, randomly selects <num-utt> utterances from the data directory." 73 | echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances" 74 | echo "With --per-spk, selects <num-utt> utterances per speaker, if available." 75 | echo "With --first, selects the first <num-utt> utterances" 76 | echo "With --last, selects the last <num-utt> utterances" 77 | echo "With --shortest, selects the <num-utt> shortest utterances." 78 | echo "With --spk-list or --utt-list, reads the speakers or utterances to keep from the given file" 79 | exit 1; 80 | fi 81 | 82 | if $spk_list_specified; then 83 | spk_list=$1 84 | srcdir=$2 85 | destdir=$3 86 | elif $utt_list_specified; then 87 | utt_list=$1 88 | srcdir=$2 89 | destdir=$3 90 | else 91 | srcdir=$1 92 | numutt=$2 93 | destdir=$3 94 | fi 95 | 96 | 97 | export LC_ALL=C 98 | 99 | if [ ! -f $srcdir/utt2spk ]; then 100 | echo "subset_data_dir.sh: no such file $srcdir/utt2spk" 101 | exit 1; 102 | fi 103 | 104 | function do_filtering { 105 | # assumes the utt2spk and spk2utt files already exist. 106 | [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp 107 | [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp 108 | [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang 109 | [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp 110 | [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp 111 | [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp 112 | [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text 113 | [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender 114 | [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp 115 | if [ -f $srcdir/segments ]; then 116 | utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments 117 | awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings. 118 | # The next line overrides the wav.scp created above, which would be incorrect here: when a segments file exists, wav.scp is indexed by recording-id, not utterance-id.
119 | [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp 120 | [ -f $srcdir/reco2file_and_channel ] && \ 121 | utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel 122 | 123 | # Filter the STM file for proper sclite scoring (this will also remove the comments lines) 124 | [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm 125 | 126 | rm $destdir/reco 127 | fi 128 | srcutts=`cat $srcdir/utt2spk | wc -l` 129 | destutts=`cat $destdir/utt2spk | wc -l` 130 | echo "$0: reducing #utt from $srcutts to $destutts" 131 | } 132 | 133 | 134 | if $spk_list_specified; then 135 | mkdir -p $destdir 136 | utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; 137 | utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; 138 | do_filtering; # bash function. 139 | exit 0; 140 | elif $utt_list_specified; then 141 | mkdir -p $destdir 142 | utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; 143 | utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; 144 | do_filtering; # bash function. 145 | exit 0; 146 | elif $speakers; then 147 | mkdir -p $destdir 148 | utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | \ 149 | sort > $destdir/spk2utt 150 | utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk 151 | do_filtering; # bash function. 152 | exit 0; 153 | elif $perspk; then 154 | mkdir -p $destdir 155 | awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; } 156 | for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); } 157 | printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt 158 | utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk 159 | do_filtering; # bash function. 160 | exit 0; 161 | else 162 | if [ $numutt -gt `cat $srcdir/feats.scp | wc -l` ]; then 163 | echo "subset_data_dir.sh: cannot subset to more utterances than you originally had." 164 | exit 1; 165 | fi 166 | mkdir -p $destdir || exit 1; 167 | 168 | ## scripting note: $shortest evaluates to true or false 169 | ## so this becomes the command true or false. 170 | if $shortest; then 171 | # select the n shortest utterances. 172 | . ./path.sh 173 | [ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exist." && exit 1; 174 | feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; 175 | sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist 176 | utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk 177 | rm $destdir/tmp.uttlist $destdir/tmp.len 178 | else 179 | utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; 180 | fi 181 | utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt 182 | do_filtering; 183 | exit 0; 184 | fi 185 | -------------------------------------------------------------------------------- /steps/train_ctc_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | # Apache 2.0 5 | 6 | # This script trains acoustic models based on CTC and using SGD. 
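# (Expected inputs, as checked below: the two data dirs must contain feats.scp and cmvn.scp, and the
#  experiment dir must already hold labels.tr.gz, labels.cv.gz and nnet.proto. A sketch of a typical
#  call, with made-up paths:
#    steps/train_ctc_parallel.sh --add-deltas true --num-sequence 10 --learn-rate 4e-5 \
#      data/train_tr95 data/train_cv05 exp/train_phn_l4_c320 )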
7 | 8 | ## Begin configuration section 9 | train_tool=train-ctc-parallel # the command for training; by default, we use the 10 | # parallel version which processes multiple utterances at the same time 11 | 12 | # configs for multiple sequences 13 | num_sequence=5 # during training, how many utterances are processed in parallel 14 | valid_num_sequence=10 # number of parallel sequences in validation 15 | frame_num_limit=12500 # the number of frames to be processed at a time in training; this config acts 16 | # to prevent running out of GPU memory when num_sequence very long sequences are processed; a batch 17 | # is closed as soon as either num_sequence or frame_num_limit is reached. 18 | 19 | # learning rate 20 | learn_rate=0.0001 # learning rate 21 | momentum=0.9 # momentum 22 | 23 | # learning rate schedule 24 | max_iters=25 # max number of iterations 25 | min_iters= # min number of iterations 26 | start_epoch_num=1 # start from which epoch, used for resuming training from a break point 27 | 28 | start_halving_inc=0.5 # start halving learning rates when the accuracy improvement falls below this amount 29 | end_halving_inc=0.1 # terminate training when the accuracy improvement falls below this amount 30 | halving_factor=0.5 # learning rate decay factor 31 | halving_after_epoch=1 # halving becomes enabled after this many epochs 32 | 33 | # logging 34 | report_step=1 # during training, the step (number of utterances) of reporting objective and accuracy 35 | verbose=1 36 | 37 | # feature configs 38 | sort_by_len=true # whether to sort the utterances by their lengths 39 | min_len=0 # minimum length of utterances to consider 40 | 41 | norm_vars=true # whether to apply variance normalization when we do cmn 42 | add_deltas=true # whether to add deltas 43 | copy_feats=true # whether to copy features into a local dir (on the GPU machine) 44 | feats_tmpdir= # the tmp dir to save the copied features, when copy_feats=true 45 | 46 | # status of learning rate schedule; useful when training is resumed from a break point 47 | cvacc=0 48 | halving=0 49 | 50 | ## End configuration section 51 | 52 | echo "$0 $@" # Print the command line for logging 53 | 54 | [ -f path.sh ] && . ./path.sh; 55 | 56 | . utils/parse_options.sh || exit 1; 57 | 58 | if [ $# != 3 ]; then 59 | echo "Usage: $0 <train-data-dir> <cv-data-dir> <exp-dir>" 60 | echo " e.g.: $0 data/train_tr data/train_cv exp/train_phn" 61 | exit 1; 62 | fi 63 | 64 | data_tr=$1 65 | data_cv=$2 66 | dir=$3 67 | 68 | mkdir -p $dir/log $dir/nnet 69 | 70 | for f in $data_tr/feats.scp $data_cv/feats.scp $dir/labels.tr.gz $dir/labels.cv.gz $dir/nnet.proto; do 71 | [ !
-f $f ] && echo "train_ctc_parallel.sh: no such file $f" && exit 1; 72 | done 73 | 74 | ## Read the training status for resuming 75 | [ -f $dir/.epoch ] && start_epoch_num=`cat $dir/.epoch 2>/dev/null` 76 | [ -f $dir/.cvacc ] && cvacc=`cat $dir/.cvacc 2>/dev/null` 77 | [ -f $dir/.halving ] && halving=`cat $dir/.halving 2>/dev/null` 78 | [ -f $dir/.lrate ] && learn_rate=`cat $dir/.lrate 2>/dev/null` 79 | 80 | ## Set up features 81 | echo $norm_vars > $dir/norm_vars # output feature configs which will be used in decoding 82 | echo $add_deltas > $dir/add_deltas 83 | 84 | if $sort_by_len; then 85 | feat-to-len scp:$data_tr/feats.scp ark,t:- | awk '{print $2}' > $dir/len.tmp || exit 1; 86 | paste -d " " $data_tr/feats.scp $dir/len.tmp | sort -k3 -n - | awk -v m=$min_len '{ if ($3 >= m) {print $1 " " $2} }' > $dir/train.scp || exit 1; 87 | feat-to-len scp:$data_cv/feats.scp ark,t:- | awk '{print $2}' > $dir/len.tmp || exit 1; 88 | paste -d " " $data_cv/feats.scp $dir/len.tmp | sort -k3 -n - | awk '{print $1 " " $2}' > $dir/cv.scp || exit 1; 89 | rm -f $dir/len.tmp 90 | else 91 | cat $data_tr/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp 92 | cat $data_cv/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/cv.scp 93 | fi 94 | 95 | feats_tr="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_tr/utt2spk scp:$data_tr/cmvn.scp scp:$dir/train.scp ark:- |" 96 | feats_cv="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp scp:$dir/cv.scp ark:- |" 97 | 98 | # Save the features to a local dir on the GPU machine. On Linux, this usually points to /tmp 99 | if $copy_feats; then 100 | tmpdir=$(mktemp -d $feats_tmpdir); 101 | copy-feats "$feats_tr" ark,scp:$tmpdir/train.ark,$dir/train_local.scp || exit 1; 102 | copy-feats "$feats_cv" ark,scp:$tmpdir/cv.ark,$dir/cv_local.scp || exit 1; 103 | feats_tr="ark,s,cs:copy-feats scp:$dir/train_local.scp ark:- |" 104 | feats_cv="ark,s,cs:copy-feats scp:$dir/cv_local.scp ark:- |" 105 | trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT 106 | fi 107 | 108 | if $add_deltas; then 109 | feats_tr="$feats_tr add-deltas ark:- ark:- |" 110 | feats_cv="$feats_cv add-deltas ark:- ark:- |" 111 | fi 112 | ## End of feature setup 113 | 114 | ## Set up labels 115 | labels_tr="ark:gunzip -c $dir/labels.tr.gz|" 116 | labels_cv="ark:gunzip -c $dir/labels.cv.gz|" 117 | # Compute the occurrence counts of labels in the label sequences. These counts will be used to derive prior probabilities of 118 | # the labels. 119 | gunzip -c $dir/labels.tr.gz | awk '{line=$0; gsub(" "," 0 ",line); print line " 0";}' | \ 120 | analyze-counts --verbose=1 --binary=false ark:- $dir/label.counts >& $dir/log/compute_label_counts.log || exit 1 121 | ## 122 | 123 | # Initialize model parameters 124 | if [ ! -f $dir/nnet/nnet.iter0 ]; then 125 | echo "Initializing model as $dir/nnet/nnet.iter0" 126 | net-initialize --binary=true $dir/nnet.proto $dir/nnet/nnet.iter0 >& $dir/log/initialize_model.log || exit 1; 127 | fi 128 | 129 | cur_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 130 | echo "TRAINING STARTS [$cur_time]" 131 | echo "[NOTE] TOKEN_ACCURACY refers to token accuracy, i.e., (1.0 - token_error_rate)." 132 | for iter in $(seq $start_epoch_num $max_iters); do 133 | cvacc_prev=$cvacc 134 | echo -n "EPOCH $iter RUNNING ... 
" 135 | 136 | # train 137 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 138 | --learn-rate=$learn_rate --momentum=$momentum \ 139 | --verbose=$verbose \ 140 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} \ 141 | >& $dir/log/tr.iter$iter.log || exit 1; 142 | 143 | end_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 144 | echo -n "ENDS [$end_time]: " 145 | 146 | tracc=$(cat $dir/log/tr.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 147 | echo -n "lrate $(printf "%.6g" $learn_rate), TRAIN ACCURACY $(printf "%.4f" $tracc)%, " 148 | 149 | # validation 150 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 151 | --cross-validate=true \ 152 | --learn-rate=$learn_rate \ 153 | --momentum=$momentum \ 154 | --verbose=$verbose \ 155 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} \ 156 | >& $dir/log/cv.iter$iter.log || exit 1; 157 | 158 | cvacc=$(cat $dir/log/cv.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 159 | echo "VALID ACCURACY $(printf "%.4f" $cvacc)%" 160 | 161 | # stopping criterion 162 | rel_impr=$(bc <<< "($cvacc-$cvacc_prev)") 163 | if [ 1 == $halving -a 1 == $(bc <<< "$rel_impr < $end_halving_inc") ]; then 164 | if [[ "$min_iters" != "" ]]; then 165 | if [ $min_iters -gt $iter ]; then 166 | echo we were supposed to finish, but we continue as min_iters : $min_iters 167 | continue 168 | fi 169 | fi 170 | echo finished, too small rel. improvement $rel_impr 171 | break 172 | fi 173 | 174 | # start annealing when improvement is low 175 | if [ 1 == $(bc <<< "$rel_impr < $start_halving_inc") ]; then 176 | if [ $iter -gt $halving_after_epoch ]; then 177 | halving=1 178 | fi 179 | fi 180 | 181 | # do annealing 182 | if [ 1 == $halving ]; then 183 | learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") 184 | fi 185 | # save the status 186 | echo $[$iter+1] > $dir/.epoch # +1 because we save the epoch to start from 187 | echo $cvacc > $dir/.cvacc 188 | echo $halving > $dir/.halving 189 | echo $learn_rate > $dir/.lrate 190 | done 191 | 192 | # Convert the model markers from the parallel form (e.g. "<BiLstmParallel>") to the corresponding non-parallel form 193 | format-to-nonparallel $dir/nnet/nnet.iter${iter} $dir/final.nnet >& $dir/log/model_to_nonparal.log || exit 1; 194 | 195 | echo "Training succeeded. The final model is $dir/final.nnet" 196 | -------------------------------------------------------------------------------- /steps/train_ctc_parallel_h.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | { 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | # 2015 Hang Su 5 | # Apache 2.0 6 | 7 | # This script trains acoustic models based on CTC and using SGD.
8 | 9 | ## Begin configuration section 10 | train_tool=train-ctc-parallel # the command for training; by default, we use the 11 | # parallel version which processes multiple utterances at the same time 12 | 13 | # configs for multiple sequences 14 | num_sequence=5 # during training, how many utterances are processed in parallel 15 | valid_num_sequence=10 # number of parallel sequences in validation 16 | frame_num_limit=1000000 # the number of frames to be processed at a time in training; this config acts 17 | # to prevent running out of GPU memory when num_sequence very long sequences are processed; a batch 18 | # is closed as soon as either num_sequence or frame_num_limit is reached. 19 | 20 | # learning rate 21 | learn_rate=0.0001 # learning rate 22 | momentum=0.9 # momentum 23 | 24 | # learning rate schedule 25 | max_iters=25 # max number of iterations 26 | min_iters= # min number of iterations 27 | start_epoch_num=1 # start from which epoch, used for resuming training from a break point 28 | 29 | start_halving_inc=0.5 # start halving learning rates when the accuracy improvement falls below this amount 30 | end_halving_inc=0.1 # terminate training when the accuracy improvement falls below this amount 31 | halving_factor=0.5 # learning rate decay factor 32 | halving_after_epoch=1 # halving becomes enabled after this many epochs 33 | 34 | # logging 35 | report_step=100 # during training, the step (number of utterances) of reporting objective and accuracy 36 | verbose=1 37 | 38 | # feature configs 39 | sort_by_len=true # whether to sort the utterances by their lengths 40 | 41 | norm_vars=true # whether to apply variance normalization when we do cmn 42 | add_deltas=true # whether to add deltas 43 | 44 | # status of learning rate schedule; useful when training is resumed from a break point 45 | cvacc=-1 46 | halving=0 47 | 48 | # Multi-GPU training 49 | nj=1 50 | utts_per_avg=700 51 | 52 | clean_up=true 53 | 54 | ## End configuration section 55 | 56 | echo "$0 $@" # Print the command line for logging 57 | 58 | [ -f ./path.sh ] && . ./path.sh; 59 | [ -f ./cmd.sh ] && . ./cmd.sh; 60 | 61 | . utils/parse_options.sh || exit 1; 62 | 63 | if [ $# != 3 ]; then 64 | echo "Usage: $0 <train-data-dir> <cv-data-dir> <exp-dir>" 65 | echo " e.g.: $0 data/train_tr data/train_cv exp/train_phn" 66 | exit 1; 67 | fi 68 | 69 | data_tr=$1 70 | data_cv=$2 71 | dir=$3 72 | 73 | mkdir -p $dir/log $dir/nnet 74 | 75 | for f in $data_tr/feats.scp $data_cv/feats.scp $dir/labels.tr.gz $dir/labels.cv.gz $dir/nnet.proto; do 76 | [ !
-f $f ] && echo "train_ctc_parallel.sh: no such file $f" && exit 1; 77 | done 78 | 79 | ## Read the training status for resuming 80 | [ -f $dir/.epoch ] && start_epoch_num=`cat $dir/.epoch 2>/dev/null` 81 | [ -f $dir/.cvacc ] && cvacc=`cat $dir/.cvacc 2>/dev/null` 82 | [ -f $dir/.halving ] && halving=`cat $dir/.halving 2>/dev/null` 83 | [ -f $dir/.lrate ] && learn_rate=`cat $dir/.lrate 2>/dev/null` 84 | 85 | ## Setup up features 86 | echo $norm_vars > $dir/norm_vars # output feature configs which will be used in decoding 87 | echo $add_deltas > $dir/add_deltas 88 | 89 | echo "Preparing train and cv features" 90 | tmpdir=$dir/feats; 91 | [ -d $tmpdir ] || mkdir -p $tmpdir 92 | [ $clean_up == true ] && trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT 93 | utils/prep_scps.sh --nj $nj --cmd "$train_cmd" ${seed:+ --seed=$seed} --clean-up $clean_up \ 94 | $data_tr/feats.scp $data_cv/feats.scp $num_sequence $frame_num_limit $tmpdir $dir || exit 1; 95 | 96 | feats_tr="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_tr/utt2spk scp:$data_tr/cmvn.scp scp:$dir/feats_tr.JOB.scp ark:- |" 97 | feats_cv="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp scp:$dir/feats_cv.JOB.scp ark:- |" 98 | 99 | if [ $nj -eq 1 ]; then 100 | feats_tr=$(echo $feats_tr | sed 's#JOB#1#') 101 | feats_cv=$(echo $feats_cv | sed 's#JOB#1#') 102 | fi 103 | 104 | if $add_deltas; then 105 | feats_tr="$feats_tr add-deltas ark:- ark:- |" 106 | feats_cv="$feats_cv add-deltas ark:- ark:- |" 107 | fi 108 | ## End of feature setup 109 | 110 | ## Set up labels 111 | labels_tr="ark:gunzip -c $dir/labels.tr.gz|" 112 | labels_cv="ark:gunzip -c $dir/labels.cv.gz|" 113 | # Compute the occurrence counts of labels in the label sequences. These counts will be used to derive prior probabilities of 114 | # the labels. 115 | gunzip -c $dir/labels.tr.gz | awk '{line=$0; gsub(" "," 0 ",line); print line " 0";}' | \ 116 | analyze-counts --verbose=1 --binary=false ark:- $dir/label.counts >& $dir/log/compute_label_counts.log || exit 1 117 | ## 118 | 119 | # Initialize model parameters 120 | if [ ! -f $dir/nnet/nnet.iter0 ]; then 121 | echo "Initializing model as $dir/nnet/nnet.iter0" 122 | net-initialize --binary=true $dir/nnet.proto $dir/nnet/nnet.iter0 >& $dir/log/initialize_model.log || exit 1; 123 | fi 124 | 125 | cur_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 126 | echo "TRAINING STARTS [$cur_time]" 127 | echo "[NOTE] TOKEN_ACCURACY refers to token accuracy, i.e., (1.0 - token_error_rate)." 128 | for iter in $(seq $start_epoch_num $max_iters); do 129 | cvacc_prev=$cvacc 130 | echo -n "EPOCH $iter RUNNING ... 
" 131 | 132 | # train 133 | if [ -z "$nj" ]; then 134 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 135 | --learn-rate=$learn_rate --momentum=$momentum \ 136 | --verbose=$verbose \ 137 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} \ 138 | >& $dir/log/tr.iter$iter.log || exit 1; 139 | tracc=$(cat $dir/log/tr.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 140 | else 141 | $cuda_cmd JOB=1:$nj $dir/log/tr.iter$iter.JOB.log \ 142 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 143 | --learn-rate=$learn_rate --momentum=$momentum --num-jobs=$nj --job-id=JOB \ 144 | --verbose=$verbose \ 145 | ${utts_per_avg:+ --utts-per-avg=$utts_per_avg} \ 146 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} >& $dir/log/tr.iter$iter.log || exit 1 147 | tracc=$(cat $dir/log/tr.iter${iter}.1.log | grep "TOTAL TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$(NF-1); gsub("%","",acc); print acc; }') 148 | fi 149 | 150 | 151 | echo -n "lrate $(printf "%.6g" $learn_rate), TRAIN ACCURACY $(printf "%.4f" $tracc)%, " 152 | end_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 153 | echo -n "ENDS [$end_time]: " 154 | 155 | # validation 156 | if [ -z "$nj" ]; then 157 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 158 | --cross-validate=true \ 159 | --learn-rate=$learn_rate \ 160 | --verbose=$verbose \ 161 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} \ 162 | >& $dir/log/cv.iter$iter.log || exit 1; 163 | cvacc=$(cat $dir/log/cv.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 164 | else 165 | $cuda_cmd JOB=1:$nj $dir/log/cv.iter$iter.JOB.log \ 166 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 167 | --cross-validate=true --num-jobs=$nj --job-id=JOB \ 168 | --learn-rate=$learn_rate \ 169 | --verbose=$verbose \ 170 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} >& $dir/log/cv.iter$iter.log || exit 1; 171 | cvacc=$(cat $dir/log/cv.iter${iter}.1.log | grep "TOTAL TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$(NF-1); gsub("%","",acc); print acc; }') 172 | fi 173 | 174 | echo "VALID ACCURACY $(printf "%.4f" $cvacc)%" 175 | 176 | # stopping criterion 177 | rel_impr=$(bc <<< "($cvacc-$cvacc_prev)") 178 | if [[ 1 == "$halving" && 1 == $(bc <<< "$rel_impr < $end_halving_inc") ]]; then 179 | if [[ "$min_iters" != "" ]]; then 180 | if [ $min_iters -gt $iter ]; then 181 | echo we were supposed to finish, but we continue as min_iters : $min_iters 182 | continue 183 | fi 184 | fi 185 | echo finished, too small rel. 
improvement $rel_impr 186 | break 187 | fi 188 | 189 | # start annealing when improvement is low 190 | if [ 1 == $(bc <<< "$rel_impr < $start_halving_inc") ]; then 191 | if [ $iter -gt $halving_after_epoch ]; then 192 | halving=1 193 | fi 194 | fi 195 | 196 | # do annealing 197 | if [ 1 == $halving ]; then 198 | learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") 199 | fi 200 | # save the status 201 | echo $[$iter+1] > $dir/.epoch # +1 because we save the epoch to start from 202 | echo $cvacc > $dir/.cvacc 203 | echo $halving > $dir/.halving 204 | echo $learn_rate > $dir/.lrate 205 | done 206 | 207 | # Convert the model marker from the parallel form (e.g. "<BiLstmParallel>") to the non-parallel form (e.g. "<BiLstm>") 208 | format-to-nonparallel $dir/nnet/nnet.iter${iter} $dir/final.nnet >& $dir/log/model_to_nonparal.log || exit 1; 209 | 210 | echo "Training succeeded. The final model is $dir/final.nnet" 211 | } 212 | -------------------------------------------------------------------------------- /utils/split_scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | 19 | # This program splits up any kind of .scp or archive-type file. 20 | # If there is no utt2spk option it will work on any text file and 21 | # will split it up with an approximately equal number of lines in 22 | # each one. 23 | # With the --utt2spk option it will work on anything that has the 24 | # utterance-id as the first entry on each line; the utt2spk file is 25 | # of the form "utterance speaker" (on each line). 26 | # It splits it into equal size chunks as far as it can. If you use 27 | # the utt2spk option it will make sure these chunks coincide with 28 | # speaker boundaries. In this case, if there are more chunks 29 | # than speakers (and in some other circumstances), some of the 30 | # resulting chunks will be empty and it 31 | # will print a warning. 32 | # You will normally call this like: 33 | # split_scp.pl scp scp.1 scp.2 scp.3 ... 34 | # or 35 | # split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ... 36 | # Note that you can use this script to split the utt2spk file itself, 37 | # e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ... 38 | 39 | # You can also call this script like: 40 | # split_scp.pl -j 3 0 scp scp.0 41 | # [note: with this option, it assumes zero-based indexing of the split parts, 42 | # i.e. the second number must be 0 <= n < num-jobs.] 
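# (Illustrative example: "split_scp.pl -j 3 1 data.scp out.scp" writes only the second of the three chunks to out.scp and sends the other two to /dev/null, so running it once per job-id 0..2 covers the whole file.)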
43 | 44 | $num_jobs = 0; 45 | $job_id = 0; 46 | $utt2spk_file = ""; 47 | 48 | for ($x = 1; $x <= 2; $x++) { 49 | if ($ARGV[0] eq "-j") { 50 | shift @ARGV; 51 | $num_jobs = shift @ARGV; 52 | $job_id = shift @ARGV; 53 | if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) { 54 | die "Invalid num-jobs and job-id: $num_jobs and $job_id"; 55 | } 56 | } 57 | if ($ARGV[0] =~ "--utt2spk=(.+)") { 58 | $utt2spk_file=$1; 59 | shift; 60 | } 61 | } 62 | 63 | if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) { 64 | die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" . 65 | " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" . 66 | " ... where 0 <= job-id < num-jobs."; 67 | } 68 | 69 | $error = 0; 70 | $inscp = shift @ARGV; 71 | if ($num_jobs == 0) { # without -j option 72 | @OUTPUTS = @ARGV; 73 | } else { 74 | for ($j = 0; $j < $num_jobs; $j++) { 75 | if ($j == $job_id) { 76 | if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; } 77 | else { push @OUTPUTS, "-"; } 78 | } else { 79 | push @OUTPUTS, "/dev/null"; 80 | } 81 | } 82 | } 83 | 84 | if ($utt2spk_file ne "") { # We have the --utt2spk option... 85 | open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; 86 | while(<U>) { 87 | @A = split; 88 | @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; 89 | ($u,$s) = @A; 90 | $utt2spk{$u} = $s; 91 | } 92 | open(I, "<$inscp") || die "Opening input scp file $inscp"; 93 | @spkrs = (); 94 | while(<I>) { 95 | @A = split; 96 | if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } 97 | $u = $A[0]; 98 | $s = $utt2spk{$u}; 99 | if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } 100 | if(!defined $spk_count{$s}) { 101 | push @spkrs, $s; 102 | $spk_count{$s} = 0; 103 | $spk_data{$s} = ""; 104 | } 105 | $spk_count{$s}++; 106 | $spk_data{$s} = $spk_data{$s} . $_; 107 | } 108 | # Now split as equally as possible .. 109 | # First allocate spks to files by allocating an approximately 110 | # equal number of speakers. 111 | $numspks = @spkrs; # number of speakers. 112 | $numscps = @OUTPUTS; # number of output files. 113 | if ($numspks < $numscps) { 114 | die "Refusing to split data because number of speakers $numspks is less " . 115 | "than the number of output .scp files $numscps"; 116 | } 117 | for($scpidx = 0; $scpidx < $numscps; $scpidx++) { 118 | $scparray[$scpidx] = []; # [] is array reference. 119 | } 120 | for ($spkidx = 0; $spkidx < $numspks; $spkidx++) { 121 | $scpidx = int(($spkidx*$numscps) / $numspks); 122 | $spk = $spkrs[$spkidx]; 123 | push @{$scparray[$scpidx]}, $spk; 124 | $scpcount[$scpidx] += $spk_count{$spk}; 125 | } 126 | 127 | # Now will try to reassign beginning + ending speakers 128 | # to different scp's and see if it gets more balanced. 129 | # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2. 130 | # We can show that if considering changing just 2 scp's, we minimize 131 | # this by minimizing the squared difference in sizes. This is 132 | # equivalent to minimizing the absolute difference in sizes. This 133 | # shows this method is bound to converge. 134 | 135 | $changed = 1; 136 | while($changed) { 137 | $changed = 0; 138 | for($scpidx = 0; $scpidx < $numscps; $scpidx++) { 139 | # First try to reassign ending spk of this scp. 
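# (Illustrative example: if this scp holds speakers A,B,C with 4+4+3=11 utterances and the next holds only D with 2, moving C gives sizes 8 and 5, i.e. |8-5| < |11-2|, so C is reassigned.)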
140 | if($scpidx < $numscps-1) { 141 | $sz = @{$scparray[$scpidx]}; 142 | if($sz > 0) { 143 | $spk = $scparray[$scpidx]->[$sz-1]; 144 | $count = $spk_count{$spk}; 145 | $nutt1 = $scpcount[$scpidx]; 146 | $nutt2 = $scpcount[$scpidx+1]; 147 | if( abs( ($nutt2+$count) - ($nutt1-$count)) 148 | < abs($nutt2 - $nutt1)) { # Would decrease 149 | # size-diff by reassigning spk... 150 | $scpcount[$scpidx+1] += $count; 151 | $scpcount[$scpidx] -= $count; 152 | pop @{$scparray[$scpidx]}; 153 | unshift @{$scparray[$scpidx+1]}, $spk; 154 | $changed = 1; 155 | } 156 | } 157 | } 158 | if($scpidx > 0 && @{$scparray[$scpidx]} > 0) { 159 | $spk = $scparray[$scpidx]->[0]; 160 | $count = $spk_count{$spk}; 161 | $nutt1 = $scpcount[$scpidx-1]; 162 | $nutt2 = $scpcount[$scpidx]; 163 | if( abs( ($nutt2-$count) - ($nutt1+$count)) 164 | < abs($nutt2 - $nutt1)) { # Would decrease 165 | # size-diff by reassigning spk... 166 | $scpcount[$scpidx-1] += $count; 167 | $scpcount[$scpidx] -= $count; 168 | shift @{$scparray[$scpidx]}; 169 | push @{$scparray[$scpidx-1]}, $spk; 170 | $changed = 1; 171 | } 172 | } 173 | } 174 | } 175 | # Now print out the files... 176 | for($scpidx = 0; $scpidx < $numscps; $scpidx++) { 177 | $scpfn = $OUTPUTS[$scpidx]; 178 | open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; 179 | $count = 0; 180 | if(@{$scparray[$scpidx]} == 0) { 181 | print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n"; 182 | $error = 1; 183 | } else { 184 | foreach $spk ( @{$scparray[$scpidx]} ) { 185 | print F $spk_data{$spk}; 186 | $count += $spk_count{$spk}; 187 | } 188 | if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } 189 | } 190 | close(F); 191 | } 192 | } else { 193 | # This block is the "normal" case where there is no --utt2spk 194 | # option and we just break into equal size chunks. 195 | 196 | open(I, "<$inscp") || die "Opening input scp file $inscp"; 197 | 198 | $numscps = @OUTPUTS; # size of array. 199 | @F = (); 200 | while(<I>) { 201 | push @F, $_; 202 | } 203 | $numlines = @F; 204 | if($numlines == 0) { 205 | print STDERR "split_scp.pl: error: empty input scp file $inscp\n"; 206 | $error = 1; 207 | } 208 | $linesperscp = int( $numlines / $numscps); # the "whole part".. 209 | $linesperscp >= 1 || die "You are splitting into too many pieces!"; 210 | $remainder = $numlines - ($linesperscp * $numscps); 211 | ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder"; 212 | # [just doing int() rounds down]. 213 | $n = 0; 214 | for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) { 215 | $scpfile = $OUTPUTS[$scpidx]; 216 | open(O, ">$scpfile") || die "Opening output scp file $scpfile"; 217 | for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) { 218 | print O $F[$n++]; 219 | } 220 | close(O) || die "Closing scp file $scpfile"; 221 | } 222 | $n == $numlines || die "split_scp.pl: code error, $n != $numlines"; 223 | } 224 | 225 | exit ($error ? 1 : 0); 226 | -------------------------------------------------------------------------------- /utils/run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | # In general, doing 5 | # run.pl some.log a b c is like running the command a b c in 6 | # the bash shell, and putting the standard error and output into some.log. 7 | # To run parallel jobs (backgrounded on the host machine), you can do (e.g.) 
8 | # run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB 9 | # and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier]. 10 | # If any of the jobs fails, this script will fail. 11 | 12 | # A typical example is: 13 | # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz 14 | # and run.pl will run something like: 15 | # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log 16 | # 17 | # Basically it takes the command-line arguments, quotes them 18 | # as necessary to preserve spaces, and evaluates them with bash. 19 | # In addition it puts the command line at the top of the log, and 20 | # the start and end times of the command at the beginning and end. 21 | # The reason why this is useful is so that we can create a different 22 | # version of this program that uses a queueing system instead. 23 | 24 | # use Data::Dumper; 25 | 26 | @ARGV < 2 && die "usage: run.pl log-file command-line arguments..."; 27 | 28 | 29 | $max_jobs_run = -1; 30 | $jobstart = 1; 31 | $jobend = 1; 32 | $ignored_opts = ""; # These will be ignored. 33 | 34 | # First parse an option like JOB=1:4, and any 35 | # options that would normally be given to 36 | # queue.pl, which we will just discard. 37 | 38 | if (@ARGV > 0) { 39 | while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options 40 | # that would normally go to qsub, but which will be ignored here. 41 | $switch = shift @ARGV; 42 | if ($switch eq "-V") { 43 | $ignored_opts .= "-V "; 44 | } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") { 45 | # we do support the option --max-jobs-run n, and its GridEngine form -tc n. 46 | $max_jobs_run = shift @ARGV; 47 | if (! ($max_jobs_run > 0)) { 48 | die "run.pl: invalid option --max-jobs-run $max_jobs_run"; 49 | } 50 | } else { 51 | $option = shift @ARGV; 52 | if ($switch eq "-sync" && $option =~ m/^[yY]/) { 53 | $ignored_opts .= "-sync "; # Note: in the 54 | # corresponding code in queue.pl it says instead, just "$sync = 1;". 55 | } 56 | $ignored_opts .= "$switch $option "; 57 | if ($switch eq "-pe") { # e.g. -pe smp 5 58 | $option2 = shift @ARGV; 59 | $ignored_opts .= "$option2 "; 60 | } 61 | } 62 | } 63 | if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 64 | $jobname = $1; 65 | $jobstart = $2; 66 | $jobend = $3; 67 | shift; 68 | if ($jobstart > $jobend) { 69 | die "run.pl: invalid job range $ARGV[0]"; 70 | } 71 | if ($jobstart <= 0) { 72 | die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility)."; 73 | } 74 | } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. 75 | $jobname = $1; 76 | $jobstart = $2; 77 | $jobend = $2; 78 | shift; 79 | } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { 80 | print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n"; 81 | } 82 | } 83 | 84 | # Users found this message confusing so we are removing it. 85 | # if ($ignored_opts ne "") { 86 | # print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; 87 | # } 88 | 89 | if ($max_jobs_run == -1) { # If --max-jobs-run option not set, 90 | # then work out the number of processors if possible, 91 | # and set it based on that. 92 | $max_jobs_run = 0; 93 | if (open(P, "</proc/cpuinfo")) { # Linux 94 | while (<P>) { if (m/^processor/) { $max_jobs_run++; } } 95 | if ($max_jobs_run == 0) { 96 | print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n"; 97 | $max_jobs_run = 10; # reasonable default. 
98 | } 99 | close(P); 100 | } elsif (open(P, "sysctl -a |")) { # BSD/Darwin 101 | while (<P>) {
102 | if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4 103 | $max_jobs_run = $1; 104 | last; 105 | } 106 | } 107 | close(P); 108 | if ($max_jobs_run == 0) { 109 | print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n"; 110 | $max_jobs_run = 10; # reasonable default. 111 | } 112 | } else { 113 | # allow at most 32 jobs at once, on non-UNIX systems; change this code 114 | # if you need to change this default. 115 | $max_jobs_run = 32; 116 | } 117 | # The just-computed value of $max_jobs_run is just the number of processors 118 | # (or our best guess); and if it happens that the number of jobs we need to 119 | # run is just slightly above $max_jobs_run, it will make sense to increase 120 | # $max_jobs_run to equal the number of jobs, so we don't have a small number 121 | # of leftover jobs. 122 | $num_jobs = $jobend - $jobstart + 1; 123 | if ($num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) { 124 | $max_jobs_run = $num_jobs; 125 | } 126 | } 127 | 128 | $logfile = shift @ARGV; 129 | 130 | if (defined $jobname && $logfile !~ m/$jobname/ && 131 | $jobend > $jobstart) { 132 | print STDERR "run.pl: you are trying to run a parallel job but " 133 | . "you are putting the output into just one log file ($logfile)\n"; 134 | exit(1); 135 | } 136 | 137 | $cmd = ""; 138 | 139 | foreach $x (@ARGV) { 140 | if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } 141 | elsif ($x =~ m:\":) { $cmd .= "'$x' "; } 142 | else { $cmd .= "\"$x\" "; } 143 | } 144 | 145 | #$Data::Dumper::Indent=0; 146 | $ret = 0; 147 | $numfail = 0; 148 | %active_pids=(); 149 | 150 | use POSIX ":sys_wait_h"; 151 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 152 | if (scalar(keys %active_pids) >= $max_jobs_run) { 153 | 154 | # Let's wait for a change in any child's status 155 | # Then we have to work out which child finished 156 | $r = waitpid(-1, 0); 157 | $code = $?; 158 | if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen. 159 | if ( defined $active_pids{$r} ) { 160 | $jid=$active_pids{$r}; 161 | $fail[$jid]=$code; 162 | if ($code !=0) { $numfail++;} 163 | delete $active_pids{$r}; 164 | # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n"; 165 | } else { 166 | die "run.pl: Cannot find the PID of the child process that just finished."; 167 | } 168 | 169 | # In theory we could do a non-blocking waitpid over all jobs running just 170 | # to find out if only one or more jobs finished during the previous waitpid() 171 | # However, we just omit this and will reap the next one in the next pass 172 | # through the for(;;) cycle 173 | } 174 | $childpid = fork(); 175 | if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; } 176 | if ($childpid == 0) { # We're in the child... this branch 177 | # executes the job and returns (possibly with an error status). 178 | if (defined $jobname) { 179 | $cmd =~ s/$jobname/$jobid/g; 180 | $logfile =~ s/$jobname/$jobid/g; 181 | } 182 | system("mkdir -p `dirname $logfile` 2>/dev/null"); 183 | open(F, ">$logfile") || die "run.pl: Error opening log file $logfile"; 184 | print F "# " . $cmd . "\n"; 185 | print F "# Started at " . `date`; 186 | $starttime = `date +'%s'`; 187 | print F "#\n"; 188 | close(F); 189 | 190 | # Pipe into bash.. make sure we're not using any other shell. 191 | open(B, "|bash") || die "run.pl: Error opening shell command"; 192 | print B "( " . $cmd . ") 2>>$logfile >> $logfile"; 193 | close(B); # If there was an error, exit status is in $? 
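# ($? follows the wait(2) status layout: the low 7 bits hold the terminating signal, 0 if the job exited normally, and the next 8 bits hold the exit code; it is decoded into $return_str below.)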
194 | $ret = $?; 195 | 196 | $lowbits = $ret & 127; 197 | $highbits = $ret >> 8; 198 | if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" } 199 | else { $return_str = "code $highbits"; } 200 | 201 | $endtime = `date +'%s'`; 202 | open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)"; 203 | $enddate = `date`; 204 | chop $enddate; 205 | print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n"; 206 | print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n"; 207 | close(F); 208 | exit($ret == 0 ? 0 : 1); 209 | } else { 210 | $pid[$jobid] = $childpid; 211 | $active_pids{$childpid} = $jobid; 212 | # print STDERR "Queued: " . Dumper(\%active_pids) . "\n"; 213 | } 214 | } 215 | 216 | # Now we have submitted all the jobs, let's wait until all the jobs finish 217 | foreach $child (keys %active_pids) { 218 | $jobid=$active_pids{$child}; 219 | $r = waitpid($pid[$jobid], 0); 220 | $code = $?; 221 | if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen. 222 | if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # the job completed; record a failure on nonzero exit 223 | } 224 | 225 | # Some sanity checks: 226 | # The $fail array should not contain undefined codes 227 | # The number of non-zeros in that array should be equal to $numfail 228 | # We cannot do foreach() here, as the JOB ids do not necessarily start by zero 229 | $failed_jids=0; 230 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 231 | $job_return = $fail[$jobid]; 232 | if (not defined $job_return ) { 233 | # print Dumper(\@fail); 234 | 235 | die "run.pl: Sanity check failed: we have indication that some jobs are running " . 236 | "even after we waited for all jobs to finish" ; 237 | } 238 | if ($job_return != 0 ){ $failed_jids++;} 239 | } 240 | if ($failed_jids != $numfail) { 241 | die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)." 242 | } 243 | if ($numfail > 0) { $ret = 1; } 244 | 245 | if ($ret != 0) { 246 | $njobs = $jobend - $jobstart + 1; 247 | if ($njobs == 1) { 248 | if (defined $jobname) { 249 | $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with 250 | # that job. 251 | } 252 | print STDERR "run.pl: job failed, log is in $logfile\n"; 253 | if ($logfile =~ m/JOB/) { 254 | print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script."; 255 | } 256 | } 257 | else { 258 | $logfile =~ s/$jobname/*/g; 259 | print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n"; 260 | } 261 | } 262 | 263 | 264 | exit ($ret); 265 | -------------------------------------------------------------------------------- /utils/validate_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | no_feats=false 5 | no_wav=false 6 | no_text=false 7 | 8 | for x in `seq 3`; do 9 | if [ "$1" == "--no-feats" ]; then 10 | no_feats=true 11 | shift; 12 | fi 13 | if [ "$1" == "--no-text" ]; then 14 | no_text=true 15 | shift; 16 | fi 17 | if [ "$1" == "--no-wav" ]; then 18 | no_wav=true 19 | shift; 20 | fi 21 | done 22 | 23 | if [ $# -ne 1 ]; then 24 | echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] <data-dir>" 25 | echo "e.g.: $0 data/train"; exit 1; 26 | fi 27 | 28 | data=$1 29 | 30 | if [ ! -d $data ]; then 31 | echo "$0: no such directory $data" 32 | exit 1; 33 | fi 34 | 35 | for f in spk2utt utt2spk; do 36 | if [ ! 
-f $data/$f ]; then 37 | echo "$0: no such file $f" 38 | exit 1; 39 | fi 40 | if [ ! -s $data/$f ]; then 41 | echo "$0: empty file $f" 42 | exit 1; 43 | fi 44 | done 45 | 46 | ! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ 47 | echo "$0: $data/utt2spk has wrong format." && exit 1; 48 | 49 | ns=$(wc -l < $data/spk2utt) 50 | if [ "$ns" == 1 ]; then 51 | echo "$0: WARNING: you have only one speaker. This is probably a bad idea." 52 | echo " Search for the word 'bold' in http://kaldi.sourceforge.net/data_prep.html" 53 | echo " for more information." 54 | fi 55 | 56 | 57 | tmpdir=$(mktemp -d); 58 | trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM 59 | 60 | export LC_ALL=C 61 | 62 | function check_sorted_and_uniq { 63 | ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ 64 | echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; 65 | } 66 | 67 | function partial_diff { 68 | diff $1 $2 | head -n 6 69 | echo "..." 70 | diff $1 $2 | tail -n 6 71 | n1=`cat $1 | wc -l` 72 | n2=`cat $2 | wc -l` 73 | echo "[Lengths are $1=$n1 versus $2=$n2]" 74 | } 75 | 76 | check_sorted_and_uniq $data/utt2spk 77 | 78 | ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ 79 | echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ 80 | echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; 81 | 82 | check_sorted_and_uniq $data/spk2utt 83 | 84 | ! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ 85 | <(utils/spk2utt_to_utt2spk.pl $data/spk2utt) && \ 86 | echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; 87 | 88 | cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts 89 | 90 | if [ ! -f $data/text ] && ! $no_text; then 91 | echo "$0: no such file $data/text (if this is by design, specify --no-text)" 92 | exit 1; 93 | fi 94 | 95 | num_utts=`cat $tmpdir/utts | wc -l` 96 | if [ -f $data/text ]; then 97 | check_sorted_and_uniq $data/text 98 | text_len=`cat $data/text | wc -l` 99 | illegal_sym_list="<s> </s> #0" 100 | for x in $illegal_sym_list; do 101 | if grep -w "$x" $data/text > /dev/null; then 102 | echo "$0: Error: in $data, text contains illegal symbol $x" 103 | exit 1; 104 | fi 105 | done 106 | awk '{print $1}' < $data/text > $tmpdir/utts.txt 107 | if ! cmp -s $tmpdir/utts{,.txt}; then 108 | echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" 109 | echo "$0: differ, partial diff is:" 110 | partial_diff $tmpdir/utts{,.txt} 111 | exit 1; 112 | fi 113 | fi 114 | 115 | if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then 116 | echo "$0: in directory $data, segments file exists but no wav.scp" 117 | exit 1; 118 | fi 119 | 120 | 121 | if [ ! -f $data/wav.scp ] && ! $no_wav; then 122 | echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" 123 | exit 1; 124 | fi 125 | 126 | if [ -f $data/wav.scp ]; then 127 | check_sorted_and_uniq $data/wav.scp 128 | 129 | if [ -f $data/segments ]; then 130 | 131 | check_sorted_and_uniq $data/segments 132 | # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. 133 | ! cat $data/segments | \ 134 | awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ 135 | echo "$0: badly formatted segments file" && exit 1; 136 | 137 | segments_len=`cat $data/segments | wc -l` 138 | if [ -f $data/text ]; then 139 | ! 
cmp -s $tmpdir/utts <(awk '{print $1}' <$data/text) && \ 140 | echo "$0: Utterance list differs between $data/text and $data/segments " && \ 141 | echo "$0: Lengths are $segments_len vs $num_utts"; 142 | fi 143 | 144 | cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings 145 | awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav 146 | if ! cmp -s $tmpdir/recordings{,.wav}; then 147 | echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" 148 | echo "$0: differ, partial diff is:" 149 | partial_diff $tmpdir/recordings{,.wav} 150 | exit 1; 151 | fi 152 | if [ -f $data/reco2file_and_channel ]; then 153 | # this file is needed only for ctm scoring; it's indexed by recording-id. 154 | check_sorted_and_uniq $data/reco2file_and_channel 155 | ! cat $data/reco2file_and_channel | \ 156 | awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { 157 | if ( NF == 3 && $3 == "1" ) { 158 | warning_issued = 1; 159 | } else { 160 | print "Bad line ", $0; exit 1; 161 | } 162 | } 163 | } 164 | END { 165 | if (warning_issued == 1) { 166 | print "The channel should be marked as A or B, not 1! You should change it ASAP! " 167 | } 168 | }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; 169 | cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc 170 | if ! cmp -s $tmpdir/recordings{,.r2fc}; then 171 | echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" 172 | echo "$0: differ, partial diff is:" 173 | partial_diff $tmpdir/recordings{,.r2fc} 174 | exit 1; 175 | fi 176 | fi 177 | else 178 | # No segments file -> assume wav.scp indexed by utterance. 179 | cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav 180 | if ! cmp -s $tmpdir/utts{,.wav}; then 181 | echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" 182 | echo "$0: differ, partial diff is:" 183 | partial_diff $tmpdir/utts{,.wav} 184 | exit 1; 185 | fi 186 | 187 | if [ -f $data/reco2file_and_channel ]; then 188 | # this file is needed only for ctm scoring; it's indexed by recording-id. 189 | check_sorted_and_uniq $data/reco2file_and_channel 190 | ! cat $data/reco2file_and_channel | \ 191 | awk '{if (NF != 3 || ($3 != "A" && $3 != "B")) { print "Bad line ", $0; exit 1; }}' && \ 192 | echo "$0: badly formatted reco2file_and_channel file" && exit 1; 193 | cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc 194 | if ! cmp -s $tmpdir/utts{,.r2fc}; then 195 | echo "$0: Error: in $data, utterance-ids extracted from utt2spk and reco2file_and_channel" 196 | echo "$0: differ, partial diff is:" 197 | partial_diff $tmpdir/utts{,.r2fc} 198 | exit 1; 199 | fi 200 | fi 201 | fi 202 | fi 203 | 204 | if [ ! -f $data/feats.scp ] && ! $no_feats; then 205 | echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" 206 | exit 1; 207 | fi 208 | 209 | if [ -f $data/feats.scp ]; then 210 | check_sorted_and_uniq $data/feats.scp 211 | cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats 212 | if ! cmp -s $tmpdir/utts{,.feats}; then 213 | echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" 214 | echo "$0: differ, partial diff is:" 215 | partial_diff $tmpdir/utts{,.feats} 216 | exit 1; 217 | fi 218 | fi 219 | 220 | if [ -f $data/cmvn.scp ]; then 221 | check_sorted_and_uniq $data/cmvn.scp 222 | cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn 223 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 224 | if ! 
cmp -s $tmpdir/speakers{,.cmvn}; then 225 | echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" 226 | echo "$0: differ, partial diff is:" 227 | partial_diff $tmpdir/speakers{,.cmvn} 228 | exit 1; 229 | fi 230 | fi 231 | 232 | if [ -f $data/spk2gender ]; then 233 | check_sorted_and_uniq $data/spk2gender 234 | ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ 235 | echo "Mal-formed spk2gender file" && exit 1; 236 | cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender 237 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 238 | if ! cmp -s $tmpdir/speakers{,.spk2gender}; then 239 | echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" 240 | echo "$0: differ, partial diff is:" 241 | partial_diff $tmpdir/speakers{,.spk2gender} 242 | exit 1; 243 | fi 244 | fi 245 | 246 | if [ -f $data/spk2warp ]; then 247 | check_sorted_and_uniq $data/spk2warp 248 | ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ 249 | echo "Mal-formed spk2warp file" && exit 1; 250 | cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp 251 | cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers 252 | if ! cmp -s $tmpdir/speakers{,.spk2warp}; then 253 | echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" 254 | echo "$0: differ, partial diff is:" 255 | partial_diff $tmpdir/speakers{,.spk2warp} 256 | exit 1; 257 | fi 258 | fi 259 | 260 | if [ -f $data/utt2warp ]; then 261 | check_sorted_and_uniq $data/utt2warp 262 | ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ 263 | echo "Mal-formed utt2warp file" && exit 1; 264 | cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp 265 | cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts 266 | if ! cmp -s $tmpdir/utts{,.utt2warp}; then 267 | echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" 268 | echo "$0: differ, partial diff is:" 269 | partial_diff $tmpdir/utts{,.utt2warp} 270 | exit 1; 271 | fi 272 | fi 273 | 274 | # check some optionally-required things 275 | for f in vad.scp utt2lang; do 276 | if [ -f $data/$f ]; then 277 | check_sorted_and_uniq $data/$f 278 | if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ 279 | <( awk '{print $1}' $data/$f ); then 280 | echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" 281 | exit 1; 282 | fi 283 | fi 284 | done 285 | 286 | echo "$0: Successfully validated data-directory $data" 287 | -------------------------------------------------------------------------------- /steps/train_ctc_parallel_x3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015 Yajie Miao (Carnegie Mellon University) 4 | # 2016 Florian Metze (Carnegie Mellon University) 5 | # Apache 2.0 6 | 7 | # This script trains CTC-based acoustic models using SGD. 
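# Compared to the plain parallel recipe, this "x3" variant can triple the amount of training data via 3-fold frame subsampling (three frame-decimated copies of each utterance at offsets 0, 1 and 2 -- see subsample_feats below).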
8 | 9 | ## Begin configuration section 10 | train_tool=train-ctc-parallel # the command for training; by default, we use the 11 | # parallel version which processes multiple utterances at the same time 12 | 13 | # configs for multiple sequences 14 | num_sequence=5 # during training, how many utterances to be processed in parallel 15 | valid_num_sequence=10 # number of parallel sequences in validation 16 | frame_num_limit=1000000 # the number of frames to be processed at a time in training; this config acts 17 | # to prevent running out of GPU memory if #num_sequence very long sequences are processed; the max 18 | # number of training examples is decided by whichever of num_sequence or frame_num_limit is reached first. 19 | 20 | # learning rate 21 | learn_rate=0.0001 # learning rate 22 | final_learn_rate=0.0 # final learning rate 23 | momentum=0.9 # momentum 24 | 25 | # learning rate schedule 26 | max_iters=25 # max number of iterations 27 | min_iters= # min number of iterations 28 | start_epoch_num=1 # start from which epoch, used for resuming training from a break point 29 | 30 | start_halving_inc=0.5 # start halving learning rates when the accuracy improvement falls below this amount 31 | end_halving_inc=0.1 # terminate training when the accuracy improvement falls below this amount 32 | halving_factor=0.5 # learning rate decay factor 33 | halving_after_epoch=1 # halving becomes enabled after this many epochs 34 | 35 | # logging 36 | report_step=100 # during training, the step (number of utterances) of reporting objective and accuracy 37 | verbose=1 38 | 39 | # feature configs 40 | sort_by_len=true # whether to sort the utterances by their lengths 41 | min_len=0 # minimal length of utterances to consider 42 | 43 | splice_feats=false # whether to splice neighboring frames 44 | subsample_feats=false # whether to subsample features 45 | norm_vars=true # whether to apply variance normalization when we do cmn 46 | add_deltas=true # whether to add deltas 47 | copy_feats=true # whether to copy features into a local dir (on the GPU machine) 48 | feats_tmpdir= # the tmp dir to save the copied features, when copy_feats=true 49 | 50 | 51 | # status of learning rate schedule; useful when training is resumed from a break point 52 | cvacc=0 53 | halving=0 54 | 55 | ## End configuration section 56 | 57 | echo "$0 $@" # Print the command line for logging 58 | 59 | [ -f path.sh ] && . ./path.sh; 60 | 61 | . utils/parse_options.sh || exit 1; 62 | 63 | if [ $# != 3 ]; then 64 | echo "Usage: $0 <data-tr> <data-cv> <exp-dir>" 65 | echo " e.g.: $0 data/train_tr data/train_cv exp/train_phn" 66 | exit 1; 67 | fi 68 | 69 | data_tr=$1 70 | data_cv=$2 71 | dir=$3 72 | 73 | mkdir -p $dir/log $dir/nnet 74 | 75 | for f in $data_tr/feats.scp $data_cv/feats.scp $dir/labels.tr.gz $dir/labels.cv.gz $dir/nnet.proto; do 76 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 77 | done 78 | 79 | ## Read the training status for resuming 80 | [ -f $dir/.epoch ] && start_epoch_num=`cat $dir/.epoch 2>/dev/null` 81 | [ -f $dir/.cvacc ] && cvacc=`cat $dir/.cvacc 2>/dev/null` 82 | [ -f $dir/.halving ] && halving=`cat $dir/.halving 2>/dev/null` 83 | [ -f $dir/.lrate ] && learn_rate=`cat $dir/.lrate 2>/dev/null` 84 | 85 | ## Set up labels 86 | labels_tr="ark:gunzip -c $dir/labels.tr.gz|" 87 | labels_cv="ark:gunzip -c $dir/labels.cv.gz|" 88 | # Compute the occurrence counts of labels in the label sequences. These counts will be used to 89 | # derive prior probabilities of the labels. 
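# (Illustrative example: a label line "utt1 5 12 7" becomes "utt1 0 5 0 12 0 7 0" below -- the awk command interleaves the CTC blank label, 0, so that blank occurrences are counted too.)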
90 | gunzip -c $dir/labels.tr.gz | awk '{line=$0; gsub(" "," 0 ",line); print line " 0";}' | \ 91 | analyze-counts --verbose=1 --binary=false ark:- $dir/label.counts >& $dir/log/compute_label_counts.log || exit 1 92 | ## 93 | 94 | ## Set up features 95 | # output feature configs which will be used in decoding 96 | echo $norm_vars > $dir/norm_vars 97 | echo $add_deltas > $dir/add_deltas 98 | echo $splice_feats > $dir/splice_feats 99 | echo $subsample_feats > $dir/subsample_feats 100 | 101 | if $sort_by_len; then 102 | feat-to-len scp:$data_tr/feats.scp ark,t:- | awk '{print $2}' | \ 103 | paste -d " " $data_tr/feats.scp - | sort -k3 -n - | awk -v m=$min_len '{ if ($3 >= m) {print $1 " " $2} }' > $dir/train.scp & 104 | feat-to-len scp:$data_cv/feats.scp ark,t:- | awk '{print $2}' | \ 105 | paste -d " " $data_cv/feats.scp - | sort -k3 -n - | awk '{print $1 " " $2}' > $dir/cv.scp & 106 | wait || exit 1; 107 | else 108 | cat $data_tr/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp 109 | cat $data_cv/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/cv.scp 110 | fi 111 | 112 | feats_tr="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_tr/utt2spk scp:$data_tr/cmvn.scp scp:$dir/train.scp ark:- |" 113 | feats_cv="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp scp:$dir/cv.scp ark:- |" 114 | 115 | if $splice_feats; then 116 | feats_tr="$feats_tr splice-feats --left-context=1 --right-context=1 ark:- ark:- |" 117 | feats_cv="$feats_cv splice-feats --left-context=1 --right-context=1 ark:- ark:- |" 118 | fi 119 | 120 | if $subsample_feats; then 121 | #tmpdir=$(mktemp -d --tmpdir=$feats_tmpdir); 122 | tmpdir=$(mktemp -d $feats_tmpdir); 123 | 124 | copy-feats "$feats_tr subsample-feats --n=3 --offset=0 ark:- ark:- |" \ 125 | ark,scp:$tmpdir/train0.ark,$tmpdir/train0local.scp || exit 1; 126 | copy-feats "$feats_cv subsample-feats --n=3 --offset=0 ark:- ark:- |" \ 127 | ark,scp:$tmpdir/cv0.ark,$tmpdir/cv0local.scp || exit 1; 128 | copy-feats "$feats_tr subsample-feats --n=3 --offset=1 ark:- ark:- |" \ 129 | ark,scp:$tmpdir/train1.ark,$tmpdir/train1local.scp || exit 1; 130 | copy-feats "$feats_cv subsample-feats --n=3 --offset=1 ark:- ark:- |" \ 131 | ark,scp:$tmpdir/cv1.ark,$tmpdir/cv1local.scp || exit 1; 132 | copy-feats "$feats_tr subsample-feats --n=3 --offset=2 ark:- ark:- |" \ 133 | ark,scp:$tmpdir/train2.ark,$tmpdir/train2local.scp || exit 1; 134 | copy-feats "$feats_cv subsample-feats --n=3 --offset=2 ark:- ark:- |" \ 135 | ark,scp:$tmpdir/cv2.ark,$tmpdir/cv2local.scp || exit 1; 136 | 137 | # this code is experimental - we may need to sort the data carefully 138 | sed 's/^/0x/' $tmpdir/train0local.scp > $tmpdir/train_local.scp 139 | sed 's/^/0x/' $tmpdir/cv0local.scp > $tmpdir/cv_local.scp 140 | sed 's/^/1x/' $tmpdir/train1local.scp | tac >> $tmpdir/train_local.scp 141 | sed 's/^/1x/' $tmpdir/cv1local.scp | tac >> $tmpdir/cv_local.scp 142 | sed 's/^/2x/' $tmpdir/train2local.scp >> $tmpdir/train_local.scp 143 | sed 's/^/2x/' $tmpdir/cv2local.scp >> $tmpdir/cv_local.scp 144 | 145 | feats_tr="ark,s,cs:copy-feats scp:$tmpdir/train_local.scp ark:- |" 146 | feats_cv="ark,s,cs:copy-feats scp:$tmpdir/cv_local.scp ark:- |" 147 | 148 | gzip -cd $dir/labels.tr.gz | sed 's/^/0x/' > $tmpdir/labels.tr 149 | gzip -cd $dir/labels.cv.gz | sed 's/^/0x/' > $tmpdir/labels.cv 150 | gzip -cd $dir/labels.tr.gz | sed 's/^/1x/' >> $tmpdir/labels.tr 151 | gzip -cd $dir/labels.cv.gz | sed 's/^/1x/' >> $tmpdir/labels.cv 152 | 
gzip -cd $dir/labels.tr.gz | sed 's/^/2x/' >> $tmpdir/labels.tr 153 | gzip -cd $dir/labels.cv.gz | sed 's/^/2x/' >> $tmpdir/labels.cv 154 | 155 | labels_tr="ark:cat $tmpdir/labels.tr|" 156 | labels_cv="ark:cat $tmpdir/labels.cv|" 157 | 158 | trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls -l $tmpdir; rm -r $tmpdir" EXIT 159 | else 160 | 161 | # Save the features to a local dir on the GPU machine. On Linux, this usually points to /tmp 162 | if $copy_feats; then 163 | tmpdir=$(mktemp -d $feats_tmpdir); 164 | copy-feats "$feats_tr" ark,scp:$tmpdir/train.ark,$tmpdir/train_local.scp || exit 1; 165 | copy-feats "$feats_cv" ark,scp:$tmpdir/cv.ark,$tmpdir/cv_local.scp || exit 1; 166 | feats_tr="ark,s,cs:copy-feats scp:$tmpdir/train_local.scp ark:- |" 167 | feats_cv="ark,s,cs:copy-feats scp:$tmpdir/cv_local.scp ark:- |" 168 | trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT 169 | fi 170 | fi 171 | 172 | if $add_deltas; then 173 | feats_tr="$feats_tr add-deltas ark:- ark:- |" 174 | feats_cv="$feats_cv add-deltas ark:- ark:- |" 175 | fi 176 | ## End of feature setup 177 | 178 | # Initialize model parameters 179 | if [ ! -f $dir/nnet/nnet.iter0 ]; then 180 | echo "Initializing model as $dir/nnet/nnet.iter0" 181 | net-initialize --binary=true $dir/nnet.proto $dir/nnet/nnet.iter0 >& $dir/log/initialize_model.log || exit 1; 182 | fi 183 | 184 | cur_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 185 | echo "TRAINING STARTS [$cur_time]" 186 | echo "[NOTE] TOKEN_ACCURACY refers to token accuracy, i.e., (1.0 - token_error_rate)." 187 | for iter in $(seq $start_epoch_num $max_iters); do 188 | cvacc_prev=$cvacc 189 | echo -n "EPOCH $iter RUNNING ... " 190 | 191 | # train 192 | $train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \ 193 | --learn-rate=$learn_rate --momentum=$momentum \ 194 | --verbose=$verbose \ 195 | "$feats_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} \ 196 | >& $dir/log/tr.iter$iter.log || exit 1; 197 | 198 | end_time=`date | awk '{print $6 "-" $2 "-" $3 " " $4}'` 199 | echo -n "ENDS [$end_time]: " 200 | 201 | tracc=$(cat $dir/log/tr.iter${iter}.log | grep -a "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 202 | echo -n "lrate $(printf "%.6g" $learn_rate), TRAIN ACCURACY $(printf "%.4f" $tracc)%, " 203 | 204 | # validation 205 | $train_tool --report-step=$report_step --num-sequence=$valid_num_sequence --frame-limit=$frame_num_limit \ 206 | --cross-validate=true \ 207 | --learn-rate=$learn_rate \ 208 | --momentum=$momentum \ 209 | --verbose=$verbose \ 210 | "$feats_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} \ 211 | >& $dir/log/cv.iter$iter.log || exit 1; 212 | 213 | cvacc=$(cat $dir/log/cv.iter${iter}.log | grep -a "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }') 214 | echo "VALID ACCURACY $(printf "%.4f" $cvacc)%" 215 | 216 | # stopping criterion 217 | rel_impr=$(bc <<< "($cvacc-$cvacc_prev)") 218 | if [ 1 == $halving -a 1 == $(bc <<< "$rel_impr < $end_halving_inc") ]; then 219 | if [[ "$min_iters" != "" ]]; then 220 | if [ $min_iters -gt $iter ]; then 221 | echo we were supposed to finish, but we continue as min_iters : $min_iters 222 | continue 223 | fi 224 | fi 225 | echo finished, too small rel. 
improvement $rel_impr 226 | break 227 | fi 228 | 229 | # start annealing when improvement is low 230 | if [ 1 == $(bc <<< "$rel_impr < $start_halving_inc") ]; then 231 | if [ $iter -gt $halving_after_epoch ]; then 232 | halving=1 233 | fi 234 | fi 235 | 236 | # do annealing 237 | if [ 1 == $halving ]; then 238 | learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") 239 | learn_rate=$(awk "BEGIN{if ($learn_rate<$final_learn_rate) {print $final_learn_rate} else {print $learn_rate}}") 240 | fi 241 | # save the status 242 | echo $[$iter+1] > $dir/.epoch # +1 because we save the epoch to start from 243 | echo $cvacc > $dir/.cvacc 244 | echo $halving > $dir/.halving 245 | echo $learn_rate > $dir/.lrate 246 | done 247 | 248 | # Convert the model marker from the parallel form (e.g. "<BiLstmParallel>") to the non-parallel form (e.g. "<BiLstm>") (no longer needed) 249 | format-to-nonparallel $dir/nnet/nnet.iter${iter} $dir/final.nnet >& $dir/log/model_to_nonparal.log || exit 1; 250 | 251 | echo "Training succeeded. The final model is $dir/final.nnet" 252 | --------------------------------------------------------------------------------