├── VERSION ├── recipe ├── data │ └── local │ │ ├── lang │ │ ├── lex_ndisambig │ │ └── phone_map.txt │ │ └── dict │ │ ├── optional_silence.txt │ │ ├── silence_phones.txt │ │ ├── extra_questions.txt │ │ └── nonsilence_phones.txt ├── lm │ └── README ├── conf │ ├── mfcc.conf │ ├── online_cmvn.conf │ └── mfcc_hires.conf ├── path.sh ├── cmd.sh ├── local │ ├── score.sh │ └── nnet3 │ │ └── run_ivector_common.sh └── run.sh ├── requirements.txt ├── welcome.wav ├── .gitignore ├── RESULTS.txt ├── LICENSE ├── README.md └── transcribe.py /VERSION: -------------------------------------------------------------------------------- 1 | 1.0 2 | -------------------------------------------------------------------------------- /recipe/data/local/lang/lex_ndisambig: -------------------------------------------------------------------------------- 1 | 13 2 | -------------------------------------------------------------------------------- /recipe/lm/README: -------------------------------------------------------------------------------- 1 | Put lm.arpa.gz here 2 | -------------------------------------------------------------------------------- /recipe/data/local/dict/optional_silence.txt: -------------------------------------------------------------------------------- 1 | SIL 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa~=0.8.0 2 | vosk~=0.3.0 3 | -------------------------------------------------------------------------------- /recipe/data/local/dict/silence_phones.txt: -------------------------------------------------------------------------------- 1 | NSN 2 | SIL 3 | SPN 4 | -------------------------------------------------------------------------------- /recipe/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false 2 | --sample-frequency=16000 3 | -------------------------------------------------------------------------------- /welcome.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhasspy/fa_kaldi-rhasspy/HEAD/welcome.wav -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | *.log 4 | tmp/ 5 | 6 | model/ 7 | .venv/ 8 | *.gz 9 | -------------------------------------------------------------------------------- /recipe/data/local/dict/extra_questions.txt: -------------------------------------------------------------------------------- 1 | SIL SPN NSN 2 | æ ɒː e̞ iː o uː b p t d t͡ʃ d͡ʒ k g ʔ f v s z ʃ ʒ x ɢ h m n l ɾ j 3 | -------------------------------------------------------------------------------- /recipe/conf/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh 2 | -------------------------------------------------------------------------------- /RESULTS.txt: -------------------------------------------------------------------------------- 1 | %WER 15.57 [ 17728 / 113894, 3384 ins, 1474 del, 12870 sub ] exp/nnet3_chain/tdnn_250/decode_test/wer_8_1.0 2 | %WER 13.58 [ 15472 / 113894, 2559 ins, 990 del, 11923 sub ] exp/nnet3_chain/tdnn_f/decode_test/wer_7_1.0 3 | 
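Each line above follows the format of Kaldi's compute-wer output: total errors over reference words, the insertion/deletion/substitution breakdown, and a decode path whose wer_8_1.0 / wer_7_1.0 suffix records the LM weight and word-insertion penalty selected by local/score.sh. A quick standalone check of the arithmetic (not part of the recipe itself):

``` python
# WER = (insertions + deletions + substitutions) / reference word count
for name, ins, dels, subs, ref_words in [
    ("tdnn_250 (small)", 3384, 1474, 12870, 113894),
    ("tdnn_f (large)", 2559, 990, 11923, 113894),
]:
    errors = ins + dels + subs
    print(f"{name}: {errors} errors, WER = {100 * errors / ref_words:.2f}%")
# tdnn_250 (small): 17728 errors, WER = 15.57%
# tdnn_f (large): 15472 errors, WER = 13.58%
```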
-------------------------------------------------------------------------------- /recipe/data/local/dict/nonsilence_phones.txt: -------------------------------------------------------------------------------- 1 | æ 2 | ɒː 3 | e̞ 4 | iː 5 | o 6 | uː 7 | b 8 | p 9 | t 10 | d 11 | t͡ʃ 12 | d͡ʒ 13 | k 14 | g 15 | ʔ 16 | f 17 | v 18 | s 19 | z 20 | ʃ 21 | ʒ 22 | x 23 | ɢ 24 | h 25 | m 26 | n 27 | l 28 | ɾ 29 | j 30 | -------------------------------------------------------------------------------- /recipe/path.sh: -------------------------------------------------------------------------------- 1 | if [ -d /opt/kaldi ]; then 2 | export KALDI_ROOT=/opt/kaldi 3 | else 4 | export KALDI_ROOT="$(realpath ${PWD}/../../..)" 5 | fi 6 | 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | 12 | # Add for mkgraph_lookahead.sh 13 | export LD_LIBRARY_PATH="${KALDI_ROOT}/tools/openfst/lib/fst:${LD_LIBRARY_PATH}" 14 | 15 | export LC_ALL=C 16 | -------------------------------------------------------------------------------- /recipe/conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --num-mel-bins=40 # similar to Google's setup. 7 | --num-ceps=40 # there is no dimensionality reduction. 8 | --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so 9 | # there might be some information at the low end. 10 | --high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) 11 | -------------------------------------------------------------------------------- /recipe/cmd.sh: -------------------------------------------------------------------------------- 1 | # "queue.pl" uses qsub. The options to it are 2 | # options to qsub. If you have GridEngine installed, 3 | # change this to a queue you have access to. 4 | # Otherwise, use "run.pl", which will run jobs locally 5 | # (make sure your --num-jobs options are no more than 6 | # the number of cpus on your machine. 
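# You can check that number with the "nproc" command.)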
7 | 8 | 9 | #activate this if you want to run the corpus with gridengine (http://gridengine.org/) 10 | #export train_cmd="queue.pl -l 'arch=*64*'" 11 | #export decode_cmd="queue.pl -l 'arch=*64*'" 12 | #export cuda_cmd="queue.pl -l gpu=1" 13 | 14 | export train_cmd="utils/run.pl" 15 | export decode_cmd="utils/run.pl" 16 | export cuda_cmd="utils/run.pl -l gpu=1" 17 | 18 | export nJobs=12 19 | export nDecodeJobs=12 20 | -------------------------------------------------------------------------------- /recipe/data/local/lang/phone_map.txt: -------------------------------------------------------------------------------- 1 | NSN NSN NSN_B NSN_E NSN_I NSN_S 2 | SIL SIL SIL_B SIL_E SIL_I SIL_S 3 | SPN SPN SPN_B SPN_E SPN_I SPN_S 4 | æ æ_B æ_E æ_I æ_S 5 | ɒː ɒː_B ɒː_E ɒː_I ɒː_S 6 | e̞ e̞_B e̞_E e̞_I e̞_S 7 | iː iː_B iː_E iː_I iː_S 8 | o o_B o_E o_I o_S 9 | uː uː_B uː_E uː_I uː_S 10 | b b_B b_E b_I b_S 11 | p p_B p_E p_I p_S 12 | t t_B t_E t_I t_S 13 | d d_B d_E d_I d_S 14 | t͡ʃ t͡ʃ_B t͡ʃ_E t͡ʃ_I t͡ʃ_S 15 | d͡ʒ d͡ʒ_B d͡ʒ_E d͡ʒ_I d͡ʒ_S 16 | k k_B k_E k_I k_S 17 | g g_B g_E g_I g_S 18 | ʔ ʔ_B ʔ_E ʔ_I ʔ_S 19 | f f_B f_E f_I f_S 20 | v v_B v_E v_I v_S 21 | s s_B s_E s_I s_S 22 | z z_B z_E z_I z_S 23 | ʃ ʃ_B ʃ_E ʃ_I ʃ_S 24 | ʒ ʒ_B ʒ_E ʒ_I ʒ_S 25 | x x_B x_E x_I x_S 26 | ɢ ɢ_B ɢ_E ɢ_I ɢ_S 27 | h h_B h_E h_I h_S 28 | m m_B m_E m_I m_S 29 | n n_B n_E n_I n_S 30 | l l_B l_E l_I l_S 31 | ɾ ɾ_B ɾ_E ɾ_I ɾ_S 32 | j j_B j_E j_I j_S 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Michael Hansen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Persian Kaldi Profile 2 | 3 | A [Rhasspy](https://github.com/rhasspy/rhasspy) profile for Persian (`fa`). 4 | 5 | Trained from approximately 293 hours of audio from [Common Voice](https://commonvoice.mozilla.org/) (Persian 7.0 dataset, validated, 10% test). 
6 | 7 | Available [Vosk](https://alphacephei.com/vosk) models: 8 | 9 | * [Small nnet3](https://github.com/rhasspy/fa_kaldi-rhasspy/releases/download/v1.0/vosk-model-small-fa-rhasspy-0.15.zip) 10 | * WER: 15.57% 11 | * [Large nnet3](https://github.com/rhasspy/fa_kaldi-rhasspy/releases/download/v1.0/vosk-model-large-fa-rhasspy-0.15.zip) 12 | * WER: 13.58% 13 | 14 | ## Installation 15 | 16 | Get started by first installing [Vosk](https://alphacephei.com/vosk): 17 | 18 | ``` sh 19 | # Create virtual environment 20 | python3 -m venv .venv 21 | source .venv/bin/activate 22 | pip3 install --upgrade pip 23 | pip3 install --upgrade wheel setuptools 24 | 25 | # Install Vosk 26 | pip3 install vosk 27 | ``` 28 | 29 | Next, [download the model](https://github.com/rhasspy/fa_kaldi-rhasspy/releases/download/v1.0/vosk-model-small-fa-rhasspy-0.15.zip) and extract it: 30 | 31 | ``` sh 32 | wget 'https://github.com/rhasspy/fa_kaldi-rhasspy/releases/download/v1.0/vosk-model-small-fa-rhasspy-0.15.zip' 33 | unzip vosk-model-small-fa-rhasspy-0.15.zip 34 | ``` 35 | 36 | Finally, run the `transcribe.py` Python program with the model and an audio file: 37 | 38 | ``` sh 39 | python3 transcribe.py vosk-model-small-fa-rhasspy-0.15 welcome.wav 40 | 41 | {"result": [{"conf": 1.0, "end": 0.48, "start": 0.06, "word": "خوش"}, {"conf": 1.0, "end": 1.11, "start": 0.48, "word": "آمدید"}], "text": "خوش آمدید"} 42 | ``` 43 | 44 | For each audio file given to `transcribe.py`, a line of JSON will be printed in the output with the transcription details. 45 | -------------------------------------------------------------------------------- /transcribe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Transcribes audio files with Vosk (https://alphacephei.com/vosk)""" 3 | import argparse 4 | import json 5 | import sys 6 | 7 | import numpy as np 8 | import librosa 9 | from vosk import Model, KaldiRecognizer 10 | 11 | 12 | def main(): 13 | """Main entry point""" 14 | parser = argparse.ArgumentParser("vosk_example") 15 | parser.add_argument("model", help="Directory with speech to text model") 16 | parser.add_argument("audio", nargs="+", help="Audio file(s) to transcribe") 17 | parser.add_argument( 18 | "--sample-rate", default=16000, help="Sample rate of model in Hertz" 19 | ) 20 | args = parser.parse_args() 21 | 22 | model = Model(args.model) 23 | 24 | for audio_path in args.audio: 25 | # Load and re-sample audio if necessary 26 | audio, _sample_rate = librosa.load(audio_path, sr=args.sample_rate, mono=True) 27 | audio = audio_float_to_int16(audio).tobytes() 28 | 29 | rec = KaldiRecognizer(model, args.sample_rate) 30 | rec.SetWords(True) 31 | 32 | rec.AcceptWaveform(audio) 33 | 34 | # Parse JSON result and re-print so it's all on one line (JSONL) 35 | result = json.loads(rec.FinalResult()) 36 | json.dump(result, sys.stdout, ensure_ascii=False) 37 | print("") 38 | 39 | 40 | def audio_float_to_int16( 41 | audio: np.ndarray, max_wav_value: float = 32767.0 42 | ) -> np.ndarray: 43 | """Normalize audio and convert to int16 range""" 44 | audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio)))) 45 | audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value) 46 | audio_norm = audio_norm.astype("int16") 47 | return audio_norm 48 | 49 | 50 | # ----------------------------------------------------------------------------- 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- 
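transcribe.py hands each file to the recognizer in a single AcceptWaveform call, which is fine for short utterances but keeps the whole file in memory. For long recordings the same Vosk API can be fed in chunks instead; below is a minimal streaming sketch (the script name and the use of Python's built-in wave module are my own additions, and it assumes a 16-bit mono WAV already at the model's sample rate, e.g. welcome.wav):

``` python
#!/usr/bin/env python3
"""Chunked (streaming) decoding sketch with Vosk -- not part of this repository."""
import json
import sys
import wave

from vosk import Model, KaldiRecognizer

# Hypothetical usage: python3 stream_example.py vosk-model-small-fa-rhasspy-0.15 welcome.wav
model_dir, wav_path = sys.argv[1], sys.argv[2]

wav_file = wave.open(wav_path, "rb")
assert wav_file.getnchannels() == 1 and wav_file.getsampwidth() == 2, "expects 16-bit mono WAV"

rec = KaldiRecognizer(Model(model_dir), wav_file.getframerate())
rec.SetWords(True)

texts = []
while True:
    chunk = wav_file.readframes(4000)  # roughly 0.25 s of 16 kHz audio per chunk
    if not chunk:
        break
    if rec.AcceptWaveform(chunk):
        # The recognizer detected an endpoint; collect the finalized segment.
        texts.append(json.loads(rec.Result()).get("text", ""))

texts.append(json.loads(rec.FinalResult()).get("text", ""))
print(" ".join(t for t in texts if t))
```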
/recipe/local/score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) 3 | # Apache 2.0 4 | 5 | [ -f ./path.sh ] && . ./path.sh 6 | 7 | # begin configuration section. 8 | cmd=run.pl 9 | stage=0 10 | decode_mbr=false 11 | reverse=false 12 | stats=true 13 | beam=6 14 | word_ins_penalty=0.0,0.5,1.0 15 | min_lmwt=7 16 | max_lmwt=17 17 | iter=final 18 | #end configuration section. 19 | 20 | echo "$0 $@" # Print the command line for logging 21 | [ -f ./path.sh ] && . ./path.sh 22 | . parse_options.sh || exit 1; 23 | 24 | if [ $# -ne 3 ]; then 25 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " 26 | echo " Options:" 27 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 28 | echo " --stage (0|1|2) # start scoring script from part-way through." 29 | echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." 30 | echo " --min_lmwt # minumum LM-weight for lattice rescoring " 31 | echo " --max_lmwt # maximum LM-weight for lattice rescoring " 32 | echo " --reverse (true/false) # score with time reversed features " 33 | exit 1; 34 | fi 35 | 36 | data=$1 37 | lang_or_graph=$2 38 | dir=$3 39 | 40 | symtab=$lang_or_graph/words.txt 41 | 42 | for f in $symtab $dir/lat.1.gz $data/text; do 43 | [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; 44 | done 45 | 46 | 47 | ref_filtering_cmd="cat" 48 | [ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" 49 | [ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" 50 | hyp_filtering_cmd="cat" 51 | [ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" 52 | [ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" 53 | 54 | 55 | if $decode_mbr ; then 56 | echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" 57 | else 58 | echo "$0: scoring with word insertion penalty=$word_ins_penalty" 59 | fi 60 | 61 | 62 | mkdir -p $dir/scoring_kaldi 63 | cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; 64 | 65 | if [ $stage -le 0 ]; then 66 | 67 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 68 | mkdir -p $dir/scoring_kaldi/penalty_$wip/log 69 | 70 | if $decode_mbr ; then 71 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ 72 | acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ 73 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 74 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 75 | lattice-prune --beam=$beam ark:- ark:- \| \ 76 | lattice-mbr-decode --word-symbol-table=$symtab \ 77 | ark:- ark,t:- \| \ 78 | utils/int2sym.pl -f 2- $symtab \| \ 79 | $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; 80 | 81 | else 82 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ 83 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 84 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 85 | lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ 86 | utils/int2sym.pl -f 2- $symtab \| \ 87 | $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; 88 | fi 89 | 90 | if $reverse; then # rarely-used option, ignore this. 
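# (the awk command below reverses the word order of each hypothesis; the utterance id in field 1 stays first)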
91 | for lmwt in `seq $min_lmwt $max_lmwt`; do 92 | mv $dir/scoring_kaldi/penalty_$wip/$lmwt.txt $dir/scoring_kaldi/penalty_$wip/$lmwt.txt.orig 93 | awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \ 94 | <$dir/scoring_kaldi/penalty_$wip/$lmwt.txt.orig >$dir/scoring_kaldi/penalty_$wip/$lmwt.txt 95 | done 96 | fi 97 | 98 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ 99 | cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ 100 | compute-wer --text --mode=present \ 101 | ark:$dir/scoring_kaldi/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; 102 | 103 | done 104 | fi 105 | 106 | 107 | 108 | if [ $stage -le 1 ]; then 109 | 110 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 111 | for lmwt in $(seq $min_lmwt $max_lmwt); do 112 | # adding /dev/null to the command list below forces grep to output the filename 113 | grep WER $dir/wer_${lmwt}_${wip} /dev/null 114 | done 115 | done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 116 | 117 | best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) 118 | best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') 119 | best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') 120 | 121 | if [ -z "$best_lmwt" ]; then 122 | echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 123 | exit 1; 124 | fi 125 | 126 | if $stats; then 127 | mkdir -p $dir/scoring_kaldi/wer_details 128 | echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight 129 | echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty 130 | 131 | $cmd $dir/scoring_kaldi/log/stats1.log \ 132 | cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ 133 | align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ 134 | utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ 135 | utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; 136 | 137 | $cmd $dir/scoring_kaldi/log/stats2.log \ 138 | cat $dir/scoring_kaldi/wer_details/per_utt \| \ 139 | utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ 140 | sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; 141 | 142 | $cmd $dir/scoring_kaldi/log/wer_bootci.log \ 143 | compute-wer-bootci --mode=present \ 144 | ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ 145 | '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; 146 | 147 | fi 148 | fi 149 | 150 | # If we got here, the scoring was successful. 151 | # As a small aid to prevent confusion, we remove all wer_{?,??} files; 152 | # these originate from the previous version of the scoring files 153 | rm $dir/wer_{?,??} 2>/dev/null 154 | 155 | exit 0; 156 | -------------------------------------------------------------------------------- /recipe/local/nnet3/run_ivector_common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -o pipefail 4 | 5 | 6 | # This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually 7 | # be called by more scripts). It contains the common feature preparation and iVector-related parts 8 | # of the script. See those scripts for examples of usage. 
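# In short: it speed- and volume-perturbs the training data, extracts 40-dimensional
# high-resolution MFCCs (conf/mfcc_hires.conf), combines short segments, and produces
# the online iVectors that the chain training stages in run.sh expect.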
9 | 10 | 11 | stage=0 12 | nj=30 13 | min_seg_len=1.55 # min length in seconds... we do this because chain training 14 | # will discard segments shorter than 1.5 seconds. Must remain in sync 15 | # with the same option given to prepare_lores_feats_and_alignments.sh 16 | train_set=train_cleaned # you might set this to e.g. train. 17 | gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; 18 | # it should contain alignments for 'train_set'. 19 | 20 | num_threads_ubm=32 21 | nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it 22 | # becomes exp/nnet3_cleaned or whatever. 23 | 24 | . ./cmd.sh 25 | . ./path.sh 26 | . utils/parse_options.sh 27 | 28 | gmm_dir=exp/${gmm} 29 | ali_dir=exp/${gmm}_ali_${train_set}_sp_comb 30 | 31 | for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do 32 | if [ ! -f $f ]; then 33 | echo "$0: expected file $f to exist" 34 | exit 1 35 | fi 36 | done 37 | 38 | 39 | 40 | if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then 41 | echo "$0: data/${train_set}_sp_hires/feats.scp already exists." 42 | echo " ... Please either remove it, or rerun this script with stage > 2." 43 | exit 1 44 | fi 45 | 46 | 47 | if [ $stage -le 1 ]; then 48 | echo "$0: preparing directory for speed-perturbed data" 49 | utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp 50 | fi 51 | 52 | if [ $stage -le 2 ]; then 53 | echo "$0: creating high-resolution MFCC features" 54 | 55 | # this shows how you can split across multiple file-systems. we'll split the 56 | # MFCC dir across multiple locations. You might want to be careful here, if you 57 | # have multiple copies of Kaldi checked out and run the same recipe, not to let 58 | # them overwrite each other. 59 | mfccdir=data/${train_set}_sp_hires/data 60 | if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then 61 | utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage 62 | fi 63 | 64 | for datadir in ${train_set}_sp test; do 65 | utils/copy_data_dir.sh data/$datadir data/${datadir}_hires 66 | done 67 | 68 | # do volume-perturbation on the training data prior to extracting hires 69 | # features; this helps make trained nnets more invariant to test data volume. 70 | utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires 71 | 72 | for datadir in ${train_set}_sp test; do 73 | steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ 74 | --cmd "$train_cmd" data/${datadir}_hires 75 | steps/compute_cmvn_stats.sh data/${datadir}_hires 76 | utils/fix_data_dir.sh data/${datadir}_hires 77 | done 78 | fi 79 | 80 | if [ $stage -le 3 ]; then 81 | echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" 82 | # we have to combine short segments or we won't be able to train chain models 83 | # on those segments. 84 | utils/data/combine_short_segments.sh \ 85 | data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb 86 | 87 | # just copy over the CMVN to avoid having to recompute it. 88 | cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ 89 | utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ 90 | fi 91 | 92 | if [ $stage -le 4 ]; then 93 | echo "$0: selecting segments of hires training data that were also present in the" 94 | echo " ... original training data." 
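# (utils/data/subset_data_dir.sh --utt-list below keeps only the utterances listed
# in the original train set's feats.scp)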
95 | 96 | # note, these data-dirs are temporary; we put them in a sub-directory 97 | # of the place where we'll make the alignments. 98 | temp_data_root=exp/nnet3${nnet3_affix}/tri5 99 | mkdir -p $temp_data_root 100 | 101 | utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ 102 | data/${train_set}_sp_hires $temp_data_root/${train_set}_hires 103 | 104 | # note: essentially all the original segments should be in the hires data. 105 | n1=$(wc -l . 19 | # 20 | # adapted from kaldi's egs/tedlium/s5_r2/local/chain/run_tdnn.sh 21 | 22 | mfccdir=mfcc_chain 23 | 24 | stage=0 25 | min_seg_len=1.55 26 | train_set=train 27 | gmm=tri2b_chain # the gmm for the target data 28 | nnet3_affix=_chain # cleanup affix for nnet3 and chain dirs, e.g. _cleaned 29 | num_threads_ubm=12 30 | get_egs_stage=-10 31 | 32 | xent_regularize=0.1 33 | train_stage=-10 34 | common_egs_dir= # you can set this to use previously dumped egs. 35 | dropout_schedule='0,0@0.20,0.5@0.50,0' 36 | frames_per_eg=150,110,100 37 | 38 | # pre-flight checks 39 | 40 | if [ -f cmd.sh ]; then 41 | . cmd.sh; else 42 | echo "missing cmd.sh"; exit 1; 43 | fi 44 | 45 | # Path also sets LC_ALL=C for Kaldi, otherwise you will experience strange (and hard to debug!) bugs. It should be set here, after the python scripts and not at the beginning of this script 46 | if [ -f path.sh ]; then 47 | . path.sh; else 48 | echo "missing path.sh"; exit 1; 49 | 50 | fi 51 | 52 | # At this script level we don't support not running on GPU, as it would be painfully slow. 53 | # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, 54 | # --num-threads 16 and --minibatch-size 128. 55 | 56 | if ! cuda-compiled; then 57 | cat < data/local/dict/lexicon.txt 93 | fi 94 | 95 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 96 | 97 | fi 98 | 99 | # 100 | # adapt our LM for kaldi 101 | # 102 | 103 | if [ $stage -le 2 ]; then 104 | 105 | echo 106 | echo "adapt our LM for kaldi..." 107 | echo 108 | 109 | rm -rf data/lang_test 110 | cp -r data/lang data/lang_test 111 | 112 | echo 113 | echo "creating G.fst..." 
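# The pipeline below converts the ARPA language model into data/lang_test/G.fst:
# find_arpa_oovs.pl lists words missing from words.txt, the grep -v filters drop the
# illegal <s>/</s> n-grams that arpa2fst cannot handle, remove_oovs.pl strips arcs with
# out-of-vocabulary words, eps2disambig.pl/s2eps.pl replace backoff epsilons with #0 and
# the sentence-boundary symbols with epsilon, and fstcompile/fstrmepsilon build the final
# grammar FST.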
114 | 
115 | mkdir -p data/local/lm/
116 | zcat lm/lm.arpa.gz | utils/find_arpa_oovs.pl data/lang_test/words.txt > data/local/lm/oovs_lm.txt
117 | 
118 | zcat lm/lm.arpa.gz | \
119 | grep -v '<s> <s>' | \
120 | grep -v '</s> <s>' | \
121 | grep -v '</s> </s>' | \
122 | arpa2fst - | fstprint | \
123 | utils/remove_oovs.pl data/local/lm/oovs_lm.txt | \
124 | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
125 | --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
126 | fstrmepsilon > data/lang_test/G.fst
127 | 
128 | fi
129 | 
130 | if [ $stage -le 3 ]; then
131 | echo
132 | echo make mfcc
133 | echo
134 | 
135 | rm -rf exp/
136 | 
137 | for datadir in train test; do
138 | utils/fix_data_dir.sh data/$datadir
139 | 
140 | mkdir -p data/$datadir/wav.scp exp/make_mfcc_chain/$datadir
141 | 
142 | for f in wav.scp utt2spk spk2utt; do
143 | cp data/$datadir/$f exp/make_mfcc_chain/$datadir/
144 | done
145 | 
146 | steps/make_mfcc.sh --cmd "$train_cmd" --nj $nJobs data/$datadir exp/make_mfcc_chain/$datadir $mfccdir || exit 1;
147 | utils/fix_data_dir.sh data/${datadir} # some files fail to get mfcc for many reasons
148 | steps/compute_cmvn_stats.sh data/${datadir} exp/make_mfcc_chain/$datadir $mfccdir || exit 1;
149 | utils/fix_data_dir.sh data/${datadir} # some files fail to get mfcc for many reasons
150 | done
151 | fi
152 | 
153 | if [ $stage -le 4 ]; then
154 | echo
155 | echo mono0a_chain
156 | echo
157 | 
158 | steps/train_mono.sh --nj $nJobs --cmd "$train_cmd" \
159 | data/train data/lang exp/mono0a_chain || exit 1;
160 | fi
161 | 
162 | if [ $stage -le 5 ]; then
163 | echo
164 | echo tri1_chain
165 | echo
166 | 
167 | steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
168 | data/train data/lang exp/mono0a_chain exp/mono0a_ali_chain || exit 1;
169 | 
170 | steps/train_deltas.sh --cmd "$train_cmd" 2000 10000 \
171 | data/train data/lang exp/mono0a_ali_chain exp/tri1_chain || exit 1;
172 | fi
173 | 
174 | if [ $stage -le 6 ]; then
175 | echo
176 | echo tri2b_chain
177 | echo
178 | 
179 | steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
180 | data/train data/lang exp/tri1_chain exp/tri1_ali_chain || exit 1;
181 | 
182 | steps/train_lda_mllt.sh --cmd "$train_cmd" \
183 | --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
184 | data/train data/lang exp/tri1_ali_chain exp/tri2b_chain || exit 1;
185 | 
186 | utils/mkgraph.sh data/lang_test \
187 | exp/tri2b_chain exp/tri2b_chain/graph || exit 1;
188 | fi
189 | 
190 | gmm_dir=exp/$gmm
191 | ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
192 | tree_dir=exp/nnet3${nnet3_affix}/tree_sp
193 | lang=data/lang_chain
194 | lat_dir=exp/nnet3${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats
195 | dir=exp/nnet3${nnet3_affix}/tdnn_250
196 | train_data_dir=data/${train_set}_sp_hires_comb
197 | lores_train_data_dir=data/${train_set}_sp_comb
198 | train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb
199 | 
200 | if [ $stage -le 7 ]; then
201 | echo
202 | echo run_ivector_common.sh
203 | echo
204 | 
205 | local/nnet3/run_ivector_common.sh --stage 0 \
206 | --nj $nJobs \
207 | --min-seg-len $min_seg_len \
208 | --train-set $train_set \
209 | --gmm $gmm \
210 | --num-threads-ubm $num_threads_ubm \
211 | --nnet3-affix "$nnet3_affix"
212 | 
213 | for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
214 | $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
215 | [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
216 | done
217 | fi
218 | 
219 | if [ $stage -le 8 ]; then
220 | echo
221 | echo creating lang directory with one state per phone.
222 | echo
223 | 
224 | if [ -d data/lang_chain ]; then
225 | if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then
226 | echo "$0: data/lang_chain already exists, not overwriting it; continuing"
227 | else
228 | echo "$0: data/lang_chain already exists and seems to be older than data/lang..."
229 | echo " ... not sure what to do. Exiting."
230 | exit 1;
231 | fi
232 | else
233 | cp -r data/lang data/lang_chain
234 | silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
235 | nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
236 | # Use our special topology... note that later on may have to tune this
237 | # topology.
238 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
239 | fi
240 | fi
241 | 
242 | if [ $stage -le 9 ]; then
243 | echo
244 | echo 'Get the alignments as lattices (gives the chain training more freedom).'
245 | echo
246 | 
247 | steps/align_fmllr_lats.sh --nj $nJobs --cmd "$train_cmd" ${lores_train_data_dir} \
248 | data/lang $gmm_dir $lat_dir
249 | rm $lat_dir/fsts.*.gz # save space
250 | fi
251 | 
252 | if [ $stage -le 10 ]; then
253 | echo
254 | echo 'Build a tree using our new topology. We know we have alignments for the'
255 | echo 'speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use'
256 | echo 'those.'
257 | echo
258 | 
259 | if [ -f $tree_dir/final.mdl ]; then
260 | echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
261 | exit 1;
262 | fi
263 | steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
264 | --context-opts "--context-width=2 --central-position=1" \
265 | --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
266 | 
267 | fi
268 | 
269 | #
270 | # smaller model for embedded use
271 | #
272 | 
273 | if [ $stage -le 11 ]; then
274 | 
275 | mkdir -p $dir
276 | 
277 | echo
278 | echo "$0: creating neural net configs using the xconfig parser";
279 | echo
280 | 
281 | num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
282 | learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python3)
283 | 
284 | mkdir -p $dir/configs
285 | cat <<EOF > $dir/configs/network.xconfig
286 | input dim=100 name=ivector
287 | input dim=40 name=input
288 | 
289 | # please note that it is important to have input layer with the name=input
290 | # as the layer immediately preceding the fixed-affine-layer to enable
291 | # the use of short notation for the descriptor
292 | fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
293 | 
294 | # the first splicing is moved before the lda layer, so no splicing here
295 | relu-batchnorm-layer name=tdnn1 dim=250 self-repair-scale=1.0e-04
296 | relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=250
297 | relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=250
298 | relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=250
299 | relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=250
300 | relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=250
301 | 
302 | ## adding the layers for chain branch
303 | relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=250 target-rms=0.5
304 | output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
305 | 
306 | # adding the layers for xent branch
307 | #
This block prints the configs for a separate output that will be 308 | # trained with a cross-entropy objective in the 'chain' models... this 309 | # has the effect of regularizing the hidden parts of the model. we use 310 | # 0.5 / args.xent_regularize as the learning rate factor- the factor of 311 | # 0.5 / args.xent_regularize is suitable as it means the xent 312 | # final-layer learns at a rate independent of the regularization 313 | # constant; and the 0.5 was tuned so as to make the relative progress 314 | # similar in the xent and regular final layers. 315 | relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=250 target-rms=0.5 316 | output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 317 | 318 | EOF 319 | steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ 320 | 321 | echo 322 | echo train.py 323 | echo 324 | 325 | steps/nnet3/chain/train.py --stage $train_stage \ 326 | --cmd "$decode_cmd" \ 327 | --feat.online-ivector-dir $train_ivector_dir \ 328 | --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ 329 | --chain.xent-regularize 0.1 \ 330 | --chain.leaky-hmm-coefficient 0.1 \ 331 | --chain.l2-regularize 0.00005 \ 332 | --chain.apply-deriv-weights false \ 333 | --chain.lm-opts="--num-extra-lm-states=2000" \ 334 | --egs.dir "$common_egs_dir" \ 335 | --egs.opts "--frames-overlap-per-eg 0" \ 336 | --egs.chunk-width 150 \ 337 | --trainer.num-chunk-per-minibatch 512 \ 338 | --trainer.frames-per-iter 1500000 \ 339 | --trainer.num-epochs 4 \ 340 | --trainer.optimization.proportional-shrink 20 \ 341 | --trainer.optimization.num-jobs-initial 1 \ 342 | --trainer.optimization.num-jobs-final 1 \ 343 | --trainer.optimization.initial-effective-lrate 0.001 \ 344 | --trainer.optimization.final-effective-lrate 0.0001 \ 345 | --trainer.max-param-change 2.0 \ 346 | --use-gpu wait \ 347 | --cleanup.remove-egs true \ 348 | --feat-dir $train_data_dir \ 349 | --tree-dir $tree_dir \ 350 | --lat-dir $lat_dir \ 351 | --dir $dir 352 | 353 | echo 354 | echo mkgraph 355 | echo 356 | 357 | utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph 358 | 359 | if [[ -f utils/mkgraph_lookahead.sh ]]; then 360 | utils/mkgraph_lookahead.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph_lookahead 361 | fi 362 | fi 363 | 364 | if [ $stage -le 12 ]; then 365 | echo 366 | echo decode 367 | echo 368 | 369 | steps/nnet3/decode.sh --num-threads 1 --nj $nDecodeJobs --cmd "$decode_cmd" \ 370 | --acwt 1.0 --post-decode-acwt 10.0 \ 371 | --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ 372 | --scoring-opts "--min-lmwt 5 " \ 373 | $dir/graph data/test_hires $dir/decode_test || exit 1; 374 | 375 | grep WER $dir/decode_test/scoring_kaldi/best_wer >>RESULTS.txt 376 | fi 377 | 378 | # 379 | # larger tdnn_f model for higher end machines 380 | # 381 | # network config based on 382 | # egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh 383 | # 384 | 385 | dir=exp/nnet3${nnet3_affix}/tdnn_f 386 | 387 | num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') 388 | learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python3) 389 | affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" 390 | tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" 391 | linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" 392 | prefinal_opts="l2-regularize=0.01" 393 | output_opts="l2-regularize=0.002" 394 | 
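# Note: dropout-proportion=0.0 in affine_opts/tdnnf_opts is only the starting value; the
# schedule actually used is $dropout_schedule ('0,0@0.20,0.5@0.50,0'), which holds dropout
# at 0 for the first 20% of training, ramps it linearly to 0.5 at the halfway point, and
# brings it back to 0 by the end. bypass-scale=0.66 scales the tdnnf-layer skip connections.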
395 | if [ $stage -le 13 ]; then
396 | 
397 | mkdir -p $dir
398 | 
399 | echo
400 | echo "$0: creating neural net configs using the xconfig parser";
401 | echo
402 | 
403 | mkdir -p $dir/configs
404 | cat <<EOF > $dir/configs/network.xconfig
405 | input dim=100 name=ivector
406 | input dim=40 name=input
407 | # please note that it is important to have input layer with the name=input
408 | # as the layer immediately preceding the fixed-affine-layer to enable
409 | # the use of short notation for the descriptor
410 | fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
411 | # the first splicing is moved before the lda layer, so no splicing here
412 | relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
413 | tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
414 | tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
415 | tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
416 | tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
417 | tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
418 | tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
419 | tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
420 | tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
421 | tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
422 | tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
423 | tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
424 | tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
425 | tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
426 | tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
427 | linear-component name=prefinal-l dim=256 $linear_opts
428 | prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
429 | output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
430 | prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
431 | output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
432 | EOF
433 | steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
434 | 
435 | fi
436 | 
437 | if [ $stage -le 14 ]; then
438 | 
439 | echo
440 | echo train.py
441 | echo
442 | 
443 | steps/nnet3/chain/train.py --stage $train_stage \
444 | --cmd "$decode_cmd" \
445 | --feat.online-ivector-dir $train_ivector_dir \
446 | --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
447 | --chain.xent-regularize $xent_regularize \
448 | --chain.leaky-hmm-coefficient 0.1 \
449 | --chain.l2-regularize 0.0 \
450 | --chain.apply-deriv-weights false \
451 | --chain.lm-opts="--num-extra-lm-states=2000" \
452 | --trainer.dropout-schedule $dropout_schedule \
453 | --trainer.add-option="--optimization.memory-compression-level=2" \
454 | --egs.dir "$common_egs_dir" \
455 | --egs.stage $get_egs_stage \
456 | --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
457 | --egs.chunk-width $frames_per_eg \
458 | --trainer.num-chunk-per-minibatch 288 \
459 | --trainer.frames-per-iter 1500000 \
460 | --trainer.num-epochs 6 \
461 | --trainer.optimization.num-jobs-initial 1 \
462 | 
--trainer.optimization.num-jobs-final 1 \ 463 | --trainer.optimization.initial-effective-lrate 0.00025 \ 464 | --trainer.optimization.final-effective-lrate 0.000025 \ 465 | --trainer.max-param-change 2.0 \ 466 | --use-gpu wait \ 467 | --cleanup.remove-egs true \ 468 | --feat-dir $train_data_dir \ 469 | --tree-dir $tree_dir \ 470 | --lat-dir $lat_dir \ 471 | --dir $dir || exit 1; 472 | 473 | echo 474 | echo mkgraph 475 | echo 476 | 477 | utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph 478 | 479 | if [[ -f utils/mkgraph_lookahead.sh ]]; then 480 | utils/mkgraph_lookahead.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph_lookahead 481 | fi 482 | 483 | echo 484 | echo decode 485 | echo 486 | 487 | steps/nnet3/decode.sh --num-threads 1 --nj $nDecodeJobs --cmd "$decode_cmd" \ 488 | --acwt 1.0 --post-decode-acwt 10.0 \ 489 | --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ 490 | --scoring-opts "--min-lmwt 5 " \ 491 | $dir/graph data/test_hires $dir/decode_test || exit 1; 492 | 493 | grep WER $dir/decode_test/scoring_kaldi/best_wer >>RESULTS.txt 494 | 495 | fi 496 | --------------------------------------------------------------------------------