├── local ├── files │ ├── cvlist │ │ ├── IND_cv_spk │ │ ├── KR_cv_spk │ │ ├── RU_cv_spk │ │ ├── CHN_cv_spk │ │ ├── JPN_cv_spk │ │ ├── PT_cv_spk │ │ ├── US_cv_spk │ │ └── UK_cv_spk │ ├── ar.dict │ └── asr.dict ├── tools │ ├── train_kenlm.sh │ ├── preprocess.py │ ├── apply_lexicon.py │ ├── parse_track1_jsons.py │ ├── dump_spk_yzl23.sh │ ├── word_frequency.py │ ├── dump.sh │ ├── data2json.sh │ └── merge_scp2json.py ├── prepare_LG.fst ├── track2_kaldi_gmm_train.sh ├── track1_espnet_transformer_train.sh ├── track2_espnet_transformer_train.sh ├── prepare_data.sh └── track2_kaldi_chain_train.sh ├── conf ├── espnet_decode.yaml ├── fbank.conf ├── espnet_lm.yaml ├── espnet_specaug.yaml ├── track1_accent_transformer.yaml ├── espnet_train.yaml └── xconfig ├── README.md ├── README_en.md ├── module ├── track1_accent_transformer.py └── track2_asr_transformer.py └── LICENSE /local/files/cvlist/IND_cv_spk: -------------------------------------------------------------------------------- 1 | IND-G00892 2 | IND-G01006 3 | IND-G01501 4 | IND-G0760 -------------------------------------------------------------------------------- /local/files/cvlist/KR_cv_spk: -------------------------------------------------------------------------------- 1 | KR-G00022 2 | KR-G00276 3 | KR-G10029 4 | KR-G10122 -------------------------------------------------------------------------------- /local/files/cvlist/RU_cv_spk: -------------------------------------------------------------------------------- 1 | RU-G00163 2 | RU-G00196 3 | RU-G00439 4 | RU-G10416 -------------------------------------------------------------------------------- /local/files/cvlist/CHN_cv_spk: -------------------------------------------------------------------------------- 1 | CHN-G00190 2 | CHN-G00992 3 | CHN-G61365 4 | CHN-G01372 -------------------------------------------------------------------------------- /local/files/cvlist/JPN_cv_spk: -------------------------------------------------------------------------------- 1 | JPN-G00040 2 | JPN-G00125 3 | JPN-G00354 4 | JPN-G20194 -------------------------------------------------------------------------------- /local/files/cvlist/PT_cv_spk: -------------------------------------------------------------------------------- 1 | PT-G00600 2 | PT-G00643 3 | PT-G00963 4 | PT-G10618 5 | PT-G20539 -------------------------------------------------------------------------------- /local/files/cvlist/US_cv_spk: -------------------------------------------------------------------------------- 1 | US-G00007 2 | US-G01459 3 | US-G10948 4 | US-G20537 5 | US-G20939 6 | US-G30201 -------------------------------------------------------------------------------- /local/files/ar.dict: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | -------------------------------------------------------------------------------- /local/files/cvlist/UK_cv_spk: -------------------------------------------------------------------------------- 1 | UK-G00025 2 | UK-G00808 3 | UK-G01337 4 | UK-G01807 5 | UK-G10261 6 | UK-G11032 7 | UK-G11739 8 | UK-G40517 -------------------------------------------------------------------------------- /conf/espnet_decode.yaml: -------------------------------------------------------------------------------- 1 | batchsize: 0 2 | beam-size: 10 3 | penalty: 0.0 4 | maxlenratio: 0.0 5 | minlenratio: 0.0 6 | ctc-weight: 0.3 7 | lm-weight: 0.3 8 | -------------------------------------------------------------------------------- /conf/fbank.conf: 
-------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --sample-frequency=16000 3 | --num-mel-bins=71 # 8kHz so we use 36 bins (@ 8 filters/octave to get closer to 40 filters/16Khz used by IBM) 4 | --allow_downsample=true 5 | -------------------------------------------------------------------------------- /conf/espnet_lm.yaml: -------------------------------------------------------------------------------- 1 | # rnnlm related 2 | layer: 2 3 | unit: 1024 4 | opt: sgd # or adam 5 | batchsize: 64 # batch size in LM training 6 | epoch: 30 # if the data size is large, we can reduce this 7 | patience: 3 8 | maxlen: 100 # if sentence length > lm_maxlen, lm_batchsize is automatically reduced 9 | -------------------------------------------------------------------------------- /conf/espnet_specaug.yaml: -------------------------------------------------------------------------------- 1 | process: 2 | # these three processes are a.k.a. SpecAugument 3 | # - type: "time_warp" 4 | # max_time_warp: 5 5 | # inplace: true 6 | # mode: "PIL" 7 | - type: "freq_mask" 8 | F: 20 9 | n_mask: 2 10 | inplace: true 11 | replace_with_zero: false 12 | - type: "time_mask" 13 | T: 40 14 | n_mask: 2 15 | inplace: true 16 | replace_with_zero: false 17 | -------------------------------------------------------------------------------- /local/tools/train_kenlm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | order=3 7 | prune="0 1 1" 8 | mem_rate=40% 9 | output_dir= 10 | arpa_name= 11 | fallback="0.5 1 1.5" 12 | 13 | input=$1 14 | 15 | cat $input | lmplz \ 16 | -o $order \ 17 | -S $mem_rate \ 18 | --prune $prune \ 19 | --discount_fallback $fallback \ 20 | --arpa $2 21 | 22 | echo "local/train_kenlm.sh succeeded" 23 | exit 0; 24 | -------------------------------------------------------------------------------- /local/tools/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 2 | # Apache 2.0 3 | 4 | import sys 5 | 6 | fin=open(sys.argv[1], 'r') 7 | fout_text = open(sys.argv[2], 'w') 8 | fout_utt2spk = open(sys.argv[3], 'w') 9 | 10 | for line in fin.readlines(): 11 | uttid, path = line.strip('\n').split('\t') 12 | text_path = path.replace('.wav', '.txt') 13 | text_ori = open(text_path, 'r').readlines()[0].strip('\n') 14 | feild = path.split('/') 15 | accid = feild[-3] 16 | spkid = accid + '-' + feild[-2] 17 | fout_utt2spk.write(uttid + '\t' + spkid + '\n') 18 | fout_text.write(text_ori + '\n') -------------------------------------------------------------------------------- /conf/track1_accent_transformer.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | elayers: 6 4 | eunits: 2048 5 | # attention related 6 | adim: 256 7 | aheads: 4 8 | 9 | # label smoothing 10 | lsm-weight: 0.0 11 | 12 | # minibatch related 13 | batch-size: 32 14 | maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced 15 | maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced 16 | 17 | # optimization related 18 | sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled 
for 'other' epochs 19 | opt: noam 20 | accum-grad: 2 21 | grad-clip: 5 22 | patience: 0 23 | epochs: 40 24 | dropout-rate: 0.1 25 | 26 | # transformer specific setting 27 | backend: pytorch 28 | model-module: "espnet.nets.pytorch_backend.track1_accent_transformer:E2E" 29 | transformer-input-layer: conv2d # encoder architecture type 30 | transformer-lr: 5.0 31 | transformer-warmup-steps: 25000 32 | transformer-attn-dropout-rate: 0.0 33 | transformer-length-normalized-loss: false 34 | transformer-init: pytorch 35 | 36 | -------------------------------------------------------------------------------- /local/prepare_LG.fst: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | stage=1 7 | 8 | . ./cmd.sh 9 | . ./path.sh 10 | . ./utils/parse_options.sh 11 | 12 | if [ $# -ne 2 ]; then 13 | echo "prepare_all.sh " 14 | echo " e.g prepare_all.sh data data/train/trans the data/ contains the dir of data and mfcc." 15 | exit 1; 16 | fi 17 | 18 | data_set=$1 19 | train_text=$2 20 | 21 | # L 22 | if [ $stage -le 1 ]; then 23 | ./utils/prepare_lang.sh --position-dependent-phones false \ 24 | $data_set/local/dict "" $data_set/local/lang $data_set/lang || exit 1; 25 | fi 26 | 27 | # arpa LM 28 | if [ $stage -le 2 ]; then 29 | local/train_kenlm.sh $train_text \ 30 | $data_set/local/lm.arpa || exit 1; 31 | fi 32 | 33 | # G compilation, check LG composition 34 | if [ $stage -le 3 ]; then 35 | ./local/format_lm.sh $data_set/lang $data_set/local/lm.arpa \ 36 | $data_set/local/dict/lexicon.txt $data_set/lang_test || exit 1; 37 | fi 38 | 39 | echo "local/prepare_lang.sh succeeded" 40 | exit 0; 41 | 42 | -------------------------------------------------------------------------------- /conf/espnet_train.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | elayers: 12 4 | eunits: 2048 5 | # decoder related 6 | dlayers: 6 7 | dunits: 2048 8 | # attention related 9 | adim: 256 10 | aheads: 4 11 | 12 | # hybrid CTC/attention 13 | mtlalpha: 0.3 14 | 15 | # label smoothing 16 | lsm-weight: 0.1 17 | 18 | # minibatch related 19 | batch-size: 32 20 | maxlen-in: 450 # if input length > maxlen-in, batchsize is automatically reduced 21 | maxlen-out: 18 # if output length > maxlen-out, batchsize is automatically reduced 22 | 23 | # batch-count: frame 24 | # batch-frames-in 3200 25 | # batch-frames-out 100 26 | # batch-frames-inout 900 27 | 28 | # optimization related 29 | sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 30 | opt: noam 31 | accum-grad: 2 32 | grad-clip: 5 33 | patience: 0 34 | epochs: 50 35 | dropout-rate: 0.1 36 | 37 | # transformer specific setting 38 | backend: pytorch 39 | model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E" 40 | transformer-input-layer: conv2d # encoder architecture type 41 | transformer-lr: 1.0 42 | transformer-warmup-steps: 25000 43 | transformer-attn-dropout-rate: 0.0 44 | transformer-length-normalized-loss: false 45 | transformer-init: pytorch 46 | -------------------------------------------------------------------------------- /local/tools/apply_lexicon.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 2 | # Apache 2.0 3 | 
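# Usage sketch (mirroring the call made from prepare_data.sh):
#   python apply_lexicon.py LEXICON INPUT_TEXT OUTPUT_TOKENS UNK_SYMBOL WARNING_FILE UNITS_FILE
# Every word in INPUT_TEXT is mapped to its token sequence from LEXICON; words missing from the
# lexicon are written to WARNING_FILE and replaced by UNK_SYMBOL, and the sorted token inventory
# is dumped to UNITS_FILE with 1-based indices.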
4 | import codecs 5 | import sys 6 | 7 | dict = sys.argv[1] 8 | input = sys.argv[2] 9 | output = sys.argv[3] 10 | unk = sys.argv[4] 11 | warning = sys.argv[5] 12 | unit_name = sys.argv[6] 13 | 14 | map = {} 15 | units = [] 16 | 17 | with codecs.open(dict, 'r', encoding='utf-8') as f1: 18 | for line in f1: 19 | word = line.split('\t')[0] 20 | tokens = line.rstrip('\n').split('\t')[1] 21 | map[word] = tokens 22 | 23 | with codecs.open(input, 'r', encoding='utf-8') as f2: 24 | with codecs.open(output, 'w', encoding='utf-8') as f3 ,codecs.open(warning, 'w', encoding='utf-8') as f4: 25 | for line in f2: 26 | if len(line.split('\t')) > 1: 27 | head = line.split('\t')[0] 28 | sentence = line.rstrip('\n').split('\t')[1].split(' ') 29 | else: 30 | head = line.split(' ')[0] 31 | sentence = line.rstrip('\n').split(' ')[1:] 32 | result = head + '\t' 33 | for word in sentence: 34 | if len(word): 35 | if word in map: 36 | result += map[word] + ' ' 37 | for unit in map[word].split(' '): 38 | if unit not in units: 39 | units.append(unit) 40 | else: 41 | f4.write(word + '\n') 42 | result += unk + ' ' 43 | f3.write(result.rstrip(' ').lstrip(' ') + '\n') 44 | 45 | list.sort(units) 46 | units.insert(0, '') 47 | with codecs.open(unit_name, 'w', encoding='utf-8') as f5: 48 | for i in range(len(units)): 49 | f5.write(str(units[i]) + ' ' + str(i+1)+'\n') 50 | -------------------------------------------------------------------------------- /local/tools/parse_track1_jsons.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: luyizhou4 3 | # @Date: 2019-10-08 15:36:36 4 | # @Function: 5 | # @Last Modified time: 2020-09-13 19:17:44 6 | 7 | import sys 8 | import json 9 | 10 | def parse_result(result_label): 11 | ACCENT_LIST = ["US", "UK", "CHN", "IND", "JPN", "KR", "PT", "RU"] 12 | ACCENT_NUM = len(ACCENT_LIST) 13 | utt_nums = [0] * ACCENT_NUM 14 | correct_nums = [0] * ACCENT_NUM 15 | 16 | with open(result_label, 'r') as fd: 17 | for line in fd.readlines(): 18 | if not line.strip(): 19 | continue 20 | uttid, hyp = line.split()[:] 21 | hyp = int(hyp) 22 | ref = ACCENT_LIST.index(uttid.split('-')[0]) 23 | utt_nums[ref] += 1 24 | 25 | if ref == hyp: 26 | correct_nums[ref] += 1 27 | 28 | acc_per_accent = [100.0 * correct_nums[i] / utt_nums[i] for i in range(ACCENT_NUM)] 29 | for i in range(ACCENT_NUM): 30 | print('{} Accent Accuracy: {:.1f}'.format(ACCENT_LIST[i], acc_per_accent[i])) 31 | print('Average ACC: {} / {} = {:.1f}'.format(sum(correct_nums), sum(utt_nums), 100.0 * sum(correct_nums) / sum(utt_nums))) 32 | 33 | def main(): 34 | json_file = sys.argv[1] 35 | result_label = sys.argv[2] 36 | 37 | with open(json_file, 'r') as fd, open(result_label, 'w+') as w_fd: 38 | data = json.load(fd) 39 | uttid_list = list (data["utts"].keys()) 40 | uttid_list.sort() 41 | print('There are totally %s utts'%(len(uttid_list))) 42 | for uttid in uttid_list: 43 | rec_tokenid_list = data['utts'][uttid]["output"][0]["rec_tokenid"].split() 44 | rec_tokenid = ' '.join(rec_tokenid_list) 45 | w_fd.write(uttid + ' ' + rec_tokenid + '\n') 46 | 47 | parse_result(result_label) 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /local/track2_kaldi_gmm_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | set -e 7 | 8 | 
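# Example invocation (a sketch; the kdata/ and kaldi-exp/ directories below are this script's defaults):
#   ./local/track2_kaldi_gmm_train.sh --stage 2 --nj 50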
nj=50 9 | stage=2 10 | 11 | . ./cmd.sh 12 | [ -f ./path.sh ] && . ./path.sh; 13 | . ./utils/parse_options.sh 14 | 15 | data=kdata 16 | exp=kaldi-exp 17 | 18 | test_sets="CHN IND KR PT RU UK US JPN" 19 | 20 | # mono 21 | if [ $stage -le 2 ]; then 22 | # training 23 | steps/train_mono.sh --cmd "$decode_cmd" --nj $nj \ 24 | $data/train $data/lang $exp/mono || exit 1; 25 | 26 | # alignment 27 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 28 | $data/train $data/lang $exp/mono $exp/mono_ali || exit 1; 29 | fi 30 | 31 | # tri1 32 | if [ $stage -le 3 ]; then 33 | # training 34 | steps/train_deltas.sh --cmd "$decode_cmd" \ 35 | 4000 32000 $data/train $data/lang $exp/mono_ali $exp/tri1 || exit 1; 36 | 37 | # alignment 38 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 39 | $data/train $data/lang $exp/tri1 $exp/tri1_ali || exit 1; 40 | fi 41 | 42 | # tri2 43 | if [ $stage -le 4 ]; then 44 | # training 45 | steps/train_deltas.sh --cmd "$decode_cmd" \ 46 | 7000 56000 $data/train $data/lang $exp/tri1_ali $exp/tri2 || exit 1; 47 | 48 | # alignment 49 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 50 | $data/train $data/lang $exp/tri2 $exp/tri2_ali || exit 1; 51 | fi 52 | 53 | # tri3 54 | if [ $stage -le 5 ]; then 55 | # training [LDA+MLLT] 56 | steps/train_lda_mllt.sh --cmd "$decode_cmd" \ 57 | 10000 80000 $data/train $data/lang $exp/tri2_ali $exp/tri3 || exit 1; 58 | 59 | # decoding 60 | utils/mkgraph.sh $data/lang_test $exp/tri3 $exp/tri3/graph || exit 1; 61 | 62 | for test_set in $test_sets;do 63 | steps/decode.sh --cmd "$decode_cmd" --nj 30 --config conf/decode.conf \ 64 | $exp/tri3/graph $data/cv/$test_set $exp/tri3/decode_test_$test_set 65 | done 66 | 67 | # alignment 68 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 69 | $data/train $data/lang $exp/tri3 $exp/tri3_ali || exit 1; 70 | fi 71 | 72 | echo "local/track2_kaldi_gmm_train.sh succeeded" 73 | exit 0; 74 | 75 | -------------------------------------------------------------------------------- /local/tools/dump_spk_yzl23.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagoya University (Tomoki Hayashi) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | echo "$0 $*" # Print the command line for logging 7 | . ./path.sh 8 | 9 | cmd=run.pl 10 | nj=1 11 | verbose=0 12 | compress=true 13 | write_utt2num_frames=true 14 | filetype='mat' # mat or hdf5 15 | 16 | . utils/parse_options.sh 17 | 18 | scp=$1 19 | cvmnark=$2 20 | logdir=$3 21 | dumpdir=$4 22 | utt2spk=$5 23 | 24 | if [ $# != 5 ]; then 25 | echo "Usage: $0 " 26 | exit 1; 27 | fi 28 | 29 | set -euo pipefail 30 | 31 | mkdir -p ${logdir} 32 | mkdir -p ${dumpdir} 33 | 34 | dumpdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${dumpdir} ${PWD}) 35 | 36 | for n in $(seq ${nj}); do 37 | # the next command does nothing unless $dumpdir/storage/ exists, see 38 | # utils/create_data_link.pl for more info. 
39 | utils/create_data_link.pl ${dumpdir}/feats.${n}.ark 40 | done 41 | 42 | if ${write_utt2num_frames}; then 43 | write_num_frames_opt="--write-num-frames=ark,t:$dumpdir/utt2num_frames.JOB" 44 | else 45 | write_num_frames_opt= 46 | fi 47 | 48 | # split scp file 49 | split_scps="" 50 | for n in $(seq ${nj}); do 51 | split_scps="$split_scps $logdir/feats.$n.scp" 52 | done 53 | 54 | utils/split_scp.pl ${scp} ${split_scps} || exit 1; 55 | 56 | # dump features 57 | ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \ 58 | apply-cmvn --norm-vars=true --utt2spk=ark:${utt2spk} scp:${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \ 59 | copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \ 60 | --compress=${compress} --compression-method=2 ${write_num_frames_opt} \ 61 | ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \ 62 | || exit 1 63 | 64 | # concatenate scp files 65 | for n in $(seq ${nj}); do 66 | cat ${dumpdir}/feats.${n}.scp || exit 1; 67 | done > ${dumpdir}/feats.scp || exit 1 68 | 69 | if ${write_utt2num_frames}; then 70 | for n in $(seq ${nj}); do 71 | cat ${dumpdir}/utt2num_frames.${n} || exit 1; 72 | done > ${dumpdir}/utt2num_frames || exit 1 73 | rm ${dumpdir}/utt2num_frames.* 2>/dev/null 74 | fi 75 | 76 | # Write the filetype, this will be used for data2json.sh 77 | echo ${filetype} > ${dumpdir}/filetype 78 | 79 | 80 | # remove temp scps 81 | rm ${logdir}/feats.*.scp 2>/dev/null 82 | if [ ${verbose} -eq 1 ]; then 83 | echo "Succeeded dumping features for training" 84 | fi 85 | -------------------------------------------------------------------------------- /local/tools/word_frequency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # 6 | # Copyright ASLP@NPU. All Rights Reserved 7 | # 8 | # Licensed under the Apache License, Veresion 2.0(the "License"); 9 | # You may not use the file except in compliance with the Licese. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/license/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing,software 15 | # distributed under the License is distributed on an "AS IS" BASIS 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
19 | # 20 | # Author shixian(npu) 21 | # Date 2019/10/09 14:25:50 22 | # 23 | ###################################################################### 24 | import codecs 25 | import sys 26 | import operator 27 | 28 | if __name__ == '__main__': 29 | filename = sys.argv[1] 30 | top_nums = int(sys.argv[2]) 31 | prefix = sys.argv[3] 32 | dict_cn = {} 33 | dict_en = {} 34 | f2 = codecs.open("enwords.txt", "w", encoding='utf-8') 35 | with codecs.open(filename, "r", encoding='utf-8') as f: 36 | for line in f.readlines(): 37 | if len(line.split('\t')) > 1: 38 | line = line.split('\t')[1] 39 | start = 0 40 | else: 41 | start = 1 42 | for char in line.rstrip('\n').split(' ')[start:]: 43 | if char >= u'\u4e00' and char <= u'\u9fa5': 44 | if char not in dict_cn: 45 | dict_cn[char] = 1 46 | else: 47 | dict_cn[char] += 1 48 | else: 49 | f2.write(char + ' ') 50 | if char not in dict_en: 51 | dict_en[char] = 1 52 | else: 53 | dict_en[char] += 1 54 | f2.write('\n') 55 | dict_cn = sorted(dict_cn.items(),key=operator.itemgetter(1),reverse=True) 56 | dict_en = sorted(dict_en.items(),key=operator.itemgetter(1),reverse=True) 57 | fout_cn = codecs.open(prefix + '.cnwf', 'w', encoding='utf-8') 58 | fout_en = codecs.open(prefix + '.enwf', 'w', encoding='utf-8') 59 | if len(dict_cn): 60 | if top_nums == 0: 61 | for i in range(len(dict_cn)): 62 | fout_cn.write(dict_cn[i][0] + ' ' + str(dict_cn[i][1]) + '\n') 63 | else: 64 | for i in range(top_nums): 65 | fout_cn.write(dict_cn[i][0] + ' ' + str(dict_cn[i][1]) + '\n') 66 | if len(dict_en): 67 | for i in range(len(dict_en)): 68 | fout_en.write(dict_en[i][0] + ' ' + str(dict_en[i][1]) + '\n') 69 | fout_cn.close() 70 | fout_en.close() 71 | f2.close() 72 | -------------------------------------------------------------------------------- /local/tools/dump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagoya University (Tomoki Hayashi) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | echo "$0 $*" # Print the command line for logging 7 | . ./path.sh 8 | 9 | cmd=run.pl 10 | do_delta=false 11 | nj=1 12 | verbose=0 13 | compress=true 14 | write_utt2num_frames=true 15 | filetype='mat' # mat or hdf5 16 | help_message="Usage: $0 " 17 | 18 | . utils/parse_options.sh 19 | 20 | scp=$1 21 | cvmnark=$2 22 | logdir=$3 23 | dumpdir=$4 24 | 25 | if [ $# != 4 ]; then 26 | echo "${help_message}" 27 | exit 1; 28 | fi 29 | 30 | set -euo pipefail 31 | 32 | mkdir -p ${logdir} 33 | mkdir -p ${dumpdir} 34 | 35 | dumpdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${dumpdir} ${PWD}) 36 | 37 | for n in $(seq ${nj}); do 38 | # the next command does nothing unless $dumpdir/storage/ exists, see 39 | # utils/create_data_link.pl for more info. 
40 | utils/create_data_link.pl ${dumpdir}/feats.${n}.ark 41 | done 42 | 43 | if ${write_utt2num_frames}; then 44 | write_num_frames_opt="--write-num-frames=ark,t:$dumpdir/utt2num_frames.JOB" 45 | else 46 | write_num_frames_opt= 47 | fi 48 | 49 | # split scp file 50 | split_scps="" 51 | for n in $(seq ${nj}); do 52 | split_scps="$split_scps $logdir/feats.$n.scp" 53 | done 54 | 55 | utils/split_scp.pl ${scp} ${split_scps} || exit 1; 56 | 57 | # dump features 58 | if ${do_delta}; then 59 | ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \ 60 | apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \ 61 | add-deltas ark:- ark:- \| \ 62 | copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \ 63 | --compress=${compress} --compression-method=2 ${write_num_frames_opt} \ 64 | ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \ 65 | || exit 1 66 | else 67 | ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \ 68 | apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \ 69 | copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \ 70 | --compress=${compress} --compression-method=2 ${write_num_frames_opt} \ 71 | ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \ 72 | || exit 1 73 | fi 74 | 75 | # concatenate scp files 76 | for n in $(seq ${nj}); do 77 | cat ${dumpdir}/feats.${n}.scp || exit 1; 78 | done > ${dumpdir}/feats.scp || exit 1 79 | 80 | if ${write_utt2num_frames}; then 81 | for n in $(seq ${nj}); do 82 | cat ${dumpdir}/utt2num_frames.${n} || exit 1; 83 | done > ${dumpdir}/utt2num_frames || exit 1 84 | rm ${dumpdir}/utt2num_frames.* 2>/dev/null 85 | fi 86 | 87 | # Write the filetype, this will be used for data2json.sh 88 | echo ${filetype} > ${dumpdir}/filetype 89 | 90 | 91 | # remove temp scps 92 | # rm ${logdir}/feats.*.scp 2>/dev/null 93 | if [ ${verbose} -eq 1 ]; then 94 | echo "Succeeded dumping features for training" 95 | fi 96 | -------------------------------------------------------------------------------- /conf/xconfig: -------------------------------------------------------------------------------- 1 | # This file was created by the command: 2 | # steps/nnet3/xconfig_to_configs.py --xconfig-file exp2/chain/tdnn_a_all_sp/configs/network.xconfig --config-dir exp2/chain/tdnn_a_all_sp/configs/ 3 | # It is a copy of the source from which the config files in # this directory were generated. 
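# Orientation note: this is the CNN + TDNN-F chain topology referred to in the README's Track2
# baseline -- one convolutional front-end followed by 18 factored TDNN (tdnnf) layers, a 512-dim
# prefinal linear layer, and separate chain / cross-entropy output branches.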
4 | 5 | input dim=71 name=input 6 | conv-relu-batchnorm-layer name=cnn1 l2-regularize=0.005 height-in=71 height-out=71 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=32 7 | linear-component name=cnn2 dim=284 orthonormal-constraint=1.0 8 | # the first splicing is moved before the lda layer, so no splicing here 9 | relu-batchnorm-dropout-layer name=tdnn1 l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true dim=1280 10 | tdnnf-layer name=tdnnf2 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=1 11 | tdnnf-layer name=tdnnf3 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=1 12 | tdnnf-layer name=tdnnf4 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=1 13 | tdnnf-layer name=tdnnf5 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=0 14 | tdnnf-layer name=tdnnf6 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 15 | tdnnf-layer name=tdnnf7 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 16 | tdnnf-layer name=tdnnf8 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 17 | tdnnf-layer name=tdnnf9 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 18 | tdnnf-layer name=tdnnf10 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 19 | tdnnf-layer name=tdnnf11 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 20 | tdnnf-layer name=tdnnf12 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 21 | tdnnf-layer name=tdnnf13 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 22 | tdnnf-layer name=tdnnf14 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 23 | tdnnf-layer name=tdnnf15 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 24 | tdnnf-layer name=tdnnf16 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 25 | tdnnf-layer name=tdnnf17 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 26 | tdnnf-layer name=tdnnf18 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 27 | tdnnf-layer name=tdnnf19 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 28 | linear-component name=prefinal-l dim=512 orthonormal-constraint=1.0 29 | 30 | ## adding the layers for chain branch 31 | prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.03 small-dim=512 big-dim=1280 32 | output-layer name=output include-log-softmax=false dim=2170 l2-regularize=0.015 33 | 34 | # adding the layers for xent branch 35 | prefinal-layer name=prefinal-xent input=prefinal-l l2-regularize=0.03 small-dim=512 big-dim=1280 36 | output-layer name=output-xent dim=2170 learning-rate-factor=5.0 l2-regularize=0.015 37 | -------------------------------------------------------------------------------- /local/track1_espnet_transformer_train.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Speechlab @ SJTU (Author: Yizhou Lu) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh || exit 1; 7 | . ./cmd.sh || exit 1; 8 | 9 | # general configuration 10 | backend=pytorch 11 | stage=1 12 | stop_stage=2 13 | ngpu=4 # number of gpus ("0" uses cpu, otherwise use gpu) 14 | debugmode=1 15 | N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. 16 | verbose=0 # verbose option 17 | resume= # Resume the training from snapshot 18 | log=100 19 | 20 | preprocess_config=conf/specaug.yaml 21 | train_config=conf/track1_accent_transformer.yaml 22 | 23 | # others 24 | accum_grad=2 25 | n_iter_processes=2 26 | lsm_weight=0.0 27 | epochs=40 28 | elayers=12 29 | batch_size=20 30 | pretrained_model= 31 | 32 | # decoding parameter 33 | recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best' 34 | 35 | . utils/parse_options.sh || exit 1; 36 | 37 | # Set bash to 'debug' mode, it will exit on : 38 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 39 | set -e 40 | set -u 41 | set -o pipefail 42 | 43 | train_json=kdata/train/ar.json 44 | valid_json=kdata/cv_all/ar.json 45 | 46 | expdir=exp/track1_accent_classification_transformer_elayers${elayers} 47 | mkdir -p ${expdir} 48 | 49 | 50 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 51 | echo "stage 1: Network Training" 52 | ${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \ 53 | asr_train.py \ 54 | --config ${train_config} \ 55 | --preprocess-conf ${preprocess_config} \ 56 | --ngpu ${ngpu} \ 57 | --backend ${backend} \ 58 | --outdir ${expdir}/results \ 59 | --debugmode ${debugmode} \ 60 | --debugdir ${expdir} \ 61 | --minibatches ${N} \ 62 | --verbose ${verbose} \ 63 | --resume ${resume} \ 64 | --report-interval-iters ${log} \ 65 | --accum-grad ${accum_grad} \ 66 | --n-iter-processes ${n_iter_processes} \ 67 | --elayers ${elayers} \ 68 | --lsm-weight ${lsm_weight} \ 69 | --epochs ${epochs} \ 70 | --batch-size ${batch_size} \ 71 | ${pretrained_model:+--pretrained-model $pretrained_model} \ 72 | --train-json ${train_json} \ 73 | --valid-json ${valid_json} 74 | fi 75 | 76 | decode_dir=decode_track1 77 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 78 | echo "stage 2: Decoding" 79 | nj=10 80 | 81 | # split data 82 | dev_root=kdata/cv_all 83 | splitjson.py --parts ${nj} ${dev_root}/ar.json 84 | #### use CPU for decoding 85 | ngpu=0 86 | 87 | slurm.pl JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \ 88 | asr_recog.py \ 89 | --ngpu ${ngpu} \ 90 | --backend ${backend} \ 91 | --batchsize 0 \ 92 | --recog-json ${dev_root}/split${nj}utt/ar.JOB.json \ 93 | --result-label ${expdir}/${decode_dir}/ar.JOB.json \ 94 | --model ${expdir}/results/${recog_model} 95 | 96 | concatjson.py ${expdir}/${decode_dir}/ar.*.json > ${expdir}/${decode_dir}/ar.json 97 | echo "Decoding finished" 98 | fi 99 | 100 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 101 | echo "stage 3: Analyze decoding results" 102 | python ./local/tools/parse_track1_jsons.py ${expdir}/${decode_dir}/ar.json ${expdir}/${decode_dir}/result.txt 103 | 104 | fi 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AESRC2020 2 | 3 | 4 | #### 介绍 5 | 6 | Interspeech 2020 
口音英语识别挑战赛数据准备相关脚本、训练流程代码与基线实验结果。 7 | 8 | Data preparation scripts and training pipeline for the Interspeech 2020 Accented English Speech Recognition Challenge (AESRC). 9 | 10 | #### 依赖环境 11 | 12 | 1. 安装Kaldi (数据准备有关功能脚本、Track2传统模型训练) 13 | [Github链接](https://github.com/kaldi-asr/kaldi) 14 | 2. 安装ESPnet(Track1 E2E AR Model训练、Track2 E2E ASR Transformer训练) 15 | [Github链接](https://github.com/espnet/espnet) 16 | 3. (可选)安装Google SentencePiece (Track2 E2E ASR 词表缩减、建模单元构建) 17 | [Github链接](https://github.com/google/sentencepiece) 18 | 4. (可选)安装KenLM (N-gram语言模型训练) 19 | [Github链接](http://https://github.com/kpu/kenlm) 20 | 21 | #### 使用说明 22 | 23 | **数据准备 Data Preparation** 24 | 25 | 1. 下载评测数据 26 | 2. 准备数据,划分开发集,特征准备以及训练BPE模型 `./local/prepare_data.sh` 27 | 28 | **口音识别赛道 AR Track** 29 | 30 | 训练Track1 ESPnet AR模型 `./local/track1_espnet_transformer_train.sh` 31 | 32 | **语音识别赛道 ASR Track** 33 | 34 | 1. 训练Track2 Kaldi GMM对齐模型 `./local/track2_kaldi_gmm_train.sh` 35 | 2. 生成Lattice,决策树,训练Track2 Kaldi Chain Model `./local/track2_kaldi_chain_train.sh` 36 | 3. 训练Track2 ESPnet Transformer模型(Track2 ESPnet RNN语言模型) `./local/track2_espnet_transformer_train.sh` 37 | 38 | **注意** 39 | 1. 官方不提供Kaldi模型所需的英文的发音词典 40 | 2. 训练脚本中不包括数据扩充、添加Librispeech数据等,参赛者可按需添加 41 | 3. 正确安装并激活Kaldi与ESPnet的环境之后才能运行相关脚本 42 | 4. ASR Track中Baseline提供了多种数据的组合、Librispeech全量数据预训练等试验结果 43 | 5. 参赛者应严格按照评测中关于数据使用的相关规则训练模型,以确保结果的公平可比性 44 | 45 | #### 基线实验结果 46 | 47 | **Track1基线实验结果** 48 | 49 | | Model | RU | KR | US | PT | JPN | UK | CHN | IND | AVE | 50 | | -------- | -- |---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 51 | | Transformer-3L | 30.0 | 45.0 | 45.7 | 57.2 | 48.5 | 70.0 | 56.2 | 83.5 | 54.1 | 52 | | Transformer-6L | 34.0 | 43.7 | 30.6 | 65.7 | 44.0 | 74.5 | 50.9 | 75.2 | 52.2 | 53 | | Transformer-12L | 49.6 | 26.0 | 21.2 | 51.8 | 42.7 | 85.0 | 38.2 | 66.1 | 47.8 | 54 | | + ASR-init | 75.7 | 55.6 | 60.2 | 85.5 | 73.2 | 93.9 | 67.0 | 97.0 | 76.1 | 55 | 56 | Transformer-3L、Transformer-6L、Transformer-12L均使用`./local/track1_espnet_transformer_train.sh`训练(elayers分别为3、6、12),ASR-init实验使用Track2中Joint CTC/Attention模型进行初始化 57 | 58 | *在cv集的结果上发现了某个语种的acc与说话人强相关的现象,由于cv集说话人较少,所以上述结果的绝对数值并不具备统计意义,测试集将包含更多的说话人 59 | 60 | **Track2基线实验结果** 61 | 62 | Kaldi Hybrid Chain Model: CNN + 18 TDNN 63 | *基于内部的非开源英文发音词典 64 | *随后会公布基于CMU词典的结果 65 | 66 | ESPnet Transformer Model: 12 Encoder + 6 Decoder (simple self-attention, CTC joint training used, 1k sub-word BPE) 67 | 68 | 详细超参数见`./local/files/conf/`目录中模型配置与相关脚本中的设置 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 |
WER (%) on cv set:

| Data | Decode Related | RU | KR | US | PT | JPN | UK | CHN | IND | AVE |
| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| **Kaldi** | | | | | | | | | | |
| Accent160 | - | 6.67 | 11.46 | 15.95 | 10.27 | 9.78 | 16.88 | 20.97 | 17.48 | 13.68 |
| Libri960 ~ Accent160 | - | 6.61 | 10.95 | 15.33 | 9.79 | 9.75 | 16.03 | 19.68 | 16.93 | 13.13 |
| Accent160 + Libri160 | - | 6.95 | 11.76 | 13.05 | 9.96 | 10.15 | 14.21 | 20.76 | 18.26 | 13.14 |
| **ESPnet** | | | | | | | | | | |
| Accent160 | +0.3RNNLM | 5.26 | 7.69 | 9.96 | 7.45 | 6.79 | 10.06 | 11.77 | 10.05 | 8.63 |
| Libri960 ~ Accent160 | +0.3RNNLM | 4.6 | 6.4 | 7.42 | 5.9 | 5.71 | 7.64 | 9.87 | 7.85 | 6.92 |
| Accent160 + Libri160 | - | 5.35 | 9.07 | 8.52 | 7.13 | 7.29 | 8.6 | 12.03 | 9.05 | 8.38 |
| Accent160 + Libri160 | +0.3RNNLM | 4.68 | 7.59 | 7.7 | 6.42 | 6.37 | 7.76 | 10.88 | 8.41 | 7.48 |
| Accent160 + Libri160 | +0.3RNNLM+0.3CTC | 4.76 | 7.81 | 7.71 | 6.36 | 6.4 | 7.23 | 10.77 | 8.01 | 7.38 |
194 | * Data A ~ Data B指使用Data B fine-tune Data A训练的模型 195 | -------------------------------------------------------------------------------- /local/track2_espnet_transformer_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh || exit 1; 7 | . ./cmd.sh || exit 1; 8 | 9 | # general configuration 10 | backend=pytorch 11 | stage=1 # start from 0 if you need to start from data preparation 12 | stop_stage=1 13 | ngpu=4 # number of gpus ("0" uses cpu, otherwise use gpu) 14 | debugmode=1 15 | dumpdir=dump # directory to dump full features 16 | N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. 17 | verbose=0 # verbose option 18 | resume= # Resume the training from snapshot 19 | 20 | # feature configuration 21 | do_delta=false 22 | 23 | train_config=conf/espnet_train.conf 24 | lm_config=conf/espnet_lm.yaml 25 | decode_config=conf/espnet_decode.yaml 26 | preprocess_config=conf/espnet_specaug.yaml 27 | 28 | # rnnlm related 29 | lm_resume= # specify a snapshot file to resume LM training 30 | lmtag=0 # tag for managing LMs 31 | 32 | # decoding parameter 33 | recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best' 34 | n_average=5 35 | 36 | # exp tag 37 | tag="base" # tag for managing experiments. 38 | 39 | . utils/parse_options.sh || exit 1; 40 | 41 | # Set bash to 'debug' mode, it will exit on : 42 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 43 | set -e 44 | set -u 45 | set -o pipefail 46 | 47 | data=kdata 48 | exp=exp-espnet 49 | 50 | train_set=train 51 | train_dev=cv_all 52 | #recog_set="dev test" 53 | recog_set="cv/UK cv/US cv/CHN cv/JPN cv/KR cv/RU cv/IND cv/PT" 54 | 55 | 56 | lexi=$data/lang/lexicon.txt 57 | dict=$data/lang/units.txt 58 | echo "dictionary: ${dict}" 59 | 60 | # you can skip this and remove --rnnlm option in the recognition (stage 5) 61 | if [ -z ${lmtag} ]; then 62 | lmtag=$(basename ${lm_config%.*}) 63 | fi 64 | lmexpname=train_rnnlm_${backend}_${lmtag} 65 | lmexpdir=$exp/${lmexpname} 66 | mkdir -p ${lmexpdir} 67 | 68 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 69 | echo "stage 1: LM Preparation" 70 | lmdatadir=$data/local/lm_train 71 | mkdir -p ${lmdatadir} 72 | text2token.py -s 1 -n 1 $data/train/text | cut -f 2- -d" " \ 73 | > ${lmdatadir}/train.txt 74 | text2token.py -s 1 -n 1 $data/${train_dev}/text | cut -f 2- -d" " \ 75 | > ${lmdatadir}/valid.txt 76 | 77 | ${cuda_cmd} --gpu 1 ${lmexpdir}/train.log \ 78 | lm_train.py \ 79 | --config ${lm_config} \ 80 | --ngpu $ngpu \ 81 | --backend ${backend} \ 82 | --batchsize 1000 \ 83 | --verbose 1 \ 84 | --outdir ${lmexpdir} \ 85 | --tensorboard-dir tensorboard/${lmexpname} \ 86 | --train-label ${lmdatadir}/train.txt \ 87 | --valid-label ${lmdatadir}/valid.txt \ 88 | --resume ${lm_resume} \ 89 | --dict ${dict} 90 | fi 91 | 92 | if [ -z ${tag} ]; then 93 | expname=${train_set}_${backend}_$(basename ${train_config%.*}) 94 | if ${do_delta}; then 95 | expname=${expname}_delta 96 | fi 97 | else 98 | expname=${train_set}_${backend}_${tag} 99 | fi 100 | expdir=$exp/${expname} 101 | mkdir -p ${expdir} 102 | 103 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 104 | echo "stage 2: Network Training" 105 | ${cuda_cmd} --gpu $ngpu ${expdir}/train.log \ 106 | asr_train.py \ 107 | --config 
${train_config} \ 108 | --ngpu $ngpu \ 109 | --backend ${backend} \ 110 | --preprocess-conf $preprocess_config \ 111 | --outdir ${expdir}/results \ 112 | --tensorboard-dir tensorboard/${expname} \ 113 | --debugmode ${debugmode} \ 114 | --dict ${dict} \ 115 | --debugdir ${expdir} \ 116 | --minibatches ${N} \ 117 | --verbose ${verbose} \ 118 | --resume ${resume} \ 119 | --train-json $data/$train_set/asr.json \ 120 | --valid-json $data/$train_dev/asr.json \ 121 | --n-iter-processes $ngpu 122 | fi 123 | 124 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 125 | echo "stage 3: Decoding" 126 | nj=30 127 | if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]]; then 128 | recog_model=model.last${n_average}.avg.best 129 | average_checkpoints.py --backend ${backend} \ 130 | --snapshots ${expdir}/results/snapshot.ep.* \ 131 | --out ${expdir}/results/${recog_model} \ 132 | --num ${n_average} 133 | fi 134 | pids=() # initialize pids 135 | for rtask in ${recog_set}; do 136 | ( 137 | decode_dir=decode_${rtask}_$(basename ${decode_config%.*})_${lmtag} 138 | feat_recog_dir=$data/$rtask 139 | echo $feat_recog_dir 140 | # split data 141 | splitjson.py --parts ${nj} ${feat_recog_dir}/asr.json 142 | 143 | #### use CPU for decoding 144 | ngpu=0 145 | 146 | ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \ 147 | asr_recog.py \ 148 | --config ${decode_config} \ 149 | --ngpu ${ngpu} \ 150 | --backend ${backend} \ 151 | --batchsize 0 \ 152 | --recog-json ${feat_recog_dir}/split${nj}utt/asr.JOB.json \ 153 | --result-label ${expdir}/${decode_dir}/asr.JOB.json \ 154 | --model ${expdir}/results/${recog_model} 155 | 156 | score_sclite.sh ${expdir}/${decode_dir} ${dict} 157 | ) & 158 | pids+=($!) # store background pids 159 | done 160 | i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done 161 | [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false 162 | echo "Finished" 163 | fi 164 | -------------------------------------------------------------------------------- /local/tools/data2json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | echo "$0 $*" >&2 # Print the command line for logging 7 | . ./path.sh 8 | 9 | nj=20 10 | cmd="queue.pl -q all.q" 11 | nlsyms="" 12 | lang="" 13 | feat="" # feat.scp 14 | oov="" 15 | bpecode="" 16 | allow_one_column=false 17 | verbose=0 18 | trans_type=phn 19 | filetype="" 20 | preprocess_conf="" 21 | category="" 22 | out="" # If omitted, write in stdout 23 | 24 | text="" 25 | multilingual=false 26 | 27 | help_message=$(cat << EOF 28 | Usage: $0 29 | e.g. $0 data/train data/lang_1char/train_units.txt 30 | Options: 31 | --nj # number of parallel jobs 32 | --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. 33 | --feat # feat.scp or feat1.scp,feat2.scp,... 34 | --oov # Default: 35 | --out # If omitted, write in stdout 36 | --filetype # Specify the format of feats file 37 | --preprocess-conf # Apply preprocess to feats when creating shape.scp 38 | --verbose # Default: 0 39 | --text text_file # uttid to label of each utt 40 | EOF 41 | ) 42 | . 
utils/parse_options.sh 43 | 44 | if [ $# != 2 ]; then 45 | echo "${help_message}" 1>&2 46 | exit 1; 47 | fi 48 | 49 | set -euo pipefail 50 | 51 | dir=$1 52 | dic=$2 53 | tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) 54 | trap 'rm -rf ${tmpdir}' EXIT 55 | 56 | if [ -z ${text} ]; then 57 | text=${dir}/text 58 | fi 59 | 60 | # 1. Create scp files for inputs 61 | # These are not necessary for decoding mode, and make it as an option 62 | input= 63 | if [ -n "${feat}" ]; then 64 | _feat_scps=$(echo "${feat}" | tr ',' ' ' ) 65 | read -r -a feat_scps <<< $_feat_scps 66 | num_feats=${#feat_scps[@]} 67 | 68 | for (( i=1; i<=num_feats; i++ )); do 69 | feat=${feat_scps[$((i-1))]} 70 | mkdir -p ${tmpdir}/input_${i} 71 | input+="input_${i} " 72 | cat ${feat} > ${tmpdir}/input_${i}/feat.scp 73 | 74 | # Dump in the "legacy" style JSON format 75 | if [ -n "${filetype}" ]; then 76 | awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ 77 | > ${tmpdir}/input_${i}/filetype.scp 78 | fi 79 | 80 | feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ 81 | --filetype "${filetype}" \ 82 | --preprocess-conf "${preprocess_conf}" \ 83 | --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp 84 | done 85 | fi 86 | 87 | # 2. Create scp files for outputs 88 | mkdir -p ${tmpdir}/output 89 | if [ -n "${bpecode}" ]; then 90 | if [ ${multilingual} = true ]; then 91 | # remove a space before the language ID 92 | paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ 93 | | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \ 94 | > ${tmpdir}/output/token.scp 95 | else 96 | paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ 97 | | spm_encode --model=${bpecode} --output_format=piece) \ 98 | > ${tmpdir}/output/token.scp 99 | fi 100 | elif [ -n "${nlsyms}" ]; then 101 | text2token.py -s 1 -n 1 -l ${nlsyms} ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp 102 | else 103 | text2token.py -s 1 -n 1 ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp 104 | fi 105 | < ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp 106 | # +2 comes from CTC blank and EOS 107 | vocsize=$(tail -n 1 ${dic} | awk '{print $2}') 108 | odim=$(echo "$vocsize + 2" | bc) 109 | < ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp 110 | 111 | cat ${text} > ${tmpdir}/output/text.scp 112 | 113 | 114 | # 3. Create scp files for the others 115 | mkdir -p ${tmpdir}/other 116 | if [ ${multilingual} == true ]; then 117 | awk '{ 118 | n = split($1,S,"[-]"); 119 | lang=S[n]; 120 | print $1 " " lang 121 | }' ${text} > ${tmpdir}/other/lang.scp 122 | elif [ -n "${lang}" ]; then 123 | awk -v lang=${lang} '{print $1 " " lang}' ${text} > ${tmpdir}/other/lang.scp 124 | fi 125 | 126 | if [ -n "${category}" ]; then 127 | awk -v category=${category} '{print $1 " " category}' ${dir}/text \ 128 | > ${tmpdir}/other/category.scp 129 | fi 130 | cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp 131 | 132 | 133 | # 4. 
Merge scp files into a JSON file 134 | opts="" 135 | if [ -n "${feat}" ]; then 136 | intypes="${input} output other" 137 | else 138 | intypes="output other" 139 | fi 140 | for intype in ${intypes}; do 141 | if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then 142 | continue 143 | fi 144 | 145 | if [ ${intype} != other ]; then 146 | opts+="--${intype%_*}-scps " 147 | else 148 | opts+="--scps " 149 | fi 150 | 151 | for x in "${tmpdir}/${intype}"/*.scp; do 152 | k=$(basename ${x} .scp) 153 | if [ ${k} = shape ]; then 154 | opts+="shape:${x}:shape " 155 | else 156 | opts+="${k}:${x} " 157 | fi 158 | done 159 | done 160 | 161 | if ${allow_one_column}; then 162 | opts+="--allow-one-column true " 163 | else 164 | opts+="--allow-one-column false " 165 | fi 166 | 167 | if [ -n "${out}" ]; then 168 | opts+="-O ${out}" 169 | fi 170 | 171 | local/tools/merge_scp2json.py --verbose ${verbose} ${opts} 172 | 173 | rm -fr ${tmpdir} 174 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # AESRC2020 2 | 3 | 4 | #### Introduction 5 | 6 | Data preparation scripts and training pipeline for the Interspeech 2020 Accented English Speech Recognition Challenge (AESRC). 7 | 8 | #### Dependent Environment 9 | 10 | 1. Install Kaldi (Data preparation scripts, Track2 traditional ASR model training) 11 | [Github Link](https://github.com/kaldi-asr/kaldi) 12 | 2. Install ESPnet(Track1 E2E AR Model training, Track2 E2E ASR Transformer training) 13 | [Github Link](https://github.com/espnet/espnet) 14 | 3. (Optional) Install Google SentencePiece (Track2 E2E ASR modeling units building) 15 | [Github Link](https://github.com/google/sentencepiece) 16 | 4. (Optional) Install KenLM (N-gram language model training) 17 | [Github Link](http://https://github.com/kpu/kenlm) 18 | 19 | #### Usage 20 | 21 | **Data Preparation** 22 | 23 | 1. Download challenge data 24 | 2. Data preparation, divide cv set, feature extraction and bpe model training `./local/prepare_data.sh` 25 | 26 | **AR Track** 27 | 28 | Train Track1 ESPnet AR model `./local/track1_espnet_transformer_train.sh` 29 | 30 | **ASR Track** 31 | 32 | 1. Train Track2 Kaldi GMM alignment model `./local/track2_kaldi_gmm_train.sh` 33 | 2. Generate Lattice, decision tree, Train Track2 Kaldi Chain Model `./local/track2_kaldi_chain_train.sh` 34 | 3. Train Track2 ESPnet Transformer Model (Track2 ESPnet RNN Language Model) `./local/track2_espnet_transformer_train.sh` 35 | 36 | **Notice** 37 | 1. There's no lexicon provided, please prepare it by yourself. 38 | 2. Data augment methods are not included in scirpts. 39 | 3. Install Kaldi and ESPnet and activate their envrionment then you can run the scripts. 40 | 4. Baseline experiments in Track2 include several data using methods. 41 | 5. Participants should obey the rules about data strictly. 
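Putting the usage steps above together, a minimal end-to-end run might look like the sketch below (assuming the unzipped challenge audio sits under `raw_data/` and `kdata/` is used as the Kaldi-format data directory, which is the default in the training scripts):

```
./local/prepare_data.sh raw_data kdata
./local/track1_espnet_transformer_train.sh      # AR track
./local/track2_kaldi_gmm_train.sh               # ASR track: GMM alignment model
./local/track2_kaldi_chain_train.sh             # ASR track: chain model
./local/track2_espnet_transformer_train.sh      # ASR track: E2E transformer + RNNLM
```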
42 | 43 | #### Baseline Experiments Results 44 | 45 | **Track1** 46 | 47 | | Model | RU | KR | US | PT | JPN | UK | CHN | IND | AVE | 48 | | -------- | -- |---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 49 | | Transformer-3L | 30.0 | 45.0 | 45.7 | 57.2 | 48.5 | 70.0 | 56.2 | 83.5 | 54.1 | 50 | | Transformer-6L | 34.0 | 43.7 | 30.6 | 65.7 | 44.0 | 74.5 | 50.9 | 75.2 | 52.2 | 51 | | Transformer-12L | 49.6 | 26.0 | 21.2 | 51.8 | 42.7 | 85.0 | 38.2 | 66.1 | 47.8 | 52 | | + ASR-init | 75.7 | 55.6 | 60.2 | 85.5 | 73.2 | 93.9 | 67.0 | 97.0 | 76.1 | 53 | 54 | Transformer-3L, Transformer-6L, Transformer-12L all use`./local/track1_espnet_transformer_train.sh` (elayers: 3, 6, 12) 55 | 56 | ASR-init uses encoder in Track2 to initialize self-attention parameters 57 | 58 | *In cv sets, we found that the acc of some accent is strongly related with speaker. As there are few speakers in cv sets, the absolute value above is not statistically significant, and the test set will contain more speakers 59 | 60 | **Track2** 61 | 62 | Kaldi Hybrid Chain Model: CNN + 18 TDNN 63 | *Based on internal non open source dictionary 64 | *Results on CMU dict comes up soon 65 | 66 | ESPnet Transformer Model: 12 Encoder + 6 Decoder (simple self-attention, CTC joint training used, 1k sub-word BPE) 67 | 68 | You can find detailed hyperparameters settings in `./local/files/conf/` and training scripts 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 |
WER (%) on cv set:

| Data | Decode Related | RU | KR | US | PT | JPN | UK | CHN | IND | AVE |
| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| **Kaldi** | | | | | | | | | | |
| Accent160 | - | 6.67 | 11.46 | 15.95 | 10.27 | 9.78 | 16.88 | 20.97 | 17.48 | 13.68 |
| Libri960 ~ Accent160 | - | 6.61 | 10.95 | 15.33 | 9.79 | 9.75 | 16.03 | 19.68 | 16.93 | 13.13 |
| Accent160 + Libri160 | - | 6.95 | 11.76 | 13.05 | 9.96 | 10.15 | 14.21 | 20.76 | 18.26 | 13.14 |
| **ESPnet** | | | | | | | | | | |
| Accent160 | +0.3RNNLM | 5.26 | 7.69 | 9.96 | 7.45 | 6.79 | 10.06 | 11.77 | 10.05 | 8.63 |
| Libri960 ~ Accent160 | +0.3RNNLM | 4.6 | 6.4 | 7.42 | 5.9 | 5.71 | 7.64 | 9.87 | 7.85 | 6.92 |
| Accent160 + Libri160 | - | 5.35 | 9.07 | 8.52 | 7.13 | 7.29 | 8.6 | 12.03 | 9.05 | 8.38 |
| Accent160 + Libri160 | +0.3RNNLM | 4.68 | 7.59 | 7.7 | 6.42 | 6.37 | 7.76 | 10.88 | 8.41 | 7.48 |
| Accent160 + Libri160 | +0.3RNNLM+0.3CTC | 4.76 | 7.81 | 7.71 | 6.36 | 6.4 | 7.23 | 10.77 | 8.01 | 7.38 |
194 | * Data A ~ Data B means fine-tune Data A model with Data B 195 | -------------------------------------------------------------------------------- /local/prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | raw_data=$1 # raw data with metadata, txt and wav 7 | data=$2 # data transformed into kaldi format 8 | zipped_data=$raw_data/AESRC2020.zip 9 | 10 | stage=2 11 | feature_cmd="run.pl" 12 | nj=50 13 | 14 | vocab_size=1000 15 | 16 | 17 | # unzip and rename each accent 18 | if [ $stage -le 1 ];then 19 | # unzip $zipped_data 20 | mv $raw_data/American\ English\ Speech\ Data $raw_data/US 21 | mv $raw_data/British\ English\ Speech\ Data $raw_data/UK 22 | mv $raw_data/Chinese\ Speaking\ English\ Speech\ Data $raw_data/CHN 23 | mv $raw_data/Indian\ English\ Speech\ Data $raw_data/IND 24 | mv $raw_data/Portuguese\ Speaking\ English\ Speech\ Data $raw_data/PT 25 | mv $raw_data/Russian\ Speaking\ English\ Speech\ Data $raw_data/RU 26 | mv $raw_data/Japanese\ Speaking\ English\ Speech\ Data $raw_data/JPN 27 | mv $raw_data/Korean\ Speaking\ English\ Speech\ Data $raw_data/KR 28 | fi 29 | 30 | 31 | # generate kaldi format data for all 32 | if [ $stage -le 2 ];then 33 | echo "Generating kaldi format data." 34 | mkdir -p $data/data_all 35 | find `pwd`/ -name '*.wav' > $data/data_all/wavpath 36 | awk -F'/' '{print $(NF-2)"-"$(NF-1)"-"$NF}' $data/data_all/wavpath | sed 's:\.wav::g' > $data/data_all/uttlist 37 | paste $data/data_all/uttlist $data/data_all/wavpath > $data/data_all/wav.scp 38 | python local/tools/preprocess.py $data/data_all/wav.scp $data/data_all/trans $data/data_all/utt2spk # faster than for in shell 39 | ./utils/utt2spk_to_spk2utt.pl $data/data_all/utt2spk > $data/data_all/spk2utt 40 | fi 41 | 42 | 43 | # clean transcription 44 | if [ $stage -le 3 ];then 45 | echo "Cleaning transcription." 46 | tr '[a-z]' '[A-Z]' < $data/data_all/trans > $data/data_all/trans_upper 47 | # turn "." 
in specific abbreviations into "" tag 48 | sed -i -e 's: MR\.: MR:g' -e 's: MRS\.: MRS:g' -e 's: MS\.: MS:g' \ 49 | -e 's:^MR\.:MR:g' -e 's:^MRS\.:MRS:g' -e 's:^MS\.:MS:g' $data/data_all/trans_upper 50 | # fix bug 51 | sed -i 's:^ST\.:STREET:g' $data/data_all/trans_upper 52 | sed -i 's: ST\.: STREET:g' $data/data_all/trans_upper 53 | # punctuation marks 54 | sed -i "s%,\|\.\|?\|!\|;\|-\|:\|,'\|\.'\|?'\|!'\| '% %g" $data/data_all/trans_upper 55 | sed -i 's::.:g' $data/data_all/trans_upper 56 | # blank 57 | sed -i 's:[ ][ ]*: :g' $data/data_all/trans_upper 58 | paste $data/data_all/uttlist $data/data_all/trans_upper > $data/data_all/text 59 | fi 60 | 61 | exit 1; 62 | # extracting filter-bank features and cmvn 63 | if [ $stage -le 4 ];then 64 | ./utils/fix_data_dir.sh $data/data_all 65 | ./steps/make_fbank.sh --cmd $feature_cmd --nj $nj --fbank-config conf/fbank.conf $data/data_all $data/feats/log $data/feats/ark 66 | ./steps/compute_cmvn_stats.sh $data/data_all $data/feats/log $data/feats/ark # for kaldi 67 | fi 68 | 69 | exit 1; 70 | # divide development set for cross validation 71 | if [ $stage -le 5 ];then 72 | for i in US UK IND CHN JPN PT RU KR;do 73 | ./utils/subset_data_dir.sh --spk-list local/files/cvlist/${i}_cv_spk $data/data_all $data/cv/$i 74 | cat $data/cv/$i/feats.scp >> $data/cv.scp 75 | done 76 | ./utils/filter_scp.pl --exclude $data/cv.scp $data/data_all/feats.scp > $data/train.scp 77 | ./utils/subset_data_dir.sh --utt-list $data/train.scp $data/data_all $data/train 78 | ./utils/subset_data_dir.sh --utt-list $data/cv.scp $data/data_all $data/cv_all 79 | compute-cmvn-stats scp:$data/train/feats.scp `pwd`/$data/train/dump_cmvn.ark # for espnet 80 | rm $data/cv.scp $data/train.scp 81 | fi 82 | 83 | 84 | # generate label file and dump features for track2:E2E 85 | if [ $stage -le 6 ];then 86 | for i in US UK IND CHN JPN PT RU KR;do 87 | local/tools/dump.sh --cmd $feature_cmd --nj 3 --do_delta false \ 88 | $data/cv/$i/feats.scp $data/train/dump_cmvn.ark $data/cv/$i/dump/log $data/cv/$i/dump # for track2 e2e testing 89 | done 90 | local/tools/dump.sh --cmd $feature_cmd --nj $nj --do_delta false \ 91 | $data/train/feats.scp $data/train/dump_cmvn.ark $data/train/dump/log $data/train/dump # for track2 e2e training 92 | # for track1, utterance-level CMVN is applied 93 | for data_set in train cv_all; do 94 | set_dir=$data/$data_set 95 | # hack to set utterance-level spk2utt & utt2spk 96 | awk '{printf "%s %s\n", $1, $1 }' $set_dir/text > $set_dir/spk2utt.utt 97 | cp $set_dir/spk2utt.utt $set_dir/utt2spk.utt 98 | compute-cmvn-stats --spk2utt=ark:$set_dir/spk2utt.utt scp:$set_dir/feats.scp \ 99 | ark,scp:`pwd`/$set_dir/cmvn_utt.ark,$set_dir/cmvn_utt.scp 100 | local/tools/dump_spk_yzl23.sh --cmd slurm.pl --nj 48 \ 101 | $set_dir/feats.scp $set_dir/cmvn_utt.scp \ 102 | exp/dump_feats/$data_set $set_dir/dump_utt $set_dir/utt2spk.utt 103 | done 104 | fi 105 | 106 | 107 | # generate label file for track1 108 | if [ $stage -le 7 ];then 109 | for i in train cv_all;do 110 | cut -f 1 $data/$i/text > $data/$i/uttlist 111 | cut -d '-' -f 1 $data/$i/text | sed -e "s:^:<:g" -e "s:$:>:g" > $data/$i/accentlist 112 | paste $data/$i/uttlist $data/$i/accentlist > $data/$i/utt2accent 113 | rm $data/$i/uttlist 114 | local/tools/data2json.sh --nj 20 --feat $data/$i/dump_utt/feats.scp --text $data/$i/utt2accent --oov 8 $data/$i local/files/ar.dict > $data/$i/ar.json 115 | done 116 | fi 117 | 118 | 119 | # generate label file for track2 e2e 120 | if [ $stage -le 8 ];then 121 | # goolgle sentence piece toolkit 
is used to train a bpe model and decode 122 | mkdir -p $data/bpe 123 | mkdir -p $data/lang 124 | # male sure you have installed sentencepiece successfully 125 | spm_train \ 126 | --input=$data/train/trans_upper \ 127 | --model_prefix=$data/bpe/bpe_${vocab_size} \ 128 | --vocab_size=$vocab_size \ 129 | --character_coverage=1.0 \ 130 | --model_type=unigram 131 | python local/tools/word_frequency.py $data/train/trans_upper 0 $data/bpe/train 132 | cut -d ' ' -f 1 $data/bpe/train.enwf | awk '{if(NF==1)print $0}' > $data/bpe/wordlist.txt 133 | spm_encode \ 134 | --model=$data/bpe/bpe_${vocab_size}.model \ 135 | --output_format=piece < $data/bpe/wordlist.txt > $data/bpe/bpelist.txt 136 | paste $data/bpe/wordlist.txt $data/bpe/bpelist.txt > $data/lang/lexicon.txt 137 | sed -i 's:▁ :▁:g' $data/lang/lexicon.txt 138 | python local/tools/apply_lexicon.py $data/lang/lexicon.txt $data/train/text $data/train/utt2tokens "" $data/train/.warning $data/lang/units.txt 139 | local/tools/data2json.sh --nj 20 --feat $data/train/dump/feats.scp --text $data/train/utt2tokens --oov 0 $data/train $data/lang/units.txt > $data/train/asr.json || exit 1; 140 | for i in US UK IND CHN JPN PT RU KR; do 141 | # units.txt generate form cv set aborted 142 | python local/tools/apply_lexicon.py $data/lang/lexicon.txt $data/cv/$i/text $data/cv/$i/utt2tokens "" $data/cv/$i/.warning $data/cv/${i}/.units.txt || exit 1; 143 | local/tools/data2json.sh --nj 20 --feat $data/cv/$i/dump/feats.scp --text $data/cv/$i/utt2tokens --oov 0 $data/cv/$i $data/lang/units.txt > $data/cv/$i/asr.json 144 | done 145 | 146 | fi 147 | 148 | echo "local/prepare_data.sh succeeded" 149 | exit 0; 150 | -------------------------------------------------------------------------------- /local/track2_kaldi_chain_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | set -e 7 | 8 | exp=exp-kaldi 9 | data=kdata 10 | # configs for 'chain' 11 | affix=all 12 | stage=1 13 | train_stage=-6 14 | get_egs_stage=0 15 | dir=$exp/chain/tdnn # Note: _sp will get added to this 16 | decode_iter= 17 | 18 | # training options 19 | num_epochs=5 20 | initial_effective_lrate=0.001 21 | final_effective_lrate=0.0001 22 | max_param_change=2.0 23 | final_layer_normalize_target=0.5 24 | num_jobs_initial=2 25 | num_jobs_final=2 26 | nj=50 27 | minibatch_size=128 28 | dropout_schedule='0,0@0.20,0.3@0.50,0' 29 | frames_per_eg=150,110,90 30 | remove_egs=false 31 | common_egs_dir= 32 | common_egs_dir= 33 | xent_regularize=0.1 34 | graph=$exp/chain/graph 35 | 36 | # End configuration section. 37 | echo "$0 $@" # Print the command line for logging 38 | 39 | . ./cmd.sh 40 | . ./path.sh 41 | . ./utils/parse_options.sh 42 | 43 | if ! cuda-compiled; then 44 | cat <$lang/topo 78 | fi 79 | 80 | if [ $stage -le 3 ]; then 81 | # Build a tree using our new topology. This is the critically different 82 | # step compared with other recipes. 
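# (Sketch of what this stage produces: the tree is built on the GMM alignments in $ali_dir with a
#  frame-subsampling factor of 3 and 11500 leaves; the resulting $treedir/tree supplies num-pdfs
#  for the xconfig network generated in the next stage.)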
83 | steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ 84 | --context-opts "--context-width=2 --central-position=1" \ 85 | --cmd "$train_cmd" 11500 $data/$train_set $lang $ali_dir $treedir 86 | fi 87 | 88 | if [ $stage -le 4 ]; then 89 | echo "$0: creating neural net configs using the xconfig parser"; 90 | 91 | num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') 92 | 93 | learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) 94 | output_opts="l2-regularize=0.015" 95 | tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" 96 | tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" 97 | prefinal_opts="l2-regularize=0.03" 98 | ivector_affine_opts="l2-regularize=0.005" 99 | cnn_opts="l2-regularize=0.005" 100 | linear_opts="orthonormal-constraint=1.0" 101 | echo "$feat_dim" 102 | mkdir -p $dir/configs 103 | cat < $dir/configs/network.xconfig 104 | input dim=71 name=input 105 | conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=71 height-out=71 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=32 106 | linear-component name=cnn2 dim=284 $linear_opts 107 | # the first splicing is moved before the lda layer, so no splicing here 108 | relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1280 109 | tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=1 110 | tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=1 111 | tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=1 112 | tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=0 113 | tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 114 | tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 115 | tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 116 | tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 117 | tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 118 | tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 119 | tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 120 | tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 121 | tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 122 | tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 123 | tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 124 | tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 125 | tdnnf-layer name=tdnnf18 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 126 | tdnnf-layer name=tdnnf19 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 127 | linear-component name=prefinal-l dim=512 $linear_opts 128 | 129 | ## adding the layers for chain branch 130 | prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=512 big-dim=1280 131 | output-layer name=output include-log-softmax=false dim=$num_targets $output_opts 132 | 133 | # adding the layers for xent branch 134 | prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=512 big-dim=1280 135 | output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts 136 | EOF 137 | steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ 138 | fi 139 | 140 | if [ $stage -le 5 ]; 
then 141 | steps/nnet3/chain/train.py --stage $train_stage \ 142 | --cmd "run.pl" \ 143 | --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ 144 | --chain.xent-regularize $xent_regularize \ 145 | --chain.leaky-hmm-coefficient 0.1 \ 146 | --chain.l2-regularize 0.00005 \ 147 | --chain.apply-deriv-weights false \ 148 | --chain.lm-opts="--num-extra-lm-states=2000" \ 149 | --egs.dir "$common_egs_dir" \ 150 | --egs.stage $get_egs_stage \ 151 | --egs.opts "--frames-overlap-per-eg 0" \ 152 | --egs.chunk-width $frames_per_eg \ 153 | --trainer.dropout-schedule $dropout_schedule \ 154 | --trainer.num-chunk-per-minibatch $minibatch_size \ 155 | --trainer.frames-per-iter 1500000 \ 156 | --trainer.num-epochs $num_epochs \ 157 | --trainer.optimization.num-jobs-initial $num_jobs_initial \ 158 | --trainer.optimization.num-jobs-final $num_jobs_final \ 159 | --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ 160 | --trainer.optimization.final-effective-lrate $final_effective_lrate \ 161 | --trainer.max-param-change $max_param_change \ 162 | --cleanup.remove-egs $remove_egs \ 163 | --feat-dir $data/${train_set} \ 164 | --tree-dir $treedir \ 165 | --lat-dir $exp/tri4_sp_lats \ 166 | --dir $dir || exit 1; 167 | fi 168 | 169 | 170 | if [ $stage -le 6 ]; then 171 | ./local/mkgraph.sh $lang $dir/final.mdl $graph 172 | fi 173 | 174 | if [ $stage -le 7 ]; then 175 | for test_set in $test_sets; do 176 | steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ 177 | --nj 20 --cmd "$decode_cmd" \ 178 | $graph $data/cv/${test_set} $dir/decode_${test_set} || exit 1; 179 | done 180 | fi 181 | 182 | echo "local/track2_kaldi_chain_train.sh succeeded" 183 | exit 0; 184 | -------------------------------------------------------------------------------- /module/track1_accent_transformer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | 4 | # Copyright 2020 SpeechLab @ SJTU (Author: Yizhou Lu) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """ Transformer-based accent recognition model (pytorch), 8 | Codes mainly borrowed from espnet (https://github.com/espnet/espnet) 9 | """ 10 | 11 | from argparse import Namespace 12 | from distutils.util import strtobool 13 | 14 | import logging 15 | import math 16 | 17 | import torch 18 | import chainer 19 | from chainer import reporter 20 | 21 | from espnet.nets.asr_interface import ASRInterface 22 | from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD 23 | from espnet.nets.pytorch_backend.nets_utils import make_pad_mask 24 | from espnet.nets.pytorch_backend.nets_utils import th_accuracy 25 | from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention 26 | from espnet.nets.pytorch_backend.transformer.encoder import Encoder 27 | from espnet.nets.pytorch_backend.transformer.initializer import initialize 28 | from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import LabelSmoothingLoss 29 | from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport 30 | 31 | class Reporter(chainer.Chain): 32 | """A chainer reporter wrapper.""" 33 | 34 | def report(self, acc, loss): 35 | """Report at every step.""" 36 | reporter.report({'acc': acc}, self) 37 | reporter.report({'loss': loss}, self) 38 | 39 | class E2E(ASRInterface, torch.nn.Module): 40 | """E2E module. 
41 | 42 | :param int idim: dimension of inputs 43 | :param int odim: dimension of outputs 44 | :param Namespace args: argument Namespace containing options 45 | 46 | """ 47 | 48 | @staticmethod 49 | def add_arguments(parser): 50 | """Add arguments.""" 51 | group = parser.add_argument_group("transformer model setting") 52 | 53 | group.add_argument("--transformer-init", type=str, default="pytorch", 54 | choices=["pytorch", "xavier_uniform", "xavier_normal", 55 | "kaiming_uniform", "kaiming_normal"], 56 | help='how to initialize transformer parameters') 57 | group.add_argument("--transformer-input-layer", type=str, default="conv2d", 58 | choices=["conv2d", "linear", "embed"], 59 | help='transformer input layer type') 60 | group.add_argument('--transformer-attn-dropout-rate', default=None, type=float, 61 | help='dropout in transformer attention. use --dropout-rate if None is set') 62 | group.add_argument('--transformer-lr', default=10.0, type=float, 63 | help='Initial value of learning rate') 64 | group.add_argument('--transformer-warmup-steps', default=25000, type=int, 65 | help='optimizer warmup steps') 66 | group.add_argument('--transformer-length-normalized-loss', default=True, type=strtobool, 67 | help='normalize loss by length') 68 | 69 | group.add_argument('--dropout-rate', default=0.0, type=float, 70 | help='Dropout rate for the encoder') 71 | # Encoder 72 | group.add_argument('--elayers', default=4, type=int, 73 | help='Number of encoder layers (for shared recognition part in multi-speaker asr mode)') 74 | group.add_argument('--eunits', '-u', default=300, type=int, 75 | help='Number of encoder hidden units') 76 | # Attention 77 | group.add_argument('--adim', default=320, type=int, 78 | help='Number of attention transformation dimensions') 79 | group.add_argument('--aheads', default=4, type=int, 80 | help='Number of heads for multi head attention') 81 | group.add_argument('--pretrained-model', default="", type=str, 82 | help='pretrained ASR model for initialization') 83 | return parser 84 | 85 | @property 86 | def attention_plot_class(self): 87 | """Return PlotAttentionReport.""" 88 | return PlotAttentionReport 89 | 90 | def __init__(self, idim, odim, args, ignore_id=-1): 91 | """Construct an E2E object. 
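For orientation, here is a hypothetical sketch of how the options registered in add_arguments above, together with ESPnet's common training options, reach the constructor below. Every value is a placeholder; in the real recipe ESPnet's training entry point fills args from the YAML config and takes idim/odim from the data json.

# All values below are illustrative placeholders, not the recipe's settings.
from argparse import Namespace
from module.track1_accent_transformer import E2E   # requires espnet to be installed

args = Namespace(
    transformer_init="pytorch",
    transformer_input_layer="conv2d",
    transformer_attn_dropout_rate=None,       # None -> falls back to dropout_rate in __init__
    transformer_length_normalized_loss=True,
    dropout_rate=0.1,
    elayers=12, eunits=2048, adim=256, aheads=4,
    lsm_weight=0.1,                           # label-smoothing weight, supplied by espnet's common options
    pretrained_model="",                      # optionally a trained track2 ASR snapshot used for initialization
)
model = E2E(idim=80, odim=9, args=args)       # odim here counts the extra dimension added by data2json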
92 | 93 | :param int idim: dimension of inputs 94 | :param int odim: dimension of outputs 95 | :param Namespace args: argument Namespace containing options 96 | """ 97 | torch.nn.Module.__init__(self) 98 | if args.transformer_attn_dropout_rate is None: 99 | args.transformer_attn_dropout_rate = args.dropout_rate 100 | self.encoder = Encoder( 101 | idim=idim, 102 | attention_dim=args.adim, 103 | attention_heads=args.aheads, 104 | linear_units=args.eunits, 105 | num_blocks=args.elayers, 106 | input_layer=args.transformer_input_layer, 107 | dropout_rate=args.dropout_rate, 108 | positional_dropout_rate=args.dropout_rate, 109 | attention_dropout_rate=args.transformer_attn_dropout_rate 110 | ) 111 | odim = odim - 1 # ignore additional dim added by data2json 112 | self.odim = odim 113 | self.ignore_id = ignore_id 114 | self.subsample = [1] 115 | self.reporter = Reporter() 116 | self.criterion = LabelSmoothingLoss(self.odim, self.ignore_id, args.lsm_weight, 117 | args.transformer_length_normalized_loss) 118 | self.output = torch.nn.Linear(2 * args.adim, self.odim) # mean + std pooling 119 | # reset parameters 120 | self.reset_parameters(args) 121 | logging.warning(self) 122 | 123 | def reset_parameters(self, args): 124 | """Initialize parameters.""" 125 | # initialize parameters 126 | if args.pretrained_model: 127 | path = args.pretrained_model 128 | logging.warning("load pretrained asr model from {}".format(path)) 129 | if 'snapshot' in path: 130 | model_state_dict = torch.load(path, map_location=lambda storage, loc: storage)['model'] 131 | else: 132 | model_state_dict = torch.load(path, map_location=lambda storage, loc: storage) 133 | self.load_state_dict(model_state_dict, strict=False) 134 | del model_state_dict 135 | else: 136 | initialize(self, args.transformer_init) 137 | 138 | def forward(self, xs_pad, ilens, ys_pad): 139 | """E2E forward. 140 | 141 | :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim) 142 | :param torch.Tensor ilens: batch of lengths of source sequences (B) 143 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 144 | :return: label smoothing loss value 145 | :rtype: torch.Tensor 146 | """ 147 | # forward encoder 148 | xs_pad = xs_pad[:, :max(ilens)] # for data parallel 149 | src_mask = (~make_pad_mask(ilens.tolist())).to(xs_pad.device).unsqueeze(-2) 150 | hs_pad, hs_mask = self.encoder(xs_pad, src_mask) 151 | mean = torch.mean(hs_pad, dim=1).unsqueeze(1) 152 | std = torch.std(hs_pad, dim=1).unsqueeze(1) 153 | hs_pad = torch.cat((mean, std), dim=-1) # (B, 1, D) 154 | # output layer 155 | pred_pad = self.output(hs_pad) 156 | 157 | # compute loss 158 | self.loss = self.criterion(pred_pad, ys_pad) 159 | self.acc = th_accuracy(pred_pad.view(-1, self.odim), ys_pad, 160 | ignore_label=self.ignore_id) 161 | 162 | loss_data = float(self.loss) 163 | if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data): 164 | self.reporter.report(self.acc, loss_data) 165 | else: 166 | logging.warning('loss (=%f) is not correct', loss_data) 167 | return self.loss 168 | 169 | def encode(self, x): 170 | """Encode acoustic features. 
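The forward pass above collapses the variable-length encoder output into a fixed-size utterance vector by statistics pooling: the per-dimension mean and standard deviation over time are concatenated and fed to a single linear layer that yields accent posteriors. Below is a self-contained re-implementation of just that pooling-and-classify step, with plain cross-entropy standing in for the label-smoothing loss used here and all sizes made up.

import torch
import torch.nn.functional as F

def stats_pooling_logits(hs, proj):
    """hs: (B, T, D) encoder outputs; proj: Linear(2*D, n_classes)."""
    mean = hs.mean(dim=1)                        # (B, D)
    std = hs.std(dim=1)                          # (B, D)
    # the model above keeps a singleton time axis for its label-smoothing loss; dropped here for brevity
    return proj(torch.cat((mean, std), dim=-1))  # (B, n_classes)

B, T, D, n_accents = 4, 120, 256, 8              # illustrative sizes only
proj = torch.nn.Linear(2 * D, n_accents)
hs = torch.randn(B, T, D)
labels = torch.randint(0, n_accents, (B,))
loss = F.cross_entropy(stats_pooling_logits(hs, proj), labels)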
171 | 172 | :param ndarray x: source acoustic feature (T, D) 173 | :return: encoder outputs 174 | :rtype: torch.Tensor 175 | """ 176 | self.eval() 177 | x = torch.as_tensor(x).unsqueeze(0) # (B, T, D) with #B=1 178 | enc_output, _ = self.encoder(x, None) 179 | return enc_output.squeeze(0) # returns tensor(T, D) 180 | 181 | # todo: batch decoding 182 | def recognize(self, x, recog_args, char_list=None, rnnlm=None, use_jit=False): 183 | """Recognize input speech. 184 | 185 | """ 186 | enc_output = self.encode(x).unsqueeze(0) # (1, T, D) 187 | mean = torch.mean(enc_output, dim=1).unsqueeze(1) # (1, 1, D) 188 | std = torch.std(enc_output, dim=1).unsqueeze(1) 189 | enc_output = torch.cat((mean, std), dim=-1) 190 | lpz = self.output(enc_output) 191 | lpz = lpz.squeeze(0) # shape of (T, D) 192 | idx = lpz.argmax(-1).cpu().numpy().tolist() 193 | hyp = {} 194 | # [-1] is added here to be compatible with ASR decoding, see espnet/asr/asr_utils/parse_hypothesis 195 | hyp['yseq'] = [-1] + idx 196 | hyp['score'] = -1 197 | logging.info(hyp['yseq']) 198 | return [hyp] 199 | 200 | def calculate_all_attentions(self, xs_pad, ilens, ys_pad): 201 | """E2E attention calculation. 202 | 203 | :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim) 204 | :param torch.Tensor ilens: batch of lengths of input sequences (B) 205 | :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax) 206 | :return: attention weights with the following shape, 207 | 1) multi-head case => attention weights (B, H, Lmax, Tmax), 208 | 2) other case => attention weights (B, Lmax, Tmax). 209 | :rtype: float ndarray 210 | """ 211 | with torch.no_grad(): 212 | self.forward(xs_pad, ilens, ys_pad) 213 | ret = dict() 214 | for name, m in self.named_modules(): 215 | if isinstance(m, MultiHeadedAttention): 216 | ret[name] = m.attn.cpu().numpy() 217 | return ret 218 | 219 | # fix calculate_all_ctc_probs method not implemented bug 220 | def calculate_all_ctc_probs(self, xs_pad, ilens, ys_pad): 221 | return None 222 | 223 | -------------------------------------------------------------------------------- /local/tools/merge_scp2json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import argparse 8 | import codecs 9 | from distutils.util import strtobool 10 | from io import open 11 | import json 12 | import logging 13 | import sys 14 | 15 | from espnet.utils.cli_utils import get_commandline_args 16 | 17 | PY2 = sys.version_info[0] == 2 18 | sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else sys.stdin.buffer) 19 | sys.stdout = codecs.getwriter('utf-8')( 20 | sys.stdout if PY2 else sys.stdout.buffer) 21 | 22 | 23 | # Special types: 24 | def shape(x): 25 | """Change str to List[int] 26 | 27 | >>> shape('3,5') 28 | [3, 5] 29 | >>> shape(' [3, 5] ') 30 | [3, 5] 31 | 32 | """ 33 | 34 | # x: ' [3, 5] ' -> '3, 5' 35 | x = x.strip() 36 | if x[0] == '[': 37 | x = x[1:] 38 | if x[-1] == ']': 39 | x = x[:-1] 40 | 41 | return list(map(int, x.split(','))) 42 | 43 | 44 | def get_parser(): 45 | parser = argparse.ArgumentParser( 46 | description='Given each file paths with such format as ' 47 | '::. type> can be omitted and the default ' 48 | 'is "str". e.g. 
{} ' 49 | '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' 50 | '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' 51 | '--output-scps text:data/text shape:data/utt2text_shape:shape ' 52 | '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), 53 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 54 | parser.add_argument('--input-scps', type=str, nargs='*', action='append', 55 | default=[], help='Json files for the inputs') 56 | parser.add_argument('--output-scps', type=str, nargs='*', action='append', 57 | default=[], help='Json files for the outputs') 58 | parser.add_argument('--scps', type=str, nargs='+', default=[], 59 | help='The json files except for the input and outputs') 60 | parser.add_argument('--verbose', '-V', default=1, type=int, 61 | help='Verbose option') 62 | parser.add_argument('--allow-one-column', type=strtobool, default=False, 63 | help='Allow one column in input scp files. ' 64 | 'In this case, the value will be empty string.') 65 | parser.add_argument('--out', '-O', type=str, 66 | help='The output filename. ' 67 | 'If omitted, then output to sys.stdout') 68 | return parser 69 | 70 | 71 | if __name__ == '__main__': 72 | parser = get_parser() 73 | args = parser.parse_args() 74 | args.scps = [args.scps] 75 | 76 | # logging info 77 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 78 | if args.verbose > 0: 79 | logging.basicConfig(level=logging.INFO, format=logfmt) 80 | else: 81 | logging.basicConfig(level=logging.WARN, format=logfmt) 82 | logging.info(get_commandline_args()) 83 | 84 | # List[List[Tuple[str, str, Callable[[str], Any], str, str]]] 85 | input_infos = [] 86 | output_infos = [] 87 | infos = [] 88 | for lis_list, key_scps_list in [(input_infos, args.input_scps), 89 | (output_infos, args.output_scps), 90 | (infos, args.scps)]: 91 | for key_scps in key_scps_list: 92 | lis = [] 93 | for key_scp in key_scps: 94 | sps = key_scp.split(':') 95 | if len(sps) == 2: 96 | key, scp = sps 97 | type_func = None 98 | type_func_str = 'none' 99 | elif len(sps) == 3: 100 | key, scp, type_func_str = sps 101 | fail = False 102 | 103 | try: 104 | # type_func: Callable[[str], Any] 105 | # e.g. type_func_str = "int" -> type_func = int 106 | type_func = eval(type_func_str) 107 | except Exception: 108 | raise RuntimeError( 109 | 'Unknown type: {}'.format(type_func_str)) 110 | 111 | if not callable(type_func): 112 | raise RuntimeError( 113 | 'Unknown type: {}'.format(type_func_str)) 114 | 115 | else: 116 | raise RuntimeError( 117 | 'Format : ' 118 | 'or :: ' 119 | 'e.g. feat:data/feat.scp ' 120 | 'or shape:data/feat.scp:shape: {}'.format(key_scp)) 121 | 122 | for item in lis: 123 | if key == item[0]: 124 | raise RuntimeError('The key "{}" is duplicated: {} {}' 125 | .format(key, item[3], key_scp)) 126 | 127 | lis.append((key, scp, type_func, key_scp, type_func_str)) 128 | lis_list.append(lis) 129 | 130 | # Open scp files 131 | input_fscps = [[open(i[1], 'r', encoding='utf-8') 132 | for i in il] for il in input_infos] 133 | output_fscps = [[open(i[1], 'r', encoding='utf-8') for i in il] 134 | for il in output_infos] 135 | fscps = [[open(i[1], 'r', encoding='utf-8') for i in il] for il in infos] 136 | 137 | # Note(kamo): What is done here? 138 | # The final goal is creating a JSON file such as. 139 | # { 140 | # "utts": { 141 | # "sample_id1": {(omitted)}, 142 | # "sample_id2": {(omitted)}, 143 | # .... 
144 | # } 145 | # } 146 | # 147 | # To reduce memory usage, reading the input text files for each lines 148 | # and writing JSON elements per samples. 149 | if args.out is None: 150 | out = sys.stdout 151 | else: 152 | out = open(args.out, 'w', encoding='utf-8') 153 | out.write('{\n "utts": {\n') 154 | nutt = 0 155 | while True: 156 | nutt += 1 157 | # List[List[str]] 158 | input_lines = [[f.readline() for f in fl] for fl in input_fscps] 159 | output_lines = [[f.readline() for f in fl] for fl in output_fscps] 160 | lines = [[f.readline() for f in fl] for fl in fscps] 161 | 162 | # Get the first line 163 | concat = sum(input_lines + output_lines + lines, []) 164 | if len(concat) == 0: 165 | break 166 | first = concat[0] 167 | 168 | # Sanity check: Must be sorted by the first column and have same keys 169 | count = 0 170 | for ls_list in (input_lines, output_lines, lines): 171 | for ls in ls_list: 172 | for line in ls: 173 | if line == '' or first == '': 174 | if line != first: 175 | concat = sum( 176 | input_infos + output_infos + infos, []) 177 | raise RuntimeError( 178 | 'The number of lines mismatch ' 179 | 'between: "{}" and "{}"' 180 | .format(concat[0][1], concat[count][1])) 181 | 182 | elif line.split()[0] != first.split()[0]: 183 | concat = sum(input_infos + output_infos + infos, []) 184 | raise RuntimeError( 185 | 'The keys are mismatch at {}th line ' 186 | 'between "{}" and "{}":\n>>> {}\n>>> {}' 187 | .format(nutt, concat[0][1], concat[count][1], 188 | first.rstrip(), line.rstrip())) 189 | count += 1 190 | 191 | # The end of file 192 | if first == '': 193 | if nutt != 1: 194 | out.write('\n') 195 | break 196 | if nutt != 1: 197 | out.write(',\n') 198 | 199 | entry = {} 200 | for inout, _lines, _infos in [('input', input_lines, input_infos), 201 | ('output', output_lines, output_infos), 202 | ('other', lines, infos)]: 203 | 204 | lis = [] 205 | for idx, (line_list, info_list) \ 206 | in enumerate(zip(_lines, _infos), 1): 207 | if inout == 'input': 208 | d = {'name': 'input{}'.format(idx)} 209 | elif inout == 'output': 210 | d = {'name': 'target{}'.format(idx)} 211 | else: 212 | d = {} 213 | 214 | # info_list: List[Tuple[str, str, Callable]] 215 | # line_list: List[str] 216 | for line, info in zip(line_list, info_list): 217 | sps = line.split(None, 1) 218 | if len(sps) < 2: 219 | if not args.allow_one_column: 220 | raise RuntimeError( 221 | 'Format error {}th line in {}: ' 222 | ' Expecting " ":\n>>> {}' 223 | .format(nutt, info[1], line)) 224 | uttid = sps[0] 225 | value = '' 226 | else: 227 | uttid, value = sps 228 | 229 | key = info[0] 230 | type_func = info[2] 231 | value = value.rstrip() 232 | 233 | if type_func is not None: 234 | try: 235 | # type_func: Callable[[str], Any] 236 | value = type_func(value) 237 | except Exception: 238 | logging.error('"{}" is an invalid function ' 239 | 'for the {} th line in {}: \n>>> {}' 240 | .format(info[4], nutt, info[1], line)) 241 | raise 242 | 243 | d[key] = value 244 | lis.append(d) 245 | 246 | if inout != 'other': 247 | entry[inout] = lis 248 | else: 249 | # If key == 'other'. 
only has the first item 250 | entry.update(lis[0]) 251 | 252 | entry = json.dumps(entry, indent=None, ensure_ascii=False, 253 | sort_keys=True, separators=(',', ': ')) 254 | # Add indent 255 | indent = ' ' * 2 256 | entry = ('\n' + indent).join(entry.split('\n')) 257 | 258 | uttid = first.split()[0] 259 | out.write(' "{}": {}'.format(uttid, entry)) 260 | 261 | out.write(' }\n}\n') 262 | 263 | logging.info('{} entries in {}'.format(nutt, out.name)) 264 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /local/files/asr.dict: -------------------------------------------------------------------------------- 1 | 1 2 | " 2 3 | ' 3 4 | . 
4 5 | A 5 6 | ABLE 6 7 | AC 7 8 | AD 8 9 | AGE 9 10 | AK 10 11 | AL 11 12 | AM 12 13 | AN 13 14 | ANCE 14 15 | ANT 15 16 | AR 16 17 | ARD 17 18 | ARY 18 19 | AS 19 20 | AT 20 21 | ATE 21 22 | ATION 22 23 | B 23 24 | BLE 24 25 | BO 25 26 | C 26 27 | CAME 27 28 | CE 28 29 | CH 29 30 | CK 30 31 | CLOCK 31 32 | CO 32 33 | D 33 34 | DA 34 35 | E 35 36 | ED 36 37 | EL 37 38 | EN 38 39 | ENT 39 40 | ER 40 41 | ERS 41 42 | ES 42 43 | EST 43 44 | EVER 44 45 | F 45 46 | FORD 46 47 | FUL 47 48 | G 48 49 | GE 49 50 | H 50 51 | HA 51 52 | HE 52 53 | I 53 54 | IA 54 55 | IC 55 56 | ICAL 56 57 | ID 57 58 | IE 58 59 | IES 59 60 | IGH 60 61 | IL 61 62 | IN 62 63 | INE 63 64 | ING 64 65 | ION 65 66 | IP 66 67 | IR 67 68 | IS 68 69 | ISE 69 70 | IST 70 71 | IT 71 72 | ITY 72 73 | IVE 73 74 | IZE 74 75 | J 75 76 | K 76 77 | KE 77 78 | L 78 79 | LA 79 80 | LAND 80 81 | LE 81 82 | LESS 82 83 | LI 83 84 | LIGHT 84 85 | LL 85 86 | LO 86 87 | LY 87 88 | M 88 89 | MA 89 90 | MAN 90 91 | ME 91 92 | MENT 92 93 | MP 93 94 | N 94 95 | NA 95 96 | NCE 96 97 | NE 97 98 | NESS 98 99 | O 99 100 | OL 100 101 | ON 101 102 | OOK 102 103 | OR 103 104 | OUND 104 105 | OW 105 106 | P 106 107 | PE 107 108 | PER 108 109 | Q 109 110 | QUE 110 111 | QUI 111 112 | R 112 113 | RA 113 114 | RE 114 115 | REET 115 116 | RI 116 117 | RO 117 118 | RROW 118 119 | RY 119 120 | S 120 121 | SE 121 122 | SH 122 123 | SIDE 123 124 | STREET 124 125 | T 125 126 | TE 126 127 | TED 127 128 | TER 128 129 | TH 129 130 | THER 130 131 | TING 131 132 | TION 132 133 | TURE 133 134 | TY 134 135 | U 135 136 | UGH 136 137 | UM 137 138 | UN 138 139 | UR 139 140 | US 140 141 | V 141 142 | VE 142 143 | VER 143 144 | W 144 145 | WARD 145 146 | WAY 146 147 | X 147 148 | Y 148 149 | Z 149 150 | ▁" 150 151 | ▁A 151 152 | ▁ABOUT 152 153 | ▁AC 153 154 | ▁ACCOUNT 154 155 | ▁ACROSS 155 156 | ▁ACTRESS 156 157 | ▁ACTUALLY 157 158 | ▁ADD 158 159 | ▁ADJUST 159 160 | ▁AFRAID 160 161 | ▁AFTER 161 162 | ▁AGAIN 162 163 | ▁AGAINST 163 164 | ▁AGO 164 165 | ▁AGREE 165 166 | ▁AIR 166 167 | ▁ALBUM 167 168 | ▁ALL 168 169 | ▁ALMOST 169 170 | ▁ALONE 170 171 | ▁ALONG 171 172 | ▁ALREADY 172 173 | ▁ALSO 173 174 | ▁ALWAYS 174 175 | ▁AM 175 176 | ▁AMERICA 176 177 | ▁AN 177 178 | ▁AND 178 179 | ▁ANGEL 179 180 | ▁ANIMAL 180 181 | ▁ANOTHER 181 182 | ▁ANSWER 182 183 | ▁ANY 183 184 | ▁ANYMORE 184 185 | ▁ANYONE 185 186 | ▁ANYTHING 186 187 | ▁APP 187 188 | ▁ARD 188 189 | ▁ARE 189 190 | ▁ARM 190 191 | ▁AROUND 191 192 | ▁ARRIVE 192 193 | ▁ART 193 194 | ▁ARTICLE 194 195 | ▁ARTIST 195 196 | ▁AS 196 197 | ▁ASK 197 198 | ▁AT 198 199 | ▁ATTACK 199 200 | ▁AVERAGE 200 201 | ▁AWARDS 201 202 | ▁AWAY 202 203 | ▁B 203 204 | ▁BA 204 205 | ▁BABY 205 206 | ▁BACK 206 207 | ▁BAD 207 208 | ▁BANK 208 209 | ▁BAR 209 210 | ▁BE 210 211 | ▁BEACH 211 212 | ▁BECAUSE 212 213 | ▁BECOME 213 214 | ▁BEEN 214 215 | ▁BEFORE 215 216 | ▁BEGAN 216 217 | ▁BEGIN 217 218 | ▁BEHIND 218 219 | ▁BEING 219 220 | ▁BELIEVE 220 221 | ▁BEST 221 222 | ▁BETTER 222 223 | ▁BETWEEN 223 224 | ▁BIG 224 225 | ▁BIT 225 226 | ▁BLACK 226 227 | ▁BLOOD 227 228 | ▁BLOW 228 229 | ▁BLUE 229 230 | ▁BO 230 231 | ▁BODY 231 232 | ▁BOOK 232 233 | ▁BOTH 233 234 | ▁BOTTLE 234 235 | ▁BOUGHT 235 236 | ▁BOX 236 237 | ▁BOY 237 238 | ▁BR 238 239 | ▁BRANCH 239 240 | ▁BREAK 240 241 | ▁BREATH 241 242 | ▁BRING 242 243 | ▁BROKE 243 244 | ▁BROTHER 244 245 | ▁BROUGHT 245 246 | ▁BUILD 246 247 | ▁BURN 247 248 | ▁BUS 248 249 | ▁BUSINESS 249 250 | ▁BUT 250 251 | ▁BUY 251 252 | ▁BY 252 253 | ▁C 253 254 | ▁CA 254 255 | ▁CAFE 255 256 | ▁CALL 256 257 | ▁CAME 257 258 | ▁CAN 258 259 | 
▁CAR 259 260 | ▁CARE 260 261 | ▁CASE 261 262 | ▁CAST 262 263 | ▁CATCH 263 264 | ▁CAUSE 264 265 | ▁CERTAIN 265 266 | ▁CH 266 267 | ▁CHA 267 268 | ▁CHANCE 268 269 | ▁CHANGE 269 270 | ▁CHANNEL 270 271 | ▁CHARGE 271 272 | ▁CHEAP 272 273 | ▁CHECK 273 274 | ▁CHILD 274 275 | ▁CHILDREN 275 276 | ▁CHINA 276 277 | ▁CHOICE 277 278 | ▁CINEMAS 278 279 | ▁CITY 279 280 | ▁CLASS 280 281 | ▁CLEAN 281 282 | ▁CLEAR 282 283 | ▁CLOCK 283 284 | ▁CLOSE 284 285 | ▁CLOUD 285 286 | ▁CLUB 286 287 | ▁CO 287 288 | ▁COFFEE 288 289 | ▁COLD 289 290 | ▁COLLEGE 290 291 | ▁COME 291 292 | ▁COMING 292 293 | ▁COMMON 293 294 | ▁COMPANY 294 295 | ▁COMPLETE 295 296 | ▁COMPUTER 296 297 | ▁CON 297 298 | ▁CONCERT 298 299 | ▁CONTROL 299 300 | ▁COOK 300 301 | ▁CORNER 301 302 | ▁COST 302 303 | ▁COULD 303 304 | ▁COUNT 304 305 | ▁COUNTRY 305 306 | ▁COUPLE 306 307 | ▁COURSE 307 308 | ▁CR 308 309 | ▁CRAZY 309 310 | ▁CREATE 310 311 | ▁CROSS 311 312 | ▁CURRENT 312 313 | ▁CUT 313 314 | ▁D 314 315 | ▁DA 315 316 | ▁DARK 316 317 | ▁DAUGHTER 317 318 | ▁DAY 318 319 | ▁DAYS 319 320 | ▁DE 320 321 | ▁DEAD 321 322 | ▁DEAL 322 323 | ▁DEATH 323 324 | ▁DECIDE 324 325 | ▁DECISION 325 326 | ▁DEEP 326 327 | ▁DELIVERY 327 328 | ▁DESIGN 328 329 | ▁DEVICE 329 330 | ▁DI 330 331 | ▁DID 331 332 | ▁DIDN 332 333 | ▁DIFFERENT 333 334 | ▁DINNER 334 335 | ▁DIRECT 335 336 | ▁DIS 336 337 | ▁DISCOUNT 337 338 | ▁DISHES 338 339 | ▁DO 339 340 | ▁DOCTOR 340 341 | ▁DOES 341 342 | ▁DOING 342 343 | ▁DON 343 344 | ▁DONE 344 345 | ▁DOUBT 345 346 | ▁DOWN 346 347 | ▁DRAW 347 348 | ▁DREAM 348 349 | ▁DRESS 349 350 | ▁DRINK 350 351 | ▁DRIVE 351 352 | ▁DROP 352 353 | ▁DU 353 354 | ▁E 354 355 | ▁EACH 355 356 | ▁EASY 356 357 | ▁EAT 357 358 | ▁ED 358 359 | ▁EFFECT 359 360 | ▁EIGHT 360 361 | ▁EIGHTEEN 361 362 | ▁EIGHTY 362 363 | ▁EITHER 363 364 | ▁ELEVEN 364 365 | ▁ELSE 365 366 | ▁EN 366 367 | ▁END 367 368 | ▁ENGLISH 368 369 | ▁ENJOY 369 370 | ▁ENOUGH 370 371 | ▁EPISODE 371 372 | ▁ER 372 373 | ▁EST 373 374 | ▁EVEN 374 375 | ▁EVER 375 376 | ▁EVERY 376 377 | ▁EVERYONE 377 378 | ▁EVERYTHING 378 379 | ▁EX 379 380 | ▁EXACTLY 380 381 | ▁EXAMPLE 381 382 | ▁EXCUSE 382 383 | ▁EXPECT 383 384 | ▁EXPLAIN 384 385 | ▁F 385 386 | ▁FACE 386 387 | ▁FACT 387 388 | ▁FAIL 388 389 | ▁FAIR 389 390 | ▁FALL 390 391 | ▁FAMILY 391 392 | ▁FAMOUS 392 393 | ▁FAN 393 394 | ▁FAR 394 395 | ▁FAST 395 396 | ▁FATHER 396 397 | ▁FEAR 397 398 | ▁FEATURE 398 399 | ▁FEEL 399 400 | ▁FELL 400 401 | ▁FELT 401 402 | ▁FEW 402 403 | ▁FIELD 403 404 | ▁FIFTEEN 404 405 | ▁FIFTY 405 406 | ▁FIGHT 406 407 | ▁FIGURE 407 408 | ▁FILM 408 409 | ▁FINAL 409 410 | ▁FIND 410 411 | ▁FINE 411 412 | ▁FINISH 412 413 | ▁FIRE 413 414 | ▁FIRST 414 415 | ▁FISH 415 416 | ▁FIVE 416 417 | ▁FIX 417 418 | ▁FLOOR 418 419 | ▁FLOW 419 420 | ▁FOLLOW 420 421 | ▁FOOD 421 422 | ▁FOOL 422 423 | ▁FOR 423 424 | ▁FORTY 424 425 | ▁FOUND 425 426 | ▁FOUR 426 427 | ▁FOURTEEN 427 428 | ▁FREE 428 429 | ▁FRESH 429 430 | ▁FRIEND 430 431 | ▁FRIENDS 431 432 | ▁FROM 432 433 | ▁FRONT 433 434 | ▁FUL 434 435 | ▁FULL 435 436 | ▁FUN 436 437 | ▁FUTURE 437 438 | ▁G 438 439 | ▁GA 439 440 | ▁GAME 440 441 | ▁GARDEN 441 442 | ▁GAVE 442 443 | ▁GENERAL 443 444 | ▁GET 444 445 | ▁GETTING 445 446 | ▁GIRL 446 447 | ▁GIVE 447 448 | ▁GLASS 448 449 | ▁GO 449 450 | ▁GOD 450 451 | ▁GOING 451 452 | ▁GOLD 452 453 | ▁GONNA 453 454 | ▁GOOD 454 455 | ▁GOT 455 456 | ▁GR 456 457 | ▁GRAND 457 458 | ▁GREAT 458 459 | ▁GREEN 459 460 | ▁GROUP 460 461 | ▁GROW 461 462 | ▁GUESS 462 463 | ▁GUY 463 464 | ▁H 464 465 | ▁HA 465 466 | ▁HAD 466 467 | ▁HALF 467 468 | ▁HAND 468 469 | ▁HAPPEN 469 470 | ▁HAPPY 470 471 | ▁HARD 471 
472 | ▁HAS 472 473 | ▁HAVE 473 474 | ▁HAVING 474 475 | ▁HE 475 476 | ▁HEAD 476 477 | ▁HEALTH 477 478 | ▁HEAR 478 479 | ▁HEART 479 480 | ▁HELP 480 481 | ▁HER 481 482 | ▁HERE 482 483 | ▁HI 483 484 | ▁HIGH 484 485 | ▁HIM 485 486 | ▁HIMSELF 486 487 | ▁HIS 487 488 | ▁HISTORY 488 489 | ▁HO 489 490 | ▁HOLD 490 491 | ▁HOME 491 492 | ▁HOPE 492 493 | ▁HORSE 493 494 | ▁HOSPITAL 494 495 | ▁HOTEL 495 496 | ▁HOUSE 496 497 | ▁HOW 497 498 | ▁HUMAN 498 499 | ▁HUNDRED 499 500 | ▁HURT 500 501 | ▁HUSBAND 501 502 | ▁I 502 503 | ▁IDEA 503 504 | ▁IF 504 505 | ▁IMAGINE 505 506 | ▁IMPORTANT 506 507 | ▁IN 507 508 | ▁INDIA 508 509 | ▁INTEREST 509 510 | ▁INTO 510 511 | ▁IS 511 512 | ▁ISSUE 512 513 | ▁IT 513 514 | ▁ITSELF 514 515 | ▁J 515 516 | ▁JA 516 517 | ▁JAPAN 517 518 | ▁JO 518 519 | ▁JOB 519 520 | ▁JOHN 520 521 | ▁JOURNEY 521 522 | ▁JU 522 523 | ▁JUST 523 524 | ▁K 524 525 | ▁KEEP 525 526 | ▁KEPT 526 527 | ▁KEY 527 528 | ▁KID 528 529 | ▁KILL 529 530 | ▁KIND 530 531 | ▁KITCHEN 531 532 | ▁KNEW 532 533 | ▁KNOW 533 534 | ▁L 534 535 | ▁LA 535 536 | ▁LADY 536 537 | ▁LAND 537 538 | ▁LANGUAGE 538 539 | ▁LARGE 539 540 | ▁LAST 540 541 | ▁LATE 541 542 | ▁LAW 542 543 | ▁LE 543 544 | ▁LEAD 544 545 | ▁LEARN 545 546 | ▁LEAST 546 547 | ▁LEAVE 547 548 | ▁LEFT 548 549 | ▁LEG 549 550 | ▁LESS 550 551 | ▁LET 551 552 | ▁LEVEL 552 553 | ▁LI 553 554 | ▁LIFE 554 555 | ▁LIGHT 555 556 | ▁LIKE 556 557 | ▁LINE 557 558 | ▁LISTEN 558 559 | ▁LITTLE 559 560 | ▁LIVE 560 561 | ▁LIVING 561 562 | ▁LL 562 563 | ▁LO 563 564 | ▁LOCAL 564 565 | ▁LOCATION 565 566 | ▁LONDON 566 567 | ▁LONG 567 568 | ▁LOOK 568 569 | ▁LOST 569 570 | ▁LOT 570 571 | ▁LOVE 571 572 | ▁LOW 572 573 | ▁LU 573 574 | ▁LY 574 575 | ▁LYRICS 575 576 | ▁M 576 577 | ▁MA 577 578 | ▁MACHINE 578 579 | ▁MADE 579 580 | ▁MAIN 580 581 | ▁MAKE 581 582 | ▁MAKING 582 583 | ▁MAN 583 584 | ▁MANY 584 585 | ▁MAR 585 586 | ▁MARKET 586 587 | ▁MATTER 587 588 | ▁MAY 588 589 | ▁MAYBE 589 590 | ▁ME 590 591 | ▁MEAN 591 592 | ▁MEET 592 593 | ▁MEMORY 593 594 | ▁MEN 594 595 | ▁MESSAGE 595 596 | ▁METHOD 596 597 | ▁MI 597 598 | ▁MIDDLE 598 599 | ▁MIGHT 599 600 | ▁MILLION 600 601 | ▁MIND 601 602 | ▁MINUTE 602 603 | ▁MIRROR 603 604 | ▁MISS 604 605 | ▁MISTAKE 605 606 | ▁MO 606 607 | ▁MODE 607 608 | ▁MOMENT 608 609 | ▁MONEY 609 610 | ▁MONTH 610 611 | ▁MORE 611 612 | ▁MORNING 612 613 | ▁MOST 613 614 | ▁MOTHER 614 615 | ▁MOVE 615 616 | ▁MOVIE 616 617 | ▁MP 617 618 | ▁MR 618 619 | ▁MU 619 620 | ▁MUCH 620 621 | ▁MUSIC 621 622 | ▁MUST 622 623 | ▁MY 623 624 | ▁MYSELF 624 625 | ▁N 625 626 | ▁NA 626 627 | ▁NAME 627 628 | ▁NE 628 629 | ▁NEAR 629 630 | ▁NEED 630 631 | ▁NESS 631 632 | ▁NEVER 632 633 | ▁NEW 633 634 | ▁NEWS 634 635 | ▁NEXT 635 636 | ▁NICE 636 637 | ▁NIGHT 637 638 | ▁NINE 638 639 | ▁NINETEEN 639 640 | ▁NINETY 640 641 | ▁NO 641 642 | ▁NOBODY 642 643 | ▁NORMAL 643 644 | ▁NORTH 644 645 | ▁NOT 645 646 | ▁NOTHING 646 647 | ▁NOW 647 648 | ▁NUMBER 648 649 | ▁O 649 650 | ▁OF 650 651 | ▁OFF 651 652 | ▁OFFICE 652 653 | ▁OFTEN 653 654 | ▁OH 654 655 | ▁OKAY 655 656 | ▁OLD 656 657 | ▁ON 657 658 | ▁ONCE 658 659 | ▁ONE 659 660 | ▁ONLINE 660 661 | ▁ONLY 661 662 | ▁OPEN 662 663 | ▁OPERA 663 664 | ▁OPTION 664 665 | ▁OR 665 666 | ▁ORDER 666 667 | ▁ORGAN 667 668 | ▁ORIGIN 668 669 | ▁ORIGINAL 669 670 | ▁OTHER 670 671 | ▁OUR 671 672 | ▁OUT 672 673 | ▁OVER 673 674 | ▁OW 674 675 | ▁OWN 675 676 | ▁P 676 677 | ▁PA 677 678 | ▁PARENTS 678 679 | ▁PARK 679 680 | ▁PART 680 681 | ▁PASS 681 682 | ▁PAST 682 683 | ▁PATIENT 683 684 | ▁PAY 684 685 | ▁PE 685 686 | ▁PEOPLE 686 687 | ▁PERSON 687 688 | ▁PHONE 688 689 | ▁PHOTO 689 690 | ▁PHRASE 690 691 | 
▁PICK 691 692 | ▁PICTURE 692 693 | ▁PIECE 693 694 | ▁PLACE 694 695 | ▁PLACES 695 696 | ▁PLAN 696 697 | ▁PLAY 697 698 | ▁PLAYLIST 698 699 | ▁PLEASE 699 700 | ▁PM 700 701 | ▁PO 701 702 | ▁POCKET 702 703 | ▁POINT 703 704 | ▁POLICE 704 705 | ▁POPULAR 705 706 | ▁POSITION 706 707 | ▁POWER 707 708 | ▁PRE 708 709 | ▁PREPARE 709 710 | ▁PRESSURE 710 711 | ▁PRETTY 711 712 | ▁PRICE 712 713 | ▁PRO 713 714 | ▁PROBABLY 714 715 | ▁PROBLEM 715 716 | ▁PRODUCE 716 717 | ▁PRODUCT 717 718 | ▁PROJECT 718 719 | ▁PROMISE 719 720 | ▁PUBLIC 720 721 | ▁PULL 721 722 | ▁PUT 722 723 | ▁Q 723 724 | ▁QUA 724 725 | ▁QUALITY 725 726 | ▁QUE 726 727 | ▁QUESTION 727 728 | ▁QUEUE 728 729 | ▁QUI 729 730 | ▁QUICK 730 731 | ▁QUIET 731 732 | ▁QUITE 732 733 | ▁R 733 734 | ▁RA 734 735 | ▁RADIO 735 736 | ▁RAIN 736 737 | ▁RATING 737 738 | ▁RE 738 739 | ▁REACH 739 740 | ▁READ 740 741 | ▁REAL 741 742 | ▁REALLY 742 743 | ▁REASON 743 744 | ▁RECENT 744 745 | ▁RECORD 745 746 | ▁RELEASED 746 747 | ▁REMEMBER 747 748 | ▁REMIND 748 749 | ▁REMOVE 749 750 | ▁REPEAT 750 751 | ▁REPLAY 751 752 | ▁REPORT 752 753 | ▁REPUBLIC 753 754 | ▁REST 754 755 | ▁RESTART 755 756 | ▁RESTAURANT 756 757 | ▁RESULT 757 758 | ▁RETURN 758 759 | ▁REVIEWS 759 760 | ▁RI 760 761 | ▁RICH 761 762 | ▁RID 762 763 | ▁RIGHT 763 764 | ▁RISE 764 765 | ▁RISK 765 766 | ▁RIVER 766 767 | ▁RO 767 768 | ▁ROAD 768 769 | ▁ROBOT 769 770 | ▁ROCK 770 771 | ▁ROOM 771 772 | ▁ROUND 772 773 | ▁RU 773 774 | ▁RUN 774 775 | ▁RY 775 776 | ▁S 776 777 | ▁SA 777 778 | ▁SAFE 778 779 | ▁SAID 779 780 | ▁SAME 780 781 | ▁SAW 781 782 | ▁SAY 782 783 | ▁SCHOOL 783 784 | ▁SCIENCE 784 785 | ▁SCREEN 785 786 | ▁SE 786 787 | ▁SECOND 787 788 | ▁SECRET 788 789 | ▁SEE 789 790 | ▁SEND 790 791 | ▁SENSE 791 792 | ▁SENTENCE 792 793 | ▁SERIES 793 794 | ▁SERIOUS 794 795 | ▁SERVICE 795 796 | ▁SET 796 797 | ▁SEVEN 797 798 | ▁SEVENTY 798 799 | ▁SH 799 800 | ▁SHALL 800 801 | ▁SHARE 801 802 | ▁SHE 802 803 | ▁SHOP 803 804 | ▁SHORT 804 805 | ▁SHOULD 805 806 | ▁SHOW 806 807 | ▁SHOWS 807 808 | ▁SHUT 808 809 | ▁SICK 809 810 | ▁SIDE 810 811 | ▁SIGN 811 812 | ▁SIMILAR 812 813 | ▁SIMPLE 813 814 | ▁SINCE 814 815 | ▁SINGLE 815 816 | ▁SISTER 816 817 | ▁SIT 817 818 | ▁SIX 818 819 | ▁SIXTEEN 819 820 | ▁SIXTY 820 821 | ▁SKIP 821 822 | ▁SLEEP 822 823 | ▁SLOW 823 824 | ▁SMALL 824 825 | ▁SMART 825 826 | ▁SNOW 826 827 | ▁SO 827 828 | ▁SOME 828 829 | ▁SOMEBODY 829 830 | ▁SOMEONE 830 831 | ▁SOMETHING 831 832 | ▁SONG 832 833 | ▁SONGS 833 834 | ▁SOON 834 835 | ▁SORRY 835 836 | ▁SORT 836 837 | ▁SOUL 837 838 | ▁SOUND 838 839 | ▁SOUTH 839 840 | ▁SP 840 841 | ▁SPACE 841 842 | ▁SPEAK 842 843 | ▁SPECIAL 843 844 | ▁SPEED 844 845 | ▁SPEND 845 846 | ▁ST 846 847 | ▁STAND 847 848 | ▁STAR 848 849 | ▁START 849 850 | ▁STATE 850 851 | ▁STAY 851 852 | ▁STEP 852 853 | ▁STEREO 853 854 | ▁STILL 854 855 | ▁STOP 855 856 | ▁STORIES 856 857 | ▁STORY 857 858 | ▁STRAIGHT 858 859 | ▁STRANGE 859 860 | ▁STREET 860 861 | ▁STRONG 861 862 | ▁STUDENT 862 863 | ▁STUDY 863 864 | ▁STUFF 864 865 | ▁SU 865 866 | ▁SUCH 866 867 | ▁SUDDEN 867 868 | ▁SUMMER 868 869 | ▁SUN 869 870 | ▁SUPPORT 870 871 | ▁SUPPOSE 871 872 | ▁SURE 872 873 | ▁SW 873 874 | ▁SWEET 874 875 | ▁SWITCH 875 876 | ▁SYSTEM 876 877 | ▁T 877 878 | ▁TA 878 879 | ▁TABLE 879 880 | ▁TAKE 880 881 | ▁TAKING 881 882 | ▁TALK 882 883 | ▁TALKING 883 884 | ▁TE 884 885 | ▁TEA 885 886 | ▁TEACHER 886 887 | ▁TELL 887 888 | ▁TEMPERATURE 888 889 | ▁TEN 889 890 | ▁TH 890 891 | ▁THAN 891 892 | ▁THANK 892 893 | ▁THAT 893 894 | ▁THE 894 895 | ▁THEIR 895 896 | ▁THEM 896 897 | ▁THEN 897 898 | ▁THERE 898 899 | ▁THESE 899 900 | ▁THEY 900 901 | ▁THING 
901 902 | ▁THINGS 902 903 | ▁THINK 903 904 | ▁THIRD 904 905 | ▁THIRTEEN 905 906 | ▁THIRTY 906 907 | ▁THIS 907 908 | ▁THOSE 908 909 | ▁THOUGH 909 910 | ▁THOUGHT 910 911 | ▁THOUSAND 911 912 | ▁THREE 912 913 | ▁THROUGH 913 914 | ▁THROW 914 915 | ▁TI 915 916 | ▁TICKET 916 917 | ▁TIME 917 918 | ▁TO 918 919 | ▁TODAY 919 920 | ▁TOGETHER 920 921 | ▁TOLD 921 922 | ▁TOMORROW 922 923 | ▁TONIGHT 923 924 | ▁TOO 924 925 | ▁TOOK 925 926 | ▁TOWN 926 927 | ▁TRA 927 928 | ▁TRAIN 928 929 | ▁TRAVEL 929 930 | ▁TREAT 930 931 | ▁TREE 931 932 | ▁TRIED 932 933 | ▁TRIP 933 934 | ▁TROUBLE 934 935 | ▁TRU 935 936 | ▁TRUE 936 937 | ▁TRY 937 938 | ▁TRYING 938 939 | ▁TURE 939 940 | ▁TURN 940 941 | ▁TWELVE 941 942 | ▁TWENTY 942 943 | ▁TWO 943 944 | ▁TYPE 944 945 | ▁U 945 946 | ▁UN 946 947 | ▁UNDER 947 948 | ▁UNDERSTAND 948 949 | ▁UNITED 949 950 | ▁UNTI 950 951 | ▁UP 951 952 | ▁UR 952 953 | ▁US 953 954 | ▁USE 954 955 | ▁USED 955 956 | ▁V 956 957 | ▁VA 957 958 | ▁VALUE 958 959 | ▁VE 959 960 | ▁VER 960 961 | ▁VERSION 961 962 | ▁VERY 962 963 | ▁VI 963 964 | ▁VIDEO 964 965 | ▁VIEW 965 966 | ▁VISIT 966 967 | ▁VOICE 967 968 | ▁W 968 969 | ▁WAIT 969 970 | ▁WALK 970 971 | ▁WALL 971 972 | ▁WANT 972 973 | ▁WAR 973 974 | ▁WAS 974 975 | ▁WATCH 975 976 | ▁WATER 976 977 | ▁WAY 977 978 | ▁WE 978 979 | ▁WEATHER 979 980 | ▁WEEK 980 981 | ▁WELL 981 982 | ▁WENT 982 983 | ▁WERE 983 984 | ▁WHAT 984 985 | ▁WHEN 985 986 | ▁WHERE 986 987 | ▁WHICH 987 988 | ▁WHILE 988 989 | ▁WHITE 989 990 | ▁WHO 990 991 | ▁WHY 991 992 | ▁WIFE 992 993 | ▁WILL 993 994 | ▁WIN 994 995 | ▁WIND 995 996 | ▁WINDOW 996 997 | ▁WISH 997 998 | ▁WITH 998 999 | ▁WITHOUT 999 1000 | ▁WOMAN 1000 1001 | ▁WOMEN 1001 1002 | ▁WON 1002 1003 | ▁WONDER 1003 1004 | ▁WOOD 1004 1005 | ▁WORD 1005 1006 | ▁WORDS 1006 1007 | ▁WORK 1007 1008 | ▁WORLD 1008 1009 | ▁WORRY 1009 1010 | ▁WORTH 1010 1011 | ▁WOULD 1011 1012 | ▁WRITE 1012 1013 | ▁WRONG 1013 1014 | ▁X 1014 1015 | ▁Y 1015 1016 | ▁YEAH 1016 1017 | ▁YEAR 1017 1018 | ▁YEARS 1018 1019 | ▁YES 1019 1020 | ▁YET 1020 1021 | ▁YORK 1021 1022 | ▁YOU 1022 1023 | ▁YOUNG 1023 1024 | ▁YOUR 1024 1025 | ▁YOURSELF 1025 1026 | ▁Z 1026 1027 | ▁ZERO 1027 1028 | -------------------------------------------------------------------------------- /module/track2_asr_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Transformer speech recognition model (pytorch).""" 5 | 6 | from argparse import Namespace 7 | from distutils.util import strtobool 8 | 9 | import logging 10 | import math 11 | 12 | import torch 13 | import pdb 14 | 15 | from espnet.nets.asr_interface import ASRInterface 16 | from espnet.nets.pytorch_backend.ctc import CTC 17 | from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD 18 | from espnet.nets.pytorch_backend.e2e_asr import Reporter 19 | from espnet.nets.pytorch_backend.nets_utils import get_subsample 20 | from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask 21 | from espnet.nets.pytorch_backend.nets_utils import th_accuracy 22 | from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos 23 | from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention 24 | from espnet.nets.pytorch_backend.transformer.decoder import Decoder 25 | from espnet.nets.pytorch_backend.transformer.encoder import Encoder 26 | from espnet.nets.pytorch_backend.transformer.initializer import initialize 27 | from 
espnet.nets.pytorch_backend.transformer.label_smoothing_loss import LabelSmoothingLoss 28 | from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask 29 | from espnet.nets.pytorch_backend.transformer.mask import target_mask 30 | from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport 31 | from espnet.nets.scorers.ctc import CTCPrefixScorer 32 | 33 | 34 | class E2E(ASRInterface, torch.nn.Module): 35 | """E2E module. 36 | 37 | :param int idim: dimension of inputs 38 | :param int odim: dimension of outputs 39 | :param Namespace args: argument Namespace containing options 40 | 41 | """ 42 | 43 | @staticmethod 44 | def add_arguments(parser): 45 | """Add arguments.""" 46 | group = parser.add_argument_group("transformer model setting") 47 | 48 | group.add_argument("--transformer-init", type=str, default="pytorch", 49 | choices=["pytorch", "xavier_uniform", "xavier_normal", 50 | "kaiming_uniform", "kaiming_normal"], 51 | help='how to initialize transformer parameters') 52 | group.add_argument("--transformer-input-layer", type=str, default="conv2d", 53 | choices=["conv2d", "linear", "embed"], 54 | help='transformer input layer type') 55 | group.add_argument('--transformer-attn-dropout-rate', default=None, type=float, 56 | help='dropout in transformer attention. use --dropout-rate if None is set') 57 | group.add_argument('--transformer-lr', default=10.0, type=float, 58 | help='Initial value of learning rate') 59 | group.add_argument('--transformer-warmup-steps', default=25000, type=int, 60 | help='optimizer warmup steps') 61 | group.add_argument('--transformer-length-normalized-loss', default=True, type=strtobool, 62 | help='normalize loss by length') 63 | 64 | group.add_argument('--dropout-rate', default=0.0, type=float, 65 | help='Dropout rate for the encoder') 66 | # Encoder 67 | group.add_argument('--elayers', default=4, type=int, 68 | help='Number of encoder layers (for shared recognition part in multi-speaker asr mode)') 69 | group.add_argument('--eunits', '-u', default=300, type=int, 70 | help='Number of encoder hidden units') 71 | # Attention 72 | group.add_argument('--adim', default=320, type=int, 73 | help='Number of attention transformation dimensions') 74 | group.add_argument('--aheads', default=4, type=int, 75 | help='Number of heads for multi head attention') 76 | # Decoder 77 | group.add_argument('--dlayers', default=1, type=int, 78 | help='Number of decoder layers') 79 | group.add_argument('--dunits', default=320, type=int, 80 | help='Number of decoder hidden units') 81 | return parser 82 | 83 | @property 84 | def attention_plot_class(self): 85 | """Return PlotAttentionReport.""" 86 | return PlotAttentionReport 87 | 88 | def __init__(self, idim, odim, args, ignore_id=-1): 89 | """Construct an E2E object. 
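The constructor below builds both the attention decoder and, when mtlalpha is positive, a CTC head; forward() later mixes the two objectives as loss = mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att. The following is a toy, self-contained sketch of that interpolation using torch.nn.functional losses in place of ESPnet's CTC and LabelSmoothingLoss wrappers; all shapes, the vocabulary size and alpha are made up.

import torch
import torch.nn.functional as F

B, T, L, V = 2, 50, 8, 30        # batch, encoder frames, target length, vocab size (illustrative)
alpha = 0.3                      # stands in for mtlalpha

enc_logits = torch.randn(B, T, V)            # stand-in for the CTC projection of encoder outputs
dec_logits = torch.randn(B, L, V)            # stand-in for the attention decoder outputs
targets = torch.randint(1, V, (B, L))        # id 0 reserved as the CTC blank

loss_ctc = F.ctc_loss(enc_logits.log_softmax(-1).transpose(0, 1),   # (T, B, V) as ctc_loss expects
                      targets,
                      input_lengths=torch.full((B,), T, dtype=torch.long),
                      target_lengths=torch.full((B,), L, dtype=torch.long),
                      blank=0)
loss_att = F.cross_entropy(dec_logits.reshape(-1, V), targets.reshape(-1))
loss = alpha * loss_ctc + (1 - alpha) * loss_att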
90 | 91 | :param int idim: dimension of inputs 92 | :param int odim: dimension of outputs 93 | :param Namespace args: argument Namespace containing options 94 | """ 95 | torch.nn.Module.__init__(self) 96 | if args.transformer_attn_dropout_rate is None: 97 | args.transformer_attn_dropout_rate = args.dropout_rate 98 | self.encoder = Encoder( 99 | idim=idim, 100 | attention_dim=args.adim, 101 | attention_heads=args.aheads, 102 | linear_units=args.eunits, 103 | num_blocks=args.elayers, 104 | input_layer=args.transformer_input_layer, 105 | dropout_rate=args.dropout_rate, 106 | positional_dropout_rate=args.dropout_rate, 107 | attention_dropout_rate=args.transformer_attn_dropout_rate 108 | ) 109 | self.decoder = Decoder( 110 | odim=odim, 111 | attention_dim=args.adim, 112 | attention_heads=args.aheads, 113 | linear_units=args.dunits, 114 | num_blocks=args.dlayers, 115 | dropout_rate=args.dropout_rate, 116 | positional_dropout_rate=args.dropout_rate, 117 | self_attention_dropout_rate=args.transformer_attn_dropout_rate, 118 | src_attention_dropout_rate=args.transformer_attn_dropout_rate 119 | ) 120 | self.sos = odim - 1 121 | self.eos = odim - 1 122 | self.odim = odim 123 | self.ignore_id = ignore_id 124 | self.subsample = get_subsample(args, mode='asr', arch='transformer') 125 | self.reporter = Reporter() 126 | 127 | # self.lsm_weight = a 128 | self.criterion = LabelSmoothingLoss(self.odim, self.ignore_id, args.lsm_weight, 129 | args.transformer_length_normalized_loss) 130 | # self.verbose = args.verbose 131 | self.reset_parameters(args) 132 | self.adim = args.adim 133 | self.mtlalpha = args.mtlalpha 134 | if args.mtlalpha > 0.0: 135 | self.ctc = CTC(odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True) 136 | else: 137 | self.ctc = None 138 | 139 | if args.report_cer or args.report_wer: 140 | from espnet.nets.e2e_asr_common import ErrorCalculator 141 | self.error_calculator = ErrorCalculator(args.char_list, 142 | args.sym_space, args.sym_blank, 143 | args.report_cer, args.report_wer) 144 | else: 145 | self.error_calculator = None 146 | self.rnnlm = None 147 | 148 | def reset_parameters(self, args): 149 | """Initialize parameters.""" 150 | # initialize parameters 151 | initialize(self, args.transformer_init) 152 | 153 | def forward(self, xs_pad, ilens, ys_pad): 154 | """E2E forward. 155 | 156 | :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim) 157 | :param torch.Tensor ilens: batch of lengths of source sequences (B) 158 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 159 | :return: ctc loass value 160 | :rtype: torch.Tensor 161 | :return: attention loss value 162 | :rtype: torch.Tensor 163 | :return: accuracy in attention decoder 164 | :rtype: float 165 | """ 166 | # 1. forward encoder 167 | xs_pad = xs_pad[:, :max(ilens)] # for data parallel 168 | src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2) 169 | hs_pad, hs_mask = self.encoder(xs_pad, src_mask) 170 | self.hs_pad = hs_pad 171 | 172 | # 2. forward decoder 173 | ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) 174 | ys_mask = target_mask(ys_in_pad, self.ignore_id) 175 | pred_pad, pred_mask, _, _ = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask) 176 | self.pred_pad = pred_pad 177 | 178 | # 3. 
179 |         loss_att = self.criterion(pred_pad, ys_out_pad)
180 |         self.acc = th_accuracy(pred_pad.view(-1, self.odim), ys_out_pad,
181 |                                ignore_label=self.ignore_id)
182 |
183 |         # TODO(karita) show predicted text
184 |         # TODO(karita) calculate these stats
185 |         cer_ctc = None
186 |         if self.mtlalpha == 0.0:
187 |             loss_ctc = None
188 |         else:
189 |             batch_size = xs_pad.size(0)
190 |             hs_len = hs_mask.view(batch_size, -1).sum(1)
191 |             loss_ctc = self.ctc(hs_pad.view(batch_size, -1, self.adim), hs_len, ys_pad)
192 |             if self.error_calculator is not None:
193 |                 ys_hat = self.ctc.argmax(hs_pad.view(batch_size, -1, self.adim)).data
194 |                 cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
195 |
196 |         # 5. compute cer/wer
197 |         if self.training or self.error_calculator is None:
198 |             cer, wer = None, None
199 |         else:
200 |             ys_hat = pred_pad.argmax(dim=-1)
201 |             cer, wer = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
202 |
203 |         # copied from e2e_asr
204 |         alpha = self.mtlalpha
205 |         if alpha == 0:
206 |             self.loss = loss_att
207 |             loss_att_data = float(loss_att)
208 |             loss_ctc_data = None
209 |         elif alpha == 1:
210 |             self.loss = loss_ctc
211 |             loss_att_data = None
212 |             loss_ctc_data = float(loss_ctc)
213 |         else:
214 |             self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
215 |             loss_att_data = float(loss_att)
216 |             loss_ctc_data = float(loss_ctc)
217 |
218 |         loss_data = float(self.loss)
219 |         if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
220 |             self.reporter.report(loss_ctc_data, loss_att_data, self.acc, cer_ctc, cer, wer, loss_data)
221 |         else:
222 |             logging.warning('loss (=%f) is not correct', loss_data)
223 |         return self.loss
224 |
225 |     def scorers(self):
226 |         """Scorers."""
227 |         return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
228 |
229 |     def encode(self, x):
230 |         """Encode acoustic features.
231 |
232 |         :param ndarray x: source acoustic feature (T, D)
233 |         :return: encoder outputs
234 |         :rtype: torch.Tensor
235 |         """
236 |         self.eval()
237 |         x = torch.as_tensor(x).unsqueeze(0)
238 |         enc_output, _ = self.encoder(x, None)
239 |         return enc_output.squeeze(0)
240 |
241 |     def recognize(self, x, recog_args, char_list=None, rnnlm=None, use_jit=False):
242 |         """Recognize input speech.
243 |
244 |         :param ndarray x: input acoustic feature (B, T, D) or (T, D)
245 |         :param Namespace recog_args: argument Namespace containing options
246 |         :param list char_list: list of characters
247 |         :param torch.nn.Module rnnlm: language model module
248 |         :return: N-best decoding results
249 |         :rtype: list
250 |         """
251 |         enc_output = self.encode(x).unsqueeze(0)
252 |         if recog_args.ctc_weight > 0.0:
253 |             lpz = self.ctc.log_softmax(enc_output)
254 |             lpz = lpz.squeeze(0)
255 |         else:
256 |             lpz = None
257 |
258 |         h = enc_output.squeeze(0)
259 |
260 |         logging.info('input lengths: ' + str(h.size(0)))
261 |         # search params
262 |         beam = recog_args.beam_size
263 |         penalty = recog_args.penalty
264 |         ctc_weight = recog_args.ctc_weight
265 |
266 |         # prepare sos
267 |         y = self.sos
268 |         vy = h.new_zeros(1).long()
269 |
270 |         if recog_args.maxlenratio == 0:
271 |             maxlen = h.shape[0]
272 |         else:
273 |             # maxlen >= 1
274 |             maxlen = max(1, int(recog_args.maxlenratio * h.size(0)))
275 |         minlen = int(recog_args.minlenratio * h.size(0))
276 |         logging.info('max output length: ' + str(maxlen))
277 |         logging.info('min output length: ' + str(minlen))
278 |
279 |         # initialize hypothesis
280 |         if rnnlm:
281 |             hyp = {'score': 0.0, 'yseq': [y], 'rnnlm_prev': None}
282 |         else:
283 |             hyp = {'score': 0.0, 'yseq': [y]}
284 |         if lpz is not None:
285 |             import numpy
286 |
287 |             from espnet.nets.ctc_prefix_score import CTCPrefixScore
288 |
289 |             ctc_prefix_score = CTCPrefixScore(lpz.detach().numpy(), 0, self.eos, numpy)
290 |             hyp['ctc_state_prev'] = ctc_prefix_score.initial_state()
291 |             hyp['ctc_score_prev'] = 0.0
292 |             if ctc_weight != 1.0:
293 |                 # pre-pruning based on attention scores
294 |                 from espnet.nets.pytorch_backend.rnn.decoders import CTC_SCORING_RATIO
295 |                 ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
296 |             else:
297 |                 ctc_beam = lpz.shape[-1]
298 |         hyps = [hyp]
299 |         ended_hyps = []
300 |
301 |         import six
302 |         traced_decoder = None
303 |         for i in six.moves.range(maxlen):
304 |             logging.debug('position ' + str(i))
305 |
306 |             hyps_best_kept = []
307 |             for hyp in hyps:
308 |                 vy.unsqueeze(1)
309 |                 vy[0] = hyp['yseq'][i]
310 |
311 |                 # get nbest local scores and their ids
312 |                 ys_mask = subsequent_mask(i + 1).unsqueeze(0)
313 |                 ys = torch.tensor(hyp['yseq']).unsqueeze(0)
314 |                 # FIXME: jit does not match non-jit result
315 |                 if use_jit:
316 |                     if traced_decoder is None:
317 |                         traced_decoder = torch.jit.trace(self.decoder.forward_one_step,
318 |                                                          (ys, ys_mask, enc_output))
319 |                     local_att_scores = traced_decoder(ys, ys_mask, enc_output)[0]
320 |                 else:
321 |                     local_att_scores = self.decoder.forward_one_step(ys, ys_mask, enc_output)[0]
322 |
323 |                 if rnnlm:
324 |                     rnnlm_state, local_lm_scores = rnnlm.predict(hyp['rnnlm_prev'], vy)
325 |                     local_scores = local_att_scores + recog_args.lm_weight * local_lm_scores
326 |                 else:
327 |                     local_scores = local_att_scores
328 |
329 |                 if lpz is not None:
330 |                     local_best_scores, local_best_ids = torch.topk(
331 |                         local_att_scores, ctc_beam, dim=1)
332 |                     ctc_scores, ctc_states = ctc_prefix_score(
333 |                         hyp['yseq'], local_best_ids[0], hyp['ctc_state_prev'])
334 |                     local_scores = \
335 |                         (1.0 - ctc_weight) * local_att_scores[:, local_best_ids[0]] \
336 |                         + ctc_weight * torch.from_numpy(ctc_scores - hyp['ctc_score_prev'])
337 |                     if rnnlm:
338 |                         local_scores += recog_args.lm_weight * local_lm_scores[:, local_best_ids[0]]
339 |                     local_best_scores, joint_best_ids = torch.topk(local_scores, beam, dim=1)
340 |                     local_best_ids = local_best_ids[:, joint_best_ids[0]]
341 |                 else:
342 |                     local_best_scores, local_best_ids = torch.topk(local_scores, beam, dim=1)
343 |
344 |                 for j in six.moves.range(beam):
345 |                     new_hyp = {}
346 |                     new_hyp['score'] = hyp['score'] + float(local_best_scores[0, j])
347 |                     new_hyp['yseq'] = [0] * (1 + len(hyp['yseq']))
348 |                     new_hyp['yseq'][:len(hyp['yseq'])] = hyp['yseq']
349 |                     new_hyp['yseq'][len(hyp['yseq'])] = int(local_best_ids[0, j])
350 |                     if rnnlm:
351 |                         new_hyp['rnnlm_prev'] = rnnlm_state
352 |                     if lpz is not None:
353 |                         new_hyp['ctc_state_prev'] = ctc_states[joint_best_ids[0, j]]
354 |                         new_hyp['ctc_score_prev'] = ctc_scores[joint_best_ids[0, j]]
355 |                     # will be (2 x beam) hyps at most
356 |                     hyps_best_kept.append(new_hyp)
357 |
358 |                 hyps_best_kept = sorted(
359 |                     hyps_best_kept, key=lambda x: x['score'], reverse=True)[:beam]
360 |
361 |             # sort and get nbest
362 |             hyps = hyps_best_kept
363 |             logging.debug('number of pruned hypotheses: ' + str(len(hyps)))
364 |             if char_list is not None:
365 |                 logging.debug(
366 |                     'best hypo: ' + ''.join([char_list[int(x)] for x in hyps[0]['yseq'][1:]]))
367 |
368 |             # add eos in the final loop to avoid that there are no ended hyps
369 |             if i == maxlen - 1:
370 |                 logging.info('adding <eos> in the last position in the loop')
371 |                 for hyp in hyps:
372 |                     hyp['yseq'].append(self.eos)
373 |
374 |             # add ended hypotheses to a final list, and remove them from the current hypotheses
375 |             # (this will be a problem, number of hyps < beam)
376 |             remained_hyps = []
377 |             for hyp in hyps:
378 |                 if hyp['yseq'][-1] == self.eos:
379 |                     # only store the sequence that has more than minlen outputs
380 |                     # also add penalty
381 |                     if len(hyp['yseq']) > minlen:
382 |                         hyp['score'] += (i + 1) * penalty
383 |                         if rnnlm:  # Word LM needs to add final score
384 |                             hyp['score'] += recog_args.lm_weight * rnnlm.final(
385 |                                 hyp['rnnlm_prev'])
386 |                         ended_hyps.append(hyp)
387 |                 else:
388 |                     remained_hyps.append(hyp)
389 |
390 |             # end detection
391 |             from espnet.nets.e2e_asr_common import end_detect
392 |             if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
393 |                 logging.info('end detected at %d', i)
394 |                 break
395 |
396 |             hyps = remained_hyps
397 |             if len(hyps) > 0:
398 |                 logging.debug('remaining hypotheses: ' + str(len(hyps)))
399 |             else:
400 |                 logging.info('no hypothesis. Finish decoding.')
401 |                 break
402 |
403 |             if char_list is not None:
404 |                 for hyp in hyps:
405 |                     logging.debug(
406 |                         'hypo: ' + ''.join([char_list[int(x)] for x in hyp['yseq'][1:]]))
407 |
408 |             logging.debug('number of ended hypotheses: ' + str(len(ended_hyps)))
409 |
410 |         nbest_hyps = sorted(
411 |             ended_hyps, key=lambda x: x['score'], reverse=True)[:min(len(ended_hyps), recog_args.nbest)]
412 |
413 |         # check number of hypotheses
414 |         if len(nbest_hyps) == 0:
415 |             logging.warning('there are no N-best results, performing recognition again with a smaller minlenratio.')
416 |             # should copy because Namespace will be overwritten globally
417 |             recog_args = Namespace(**vars(recog_args))
418 |             recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
419 |             return self.recognize(x, recog_args, char_list, rnnlm)
420 |
421 |         logging.info('total log probability: ' + str(nbest_hyps[0]['score']))
422 |         logging.info('normalized log probability: ' + str(nbest_hyps[0]['score'] / len(nbest_hyps[0]['yseq'])))
423 |         return nbest_hyps
424 |
425 |     def calculate_all_attentions(self, xs_pad, ilens, ys_pad):
426 |         """E2E attention calculation.
427 |
428 |         :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
429 |         :param torch.Tensor ilens: batch of lengths of input sequences (B)
430 |         :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
431 |         :return: attention weights with the following shape,
432 |             1) multi-head case => attention weights (B, H, Lmax, Tmax),
433 |             2) other case => attention weights (B, Lmax, Tmax).
434 |         :rtype: float ndarray
435 |         """
436 |         with torch.no_grad():
437 |             self.forward(xs_pad, ilens, ys_pad)
438 |         ret = dict()
439 |         for name, m in self.named_modules():
440 |             if isinstance(m, MultiHeadedAttention):
441 |                 ret[name] = m.attn.cpu().numpy()
442 |         return ret
443 |
--------------------------------------------------------------------------------
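The E2E class above interpolates the CTC and attention objectives as loss = mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att and reuses the last output id as both the start and end symbol. The following is a minimal usage sketch, assuming the class is importable as module.track2_asr_transformer.E2E; the feature/label shapes, vocabulary size, and the extra Namespace fields set by hand (lsm_weight, mtlalpha, ctc_type, char_list, sym_space, sym_blank, report_cer, report_wer) are illustrative assumptions, not values taken from the recipe scripts.

import argparse

import torch

from module.track2_asr_transformer import E2E  # assumed import path for the class above

# Build the argument Namespace the model expects: add_arguments() supplies the
# transformer options; the remaining fields are filled in by hand (assumed values).
parser = argparse.ArgumentParser()
E2E.add_arguments(parser)
args = parser.parse_args([])      # take the defaults defined in add_arguments
args.lsm_weight = 0.1             # label smoothing weight (assumed)
args.mtlalpha = 0.3               # CTC/attention interpolation weight (assumed)
args.ctc_type = "builtin"         # PyTorch built-in CTC loss (assumed)
args.char_list = None
args.sym_space = "<space>"
args.sym_blank = "<blank>"
args.report_cer = False
args.report_wer = False

idim, odim = 80, 5002             # e.g. 80-dim fbank features, 5002 output tokens (assumed)
model = E2E(idim, odim, args)

# One dummy batch: two utterances of padded features and padded token ids,
# with -1 marking label padding (the default ignore_id).
xs_pad = torch.randn(2, 100, idim)
ilens = torch.tensor([100, 80])
ys_pad = torch.randint(1, odim - 2, (2, 12))
ys_pad[1, 8:] = -1

loss = model(xs_pad, ilens, ys_pad)   # mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att
loss.backward()
print(float(loss), model.acc)

In the recipe itself, training is driven through track2_espnet_transformer_train.sh and the ESPnet trainer rather than a hand-written loop like this; the sketch is only meant to make the expected inputs and the multitask loss weighting concrete.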